From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 20:24:20 +0200 Subject: Adding upstream version 14.2.21. Signed-off-by: Daniel Baumann --- src/rgw/CMakeLists.txt | 407 ++ src/rgw/librgw.cc | 729 +++ src/rgw/librgw_admin_user.cc | 186 + src/rgw/rgw-orphan-list | 144 + src/rgw/rgw_acl.cc | 174 + src/rgw/rgw_acl.h | 469 ++ src/rgw/rgw_acl_s3.cc | 616 ++ src/rgw/rgw_acl_s3.h | 111 + src/rgw/rgw_acl_swift.cc | 430 ++ src/rgw/rgw_acl_swift.h | 55 + src/rgw/rgw_admin.cc | 8463 ++++++++++++++++++++++++ src/rgw/rgw_admin_user.cc | 91 + src/rgw/rgw_admin_user.h | 43 + src/rgw/rgw_aio.h | 80 + src/rgw/rgw_aio_throttle.cc | 157 + src/rgw/rgw_aio_throttle.h | 83 + src/rgw/rgw_amqp.cc | 1035 +++ src/rgw/rgw_amqp.h | 77 + src/rgw/rgw_arn.cc | 385 ++ src/rgw/rgw_arn.h | 121 + src/rgw/rgw_asio_client.cc | 188 + src/rgw/rgw_asio_client.h | 62 + src/rgw/rgw_asio_frontend.cc | 834 +++ src/rgw/rgw_asio_frontend.h | 28 + src/rgw/rgw_auth.cc | 722 +++ src/rgw/rgw_auth.h | 696 ++ src/rgw/rgw_auth_filters.h | 290 + src/rgw/rgw_auth_keystone.cc | 491 ++ src/rgw/rgw_auth_keystone.h | 130 + src/rgw/rgw_auth_registry.h | 101 + src/rgw/rgw_auth_s3.cc | 1135 ++++ src/rgw/rgw_auth_s3.h | 615 ++ src/rgw/rgw_b64.h | 87 + src/rgw/rgw_basic_types.cc | 44 + src/rgw/rgw_basic_types.h | 213 + src/rgw/rgw_bucket.cc | 3178 +++++++++ src/rgw/rgw_bucket.h | 575 ++ src/rgw/rgw_cache.cc | 353 + src/rgw/rgw_cache.h | 219 + src/rgw/rgw_civetweb.cc | 248 + src/rgw/rgw_civetweb.h | 59 + src/rgw/rgw_civetweb_frontend.cc | 153 + src/rgw/rgw_civetweb_log.cc | 24 + src/rgw/rgw_civetweb_log.h | 10 + src/rgw/rgw_client_io.cc | 34 + src/rgw/rgw_client_io.h | 439 ++ src/rgw/rgw_client_io_filters.h | 456 ++ src/rgw/rgw_common.cc | 1921 ++++++ src/rgw/rgw_common.h | 2742 ++++++++ src/rgw/rgw_compression.cc | 201 + src/rgw/rgw_compression.h | 60 + src/rgw/rgw_coroutine.cc | 1058 +++ src/rgw/rgw_coroutine.h | 674 ++ src/rgw/rgw_cors.cc | 194 + 
src/rgw/rgw_cors.h | 136 + src/rgw/rgw_cors_s3.cc | 245 + src/rgw/rgw_cors_s3.h | 56 + src/rgw/rgw_cors_swift.h | 76 + src/rgw/rgw_cr_rados.cc | 916 +++ src/rgw/rgw_cr_rados.h | 1351 ++++ src/rgw/rgw_cr_rest.cc | 349 + src/rgw/rgw_cr_rest.h | 593 ++ src/rgw/rgw_cr_tools.cc | 275 + src/rgw/rgw_cr_tools.h | 75 + src/rgw/rgw_crypt.cc | 1317 ++++ src/rgw/rgw_crypt.h | 152 + src/rgw/rgw_crypt_sanitize.cc | 88 + src/rgw/rgw_crypt_sanitize.h | 71 + src/rgw/rgw_data_sync.cc | 3709 +++++++++++ src/rgw/rgw_data_sync.h | 625 ++ src/rgw/rgw_dencoder.cc | 564 ++ src/rgw/rgw_dmclock.h | 54 + src/rgw/rgw_dmclock_async_scheduler.cc | 175 + src/rgw/rgw_dmclock_async_scheduler.h | 217 + src/rgw/rgw_dmclock_scheduler.h | 89 + src/rgw/rgw_dmclock_scheduler_ctx.cc | 177 + src/rgw/rgw_dmclock_scheduler_ctx.h | 118 + src/rgw/rgw_dmclock_sync_scheduler.cc | 114 + src/rgw/rgw_dmclock_sync_scheduler.h | 79 + src/rgw/rgw_env.cc | 141 + src/rgw/rgw_es_main.cc | 76 + src/rgw/rgw_es_query.cc | 694 ++ src/rgw/rgw_es_query.h | 165 + src/rgw/rgw_etag_verifier.cc | 185 + src/rgw/rgw_etag_verifier.h | 85 + src/rgw/rgw_fcgi.cc | 91 + src/rgw/rgw_fcgi.h | 57 + src/rgw/rgw_fcgi_process.cc | 138 + src/rgw/rgw_file.cc | 2436 +++++++ src/rgw/rgw_file.h | 2806 ++++++++ src/rgw/rgw_formats.cc | 374 ++ src/rgw/rgw_formats.h | 136 + src/rgw/rgw_frontend.cc | 82 + src/rgw/rgw_frontend.h | 285 + src/rgw/rgw_gc.cc | 528 ++ src/rgw/rgw_gc.h | 77 + src/rgw/rgw_http_client.cc | 1255 ++++ src/rgw/rgw_http_client.h | 370 ++ src/rgw/rgw_http_client_curl.cc | 122 + src/rgw/rgw_http_client_curl.h | 32 + src/rgw/rgw_http_errors.h | 46 + src/rgw/rgw_iam_policy.cc | 1432 +++++ src/rgw/rgw_iam_policy.h | 480 ++ src/rgw/rgw_iam_policy_keywords.gperf | 130 + src/rgw/rgw_iam_policy_keywords.h | 139 + src/rgw/rgw_json_enc.cc | 1777 +++++ src/rgw/rgw_jsonparser.cc | 132 + src/rgw/rgw_kafka.cc | 719 +++ src/rgw/rgw_kafka.h | 81 + src/rgw/rgw_keystone.cc | 713 ++ src/rgw/rgw_keystone.h | 373 ++ src/rgw/rgw_lc.cc | 1678 +++++ 
src/rgw/rgw_lc.h | 539 ++ src/rgw/rgw_lc_s3.cc | 344 + src/rgw/rgw_lc_s3.h | 102 + src/rgw/rgw_ldap.cc | 128 + src/rgw/rgw_ldap.h | 143 + src/rgw/rgw_lib.h | 225 + src/rgw/rgw_lib_frontend.h | 115 + src/rgw/rgw_loadgen.cc | 128 + src/rgw/rgw_loadgen.h | 75 + src/rgw/rgw_loadgen_process.cc | 149 + src/rgw/rgw_log.cc | 467 ++ src/rgw/rgw_log.h | 144 + src/rgw/rgw_main.cc | 637 ++ src/rgw/rgw_meta_sync_status.h | 124 + src/rgw/rgw_metadata.cc | 1178 ++++ src/rgw/rgw_metadata.h | 426 ++ src/rgw/rgw_multi.cc | 384 ++ src/rgw/rgw_multi.h | 114 + src/rgw/rgw_multi_del.cc | 73 + src/rgw/rgw_multi_del.h | 66 + src/rgw/rgw_multiparser.cc | 46 + src/rgw/rgw_notify.cc | 141 + src/rgw/rgw_notify.h | 27 + src/rgw/rgw_notify_event_type.cc | 82 + src/rgw/rgw_notify_event_type.h | 35 + src/rgw/rgw_object_expirer.cc | 107 + src/rgw/rgw_object_expirer_core.cc | 294 + src/rgw/rgw_object_expirer_core.h | 100 + src/rgw/rgw_object_lock.cc | 96 + src/rgw/rgw_object_lock.h | 221 + src/rgw/rgw_op.cc | 7942 +++++++++++++++++++++++ src/rgw/rgw_op.h | 2346 +++++++ src/rgw/rgw_opa.cc | 82 + src/rgw/rgw_opa.h | 14 + src/rgw/rgw_orphan.cc | 1523 +++++ src/rgw/rgw_orphan.h | 290 + src/rgw/rgw_os_lib.cc | 62 + src/rgw/rgw_os_lib.h | 12 + src/rgw/rgw_otp.cc | 158 + src/rgw/rgw_otp.h | 15 + src/rgw/rgw_perf_counters.cc | 60 + src/rgw/rgw_perf_counters.h | 50 + src/rgw/rgw_period_history.cc | 354 + src/rgw/rgw_period_history.h | 114 + src/rgw/rgw_period_puller.cc | 114 + src/rgw/rgw_period_puller.h | 20 + src/rgw/rgw_period_pusher.cc | 307 + src/rgw/rgw_period_pusher.h | 56 + src/rgw/rgw_policy_s3.cc | 303 + src/rgw/rgw_policy_s3.h | 59 + src/rgw/rgw_process.cc | 323 + src/rgw/rgw_process.h | 199 + src/rgw/rgw_pubsub.cc | 872 +++ src/rgw/rgw_pubsub.h | 812 +++ src/rgw/rgw_pubsub_push.cc | 749 +++ src/rgw/rgw_pubsub_push.h | 57 + src/rgw/rgw_putobj.cc | 99 + src/rgw/rgw_putobj.h | 79 + src/rgw/rgw_putobj_processor.cc | 670 ++ src/rgw/rgw_putobj_processor.h | 263 + src/rgw/rgw_quota.cc | 1034 +++ 
src/rgw/rgw_quota.h | 123 + src/rgw/rgw_rados.cc | 10734 +++++++++++++++++++++++++++++++ src/rgw/rgw_rados.h | 2633 ++++++++ src/rgw/rgw_realm_reloader.cc | 176 + src/rgw/rgw_realm_reloader.h | 63 + src/rgw/rgw_realm_watcher.cc | 148 + src/rgw/rgw_realm_watcher.h | 69 + src/rgw/rgw_request.h | 64 + src/rgw/rgw_reshard.cc | 1177 ++++ src/rgw/rgw_reshard.h | 211 + src/rgw/rgw_resolve.cc | 44 + src/rgw/rgw_resolve.h | 27 + src/rgw/rgw_rest.cc | 2302 +++++++ src/rgw/rgw_rest.h | 816 +++ src/rgw/rgw_rest_admin.h | 15 + src/rgw/rgw_rest_bucket.cc | 350 + src/rgw/rgw_rest_bucket.h | 38 + src/rgw/rgw_rest_client.cc | 999 +++ src/rgw/rgw_rest_client.h | 226 + src/rgw/rgw_rest_config.cc | 85 + src/rgw/rgw_rest_config.h | 88 + src/rgw/rgw_rest_conn.cc | 466 ++ src/rgw/rgw_rest_conn.h | 521 ++ src/rgw/rgw_rest_iam.cc | 147 + src/rgw/rgw_rest_iam.h | 49 + src/rgw/rgw_rest_log.cc | 1060 +++ src/rgw/rgw_rest_log.h | 336 + src/rgw/rgw_rest_metadata.cc | 363 ++ src/rgw/rgw_rest_metadata.h | 135 + src/rgw/rgw_rest_pubsub.cc | 729 +++ src/rgw/rgw_rest_pubsub.h | 41 + src/rgw/rgw_rest_pubsub_common.cc | 259 + src/rgw/rgw_rest_pubsub_common.h | 287 + src/rgw/rgw_rest_realm.cc | 367 ++ src/rgw/rgw_rest_realm.h | 18 + src/rgw/rgw_rest_role.cc | 489 ++ src/rgw/rgw_rest_role.h | 131 + src/rgw/rgw_rest_s3.cc | 5133 +++++++++++++++ src/rgw/rgw_rest_s3.h | 1045 +++ src/rgw/rgw_rest_s3website.h | 103 + src/rgw/rgw_rest_sts.cc | 459 ++ src/rgw/rgw_rest_sts.h | 202 + src/rgw/rgw_rest_swift.cc | 3093 +++++++++ src/rgw/rgw_rest_swift.h | 681 ++ src/rgw/rgw_rest_usage.cc | 108 + src/rgw/rgw_rest_usage.h | 36 + src/rgw/rgw_rest_user.cc | 999 +++ src/rgw/rgw_rest_user.h | 38 + src/rgw/rgw_rest_user_policy.cc | 363 ++ src/rgw/rgw_rest_user_policy.h | 76 + src/rgw/rgw_role.cc | 502 ++ src/rgw/rgw_role.h | 161 + src/rgw/rgw_service.cc | 191 + src/rgw/rgw_service.h | 112 + src/rgw/rgw_string.cc | 45 + src/rgw/rgw_string.h | 236 + src/rgw/rgw_sts.cc | 427 ++ src/rgw/rgw_sts.h | 222 + 
src/rgw/rgw_swift_auth.cc | 759 +++ src/rgw/rgw_swift_auth.h | 341 + src/rgw/rgw_sync.cc | 3136 +++++++++ src/rgw/rgw_sync.h | 534 ++ src/rgw/rgw_sync_counters.cc | 28 + src/rgw/rgw_sync_counters.h | 25 + src/rgw/rgw_sync_log_trim.cc | 1094 ++++ src/rgw/rgw_sync_log_trim.h | 110 + src/rgw/rgw_sync_module.cc | 91 + src/rgw/rgw_sync_module.h | 197 + src/rgw/rgw_sync_module_aws.cc | 1807 ++++++ src/rgw/rgw_sync_module_aws.h | 111 + src/rgw/rgw_sync_module_es.cc | 918 +++ src/rgw/rgw_sync_module_es.h | 62 + src/rgw/rgw_sync_module_es_rest.cc | 423 ++ src/rgw/rgw_sync_module_es_rest.h | 20 + src/rgw/rgw_sync_module_log.cc | 74 + src/rgw/rgw_sync_module_log.h | 18 + src/rgw/rgw_sync_module_pubsub.cc | 1578 +++++ src/rgw/rgw_sync_module_pubsub.h | 40 + src/rgw/rgw_sync_module_pubsub_rest.cc | 526 ++ src/rgw/rgw_sync_module_pubsub_rest.h | 13 + src/rgw/rgw_sync_trace.cc | 288 + src/rgw/rgw_sync_trace.h | 142 + src/rgw/rgw_tag.cc | 59 + src/rgw/rgw_tag.h | 46 + src/rgw/rgw_tag_s3.cc | 65 + src/rgw/rgw_tag_s3.h | 53 + src/rgw/rgw_tar.h | 156 + src/rgw/rgw_token.cc | 143 + src/rgw/rgw_token.h | 169 + src/rgw/rgw_tools.cc | 527 ++ src/rgw/rgw_tools.h | 202 + src/rgw/rgw_torrent.cc | 266 + src/rgw/rgw_torrent.h | 142 + src/rgw/rgw_url.cc | 49 + src/rgw/rgw_url.h | 12 + src/rgw/rgw_usage.cc | 151 + src/rgw/rgw_usage.h | 30 + src/rgw/rgw_user.cc | 2958 +++++++++ src/rgw/rgw_user.h | 774 +++ src/rgw/rgw_web_idp.h | 29 + src/rgw/rgw_website.cc | 127 + src/rgw/rgw_website.h | 246 + src/rgw/rgw_xml.cc | 500 ++ src/rgw/rgw_xml.h | 352 + src/rgw/rgw_xml_enc.cc | 152 + src/rgw/rgw_zone.cc | 1937 ++++++ src/rgw/rgw_zone.h | 1145 ++++ src/rgw/services/svc_finisher.cc | 53 + src/rgw/services/svc_finisher.h | 45 + src/rgw/services/svc_notify.cc | 484 ++ src/rgw/services/svc_notify.h | 100 + src/rgw/services/svc_quota.cc | 15 + src/rgw/services/svc_quota.h | 23 + src/rgw/services/svc_rados.cc | 308 + src/rgw/services/svc_rados.h | 178 + src/rgw/services/svc_sync_modules.cc | 15 + 
src/rgw/services/svc_sync_modules.h | 26 + src/rgw/services/svc_sys_obj.cc | 192 + src/rgw/services/svc_sys_obj.h | 275 + src/rgw/services/svc_sys_obj_cache.cc | 506 ++ src/rgw/services/svc_sys_obj_cache.h | 176 + src/rgw/services/svc_sys_obj_core.cc | 595 ++ src/rgw/services/svc_sys_obj_core.h | 201 + src/rgw/services/svc_zone.cc | 1250 ++++ src/rgw/services/svc_zone.h | 134 + src/rgw/services/svc_zone_utils.cc | 59 + src/rgw/services/svc_zone_utils.h | 39 + 300 files changed, 154611 insertions(+) create mode 100644 src/rgw/CMakeLists.txt create mode 100644 src/rgw/librgw.cc create mode 100644 src/rgw/librgw_admin_user.cc create mode 100755 src/rgw/rgw-orphan-list create mode 100644 src/rgw/rgw_acl.cc create mode 100644 src/rgw/rgw_acl.h create mode 100644 src/rgw/rgw_acl_s3.cc create mode 100644 src/rgw/rgw_acl_s3.h create mode 100644 src/rgw/rgw_acl_swift.cc create mode 100644 src/rgw/rgw_acl_swift.h create mode 100644 src/rgw/rgw_admin.cc create mode 100644 src/rgw/rgw_admin_user.cc create mode 100644 src/rgw/rgw_admin_user.h create mode 100644 src/rgw/rgw_aio.h create mode 100644 src/rgw/rgw_aio_throttle.cc create mode 100644 src/rgw/rgw_aio_throttle.h create mode 100644 src/rgw/rgw_amqp.cc create mode 100644 src/rgw/rgw_amqp.h create mode 100644 src/rgw/rgw_arn.cc create mode 100644 src/rgw/rgw_arn.h create mode 100644 src/rgw/rgw_asio_client.cc create mode 100644 src/rgw/rgw_asio_client.h create mode 100644 src/rgw/rgw_asio_frontend.cc create mode 100644 src/rgw/rgw_asio_frontend.h create mode 100644 src/rgw/rgw_auth.cc create mode 100644 src/rgw/rgw_auth.h create mode 100644 src/rgw/rgw_auth_filters.h create mode 100644 src/rgw/rgw_auth_keystone.cc create mode 100644 src/rgw/rgw_auth_keystone.h create mode 100644 src/rgw/rgw_auth_registry.h create mode 100644 src/rgw/rgw_auth_s3.cc create mode 100644 src/rgw/rgw_auth_s3.h create mode 100644 src/rgw/rgw_b64.h create mode 100644 src/rgw/rgw_basic_types.cc create mode 100644 src/rgw/rgw_basic_types.h create 
mode 100644 src/rgw/rgw_bucket.cc create mode 100644 src/rgw/rgw_bucket.h create mode 100644 src/rgw/rgw_cache.cc create mode 100644 src/rgw/rgw_cache.h create mode 100644 src/rgw/rgw_civetweb.cc create mode 100644 src/rgw/rgw_civetweb.h create mode 100644 src/rgw/rgw_civetweb_frontend.cc create mode 100644 src/rgw/rgw_civetweb_log.cc create mode 100644 src/rgw/rgw_civetweb_log.h create mode 100644 src/rgw/rgw_client_io.cc create mode 100644 src/rgw/rgw_client_io.h create mode 100644 src/rgw/rgw_client_io_filters.h create mode 100644 src/rgw/rgw_common.cc create mode 100644 src/rgw/rgw_common.h create mode 100644 src/rgw/rgw_compression.cc create mode 100644 src/rgw/rgw_compression.h create mode 100644 src/rgw/rgw_coroutine.cc create mode 100644 src/rgw/rgw_coroutine.h create mode 100644 src/rgw/rgw_cors.cc create mode 100644 src/rgw/rgw_cors.h create mode 100644 src/rgw/rgw_cors_s3.cc create mode 100644 src/rgw/rgw_cors_s3.h create mode 100644 src/rgw/rgw_cors_swift.h create mode 100644 src/rgw/rgw_cr_rados.cc create mode 100644 src/rgw/rgw_cr_rados.h create mode 100644 src/rgw/rgw_cr_rest.cc create mode 100644 src/rgw/rgw_cr_rest.h create mode 100644 src/rgw/rgw_cr_tools.cc create mode 100644 src/rgw/rgw_cr_tools.h create mode 100644 src/rgw/rgw_crypt.cc create mode 100644 src/rgw/rgw_crypt.h create mode 100644 src/rgw/rgw_crypt_sanitize.cc create mode 100644 src/rgw/rgw_crypt_sanitize.h create mode 100644 src/rgw/rgw_data_sync.cc create mode 100644 src/rgw/rgw_data_sync.h create mode 100644 src/rgw/rgw_dencoder.cc create mode 100644 src/rgw/rgw_dmclock.h create mode 100644 src/rgw/rgw_dmclock_async_scheduler.cc create mode 100644 src/rgw/rgw_dmclock_async_scheduler.h create mode 100644 src/rgw/rgw_dmclock_scheduler.h create mode 100644 src/rgw/rgw_dmclock_scheduler_ctx.cc create mode 100644 src/rgw/rgw_dmclock_scheduler_ctx.h create mode 100644 src/rgw/rgw_dmclock_sync_scheduler.cc create mode 100644 src/rgw/rgw_dmclock_sync_scheduler.h create mode 100644 
src/rgw/rgw_env.cc create mode 100644 src/rgw/rgw_es_main.cc create mode 100644 src/rgw/rgw_es_query.cc create mode 100644 src/rgw/rgw_es_query.h create mode 100644 src/rgw/rgw_etag_verifier.cc create mode 100644 src/rgw/rgw_etag_verifier.h create mode 100644 src/rgw/rgw_fcgi.cc create mode 100644 src/rgw/rgw_fcgi.h create mode 100644 src/rgw/rgw_fcgi_process.cc create mode 100644 src/rgw/rgw_file.cc create mode 100644 src/rgw/rgw_file.h create mode 100644 src/rgw/rgw_formats.cc create mode 100644 src/rgw/rgw_formats.h create mode 100644 src/rgw/rgw_frontend.cc create mode 100644 src/rgw/rgw_frontend.h create mode 100644 src/rgw/rgw_gc.cc create mode 100644 src/rgw/rgw_gc.h create mode 100644 src/rgw/rgw_http_client.cc create mode 100644 src/rgw/rgw_http_client.h create mode 100644 src/rgw/rgw_http_client_curl.cc create mode 100644 src/rgw/rgw_http_client_curl.h create mode 100644 src/rgw/rgw_http_errors.h create mode 100644 src/rgw/rgw_iam_policy.cc create mode 100644 src/rgw/rgw_iam_policy.h create mode 100644 src/rgw/rgw_iam_policy_keywords.gperf create mode 100644 src/rgw/rgw_iam_policy_keywords.h create mode 100644 src/rgw/rgw_json_enc.cc create mode 100644 src/rgw/rgw_jsonparser.cc create mode 100644 src/rgw/rgw_kafka.cc create mode 100644 src/rgw/rgw_kafka.h create mode 100644 src/rgw/rgw_keystone.cc create mode 100644 src/rgw/rgw_keystone.h create mode 100644 src/rgw/rgw_lc.cc create mode 100644 src/rgw/rgw_lc.h create mode 100644 src/rgw/rgw_lc_s3.cc create mode 100644 src/rgw/rgw_lc_s3.h create mode 100644 src/rgw/rgw_ldap.cc create mode 100644 src/rgw/rgw_ldap.h create mode 100644 src/rgw/rgw_lib.h create mode 100644 src/rgw/rgw_lib_frontend.h create mode 100644 src/rgw/rgw_loadgen.cc create mode 100644 src/rgw/rgw_loadgen.h create mode 100644 src/rgw/rgw_loadgen_process.cc create mode 100644 src/rgw/rgw_log.cc create mode 100644 src/rgw/rgw_log.h create mode 100644 src/rgw/rgw_main.cc create mode 100644 src/rgw/rgw_meta_sync_status.h create mode 100644 
src/rgw/rgw_metadata.cc create mode 100644 src/rgw/rgw_metadata.h create mode 100644 src/rgw/rgw_multi.cc create mode 100644 src/rgw/rgw_multi.h create mode 100644 src/rgw/rgw_multi_del.cc create mode 100644 src/rgw/rgw_multi_del.h create mode 100644 src/rgw/rgw_multiparser.cc create mode 100644 src/rgw/rgw_notify.cc create mode 100644 src/rgw/rgw_notify.h create mode 100644 src/rgw/rgw_notify_event_type.cc create mode 100644 src/rgw/rgw_notify_event_type.h create mode 100644 src/rgw/rgw_object_expirer.cc create mode 100644 src/rgw/rgw_object_expirer_core.cc create mode 100644 src/rgw/rgw_object_expirer_core.h create mode 100644 src/rgw/rgw_object_lock.cc create mode 100644 src/rgw/rgw_object_lock.h create mode 100644 src/rgw/rgw_op.cc create mode 100644 src/rgw/rgw_op.h create mode 100644 src/rgw/rgw_opa.cc create mode 100644 src/rgw/rgw_opa.h create mode 100644 src/rgw/rgw_orphan.cc create mode 100644 src/rgw/rgw_orphan.h create mode 100644 src/rgw/rgw_os_lib.cc create mode 100644 src/rgw/rgw_os_lib.h create mode 100644 src/rgw/rgw_otp.cc create mode 100644 src/rgw/rgw_otp.h create mode 100644 src/rgw/rgw_perf_counters.cc create mode 100644 src/rgw/rgw_perf_counters.h create mode 100644 src/rgw/rgw_period_history.cc create mode 100644 src/rgw/rgw_period_history.h create mode 100644 src/rgw/rgw_period_puller.cc create mode 100644 src/rgw/rgw_period_puller.h create mode 100644 src/rgw/rgw_period_pusher.cc create mode 100644 src/rgw/rgw_period_pusher.h create mode 100644 src/rgw/rgw_policy_s3.cc create mode 100644 src/rgw/rgw_policy_s3.h create mode 100644 src/rgw/rgw_process.cc create mode 100644 src/rgw/rgw_process.h create mode 100644 src/rgw/rgw_pubsub.cc create mode 100644 src/rgw/rgw_pubsub.h create mode 100644 src/rgw/rgw_pubsub_push.cc create mode 100644 src/rgw/rgw_pubsub_push.h create mode 100644 src/rgw/rgw_putobj.cc create mode 100644 src/rgw/rgw_putobj.h create mode 100644 src/rgw/rgw_putobj_processor.cc create mode 100644 src/rgw/rgw_putobj_processor.h 
create mode 100644 src/rgw/rgw_quota.cc create mode 100644 src/rgw/rgw_quota.h create mode 100644 src/rgw/rgw_rados.cc create mode 100644 src/rgw/rgw_rados.h create mode 100644 src/rgw/rgw_realm_reloader.cc create mode 100644 src/rgw/rgw_realm_reloader.h create mode 100644 src/rgw/rgw_realm_watcher.cc create mode 100644 src/rgw/rgw_realm_watcher.h create mode 100644 src/rgw/rgw_request.h create mode 100644 src/rgw/rgw_reshard.cc create mode 100644 src/rgw/rgw_reshard.h create mode 100644 src/rgw/rgw_resolve.cc create mode 100644 src/rgw/rgw_resolve.h create mode 100644 src/rgw/rgw_rest.cc create mode 100644 src/rgw/rgw_rest.h create mode 100644 src/rgw/rgw_rest_admin.h create mode 100644 src/rgw/rgw_rest_bucket.cc create mode 100644 src/rgw/rgw_rest_bucket.h create mode 100644 src/rgw/rgw_rest_client.cc create mode 100644 src/rgw/rgw_rest_client.h create mode 100644 src/rgw/rgw_rest_config.cc create mode 100644 src/rgw/rgw_rest_config.h create mode 100644 src/rgw/rgw_rest_conn.cc create mode 100644 src/rgw/rgw_rest_conn.h create mode 100644 src/rgw/rgw_rest_iam.cc create mode 100644 src/rgw/rgw_rest_iam.h create mode 100644 src/rgw/rgw_rest_log.cc create mode 100644 src/rgw/rgw_rest_log.h create mode 100644 src/rgw/rgw_rest_metadata.cc create mode 100644 src/rgw/rgw_rest_metadata.h create mode 100644 src/rgw/rgw_rest_pubsub.cc create mode 100644 src/rgw/rgw_rest_pubsub.h create mode 100644 src/rgw/rgw_rest_pubsub_common.cc create mode 100644 src/rgw/rgw_rest_pubsub_common.h create mode 100644 src/rgw/rgw_rest_realm.cc create mode 100644 src/rgw/rgw_rest_realm.h create mode 100644 src/rgw/rgw_rest_role.cc create mode 100644 src/rgw/rgw_rest_role.h create mode 100644 src/rgw/rgw_rest_s3.cc create mode 100644 src/rgw/rgw_rest_s3.h create mode 100644 src/rgw/rgw_rest_s3website.h create mode 100644 src/rgw/rgw_rest_sts.cc create mode 100644 src/rgw/rgw_rest_sts.h create mode 100644 src/rgw/rgw_rest_swift.cc create mode 100644 src/rgw/rgw_rest_swift.h create mode 100644 
src/rgw/rgw_rest_usage.cc create mode 100644 src/rgw/rgw_rest_usage.h create mode 100644 src/rgw/rgw_rest_user.cc create mode 100644 src/rgw/rgw_rest_user.h create mode 100644 src/rgw/rgw_rest_user_policy.cc create mode 100644 src/rgw/rgw_rest_user_policy.h create mode 100644 src/rgw/rgw_role.cc create mode 100644 src/rgw/rgw_role.h create mode 100644 src/rgw/rgw_service.cc create mode 100644 src/rgw/rgw_service.h create mode 100644 src/rgw/rgw_string.cc create mode 100644 src/rgw/rgw_string.h create mode 100644 src/rgw/rgw_sts.cc create mode 100644 src/rgw/rgw_sts.h create mode 100644 src/rgw/rgw_swift_auth.cc create mode 100644 src/rgw/rgw_swift_auth.h create mode 100644 src/rgw/rgw_sync.cc create mode 100644 src/rgw/rgw_sync.h create mode 100644 src/rgw/rgw_sync_counters.cc create mode 100644 src/rgw/rgw_sync_counters.h create mode 100644 src/rgw/rgw_sync_log_trim.cc create mode 100644 src/rgw/rgw_sync_log_trim.h create mode 100644 src/rgw/rgw_sync_module.cc create mode 100644 src/rgw/rgw_sync_module.h create mode 100644 src/rgw/rgw_sync_module_aws.cc create mode 100644 src/rgw/rgw_sync_module_aws.h create mode 100644 src/rgw/rgw_sync_module_es.cc create mode 100644 src/rgw/rgw_sync_module_es.h create mode 100644 src/rgw/rgw_sync_module_es_rest.cc create mode 100644 src/rgw/rgw_sync_module_es_rest.h create mode 100644 src/rgw/rgw_sync_module_log.cc create mode 100644 src/rgw/rgw_sync_module_log.h create mode 100644 src/rgw/rgw_sync_module_pubsub.cc create mode 100644 src/rgw/rgw_sync_module_pubsub.h create mode 100644 src/rgw/rgw_sync_module_pubsub_rest.cc create mode 100644 src/rgw/rgw_sync_module_pubsub_rest.h create mode 100644 src/rgw/rgw_sync_trace.cc create mode 100644 src/rgw/rgw_sync_trace.h create mode 100644 src/rgw/rgw_tag.cc create mode 100644 src/rgw/rgw_tag.h create mode 100644 src/rgw/rgw_tag_s3.cc create mode 100644 src/rgw/rgw_tag_s3.h create mode 100644 src/rgw/rgw_tar.h create mode 100644 src/rgw/rgw_token.cc create mode 100644 
src/rgw/rgw_token.h create mode 100644 src/rgw/rgw_tools.cc create mode 100644 src/rgw/rgw_tools.h create mode 100644 src/rgw/rgw_torrent.cc create mode 100644 src/rgw/rgw_torrent.h create mode 100644 src/rgw/rgw_url.cc create mode 100644 src/rgw/rgw_url.h create mode 100644 src/rgw/rgw_usage.cc create mode 100644 src/rgw/rgw_usage.h create mode 100644 src/rgw/rgw_user.cc create mode 100644 src/rgw/rgw_user.h create mode 100644 src/rgw/rgw_web_idp.h create mode 100644 src/rgw/rgw_website.cc create mode 100644 src/rgw/rgw_website.h create mode 100755 src/rgw/rgw_xml.cc create mode 100644 src/rgw/rgw_xml.h create mode 100644 src/rgw/rgw_xml_enc.cc create mode 100644 src/rgw/rgw_zone.cc create mode 100644 src/rgw/rgw_zone.h create mode 100644 src/rgw/services/svc_finisher.cc create mode 100644 src/rgw/services/svc_finisher.h create mode 100644 src/rgw/services/svc_notify.cc create mode 100644 src/rgw/services/svc_notify.h create mode 100644 src/rgw/services/svc_quota.cc create mode 100644 src/rgw/services/svc_quota.h create mode 100644 src/rgw/services/svc_rados.cc create mode 100644 src/rgw/services/svc_rados.h create mode 100644 src/rgw/services/svc_sync_modules.cc create mode 100644 src/rgw/services/svc_sync_modules.h create mode 100644 src/rgw/services/svc_sys_obj.cc create mode 100644 src/rgw/services/svc_sys_obj.h create mode 100644 src/rgw/services/svc_sys_obj_cache.cc create mode 100644 src/rgw/services/svc_sys_obj_cache.h create mode 100644 src/rgw/services/svc_sys_obj_core.cc create mode 100644 src/rgw/services/svc_sys_obj_core.h create mode 100644 src/rgw/services/svc_zone.cc create mode 100644 src/rgw/services/svc_zone.h create mode 100644 src/rgw/services/svc_zone_utils.cc create mode 100644 src/rgw/services/svc_zone_utils.h (limited to 'src/rgw') diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt new file mode 100644 index 00000000..12f831fb --- /dev/null +++ b/src/rgw/CMakeLists.txt @@ -0,0 +1,407 @@ +add_custom_target(civetweb_h + COMMAND 
${CMAKE_COMMAND} -E make_directory + "${CMAKE_BINARY_DIR}/src/include/civetweb" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_SOURCE_DIR}/src/civetweb/include/civetweb.h" + "${CMAKE_BINARY_DIR}/src/include/civetweb" + COMMENT "keep civetweb.h up-to-date") + +find_program(GPERF gperf) +if(NOT GPERF) + message(FATAL_ERROR "Can't find gperf") +endif() +function(gperf_generate input output) + add_custom_command( + OUTPUT ${output} + COMMAND ${GPERF} ${input} | sed "s/register //g" > ${output} + DEPENDS ${input} + COMMENT "Generate ${output}" + ) +endfunction() + +if(Boost_VERSION VERSION_GREATER 1.73) + add_definitions(-DBOOST_ASIO_USE_TS_EXECUTOR_AS_DEFAULT) +endif() + +set(librgw_common_srcs + services/svc_finisher.cc + services/svc_notify.cc + services/svc_quota.cc + services/svc_sync_modules.cc + services/svc_rados.cc + services/svc_sys_obj.cc + services/svc_sys_obj_cache.cc + services/svc_sys_obj_core.cc + services/svc_zone.cc + services/svc_zone_utils.cc + rgw_service.cc + rgw_acl.cc + rgw_acl_s3.cc + rgw_acl_swift.cc + rgw_aio_throttle.cc + rgw_auth.cc + rgw_auth_s3.cc + rgw_arn.cc + rgw_basic_types.cc + rgw_bucket.cc + rgw_cache.cc + rgw_common.cc + rgw_compression.cc + rgw_etag_verifier.cc + rgw_cors.cc + rgw_cors_s3.cc + rgw_dencoder.cc + rgw_env.cc + rgw_es_query.cc + rgw_formats.cc + rgw_gc.cc + rgw_http_client.cc + rgw_json_enc.cc + rgw_keystone.cc + rgw_ldap.cc + rgw_lc.cc + rgw_lc_s3.cc + rgw_metadata.cc + rgw_multi.cc + rgw_multi_del.cc + rgw_pubsub.cc + rgw_sync.cc + rgw_data_sync.cc + rgw_sync_counters.cc + rgw_sync_module.cc + rgw_sync_module_aws.cc + rgw_sync_module_es.cc + rgw_sync_module_es_rest.cc + rgw_sync_module_log.cc + rgw_sync_module_pubsub.cc + rgw_pubsub_push.cc + rgw_notify.cc + rgw_notify_event_type.cc + rgw_sync_module_pubsub_rest.cc + rgw_sync_log_trim.cc + rgw_sync_trace.cc + rgw_period_history.cc + rgw_period_puller.cc + rgw_reshard.cc + rgw_coroutine.cc + rgw_cr_rados.cc + rgw_cr_rest.cc + rgw_cr_tools.cc + 
rgw_object_expirer_core.cc + rgw_op.cc + rgw_otp.cc + rgw_policy_s3.cc + rgw_putobj.cc + rgw_putobj_processor.cc + rgw_quota.cc + rgw_rados.cc + rgw_resolve.cc + rgw_rest.cc + rgw_rest_client.cc + rgw_rest_conn.cc + rgw_rest_log.cc + rgw_rest_metadata.cc + rgw_rest_pubsub.cc + rgw_rest_pubsub_common.cc + rgw_rest_realm.cc + rgw_rest_role.cc + rgw_rest_s3.cc + rgw_role.cc + rgw_string.cc + rgw_tag.cc + rgw_tag_s3.cc + rgw_tools.cc + rgw_user.cc + rgw_website.cc + rgw_xml.cc + rgw_xml_enc.cc + rgw_torrent.cc + rgw_crypt.cc + rgw_crypt_sanitize.cc + rgw_iam_policy.cc + rgw_rest_user_policy.cc + rgw_zone.cc + rgw_sts.cc + rgw_rest_sts.cc + rgw_perf_counters.cc + rgw_object_lock.cc + rgw_rest_iam.cc + rgw_url.cc) + +if(WITH_RADOSGW_AMQP_ENDPOINT) + find_package(RabbitMQ REQUIRED) +endif() +if(WITH_RADOSGW_KAFKA_ENDPOINT) + find_package(RDKafka 1.9.2) + if(NOT RDKafka_FOUND) + set(WITH_RADOSGW_KAFKA_ENDPOINT OFF CACHE BOOL "Rados Gateway's pubsub support for Kafka push endpoint" FORCE) + message(STATUS "Disabling Kafka endpoint support") + endif() +endif() + +if(WITH_RADOSGW_AMQP_ENDPOINT) + list(APPEND librgw_common_srcs rgw_amqp.cc) +endif() +if(WITH_RADOSGW_KAFKA_ENDPOINT) + list(APPEND librgw_common_srcs rgw_kafka.cc) +endif() + +add_library(rgw_common OBJECT ${librgw_common_srcs}) + +target_include_directories(rgw_common SYSTEM PUBLIC "services") +target_include_directories(rgw_common PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src") + +if(WITH_LTTNG) + # rgw/rgw_op.cc includes "tracing/rgw_op.h" + # rgw/rgw_rados.cc includes "tracing/rgw_rados.h" + add_dependencies(rgw_common rgw_op-tp rgw_rados-tp) +endif() + +set(rgw_a_srcs + rgw_auth_keystone.cc + rgw_client_io.cc + rgw_frontend.cc + rgw_http_client_curl.cc + rgw_loadgen.cc + rgw_log.cc + rgw_period_pusher.cc + rgw_realm_reloader.cc + rgw_realm_watcher.cc + rgw_os_lib.cc + rgw_process.cc + rgw_rest_bucket.cc + rgw_rest_config.cc + rgw_rest_log.cc + rgw_rest_metadata.cc + rgw_rest_realm.cc + rgw_rest_swift.cc 
+ rgw_rest_usage.cc + rgw_rest_user.cc + rgw_swift_auth.cc + rgw_usage.cc + rgw_opa.cc + rgw_sts.cc + rgw_rest_sts.cc) + +gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf + rgw_iam_policy_keywords.frag.cc) +set_source_files_properties(rgw_iam_policy.cc PROPERTIES + OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/rgw/rgw_iam_policy_keywords.frag.cc + COMPILE_FLAGS -I${CMAKE_BINARY_DIR}/src/rgw) + + +if (WITH_RADOSGW_FCGI_FRONTEND) + list(APPEND rgw_a_srcs rgw_fcgi.cc) +endif() + +add_library(rgw_a STATIC + ${rgw_a_srcs} + $) + +add_dependencies(rgw_a civetweb_h) + +target_include_directories(rgw_a PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src") +target_include_directories(rgw_a SYSTEM PUBLIC "../rapidjson/include") + +target_link_libraries(rgw_a + PRIVATE + librados cls_otp_client cls_lock_client cls_rgw_client cls_refcount_client + cls_log_client cls_timeindex_client cls_version_client + cls_user_client ceph-common common_utf8 global + ${CURL_LIBRARIES} + ${EXPAT_LIBRARIES} + ${OPENLDAP_LIBRARIES} ${CRYPTO_LIBS} + OATH::OATH) + +if(WITH_CURL_OPENSSL) + # used by rgw_http_client_curl.cc + target_link_libraries(rgw_a PRIVATE OpenSSL::Crypto) +endif() + +if(WITH_BOOST_CONTEXT) + target_link_libraries(rgw_a PRIVATE Boost::coroutine Boost::context) +endif() + +set(rgw_libs rgw_a) +if(WITH_RADOSGW_AMQP_ENDPOINT) + # used by rgw_amqp.cc + list(APPEND rgw_libs RabbitMQ::RabbitMQ) +endif() +if(WITH_RADOSGW_KAFKA_ENDPOINT) + # used by rgw_kafka.cc + list(APPEND rgw_libs RDKafka::RDKafka) +endif() + +set(radosgw_srcs + rgw_loadgen_process.cc + rgw_civetweb.cc + rgw_civetweb_frontend.cc + rgw_civetweb_log.cc + rgw_dmclock_scheduler_ctx.cc + rgw_dmclock_sync_scheduler.cc) + +if (WITH_RADOSGW_FCGI_FRONTEND) + list(APPEND radosgw_srcs rgw_fcgi_process.cc) +endif() + +if(WITH_RADOSGW_BEAST_FRONTEND) + list(APPEND radosgw_srcs + rgw_asio_client.cc + rgw_asio_frontend.cc + rgw_dmclock_async_scheduler.cc) +endif() + +add_library(radosgw_a STATIC ${radosgw_srcs} + 
$) +target_link_libraries(radosgw_a PRIVATE ${rgw_libs}) +if(WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) + # used by rgw_asio_frontend.cc + target_link_libraries(radosgw_a PRIVATE OpenSSL::SSL) +endif() + +add_executable(radosgw rgw_main.cc) +target_link_libraries(radosgw radosgw_a librados + cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global dmclock::dmclock + ${FCGI_LIBRARY} ${LIB_RESOLV} + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES} + ${ALLOC_LIBS}) +install(TARGETS radosgw DESTINATION bin) + +set(radosgw_admin_srcs + rgw_admin.cc + rgw_orphan.cc) +add_executable(radosgw-admin ${radosgw_admin_srcs}) +target_link_libraries(radosgw-admin ${rgw_libs} librados + cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global ${FCGI_LIBRARY} ${LIB_RESOLV} + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES}) +install(TARGETS radosgw-admin DESTINATION bin) + +set(radosgw_es_srcs + rgw_es_main.cc) +add_executable(radosgw-es ${radosgw_es_srcs}) +target_link_libraries(radosgw-es ${rgw_libs} librados + cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global ${FCGI_LIBRARY} ${LIB_RESOLV} + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES}) +install(TARGETS radosgw-es DESTINATION bin) + +set(radosgw_token_srcs + rgw_token.cc) +add_executable(radosgw-token ${radosgw_token_srcs}) +target_link_libraries(radosgw-token librados + global ${ALLOC_LIBS}) +install(TARGETS radosgw-token DESTINATION bin) + +set(radosgw_object_expirer_srcs + rgw_object_expirer.cc) +add_executable(radosgw-object-expirer ${radosgw_object_expirer_srcs}) +target_link_libraries(radosgw-object-expirer ${rgw_libs} librados + cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client + 
cls_log_client cls_timeindex_client + cls_version_client cls_user_client + global ${FCGI_LIBRARY} ${LIB_RESOLV} + ${CURL_LIBRARIES} ${EXPAT_LIBRARIES}) +install(TARGETS radosgw-object-expirer DESTINATION bin) + +set(librgw_srcs + librgw.cc + rgw_file.cc) +add_library(rgw SHARED ${librgw_srcs}) +target_link_libraries(rgw + PRIVATE + ${rgw_libs} + librados + cls_rgw_client + cls_otp_client + cls_lock_client + cls_refcount_client + cls_log_client + cls_timeindex_client + cls_version_client + cls_user_client + global + ${LIB_RESOLV} + ${CURL_LIBRARIES} + ${EXPAT_LIBRARIES} + PUBLIC + dmclock::dmclock) + +if(WITH_RADOSGW_AMQP_ENDPOINT) + target_link_libraries(rgw PRIVATE RabbitMQ::RabbitMQ) +endif() + +if(WITH_RADOSGW_KAFKA_ENDPOINT) + target_link_libraries(rgw PRIVATE RDKafka::RDKafka) +endif() + +set_target_properties(rgw PROPERTIES OUTPUT_NAME rgw VERSION 2.0.0 + SOVERSION 2) +install(TARGETS rgw DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +set(librgw_admin_user_srcs + librgw_admin_user.cc + rgw_admin_user.cc +) +add_library(rgw_admin_user SHARED + ${librgw_admin_user_srcs} + $) + +add_dependencies(rgw_admin_user civetweb_h) + +target_link_libraries(rgw_admin_user PRIVATE + librados + cls_rgw_client + cls_otp_client + cls_lock_client + cls_refcount_client + cls_log_client + cls_timeindex_client + cls_version_client + cls_user_client + global + ${CURL_LIBRARIES} + ${EXPAT_LIBRARIES} + ${OPENLDAP_LIBRARIES} + dmclock::dmclock) +set_target_properties(rgw_admin_user PROPERTIES OUTPUT_NAME rgw_admin_user VERSION 1.0.0 + SOVERSION 0) +install(TARGETS rgw_admin_user DESTINATION ${CMAKE_INSTALL_LIBDIR}) +if(WITH_RADOSGW_AMQP_ENDPOINT) + target_link_libraries(rgw_admin_user PRIVATE RabbitMQ::RabbitMQ) +endif() +if(WITH_RADOSGW_KAFKA_ENDPOINT) + target_link_libraries(rgw_admin_user PRIVATE RDKafka::RDKafka) +endif() +if(WITH_BOOST_CONTEXT) + target_link_libraries(rgw_admin_user PRIVATE Boost::coroutine Boost::context) +endif() + +if(WITH_TESTS) + add_executable(ceph_rgw_jsonparser 
+ rgw_jsonparser.cc) + target_link_libraries(ceph_rgw_jsonparser + ${rgw_libs} + global) + + add_executable(ceph_rgw_multiparser + rgw_multiparser.cc) + target_link_libraries(ceph_rgw_multiparser + ${rgw_libs} + global) + + install(TARGETS + ceph_rgw_jsonparser + ceph_rgw_multiparser + DESTINATION bin) +endif(WITH_TESTS) + +install(PROGRAMS rgw-orphan-list + DESTINATION bin) diff --git a/src/rgw/librgw.cc b/src/rgw/librgw.cc new file mode 100644 index 00000000..1dd88982 --- /dev/null +++ b/src/rgw/librgw.cc @@ -0,0 +1,729 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include +#include +#include + +#include "include/types.h" +#include "include/rados/librgw.h" +#include "rgw/rgw_acl_s3.h" +#include "rgw_acl.h" + +#include "include/str_list.h" +#include "include/stringify.h" +#include "global/global_init.h" +#include "global/signal_handler.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" +#include "common/ceph_argparse.h" +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/dout.h" + +#include "rgw_rados.h" +#include "rgw_resolve.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_rest_user.h" +#include "rgw_rest_s3.h" +#include "rgw_os_lib.h" +#include "rgw_auth.h" +#include "rgw_auth_s3.h" +#include "rgw_lib.h" +#include "rgw_lib_frontend.h" +#include "rgw_http_client.h" +#include "rgw_http_client_curl.h" +#include "rgw_perf_counters.h" + +#include 
+// NOTE(review): the three header names below were lost in text extraction;
+// <string> and <mutex> are restored from usage in this file, <thread> is a
+// guess -- confirm against upstream.
+#include <thread>
+#include <string>
+#include <mutex>
+
+
+#define dout_subsys ceph_subsys_rgw
+
+bool global_stop = false;
+
+/* Handler registered for SIGUSR1 in RGWLib::init(); it only logs. */
+static void handle_sigterm(int signum)
+{
+  dout(20) << __func__ << " SIGUSR1 ignored" << dendl;
+}
+
+namespace rgw {
+
+  using std::string;
+
+  /* serializes first-time initialization in librgw_create() */
+  static std::mutex librgw_mtx;
+
+  RGWLib rgwlib;
+
+  /* Timer event armed during RGWLib::init(): if it fires before init
+   * cancels it, the process logs an error and exits. */
+  class C_InitTimeout : public Context {
+  public:
+    C_InitTimeout() {}
+    void finish(int r) override {
+      derr << "Initialization timeout, failed to initialize" << dendl;
+      exit(1);
+    }
+  };
+
+  /* Drain the request work queue. */
+  void RGWLibProcess::checkpoint()
+  {
+    m_tp.drain(&req_wq);
+  }
+
+#define MIN_EXPIRE_S 120
+
+  /* Background loop: until `shutdown` is set, run gc() and update_user()
+   * on every mounted RGWLibFS, then sleep for delay_s seconds (or until
+   * `cv` is signalled). */
+  void RGWLibProcess::run()
+  {
+    /* write completion interval */
+    RGWLibFS::write_completion_interval_s =
+      cct->_conf->rgw_nfs_write_completion_interval_s;
+
+    /* start write timer */
+    RGWLibFS::write_timer.resume();
+
+    /* gc loop */
+    while (! shutdown) {
+      lsubdout(cct, rgw, 5) << "RGWLibProcess GC" << dendl;
+
+      /* dirent invalidate timeout--basically, the upper-bound on
+       * inconsistency with the S3 namespace */
+      auto expire_s = cct->_conf->rgw_nfs_namespace_expire_secs;
+
+      /* delay between gc cycles: half the expire time, clamped to
+       * [1, MIN_EXPIRE_S] seconds */
+      auto delay_s = std::max(int64_t(1), std::min(int64_t(MIN_EXPIRE_S), expire_s/2));
+
+      unique_lock uniq(mtx);
+    restart:
+      int cur_gen = gen;
+      for (auto iter = mounted_fs.begin(); iter != mounted_fs.end();
+           ++iter) {
+        RGWLibFS* fs = iter->first->ref();
+        /* the lock is dropped while the fs is processed, so the map may
+         * change underneath us; `gen` is re-checked after re-locking */
+        uniq.unlock();
+        fs->gc();
+        fs->update_user();
+        fs->rele();
+        uniq.lock();
+        if (cur_gen != gen)
+          goto restart; /* invalidated */
+      }
+      cv.wait_for(uniq, std::chrono::seconds(delay_s));
+      uniq.unlock();
+    }
+  }
+
+  /* Work-queue entry point: process the request with a fresh RGWLibIO,
+   * log a negative result, and delete the request. */
+  void RGWLibProcess::handle_request(RGWRequest* r)
+  {
+    /*
+     * invariant: valid requests are derived from RGWLibRequest
+     */
+    RGWLibRequest* req = static_cast<RGWLibRequest*>(r);
+
+    // XXX move RGWLibIO and timing setup into process_request
+
+#if 0 /* XXX */
+    utime_t tm = ceph_clock_now();
+#endif
+
+    RGWLibIO io_ctx;
+
+    int ret = process_request(req, &io_ctx);
+    if (ret < 0) {
+      /* we don't really care about return code */
+      dout(20) << "process_request() returned " << ret << dendl;
+
+    }
+    delete req;
+  } /* handle_request */
+
+  /* Synchronous variant: process the request with a fresh RGWLibIO and
+   * return the result; the request is NOT deleted here. */
+  int RGWLibProcess::process_request(RGWLibRequest* req)
+  {
+    // XXX move RGWLibIO and timing setup into process_request
+
+#if 0 /* XXX */
+    utime_t tm = ceph_clock_now();
+#endif
+
+    RGWLibIO io_ctx;
+
+    int ret = process_request(req, &io_ctx);
+    if (ret < 0) {
+      /* we don't really care about return code */
+      dout(20) << "process_request() returned " << ret << dendl;
+    }
+    return ret;
+  } /* process_request */
+
+  /* Record a failed request in the perf counters. `op` and `err_no` are
+   * currently unused. */
+  static inline void abort_req(struct req_state *s, RGWOp *op, int err_no)
+  {
+    if (!s)
+      return;
+
+    /* XXX the dump_errno and dump_bucket_from_state behaviors in
+     * the abort_early (rgw_rest.cc) might be valuable, but aren't
+     * safe to call presently as they return HTTP data */
+
+    perfcounter->inc(l_rgw_failed_req);
+  } /* abort_req */
+
+  /* Run one request through init, authorize, permission checks and
+   * execution using the supplied io context.
+   *
+   * Returns -EINVAL when no RGWOp can be derived from `req`; otherwise
+   * the first negative step result, or s->err.ret when no step failed. */
+  int RGWLibProcess::process_request(RGWLibRequest* req, RGWLibIO* io)
+  {
+    int ret = 0;
+    bool should_log = true; // XXX
+
+    dout(1) << "====== " << __func__
+            << " starting new request req=" << hex << req << dec
+            << " ======" << dendl;
+
+    /*
+     * invariant: valid requests are derived from RGWOp--well-formed
+     * requests should have assigned RGWRequest::op in their descendant
+     * constructor--if not, the compiler can find it, at the cost of
+     * a runtime check
+     */
+    RGWOp *op = (req->op) ? req->op : dynamic_cast<RGWOp*>(req);
+    if (! op) {
+      dout(1) << "failed to derive cognate RGWOp (invalid op?)" << dendl;
+      return -EINVAL;
+    }
+
+    io->init(req->cct);
+
+    perfcounter->inc(l_rgw_req);
+
+    RGWEnv& rgw_env = io->get_env();
+
+    /* XXX
+     * until major refactoring of req_state and req_info, we need
+     * to build their RGWEnv boilerplate from the RGWLibRequest,
+     * pre-staging any strings (HTTP_HOST) that provoke a crash when
+     * not found
+     */
+
+    /* XXX for now, use ""; could be a legit hostname, or, in future,
+     * perhaps a tenant (Yehuda) */
+    rgw_env.set("HTTP_HOST", "");
+
+    /* XXX and -then- bloat up req_state with string copies from it */
+    struct req_state rstate(req->cct, &rgw_env, req->get_user(), req->id);
+    struct req_state *s = &rstate;
+
+    // XXX fix this
+    s->cio = io;
+
+    RGWObjectCtx rados_ctx(store, s); // XXX holds std::map
+
+    auto sysobj_ctx = store->svc.sysobj->init_obj_ctx();
+    s->sysobj_ctx = &sysobj_ctx;
+
+    /* XXX and -then- stash req_state pointers everywhere they are needed */
+    ret = req->init(rgw_env, &rados_ctx, io, s);
+    if (ret < 0) {
+      dout(10) << "failed to initialize request" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* req is-a RGWOp, currently initialized separately */
+    ret = req->op_init();
+    if (ret < 0) {
+      dout(10) << "failed to initialize RGWOp" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* now expected by rgw_log_op() */
+    rgw_env.set("REQUEST_METHOD", s->info.method);
+    rgw_env.set("REQUEST_URI", s->info.request_uri);
+    rgw_env.set("QUERY_STRING", "");
+
+    try {
+      /* XXX authorize does less here than in the REST path, e.g.,
+       * the user's info is cached, but still incomplete */
+      ldpp_dout(s, 2) << "authorizing" << dendl;
+      ret = req->authorize(op);
+      if (ret < 0) {
+        dout(10) << "failed to authorize request" << dendl;
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      /* FIXME: remove this after switching all handlers to the new
+       * authentication infrastructure. */
+      if (! s->auth.identity) {
+        s->auth.identity = rgw::auth::transform_old_authinfo(s);
+      }
+
+      ldpp_dout(s, 2) << "reading op permissions" << dendl;
+      ret = req->read_permissions(op);
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "init op" << dendl;
+      ret = op->init_processing();
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "verifying op mask" << dendl;
+      ret = op->verify_op_mask();
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "verifying op permissions" << dendl;
+      ret = op->verify_permission();
+      if (ret < 0) {
+        /* system requests and admins of the target user proceed despite
+         * the failed permission check */
+        if (s->system_request) {
+          dout(2) << "overriding permissions due to system operation" << dendl;
+        } else if (s->auth.identity->is_admin_of(s->user->user_id)) {
+          dout(2) << "overriding permissions due to admin operation" << dendl;
+        } else {
+          abort_req(s, op, ret);
+          goto done;
+        }
+      }
+
+      ldpp_dout(s, 2) << "verifying op params" << dendl;
+      ret = op->verify_params();
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "executing" << dendl;
+      op->pre_exec();
+      op->execute();
+      op->complete();
+
+    } catch (const ceph::crypto::DigestException& e) {
+      dout(0) << "authentication failed" << e.what() << dendl;
+      abort_req(s, op, -ERR_INVALID_SECRET_KEY);
+    }
+
+  done:
+    try {
+      io->complete_request();
+    } catch (rgw::io::Exception& e) {
+      dout(0) << "ERROR: io->complete_request() returned "
+              << e.what() << dendl;
+    }
+    if (should_log) {
+      rgw_log_op(store, nullptr /* !rest */, s,
+                 (op ? op->name() : "unknown"), olog);
+    }
+
+    int http_ret = s->err.http_ret;
+
+    ldpp_dout(s, 2) << "http status=" << http_ret << dendl;
+
+    dout(1) << "====== " << __func__
+            << " req done req=" << hex << req << dec << " http_status="
+            << http_ret
+            << " ======" << dendl;
+
+    return (ret < 0 ? ret : s->err.ret);
+  } /* process_request */
+
+  /* First half of a continued request: run the same init, authorize and
+   * verification steps as process_request(), then pre_exec() and
+   * exec_start(). Uses the req_state already held by `req`. */
+  int RGWLibProcess::start_request(RGWLibContinuedReq* req)
+  {
+
+    dout(1) << "====== " << __func__
+            << " starting new continued request req=" << hex << req << dec
+            << " ======" << dendl;
+
+    /*
+     * invariant: valid requests are derived from RGWOp--well-formed
+     * requests should have assigned RGWRequest::op in their descendant
+     * constructor--if not, the compiler can find it, at the cost of
+     * a runtime check
+     */
+    RGWOp *op = (req->op) ? req->op : dynamic_cast<RGWOp*>(req);
+    if (! op) {
+      dout(1) << "failed to derive cognate RGWOp (invalid op?)" << dendl;
+      return -EINVAL;
+    }
+
+    struct req_state* s = req->get_state();
+
+    /* req is-a RGWOp, currently initialized separately */
+    int ret = req->op_init();
+    if (ret < 0) {
+      dout(10) << "failed to initialize RGWOp" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* XXX authorize does less here than in the REST path, e.g.,
+     * the user's info is cached, but still incomplete */
+    ldpp_dout(s, 2) << "authorizing" << dendl;
+    ret = req->authorize(op);
+    if (ret < 0) {
+      dout(10) << "failed to authorize request" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* FIXME: remove this after switching all handlers to the new authentication
+     * infrastructure. */
+    if (! s->auth.identity) {
+      s->auth.identity = rgw::auth::transform_old_authinfo(s);
+    }
+
+    ldpp_dout(s, 2) << "reading op permissions" << dendl;
+    ret = req->read_permissions(op);
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    ldpp_dout(s, 2) << "init op" << dendl;
+    ret = op->init_processing();
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    ldpp_dout(s, 2) << "verifying op mask" << dendl;
+    ret = op->verify_op_mask();
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    ldpp_dout(s, 2) << "verifying op permissions" << dendl;
+    ret = op->verify_permission();
+    if (ret < 0) {
+      if (s->system_request) {
+        dout(2) << "overriding permissions due to system operation" << dendl;
+      } else if (s->auth.identity->is_admin_of(s->user->user_id)) {
+        dout(2) << "overriding permissions due to admin operation" << dendl;
+      } else {
+        abort_req(s, op, ret);
+        goto done;
+      }
+    }
+
+    ldpp_dout(s, 2) << "verifying op params" << dendl;
+    ret = op->verify_params();
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    op->pre_exec();
+    req->exec_start();
+
+  done:
+    return (ret < 0 ? ret : s->err.ret);
+  }
+
+  /* Second half of a continued request: call exec_finish() and return
+   * its result; the op's own status is only logged. */
+  int RGWLibProcess::finish_request(RGWLibContinuedReq* req)
+  {
+    RGWOp *op = (req->op) ? req->op : dynamic_cast<RGWOp*>(req);
+    if (! op) {
+      dout(1) << "failed to derive cognate RGWOp (invalid op?)" << dendl;
+      return -EINVAL;
+    }
+
+    int ret = req->exec_finish();
+    int op_ret = op->get_ret();
+
+    dout(1) << "====== " << __func__
+            << " finishing continued request req=" << hex << req << dec
+            << " op status=" << op_ret
+            << " ======" << dendl;
+
+    return ret;
+  }
+
+  /* Create the RGWLibProcess backing this frontend. Always returns 0. */
+  int RGWLibFrontend::init()
+  {
+    pprocess = new RGWLibProcess(g_ceph_context, &env,
+                                 g_conf()->rgw_thread_pool_size, conf);
+    return 0;
+  }
+
+  /* Initialize with an empty argument vector. */
+  int RGWLib::init()
+  {
+    vector<const char*> args;
+    return init(args);
+  }
+
+  /* Bring up the library: global context, storage, perf counters, REST
+   * layer, LDAP helper, ops log, frontend and signal handler.
+   *
+   * Returns 0 on success, -EIO when storage or perf-counter setup fails,
+   * or the frontend's error code when its init fails. A failure to
+   * register in the service map is logged and ignored. */
+  int RGWLib::init(vector<const char*>& args)
+  {
+    int r = 0;
+
+    /* alternative default for module */
+    map<string,string> defaults = {
+      { "debug_rgw", "1/5" },
+      { "keyring", "$rgw_data/keyring" },
+      { "log_file", "/var/log/radosgw/$cluster-$name.log" }
+    };
+
+    cct = global_init(&defaults, args,
+                      CEPH_ENTITY_TYPE_CLIENT,
+                      CODE_ENVIRONMENT_DAEMON,
+                      CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+    /* arm the init watchdog; it is cancelled below on both the failure
+     * and the success path */
+    Mutex mutex("main");
+    SafeTimer init_timer(g_ceph_context, mutex);
+    init_timer.init();
+    mutex.Lock();
+    init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout);
+    mutex.Unlock();
+
+    common_init_finish(g_ceph_context);
+
+    rgw_tools_init(g_ceph_context);
+
+    rgw_init_resolver();
+    rgw::curl::setup_curl(boost::none);
+    rgw_http_client_init(g_ceph_context);
+
+    store = RGWStoreManager::get_storage(g_ceph_context,
+                                         g_conf()->rgw_enable_gc_threads,
+                                         g_conf()->rgw_enable_lc_threads,
+                                         g_conf()->rgw_enable_quota_threads,
+                                         g_conf()->rgw_run_sync_thread,
+                                         g_conf().get_val<bool>("rgw_dynamic_resharding"));
+
+    if (!store) {
+      mutex.Lock();
+      init_timer.cancel_all_events();
+      init_timer.shutdown();
+      mutex.Unlock();
+
+      derr << "Couldn't init storage provider (RADOS)" << dendl;
+      return -EIO;
+    }
+
+    r = rgw_perf_start(g_ceph_context);
+
+    rgw_rest_init(g_ceph_context, store, store->svc.zone->get_zonegroup());
+
+    mutex.Lock();
+    init_timer.cancel_all_events();
+    init_timer.shutdown();
+    mutex.Unlock();
+
+    if (r)
+      return -EIO;
+
+    const string& ldap_uri = store->ctx()->_conf->rgw_ldap_uri;
+    const string& ldap_binddn = store->ctx()->_conf->rgw_ldap_binddn;
+    const string& ldap_searchdn = store->ctx()->_conf->rgw_ldap_searchdn;
+    const string& ldap_searchfilter = store->ctx()->_conf->rgw_ldap_searchfilter;
+    const string& ldap_dnattr =
+      store->ctx()->_conf->rgw_ldap_dnattr;
+    std::string ldap_bindpw = parse_rgw_ldap_bindpw(store->ctx());
+
+    ldh = new rgw::LDAPHelper(ldap_uri, ldap_binddn, ldap_bindpw.c_str(),
+                              ldap_searchdn, ldap_searchfilter, ldap_dnattr);
+    ldh->init();
+    ldh->bind();
+
+    rgw_user_init(store);
+    rgw_bucket_init(store->meta_mgr);
+    rgw_log_usage_init(g_ceph_context, store);
+
+    // XXX ex-RGWRESTMgr_lib, mgr->set_logging(true)
+
+    if (!g_conf()->rgw_ops_log_socket_path.empty()) {
+      olog = new OpsLogSocket(g_ceph_context, g_conf()->rgw_ops_log_data_backlog);
+      olog->init(g_conf()->rgw_ops_log_socket_path);
+    }
+
+    int port = 80;
+    RGWProcessEnv env = { store, &rest, olog, port };
+
+    string fe_count{"0"};
+    fec = new RGWFrontendConfig("rgwlib");
+    fe = new RGWLibFrontend(env, fec);
+
+    init_async_signal_handler();
+    register_async_signal_handler(SIGUSR1, handle_sigterm);
+
+    map<string, string> service_map_meta;
+    service_map_meta["pid"] = stringify(getpid());
+    service_map_meta["frontend_type#" + fe_count] = "rgw-nfs";
+    service_map_meta["frontend_config#" + fe_count] = fec->get_config();
+
+    /* capture the result: previously it was discarded and the check
+     * below tested a stale `r`, so it could never fire */
+    r = fe->init();
+    if (r < 0) {
+      derr << "ERROR: failed initializing frontend" << dendl;
+      return r;
+    }
+
+    fe->run();
+
+    r = store->register_to_service_map("rgw-nfs", service_map_meta);
+    if (r < 0) {
+      derr << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl;
+      /* ignore error */
+    }
+
+    return 0;
+  } /* RGWLib::init() */
+
+  /* Tear down what init() set up: frontend, LDAP helper, signal handler,
+   * usage log, ops log, storage, helper subsystems and finally the
+   * context reference. Always returns 0. */
+  int RGWLib::stop()
+  {
+    derr << "shutting down" << dendl;
+
+    fe->stop();
+
+    fe->join();
+
+    delete fe;
+    delete fec;
+    delete ldh;
+
+    unregister_async_signal_handler(SIGUSR1, handle_sigterm);
+    shutdown_async_signal_handler();
+
+    rgw_log_usage_finalize();
+
+    delete olog;
+
+    RGWStoreManager::close_storage(store);
+
+    rgw_tools_cleanup();
+    rgw_shutdown_resolver();
+    rgw_http_client_cleanup();
+    rgw::curl::cleanup_curl();
+
+    rgw_perf_stop(g_ceph_context);
+
+    dout(1) << "final shutdown" << dendl;
+    cct.reset();
+
+    return 0;
+  } /* RGWLib::stop() */
+
+  /* Load the user record for `uid` into user_info; logs and returns the
+   * negative error code on failure. */
+  int RGWLibIO::set_uid(RGWRados *store, const rgw_user& uid)
+  {
+    int ret = rgw_get_user_info_by_uid(store, uid, user_info, NULL);
+    if (ret < 0) {
+      derr << "ERROR: failed reading user info: uid=" << uid << " ret="
+           << ret << dendl;
+    }
+    return ret;
+  }
+
+  /* Build bucket policies and, unless only_bucket() is set, object
+   * policies for the request. -ENODATA is reported as -EACCES. */
+  int RGWLibRequest::read_permissions(RGWOp* op) {
+    /* bucket and object ops */
+    int ret =
+      rgw_build_bucket_policies(rgwlib.get_store(), get_state());
+    if (ret < 0) {
+      ldout(get_state()->cct, 10) << "read_permissions (bucket policy) on "
+                                  << get_state()->bucket << ":"
+                                  << get_state()->object
+                                  << " only_bucket=" << only_bucket()
+                                  << " ret=" << ret << dendl;
+      if (ret == -ENODATA)
+        ret = -EACCES;
+    } else if (! only_bucket()) {
+      /* object ops */
+      ret = rgw_build_object_policies(rgwlib.get_store(), get_state(),
+                                      op->prefetch_data());
+      if (ret < 0) {
+        ldout(get_state()->cct, 10) << "read_permissions (object policy) on"
+                                    << get_state()->bucket << ":"
+                                    << get_state()->object
+                                    << " ret=" << ret << dendl;
+        if (ret == -ENODATA)
+          ret = -EACCES;
+      }
+    }
+    return ret;
+  } /* RGWLibRequest::read_permissions */
+
+  /* Grant full control and set the request owner from the cached user.
+   * Always returns 0. */
+  int RGWHandler_Lib::authorize(const DoutPrefixProvider *dpp)
+  {
+    /* TODO: handle
+     * 1. subusers
+     * 2. anonymous access
+     * 3. system access
+     * 4. ?
+     *
+     * Much or all of this depends on handling the cached authorization
+     * correctly (e.g., dealing with keystone) at mount time.
+     */
+    s->perm_mask = RGW_PERM_FULL_CONTROL;
+
+    // populate the owner info
+    s->owner.set_id(s->user->user_id);
+    s->owner.set_name(s->user->display_name);
+
+    return 0;
+  } /* RGWHandler_Lib::authorize */
+
+} /* namespace rgw */
+
+extern "C" {
+
+/* Initialize the library on first use and hand back a referenced
+ * context handle in *rgw.
+ *
+ * When argc > 1 the last argument is split on spaces/tabs and its pieces
+ * are appended as separate arguments. Returns the result of
+ * RGWLib::init(), or -EINVAL when a context already existed (the handle
+ * is still stored in that case). */
+int librgw_create(librgw_t* rgw, int argc, char **argv)
+{
+  using namespace rgw;
+
+  int rc = -EINVAL;
+
+  /* check, lock, then check again so only one caller runs init() */
+  if (! g_ceph_context) {
+    std::lock_guard<std::mutex> lg(librgw_mtx);
+    if (! g_ceph_context) {
+      vector<const char*> args;
+      std::vector<std::string> spl_args;
+      // last non-0 argument will be split and consumed
+      if (argc > 1) {
+        const std::string spl_arg{argv[(--argc)]};
+        get_str_vec(spl_arg, " \t", spl_args);
+      }
+      argv_to_vec(argc, const_cast<const char**>(argv), args);
+      // append split args, if any
+      for (const auto& elt : spl_args) {
+        args.push_back(elt.c_str());
+      }
+      rc = rgwlib.init(args);
+    }
+  }
+
+  *rgw = g_ceph_context->get();
+
+  return rc;
+}
+
+/* Stop the library and drop the reference taken by librgw_create(). */
+void librgw_shutdown(librgw_t rgw)
+{
+  using namespace rgw;
+
+  CephContext* cct = static_cast<CephContext*>(rgw);
+  rgwlib.stop();
+  cct->put();
+}
+
+} /* extern "C" */
diff --git a/src/rgw/librgw_admin_user.cc b/src/rgw/librgw_admin_user.cc
new file mode 100644
index 00000000..928f04cb
--- /dev/null
+++ b/src/rgw/librgw_admin_user.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * create rgw admin user
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+// NOTE(review): the seven system header names below were lost in text
+// extraction and are reconstructed (this file uses std::mutex, std::string
+// and errno codes) -- confirm against upstream.
+#include <sys/types.h>
+#include <string.h>
+#include <chrono>
+#include <errno.h>
+#include <thread>
+#include <string>
+#include <mutex>
+
+#include "include/types.h"
+#include "include/rgw/librgw_admin_user.h"
+#include "include/str_list.h"
+#include "include/stringify.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/dout.h"
+
+#include "rgw_admin_user.h"
+#include "rgw_rados.h"
+#include "rgw_os_lib.h"
+#include "rgw_auth.h"
+#include "rgw_auth_s3.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+bool global_stop = false;
+
+/* Handler registered for SIGUSR1 in RGWLibAdmin::init(); it only logs. */
+static void handle_sigterm(int signum)
+{
+  dout(20) << __func__ << " SIGUSR1 ignored" << dendl;
+}
+
+namespace rgw {
+
+  using std::string;
+
+  /* serializes first-time initialization in librgw_admin_user_create() */
+  static std::mutex librgw_admin_user_mtx;
+
+  RGWLibAdmin rgw_lib_admin;
+
+  /* Timer event armed during RGWLibAdmin::init(): if it fires before
+   * init cancels it, the process logs an error and exits. */
+  class C_InitTimeout : public Context {
+  public:
+    C_InitTimeout() {}
+    void finish(int r) override {
+      derr << "Initialization timeout, failed to initialize" << dendl;
+      exit(1);
+    }
+  };
+
+  /* Initialize with an empty argument vector. */
+  int RGWLibAdmin::init()
+  {
+    vector<const char*> args;
+    return init(args);
+  }
+
+  /* Bring up the admin-user library: global context, storage (with all
+   * optional background features passed as false), user subsystem and
+   * signal handler.
+   *
+   * Returns 0 on success or -EIO when the storage provider cannot be
+   * initialized. */
+  int RGWLibAdmin::init(vector<const char*>& args)
+  {
+    /* alternative default for module */
+    map<string,string> defaults = {
+      { "debug_rgw", "1/5" },
+      { "keyring", "$rgw_data/keyring" },
+      { "log_file", "/var/log/radosgw/$cluster-$name.log" }
+    };
+
+    cct = global_init(&defaults, args,
+                      CEPH_ENTITY_TYPE_CLIENT,
+                      CODE_ENVIRONMENT_UTILITY, 0);
+
+    /* arm the init watchdog; it is cancelled below on both the failure
+     * and the success path */
+    Mutex mutex("main");
+    SafeTimer init_timer(g_ceph_context, mutex);
+    init_timer.init();
+    mutex.Lock();
+    init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout);
+    mutex.Unlock();
+
+    common_init_finish(g_ceph_context);
+
+    store = RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false);
+
+    if (!store) {
+      mutex.Lock();
+      init_timer.cancel_all_events();
+      init_timer.shutdown();
+      mutex.Unlock();
+
+      derr << "Couldn't init storage provider (RADOS)" << dendl;
+      return -EIO;
+    }
+
+    mutex.Lock();
+    init_timer.cancel_all_events();
+    init_timer.shutdown();
+    mutex.Unlock();
+
+    rgw_user_init(store);
+
+    init_async_signal_handler();
+    register_async_signal_handler(SIGUSR1, handle_sigterm);
+
+    return 0;
+  } /* RGWLibAdmin::init() */
+
+  /* Tear down what init() set up: signal handler, storage and the
+   * context reference. Always returns 0. */
+  int RGWLibAdmin::stop()
+  {
+    derr << "shutting down" << dendl;
+
+    unregister_async_signal_handler(SIGUSR1, handle_sigterm);
+    shutdown_async_signal_handler();
+
+    RGWStoreManager::close_storage(store);
+
+    dout(1) << "final shutdown" << dendl;
+    cct.reset();
+
+    return 0;
+  } /* RGWLibAdmin::stop() */
+
+} /* namespace rgw */
+
+extern "C" {
+
+/* Initialize the admin-user library on first use and hand back a
+ * referenced context handle in *rgw_admin_user.
+ *
+ * When argc > 1 the last argument is split on spaces/tabs and its pieces
+ * are appended as separate arguments. Returns the result of
+ * RGWLibAdmin::init(), or -EINVAL when a context already existed (the
+ * handle is still stored in that case). */
+int librgw_admin_user_create(librgw_admin_user_t* rgw_admin_user, int argc, char **argv)
+{
+  using namespace rgw;
+
+  int rc = -EINVAL;
+
+  /* check, lock, then check again so only one caller runs init() */
+  if (! g_ceph_context) {
+    std::lock_guard<std::mutex> lg(librgw_admin_user_mtx);
+    if (! g_ceph_context) {
+      vector<const char*> args;
+      std::vector<std::string> spl_args;
+      // last non-0 argument will be split and consumed
+      if (argc > 1) {
+        const std::string spl_arg{argv[(--argc)]};
+        get_str_vec(spl_arg, " \t", spl_args);
+      }
+      argv_to_vec(argc, const_cast<const char**>(argv), args);
+      // append split args, if any
+      for (const auto& elt : spl_args) {
+        args.push_back(elt.c_str());
+      }
+      rc = rgw_lib_admin.init(args);
+    }
+  }
+
+  *rgw_admin_user = g_ceph_context->get();
+
+  return rc;
+}
+
+/* Stop the library and drop the reference taken by
+ * librgw_admin_user_create(). */
+void librgw_admin_user_shutdown(librgw_admin_user_t rgw_admin_user)
+{
+  using namespace rgw;
+
+  CephContext* cct = static_cast<CephContext*>(rgw_admin_user);
+  rgw_lib_admin.stop();
+  cct->put();
+}
+
+} /* extern "C" */
+
+
diff --git a/src/rgw/rgw-orphan-list b/src/rgw/rgw-orphan-list
new file mode 100755
index 00000000..7f60c651
--- /dev/null
+++ b/src/rgw/rgw-orphan-list
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+
+# version 2020-10-20
+
+# IMPORTANT: affects order produced by 'sort' and 'ceph-diff-sorted'
+# relies on this ordering
+export LANG=C
+
+out_dir="."
+temp_file=/tmp/temp.$$
+timestamp=$(date -u +%Y%m%d%H%M%S)
+lspools_err="${out_dir}/lspools-${timestamp}.error"
+rados_out="${out_dir}/rados-${timestamp}.intermediate"
+rados_odd="${out_dir}/rados-${timestamp}.issues"
+rados_err="${out_dir}/rados-${timestamp}.error"
+rgwadmin_out="${out_dir}/radosgw-admin-${timestamp}.intermediate"
+rgwadmin_err="${out_dir}/radosgw-admin-${timestamp}.error"
+delta_out="${out_dir}/orphan-list-${timestamp}.out"
+
+# flags set below when 'rados ls' reports namespaces or locators; start
+# empty so a same-named variable in the caller's environment is ignored
+namespace_found=
+locator_found=
+
+# error_out COMMAND [LOGFILE [MESSAGE]]
+# Report that COMMAND failed, optionally print MESSAGE and point at
+# LOGFILE, then exit with status 1.
+error_out() {
+  echo "An error was encountered while running '$1'. Aborting."
+  if [ $# -gt 2 ] ;then
+    echo "Error: $3"
+  fi
+  if [ $# -gt 1 ] ;then
+    echo "Review file '$2' for details."
+  fi
+  echo "***"
+  echo "*** WARNING: The results are incomplete. Do not use! ***"
+  echo "***"
+  exit 1
+}
+
+# prompt_pool
+# List the available pools and ask which one to search; the chosen name
+# is the only thing written to stdout. Intended to be called inside a
+# command substitution, so failures are reported on stderr and signalled
+# through the exit status.
+prompt_pool() {
+  # note: all prompts go to stderr so stdout contains just the result
+  >&2 echo "Available pools:"
+  rados lspools >"$temp_file" 2>"$lspools_err"
+  if [ "$?" -ne 0 ] ;then
+    # send the report to stderr: stdout is being captured by the caller
+    error_out "rados lspools" "$lspools_err" >&2
+  fi
+  >&2 sed 's/^/ /' "$temp_file" # list pools and indent
+  >&2 printf "Which pool do you want to search for orphans? "
+  local mypool
+  read -r mypool
+  printf '%s\n' "$mypool"
+}
+
+if [ $# -eq 0 ] ;then
+  # an 'exit' inside the command substitution only leaves the subshell,
+  # so propagate the failure explicitly
+  pool="$(prompt_pool)" || exit 1
+elif [ $# -eq 1 ] ;then
+  pool="$1"
+else
+  >&2 echo "Usage: $0 [pool]"
+  exit 1
+fi
+
+echo "Pool is \"$pool\"."
+
+echo "Note: output files produced will be tagged with the current timestamp -- ${timestamp}."
+
+echo "running 'rados ls' at $(date)"
+# since --format is not specified, plain should be used
+rados ls --pool="$pool" --all >"$rados_out" 2>"$rados_err"
+if [ "$?" -ne 0 ] ;then
+  error_out "rados ls" "$rados_err"
+fi
+
+# NOTE: Each entry (line of output) of `rados ls --all` should be in
+# one of four formats depending on whether or not an entry has a
+# namespace and/or locator:
+#
+#   <TAB>oid
+#   <TAB>oid<TAB>locator
+#   namespace<TAB>oid
+#   namespace<TAB>oid<TAB>locator
+#
+# Any occurrences of the 2nd, 3rd, or 4th (i.e., existence of
+# namespace and/or locator) should cause the creation of the "odd" file
+# and an explanation in the output, and those entries will not be
+# retained, and therefore they will not be called out as orphans. They
+# will need special handling by the end-user as we do not expect
+# namespaces or locators.
+
+# check for namespaces -- any line that does not begin with a tab
+# indicates a namespace; add those to "odd" file and set flag; note:
+# this also picks up entries with namespace and locator
+grep $'^[^\t]' "$rados_out" >"$rados_odd"
+if [ "${PIPESTATUS[0]}" -eq 0 ] ;then
+  namespace_found=1
+fi
+
+# check for locators (w/o namespace); we identify them by skipping
+# past the empty namespace (i.e., one TAB), skipping past the oid,
+# then looking for a TAB; note we use 'grep -E' to get the '+' character
+# and the $ in front of the ' allows the \t to be interpreted as a TAB
+grep -E $'^\t[[:graph:]]+\t' "$rados_out" >>"$rados_odd"
+if [ "${PIPESTATUS[0]}" -eq 0 ] ;then
+  locator_found=1
+fi
+
+# extract the entries that are just oids (i.e., no namespace or
+# locator) for further processing; only look at lines that begin with
+# a TAB and do not contain a second TAB, and then grab everything
+# after the initial TAB
+grep $'^\t' "$rados_out" | grep -v $'^\t.*\t' | sed -E 's/^\t//' >"$temp_file"
+mv -f "$temp_file" "$rados_out"
+
+sort -u "$rados_out" >"$temp_file"
+mv -f "$temp_file" "$rados_out"
+
+echo "running 'radosgw-admin bucket radoslist' at $(date)"
+radosgw-admin bucket radoslist >"$rgwadmin_out" 2>"$rgwadmin_err"
+if [ "$?" -ne 0 ] ;then
+  error_out "radosgw-admin radoslist" "$rgwadmin_err"
+fi
+sort -u "$rgwadmin_out" >"$temp_file"
+mv -f "$temp_file" "$rgwadmin_out"
+
+echo "computing delta at $(date)"
+ceph-diff-sorted "$rados_out" "$rgwadmin_out" | grep "^<" | sed 's/^< *//' >"$delta_out"
+# use PIPESTATUS to get at exit status of first process in above pipe;
+# 0 means same, 1 means different, >1 means error
+if [ "${PIPESTATUS[0]}" -gt 1 ] ;then
+  error_out "ceph-diff-sorted"
+fi
+
+found=$(wc -l < "$delta_out")
+possible=$(wc -l < "$rados_out")
+percentage=0
+if [ "$possible" -ne 0 ] ;then
+  # integer percentage, truncated
+  percentage=$(( 100 * found / possible ))
+fi
+
+echo "$found potential orphans found out of a possible $possible (${percentage}%)."
+echo "The results can be found in '${delta_out}'."
+echo " Intermediate files are '${rados_out}' and '${rgwadmin_out}'."
+if [ -n "$namespace_found" ] || [ -n "$locator_found" ] ;then
+  echo " Note: 'rados ls' found entries that might be in a namespace or might"
+  echo " have a locator; see '${rados_odd}' for those entries."
+fi
+echo "***"
+echo "*** WARNING: This is EXPERIMENTAL code and the results should be used"
+echo "*** only with CAUTION!"
+echo "***"
+echo "Done at $(date)."
diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc new file mode 100644 index 00000000..8c02f8e3 --- /dev/null +++ b/src/rgw/rgw_acl.cc @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include "include/types.h" + +#include "common/Formatter.h" + +#include "rgw_acl.h" +#include "rgw_user.h" + +#define dout_subsys ceph_subsys_rgw + + +void RGWAccessControlList::_add_grant(ACLGrant *grant) +{ + ACLPermission& perm = grant->get_permission(); + ACLGranteeType& type = grant->get_type(); + switch (type.get_type()) { + case ACL_TYPE_REFERER: + referer_list.emplace_back(grant->get_referer(), perm.get_permissions()); + + /* We're specially handling the Swift's .r:* as the S3 API has a similar + * concept and thus we can have a small portion of compatibility here. */ + if (grant->get_referer() == RGW_REFERER_WILDCARD) { + acl_group_map[ACL_GROUP_ALL_USERS] |= perm.get_permissions(); + } + break; + case ACL_TYPE_GROUP: + acl_group_map[grant->get_group()] |= perm.get_permissions(); + break; + default: + { + rgw_user id; + if (!grant->get_id(id)) { + ldout(cct, 0) << "ERROR: grant->get_id() failed" << dendl; + } + acl_user_map[id.to_str()] |= perm.get_permissions(); + } + } +} + +void RGWAccessControlList::add_grant(ACLGrant *grant) +{ + rgw_user id; + grant->get_id(id); // not that this will return false for groups, but that's ok, we won't search groups + grant_map.insert(pair(id.to_str(), *grant)); + _add_grant(grant); +} + +uint32_t RGWAccessControlList::get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + const uint32_t perm_mask) +{ + ldpp_dout(dpp, 5) << "Searching permissions for identity=" << auth_identity + << " mask=" << perm_mask << dendl; + + return perm_mask & auth_identity.get_perms_from_aclspec(dpp, acl_user_map); +} + +uint32_t RGWAccessControlList::get_group_perm(ACLGroupTypeEnum group, + const uint32_t perm_mask) +{ + 
ldout(cct, 5) << "Searching permissions for group=" << (int)group + << " mask=" << perm_mask << dendl; + + const auto iter = acl_group_map.find((uint32_t)group); + if (iter != acl_group_map.end()) { + ldout(cct, 5) << "Found permission: " << iter->second << dendl; + return iter->second & perm_mask; + } + ldout(cct, 5) << "Permissions for group not found" << dendl; + return 0; +} + +uint32_t RGWAccessControlList::get_referer_perm(const uint32_t current_perm, + const std::string http_referer, + const uint32_t perm_mask) +{ + ldout(cct, 5) << "Searching permissions for referer=" << http_referer + << " mask=" << perm_mask << dendl; + + /* This function is basically a transformation from current perm to + * a new one that takes into consideration the Swift's HTTP referer- + * based ACLs. We need to go through all items to respect negative + * grants. */ + uint32_t referer_perm = current_perm; + for (const auto& r : referer_list) { + if (r.is_match(http_referer)) { + referer_perm = r.perm; + } + } + + ldout(cct, 5) << "Found referer permission=" << referer_perm << dendl; + return referer_perm & perm_mask; +} + +uint32_t RGWAccessControlPolicy::get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + const uint32_t perm_mask, + const char * const http_referer) +{ + ldpp_dout(dpp, 20) << "-- Getting permissions begin with perm_mask=" << perm_mask + << dendl; + + uint32_t perm = acl.get_perm(dpp, auth_identity, perm_mask); + + if (auth_identity.is_owner_of(owner.get_id())) { + perm |= perm_mask & (RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP); + } + + if (perm == perm_mask) { + return perm; + } + + /* should we continue looking up? 
*/ + if ((perm & perm_mask) != perm_mask) { + perm |= acl.get_group_perm(ACL_GROUP_ALL_USERS, perm_mask); + + if (false == auth_identity.is_owner_of(rgw_user(RGW_USER_ANON_ID))) { + /* this is not the anonymous user */ + perm |= acl.get_group_perm(ACL_GROUP_AUTHENTICATED_USERS, perm_mask); + } + } + + /* Should we continue looking up even deeper? */ + if (nullptr != http_referer && (perm & perm_mask) != perm_mask) { + perm = acl.get_referer_perm(perm, http_referer, perm_mask); + } + + ldpp_dout(dpp, 5) << "-- Getting permissions done for identity=" << auth_identity + << ", owner=" << owner.get_id() + << ", perm=" << perm << dendl; + + return perm; +} + +bool RGWAccessControlPolicy::verify_permission(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + const uint32_t user_perm_mask, + const uint32_t perm, + const char * const http_referer) +{ + uint32_t test_perm = perm | RGW_PERM_READ_OBJS | RGW_PERM_WRITE_OBJS; + + uint32_t policy_perm = get_perm(dpp, auth_identity, test_perm, http_referer); + + /* the swift WRITE_OBJS perm is equivalent to the WRITE obj, just + convert those bits. 
Note that these bits will only be set on + buckets, so the swift READ permission on bucket will allow listing + the bucket content */ + if (policy_perm & RGW_PERM_WRITE_OBJS) { + policy_perm |= (RGW_PERM_WRITE | RGW_PERM_WRITE_ACP); + } + if (policy_perm & RGW_PERM_READ_OBJS) { + policy_perm |= (RGW_PERM_READ | RGW_PERM_READ_ACP); + } + + uint32_t acl_perm = policy_perm & perm & user_perm_mask; /* requested bits that are both granted and allowed for the user */ + + ldpp_dout(dpp, 10) << " identity=" << auth_identity + << " requested perm (type)=" << perm + << ", policy perm=" << policy_perm + << ", user_perm_mask=" << user_perm_mask + << ", acl perm=" << acl_perm << dendl; + + return (perm == acl_perm); /* true only if no requested bit was lost */ +} + + diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h new file mode 100644 index 00000000..0e84d75e --- /dev/null +++ b/src/rgw/rgw_acl.h @@ -0,0 +1,469 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_ACL_H +#define CEPH_RGW_ACL_H + +#include +#include +#include + +#include +#include + +#include "common/debug.h" + +#include "rgw_basic_types.h" + +#define RGW_PERM_NONE 0x00 +#define RGW_PERM_READ 0x01 +#define RGW_PERM_WRITE 0x02 +#define RGW_PERM_READ_ACP 0x04 +#define RGW_PERM_WRITE_ACP 0x08 +#define RGW_PERM_READ_OBJS 0x10 +#define RGW_PERM_WRITE_OBJS 0x20 +#define RGW_PERM_FULL_CONTROL ( RGW_PERM_READ | RGW_PERM_WRITE | \ + RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP ) +#define RGW_PERM_ALL_S3 RGW_PERM_FULL_CONTROL +#define RGW_PERM_INVALID 0xFF00 + +static constexpr char RGW_REFERER_WILDCARD[] = "*"; + +enum ACLGranteeTypeEnum { +/* numbers are encoded, should not change */ + ACL_TYPE_CANON_USER = 0, + ACL_TYPE_EMAIL_USER = 1, + ACL_TYPE_GROUP = 2, + ACL_TYPE_UNKNOWN = 3, + ACL_TYPE_REFERER = 4, +}; + +enum ACLGroupTypeEnum { +/* numbers are encoded, should not change */ + ACL_GROUP_NONE = 0, + ACL_GROUP_ALL_USERS = 1, + ACL_GROUP_AUTHENTICATED_USERS = 2, +}; + +class ACLPermission +{ +protected: + int flags; /* bitmask of the RGW_PERM_xxx bits */ +public: + ACLPermission() : flags(0) {} + 
~ACLPermission() {} + uint32_t get_permissions() const { return flags; } + void set_permissions(uint32_t perm) { flags = perm; } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(flags, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(flags, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(ACLPermission) + +class ACLGranteeType +{ +protected: + __u32 type; /* holds an ACLGranteeTypeEnum value */ +public: + ACLGranteeType() : type(ACL_TYPE_UNKNOWN) {} + virtual ~ACLGranteeType() {} +// virtual const char *to_string() = 0; + ACLGranteeTypeEnum get_type() const { return (ACLGranteeTypeEnum)type; } + void set(ACLGranteeTypeEnum t) { type = t; } +// virtual void set(const char *s) = 0; + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(type, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(type, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(ACLGranteeType) + +class ACLGrantee +{ +public: + ACLGrantee() {} + ~ACLGrantee() {} +}; + + +class ACLGrant +{ +protected: + ACLGranteeType type; + rgw_user id; + string email; + ACLPermission permission; + string name; + ACLGroupTypeEnum group; + string url_spec; /* referer pattern, filled in by set_referer() */ + +public: + ACLGrant() : group(ACL_GROUP_NONE) {} + virtual ~ACLGrant() {} + + /* there's an assumption here that email/uri/id encodings are + different and there can't be any overlap */ + bool get_id(rgw_user& _id) const { + switch(type.get_type()) { + case ACL_TYPE_EMAIL_USER: + _id = email; // implies from_str() that parses the 't:u' syntax + return true; + case ACL_TYPE_GROUP: + case ACL_TYPE_REFERER: + return false; /* group and referer grants carry no user id */ + default: + _id = id; + return true; + } + } + ACLGranteeType& get_type() { return type; } + const 
ACLGranteeType& get_type() const { return type; } + ACLPermission& get_permission() { return permission; } + const ACLPermission& get_permission() const { return permission; } + ACLGroupTypeEnum get_group() const { return group; } + const string& get_referer() const { return url_spec; } + + void encode(bufferlist& bl) const { + ENCODE_START(5, 3, bl); + encode(type, bl); + string s; + id.to_str(s); + encode(s, bl); + string uri; + encode(uri, bl); /* always written empty; decode() consults it only when struct_v is 1 or lower */ + encode(email, bl); + encode(permission, bl); + encode(name, bl); + __u32 g = (__u32)group; + encode(g, bl); + encode(url_spec, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl); + decode(type, bl); + string s; + decode(s, bl); + id.from_str(s); + string uri; + decode(uri, bl); + decode(email, bl); + decode(permission, bl); + decode(name, bl); + if (struct_v > 1) { + __u32 g; + decode(g, bl); + group = (ACLGroupTypeEnum)g; + } else { + group = uri_to_group(uri); /* oldest encoding: the group is derived from the stored uri */ + } + if (struct_v >= 5) { + decode(url_spec, bl); + } else { + url_spec.clear(); /* encodings older than v5 carry no referer spec */ + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + ACLGroupTypeEnum uri_to_group(string& uri); + + void set_canon(const rgw_user& _id, const string& _name, const uint32_t perm) { + type.set(ACL_TYPE_CANON_USER); + id = _id; + name = _name; + permission.set_permissions(perm); + } + void set_group(ACLGroupTypeEnum _group, const uint32_t perm) { + type.set(ACL_TYPE_GROUP); + group = _group; + permission.set_permissions(perm); + } + void set_referer(const std::string& _url_spec, const uint32_t perm) { + type.set(ACL_TYPE_REFERER); + url_spec = _url_spec; + permission.set_permissions(perm); + } +}; +WRITE_CLASS_ENCODER(ACLGrant) + +struct ACLReferer { + std::string url_spec; + uint32_t perm; + + ACLReferer() : perm(0) {} + ACLReferer(const std::string& url_spec, + const uint32_t perm) + : url_spec(url_spec), + perm(perm) { + } + + bool 
is_match(boost::string_ref http_referer) const { + const auto http_host = get_http_host(http_referer); /* only the host part of the referer is compared */ + if (!http_host || http_host->length() < url_spec.length()) { + return false; /* no usable host, or host shorter than the spec */ + } + + if ("*" == url_spec) { + return true; + } + + if (http_host->compare(url_spec) == 0) { + return true; + } + + if ('.' == url_spec[0]) { + /* Wildcard support: a referer matches the spec when its last characters are + * exactly equal to the spec. */ + return http_host->ends_with(url_spec); + } + + return false; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(url_spec, bl); + encode(perm, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + decode(url_spec, bl); + decode(perm, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + +private: + boost::optional get_http_host(const boost::string_ref url) const { /* Returns the host part of url: the text after the scheme separator, minus any userinfo before the at-sign and any port or path; none when the separator is missing or sits at either end of url. */ + size_t pos = url.find("://"); + if (pos == boost::string_ref::npos || url.starts_with("://") || + url.ends_with("://") || url.ends_with('@')) { + return boost::none; + } + boost::string_ref url_sub = url.substr(pos + strlen("://")); + pos = url_sub.find('@'); + if (pos != boost::string_ref::npos) { + url_sub = url_sub.substr(pos + 1); + } + pos = url_sub.find_first_of("/:"); + if (pos == boost::string_ref::npos) { + /* no port or path exists */ + return url_sub; + } + return url_sub.substr(0, pos); + } +}; +WRITE_CLASS_ENCODER(ACLReferer) + +namespace rgw { +namespace auth { + class Identity; +} +} + +class RGWAccessControlList +{ +protected: + CephContext *cct; + /* FIXME: in the future we should consider switching to uint32_t also + * in data structures. 
*/ + map acl_user_map; + map acl_group_map; + list referer_list; + multimap grant_map; + void _add_grant(ACLGrant *grant); +public: + explicit RGWAccessControlList(CephContext *_cct) : cct(_cct) {} + RGWAccessControlList() : cct(NULL) {} + + void set_ctx(CephContext *ctx) { + cct = ctx; + } + + virtual ~RGWAccessControlList() {} + + uint32_t get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + uint32_t perm_mask); + uint32_t get_group_perm(ACLGroupTypeEnum group, uint32_t perm_mask); + uint32_t get_referer_perm(uint32_t current_perm, + std::string http_referer, + uint32_t perm_mask); + void encode(bufferlist& bl) const { + ENCODE_START(4, 3, bl); + bool maps_initialized = true; /* this encoder always stores the lookup maps */ + encode(maps_initialized, bl); + encode(acl_user_map, bl); + encode(grant_map, bl); + encode(acl_group_map, bl); + encode(referer_list, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + bool maps_initialized; + decode(maps_initialized, bl); + decode(acl_user_map, bl); + decode(grant_map, bl); + if (struct_v >= 2) { + decode(acl_group_map, bl); + } else if (!maps_initialized) { /* pre-v2 data without stored maps: replay each grant through _add_grant() */ + multimap::iterator iter; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant& grant = iter->second; + _add_grant(&grant); + } + } + if (struct_v >= 4) { + decode(referer_list, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + void add_grant(ACLGrant *grant); + + multimap& get_grant_map() { return grant_map; } + const multimap& get_grant_map() const { return grant_map; } + + void create_default(const rgw_user& id, string name) { + acl_user_map.clear(); + acl_group_map.clear(); + referer_list.clear(); /* NOTE(review): grant_map is not cleared here, unlike in create_canned() - confirm this is intended */ + + ACLGrant grant; /* the only grant added here: full control for id */ + grant.set_canon(id, name, RGW_PERM_FULL_CONTROL); + add_grant(&grant); + } +}; +WRITE_CLASS_ENCODER(RGWAccessControlList) + +class ACLOwner +{ +protected: + rgw_user id; + string display_name; 
+public: + ACLOwner() {} + ~ACLOwner() {} + + void encode(bufferlist& bl) const { + ENCODE_START(3, 2, bl); + string s; + id.to_str(s); /* the id is stored in its string form */ + encode(s, bl); + encode(display_name, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + string s; + decode(s, bl); + id.from_str(s); + decode(display_name, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(list& o); + void set_id(const rgw_user& _id) { id = _id; } + void set_name(const string& name) { display_name = name; } + + rgw_user& get_id() { return id; } + const rgw_user& get_id() const { return id; } + string& get_display_name() { return display_name; } +}; +WRITE_CLASS_ENCODER(ACLOwner) + +class RGWAccessControlPolicy +{ +protected: + CephContext *cct; + RGWAccessControlList acl; + ACLOwner owner; + +public: + explicit RGWAccessControlPolicy(CephContext *_cct) : cct(_cct), acl(_cct) {} + RGWAccessControlPolicy() : cct(NULL), acl(NULL) {} + virtual ~RGWAccessControlPolicy() {} + + void set_ctx(CephContext *ctx) { + cct = ctx; + acl.set_ctx(ctx); /* keep the embedded ACL on the same context */ + } + + uint32_t get_perm(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + uint32_t perm_mask, + const char * http_referer); + uint32_t get_group_perm(ACLGroupTypeEnum group, uint32_t perm_mask); + bool verify_permission(const DoutPrefixProvider* dpp, + const rgw::auth::Identity& auth_identity, + uint32_t user_perm_mask, + uint32_t perm, + const char * http_referer = nullptr); + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(owner, bl); /* owner goes first, so decode_owner() can stop after it */ + encode(acl, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(owner, bl); + decode(acl, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + void decode_owner(bufferlist::const_iterator& bl) { 
// sometimes we only need that, should be faster + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(owner, bl); + DECODE_FINISH(bl); + } + + void set_owner(ACLOwner& o) { owner = o; } + ACLOwner& get_owner() { + return owner; + } + + void create_default(const rgw_user& id, string& name) { + acl.create_default(id, name); + owner.set_id(id); + owner.set_name(name); + } + RGWAccessControlList& get_acl() { + return acl; + } + const RGWAccessControlList& get_acl() const { + return acl; + } + + virtual bool compare_group_name(string& id, ACLGroupTypeEnum group) { return false; } /* the base policy matches no group name */ +}; +WRITE_CLASS_ENCODER(RGWAccessControlPolicy) + +#endif diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc new file mode 100644 index 00000000..5f026ff3 --- /dev/null +++ b/src/rgw/rgw_acl_s3.cc @@ -0,0 +1,616 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_acl_s3.h" +#include "rgw_user.h" + +#define dout_subsys ceph_subsys_rgw + + + +#define RGW_URI_ALL_USERS "http://acs.amazonaws.com/groups/global/AllUsers" +#define RGW_URI_AUTH_USERS "http://acs.amazonaws.com/groups/global/AuthenticatedUsers" + +static string rgw_uri_all_users = RGW_URI_ALL_USERS; +static string rgw_uri_auth_users = RGW_URI_AUTH_USERS; + +void ACLPermission_S3::to_xml(ostream& out) +{ /* FULL_CONTROL is reported on its own when all four S3 bits are set; otherwise each set bit is reported separately. */ + if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) { + out << "FULL_CONTROL"; + } else { + if (flags & RGW_PERM_READ) + out << "READ"; + if (flags & RGW_PERM_WRITE) + out << "WRITE"; + if (flags & RGW_PERM_READ_ACP) + out << "READ_ACP"; + if (flags & RGW_PERM_WRITE_ACP) + out << "WRITE_ACP"; + } +} + +bool ACLPermission_S3:: +xml_end(const char *el) +{ /* Maps the element text, compared case-insensitively, onto the matching RGW_PERM_xxx bits; unknown text is rejected. */ + const char *s = data.c_str(); + if (strcasecmp(s, "READ") == 0) { + flags |= RGW_PERM_READ; + return true; + } else if (strcasecmp(s, "WRITE") == 0) { + flags |= RGW_PERM_WRITE; + return true; + } else if (strcasecmp(s, "READ_ACP") == 
0) { + flags |= RGW_PERM_READ_ACP; + return true; + } else if (strcasecmp(s, "WRITE_ACP") == 0) { + flags |= RGW_PERM_WRITE_ACP; + return true; + } else if (strcasecmp(s, "FULL_CONTROL") == 0) { + flags |= RGW_PERM_FULL_CONTROL; + return true; + } + return false; /* the text is not a known S3 permission name */ +} + + +class ACLGranteeType_S3 { /* converts between ACLGranteeType values and the xsi:type names; set() compares case-sensitively */ +public: + static const char *to_string(ACLGranteeType& type) { + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + return "CanonicalUser"; + case ACL_TYPE_EMAIL_USER: + return "AmazonCustomerByEmail"; + case ACL_TYPE_GROUP: + return "Group"; + default: + return "unknown"; + } + } + + static void set(const char *s, ACLGranteeType& type) { + if (!s) { + type.set(ACL_TYPE_UNKNOWN); + return; + } + if (strcmp(s, "CanonicalUser") == 0) + type.set(ACL_TYPE_CANON_USER); + else if (strcmp(s, "AmazonCustomerByEmail") == 0) + type.set(ACL_TYPE_EMAIL_USER); + else if (strcmp(s, "Group") == 0) + type.set(ACL_TYPE_GROUP); + else + type.set(ACL_TYPE_UNKNOWN); + } +}; + +class ACLID_S3 : public XMLObj +{ +public: + ACLID_S3() {} + ~ACLID_S3() override {} + string& to_str() { return data; } +}; + +class ACLURI_S3 : public XMLObj +{ +public: + ACLURI_S3() {} + ~ACLURI_S3() override {} +}; + +class ACLEmail_S3 : public XMLObj +{ +public: + ACLEmail_S3() {} + ~ACLEmail_S3() override {} +}; + +class ACLDisplayName_S3 : public XMLObj +{ +public: + ACLDisplayName_S3() {} + ~ACLDisplayName_S3() override {} +}; + +bool ACLOwner_S3::xml_end(const char *el) { + ACLID_S3 *acl_id = static_cast(find_first("ID")); + ACLID_S3 *acl_name = static_cast(find_first("DisplayName")); + + // ID is mandatory + if (!acl_id) + return false; + id = acl_id->get_data(); + + // DisplayName is optional + if (acl_name) + display_name = acl_name->get_data(); + else + display_name = ""; + + return true; +} + +void ACLOwner_S3::to_xml(ostream& out) { + string s; + id.to_str(s); + if (s.empty()) + return; /* an owner without an id produces no output */ + out << "" << "" << s << ""; + if (!display_name.empty()) + out << "" << display_name << ""; + out << ""; +} + 
+bool ACLGrant_S3::xml_end(const char *el) { /* Fills this grant from its parsed Grantee and Permission children; returns false when a required child or attribute is missing or the grantee type is unknown. */ + ACLGrantee_S3 *acl_grantee; + ACLID_S3 *acl_id; + ACLURI_S3 *acl_uri; + ACLEmail_S3 *acl_email; + ACLPermission_S3 *acl_permission; + ACLDisplayName_S3 *acl_name; + string uri; + + acl_grantee = static_cast(find_first("Grantee")); + if (!acl_grantee) + return false; + string type_str; + if (!acl_grantee->get_attr("xsi:type", type_str)) + return false; + ACLGranteeType_S3::set(type_str.c_str(), type); + + acl_permission = static_cast(find_first("Permission")); + if (!acl_permission) + return false; + + permission = *acl_permission; /* copy the parsed permission into this grant */ + + id.clear(); /* reset all identity fields; only the one matching the type is filled below */ + name.clear(); + email.clear(); + + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + acl_id = static_cast(acl_grantee->find_first("ID")); + if (!acl_id) + return false; + id = acl_id->to_str(); + acl_name = static_cast(acl_grantee->find_first("DisplayName")); + if (acl_name) + name = acl_name->get_data(); + break; + case ACL_TYPE_GROUP: + acl_uri = static_cast(acl_grantee->find_first("URI")); + if (!acl_uri) + return false; + uri = acl_uri->get_data(); + group = uri_to_group(uri); + break; + case ACL_TYPE_EMAIL_USER: + acl_email = static_cast(acl_grantee->find_first("EmailAddress")); + if (!acl_email) + return false; + email = acl_email->get_data(); + break; + default: + // unknown user type + return false; + }; + return true; +} + +void ACLGrant_S3::to_xml(CephContext *cct, ostream& out) { + ACLPermission_S3& perm = static_cast(permission); + + /* only show s3 compatible permissions */ + if (!(perm.get_permissions() & RGW_PERM_ALL_S3)) + return; + + string uri; + + out << "" << + ""; + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + out << "" << id << ""; + if (name.size()) { + out << "" << name << ""; + } + break; + case ACL_TYPE_EMAIL_USER: + out << "" << email << ""; + break; + case ACL_TYPE_GROUP: + if (!group_to_uri(group, uri)) { + ldout(cct, 0) << "ERROR: group_to_uri failed with group=" << (int)group << dendl; + break; + } + out << "" << uri << ""; + 
break; + default: + break; + } + out << ""; + perm.to_xml(out); + out << ""; +} + +bool ACLGrant_S3::group_to_uri(ACLGroupTypeEnum group, string& uri) +{ /* only the all-users and authenticated-users groups have an S3 URI */ + switch (group) { + case ACL_GROUP_ALL_USERS: + uri = rgw_uri_all_users; + return true; + case ACL_GROUP_AUTHENTICATED_USERS: + uri = rgw_uri_auth_users; + return true; + default: + return false; + } +} + +bool RGWAccessControlList_S3::xml_end(const char *el) { /* registers every parsed Grant child; always returns true */ + XMLObjIter iter = find("Grant"); + ACLGrant_S3 *grant = static_cast(iter.get_next()); + while (grant) { + add_grant(grant); + grant = static_cast(iter.get_next()); + } + return true; +} + +void RGWAccessControlList_S3::to_xml(ostream& out) { + multimap::iterator iter; + out << ""; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant_S3& grant = static_cast(iter->second); + grant.to_xml(cct, out); + } + out << ""; +} + +struct s3_acl_header { + int rgw_perm; /* permission granted through the header */ + const char *http_header; /* key looked up in the request environment */ +}; + +static const char *get_acl_header(const RGWEnv *env, + const struct s3_acl_header *perm) +{ + const char *header = perm->http_header; + + return env->get(header, NULL); +} + +static int parse_grantee_str(RGWRados *store, string& grantee_str, + const struct s3_acl_header *perm, ACLGrant& grant) +{ /* Turns one grantee entry, split by parse_key_value(), into grant; the key is matched case-insensitively and unknown keys yield -EINVAL. */ + string id_type, id_val_quoted; + int rgw_perm = perm->rgw_perm; + int ret; + + RGWUserInfo info; + + ret = parse_key_value(grantee_str, id_type, id_val_quoted); + if (ret < 0) + return ret; + + string id_val = rgw_trim_quotes(id_val_quoted); + + if (strcasecmp(id_type.c_str(), "emailAddress") == 0) { + ret = rgw_get_user_info_by_email(store, id_val, info); + if (ret < 0) + return ret; + + grant.set_canon(info.user_id, info.display_name, rgw_perm); + } else if (strcasecmp(id_type.c_str(), "id") == 0) { + rgw_user user(id_val); + ret = rgw_get_user_info_by_uid(store, user, info); + if (ret < 0) + return ret; + + grant.set_canon(info.user_id, info.display_name, rgw_perm); + } else if (strcasecmp(id_type.c_str(), "uri") == 0) { + ACLGroupTypeEnum gid = 
grant.uri_to_group(id_val); + if (gid == ACL_GROUP_NONE) + return -EINVAL; /* the uri names no known group */ + + grant.set_group(gid, rgw_perm); + } else { + return -EINVAL; + } + + return 0; +} + +static int parse_acl_header(RGWRados *store, const RGWEnv *env, + const struct s3_acl_header *perm, std::list& _grants) +{ + std::list grantees; + std::string hacl_str; + + const char *hacl = get_acl_header(env, perm); + if (hacl == NULL) + return 0; /* header not present: nothing to add */ + + hacl_str = hacl; + get_str_list(hacl_str, ",", grantees); /* one list item per comma-separated grantee */ + + for (list::iterator it = grantees.begin(); it != grantees.end(); ++it) { + ACLGrant grant; + int ret = parse_grantee_str(store, *it, perm, grant); + if (ret < 0) + return ret; + + _grants.push_back(grant); + } + + return 0; +} + +int RGWAccessControlList_S3::create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl) +{ + acl_user_map.clear(); + grant_map.clear(); + + ACLGrant owner_grant; + + rgw_user bid = bucket_owner.get_id(); + string bname = bucket_owner.get_display_name(); + + /* owner gets full control */ + owner_grant.set_canon(owner.get_id(), owner.get_display_name(), RGW_PERM_FULL_CONTROL); + add_grant(&owner_grant); + + if (canned_acl.size() == 0 || canned_acl.compare("private") == 0) { + return 0; /* empty or private: the owner grant is all there is */ + } + + ACLGrant bucket_owner_grant; + ACLGrant group_grant; + if (canned_acl.compare("public-read") == 0) { + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); + add_grant(&group_grant); + } else if (canned_acl.compare("public-read-write") == 0) { + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); + add_grant(&group_grant); + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_WRITE); + add_grant(&group_grant); + } else if (canned_acl.compare("authenticated-read") == 0) { + group_grant.set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_READ); + add_grant(&group_grant); + } else if (canned_acl.compare("bucket-owner-read") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_READ); /* added only when the bucket owner differs from owner */ + if (bid.compare(owner.get_id()) != 0) + 
add_grant(&bucket_owner_grant); + } else if (canned_acl.compare("bucket-owner-full-control") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_FULL_CONTROL); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); + } else { + return -EINVAL; /* unknown canned ACL name */ + } + + return 0; +} + +int RGWAccessControlList_S3::create_from_grants(std::list& grants) +{ + if (grants.empty()) + return -EINVAL; /* an empty grant list is rejected */ + + acl_user_map.clear(); + grant_map.clear(); + + for (std::list::iterator it = grants.begin(); it != grants.end(); ++it) { + ACLGrant g = *it; + add_grant(&g); + } + + return 0; +} + +bool RGWAccessControlPolicy_S3::xml_end(const char *el) { /* requires both an AccessControlList and an Owner child */ + RGWAccessControlList_S3 *s3acl = + static_cast(find_first("AccessControlList")); + if (!s3acl) + return false; + + acl = *s3acl; + + ACLOwner *owner_p = static_cast(find_first("Owner")); + if (!owner_p) + return false; + owner = *owner_p; + return true; +} + +void RGWAccessControlPolicy_S3::to_xml(ostream& out) { + out << ""; + ACLOwner_S3& _owner = static_cast(owner); + RGWAccessControlList_S3& _acl = static_cast(acl); + _owner.to_xml(out); + _acl.to_xml(out); + out << ""; +} + +static const s3_acl_header acl_header_perms[] = { + {RGW_PERM_READ, "HTTP_X_AMZ_GRANT_READ"}, + {RGW_PERM_WRITE, "HTTP_X_AMZ_GRANT_WRITE"}, + {RGW_PERM_READ_ACP,"HTTP_X_AMZ_GRANT_READ_ACP"}, + {RGW_PERM_WRITE_ACP, "HTTP_X_AMZ_GRANT_WRITE_ACP"}, + {RGW_PERM_FULL_CONTROL, "HTTP_X_AMZ_GRANT_FULL_CONTROL"}, + {0, NULL} /* sentinel: a zero rgw_perm ends the table */ +}; + +int RGWAccessControlPolicy_S3::create_from_headers(RGWRados *store, const RGWEnv *env, ACLOwner& _owner) +{ + std::list grants; + int r = 0; + + for (const struct s3_acl_header *p = acl_header_perms; p->rgw_perm; p++) { + r = parse_acl_header(store, env, p, grants); + if (r < 0) { + return r; + } + } + + RGWAccessControlList_S3& _acl = static_cast(acl); + r = _acl.create_from_grants(grants); /* yields -EINVAL when no grant header produced a grant */ + + owner = _owner; /* set even when r reports an error */ + + return r; +} + +/* + can only be called on object that was parsed + */ +int RGWAccessControlPolicy_S3::rebuild(RGWRados 
*store, ACLOwner *owner, RGWAccessControlPolicy& dest) +{ + if (!owner) + return -EINVAL; + + ACLOwner *requested_owner = static_cast(find_first("Owner")); + if (requested_owner) { + rgw_user& requested_id = requested_owner->get_id(); + if (!requested_id.empty() && requested_id.compare(owner->get_id()) != 0) + return -EPERM; /* the parsed document names a different owner than the one supplied */ + } + + RGWUserInfo owner_info; + if (rgw_get_user_info_by_uid(store, owner->get_id(), owner_info) < 0) { + ldout(cct, 10) << "owner info does not exist" << dendl; + return -EINVAL; + } + ACLOwner& dest_owner = dest.get_owner(); + dest_owner.set_id(owner->get_id()); + dest_owner.set_name(owner_info.display_name); + + ldout(cct, 20) << "owner id=" << owner->get_id() << dendl; + ldout(cct, 20) << "dest owner id=" << dest.get_owner().get_id() << dendl; + + RGWAccessControlList& dst_acl = dest.get_acl(); + + multimap& grant_map = acl.get_grant_map(); + multimap::iterator iter; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant& src_grant = iter->second; + ACLGranteeType& type = src_grant.get_type(); + ACLGrant new_grant; + bool grant_ok = false; + rgw_user uid; + RGWUserInfo grant_user; + switch (type.get_type()) { + case ACL_TYPE_EMAIL_USER: + { + string email; + rgw_user u; + if (!src_grant.get_id(u)) { + ldout(cct, 0) << "ERROR: src_grant.get_id() failed" << dendl; + return -EINVAL; + } + email = u.id; + ldout(cct, 10) << "grant user email=" << email << dendl; + if (rgw_get_user_info_by_email(store, email, grant_user) < 0) { + ldout(cct, 10) << "grant user email not found or other error" << dendl; + return -ERR_UNRESOLVABLE_EMAIL; + } + uid = grant_user.user_id; + } /* no break: falls through so the resolved user is handled as a canonical user */ + case ACL_TYPE_CANON_USER: + { + if (type.get_type() == ACL_TYPE_CANON_USER) { /* the e-mail path has already resolved uid above */ + if (!src_grant.get_id(uid)) { + ldout(cct, 0) << "ERROR: src_grant.get_id() failed" << dendl; + return -EINVAL; + } + } + + if (grant_user.user_id.empty() && rgw_get_user_info_by_uid(store, uid, grant_user) < 0) { + ldout(cct, 10) << "grant user does not exist:" << uid << 
dendl; + return -EINVAL; + } else { + ACLPermission& perm = src_grant.get_permission(); + new_grant.set_canon(uid, grant_user.display_name, perm.get_permissions()); + grant_ok = true; + rgw_user new_id; + new_grant.get_id(new_id); + ldout(cct, 10) << "new grant: " << new_id << ":" << grant_user.display_name << dendl; + } + } + break; + case ACL_TYPE_GROUP: + { + string uri; + if (ACLGrant_S3::group_to_uri(src_grant.get_group(), uri)) { + new_grant = src_grant; + grant_ok = true; + ldout(cct, 10) << "new grant: " << uri << dendl; + } else { + ldout(cct, 10) << "bad grant group:" << (int)src_grant.get_group() << dendl; + return -EINVAL; + } + } /* the group case has no break and runs into the default label, which only breaks */ + default: + break; + } + if (grant_ok) { /* grants of any other type are dropped without error */ + dst_acl.add_grant(&new_grant); + } + } + + return 0; +} + +bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum group) +{ + switch (group) { + case ACL_GROUP_ALL_USERS: + return (id.compare(RGW_USER_ANON_ID) == 0); + case ACL_GROUP_AUTHENTICATED_USERS: + return (id.compare(rgw_uri_auth_users) == 0); + default: + return id.empty(); + } + + // shouldn't get here + return false; +} + +XMLObj *RGWACLXMLParser_S3::alloc_obj(const char *el) +{ + XMLObj * obj = NULL; /* stays NULL for any element name not listed below */ + if (strcmp(el, "AccessControlPolicy") == 0) { + obj = new RGWAccessControlPolicy_S3(cct); + } else if (strcmp(el, "Owner") == 0) { + obj = new ACLOwner_S3(); + } else if (strcmp(el, "AccessControlList") == 0) { + obj = new RGWAccessControlList_S3(cct); + } else if (strcmp(el, "ID") == 0) { + obj = new ACLID_S3(); + } else if (strcmp(el, "DisplayName") == 0) { + obj = new ACLDisplayName_S3(); + } else if (strcmp(el, "Grant") == 0) { + obj = new ACLGrant_S3(); + } else if (strcmp(el, "Grantee") == 0) { + obj = new ACLGrantee_S3(); + } else if (strcmp(el, "Permission") == 0) { + obj = new ACLPermission_S3(); + } else if (strcmp(el, "URI") == 0) { + obj = new ACLURI_S3(); + } else if (strcmp(el, "EmailAddress") == 0) { + obj = new ACLEmail_S3(); + } + + return obj; +} + diff --git a/src/rgw/rgw_acl_s3.h 
b/src/rgw/rgw_acl_s3.h new file mode 100644 index 00000000..41877667 --- /dev/null +++ b/src/rgw/rgw_acl_s3.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_ACL_S3_H +#define CEPH_RGW_ACL_S3_H + +#include +#include +#include +#include + +#include "include/str_list.h" +#include "rgw_xml.h" +#include "rgw_acl.h" + +class RGWRados; + +class ACLPermission_S3 : public ACLPermission, public XMLObj +{ +public: + ACLPermission_S3() {} + ~ACLPermission_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(ostream& out); +}; + +class ACLGrantee_S3 : public ACLGrantee, public XMLObj +{ +public: + ACLGrantee_S3() {} + ~ACLGrantee_S3() override {} + + bool xml_start(const char *el, const char **attr); +}; + + +class ACLGrant_S3 : public ACLGrant, public XMLObj +{ +public: + ACLGrant_S3() {} + ~ACLGrant_S3() override {} + + void to_xml(CephContext *cct, ostream& out); + bool xml_end(const char *el) override; + bool xml_start(const char *el, const char **attr); + + static ACLGroupTypeEnum uri_to_group(string& uri); + static bool group_to_uri(ACLGroupTypeEnum group, string& uri); /* false when the group has no S3 URI */ +}; + +class RGWAccessControlList_S3 : public RGWAccessControlList, public XMLObj +{ +public: + explicit RGWAccessControlList_S3(CephContext *_cct) : RGWAccessControlList(_cct) {} + ~RGWAccessControlList_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(ostream& out); + + int create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl); /* -EINVAL for an unknown canned ACL name */ + int create_from_grants(std::list& grants); /* -EINVAL for an empty list */ +}; + +class ACLOwner_S3 : public ACLOwner, public XMLObj +{ +public: + ACLOwner_S3() {} + ~ACLOwner_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(ostream& out); +}; + +class RGWEnv; + +class RGWAccessControlPolicy_S3 : public RGWAccessControlPolicy, public XMLObj +{ +public: + explicit RGWAccessControlPolicy_S3(CephContext *_cct) : 
RGWAccessControlPolicy(_cct) {} + ~RGWAccessControlPolicy_S3() override {} + + bool xml_end(const char *el) override; + + void to_xml(ostream& out); + int rebuild(RGWRados *store, ACLOwner *owner, RGWAccessControlPolicy& dest); + bool compare_group_name(string& id, ACLGroupTypeEnum group) override; + + virtual int create_canned(ACLOwner& _owner, ACLOwner& bucket_owner, const string& canned_acl) { + RGWAccessControlList_S3& _acl = static_cast(acl); + int ret = _acl.create_canned(_owner, bucket_owner, canned_acl); + owner = _owner; /* recorded even when create_canned() reported an error */ + return ret; + } + int create_from_headers(RGWRados *store, const RGWEnv *env, ACLOwner& _owner); +}; + +/** + * Interfaces with the webserver's XML handling code + * to parse it in a way that makes sense for the rgw. + */ +class RGWACLXMLParser_S3 : public RGWXMLParser +{ + CephContext *cct; + + XMLObj *alloc_obj(const char *el) override; /* creates the parser object for a known element name */ +public: + explicit RGWACLXMLParser_S3(CephContext *_cct) : cct(_cct) {} +}; + +#endif diff --git a/src/rgw/rgw_acl_swift.cc b/src/rgw/rgw_acl_swift.cc new file mode 100644 index 00000000..18a99912 --- /dev/null +++ b/src/rgw/rgw_acl_swift.cc @@ -0,0 +1,430 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include + +#include + +#include "common/ceph_json.h" +#include "rgw_common.h" +#include "rgw_user.h" +#include "rgw_acl_swift.h" + +#define dout_subsys ceph_subsys_rgw + + +#define SWIFT_PERM_READ RGW_PERM_READ_OBJS +#define SWIFT_PERM_WRITE RGW_PERM_WRITE_OBJS +/* FIXME: do we really need separate RW? 
*/ +#define SWIFT_PERM_RWRT (SWIFT_PERM_READ | SWIFT_PERM_WRITE) +#define SWIFT_PERM_ADMIN RGW_PERM_FULL_CONTROL + +#define SWIFT_GROUP_ALL_USERS ".r:*" + +static int parse_list(const char* uid_list, + std::vector& uids) /* out */ +{ /* Splits uid_list on spaces and commas and appends every token to uids; returns -ENOMEM when the working copy cannot be allocated, 0 otherwise. */ + char *s = strdup(uid_list); + if (!s) { + return -ENOMEM; + } + + char *tokctx; + const char *p = strtok_r(s, " ,", &tokctx); + while (p) { + if (*p) { + string acl = p; + uids.push_back(acl); + } + p = strtok_r(NULL, " ,", &tokctx); + } + free(s); + return 0; +} + +static bool is_referrer(const std::string& designator) +{ /* true for the four accepted spellings of the referrer designator */ + return designator.compare(".r") == 0 || + designator.compare(".ref") == 0 || + designator.compare(".referer") == 0 || + designator.compare(".referrer") == 0; +} + +static bool uid_is_public(const string& uid) +{ /* true only for a referrer designator followed by a colon and a lone asterisk */ + if (uid[0] != '.' || uid[1] != 'r') + return false; + + int pos = uid.find(':'); + if (pos < 0 || pos == (int)uid.size()) + return false; + + string sub = uid.substr(0, pos); + string after = uid.substr(pos + 1); + + if (after.compare("*") != 0) + return false; + + return is_referrer(sub); +} + +static boost::optional referrer_to_grant(std::string url_spec, + const uint32_t perm) +{ + /* This function takes url_spec as non-ref std::string because of the trim + * operation that is essential to preserve compliance with Swift. It can't + * be easily accomplished with boost::string_ref. 
*/ + try { + bool is_negative; + ACLGrant grant; + + if ('-' == url_spec[0]) { + url_spec = url_spec.substr(1); + boost::algorithm::trim(url_spec); + + is_negative = true; + } else { + is_negative = false; + } + + if (url_spec != RGW_REFERER_WILDCARD) { + if ('*' == url_spec[0]) { + url_spec = url_spec.substr(1); + boost::algorithm::trim(url_spec); + } + + if (url_spec.empty() || url_spec == ".") { + return boost::none; + } + } else { + /* Please be aware we're specially handling the .r:* in _add_grant() + * of RGWAccessControlList as the S3 API has a similar concept, and + * thus we can have a small portion of compatibility. */ + } + + grant.set_referer(url_spec, is_negative ? 0 : perm); + return grant; + } catch (const std::out_of_range&) { + return boost::none; + } +} + +static ACLGrant user_to_grant(CephContext* const cct, + RGWRados* const store, + const std::string& uid, + const uint32_t perm) +{ + rgw_user user(uid); + RGWUserInfo grant_user; + ACLGrant grant; + + if (rgw_get_user_info_by_uid(store, user, grant_user) < 0) { + ldout(cct, 10) << "grant user does not exist: " << uid << dendl; + /* skipping silently */ + grant.set_canon(user, std::string(), perm); + } else { + grant.set_canon(user, grant_user.display_name, perm); + } + + return grant; +} + +int RGWAccessControlPolicy_SWIFT::add_grants(RGWRados* const store, + const std::vector& uids, + const uint32_t perm) +{ + for (const auto& uid : uids) { + boost::optional grant; + ldout(cct, 20) << "trying to add grant for ACL uid=" << uid << dendl; + + /* Let's check whether the item has a separator potentially indicating + * a special meaning (like an HTTP referral-based grant). */ + const size_t pos = uid.find(':'); + if (std::string::npos == pos) { + /* No, it don't have -- we've got just a regular user identifier. */ + grant = user_to_grant(cct, store, uid, perm); + } else { + /* Yes, *potentially* an HTTP referral. 
*/ + auto designator = uid.substr(0, pos); + auto designatee = uid.substr(pos + 1); + + /* Swift strips whitespaces at both beginning and end. */ + boost::algorithm::trim(designator); + boost::algorithm::trim(designatee); + + if (! boost::algorithm::starts_with(designator, ".")) { + grant = user_to_grant(cct, store, uid, perm); + } else if ((perm & SWIFT_PERM_WRITE) == 0 && is_referrer(designator)) { + /* HTTP referrer-based ACLs aren't acceptable for writes. */ + grant = referrer_to_grant(designatee, perm); + } + } + + if (grant) { + acl.add_grant(&*grant); + } else { + return -EINVAL; + } + } + + return 0; +} + + +int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store, + const rgw_user& id, + const std::string& name, + const char* read_list, + const char* write_list, + uint32_t& rw_mask) +{ + acl.create_default(id, name); + owner.set_id(id); + owner.set_name(name); + rw_mask = 0; + + if (read_list) { + std::vector uids; + int r = parse_list(read_list, uids); + if (r < 0) { + ldout(cct, 0) << "ERROR: parse_list for read returned r=" + << r << dendl; + return r; + } + + r = add_grants(store, uids, SWIFT_PERM_READ); + if (r < 0) { + ldout(cct, 0) << "ERROR: add_grants for read returned r=" + << r << dendl; + return r; + } + rw_mask |= SWIFT_PERM_READ; + } + if (write_list) { + std::vector uids; + int r = parse_list(write_list, uids); + if (r < 0) { + ldout(cct, 0) << "ERROR: parse_list for write returned r=" + << r << dendl; + return r; + } + + r = add_grants(store, uids, SWIFT_PERM_WRITE); + if (r < 0) { + ldout(cct, 0) << "ERROR: add_grants for write returned r=" + << r << dendl; + return r; + } + rw_mask |= SWIFT_PERM_WRITE; + } + return 0; +} + +void RGWAccessControlPolicy_SWIFT::filter_merge(uint32_t rw_mask, + RGWAccessControlPolicy_SWIFT *old) +{ + /* rw_mask&SWIFT_PERM_READ => setting read acl, + * rw_mask&SWIFT_PERM_WRITE => setting write acl + * when bit is cleared, copy matching elements from old. 
+ */ + if (rw_mask == (SWIFT_PERM_READ|SWIFT_PERM_WRITE)) { + return; + } + rw_mask ^= (SWIFT_PERM_READ|SWIFT_PERM_WRITE); + for (auto &iter: old->acl.get_grant_map()) { + ACLGrant& grant = iter.second; + uint32_t perm = grant.get_permission().get_permissions(); + rgw_user id; + string url_spec; + if (!grant.get_id(id)) { + if (grant.get_group() != ACL_GROUP_ALL_USERS) { + url_spec = grant.get_referer(); + if (url_spec.empty()) { + continue; + } + if (perm == 0) { + /* We need to carry also negative, HTTP referrer-based ACLs. */ + perm = SWIFT_PERM_READ; + } + } + } + if (perm & rw_mask) { + acl.add_grant(&grant); + } + } +} + +void RGWAccessControlPolicy_SWIFT::to_str(string& read, string& write) +{ + multimap& m = acl.get_grant_map(); + multimap::iterator iter; + + for (iter = m.begin(); iter != m.end(); ++iter) { + ACLGrant& grant = iter->second; + const uint32_t perm = grant.get_permission().get_permissions(); + rgw_user id; + string url_spec; + if (!grant.get_id(id)) { + if (grant.get_group() == ACL_GROUP_ALL_USERS) { + id = SWIFT_GROUP_ALL_USERS; + } else { + url_spec = grant.get_referer(); + if (url_spec.empty()) { + continue; + } + id = (perm != 0) ? 
".r:" + url_spec : ".r:-" + url_spec; + } + } + if (perm & SWIFT_PERM_READ) { + if (!read.empty()) { + read.append(","); + } + read.append(id.to_str()); + } else if (perm & SWIFT_PERM_WRITE) { + if (!write.empty()) { + write.append(","); + } + write.append(id.to_str()); + } else if (perm == 0 && !url_spec.empty()) { + /* only X-Container-Read headers support referers */ + if (!read.empty()) { + read.append(","); + } + read.append(id.to_str()); + } + } +} + +void RGWAccessControlPolicy_SWIFTAcct::add_grants(RGWRados * const store, + const std::vector& uids, + const uint32_t perm) +{ + for (const auto& uid : uids) { + ACLGrant grant; + RGWUserInfo grant_user; + + if (uid_is_public(uid)) { + grant.set_group(ACL_GROUP_ALL_USERS, perm); + acl.add_grant(&grant); + } else { + rgw_user user(uid); + + if (rgw_get_user_info_by_uid(store, user, grant_user) < 0) { + ldout(cct, 10) << "grant user does not exist:" << uid << dendl; + /* skipping silently */ + grant.set_canon(user, std::string(), perm); + acl.add_grant(&grant); + } else { + grant.set_canon(user, grant_user.display_name, perm); + acl.add_grant(&grant); + } + } + } +} + +bool RGWAccessControlPolicy_SWIFTAcct::create(RGWRados * const store, + const rgw_user& id, + const std::string& name, + const std::string& acl_str) +{ + acl.create_default(id, name); + owner.set_id(id); + owner.set_name(name); + + JSONParser parser; + + if (!parser.parse(acl_str.c_str(), acl_str.length())) { + ldout(cct, 0) << "ERROR: JSONParser::parse returned error=" << dendl; + return false; + } + + JSONObjIter iter = parser.find_first("admin"); + if (!iter.end() && (*iter)->is_array()) { + std::vector admin; + decode_json_obj(admin, *iter); + ldout(cct, 0) << "admins: " << admin << dendl; + + add_grants(store, admin, SWIFT_PERM_ADMIN); + } + + iter = parser.find_first("read-write"); + if (!iter.end() && (*iter)->is_array()) { + std::vector readwrite; + decode_json_obj(readwrite, *iter); + ldout(cct, 0) << "read-write: " << readwrite << dendl; + 
+ add_grants(store, readwrite, SWIFT_PERM_RWRT); + } + + iter = parser.find_first("read-only"); + if (!iter.end() && (*iter)->is_array()) { + std::vector readonly; + decode_json_obj(readonly, *iter); + ldout(cct, 0) << "read-only: " << readonly << dendl; + + add_grants(store, readonly, SWIFT_PERM_READ); + } + + return true; +} + +boost::optional RGWAccessControlPolicy_SWIFTAcct::to_str() const +{ + std::vector admin; + std::vector readwrite; + std::vector readonly; + + /* Parition the grant map into three not-overlapping groups. */ + for (const auto& item : get_acl().get_grant_map()) { + const ACLGrant& grant = item.second; + const uint32_t perm = grant.get_permission().get_permissions(); + + rgw_user id; + if (!grant.get_id(id)) { + if (grant.get_group() != ACL_GROUP_ALL_USERS) { + continue; + } + id = SWIFT_GROUP_ALL_USERS; + } else if (owner.get_id() == id) { + continue; + } + + if (SWIFT_PERM_ADMIN == (perm & SWIFT_PERM_ADMIN)) { + admin.insert(admin.end(), id.to_str()); + } else if (SWIFT_PERM_RWRT == (perm & SWIFT_PERM_RWRT)) { + readwrite.insert(readwrite.end(), id.to_str()); + } else if (SWIFT_PERM_READ == (perm & SWIFT_PERM_READ)) { + readonly.insert(readonly.end(), id.to_str()); + } else { + // FIXME: print a warning + } + } + + /* If there is no grant to serialize, let's exit earlier to not return + * an empty JSON object which brakes the functional tests of Swift. */ + if (admin.empty() && readwrite.empty() && readonly.empty()) { + return boost::none; + } + + /* Serialize the groups. 
*/ + JSONFormatter formatter; + + formatter.open_object_section("acl"); + if (!readonly.empty()) { + encode_json("read-only", readonly, &formatter); + } + if (!readwrite.empty()) { + encode_json("read-write", readwrite, &formatter); + } + if (!admin.empty()) { + encode_json("admin", admin, &formatter); + } + formatter.close_section(); + + std::ostringstream oss; + formatter.flush(oss); + + return oss.str(); +} diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h new file mode 100644 index 00000000..f5365b04 --- /dev/null +++ b/src/rgw/rgw_acl_swift.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_ACL_SWIFT_H +#define CEPH_RGW_ACL_SWIFT_H + +#include +#include +#include +#include + +#include + +#include "rgw_acl.h" + +class RGWAccessControlPolicy_SWIFT : public RGWAccessControlPolicy +{ + int add_grants(RGWRados *store, + const std::vector& uids, + uint32_t perm); + +public: + explicit RGWAccessControlPolicy_SWIFT(CephContext* const cct) + : RGWAccessControlPolicy(cct) { + } + ~RGWAccessControlPolicy_SWIFT() override = default; + + int create(RGWRados *store, + const rgw_user& id, + const std::string& name, + const char* read_list, + const char* write_list, + uint32_t& rw_mask); + void filter_merge(uint32_t mask, RGWAccessControlPolicy_SWIFT *policy); + void to_str(std::string& read, std::string& write); +}; + +class RGWAccessControlPolicy_SWIFTAcct : public RGWAccessControlPolicy +{ +public: + explicit RGWAccessControlPolicy_SWIFTAcct(CephContext * const cct) + : RGWAccessControlPolicy(cct) { + } + ~RGWAccessControlPolicy_SWIFTAcct() override {} + + void add_grants(RGWRados *store, + const std::vector& uids, + uint32_t perm); + bool create(RGWRados *store, + const rgw_user& id, + const std::string& name, + const std::string& acl_str); + boost::optional to_str() const; +}; +#endif diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc new file mode 100644 index 
00000000..675cce34 --- /dev/null +++ b/src/rgw/rgw_admin.cc @@ -0,0 +1,8463 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +#include + +extern "C" { +#include +} + +#include "auth/Crypto.h" +#include "compressor/Compressor.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" +#include "common/safe_io.h" + +#include "include/util.h" + +#include "cls/rgw/cls_rgw_types.h" +#include "cls/rgw/cls_rgw_client.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_otp.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_lc.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_orphan.h" +#include "rgw_sync.h" +#include "rgw_sync_log_trim.h" +#include "rgw_data_sync.h" +#include "rgw_rest_conn.h" +#include "rgw_realm_watcher.h" +#include "rgw_role.h" +#include "rgw_reshard.h" +#include "rgw_http_client_curl.h" +#include "rgw_zone.h" +#include "rgw_pubsub.h" +#include "rgw_sync_module_pubsub.h" + +#include "services/svc_sync_modules.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +#define SECRET_KEY_LEN 40 +#define PUBLIC_ID_LEN 20 + +static RGWRados *store = NULL; + +static const DoutPrefixProvider* dpp() { + struct GlobalPrefix : public DoutPrefixProvider { + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const override { return dout_subsys; } + std::ostream& gen_prefix(std::ostream& out) const override { return out; } + }; + static GlobalPrefix global_dpp; + return &global_dpp; +} + +void usage() +{ + cout << "usage: radosgw-admin [options...]" << std::endl; + cout << "commands:\n"; + cout << " 
user create create a new user\n" ; + cout << " user modify modify user\n"; + cout << " user info get user info\n"; + cout << " user rm remove user\n"; + cout << " user suspend suspend a user\n"; + cout << " user enable re-enable user after suspension\n"; + cout << " user check check user info\n"; + cout << " user stats show user stats as accounted by quota subsystem\n"; + cout << " user list list users\n"; + cout << " caps add add user capabilities\n"; + cout << " caps rm remove user capabilities\n"; + cout << " subuser create create a new subuser\n" ; + cout << " subuser modify modify subuser\n"; + cout << " subuser rm remove subuser\n"; + cout << " key create create access key\n"; + cout << " key rm remove access key\n"; + cout << " bucket list list buckets (specify --allow-unordered for\n"; + cout << " faster, unsorted listing)\n"; + cout << " bucket limit check show bucket sharding stats\n"; + cout << " bucket link link bucket to specified user\n"; + cout << " bucket unlink unlink bucket from specified user\n"; + cout << " bucket stats returns bucket statistics\n"; + cout << " bucket rm remove bucket\n"; + cout << " bucket check check bucket index\n"; + cout << " bucket reshard reshard bucket\n"; + cout << " bucket rewrite rewrite all objects in the specified bucket\n"; + cout << " bucket sync disable disable bucket sync\n"; + cout << " bucket sync enable enable bucket sync\n"; + cout << " bucket radoslist list rados objects backing bucket's objects\n"; + cout << " bi get retrieve bucket index object entries\n"; + cout << " bi put store bucket index object entries\n"; + cout << " bi list list raw bucket index entries\n"; + cout << " bi purge purge bucket index entries\n"; + cout << " object rm remove object\n"; + cout << " object put put object\n"; + cout << " object stat stat an object for its metadata\n"; + cout << " object unlink unlink object from bucket index\n"; + cout << " object rewrite rewrite the specified object\n"; + cout << " objects expire run 
expired objects cleanup\n"; + cout << " objects expire-stale list list stale expired objects (caused by reshard)\n"; + cout << " objects expire-stale rm remove stale expired objects\n"; + cout << " period rm remove a period\n"; + cout << " period get get period info\n"; + cout << " period get-current get current period info\n"; + cout << " period pull pull a period\n"; + cout << " period push push a period\n"; + cout << " period list list all periods\n"; + cout << " period update update the staging period\n"; + cout << " period commit commit the staging period\n"; + cout << " quota set set quota params\n"; + cout << " quota enable enable quota\n"; + cout << " quota disable disable quota\n"; + cout << " global quota get view global quota params\n"; + cout << " global quota set set global quota params\n"; + cout << " global quota enable enable a global quota\n"; + cout << " global quota disable disable a global quota\n"; + cout << " realm create create a new realm\n"; + cout << " realm rm remove a realm\n"; + cout << " realm get show realm info\n"; + cout << " realm get-default get default realm name\n"; + cout << " realm list list realms\n"; + cout << " realm list-periods list all realm periods\n"; + cout << " realm rename rename a realm\n"; + cout << " realm set set realm info (requires infile)\n"; + cout << " realm default set realm as default\n"; + cout << " realm pull pull a realm and its current period\n"; + cout << " zonegroup add add a zone to a zonegroup\n"; + cout << " zonegroup create create a new zone group info\n"; + cout << " zonegroup default set default zone group\n"; + cout << " zonegroup rm remove a zone group info\n"; + cout << " zonegroup get show zone group info\n"; + cout << " zonegroup modify modify an existing zonegroup\n"; + cout << " zonegroup set set zone group info (requires infile)\n"; + cout << " zonegroup rm remove a zone from a zonegroup\n"; + cout << " zonegroup rename rename a zone group\n"; + cout << " zonegroup list list all zone 
groups set on this cluster\n"; + cout << " zonegroup placement list list zonegroup's placement targets\n"; + cout << " zonegroup placement get get a placement target of a specific zonegroup\n"; + cout << " zonegroup placement add add a placement target id to a zonegroup\n"; + cout << " zonegroup placement modify modify a placement target of a specific zonegroup\n"; + cout << " zonegroup placement rm remove a placement target from a zonegroup\n"; + cout << " zonegroup placement default set a zonegroup's default placement target\n"; + cout << " zone create create a new zone\n"; + cout << " zone rm remove a zone\n"; + cout << " zone get show zone cluster params\n"; + cout << " zone modify modify an existing zone\n"; + cout << " zone set set zone cluster params (requires infile)\n"; + cout << " zone list list all zones set on this cluster\n"; + cout << " zone rename rename a zone\n"; + cout << " zone placement list list zone's placement targets\n"; + cout << " zone placement get get a zone placement target\n"; + cout << " zone placement add add a zone placement target\n"; + cout << " zone placement modify modify a zone placement target\n"; + cout << " zone placement rm remove a zone placement target\n"; + cout << " metadata sync status get metadata sync status\n"; + cout << " metadata sync init init metadata sync\n"; + cout << " metadata sync run run metadata sync\n"; + cout << " data sync status get data sync status of the specified source zone\n"; + cout << " data sync init init data sync for the specified source zone\n"; + cout << " data sync run run data sync for the specified source zone\n"; + cout << " pool add add an existing pool for data placement\n"; + cout << " pool rm remove an existing pool from data placement set\n"; + cout << " pools list list placement active set\n"; + cout << " policy read bucket/object policy\n"; + cout << " log list list log objects\n"; + cout << " log show dump a log from specific object or (bucket + date\n"; + cout << " + 
bucket-id)\n"; + cout << " (NOTE: required to specify formatting of date\n"; + cout << " to \"YYYY-MM-DD-hh\")\n"; + cout << " log rm remove log object\n"; + cout << " usage show show usage (by user, by bucket, date range)\n"; + cout << " usage trim trim usage (by user, by bucket, date range)\n"; + cout << " usage clear reset all the usage stats for the cluster\n"; + cout << " gc list dump expired garbage collection objects (specify\n"; + cout << " --include-all to list all entries, including unexpired)\n"; + cout << " gc process manually process garbage (specify\n"; + cout << " --include-all to process all entries, including unexpired)\n"; + cout << " lc list list all bucket lifecycle progress\n"; + cout << " lc get get a lifecycle bucket configuration\n"; + cout << " lc process manually process lifecycle\n"; + cout << " lc reshard fix fix LC for a resharded bucket\n"; + cout << " metadata get get metadata info\n"; + cout << " metadata put put metadata info\n"; + cout << " metadata rm remove metadata info\n"; + cout << " metadata list list metadata info\n"; + cout << " mdlog list list metadata log\n"; + cout << " mdlog trim trim metadata log (use start-date, end-date or\n"; + cout << " start-marker, end-marker)\n"; + cout << " mdlog status read metadata log status\n"; + cout << " bilog list list bucket index log\n"; + cout << " bilog trim trim bucket index log (use start-marker, end-marker)\n"; + cout << " datalog list list data log\n"; + cout << " datalog trim trim data log\n"; + cout << " datalog status read data log status\n"; + cout << " orphans find init and run search for leaked rados objects (use job-id, pool)\n"; + cout << " orphans finish clean up search for leaked rados objects\n"; + cout << " orphans list-jobs list the current job-ids for orphans search\n"; + cout << " role create create a AWS role for use with STS\n"; + cout << " role rm remove a role\n"; + cout << " role get get a role\n"; + cout << " role list list roles with specified path 
prefix\n"; + cout << " role modify modify the assume role policy of an existing role\n"; + cout << " role-policy put add/update permission policy to role\n"; + cout << " role-policy list list policies attached to a role\n"; + cout << " role-policy get get the specified inline policy document embedded with the given role\n"; + cout << " role-policy rm remove policy attached to a role\n"; + cout << " reshard add schedule a resharding of a bucket\n"; + cout << " reshard list list all bucket resharding or scheduled to be resharded\n"; + cout << " reshard status read bucket resharding status\n"; + cout << " reshard process process of scheduled reshard jobs\n"; + cout << " reshard cancel cancel resharding a bucket\n"; + cout << " reshard stale-instances list list stale-instances from bucket resharding\n"; + cout << " reshard stale-instances rm cleanup stale-instances from bucket resharding\n"; + cout << " sync error list list sync error\n"; + cout << " sync error trim trim sync error\n"; + cout << " mfa create create a new MFA TOTP token\n"; + cout << " mfa list list MFA TOTP tokens\n"; + cout << " mfa get show MFA TOTP token\n"; + cout << " mfa remove delete MFA TOTP token\n"; + cout << " mfa check check MFA TOTP token\n"; + cout << " mfa resync re-sync MFA TOTP token\n"; + cout << "options:\n"; + cout << " --tenant= tenant name\n"; + cout << " --uid= user id\n"; + cout << " --subuser= subuser name\n"; + cout << " --access-key= S3 access key\n"; + cout << " --email= user's email address\n"; + cout << " --secret/--secret-key=\n"; + cout << " specify secret key\n"; + cout << " --gen-access-key generate random access key (for S3)\n"; + cout << " --gen-secret generate random secret key\n"; + cout << " --key-type= key type, options are: swift, s3\n"; + cout << " --temp-url-key[-2]= temp url key\n"; + cout << " --access= Set access permissions for sub-user, should be one\n"; + cout << " of read, write, readwrite, full\n"; + cout << " --display-name= user's display name\n"; + 
cout << " --max-buckets max number of buckets for a user\n"; + cout << " --admin set the admin flag on the user\n"; + cout << " --system set the system flag on the user\n"; + cout << " --op-mask set the op mask on the user\n"; + cout << " --bucket= Specify the bucket name. Also used by the quota command.\n"; + cout << " --pool= Specify the pool name. Also used to scan for leaked rados objects.\n"; + cout << " --object= object name\n"; + cout << " --date= date in the format yyyy-mm-dd\n"; + cout << " --start-date= start date in the format yyyy-mm-dd\n"; + cout << " --end-date= end date in the format yyyy-mm-dd\n"; + cout << " --bucket-id= bucket id\n"; + cout << " --shard-id= optional for: \n"; + cout << " mdlog list\n"; + cout << " data sync status\n"; + cout << " required for: \n"; + cout << " mdlog trim\n"; + cout << " --max-entries= max entries for listing operations\n"; + cout << " --metadata-key= key to retrieve metadata from with metadata get\n"; + cout << " --remote= zone or zonegroup id of remote gateway\n"; + cout << " --period= period id\n"; + cout << " --url= url for pushing/pulling period/realm\n"; + cout << " --epoch= period epoch\n"; + cout << " --commit commit the period during 'period update'\n"; + cout << " --staging get staging period info\n"; + cout << " --master set as master\n"; + cout << " --master-zone= master zone id\n"; + cout << " --rgw-realm= realm name\n"; + cout << " --realm-id= realm id\n"; + cout << " --realm-new-name= realm new name\n"; + cout << " --rgw-zonegroup= zonegroup name\n"; + cout << " --zonegroup-id= zonegroup id\n"; + cout << " --zonegroup-new-name=\n"; + cout << " zonegroup new name\n"; + cout << " --rgw-zone= name of zone in which radosgw is running\n"; + cout << " --zone-id= zone id\n"; + cout << " --zone-new-name= zone new name\n"; + cout << " --source-zone specify the source zone (for data sync)\n"; + cout << " --default set entity (realm, zonegroup, zone) as default\n"; + cout << " --read-only set zone as read-only 
(when adding to zonegroup)\n"; + cout << " --redirect-zone specify zone id to redirect when response is 404 (not found)\n"; + cout << " --placement-id placement id for zonegroup placement commands\n"; + cout << " --storage-class storage class for zonegroup placement commands\n"; + cout << " --tags= list of tags for zonegroup placement add and modify commands\n"; + cout << " --tags-add= list of tags to add for zonegroup placement modify command\n"; + cout << " --tags-rm= list of tags to remove for zonegroup placement modify command\n"; + cout << " --endpoints= zone endpoints\n"; + cout << " --index-pool= placement target index pool\n"; + cout << " --data-pool= placement target data pool\n"; + cout << " --data-extra-pool= placement target data extra (non-ec) pool\n"; + cout << " --placement-index-type=\n"; + cout << " placement target index type (normal, indexless, or #id)\n"; + cout << " --compression= placement target compression type (plugin name or empty/none)\n"; + cout << " --tier-type= zone tier type\n"; + cout << " --tier-config==[,...]\n"; + cout << " set zone tier config keys, values\n"; + cout << " --tier-config-rm=[,...]\n"; + cout << " unset zone tier config keys\n"; + cout << " --sync-from-all[=false] set/reset whether zone syncs from all zonegroup peers\n"; + cout << " --sync-from=[zone-name][,...]\n"; + cout << " set list of zones to sync from\n"; + cout << " --sync-from-rm=[zone-name][,...]\n"; + cout << " remove zones from list of zones to sync from\n"; + cout << " --fix besides checking bucket index, will also fix it\n"; + cout << " --check-objects bucket check: rebuilds bucket index according to\n"; + cout << " actual objects state\n"; + cout << " --format= specify output format for certain operations: xml,\n"; + cout << " json\n"; + cout << " --purge-data when specified, user removal will also purge all the\n"; + cout << " user data\n"; + cout << " --purge-keys when specified, subuser removal will also purge all the\n"; + cout << " subuser 
keys\n"; + cout << " --purge-objects remove a bucket's objects before deleting it\n"; + cout << " (NOTE: required to delete a non-empty bucket)\n"; + cout << " --sync-stats option to 'user stats', update user stats with current\n"; + cout << " stats reported by user's buckets indexes\n"; + cout << " --reset-stats option to 'user stats', reset stats in accordance with user buckets\n"; + cout << " --show-log-entries= enable/disable dump of log entries on log show\n"; + cout << " --show-log-sum= enable/disable dump of log summation on log show\n"; + cout << " --skip-zero-entries log show only dumps entries that don't have zero value\n"; + cout << " in one of the numeric field\n"; + cout << " --infile= specify a file to read in when setting data\n"; + cout << " --categories= comma separated list of categories, used in usage show\n"; + cout << " --caps= list of caps (e.g., \"usage=read, write; user=read\")\n"; + cout << " --yes-i-really-mean-it required for certain operations\n"; + cout << " --warnings-only when specified with bucket limit check, list\n"; + cout << " only buckets nearing or over the current max\n"; + cout << " objects per shard value\n"; + cout << " --bypass-gc when specified with bucket deletion, triggers\n"; + cout << " object deletions by not involving GC\n"; + cout << " --inconsistent-index when specified with bucket deletion and bypass-gc set to true,\n"; + cout << " ignores bucket index consistency\n"; + cout << " --min-rewrite-size min object size for bucket rewrite (default 4M)\n"; + cout << " --max-rewrite-size max object size for bucket rewrite (default ULLONG_MAX)\n"; + cout << " --min-rewrite-stripe-size min stripe size for object rewrite (default 0)\n"; + cout << " --trim-delay-ms time interval in msec to limit the frequency of sync error log entries trimming operations,\n"; + cout << " the trimming process will sleep the specified msec for every 1000 entries trimmed\n"; + cout << "\n"; + cout << " := \"YYYY-MM-DD[ hh:mm:ss]\"\n"; + cout << 
"\nQuota options:\n"; + cout << " --max-objects specify max objects (negative value to disable)\n"; + cout << " --max-size specify max size (in B/K/M/G/T, negative value to disable)\n"; + cout << " --quota-scope scope of quota (bucket, user)\n"; + cout << "\nOrphans search options:\n"; + cout << " --num-shards num of shards to use for keeping the temporary scan info\n"; + cout << " --orphan-stale-secs num of seconds to wait before declaring an object to be an orphan (default: 86400)\n"; + cout << " --job-id set the job id (for orphans find)\n"; + cout << " --max-concurrent-ios maximum concurrent ios for orphans find (default: 32)\n"; + cout << " --detail detailed mode, log and stat head objects as well\n"; + cout << "\nOrphans list-jobs options:\n"; + cout << " --extra-info provide extra info in job list\n"; + cout << "\nRole options:\n"; + cout << " --role-name name of the role to create\n"; + cout << " --path path to the role\n"; + cout << " --assume-role-policy-doc the trust relationship policy document that grants an entity permission to assume the role\n"; + cout << " --policy-name name of the policy document\n"; + cout << " --policy-doc permission policy document\n"; + cout << " --path-prefix path prefix for filtering roles\n"; + cout << "\nMFA options:\n"; + cout << " --totp-serial a string that represents the ID of a TOTP token\n"; + cout << " --totp-seed the secret seed that is used to calculate the TOTP\n"; + cout << " --totp-seconds the time resolution that is being used for TOTP generation\n"; + cout << " --totp-window the number of TOTP tokens that are checked before and after the current token when validating token\n"; + cout << " --totp-pin the valid value of a TOTP token at a certain time\n"; + cout << "\n"; + generic_client_usage(); +} + +enum { + OPT_NO_CMD = 0, + OPT_USER_CREATE, + OPT_USER_INFO, + OPT_USER_MODIFY, + OPT_USER_RM, + OPT_USER_SUSPEND, + OPT_USER_ENABLE, + OPT_USER_CHECK, + OPT_USER_STATS, + OPT_USER_LIST, + OPT_SUBUSER_CREATE, + 
OPT_SUBUSER_MODIFY, + OPT_SUBUSER_RM, + OPT_KEY_CREATE, + OPT_KEY_RM, + OPT_BUCKETS_LIST, + OPT_BUCKET_LIMIT_CHECK, + OPT_BUCKET_LINK, + OPT_BUCKET_UNLINK, + OPT_BUCKET_STATS, + OPT_BUCKET_CHECK, + OPT_BUCKET_SYNC_STATUS, + OPT_BUCKET_SYNC_MARKERS, + OPT_BUCKET_SYNC_INIT, + OPT_BUCKET_SYNC_RUN, + OPT_BUCKET_SYNC_DISABLE, + OPT_BUCKET_SYNC_ENABLE, + OPT_BUCKET_RM, + OPT_BUCKET_REWRITE, + OPT_BUCKET_RESHARD, + OPT_BUCKET_CHOWN, + OPT_BUCKET_RADOS_LIST, + OPT_POLICY, + OPT_POOL_ADD, + OPT_POOL_RM, + OPT_POOLS_LIST, + OPT_LOG_LIST, + OPT_LOG_SHOW, + OPT_LOG_RM, + OPT_USAGE_SHOW, + OPT_USAGE_TRIM, + OPT_USAGE_CLEAR, + OPT_OBJECT_PUT, + OPT_OBJECT_RM, + OPT_OBJECT_UNLINK, + OPT_OBJECT_STAT, + OPT_OBJECT_REWRITE, + OPT_OBJECTS_EXPIRE, + OPT_OBJECTS_EXPIRE_STALE_LIST, + OPT_OBJECTS_EXPIRE_STALE_RM, + OPT_BI_GET, + OPT_BI_PUT, + OPT_BI_LIST, + OPT_BI_PURGE, + OPT_OLH_GET, + OPT_OLH_READLOG, + OPT_QUOTA_SET, + OPT_QUOTA_ENABLE, + OPT_QUOTA_DISABLE, + OPT_GC_LIST, + OPT_GC_PROCESS, + OPT_LC_LIST, + OPT_LC_GET, + OPT_LC_PROCESS, + OPT_LC_RESHARD_FIX, + OPT_ORPHANS_FIND, + OPT_ORPHANS_FINISH, + OPT_ORPHANS_LIST_JOBS, + OPT_ZONEGROUP_ADD, + OPT_ZONEGROUP_CREATE, + OPT_ZONEGROUP_DEFAULT, + OPT_ZONEGROUP_DELETE, + OPT_ZONEGROUP_GET, + OPT_ZONEGROUP_MODIFY, + OPT_ZONEGROUP_SET, + OPT_ZONEGROUP_LIST, + OPT_ZONEGROUP_REMOVE, + OPT_ZONEGROUP_RENAME, + OPT_ZONEGROUP_PLACEMENT_ADD, + OPT_ZONEGROUP_PLACEMENT_MODIFY, + OPT_ZONEGROUP_PLACEMENT_RM, + OPT_ZONEGROUP_PLACEMENT_LIST, + OPT_ZONEGROUP_PLACEMENT_GET, + OPT_ZONEGROUP_PLACEMENT_DEFAULT, + OPT_ZONE_CREATE, + OPT_ZONE_DELETE, + OPT_ZONE_GET, + OPT_ZONE_MODIFY, + OPT_ZONE_SET, + OPT_ZONE_LIST, + OPT_ZONE_RENAME, + OPT_ZONE_DEFAULT, + OPT_ZONE_PLACEMENT_ADD, + OPT_ZONE_PLACEMENT_MODIFY, + OPT_ZONE_PLACEMENT_RM, + OPT_ZONE_PLACEMENT_LIST, + OPT_ZONE_PLACEMENT_GET, + OPT_CAPS_ADD, + OPT_CAPS_RM, + OPT_METADATA_GET, + OPT_METADATA_PUT, + OPT_METADATA_RM, + OPT_METADATA_LIST, + OPT_METADATA_SYNC_STATUS, + OPT_METADATA_SYNC_INIT, + 
OPT_METADATA_SYNC_RUN, + OPT_MDLOG_LIST, + OPT_MDLOG_AUTOTRIM, + OPT_MDLOG_TRIM, + OPT_MDLOG_FETCH, + OPT_MDLOG_STATUS, + OPT_SYNC_ERROR_LIST, + OPT_SYNC_ERROR_TRIM, + OPT_BILOG_LIST, + OPT_BILOG_TRIM, + OPT_BILOG_STATUS, + OPT_BILOG_AUTOTRIM, + OPT_DATA_SYNC_STATUS, + OPT_DATA_SYNC_INIT, + OPT_DATA_SYNC_RUN, + OPT_DATALOG_LIST, + OPT_DATALOG_STATUS, + OPT_DATALOG_AUTOTRIM, + OPT_DATALOG_TRIM, + OPT_REALM_CREATE, + OPT_REALM_DELETE, + OPT_REALM_GET, + OPT_REALM_GET_DEFAULT, + OPT_REALM_LIST, + OPT_REALM_LIST_PERIODS, + OPT_REALM_RENAME, + OPT_REALM_SET, + OPT_REALM_DEFAULT, + OPT_REALM_PULL, + OPT_PERIOD_DELETE, + OPT_PERIOD_GET, + OPT_PERIOD_GET_CURRENT, + OPT_PERIOD_PULL, + OPT_PERIOD_PUSH, + OPT_PERIOD_LIST, + OPT_PERIOD_UPDATE, + OPT_PERIOD_COMMIT, + OPT_GLOBAL_QUOTA_GET, + OPT_GLOBAL_QUOTA_SET, + OPT_GLOBAL_QUOTA_ENABLE, + OPT_GLOBAL_QUOTA_DISABLE, + OPT_SYNC_STATUS, + OPT_ROLE_CREATE, + OPT_ROLE_DELETE, + OPT_ROLE_GET, + OPT_ROLE_MODIFY, + OPT_ROLE_LIST, + OPT_ROLE_POLICY_PUT, + OPT_ROLE_POLICY_LIST, + OPT_ROLE_POLICY_GET, + OPT_ROLE_POLICY_DELETE, + OPT_RESHARD_ADD, + OPT_RESHARD_LIST, + OPT_RESHARD_STATUS, + OPT_RESHARD_PROCESS, + OPT_RESHARD_CANCEL, + OPT_MFA_CREATE, + OPT_MFA_REMOVE, + OPT_MFA_GET, + OPT_MFA_LIST, + OPT_MFA_CHECK, + OPT_MFA_RESYNC, + OPT_RESHARD_STALE_INSTANCES_LIST, + OPT_RESHARD_STALE_INSTANCES_DELETE, + OPT_PUBSUB_TOPICS_LIST, + OPT_PUBSUB_TOPIC_CREATE, + OPT_PUBSUB_TOPIC_GET, + OPT_PUBSUB_TOPIC_RM, + OPT_PUBSUB_NOTIFICATION_CREATE, + OPT_PUBSUB_NOTIFICATION_RM, + OPT_PUBSUB_SUB_GET, + OPT_PUBSUB_SUB_CREATE, + OPT_PUBSUB_SUB_RM, + OPT_PUBSUB_SUB_PULL, + OPT_PUBSUB_EVENT_RM, +}; + +static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_cmd, bool *need_more) +{ + using ceph::util::match_str; + + *need_more = false; + // NOTE: please keep the checks in alphabetical order !!! 
+ if (strcmp(cmd, "bi") == 0 || + strcmp(cmd, "bilog") == 0 || + strcmp(cmd, "buckets") == 0 || + strcmp(cmd, "caps") == 0 || + strcmp(cmd, "data") == 0 || + strcmp(cmd, "datalog") == 0 || + strcmp(cmd, "error") == 0 || + strcmp(cmd, "event") == 0 || + strcmp(cmd, "expire-stale") == 0 || + strcmp(cmd, "gc") == 0 || + strcmp(cmd, "global") == 0 || + strcmp(cmd, "key") == 0 || + strcmp(cmd, "log") == 0 || + strcmp(cmd, "lc") == 0 || + strcmp(cmd, "mdlog") == 0 || + strcmp(cmd, "metadata") == 0 || + strcmp(cmd, "mfa") == 0 || + strcmp(cmd, "notification") == 0 || + strcmp(cmd, "object") == 0 || + strcmp(cmd, "objects") == 0 || + strcmp(cmd, "olh") == 0 || + strcmp(cmd, "orphans") == 0 || + strcmp(cmd, "period") == 0 || + strcmp(cmd, "placement") == 0 || + strcmp(cmd, "pool") == 0 || + strcmp(cmd, "pools") == 0 || + strcmp(cmd, "pubsub") == 0 || + strcmp(cmd, "quota") == 0 || + strcmp(cmd, "realm") == 0 || + strcmp(cmd, "role") == 0 || + strcmp(cmd, "role-policy") == 0 || + strcmp(cmd, "stale-instances") == 0 || + strcmp(cmd, "sub") == 0 || + strcmp(cmd, "subuser") == 0 || + strcmp(cmd, "sync") == 0 || + strcmp(cmd, "topic") == 0 || + strcmp(cmd, "topics") == 0 || + strcmp(cmd, "usage") == 0 || + strcmp(cmd, "user") == 0 || + strcmp(cmd, "zone") == 0 || + strcmp(cmd, "zonegroup") == 0 || + strcmp(cmd, "zonegroups") == 0) { + *need_more = true; + return 0; + } + + /* + * can do both radosgw-admin bucket reshard, and radosgw-admin reshard bucket + */ + if (strcmp(cmd, "reshard") == 0 && + !(prev_cmd && strcmp(prev_cmd, "bucket") == 0)) { + *need_more = true; + return 0; + } + if (strcmp(cmd, "bucket") == 0 && + !(prev_cmd && strcmp(prev_cmd, "reshard") == 0)) { + *need_more = true; + return 0; + } + + if (strcmp(cmd, "policy") == 0) + return OPT_POLICY; + + if (!prev_cmd) + return -EINVAL; + + if (strcmp(prev_cmd, "user") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_USER_CREATE; + if (strcmp(cmd, "info") == 0) + return OPT_USER_INFO; + if (strcmp(cmd, "modify") 
== 0) + return OPT_USER_MODIFY; + if (strcmp(cmd, "rm") == 0) + return OPT_USER_RM; + if (strcmp(cmd, "suspend") == 0) + return OPT_USER_SUSPEND; + if (strcmp(cmd, "enable") == 0) + return OPT_USER_ENABLE; + if (strcmp(cmd, "check") == 0) + return OPT_USER_CHECK; + if (strcmp(cmd, "stats") == 0) + return OPT_USER_STATS; + if (strcmp(cmd, "list") == 0) + return OPT_USER_LIST; + } else if (strcmp(prev_cmd, "subuser") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_SUBUSER_CREATE; + if (strcmp(cmd, "modify") == 0) + return OPT_SUBUSER_MODIFY; + if (strcmp(cmd, "rm") == 0) + return OPT_SUBUSER_RM; + } else if (strcmp(prev_cmd, "key") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_KEY_CREATE; + if (strcmp(cmd, "rm") == 0) + return OPT_KEY_RM; + } else if (strcmp(prev_cmd, "buckets") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_BUCKETS_LIST; + } else if (strcmp(prev_cmd, "bucket") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_BUCKETS_LIST; + if (strcmp(cmd, "link") == 0) + return OPT_BUCKET_LINK; + if (strcmp(cmd, "unlink") == 0) + return OPT_BUCKET_UNLINK; + if (strcmp(cmd, "stats") == 0) + return OPT_BUCKET_STATS; + if (strcmp(cmd, "rm") == 0) + return OPT_BUCKET_RM; + if (strcmp(cmd, "rewrite") == 0) + return OPT_BUCKET_REWRITE; + if (strcmp(cmd, "reshard") == 0) + return OPT_BUCKET_RESHARD; + if (strcmp(cmd, "check") == 0) + return OPT_BUCKET_CHECK; + if (strcmp(cmd, "radoslist") == 0) + return OPT_BUCKET_RADOS_LIST; + if (strcmp(cmd, "sync") == 0) { + *need_more = true; + return 0; + } + if (strcmp(cmd, "limit") == 0) { + *need_more = true; + return 0; + } + } else if (prev_prev_cmd && strcmp(prev_prev_cmd, "bucket") == 0) { + if (strcmp(prev_cmd, "sync") == 0) { + if (strcmp(cmd, "status") == 0) + return OPT_BUCKET_SYNC_STATUS; + if (strcmp(cmd, "markers") == 0) + return OPT_BUCKET_SYNC_MARKERS; + if (strcmp(cmd, "init") == 0) + return OPT_BUCKET_SYNC_INIT; + if (strcmp(cmd, "run") == 0) + return OPT_BUCKET_SYNC_RUN; + if (strcmp(cmd, 
"disable") == 0) + return OPT_BUCKET_SYNC_DISABLE; + if (strcmp(cmd, "enable") == 0) + return OPT_BUCKET_SYNC_ENABLE; + } else if ((strcmp(prev_cmd, "limit") == 0) && + (strcmp(cmd, "check") == 0)) { + return OPT_BUCKET_LIMIT_CHECK; + } + } else if (strcmp(prev_cmd, "log") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_LOG_LIST; + if (strcmp(cmd, "show") == 0) + return OPT_LOG_SHOW; + if (strcmp(cmd, "rm") == 0) + return OPT_LOG_RM; + } else if (strcmp(prev_cmd, "usage") == 0) { + if (strcmp(cmd, "show") == 0) + return OPT_USAGE_SHOW; + if (strcmp(cmd, "trim") == 0) + return OPT_USAGE_TRIM; + if (strcmp(cmd, "clear") == 0) + return OPT_USAGE_CLEAR; + } else if (strcmp(prev_cmd, "caps") == 0) { + if (strcmp(cmd, "add") == 0) + return OPT_CAPS_ADD; + if (strcmp(cmd, "rm") == 0) + return OPT_CAPS_RM; + } else if (strcmp(prev_cmd, "pool") == 0) { + if (strcmp(cmd, "add") == 0) + return OPT_POOL_ADD; + if (strcmp(cmd, "rm") == 0) + return OPT_POOL_RM; + if (strcmp(cmd, "list") == 0) + return OPT_POOLS_LIST; + } else if (strcmp(prev_cmd, "pools") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_POOLS_LIST; + } else if (strcmp(prev_cmd, "object") == 0) { + if (strcmp(cmd, "put") == 0) + return OPT_OBJECT_PUT; + if (strcmp(cmd, "rm") == 0) + return OPT_OBJECT_RM; + if (strcmp(cmd, "unlink") == 0) + return OPT_OBJECT_UNLINK; + if (strcmp(cmd, "stat") == 0) + return OPT_OBJECT_STAT; + if (strcmp(cmd, "rewrite") == 0) + return OPT_OBJECT_REWRITE; + } else if (strcmp(prev_cmd, "objects") == 0) { + if (strcmp(cmd, "expire") == 0) + return OPT_OBJECTS_EXPIRE; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "objects") == 0) && + (strcmp(prev_cmd, "expire-stale") == 0)) { + if (strcmp(cmd, "list") == 0) + return OPT_OBJECTS_EXPIRE_STALE_LIST; + if (strcmp(cmd, "rm") == 0) + return OPT_OBJECTS_EXPIRE_STALE_RM; + } else if (strcmp(prev_cmd, "olh") == 0) { + if (strcmp(cmd, "get") == 0) + return OPT_OLH_GET; + if (strcmp(cmd, "readlog") == 0) + return OPT_OLH_READLOG; 
+ } else if (strcmp(prev_cmd, "bi") == 0) { + if (strcmp(cmd, "get") == 0) + return OPT_BI_GET; + if (strcmp(cmd, "put") == 0) + return OPT_BI_PUT; + if (strcmp(cmd, "list") == 0) + return OPT_BI_LIST; + if (strcmp(cmd, "purge") == 0) + return OPT_BI_PURGE; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "global") == 0) && + (strcmp(prev_cmd, "quota") == 0)) { + if (strcmp(cmd, "get") == 0) + return OPT_GLOBAL_QUOTA_GET; + if (strcmp(cmd, "set") == 0) + return OPT_GLOBAL_QUOTA_SET; + if (strcmp(cmd, "enable") == 0) + return OPT_GLOBAL_QUOTA_ENABLE; + if (strcmp(cmd, "disable") == 0) + return OPT_GLOBAL_QUOTA_DISABLE; + } else if (strcmp(prev_cmd, "period") == 0) { + if (match_str(cmd, "rm", "delete")) + return OPT_PERIOD_DELETE; + if (strcmp(cmd, "get") == 0) + return OPT_PERIOD_GET; + if (strcmp(cmd, "get-current") == 0) + return OPT_PERIOD_GET_CURRENT; + if (strcmp(cmd, "pull") == 0) + return OPT_PERIOD_PULL; + if (strcmp(cmd, "push") == 0) + return OPT_PERIOD_PUSH; + if (strcmp(cmd, "list") == 0) + return OPT_PERIOD_LIST; + if (strcmp(cmd, "update") == 0) + return OPT_PERIOD_UPDATE; + if (strcmp(cmd, "commit") == 0) + return OPT_PERIOD_COMMIT; + } else if (strcmp(prev_cmd, "realm") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_REALM_CREATE; + if (match_str(cmd, "rm", "delete")) + return OPT_REALM_DELETE; + if (strcmp(cmd, "get") == 0) + return OPT_REALM_GET; + if (strcmp(cmd, "get-default") == 0) + return OPT_REALM_GET_DEFAULT; + if (strcmp(cmd, "list") == 0) + return OPT_REALM_LIST; + if (strcmp(cmd, "list-periods") == 0) + return OPT_REALM_LIST_PERIODS; + if (strcmp(cmd, "rename") == 0) + return OPT_REALM_RENAME; + if (strcmp(cmd, "set") == 0) + return OPT_REALM_SET; + if (strcmp(cmd, "default") == 0) + return OPT_REALM_DEFAULT; + if (strcmp(cmd, "pull") == 0) + return OPT_REALM_PULL; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "zonegroup") == 0) && + (strcmp(prev_cmd, "placement") == 0)) { + if (strcmp(cmd, "add") == 0) + return 
OPT_ZONEGROUP_PLACEMENT_ADD; + if (strcmp(cmd, "modify") == 0) + return OPT_ZONEGROUP_PLACEMENT_MODIFY; + if (strcmp(cmd, "rm") == 0) + return OPT_ZONEGROUP_PLACEMENT_RM; + if (strcmp(cmd, "list") == 0) + return OPT_ZONEGROUP_PLACEMENT_LIST; + if (strcmp(cmd, "get") == 0) + return OPT_ZONEGROUP_PLACEMENT_GET; + if (strcmp(cmd, "default") == 0) + return OPT_ZONEGROUP_PLACEMENT_DEFAULT; + } else if (strcmp(prev_cmd, "zonegroup") == 0) { + if (strcmp(cmd, "add") == 0) + return OPT_ZONEGROUP_ADD; + if (strcmp(cmd, "create")== 0) + return OPT_ZONEGROUP_CREATE; + if (strcmp(cmd, "default") == 0) + return OPT_ZONEGROUP_DEFAULT; + if (strcmp(cmd, "delete") == 0) + return OPT_ZONEGROUP_DELETE; + if (strcmp(cmd, "get") == 0) + return OPT_ZONEGROUP_GET; + if (strcmp(cmd, "modify") == 0) + return OPT_ZONEGROUP_MODIFY; + if (strcmp(cmd, "list") == 0) + return OPT_ZONEGROUP_LIST; + if (strcmp(cmd, "set") == 0) + return OPT_ZONEGROUP_SET; + if (match_str(cmd, "rm", "remove")) + return OPT_ZONEGROUP_REMOVE; + if (strcmp(cmd, "rename") == 0) + return OPT_ZONEGROUP_RENAME; + } else if (strcmp(prev_cmd, "quota") == 0) { + if (strcmp(cmd, "set") == 0) + return OPT_QUOTA_SET; + if (strcmp(cmd, "enable") == 0) + return OPT_QUOTA_ENABLE; + if (strcmp(cmd, "disable") == 0) + return OPT_QUOTA_DISABLE; + } else if (strcmp(prev_cmd, "zonegroups") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_ZONEGROUP_LIST; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "zone") == 0) && + (strcmp(prev_cmd, "placement") == 0)) { + if (strcmp(cmd, "add") == 0) + return OPT_ZONE_PLACEMENT_ADD; + if (strcmp(cmd, "modify") == 0) + return OPT_ZONE_PLACEMENT_MODIFY; + if (strcmp(cmd, "rm") == 0) + return OPT_ZONE_PLACEMENT_RM; + if (strcmp(cmd, "list") == 0) + return OPT_ZONE_PLACEMENT_LIST; + if (strcmp(cmd, "get") == 0) + return OPT_ZONE_PLACEMENT_GET; + } else if (strcmp(prev_cmd, "zone") == 0) { + if (match_str(cmd, "rm", "delete")) + return OPT_ZONE_DELETE; + if (strcmp(cmd, "create") == 0) + 
return OPT_ZONE_CREATE; + if (strcmp(cmd, "get") == 0) + return OPT_ZONE_GET; + if (strcmp(cmd, "set") == 0) + return OPT_ZONE_SET; + if (strcmp(cmd, "list") == 0) + return OPT_ZONE_LIST; + if (strcmp(cmd, "modify") == 0) + return OPT_ZONE_MODIFY; + if (strcmp(cmd, "rename") == 0) + return OPT_ZONE_RENAME; + if (strcmp(cmd, "default") == 0) + return OPT_ZONE_DEFAULT; + } else if (strcmp(prev_cmd, "zones") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_ZONE_LIST; + } else if (strcmp(prev_cmd, "gc") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_GC_LIST; + if (strcmp(cmd, "process") == 0) + return OPT_GC_PROCESS; + } else if (strcmp(prev_cmd, "lc") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_LC_LIST; + if (strcmp(cmd, "get") == 0) + return OPT_LC_GET; + if (strcmp(cmd, "process") == 0) + return OPT_LC_PROCESS; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "lc") == 0) && + strcmp(prev_cmd, "reshard") == 0) { + if (strcmp(cmd, "fix") == 0) + return OPT_LC_RESHARD_FIX; + } else if (strcmp(prev_cmd, "orphans") == 0) { + if (strcmp(cmd, "find") == 0) + return OPT_ORPHANS_FIND; + if (strcmp(cmd, "finish") == 0) + return OPT_ORPHANS_FINISH; + if (strcmp(cmd, "list-jobs") == 0) + return OPT_ORPHANS_LIST_JOBS; + } else if (strcmp(prev_cmd, "metadata") == 0) { + if (strcmp(cmd, "get") == 0) + return OPT_METADATA_GET; + if (strcmp(cmd, "put") == 0) + return OPT_METADATA_PUT; + if (strcmp(cmd, "rm") == 0) + return OPT_METADATA_RM; + if (strcmp(cmd, "list") == 0) + return OPT_METADATA_LIST; + if (strcmp(cmd, "sync") == 0) { + *need_more = true; + return 0; + } + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "metadata") == 0) && + (strcmp(prev_cmd, "sync") == 0)) { + if (strcmp(cmd, "status") == 0) + return OPT_METADATA_SYNC_STATUS; + if (strcmp(cmd, "init") == 0) + return OPT_METADATA_SYNC_INIT; + if (strcmp(cmd, "run") == 0) + return OPT_METADATA_SYNC_RUN; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "sync") == 0) && + (strcmp(prev_cmd, 
"error") == 0)) { + if (strcmp(cmd, "list") == 0) + return OPT_SYNC_ERROR_LIST; + if (strcmp(cmd, "trim") == 0) + return OPT_SYNC_ERROR_TRIM; + } else if (strcmp(prev_cmd, "mdlog") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_MDLOG_LIST; + if (strcmp(cmd, "autotrim") == 0) + return OPT_MDLOG_AUTOTRIM; + if (strcmp(cmd, "trim") == 0) + return OPT_MDLOG_TRIM; + if (strcmp(cmd, "fetch") == 0) + return OPT_MDLOG_FETCH; + if (strcmp(cmd, "status") == 0) + return OPT_MDLOG_STATUS; + } else if (strcmp(prev_cmd, "bilog") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_BILOG_LIST; + if (strcmp(cmd, "trim") == 0) + return OPT_BILOG_TRIM; + if (strcmp(cmd, "status") == 0) + return OPT_BILOG_STATUS; + if (strcmp(cmd, "autotrim") == 0) + return OPT_BILOG_AUTOTRIM; + } else if (strcmp(prev_cmd, "data") == 0) { + if (strcmp(cmd, "sync") == 0) { + *need_more = true; + return 0; + } + } else if (strcmp(prev_cmd, "datalog") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_DATALOG_LIST; + if (strcmp(cmd, "autotrim") == 0) + return OPT_DATALOG_AUTOTRIM; + if (strcmp(cmd, "trim") == 0) + return OPT_DATALOG_TRIM; + if (strcmp(cmd, "status") == 0) + return OPT_DATALOG_STATUS; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "data") == 0) && + (strcmp(prev_cmd, "sync") == 0)) { + if (strcmp(cmd, "status") == 0) + return OPT_DATA_SYNC_STATUS; + if (strcmp(cmd, "init") == 0) + return OPT_DATA_SYNC_INIT; + if (strcmp(cmd, "run") == 0) + return OPT_DATA_SYNC_RUN; + } else if (strcmp(prev_cmd, "sync") == 0) { + if (strcmp(cmd, "status") == 0) + return OPT_SYNC_STATUS; + } else if (strcmp(prev_cmd, "role") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_ROLE_CREATE; + if (strcmp(cmd, "rm") == 0) + return OPT_ROLE_DELETE; + if (strcmp(cmd, "get") == 0) + return OPT_ROLE_GET; + if (strcmp(cmd, "modify") == 0) + return OPT_ROLE_MODIFY; + if (strcmp(cmd, "list") == 0) + return OPT_ROLE_LIST; + } else if (strcmp(prev_cmd, "role-policy") == 0) { + if (strcmp(cmd, "put") == 
0) + return OPT_ROLE_POLICY_PUT; + if (strcmp(cmd, "list") == 0) + return OPT_ROLE_POLICY_LIST; + if (strcmp(cmd, "get") == 0) + return OPT_ROLE_POLICY_GET; + if (match_str(cmd, "rm", "delete")) + return OPT_ROLE_POLICY_DELETE; + } else if (strcmp(prev_cmd, "reshard") == 0) { + if (strcmp(cmd, "bucket") == 0) + return OPT_BUCKET_RESHARD; + if (strcmp(cmd, "add") == 0) + return OPT_RESHARD_ADD; + if (strcmp(cmd, "list") == 0) + return OPT_RESHARD_LIST; + if (strcmp(cmd, "status") == 0) + return OPT_RESHARD_STATUS; + if (strcmp(cmd, "process") == 0) + return OPT_RESHARD_PROCESS; + if (strcmp(cmd, "cancel") == 0) + return OPT_RESHARD_CANCEL; + } else if (strcmp(prev_cmd, "mfa") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_MFA_CREATE; + if (strcmp(cmd, "remove") == 0) + return OPT_MFA_REMOVE; + if (strcmp(cmd, "get") == 0) + return OPT_MFA_GET; + if (strcmp(cmd, "list") == 0) + return OPT_MFA_LIST; + if (strcmp(cmd, "check") == 0) + return OPT_MFA_CHECK; + if (strcmp(cmd, "resync") == 0) + return OPT_MFA_RESYNC; + } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "reshard") == 0) && + (strcmp(prev_cmd, "stale-instances") == 0)) { + if (strcmp(cmd, "list") == 0) + return OPT_RESHARD_STALE_INSTANCES_LIST; + if (match_str(cmd, "rm", "delete")) + return OPT_RESHARD_STALE_INSTANCES_DELETE; + } else if (prev_prev_cmd && strcmp(prev_prev_cmd, "pubsub") == 0) { + if (strcmp(prev_cmd, "topics") == 0) { + if (strcmp(cmd, "list") == 0) + return OPT_PUBSUB_TOPICS_LIST; + } else if (strcmp(prev_cmd, "topic") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_PUBSUB_TOPIC_CREATE; + if (strcmp(cmd, "get") == 0) + return OPT_PUBSUB_TOPIC_GET; + if (strcmp(cmd, "rm") == 0) + return OPT_PUBSUB_TOPIC_RM; + } else if (strcmp(prev_cmd, "notification") == 0) { + if (strcmp(cmd, "create") == 0) + return OPT_PUBSUB_NOTIFICATION_CREATE; + if (strcmp(cmd, "rm") == 0) + return OPT_PUBSUB_NOTIFICATION_RM; + } else if (strcmp(prev_cmd, "sub") == 0) { + if (strcmp(cmd, "get") == 0) + 
return OPT_PUBSUB_SUB_GET; + if (strcmp(cmd, "create") == 0) + return OPT_PUBSUB_SUB_CREATE; + if (strcmp(cmd, "rm") == 0) + return OPT_PUBSUB_SUB_RM; + if (strcmp(cmd, "pull") == 0) + return OPT_PUBSUB_SUB_PULL; + } else if (strcmp(prev_cmd, "event") == 0) { + if (strcmp(cmd, "rm") == 0) + return OPT_PUBSUB_EVENT_RM; + } + } + return -EINVAL; +} + +BIIndexType get_bi_index_type(const string& type_str) { + if (type_str == "plain") + return BIIndexType::Plain; + if (type_str == "instance") + return BIIndexType::Instance; + if (type_str == "olh") + return BIIndexType::OLH; + + return BIIndexType::Invalid; +} + +void dump_bi_entry(bufferlist& bl, BIIndexType index_type, Formatter *formatter) +{ + auto iter = bl.cbegin(); + switch (index_type) { + case BIIndexType::Plain: + case BIIndexType::Instance: + { + rgw_bucket_dir_entry entry; + decode(entry, iter); + encode_json("entry", entry, formatter); + } + break; + case BIIndexType::OLH: + { + rgw_bucket_olh_entry entry; + decode(entry, iter); + encode_json("entry", entry, formatter); + } + break; + default: + ceph_abort(); + break; + } +} + +static void show_user_info(RGWUserInfo& info, Formatter *formatter) +{ + encode_json("user_info", info, formatter); + formatter->flush(cout); + cout << std::endl; +} + +static void show_perm_policy(string perm_policy, Formatter* formatter) +{ + formatter->open_object_section("role"); + formatter->dump_string("Permission policy", perm_policy); + formatter->close_section(); + formatter->flush(cout); +} + +static void show_policy_names(std::vector policy_names, Formatter* formatter) +{ + formatter->open_array_section("PolicyNames"); + for (const auto& it : policy_names) { + formatter->dump_string("policyname", it); + } + formatter->close_section(); + formatter->flush(cout); +} + +static void show_role_info(RGWRole& role, Formatter* formatter) +{ + formatter->open_object_section("role"); + role.dump(formatter); + formatter->close_section(); + formatter->flush(cout); +} + +static void 
show_roles_info(vector& roles, Formatter* formatter) +{ + formatter->open_array_section("Roles"); + for (const auto& it : roles) { + formatter->open_object_section("role"); + it.dump(formatter); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); +} + +static void show_reshard_status( + const list& status, Formatter *formatter) +{ + formatter->open_array_section("status"); + for (const auto& entry : status) { + formatter->open_object_section("entry"); + formatter->dump_string("reshard_status", to_string(entry.reshard_status)); + formatter->dump_string("new_bucket_instance_id", + entry.new_bucket_instance_id); + formatter->dump_int("num_shards", entry.num_shards); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); +} + +class StoreDestructor { + RGWRados *store; +public: + explicit StoreDestructor(RGWRados *_s) : store(_s) {} + ~StoreDestructor() { + RGWStoreManager::close_storage(store); + rgw_http_client_cleanup(); + } +}; + +static int init_bucket(const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + RGWBucketInfo& bucket_info, + rgw_bucket& bucket, + map *pattrs = nullptr) +{ + if (!bucket_name.empty()) { + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r; + if (bucket_id.empty()) { + r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, nullptr, pattrs); + } else { + string bucket_instance_id = bucket_name + ":" + bucket_id; + r = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, bucket_info, NULL, pattrs); + } + if (r < 0) { + cerr << "could not get bucket info for bucket=" << bucket_name << std::endl; + return r; + } + bucket = bucket_info.bucket; + } + return 0; +} + +static int read_input(const string& infile, bufferlist& bl) +{ + int fd = 0; + if (infile.size()) { + fd = open(infile.c_str(), O_RDONLY); + if (fd < 0) { + int err = -errno; + cerr << "error reading input file " << infile << std::endl; + return 
err; + } + } + +#define READ_CHUNK 8196 + int r; + int err; + + do { + char buf[READ_CHUNK]; + + r = safe_read(fd, buf, READ_CHUNK); + if (r < 0) { + err = -errno; + cerr << "error while reading input" << std::endl; + goto out; + } + bl.append(buf, r); + } while (r > 0); + err = 0; + + out: + if (infile.size()) { + close(fd); + } + return err; +} + +template +static int read_decode_json(const string& infile, T& t) +{ + bufferlist bl; + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + return ret; + } + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + cout << "failed to parse JSON" << std::endl; + return -EINVAL; + } + + try { + decode_json_obj(t, &p); + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.message << std::endl; + return -EINVAL; + } + return 0; +} + +template +static int read_decode_json(const string& infile, T& t, K *k) +{ + bufferlist bl; + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + return ret; + } + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + cout << "failed to parse JSON" << std::endl; + return -EINVAL; + } + + try { + t.decode_json(&p, k); + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.message << std::endl; + return -EINVAL; + } + return 0; +} + +static int parse_date_str(const string& date_str, utime_t& ut) +{ + uint64_t epoch = 0; + uint64_t nsec = 0; + + if (!date_str.empty()) { + int ret = utime_t::parse_date(date_str, &epoch, &nsec); + if (ret < 0) { + cerr << "ERROR: failed to parse date: " << date_str << std::endl; + return -EINVAL; + } + } + + ut = utime_t(epoch, nsec); + + return 0; +} + +template +static bool decode_dump(const char *field_name, bufferlist& bl, Formatter *f) +{ + T t; + + auto iter = bl.cbegin(); + + try { + decode(t, iter); + } catch (buffer::error& err) { + return 
false; + } + + encode_json(field_name, t, f); + + return true; +} + +static bool dump_string(const char *field_name, bufferlist& bl, Formatter *f) +{ + string val = bl.to_str(); + f->dump_string(field_name, val.c_str() /* hide encoded null termination chars */); + + return true; +} + +void set_quota_info(RGWQuotaInfo& quota, int opt_cmd, int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + switch (opt_cmd) { + case OPT_QUOTA_ENABLE: + case OPT_GLOBAL_QUOTA_ENABLE: + quota.enabled = true; + + // falling through on purpose + + case OPT_QUOTA_SET: + case OPT_GLOBAL_QUOTA_SET: + if (have_max_objects) { + if (max_objects < 0) { + quota.max_objects = -1; + } else { + quota.max_objects = max_objects; + } + } + if (have_max_size) { + if (max_size < 0) { + quota.max_size = -1; + } else { + quota.max_size = rgw_rounded_kb(max_size) * 1024; + } + } + break; + case OPT_QUOTA_DISABLE: + case OPT_GLOBAL_QUOTA_DISABLE: + quota.enabled = false; + break; + } +} + +int set_bucket_quota(RGWRados *store, int opt_cmd, + const string& tenant_name, const string& bucket_name, + int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + RGWBucketInfo bucket_info; + map attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, &attrs); + if (r < 0) { + cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + + set_quota_info(bucket_info.quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects); + + r = store->put_bucket_instance_info(bucket_info, false, real_time(), &attrs); + if (r < 0) { + cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl; + return -r; + } + return 0; +} + +int set_user_bucket_quota(int opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects, + bool have_max_size, 
bool have_max_objects) +{ + RGWUserInfo& user_info = op_state.get_user_info(); + + set_quota_info(user_info.bucket_quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects); + + op_state.set_bucket_quota(user_info.bucket_quota); + + string err; + int r = user.modify(op_state, &err); + if (r < 0) { + cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + return 0; +} + +int set_user_quota(int opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects, + bool have_max_size, bool have_max_objects) +{ + RGWUserInfo& user_info = op_state.get_user_info(); + + set_quota_info(user_info.user_quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects); + + op_state.set_user_quota(user_info.user_quota); + + string err; + int r = user.modify(op_state, &err); + if (r < 0) { + cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + return 0; +} + +static bool bucket_object_check_filter(const string& name) +{ + rgw_obj_key k; + string ns; /* empty namespace */ + return rgw_obj_key::oid_to_key_in_ns(name, &k, ns); +} + +int check_min_obj_stripe_size(RGWRados *store, RGWBucketInfo& bucket_info, rgw_obj& obj, uint64_t min_stripe_size, bool *need_rewrite) +{ + map attrs; + uint64_t obj_size; + + RGWObjectCtx obj_ctx(store); + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + read_op.params.obj_size = &obj_size; + + int ret = read_op.prepare(); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: failed to stat object, returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + + map::iterator iter; + iter = attrs.find(RGW_ATTR_MANIFEST); + if (iter == attrs.end()) { + *need_rewrite = (obj_size >= min_stripe_size); + return 0; + } + + RGWObjManifest manifest; + + try { + bufferlist& bl = iter->second; + auto 
biter = bl.cbegin(); + decode(manifest, biter); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: failed to decode manifest" << dendl; + return -EIO; + } + + map& objs = manifest.get_explicit_objs(); + map::iterator oiter; + for (oiter = objs.begin(); oiter != objs.end(); ++oiter) { + RGWObjManifestPart& part = oiter->second; + + if (part.size >= min_stripe_size) { + *need_rewrite = true; + return 0; + } + } + *need_rewrite = false; + + return 0; +} + + +int check_obj_locator_underscore(RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_obj_key& key, bool fix, bool remove_bad, Formatter *f) { + f->open_object_section("object"); + f->open_object_section("key"); + f->dump_string("type", "head"); + f->dump_string("name", key.name); + f->dump_string("instance", key.instance); + f->close_section(); + + string oid; + string locator; + + get_obj_bucket_and_oid_loc(obj, oid, locator); + + f->dump_string("oid", oid); + f->dump_string("locator", locator); + + + RGWObjectCtx obj_ctx(store); + + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Read read_op(&op_target); + + int ret = read_op.prepare(); + bool needs_fixing = (ret == -ENOENT); + + f->dump_bool("needs_fixing", needs_fixing); + + string status = (needs_fixing ? 
"needs_fixing" : "ok"); + + if ((needs_fixing || remove_bad) && fix) { + ret = store->fix_head_obj_locator(bucket_info, needs_fixing, remove_bad, key); + if (ret < 0) { + cerr << "ERROR: fix_head_object_locator() returned ret=" << ret << std::endl; + goto done; + } + status = "fixed"; + } + +done: + f->dump_string("status", status); + + f->close_section(); + + return 0; +} + +int check_obj_tail_locator_underscore(RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_obj_key& key, bool fix, Formatter *f) { + f->open_object_section("object"); + f->open_object_section("key"); + f->dump_string("type", "tail"); + f->dump_string("name", key.name); + f->dump_string("instance", key.instance); + f->close_section(); + + bool needs_fixing; + string status; + + int ret = store->fix_tail_obj_locator(bucket_info, key, fix, &needs_fixing); + if (ret < 0) { + cerr << "ERROR: fix_tail_object_locator_underscore() returned ret=" << ret << std::endl; + status = "failed"; + } else { + status = (needs_fixing && !fix ? 
"needs_fixing" : "ok"); + } + + f->dump_bool("needs_fixing", needs_fixing); + f->dump_string("status", status); + + f->close_section(); + + return 0; +} + +int do_check_object_locator(const string& tenant_name, const string& bucket_name, + bool fix, bool remove_bad, Formatter *f) +{ + if (remove_bad && !fix) { + cerr << "ERROR: can't have remove_bad specified without fix" << std::endl; + return -EINVAL; + } + + RGWBucketInfo bucket_info; + rgw_bucket bucket; + string bucket_id; + + f->open_object_section("bucket"); + f->dump_string("bucket", bucket_name); + int ret = init_bucket(tenant_name, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return ret; + } + bool truncated; + int count = 0; + + int max_entries = 1000; + + string prefix; + string delim; + vector result; + map common_prefixes; + string ns; + + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + + string marker; + + list_op.params.prefix = prefix; + list_op.params.delim = delim; + list_op.params.marker = rgw_obj_key(marker); + list_op.params.ns = ns; + list_op.params.enforce_ns = true; + list_op.params.list_versions = true; + + f->open_array_section("check_objects"); + do { + ret = list_op.list_objects(max_entries - count, &result, &common_prefixes, &truncated); + if (ret < 0) { + cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += result.size(); + + for (vector::iterator iter = result.begin(); iter != result.end(); ++iter) { + rgw_obj_key key = iter->key; + rgw_obj obj(bucket, key); + + if (key.name[0] == '_') { + ret = check_obj_locator_underscore(bucket_info, obj, key, fix, remove_bad, f); + + if (ret >= 0) { + ret = check_obj_tail_locator_underscore(bucket_info, obj, key, fix, f); + if (ret < 0) { + cerr << "ERROR: check_obj_tail_locator_underscore(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } 
+      }
+    }
+    f->flush(cout);
+  } while (truncated && count < max_entries);
+  f->close_section();
+  f->close_section();
+
+  f->flush(cout);
+
+  return 0;
+}
+
+/// Enable or disable data sync for a single bucket, selected by opt_cmd
+/// (OPT_BUCKET_SYNC_ENABLE / OPT_BUCKET_SYNC_DISABLE).
+///
+/// Toggles BUCKET_DATASYNC_DISABLED in the bucket instance info and writes
+/// the instance info back.
+///
+/// @return 0 on success, a negative errno on failure. Every error path
+///         returns the negative value so that callers testing `ret < 0`
+///         see the failure.
+int set_bucket_sync_enabled(RGWRados *store, int opt_cmd, const string& tenant_name, const string& bucket_name)
+{
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> attrs;
+  auto obj_ctx = store->svc.sysobj->init_obj_ctx();
+
+  int r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, &attrs);
+  if (r < 0) {
+    cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl;
+    // r is already negative; negating it would make the failure look like success
+    return r;
+  }
+
+  if (opt_cmd == OPT_BUCKET_SYNC_ENABLE) {
+    bucket_info.flags &= ~BUCKET_DATASYNC_DISABLED;
+  } else if (opt_cmd == OPT_BUCKET_SYNC_DISABLE) {
+    bucket_info.flags |= BUCKET_DATASYNC_DISABLED;
+  }
+
+  r = store->put_bucket_instance_info(bucket_info, false, real_time(), &attrs);
+  if (r < 0) {
+    cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl;
+    return r;
+  }
+
+  // an unsharded bucket (num_shards == 0) is handled as one shard with id -1
+  int shards_num = bucket_info.num_shards? bucket_info.num_shards : 1;
+  int shard_id = bucket_info.num_shards?
0 : -1;
+
+  if (opt_cmd == OPT_BUCKET_SYNC_DISABLE) {
+    r = store->stop_bi_log_entries(bucket_info, -1);
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed writing stop bilog" << dendl;
+      return r;
+    }
+  } else {
+    r = store->resync_bi_log_entries(bucket_info, -1);
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed writing resync bilog" << dendl;
+      return r;
+    }
+  }
+
+  for (int i = 0; i < shards_num; ++i, ++shard_id) {
+    r = store->data_log->add_entry(bucket_info.bucket, shard_id);
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+
+/// Search one zonegroup for a zonegroup or zone whose id equals `remote`.
+/// @return a connection built from the matching endpoints, or an empty
+///         optional when nothing in this zonegroup matches.
+static boost::optional<RGWRESTConn> get_remote_conn(RGWRados *store,
+                                                    const RGWZoneGroup& zonegroup,
+                                                    const std::string& remote)
+{
+  boost::optional<RGWRESTConn> conn;
+  if (remote == zonegroup.get_id()) {
+    conn.emplace(store->ctx(), store->svc.zone, remote, zonegroup.endpoints);
+  } else {
+    for (const auto& z : zonegroup.zones) {
+      const auto& zone = z.second;
+      if (remote == zone.id) {
+        conn.emplace(store->ctx(), store->svc.zone, remote, zone.endpoints);
+        break;
+      }
+    }
+  }
+  return conn;
+}
+
+/// Search every zonegroup of the period map; the first match wins.
+static boost::optional<RGWRESTConn> get_remote_conn(RGWRados *store,
+                                                    const RGWPeriodMap& period_map,
+                                                    const std::string& remote)
+{
+  boost::optional<RGWRESTConn> conn;
+  for (const auto& zg : period_map.zonegroups) {
+    conn = get_remote_conn(store, zg.second, remote);
+    if (conn) {
+      break;
+    }
+  }
+  return conn;
+}
+
+// we expect a very small response
+static constexpr size_t MAX_REST_RESPONSE = 128 * 1024;
+
+/// Forward `info`/`in_data` over an existing connection and parse the reply.
+/// The reply is parsed even when forward() fails, so the caller can look for
+/// an error message in `parser`.
+/// @return -EINVAL when conn is null, the parse error when the reply is not
+///         valid JSON, otherwise the result of forward().
+static int send_to_remote_gateway(RGWRESTConn* conn, req_info& info,
+                                  bufferlist& in_data, JSONParser& parser)
+{
+  if (!conn) {
+    return -EINVAL;
+  }
+
+  ceph::bufferlist response;
+  rgw_user user;
+  int ret = conn->forward(user, info, nullptr, MAX_REST_RESPONSE, &in_data, &response);
+
+  int parse_ret = parser.parse(response.c_str(), response.length());
+  if (parse_ret < 0) {
+    cerr << "failed to parse response" << std::endl;
+    return parse_ret;
+  }
+  return ret;
+}
+
+/// Forward `info`/`in_data` to an explicit URL, signing with the given
+/// access/secret pair, and parse the reply into `parser`.
+/// @return -EINVAL when either credential is empty, the parse error when the
+///         reply is not valid JSON, otherwise the result of forward_request().
+static int send_to_url(const string& url, const string& access,
+                       const string& secret, req_info& info,
+                       bufferlist& in_data, JSONParser& parser)
+{
+  if (access.empty() || secret.empty()) {
+    cerr << "An --access-key and --secret must be provided with --url." << std::endl;
+    return -EINVAL;
+  }
+  RGWAccessKey key;
+  key.id = access;
+  key.key = secret;
+
+  param_vec_t params;
+  RGWRESTSimpleRequest req(g_ceph_context, info.method, url, NULL, &params);
+
+  bufferlist response;
+  int ret = req.forward_request(key, info, MAX_REST_RESPONSE, &in_data, &response);
+
+  int parse_ret = parser.parse(response.c_str(), response.length());
+  if (parse_ret < 0) {
+    // diagnostics go to cerr, same as send_to_remote_gateway(); cout carries
+    // the formatter output
+    cerr << "failed to parse response" << std::endl;
+    return parse_ret;
+  }
+  return ret;
+}
+
+/// Dispatch: an empty url means "use the connection", otherwise use the url
+/// together with the access/secret credentials.
+static int send_to_remote_or_url(RGWRESTConn *conn, const string& url,
+                                 const string& access, const string& secret,
+                                 req_info& info, bufferlist& in_data,
+                                 JSONParser& parser)
+{
+  if (url.empty()) {
+    return send_to_remote_gateway(conn, info, in_data, parser);
+  }
+  return send_to_url(url, access, secret, info, in_data, parser);
+}
+
+static int commit_period(RGWRealm& realm, RGWPeriod& period,
+                         string remote, const string& url,
+                         const string& access, const string& secret,
+                         bool force)
+{
+  const string& master_zone = period.get_master_zone();
+  if (master_zone.empty()) {
+    cerr << "cannot commit period: period does not have a master zone of a master zonegroup" << std::endl;
+    return -EINVAL;
+  }
+  // are we the period's master zone?
+ if (store->svc.zone->get_zone_params().get_id() == master_zone) { + // read the current period + RGWPeriod current_period; + int ret = current_period.init(g_ceph_context, store->svc.sysobj, realm.get_id()); + if (ret < 0) { + cerr << "Error initializing current period: " + << cpp_strerror(-ret) << std::endl; + return ret; + } + // the master zone can commit locally + ret = period.commit(store, realm, current_period, cerr, force); + if (ret < 0) { + cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl; + } + return ret; + } + + if (remote.empty() && url.empty()) { + // use the new master zone's connection + remote = master_zone; + cout << "Sending period to new master zone " << remote << std::endl; + } + boost::optional conn; + RGWRESTConn *remote_conn = nullptr; + if (!remote.empty()) { + conn = get_remote_conn(store, period.get_map(), remote); + if (!conn) { + cerr << "failed to find a zone or zonegroup for remote " + << remote << std::endl; + return -ENOENT; + } + remote_conn = &*conn; + } + + // push period to the master with an empty period id + period.set_id(""); + + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "POST"; + info.request_uri = "/admin/realm/period"; + + // json format into a bufferlist + JSONFormatter jf(false); + encode_json("period", period, &jf); + bufferlist bl; + jf.flush(bl); + + JSONParser p; + int ret = send_to_remote_or_url(remote_conn, url, access, secret, info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + + // did we parse an error message? 
+ auto message = p.find_obj("Message"); + if (message) { + cerr << "Reason: " << message->get_data() << std::endl; + } + return ret; + } + + // decode the response and store it back + try { + decode_json_obj(period, &p); + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.message << std::endl; + return -EINVAL; + } + if (period.get_id().empty()) { + cerr << "Period commit got back an empty period id" << std::endl; + return -EINVAL; + } + // the master zone gave us back the period that it committed, so it's + // safe to save it as our latest epoch + ret = period.store_info(false); + if (ret < 0) { + cerr << "Error storing committed period " << period.get_id() << ": " + << cpp_strerror(ret) << std::endl; + return ret; + } + ret = period.set_latest_epoch(period.get_epoch()); + if (ret < 0) { + cerr << "Error updating period epoch: " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = period.reflect(); + if (ret < 0) { + cerr << "Error updating local objects: " << cpp_strerror(ret) << std::endl; + return ret; + } + realm.notify_new_period(period); + return ret; +} + +static int update_period(const string& realm_id, const string& realm_name, + const string& period_id, const string& period_epoch, + bool commit, const string& remote, const string& url, + const string& access, const string& secret, + Formatter *formatter, bool force) +{ + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0 ) { + cerr << "Error initializing realm " << cpp_strerror(-ret) << std::endl; + return ret; + } + epoch_t epoch = 0; + if (!period_epoch.empty()) { + epoch = atoi(period_epoch.c_str()); + } + RGWPeriod period(period_id, epoch); + ret = period.init(g_ceph_context, store->svc.sysobj, realm.get_id()); + if (ret < 0) { + cerr << "period init failed: " << cpp_strerror(-ret) << std::endl; + return ret; + } + period.fork(); + ret = period.update(); + if(ret < 0) { + // Dropping the error message 
here, as both the ret codes were handled in
+    // period.update()
+    return ret;
+  }
+  ret = period.store_info(false);
+  if (ret < 0) {
+    cerr << "failed to store period: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+  if (commit) {
+    ret = commit_period(realm, period, remote, url, access, secret, force);
+    if (ret < 0) {
+      cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+  }
+  encode_json("period", period, formatter);
+  formatter->flush(cout);
+  return 0;
+}
+
+/// Resolve tenant/bucket_name/bucket_id into `bucket` via init_bucket().
+/// The RGWBucketInfo that init_bucket() fills in is discarded.
+/// @return 0 on success, the negative errno from init_bucket() otherwise.
+static int init_bucket_for_sync(const string& tenant, const string& bucket_name,
+                                const string& bucket_id, rgw_bucket& bucket)
+{
+  RGWBucketInfo bucket_info;
+
+  int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket);
+  if (ret < 0) {
+    cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+
+  return 0;
+}
+
+/// Fetch a period with GET /admin/realm/period, either over `remote_conn`
+/// or from `url` with the given keys, and decode the reply into `*period`.
+/// Only the non-empty realm/period selectors are sent as query parameters.
+static int do_period_pull(RGWRESTConn *remote_conn, const string& url,
+                          const string& access_key, const string& secret_key,
+                          const string& realm_id, const string& realm_name,
+                          const string& period_id, const string& period_epoch,
+                          RGWPeriod *period)
+{
+  RGWEnv env;
+  req_info info(g_ceph_context, &env);
+  info.method = "GET";
+  info.request_uri = "/admin/realm/period";
+
+  // bind by reference so the entries land in the request's own argument map
+  auto& params = info.args.get_params();
+  if (!realm_id.empty())
+    params["realm_id"] = realm_id;
+  if (!realm_name.empty())
+    params["realm_name"] = realm_name;
+  if (!period_id.empty())
+    params["period_id"] = period_id;
+  if (!period_epoch.empty())
+    params["epoch"] = period_epoch;
+
+  bufferlist bl;
+  JSONParser p;
+  int ret = send_to_remote_or_url(remote_conn, url, access_key, secret_key,
+                                  info, bl, p);
+  if (ret < 0) {
+    cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+  ret = period->init(g_ceph_context, store->svc.sysobj, false);
+  if (ret < 0) {
+    cerr << "failed to init period " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+  try {
+    decode_json_obj(*period, &p);
+  } catch
(JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.message << std::endl; + return -EINVAL; + } + ret = period->store_info(false); + if (ret < 0) { + cerr << "Error storing period " << period->get_id() << ": " << cpp_strerror(ret) << std::endl; + } + // store latest epoch (ignore errors) + period->update_latest_epoch(period->get_epoch()); + return 0; +} + +static int read_current_period_id(RGWRados* store, const std::string& realm_id, + const std::string& realm_name, + std::string* period_id) +{ + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + std::cerr << "failed to read realm: " << cpp_strerror(-ret) << std::endl; + return ret; + } + *period_id = realm.get_current_period(); + return 0; +} + +void flush_ss(stringstream& ss, list& l) +{ + if (!ss.str().empty()) { + l.push_back(ss.str()); + } + ss.str(""); +} + +stringstream& push_ss(stringstream& ss, list& l, int tab = 0) +{ + flush_ss(ss, l); + if (tab > 0) { + ss << setw(tab) << "" << setw(1); + } + return ss; +} + +static void get_md_sync_status(list& status) +{ + RGWMetaSyncStatusManager sync(store, store->get_async_rados()); + + int ret = sync.init(); + if (ret < 0) { + status.push_back(string("failed to retrieve sync info: sync.init() failed: ") + cpp_strerror(-ret)); + return; + } + + rgw_meta_sync_status sync_status; + ret = sync.read_sync_status(&sync_status); + if (ret < 0) { + status.push_back(string("failed to read sync status: ") + cpp_strerror(-ret)); + return; + } + + string status_str; + switch (sync_status.sync_info.state) { + case rgw_meta_sync_info::StateInit: + status_str = "init"; + break; + case rgw_meta_sync_info::StateBuildingFullSyncMaps: + status_str = "preparing for full sync"; + break; + case rgw_meta_sync_info::StateSync: + status_str = "syncing"; + break; + default: + status_str = "unknown"; + } + + status.push_back(status_str); + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + int 
num_full = 0; + int num_inc = 0; + int total_shards = 0; + set shards_behind_set; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + total_shards++; + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) { + num_full++; + full_complete += marker_iter.second.pos; + int shard_id = marker_iter.first; + shards_behind_set.insert(shard_id); + } else { + full_complete += marker_iter.second.total_entries; + } + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync) { + num_inc++; + } + } + + stringstream ss; + push_ss(ss, status) << "full sync: " << num_full << "/" << total_shards << " shards"; + + if (num_full > 0) { + push_ss(ss, status) << "full sync: " << full_total - full_complete << " entries to sync"; + } + + push_ss(ss, status) << "incremental sync: " << num_inc << "/" << total_shards << " shards"; + + map master_shards_info; + string master_period = store->svc.zone->get_current_period_id(); + + ret = sync.read_master_log_shards_info(master_period, &master_shards_info); + if (ret < 0) { + status.push_back(string("failed to fetch master sync status: ") + cpp_strerror(-ret)); + return; + } + + map shards_behind; + if (sync_status.sync_info.period != master_period) { + status.push_back(string("master is on a different period: master_period=" + + master_period + " local_period=" + sync_status.sync_info.period)); + } else { + for (auto local_iter : sync_status.sync_markers) { + int shard_id = local_iter.first; + auto iter = master_shards_info.find(shard_id); + + if (iter == master_shards_info.end()) { + /* huh? 
*/ + derr << "ERROR: could not find remote sync shard status for shard_id=" << shard_id << dendl; + continue; + } + auto master_marker = iter->second.marker; + if (local_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync && + master_marker > local_iter.second.marker) { + shards_behind[shard_id] = local_iter.second.marker; + shards_behind_set.insert(shard_id); + } + } + } + + int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc); + if (total_behind == 0) { + push_ss(ss, status) << "metadata is caught up with master"; + } else { + push_ss(ss, status) << "metadata is behind on " << total_behind << " shards"; + + push_ss(ss, status) << "behind shards: " << "[" << shards_behind_set << "]"; + + map master_pos; + ret = sync.read_master_log_shards_next(sync_status.sync_info.period, shards_behind, &master_pos); + if (ret < 0) { + derr << "ERROR: failed to fetch master next positions (" << cpp_strerror(-ret) << ")" << dendl; + } else { + std::optional> oldest; + + for (auto iter : master_pos) { + rgw_mdlog_shard_data& shard_data = iter.second; + + if (!shard_data.entries.empty()) { + rgw_mdlog_entry& entry = shard_data.entries.front(); + if (!oldest) { + oldest.emplace(iter.first, entry.timestamp); + } else if (!ceph::real_clock::is_zero(entry.timestamp) && entry.timestamp < oldest->second) { + oldest.emplace(iter.first, entry.timestamp); + } + } + } + + if (oldest) { + push_ss(ss, status) << "oldest incremental change not applied: " + << oldest->second << " [" << oldest->first << ']'; + } + } + } + + flush_ss(ss, status); +} + +static void get_data_sync_status(const string& source_zone, list& status, int tab) +{ + stringstream ss; + + RGWZone *sz; + + if (!store->svc.zone->find_zone_by_id(source_zone, &sz)) { + push_ss(ss, status, tab) << string("zone not found"); + flush_ss(ss, status); + return; + } + + if (!store->svc.zone->zone_syncs_from(store->svc.zone->get_zone(), *sz)) { + push_ss(ss, status, tab) << string("not 
syncing from zone");
+    flush_ss(ss, status);
+    return;
+  }
+  RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone, nullptr);
+
+  int ret = sync.init();
+  if (ret < 0) {
+    push_ss(ss, status, tab) << string("failed to retrieve sync info: ") + cpp_strerror(-ret);
+    flush_ss(ss, status);
+    return;
+  }
+
+  rgw_data_sync_status sync_status;
+  ret = sync.read_sync_status(&sync_status);
+  if (ret < 0 && ret != -ENOENT) {
+    push_ss(ss, status, tab) << string("failed read sync status: ") + cpp_strerror(-ret);
+    // ss is local: without a flush the message never reaches `status`
+    flush_ss(ss, status);
+    return;
+  }
+
+  set<int> recovering_shards;
+  ret = sync.read_recovering_shards(sync_status.sync_info.num_shards, recovering_shards);
+  // ret is a negative errno, so the "not found" case is -ENOENT (same test
+  // as for read_sync_status() above)
+  if (ret < 0 && ret != -ENOENT) {
+    push_ss(ss, status, tab) << string("failed read recovering shards: ") + cpp_strerror(-ret);
+    flush_ss(ss, status);
+    return;
+  }
+
+  string status_str;
+  switch (sync_status.sync_info.state) {
+    case rgw_data_sync_info::StateInit:
+      status_str = "init";
+      break;
+    case rgw_data_sync_info::StateBuildingFullSyncMaps:
+      status_str = "preparing for full sync";
+      break;
+    case rgw_data_sync_info::StateSync:
+      status_str = "syncing";
+      break;
+    default:
+      status_str = "unknown";
+  }
+
+  push_ss(ss, status, tab) << status_str;
+
+  uint64_t full_total = 0;
+  uint64_t full_complete = 0;
+
+  int num_full = 0;
+  int num_inc = 0;
+  int total_shards = 0;
+  set<int> shards_behind_set;
+
+  // per-shard tally: shards still in full sync count as "behind" and add
+  // only their current position to the completed total; every other shard
+  // adds all of its entries
+  for (auto marker_iter : sync_status.sync_markers) {
+    full_total += marker_iter.second.total_entries;
+    total_shards++;
+    if (marker_iter.second.state == rgw_data_sync_marker::SyncState::FullSync) {
+      num_full++;
+      full_complete += marker_iter.second.pos;
+      int shard_id = marker_iter.first;
+      shards_behind_set.insert(shard_id);
+    } else {
+      full_complete += marker_iter.second.total_entries;
+    }
+    if (marker_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync) {
+      num_inc++;
+    }
+  }
+
+  push_ss(ss, status, tab) << "full sync: " << num_full << "/" << total_shards << " shards";
+
+  if (num_full > 0) {
+    push_ss(ss, status,
tab) << "full sync: " << full_total - full_complete << " buckets to sync"; + } + + push_ss(ss, status, tab) << "incremental sync: " << num_inc << "/" << total_shards << " shards"; + + map source_shards_info; + + ret = sync.read_source_log_shards_info(&source_shards_info); + if (ret < 0) { + push_ss(ss, status, tab) << string("failed to fetch source sync status: ") + cpp_strerror(-ret); + return; + } + + map shards_behind; + + for (auto local_iter : sync_status.sync_markers) { + int shard_id = local_iter.first; + auto iter = source_shards_info.find(shard_id); + + if (iter == source_shards_info.end()) { + /* huh? */ + derr << "ERROR: could not find remote sync shard status for shard_id=" << shard_id << dendl; + continue; + } + auto master_marker = iter->second.marker; + if (local_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync && + master_marker > local_iter.second.marker) { + shards_behind[shard_id] = local_iter.second.marker; + shards_behind_set.insert(shard_id); + } + } + + int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc); + int total_recovering = recovering_shards.size(); + if (total_behind == 0 && total_recovering == 0) { + push_ss(ss, status, tab) << "data is caught up with source"; + } else if (total_behind > 0) { + push_ss(ss, status, tab) << "data is behind on " << total_behind << " shards"; + + push_ss(ss, status, tab) << "behind shards: " << "[" << shards_behind_set << "]" ; + + map master_pos; + ret = sync.read_source_log_shards_next(shards_behind, &master_pos); + if (ret < 0) { + derr << "ERROR: failed to fetch next positions (" << cpp_strerror(-ret) << ")" << dendl; + } else { + std::optional> oldest; + + for (auto iter : master_pos) { + rgw_datalog_shard_data& shard_data = iter.second; + + if (!shard_data.entries.empty()) { + rgw_datalog_entry& entry = shard_data.entries.front(); + if (!oldest) { + oldest.emplace(iter.first, entry.timestamp); + } else if 
(!ceph::real_clock::is_zero(entry.timestamp) && entry.timestamp < oldest->second) { + oldest.emplace(iter.first, entry.timestamp); + } + } + } + + if (oldest) { + push_ss(ss, status, tab) << "oldest incremental change not applied: " + << oldest->second << " [" << oldest->first << ']'; + } + } + } + + if (total_recovering > 0) { + push_ss(ss, status, tab) << total_recovering << " shards are recovering"; + push_ss(ss, status, tab) << "recovering shards: " << "[" << recovering_shards << "]"; + } + + flush_ss(ss, status); +} + +static void tab_dump(const string& header, int width, const list& entries) +{ + string s = header; + + for (auto e : entries) { + cout << std::setw(width) << s << std::setw(1) << " " << e << std::endl; + s.clear(); + } +} + + +static void sync_status(Formatter *formatter) +{ + const RGWRealm& realm = store->svc.zone->get_realm(); + const RGWZoneGroup& zonegroup = store->svc.zone->get_zonegroup(); + const RGWZone& zone = store->svc.zone->get_zone(); + + int width = 15; + + cout << std::setw(width) << "realm" << std::setw(1) << " " << realm.get_id() << " (" << realm.get_name() << ")" << std::endl; + cout << std::setw(width) << "zonegroup" << std::setw(1) << " " << zonegroup.get_id() << " (" << zonegroup.get_name() << ")" << std::endl; + cout << std::setw(width) << "zone" << std::setw(1) << " " << zone.id << " (" << zone.name << ")" << std::endl; + + list md_status; + + if (store->svc.zone->is_meta_master()) { + md_status.push_back("no sync (zone is master)"); + } else { + get_md_sync_status(md_status); + } + + tab_dump("metadata sync", width, md_status); + + list data_status; + + auto& zone_conn_map = store->svc.zone->get_zone_conn_map(); + + for (auto iter : zone_conn_map) { + const string& source_id = iter.first; + string source_str = "source: "; + string s = source_str + source_id; + RGWZone *sz; + if (store->svc.zone->find_zone_by_id(source_id, &sz)) { + s += string(" (") + sz->name + ")"; + } + data_status.push_back(s); + 
get_data_sync_status(source_id, data_status, source_str.size()); + } + + tab_dump("data sync", width, data_status); +} + +struct indented { + int w; // indent width + std::string_view header; + indented(int w, std::string_view header = "") : w(w), header(header) {} +}; +std::ostream& operator<<(std::ostream& out, const indented& h) { + return out << std::setw(h.w) << h.header << std::setw(1) << ' '; +} + +static int remote_bilog_markers(RGWRados *store, const RGWZone& source, + RGWRESTConn *conn, const RGWBucketInfo& info, + BucketIndexShardsManager *markers) +{ + const auto instance_key = info.bucket.get_key(); + const rgw_http_param_pair params[] = { + { "type" , "bucket-index" }, + { "bucket-instance", instance_key.c_str() }, + { "info" , nullptr }, + { nullptr, nullptr } + }; + rgw_bucket_index_marker_info result; + int r = conn->get_json_resource("/admin/log/", params, result); + if (r < 0) { + lderr(store->ctx()) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl; + return r; + } + r = markers->from_string(result.max_marker, -1); + if (r < 0) { + lderr(store->ctx()) << "failed to decode remote log markers" << dendl; + return r; + } + return 0; +} + +static int bucket_source_sync_status(RGWRados *store, const RGWZone& zone, + const RGWZone& source, RGWRESTConn *conn, + const RGWBucketInfo& bucket_info, + int width, std::ostream& out) +{ + out << indented{width, "source zone"} << source.id << " (" << source.name << ")\n"; + + // syncing from this zone? 
+ if (!zone.syncs_from(source.name)) { + out << indented{width} << "not in sync_from\n"; + return 0; + } + std::vector status; + int r = rgw_bucket_sync_status(dpp(), store, source.id, bucket_info, &status); + if (r < 0) { + lderr(store->ctx()) << "failed to read bucket sync status: " << cpp_strerror(r) << dendl; + return r; + } + + int num_full = 0; + int num_inc = 0; + uint64_t full_complete = 0; + const size_t total_shards = status.size(); + + using BucketSyncState = rgw_bucket_shard_sync_info::SyncState; + for (size_t shard_id = 0; shard_id < total_shards; shard_id++) { + auto& m = status[shard_id]; + if (m.state == BucketSyncState::StateFullSync) { + num_full++; + full_complete += m.full_marker.count; + } else if (m.state == BucketSyncState::StateIncrementalSync) { + num_inc++; + } + } + + out << indented{width} << "full sync: " << num_full << "/" << total_shards << " shards\n"; + if (num_full > 0) { + out << indented{width} << "full sync: " << full_complete << " objects completed\n"; + } + out << indented{width} << "incremental sync: " << num_inc << "/" << total_shards << " shards\n"; + + BucketIndexShardsManager remote_markers; + r = remote_bilog_markers(store, source, conn, bucket_info, &remote_markers); + if (r < 0) { + lderr(store->ctx()) << "failed to read remote log: " << cpp_strerror(r) << dendl; + return r; + } + + std::set shards_behind; + for (auto& r : remote_markers.get()) { + auto shard_id = r.first; + auto& m = status[shard_id]; + if (r.second.empty()) { + continue; // empty bucket index shard + } + auto pos = BucketIndexShardsManager::get_shard_marker(m.inc_marker.position); + if (m.state != BucketSyncState::StateIncrementalSync || pos != r.second) { + shards_behind.insert(shard_id); + } + } + if (!shards_behind.empty()) { + out << indented{width} << "bucket is behind on " << shards_behind.size() << " shards\n"; + out << indented{width} << "behind shards: [" << shards_behind << "]\n" ; + } else if (!num_full) { + out << indented{width} << 
"bucket is caught up with source\n"; + } + return 0; +} + +static int bucket_sync_status(RGWRados *store, const RGWBucketInfo& info, + const std::string& source_zone_id, + std::ostream& out) +{ + const RGWRealm& realm = store->svc.zone->get_realm(); + const RGWZoneGroup& zonegroup = store->svc.zone->get_zonegroup(); + const RGWZone& zone = store->svc.zone->get_zone(); + constexpr int width = 15; + + out << indented{width, "realm"} << realm.get_id() << " (" << realm.get_name() << ")\n"; + out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n"; + out << indented{width, "zone"} << zone.id << " (" << zone.name << ")\n"; + out << indented{width, "bucket"} << info.bucket << "\n\n"; + + if (!info.datasync_flag_enabled()) { + out << "Sync is disabled for bucket " << info.bucket.name << '\n'; + return 0; + } + + auto& zone_conn_map = store->svc.zone->get_zone_conn_map(); + if (!source_zone_id.empty()) { + auto z = zonegroup.zones.find(source_zone_id); + if (z == zonegroup.zones.end()) { + lderr(store->ctx()) << "Source zone not found in zonegroup " + << zonegroup.get_name() << dendl; + return -EINVAL; + } + auto c = zone_conn_map.find(source_zone_id); + if (c == zone_conn_map.end()) { + lderr(store->ctx()) << "No connection to zone " << z->second.name << dendl; + return -EINVAL; + } + return bucket_source_sync_status(store, zone, z->second, c->second, + info, width, out); + } + + for (const auto& z : zonegroup.zones) { + auto c = zone_conn_map.find(z.second.id); + if (c != zone_conn_map.end()) { + bucket_source_sync_status(store, zone, z.second, c->second, + info, width, out); + } + } + return 0; +} + +static void parse_tier_config_param(const string& s, map& out) +{ + int level = 0; + string cur_conf; + list confs; + for (auto c : s) { + if (c == ',') { + if (level == 0) { + confs.push_back(cur_conf); + cur_conf.clear(); + continue; + } + } + if (c == '{') { + ++level; + } else if (c == '}') { + --level; + } + cur_conf += c; + } 
+ if (!cur_conf.empty()) { + confs.push_back(cur_conf); + } + + for (auto c : confs) { + ssize_t pos = c.find("="); + if (pos < 0) { + out[c] = ""; + } else { + out[c.substr(0, pos)] = c.substr(pos + 1); + } + } +} + +static int check_pool_support_omap(const rgw_pool& pool) +{ + librados::IoCtx io_ctx; + int ret = store->get_rados_handle()->ioctx_create(pool.to_str().c_str(), io_ctx); + if (ret < 0) { + // the pool may not exist at this moment, we have no way to check if it supports omap. + return 0; + } + + ret = io_ctx.omap_clear("__omap_test_not_exist_oid__"); + if (ret == -EOPNOTSUPP) { + io_ctx.close(); + return ret; + } + io_ctx.close(); + return 0; +} + +int check_reshard_bucket_params(RGWRados *store, + const string& bucket_name, + const string& tenant, + const string& bucket_id, + bool num_shards_specified, + int num_shards, + int yes_i_really_mean_it, + rgw_bucket& bucket, + RGWBucketInfo& bucket_info, + map& attrs) +{ + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return -EINVAL; + } + + if (!num_shards_specified) { + cerr << "ERROR: --num-shards not specified" << std::endl; + return -EINVAL; + } + + if (num_shards > (int)store->get_max_bucket_shards()) { + cerr << "ERROR: num_shards too high, max value: " << store->get_max_bucket_shards() << std::endl; + return -EINVAL; + } + + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, &attrs); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + if (bucket_info.reshard_status != CLS_RGW_RESHARD_NOT_RESHARDING) { + // if in_progress or done then we have an old BucketInfo + cerr << "ERROR: the bucket is currently undergoing resharding and " + "cannot be added to the reshard list at this time" << std::endl; + return -EBUSY; + } + + int num_source_shards = (bucket_info.num_shards > 0 ? 
bucket_info.num_shards : 1);
+
+  if (num_shards <= num_source_shards && !yes_i_really_mean_it) {
+    cerr << "num shards is less or equal to current shards count" << std::endl
+	 << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/// Give `new_bucket_info` a fresh bucket id and shard count, create its
+/// bucket index and store the new instance info together with `attrs`.
+///
+/// NOTE(review): `bucket_info` is not read here, so `new_bucket_info` is
+/// presumably filled in by the caller beforehand - confirm.
+///
+/// @return 0 on success, a negative errno on failure. The error paths return
+///         the negative value so that `ret < 0` checks in callers fire.
+int create_new_bucket_instance(RGWRados *store,
+			       int new_num_shards,
+			       const RGWBucketInfo& bucket_info,
+			       map<string, bufferlist>& attrs,
+			       RGWBucketInfo& new_bucket_info)
+{
+
+  store->create_bucket_id(&new_bucket_info.bucket.bucket_id);
+  new_bucket_info.bucket.oid.clear();
+
+  new_bucket_info.num_shards = new_num_shards;
+  new_bucket_info.objv_tracker.clear();
+
+  int ret = store->init_bucket_index(new_bucket_info, new_bucket_info.num_shards);
+  if (ret < 0) {
+    cerr << "ERROR: failed to init new bucket indexes: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+
+  ret = store->put_bucket_instance_info(new_bucket_info, true, real_time(), &attrs);
+  if (ret < 0) {
+    cerr << "ERROR: failed to store new bucket instance info: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+
+  return 0;
+}
+
+static int scan_totp(CephContext *cct, ceph::real_time& now, rados::cls::otp::otp_info_t& totp, vector<string>& pins,
+                     time_t *pofs)
+{
+#define MAX_TOTP_SKEW_HOURS (24 * 7)
+  ceph_assert(pins.size() == 2);
+
+  time_t start_time = ceph::real_clock::to_time_t(now);
+  time_t time_ofs = 0, time_ofs_abs = 0;
+  time_t step_size = totp.step_size;
+  if (step_size == 0) {
+    step_size = OATH_TOTP_DEFAULT_TIME_STEP_SIZE;
+  }
+  uint32_t count = 0;
+  int sign = 1;
+
+  uint32_t max_skew = MAX_TOTP_SKEW_HOURS * 3600;
+
+  while (time_ofs_abs < max_skew) {
+    int rc = oath_totp_validate2(totp.seed_bin.c_str(), totp.seed_bin.length(),
+                                 start_time,
+                                 step_size,
+                                 time_ofs,
+                                 1,
+                                 nullptr,
+                                 pins[0].c_str());
+    if (rc != OATH_INVALID_OTP) {
+      rc = oath_totp_validate2(totp.seed_bin.c_str(), totp.seed_bin.length(),
+                               start_time,
+                               step_size,
+                               time_ofs - step_size, /* smaller time_ofs moves time forward */
+                               1,
+
nullptr, + pins[1].c_str()); + if (rc != OATH_INVALID_OTP) { + *pofs = time_ofs - step_size + step_size * totp.window / 2; + ldout(cct, 20) << "found at time=" << start_time - time_ofs << " time_ofs=" << time_ofs << dendl; + return 0; + } + } + sign = -sign; + time_ofs_abs = (++count) * step_size; + time_ofs = sign * time_ofs_abs; + } + + return -ENOENT; +} + +static int trim_sync_error_log(int shard_id, const ceph::real_time& start_time, + const ceph::real_time& end_time, + const string& start_marker, const string& end_marker, + int delay_ms) +{ + auto oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX, + shard_id); + // call cls_log_trim() until it returns -ENODATA + for (;;) { + int ret = store->time_log_trim(oid, start_time, end_time, + start_marker, end_marker); + if (ret == -ENODATA) { + return 0; + } + if (ret < 0) { + return ret; + } + if (delay_ms) { + std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms)); + } + } + // unreachable +} + +const string& get_tier_type(RGWRados *store) { + return store->svc.zone->get_zone().tier_type; +} + +int main(int argc, const char **argv) +{ + vector args; + argv_to_vec(argc, (const char **)argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + + // for region -> zonegroup conversion (must happen before common_init_finish()) + if (!g_conf()->rgw_region.empty() && g_conf()->rgw_zonegroup.empty()) { + g_conf().set_val_or_die("rgw_zonegroup", g_conf()->rgw_region.c_str()); + } + + common_init_finish(g_ceph_context); + + rgw_user user_id; + string tenant; + std::string access_key, secret_key, user_email, display_name; + std::string bucket_name, pool_name, object; + rgw_pool pool; + std::string date, subuser, access, format; + std::string start_date, end_date; + std::string 
key_type_str; + std::string period_id, period_epoch, remote, url; + std::string master_zone; + std::string realm_name, realm_id, realm_new_name; + std::string zone_name, zone_id, zone_new_name; + std::string zonegroup_name, zonegroup_id, zonegroup_new_name; + std::string api_name; + std::string role_name, path, assume_role_doc, policy_name, perm_policy_doc, path_prefix; + std::string redirect_zone; + bool redirect_zone_set = false; + list endpoints; + int tmp_int; + int sync_from_all_specified = false; + bool sync_from_all = false; + list sync_from; + list sync_from_rm; + int is_master_int; + int set_default = 0; + bool is_master = false; + bool is_master_set = false; + int read_only_int; + bool read_only = false; + int is_read_only_set = false; + int commit = false; + int staging = false; + int key_type = KEY_TYPE_UNDEFINED; + rgw_bucket bucket; + uint32_t perm_mask = 0; + RGWUserInfo info; + int opt_cmd = OPT_NO_CMD; + bool need_more; + int gen_access_key = 0; + int gen_secret_key = 0; + bool set_perm = false; + bool set_temp_url_key = false; + map temp_url_keys; + string bucket_id; + Formatter *formatter = NULL; + int purge_data = false; + int pretty_format = false; + int show_log_entries = true; + int show_log_sum = true; + int skip_zero_entries = false; // log show + int purge_keys = false; + int yes_i_really_mean_it = false; + int delete_child_objects = false; + int fix = false; + int remove_bad = false; + int check_head_obj_locator = false; + int max_buckets = -1; + bool max_buckets_specified = false; + map categories; + string caps; + int check_objects = false; + RGWUserAdminOpState user_op; + RGWBucketAdminOpState bucket_op; + string infile; + string metadata_key; + RGWObjVersionTracker objv_tracker; + string marker; + string start_marker; + string end_marker; + int max_entries = -1; + bool max_entries_specified = false; + int admin = false; + bool admin_specified = false; + int system = false; + bool system_specified = false; + int shard_id = -1; + bool 
specified_shard_id = false; + string client_id; + string op_id; + string op_mask_str; + string quota_scope; + string object_version; + string placement_id; + string storage_class; + list tags; + list tags_add; + list tags_rm; + + int64_t max_objects = -1; + int64_t max_size = -1; + bool have_max_objects = false; + bool have_max_size = false; + int include_all = false; + int allow_unordered = false; + + int sync_stats = false; + int reset_stats = false; + int bypass_gc = false; + int warnings_only = false; + int inconsistent_index = false; + + int verbose = false; + + int extra_info = false; + + uint64_t min_rewrite_size = 4 * 1024 * 1024; + uint64_t max_rewrite_size = ULLONG_MAX; + uint64_t min_rewrite_stripe_size = 0; + + BIIndexType bi_index_type = BIIndexType::Plain; + + string job_id; + int num_shards = 0; + bool num_shards_specified = false; + int max_concurrent_ios = 32; + uint64_t orphan_stale_secs = (24 * 3600); + int detail = false; + + std::string val; + std::ostringstream errs; + string err; + + string source_zone_name; + string source_zone; /* zone id */ + + string tier_type; + bool tier_type_specified = false; + + map tier_config_add; + map tier_config_rm; + + boost::optional index_pool; + boost::optional data_pool; + boost::optional data_extra_pool; + RGWBucketIndexType placement_index_type = RGWBIType_Normal; + bool index_type_specified = false; + + boost::optional compression_type; + + string totp_serial; + string totp_seed; + string totp_seed_type = "hex"; + vector totp_pin; + int totp_seconds = 0; + int totp_window = 0; + int trim_delay_ms = 0; + + string topic_name; + string sub_name; + string sub_oid_prefix; + string sub_dest_bucket; + string sub_push_endpoint; + string event_id; + rgw::notify::EventTypeList event_types; + + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "-i", "--uid", (char*)NULL)) { + user_id.from_str(val); 
+ if (user_id.empty()) { + cerr << "no value for uid" << std::endl; + exit(1); + } + } else if (ceph_argparse_witharg(args, i, &val, "--tenant", (char*)NULL)) { + tenant = val; + } else if (ceph_argparse_witharg(args, i, &val, "--access-key", (char*)NULL)) { + access_key = val; + } else if (ceph_argparse_witharg(args, i, &val, "--subuser", (char*)NULL)) { + subuser = val; + } else if (ceph_argparse_witharg(args, i, &val, "--secret", "--secret-key", (char*)NULL)) { + secret_key = val; + } else if (ceph_argparse_witharg(args, i, &val, "-e", "--email", (char*)NULL)) { + user_email = val; + user_op.user_email_specified=true; + } else if (ceph_argparse_witharg(args, i, &val, "-n", "--display-name", (char*)NULL)) { + display_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "-b", "--bucket", (char*)NULL)) { + bucket_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) { + pool_name = val; + pool = rgw_pool(pool_name); + } else if (ceph_argparse_witharg(args, i, &val, "-o", "--object", (char*)NULL)) { + object = val; + } else if (ceph_argparse_witharg(args, i, &val, "--object-version", (char*)NULL)) { + object_version = val; + } else if (ceph_argparse_witharg(args, i, &val, "--client-id", (char*)NULL)) { + client_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--op-id", (char*)NULL)) { + op_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--op-mask", (char*)NULL)) { + op_mask_str = val; + } else if (ceph_argparse_witharg(args, i, &val, "--key-type", (char*)NULL)) { + key_type_str = val; + if (key_type_str.compare("swift") == 0) { + key_type = KEY_TYPE_SWIFT; + } else if (key_type_str.compare("s3") == 0) { + key_type = KEY_TYPE_S3; + } else { + cerr << "bad key type: " << key_type_str << std::endl; + exit(1); + } + } else if (ceph_argparse_witharg(args, i, &val, "--job-id", (char*)NULL)) { + job_id = val; + } else if (ceph_argparse_binary_flag(args, i, &gen_access_key, NULL, "--gen-access-key", 
(char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &gen_secret_key, NULL, "--gen-secret", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &show_log_entries, NULL, "--show-log-entries", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &show_log_sum, NULL, "--show-log-sum", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &skip_zero_entries, NULL, "--skip-zero-entries", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &admin, NULL, "--admin", (char*)NULL)) { + admin_specified = true; + } else if (ceph_argparse_binary_flag(args, i, &system, NULL, "--system", (char*)NULL)) { + system_specified = true; + } else if (ceph_argparse_binary_flag(args, i, &verbose, NULL, "--verbose", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &staging, NULL, "--staging", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &commit, NULL, "--commit", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-size", (char*)NULL)) { + min_rewrite_size = (uint64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-rewrite-size", (char*)NULL)) { + max_rewrite_size = (uint64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-stripe-size", (char*)NULL)) { + min_rewrite_stripe_size = (uint64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-buckets", (char*)NULL)) { + max_buckets = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max buckets: " << err << std::endl; + return EINVAL; + } + max_buckets_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) { + max_entries = (int)strict_strtol(val.c_str(), 10, &err); + max_entries_specified = true; + if (!err.empty()) { + 
cerr << "ERROR: failed to parse max entries: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) { + max_size = strict_iec_cast(val.c_str(), &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max size: " << err << std::endl; + return EINVAL; + } + have_max_size = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) { + max_objects = (int64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max objects: " << err << std::endl; + return EINVAL; + } + have_max_objects = true; + } else if (ceph_argparse_witharg(args, i, &val, "--date", "--time", (char*)NULL)) { + date = val; + if (end_date.empty()) + end_date = date; + } else if (ceph_argparse_witharg(args, i, &val, "--start-date", "--start-time", (char*)NULL)) { + start_date = val; + } else if (ceph_argparse_witharg(args, i, &val, "--end-date", "--end-time", (char*)NULL)) { + end_date = val; + } else if (ceph_argparse_witharg(args, i, &val, "--num-shards", (char*)NULL)) { + num_shards = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse num shards: " << err << std::endl; + return EINVAL; + } + num_shards_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--max-concurrent-ios", (char*)NULL)) { + max_concurrent_ios = (int)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse max concurrent ios: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--orphan-stale-secs", (char*)NULL)) { + orphan_stale_secs = (uint64_t)strict_strtoll(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse orphan stale secs: " << err << std::endl; + return EINVAL; + } + } else if (ceph_argparse_witharg(args, i, &val, "--shard-id", (char*)NULL)) { + shard_id = (int)strict_strtol(val.c_str(), 10, &err); + if 
(!err.empty()) { + cerr << "ERROR: failed to parse shard id: " << err << std::endl; + return EINVAL; + } + specified_shard_id = true; + } else if (ceph_argparse_witharg(args, i, &val, "--access", (char*)NULL)) { + access = val; + perm_mask = rgw_str_to_perm(access.c_str()); + set_perm = true; + } else if (ceph_argparse_witharg(args, i, &val, "--temp-url-key", (char*)NULL)) { + temp_url_keys[0] = val; + set_temp_url_key = true; + } else if (ceph_argparse_witharg(args, i, &val, "--temp-url-key2", "--temp-url-key-2", (char*)NULL)) { + temp_url_keys[1] = val; + set_temp_url_key = true; + } else if (ceph_argparse_witharg(args, i, &val, "--bucket-id", (char*)NULL)) { + bucket_id = val; + if (bucket_id.empty()) { + cerr << "no value for bucket-id" << std::endl; + exit(1); + } + } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) { + format = val; + } else if (ceph_argparse_witharg(args, i, &val, "--categories", (char*)NULL)) { + string cat_str = val; + list cat_list; + list::iterator iter; + get_str_list(cat_str, cat_list); + for (iter = cat_list.begin(); iter != cat_list.end(); ++iter) { + categories[*iter] = true; + } + } else if (ceph_argparse_binary_flag(args, i, &delete_child_objects, NULL, "--purge-objects", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &pretty_format, NULL, "--pretty-format", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &purge_data, NULL, "--purge-data", (char*)NULL)) { + delete_child_objects = purge_data; + } else if (ceph_argparse_binary_flag(args, i, &purge_keys, NULL, "--purge-keys", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &yes_i_really_mean_it, NULL, "--yes-i-really-mean-it", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &fix, NULL, "--fix", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &remove_bad, NULL, "--remove-bad", (char*)NULL)) { + // do 
nothing + } else if (ceph_argparse_binary_flag(args, i, &check_head_obj_locator, NULL, "--check-head-obj-locator", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &check_objects, NULL, "--check-objects", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &sync_stats, NULL, "--sync-stats", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &reset_stats, NULL, "--reset-stats", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &allow_unordered, NULL, "--allow-unordered", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &extra_info, NULL, "--extra-info", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &bypass_gc, NULL, "--bypass-gc", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &warnings_only, NULL, "--warnings-only", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &inconsistent_index, NULL, "--inconsistent-index", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) { + caps = val; + } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infile", (char*)NULL)) { + infile = val; + } else if (ceph_argparse_witharg(args, i, &val, "--metadata-key", (char*)NULL)) { + metadata_key = val; + } else if (ceph_argparse_witharg(args, i, &val, "--marker", (char*)NULL)) { + marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--start-marker", (char*)NULL)) { + start_marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--end-marker", (char*)NULL)) { + end_marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--quota-scope", (char*)NULL)) { + quota_scope = val; + } else if (ceph_argparse_witharg(args, i, &val, "--index-type", 
(char*)NULL)) { + string index_type_str = val; + bi_index_type = get_bi_index_type(index_type_str); + if (bi_index_type == BIIndexType::Invalid) { + cerr << "ERROR: invalid bucket index entry type" << std::endl; + return EINVAL; + } + } else if (ceph_argparse_binary_flag(args, i, &is_master_int, NULL, "--master", (char*)NULL)) { + is_master = (bool)is_master_int; + is_master_set = true; + } else if (ceph_argparse_binary_flag(args, i, &set_default, NULL, "--default", (char*)NULL)) { + /* do nothing */ + } else if (ceph_argparse_witharg(args, i, &val, "--redirect-zone", (char*)NULL)) { + redirect_zone = val; + redirect_zone_set = true; + } else if (ceph_argparse_binary_flag(args, i, &read_only_int, NULL, "--read-only", (char*)NULL)) { + read_only = (bool)read_only_int; + is_read_only_set = true; + } else if (ceph_argparse_witharg(args, i, &val, "--master-zone", (char*)NULL)) { + master_zone = val; + } else if (ceph_argparse_witharg(args, i, &val, "--period", (char*)NULL)) { + period_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--epoch", (char*)NULL)) { + period_epoch = val; + } else if (ceph_argparse_witharg(args, i, &val, "--remote", (char*)NULL)) { + remote = val; + } else if (ceph_argparse_witharg(args, i, &val, "--url", (char*)NULL)) { + url = val; + } else if (ceph_argparse_witharg(args, i, &val, "--realm-id", (char*)NULL)) { + realm_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--realm-new-name", (char*)NULL)) { + realm_new_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zonegroup-id", (char*)NULL)) { + zonegroup_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zonegroup-new-name", (char*)NULL)) { + zonegroup_new_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--placement-id", (char*)NULL)) { + placement_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--storage-class", (char*)NULL)) { + storage_class = val; + } else if (ceph_argparse_witharg(args, i, &val, "--tags", 
(char*)NULL)) { + get_str_list(val, tags); + } else if (ceph_argparse_witharg(args, i, &val, "--tags-add", (char*)NULL)) { + get_str_list(val, tags_add); + } else if (ceph_argparse_witharg(args, i, &val, "--tags-rm", (char*)NULL)) { + get_str_list(val, tags_rm); + } else if (ceph_argparse_witharg(args, i, &val, "--api-name", (char*)NULL)) { + api_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zone-id", (char*)NULL)) { + zone_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--zone-new-name", (char*)NULL)) { + zone_new_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--endpoints", (char*)NULL)) { + get_str_list(val, endpoints); + } else if (ceph_argparse_witharg(args, i, &val, "--sync-from", (char*)NULL)) { + get_str_list(val, sync_from); + } else if (ceph_argparse_witharg(args, i, &val, "--sync-from-rm", (char*)NULL)) { + get_str_list(val, sync_from_rm); + } else if (ceph_argparse_binary_flag(args, i, &tmp_int, NULL, "--sync-from-all", (char*)NULL)) { + sync_from_all = (bool)tmp_int; + sync_from_all_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--source-zone", (char*)NULL)) { + source_zone_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--tier-type", (char*)NULL)) { + tier_type = val; + tier_type_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--tier-config", (char*)NULL)) { + parse_tier_config_param(val, tier_config_add); + } else if (ceph_argparse_witharg(args, i, &val, "--tier-config-rm", (char*)NULL)) { + parse_tier_config_param(val, tier_config_rm); + } else if (ceph_argparse_witharg(args, i, &val, "--index-pool", (char*)NULL)) { + index_pool = val; + } else if (ceph_argparse_witharg(args, i, &val, "--data-pool", (char*)NULL)) { + data_pool = val; + } else if (ceph_argparse_witharg(args, i, &val, "--data-extra-pool", (char*)NULL)) { + data_extra_pool = val; + } else if (ceph_argparse_witharg(args, i, &val, "--placement-index-type", (char*)NULL)) { + if (val == 
"normal") { + placement_index_type = RGWBIType_Normal; + } else if (val == "indexless") { + placement_index_type = RGWBIType_Indexless; + } else { + placement_index_type = (RGWBucketIndexType)strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + cerr << "ERROR: failed to parse index type index: " << err << std::endl; + return EINVAL; + } + } + index_type_specified = true; + } else if (ceph_argparse_witharg(args, i, &val, "--compression", (char*)NULL)) { + compression_type = val; + } else if (ceph_argparse_witharg(args, i, &val, "--role-name", (char*)NULL)) { + role_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) { + path = val; + } else if (ceph_argparse_witharg(args, i, &val, "--assume-role-policy-doc", (char*)NULL)) { + assume_role_doc = val; + } else if (ceph_argparse_witharg(args, i, &val, "--policy-name", (char*)NULL)) { + policy_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--policy-doc", (char*)NULL)) { + perm_policy_doc = val; + } else if (ceph_argparse_witharg(args, i, &val, "--path-prefix", (char*)NULL)) { + path_prefix = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-serial", (char*)NULL)) { + totp_serial = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-pin", (char*)NULL)) { + totp_pin.push_back(val); + } else if (ceph_argparse_witharg(args, i, &val, "--totp-seed", (char*)NULL)) { + totp_seed = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-seed-type", (char*)NULL)) { + totp_seed_type = val; + } else if (ceph_argparse_witharg(args, i, &val, "--totp-seconds", (char*)NULL)) { + totp_seconds = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--totp-window", (char*)NULL)) { + totp_window = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--trim-delay-ms", (char*)NULL)) { + trim_delay_ms = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--topic", (char*)NULL)) { + topic_name = val; + } 
else if (ceph_argparse_witharg(args, i, &val, "--sub-name", (char*)NULL)) { + sub_name = val; + } else if (ceph_argparse_witharg(args, i, &val, "--sub-oid-prefix", (char*)NULL)) { + sub_oid_prefix = val; + } else if (ceph_argparse_witharg(args, i, &val, "--sub-dest-bucket", (char*)NULL)) { + sub_dest_bucket = val; + } else if (ceph_argparse_witharg(args, i, &val, "--sub-push-endpoint", (char*)NULL)) { + sub_push_endpoint = val; + } else if (ceph_argparse_witharg(args, i, &val, "--event-id", (char*)NULL)) { + event_id = val; + } else if (ceph_argparse_witharg(args, i, &val, "--event-type", "--event-types", (char*)NULL)) { + rgw::notify::from_string_list(val, event_types); + } else if (ceph_argparse_binary_flag(args, i, &detail, NULL, "--detail", (char*)NULL)) { + // do nothing + } else if (strncmp(*i, "-", 1) == 0) { + cerr << "ERROR: invalid flag " << *i << std::endl; + return EINVAL; + } else { + ++i; + } + } + + if (args.empty()) { + usage(); + exit(1); + } + else { + const char *prev_cmd = NULL; + const char *prev_prev_cmd = NULL; + std::vector::iterator i ; + for (i = args.begin(); i != args.end(); ++i) { + opt_cmd = get_cmd(*i, prev_cmd, prev_prev_cmd, &need_more); + if (opt_cmd < 0) { + cerr << "unrecognized arg " << *i << std::endl; + exit(1); + } + if (!need_more) { + ++i; + break; + } + prev_prev_cmd = prev_cmd; + prev_cmd = *i; + } + + if (opt_cmd == OPT_NO_CMD) { + cerr << "no command" << std::endl; + exit(1); + } + + /* some commands may have an optional extra param */ + if (i != args.end()) { + switch (opt_cmd) { + case OPT_METADATA_GET: + case OPT_METADATA_PUT: + case OPT_METADATA_RM: + case OPT_METADATA_LIST: + metadata_key = *i; + break; + default: + break; + } + } + + if (tenant.empty()) { + tenant = user_id.tenant; + } else { + if (user_id.empty() && opt_cmd != OPT_ROLE_CREATE + && opt_cmd != OPT_ROLE_DELETE + && opt_cmd != OPT_ROLE_GET + && opt_cmd != OPT_ROLE_MODIFY + && opt_cmd != OPT_ROLE_LIST + && opt_cmd != OPT_ROLE_POLICY_PUT + && opt_cmd 
!= OPT_ROLE_POLICY_LIST + && opt_cmd != OPT_ROLE_POLICY_GET + && opt_cmd != OPT_ROLE_POLICY_DELETE + && opt_cmd != OPT_RESHARD_ADD + && opt_cmd != OPT_RESHARD_CANCEL + && opt_cmd != OPT_RESHARD_STATUS) { + cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl; + return EINVAL; + } + user_id.tenant = tenant; + } + /* check key parameter conflict */ + if ((!access_key.empty()) && gen_access_key) { + cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl; + return EINVAL; + } + if ((!secret_key.empty()) && gen_secret_key) { + cerr << "ERROR: key parameter conflict, --secret & --gen-secret" << std::endl; + return EINVAL; + } + } + + // default to pretty json + if (format.empty()) { + format = "json"; + pretty_format = true; + } + + if (format == "xml") + formatter = new XMLFormatter(pretty_format); + else if (format == "json") + formatter = new JSONFormatter(pretty_format); + else { + cerr << "unrecognized format: " << format << std::endl; + exit(1); + } + + realm_name = g_conf()->rgw_realm; + zone_name = g_conf()->rgw_zone; + zonegroup_name = g_conf()->rgw_zonegroup; + + RGWStreamFlusher f(formatter, cout); + + // not a raw op if 'period update' needs to commit to master + bool raw_period_update = opt_cmd == OPT_PERIOD_UPDATE && !commit; + // not a raw op if 'period pull' needs to read zone/period configuration + bool raw_period_pull = opt_cmd == OPT_PERIOD_PULL && !url.empty(); + + std::set raw_storage_ops_list = {OPT_ZONEGROUP_ADD, OPT_ZONEGROUP_CREATE, OPT_ZONEGROUP_DELETE, + OPT_ZONEGROUP_GET, OPT_ZONEGROUP_LIST, + OPT_ZONEGROUP_SET, OPT_ZONEGROUP_DEFAULT, + OPT_ZONEGROUP_RENAME, OPT_ZONEGROUP_MODIFY, + OPT_ZONEGROUP_REMOVE, + OPT_ZONEGROUP_PLACEMENT_ADD, OPT_ZONEGROUP_PLACEMENT_RM, + OPT_ZONEGROUP_PLACEMENT_MODIFY, OPT_ZONEGROUP_PLACEMENT_LIST, + OPT_ZONEGROUP_PLACEMENT_GET, + OPT_ZONEGROUP_PLACEMENT_DEFAULT, + OPT_ZONE_CREATE, OPT_ZONE_DELETE, + OPT_ZONE_GET, OPT_ZONE_SET, OPT_ZONE_RENAME, + OPT_ZONE_LIST, 
OPT_ZONE_MODIFY, OPT_ZONE_DEFAULT, + OPT_ZONE_PLACEMENT_ADD, OPT_ZONE_PLACEMENT_RM, + OPT_ZONE_PLACEMENT_MODIFY, OPT_ZONE_PLACEMENT_LIST, + OPT_ZONE_PLACEMENT_GET, + OPT_REALM_CREATE, + OPT_PERIOD_DELETE, OPT_PERIOD_GET, + OPT_PERIOD_GET_CURRENT, OPT_PERIOD_LIST, + OPT_GLOBAL_QUOTA_GET, OPT_GLOBAL_QUOTA_SET, + OPT_GLOBAL_QUOTA_ENABLE, OPT_GLOBAL_QUOTA_DISABLE, + OPT_REALM_DELETE, OPT_REALM_GET, OPT_REALM_LIST, + OPT_REALM_LIST_PERIODS, + OPT_REALM_GET_DEFAULT, + OPT_REALM_RENAME, OPT_REALM_SET, + OPT_REALM_DEFAULT, OPT_REALM_PULL}; + + std::set readonly_ops_list = { + OPT_USER_INFO, + OPT_USER_STATS, + OPT_BUCKETS_LIST, + OPT_BUCKET_LIMIT_CHECK, + OPT_BUCKET_STATS, + OPT_BUCKET_SYNC_STATUS, + OPT_BUCKET_SYNC_MARKERS, + OPT_LOG_LIST, + OPT_LOG_SHOW, + OPT_USAGE_SHOW, + OPT_OBJECT_STAT, + OPT_BI_GET, + OPT_BI_LIST, + OPT_OLH_GET, + OPT_OLH_READLOG, + OPT_GC_LIST, + OPT_LC_LIST, + OPT_ORPHANS_LIST_JOBS, + OPT_ZONEGROUP_GET, + OPT_ZONEGROUP_LIST, + OPT_ZONEGROUP_PLACEMENT_LIST, + OPT_ZONEGROUP_PLACEMENT_GET, + OPT_ZONE_GET, + OPT_ZONE_LIST, + OPT_ZONE_PLACEMENT_LIST, + OPT_ZONE_PLACEMENT_GET, + OPT_METADATA_GET, + OPT_METADATA_LIST, + OPT_METADATA_SYNC_STATUS, + OPT_MDLOG_LIST, + OPT_MDLOG_STATUS, + OPT_SYNC_ERROR_LIST, + OPT_BILOG_LIST, + OPT_BILOG_STATUS, + OPT_DATA_SYNC_STATUS, + OPT_DATALOG_LIST, + OPT_DATALOG_STATUS, + OPT_REALM_GET, + OPT_REALM_GET_DEFAULT, + OPT_REALM_LIST, + OPT_REALM_LIST_PERIODS, + OPT_PERIOD_GET, + OPT_PERIOD_GET_CURRENT, + OPT_PERIOD_LIST, + OPT_GLOBAL_QUOTA_GET, + OPT_SYNC_STATUS, + OPT_ROLE_GET, + OPT_ROLE_LIST, + OPT_ROLE_POLICY_LIST, + OPT_ROLE_POLICY_GET, + OPT_RESHARD_LIST, + OPT_RESHARD_STATUS, + }; + + bool raw_storage_op = (raw_storage_ops_list.find(opt_cmd) != raw_storage_ops_list.end() || + raw_period_update || raw_period_pull); + bool need_cache = readonly_ops_list.find(opt_cmd) == readonly_ops_list.end(); + + if (raw_storage_op) { + store = RGWStoreManager::get_raw_storage(g_ceph_context); + } else { + store = 
RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false, + need_cache && g_conf()->rgw_cache_enabled); + } + if (!store) { + cerr << "couldn't init storage provider" << std::endl; + return 5; //EIO + } + + if (!source_zone_name.empty()) { + if (!store->svc.zone->find_zone_id_by_name(source_zone_name, &source_zone)) { + cerr << "WARNING: cannot find source zone id for name=" << source_zone_name << std::endl; + source_zone = source_zone_name; + } + } + + rgw_user_init(store); + rgw_bucket_init(store->meta_mgr); + rgw_otp_init(store); + + rgw_http_client_init(g_ceph_context); + + struct rgw_curl_setup { + rgw_curl_setup() { + rgw::curl::setup_curl(boost::none); + } + ~rgw_curl_setup() { + rgw::curl::cleanup_curl(); + } + } curl_cleanup; + + oath_init(); + + StoreDestructor store_destructor(store); + + if (raw_storage_op) { + switch (opt_cmd) { + case OPT_PERIOD_DELETE: + { + if (period_id.empty()) { + cerr << "missing period id" << std::endl; + return EINVAL; + } + RGWPeriod period(period_id); + int ret = period.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "period.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = period.delete_obj(); + if (ret < 0) { + cerr << "ERROR: couldn't delete period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + } + break; + case OPT_PERIOD_GET: + { + epoch_t epoch = 0; + if (!period_epoch.empty()) { + epoch = atoi(period_epoch.c_str()); + } + if (staging) { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0 ) { + cerr << "Error initializing realm " << cpp_strerror(-ret) << std::endl; + return -ret; + } + realm_id = realm.get_id(); + realm_name = realm.get_name(); + period_id = RGWPeriod::get_staging_id(realm_id); + epoch = 1; + } + RGWPeriod period(period_id, epoch); + int ret = period.init(g_ceph_context, store->svc.sysobj, realm_id, realm_name); + if (ret < 0) { + cerr << "period init failed: 
" << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("period", period, formatter); + formatter->flush(cout); + } + break; + case OPT_PERIOD_GET_CURRENT: + { + int ret = read_current_period_id(store, realm_id, realm_name, &period_id); + if (ret < 0) { + return -ret; + } + formatter->open_object_section("period_get_current"); + encode_json("current_period", period_id, formatter); + formatter->close_section(); + formatter->flush(cout); + } + break; + case OPT_PERIOD_LIST: + { + list periods; + int ret = store->svc.zone->list_periods(periods); + if (ret < 0) { + cerr << "failed to list periods: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("periods_list"); + encode_json("periods", periods, formatter); + formatter->close_section(); + formatter->flush(cout); + } + break; + case OPT_PERIOD_UPDATE: + { + int ret = update_period(realm_id, realm_name, period_id, period_epoch, + commit, remote, url, access_key, secret_key, + formatter, yes_i_really_mean_it); + if (ret < 0) { + return -ret; + } + } + break; + case OPT_PERIOD_PULL: + { + boost::optional conn; + RGWRESTConn *remote_conn = nullptr; + if (url.empty()) { + // load current period for endpoints + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + RGWPeriod current_period(realm.get_current_period()); + ret = current_period.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init current period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (remote.empty()) { + // use realm master zone as remote + remote = current_period.get_master_zone(); + } + conn = get_remote_conn(store, current_period.get_map(), remote); + if (!conn) { + cerr << "failed to find a zone or zonegroup for remote " + << remote << std::endl; + return -ENOENT; + } + remote_conn = &*conn; + } + + 
RGWPeriod period; + int ret = do_period_pull(remote_conn, url, access_key, secret_key, + realm_id, realm_name, period_id, period_epoch, + &period); + if (ret < 0) { + cerr << "period pull failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("period", period, formatter); + formatter->flush(cout); + } + break; + case OPT_GLOBAL_QUOTA_GET: + case OPT_GLOBAL_QUOTA_SET: + case OPT_GLOBAL_QUOTA_ENABLE: + case OPT_GLOBAL_QUOTA_DISABLE: + { + if (realm_id.empty()) { + RGWRealm realm(g_ceph_context, store->svc.sysobj); + if (!realm_name.empty()) { + // look up realm_id for the given realm_name + int ret = realm.read_id(realm_name, realm_id); + if (ret < 0) { + cerr << "ERROR: failed to read realm for " << realm_name + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + // use default realm_id when none is given + int ret = realm.read_default_id(realm_id); + if (ret < 0 && ret != -ENOENT) { // on ENOENT, use empty realm_id + cerr << "ERROR: failed to read default realm: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + RGWPeriodConfig period_config; + int ret = period_config.read(store->svc.sysobj, realm_id); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: failed to read period config: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + + formatter->open_object_section("period_config"); + if (quota_scope == "bucket") { + set_quota_info(period_config.bucket_quota, opt_cmd, + max_size, max_objects, + have_max_size, have_max_objects); + encode_json("bucket quota", period_config.bucket_quota, formatter); + } else if (quota_scope == "user") { + set_quota_info(period_config.user_quota, opt_cmd, + max_size, max_objects, + have_max_size, have_max_objects); + encode_json("user quota", period_config.user_quota, formatter); + } else if (quota_scope.empty() && opt_cmd == OPT_GLOBAL_QUOTA_GET) { + // if no scope is given for GET, print both + encode_json("bucket quota", period_config.bucket_quota, 
formatter); + encode_json("user quota", period_config.user_quota, formatter); + } else { + cerr << "ERROR: invalid quota scope specification. Please specify " + "either --quota-scope=bucket, or --quota-scope=user" << std::endl; + return EINVAL; + } + formatter->close_section(); + + if (opt_cmd != OPT_GLOBAL_QUOTA_GET) { + // write the modified period config + ret = period_config.write(store->svc.sysobj, realm_id); + if (ret < 0) { + cerr << "ERROR: failed to write period config: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (!realm_id.empty()) { + cout << "Global quota changes saved. Use 'period update' to apply " + "them to the staging period, and 'period commit' to commit the " + "new period." << std::endl; + } else { + cout << "Global quota changes saved. They will take effect as " + "the gateways are restarted." << std::endl; + } + } + + formatter->flush(cout); + } + break; + case OPT_REALM_CREATE: + { + if (realm_name.empty()) { + cerr << "missing realm name" << std::endl; + return EINVAL; + } + + RGWRealm realm(realm_name, g_ceph_context, store->svc.sysobj); + int ret = realm.create(); + if (ret < 0) { + cerr << "ERROR: couldn't create realm " << realm_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = realm.set_as_default(); + if (ret < 0) { + cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("realm", realm, formatter); + formatter->flush(cout); + } + break; + case OPT_REALM_DELETE: + { + RGWRealm realm(realm_id, realm_name); + if (realm_name.empty() && realm_id.empty()) { + cerr << "missing realm name or id" << std::endl; + return EINVAL; + } + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "realm.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = realm.delete_obj(); + if (ret < 0) { + cerr << "ERROR: couldn't : " << cpp_strerror(-ret) << std::endl; + 
return -ret; + } + + } + break; + case OPT_REALM_GET: + { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + if (ret == -ENOENT && realm_name.empty() && realm_id.empty()) { + cerr << "missing realm name or id, or default realm not found" << std::endl; + } else { + cerr << "realm.init failed: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + encode_json("realm", realm, formatter); + formatter->flush(cout); + } + break; + case OPT_REALM_GET_DEFAULT: + { + RGWRealm realm(g_ceph_context, store->svc.sysobj); + string default_id; + int ret = realm.read_default_id(default_id); + if (ret == -ENOENT) { + cout << "No default realm is set" << std::endl; + return -ret; + } else if (ret < 0) { + cerr << "Error reading default realm:" << cpp_strerror(-ret) << std::endl; + return -ret; + } + cout << "default realm: " << default_id << std::endl; + } + break; + case OPT_REALM_LIST: + { + RGWRealm realm(g_ceph_context, store->svc.sysobj); + string default_id; + int ret = realm.read_default_id(default_id); + if (ret < 0 && ret != -ENOENT) { + cerr << "could not determine default realm: " << cpp_strerror(-ret) << std::endl; + } + list realms; + ret = store->svc.zone->list_realms(realms); + if (ret < 0) { + cerr << "failed to list realms: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("realms_list"); + encode_json("default_info", default_id, formatter); + encode_json("realms", realms, formatter); + formatter->close_section(); + formatter->flush(cout); + } + break; + case OPT_REALM_LIST_PERIODS: + { + int ret = read_current_period_id(store, realm_id, realm_name, &period_id); + if (ret < 0) { + return -ret; + } + list periods; + ret = store->svc.zone->list_periods(period_id, periods); + if (ret < 0) { + cerr << "list periods failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("realm_periods_list"); + 
encode_json("current_period", period_id, formatter); + encode_json("periods", periods, formatter); + formatter->close_section(); + formatter->flush(cout); + } + break; + + case OPT_REALM_RENAME: + { + RGWRealm realm(realm_id, realm_name); + if (realm_new_name.empty()) { + cerr << "missing realm new name" << std::endl; + return EINVAL; + } + if (realm_name.empty() && realm_id.empty()) { + cerr << "missing realm name or id" << std::endl; + return EINVAL; + } + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "realm.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = realm.rename(realm_new_name); + if (ret < 0) { + cerr << "realm.rename failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + cout << "Realm name updated. Note that this change only applies to " + "the current cluster, so this command must be run separately " + "on each of the realm's other clusters." << std::endl; + } + break; + case OPT_REALM_SET: + { + if (realm_id.empty() && realm_name.empty()) { + cerr << "no realm name or id provided" << std::endl; + return EINVAL; + } + RGWRealm realm(realm_id, realm_name); + bool new_realm = false; + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } else if (ret == -ENOENT) { + new_realm = true; + } + ret = read_decode_json(infile, realm); + if (ret < 0) { + return 1; + } + if (!realm_name.empty() && realm.get_name() != realm_name) { + cerr << "mismatch between --rgw-realm " << realm_name << " and json input file name " << + realm.get_name() << std::endl; + return EINVAL; + } + /* new realm */ + if (new_realm) { + cout << "clearing period and epoch for new realm" << std::endl; + realm.clear_current_period_and_epoch(); + ret = realm.create(); + if (ret < 0) { + cerr << "ERROR: couldn't create new realm: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } 
else { + ret = realm.update(); + if (ret < 0) { + cerr << "ERROR: couldn't store realm info: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (set_default) { + ret = realm.set_as_default(); + if (ret < 0) { + cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + encode_json("realm", realm, formatter); + formatter->flush(cout); + } + break; + + case OPT_REALM_DEFAULT: + { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = realm.set_as_default(); + if (ret < 0) { + cerr << "failed to set realm as default: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT_REALM_PULL: + { + if (url.empty()) { + cerr << "A --url must be provided." << std::endl; + return EINVAL; + } + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "GET"; + info.request_uri = "/admin/realm"; + + map ¶ms = info.args.get_params(); + if (!realm_id.empty()) + params["id"] = realm_id; + if (!realm_name.empty()) + params["name"] = realm_name; + + bufferlist bl; + JSONParser p; + int ret = send_to_url(url, access_key, secret_key, info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + if (ret == -EACCES) { + cerr << "If the realm has been changed on the master zone, the " + "master zone's gateway may need to be restarted to recognize " + "this user." 
<< std::endl; + } + return -ret; + } + RGWRealm realm; + realm.init(g_ceph_context, store->svc.sysobj, false); + try { + decode_json_obj(realm, &p); + } catch (JSONDecoder::err& e) { + cerr << "failed to decode JSON response: " << e.message << std::endl; + return EINVAL; + } + RGWPeriod period; + auto& current_period = realm.get_current_period(); + if (!current_period.empty()) { + // pull the latest epoch of the realm's current period + ret = do_period_pull(nullptr, url, access_key, secret_key, + realm_id, realm_name, current_period, "", + &period); + if (ret < 0) { + cerr << "could not fetch period " << current_period << std::endl; + return -ret; + } + } + ret = realm.create(false); + if (ret < 0 && ret != -EEXIST) { + cerr << "Error storing realm " << realm.get_id() << ": " + << cpp_strerror(ret) << std::endl; + return -ret; + } else if (ret ==-EEXIST) { + ret = realm.update(); + if (ret < 0) { + cerr << "Error storing realm " << realm.get_id() << ": " + << cpp_strerror(ret) << std::endl; + } + } + + if (set_default) { + ret = realm.set_as_default(); + if (ret < 0) { + cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("realm", realm, formatter); + formatter->flush(cout); + } + break; + + case OPT_ZONEGROUP_ADD: + { + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + + RGWZoneGroup zonegroup(zonegroup_id,zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to initialize zonegroup " << zonegroup_name << " id " << zonegroup_id << " :" + << cpp_strerror(-ret) << std::endl; + return -ret; + } + RGWZoneParams zone(zone_id, zone_name); + ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (zone.realm_id != zonegroup.realm_id) { + 
zone.realm_id = zonegroup.realm_id; + ret = zone.update(); + if (ret < 0) { + cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + string *ptier_type = (tier_type_specified ? &tier_type : nullptr); + + for (auto a : tier_config_add) { + int r = zone.tier_config.set(a.first, a.second); + if (r < 0) { + cerr << "ERROR: failed to set configurable: " << a << std::endl; + return EINVAL; + } + } + + bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr); + string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr); + + ret = zonegroup.add_zone(zone, + (is_master_set ? &is_master : NULL), + (is_read_only_set ? &read_only : NULL), + endpoints, ptier_type, + psync_from_all, sync_from, sync_from_rm, + predirect_zone, + store->svc.sync_modules->get_manager()); + if (ret < 0) { + cerr << "failed to add zone " << zone_name << " to zonegroup " << zonegroup.get_name() << ": " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zonegroup", zonegroup, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_CREATE: + { + if (zonegroup_name.empty()) { + cerr << "Missing zonegroup name" << std::endl; + return EINVAL; + } + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneGroup zonegroup(zonegroup_name, is_master, g_ceph_context, store->svc.sysobj, realm.get_id(), endpoints); + zonegroup.api_name = (api_name.empty() ? 
zonegroup_name : api_name); + ret = zonegroup.create(); + if (ret < 0) { + cerr << "failed to create zonegroup " << zonegroup_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = zonegroup.set_as_default(); + if (ret < 0) { + cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zonegroup", zonegroup, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_DEFAULT: + { + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = zonegroup.set_as_default(); + if (ret < 0) { + cerr << "failed to set zonegroup as default: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT_ZONEGROUP_DELETE: + { + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = zonegroup.delete_obj(); + if (ret < 0) { + cerr << "ERROR: couldn't delete zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT_ZONEGROUP_GET: + { + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zonegroup", zonegroup, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_LIST: + { + 
RGWZoneGroup zonegroup; + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj, false); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + list zonegroups; + ret = store->svc.zone->list_zonegroups(zonegroups); + if (ret < 0) { + cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + string default_zonegroup; + ret = zonegroup.read_default_id(default_zonegroup); + if (ret < 0 && ret != -ENOENT) { + cerr << "could not determine default zonegroup: " << cpp_strerror(-ret) << std::endl; + } + formatter->open_object_section("zonegroups_list"); + encode_json("default_info", default_zonegroup, formatter); + encode_json("zonegroups", zonegroups, formatter); + formatter->close_section(); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_MODIFY: + { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bool need_update = false; + + if (!master_zone.empty()) { + zonegroup.master_zone = master_zone; + need_update = true; + } + + if (is_master_set) { + zonegroup.update_master(is_master); + need_update = true; + } + + if (!endpoints.empty()) { + zonegroup.endpoints = endpoints; + need_update = true; + } + + if (!api_name.empty()) { + zonegroup.api_name = api_name; + need_update = true; + } + + if (!realm_id.empty()) { + zonegroup.realm_id = realm_id; + need_update = true; + } else if (!realm_name.empty()) { + // get realm id from name + RGWRealm realm{g_ceph_context, store->svc.sysobj}; + ret = realm.read_id(realm_name, zonegroup.realm_id); + if (ret < 0) { + cerr << "failed to find 
realm by name " << realm_name << std::endl; + return -ret; + } + need_update = true; + } + + if (need_update) { + ret = zonegroup.update(); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (set_default) { + ret = zonegroup.set_as_default(); + if (ret < 0) { + cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zonegroup", zonegroup, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_SET: + { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + bool default_realm_not_exist = (ret == -ENOENT && realm_id.empty() && realm_name.empty()); + + if (ret < 0 && !default_realm_not_exist ) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneGroup zonegroup; + ret = zonegroup.init(g_ceph_context, store->svc.sysobj, false); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = read_decode_json(infile, zonegroup); + if (ret < 0) { + return 1; + } + if (zonegroup.realm_id.empty() && !default_realm_not_exist) { + zonegroup.realm_id = realm.get_id(); + } + ret = zonegroup.create(); + if (ret < 0 && ret != -EEXIST) { + cerr << "ERROR: couldn't create zonegroup info: " << cpp_strerror(-ret) << std::endl; + return 1; + } else if (ret == -EEXIST) { + ret = zonegroup.update(); + if (ret < 0) { + cerr << "ERROR: couldn't store zonegroup info: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (set_default) { + ret = zonegroup.set_as_default(); + if (ret < 0) { + cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zonegroup", zonegroup, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_REMOVE: + { + RGWZoneGroup zonegroup(zonegroup_id, 
zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (zone_id.empty()) { + if (zone_name.empty()) { + cerr << "no --zone-id or --rgw-zone name provided" << std::endl; + return EINVAL; + } + // look up zone id by name + for (auto& z : zonegroup.zones) { + if (zone_name == z.second.name) { + zone_id = z.second.id; + break; + } + } + if (zone_id.empty()) { + cerr << "zone name " << zone_name << " not found in zonegroup " + << zonegroup.get_name() << std::endl; + return ENOENT; + } + } + + ret = zonegroup.remove_zone(zone_id); + if (ret < 0) { + cerr << "failed to remove zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zonegroup", zonegroup, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_RENAME: + { + if (zonegroup_new_name.empty()) { + cerr << " missing zonegroup new name" << std::endl; + return EINVAL; + } + if (zonegroup_id.empty() && zonegroup_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = zonegroup.rename(zonegroup_new_name); + if (ret < 0) { + cerr << "failed to rename zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT_ZONEGROUP_PLACEMENT_LIST: + { + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("placement_targets", zonegroup.placement_targets, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_PLACEMENT_GET: + { + if 
(placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto p = zonegroup.placement_targets.find(placement_id); + if (p == zonegroup.placement_targets.end()) { + cerr << "failed to find a zonegroup placement target named '" << placement_id << "'" << std::endl; + return -ENOENT; + } + encode_json("placement_targets", p->second, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONEGROUP_PLACEMENT_ADD: + case OPT_ZONEGROUP_PLACEMENT_MODIFY: + case OPT_ZONEGROUP_PLACEMENT_RM: + case OPT_ZONEGROUP_PLACEMENT_DEFAULT: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + + rgw_placement_rule rule; + rule.from_str(placement_id); + + if (!rule.storage_class.empty() && !storage_class.empty() && + rule.storage_class != storage_class) { + cerr << "ERROR: provided contradicting storage class configuration" << std::endl; + return EINVAL; + } else if (rule.storage_class.empty()) { + rule.storage_class = storage_class; + } + + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (opt_cmd == OPT_ZONEGROUP_PLACEMENT_ADD || + opt_cmd == OPT_ZONEGROUP_PLACEMENT_MODIFY) { + RGWZoneGroupPlacementTarget& target = zonegroup.placement_targets[placement_id]; + if (!tags.empty()) { + target.tags.clear(); + for (auto& t : tags) { + target.tags.insert(t); + } + } + target.name = placement_id; + for (auto& t : tags_rm) { + target.tags.erase(t); + } + for (auto& t : tags_add) { + target.tags.insert(t); + } + 
target.storage_classes.insert(rule.get_storage_class()); + } else if (opt_cmd == OPT_ZONEGROUP_PLACEMENT_RM) { + zonegroup.placement_targets.erase(placement_id); + } else if (opt_cmd == OPT_ZONEGROUP_PLACEMENT_DEFAULT) { + if (!zonegroup.placement_targets.count(placement_id)) { + cerr << "failed to find a zonegroup placement target named '" + << placement_id << "'" << std::endl; + return -ENOENT; + } + zonegroup.default_placement = rule; + } + + zonegroup.post_process_params(); + ret = zonegroup.update(); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("placement_targets", zonegroup.placement_targets, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_CREATE: + { + if (zone_name.empty()) { + cerr << "zone name not provided" << std::endl; + return EINVAL; + } + int ret; + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + /* if the user didn't provide zonegroup info , create stand alone zone */ + if (!zonegroup_id.empty() || !zonegroup_name.empty()) { + ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zonegroup " << zonegroup_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (realm_id.empty() && realm_name.empty()) { + realm_id = zonegroup.realm_id; + } + } + + RGWZoneParams zone(zone_id, zone_name); + ret = zone.init(g_ceph_context, store->svc.sysobj, false); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + zone.system_key.id = access_key; + zone.system_key.key = secret_key; + zone.realm_id = realm_id; + for (auto a : tier_config_add) { + int r = zone.tier_config.set(a.first, a.second); + if (r < 0) { + cerr << "ERROR: failed to set configurable: " << a << std::endl; + return EINVAL; + } + } + + ret = zone.create(); + if (ret < 0) { + cerr << "failed to create zone " << zone_name << ": " << cpp_strerror(-ret) << 
std::endl; + return -ret; + } + + if (!zonegroup_id.empty() || !zonegroup_name.empty()) { + string *ptier_type = (tier_type_specified ? &tier_type : nullptr); + bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr); + string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr); + ret = zonegroup.add_zone(zone, + (is_master_set ? &is_master : NULL), + (is_read_only_set ? &read_only : NULL), + endpoints, + ptier_type, + psync_from_all, + sync_from, sync_from_rm, + predirect_zone, + store->svc.sync_modules->get_manager()); + if (ret < 0) { + cerr << "failed to add zone " << zone_name << " to zonegroup " << zonegroup.get_name() + << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (set_default) { + ret = zone.set_as_default(); + if (ret < 0) { + cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zone", zone, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_DEFAULT: + { + RGWZoneGroup zonegroup(zonegroup_id,zonegroup_name); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "WARNING: failed to initialize zonegroup " << zonegroup_name << std::endl; + } + if (zone_id.empty() && zone_name.empty()) { + cerr << "no zone name or id provided" << std::endl; + return EINVAL; + } + RGWZoneParams zone(zone_id, zone_name); + ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = zone.set_as_default(); + if (ret < 0) { + cerr << "failed to set zone as default: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT_ZONE_DELETE: + { + if (zone_id.empty() && zone_name.empty()) { + cerr << "no zone name or id provided" << std::endl; + return EINVAL; + } + RGWZoneParams zone(zone_id, zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 
0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + list zonegroups; + ret = store->svc.zone->list_zonegroups(zonegroups); + if (ret < 0) { + cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + for (list::iterator iter = zonegroups.begin(); iter != zonegroups.end(); ++iter) { + RGWZoneGroup zonegroup(string(), *iter); + int ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "WARNING: failed to initialize zonegroup " << zonegroup_name << std::endl; + continue; + } + ret = zonegroup.remove_zone(zone.get_id()); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to remove zone " << zone_name << " from zonegroup " << zonegroup.get_name() << ": " + << cpp_strerror(-ret) << std::endl; + } + } + + ret = zone.delete_obj(); + if (ret < 0) { + cerr << "failed to delete zone " << zone_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + break; + case OPT_ZONE_GET: + { + RGWZoneParams zone(zone_id, zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("zone", zone, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_SET: + { + RGWZoneParams zone(zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj, false); + if (ret < 0) { + return -ret; + } + + ret = zone.read(); + if (ret < 0 && ret != -ENOENT) { + cerr << "zone.read() returned ret=" << ret << std::endl; + return -ret; + } + + string orig_id = zone.get_id(); + + ret = read_decode_json(infile, zone); + if (ret < 0) { + return 1; + } + + if(zone.realm_id.empty()) { + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0 && ret != -ENOENT) { + cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + zone.realm_id = 
realm.get_id(); + } + + if( !zone_name.empty() && !zone.get_name().empty() && zone.get_name() != zone_name) { + cerr << "Error: zone name" << zone_name << " is different than the zone name " << zone.get_name() << " in the provided json " << std::endl; + return EINVAL; + } + + if (zone.get_name().empty()) { + zone.set_name(zone_name); + if (zone.get_name().empty()) { + cerr << "no zone name specified" << std::endl; + return EINVAL; + } + } + + zone_name = zone.get_name(); + + if (zone.get_id().empty()) { + zone.set_id(orig_id); + } + + if (zone.get_id().empty()) { + cerr << "no zone name id the json provided, assuming old format" << std::endl; + if (zone_name.empty()) { + cerr << "missing zone name" << std::endl; + return EINVAL; + } + zone.set_name(zone_name); + zone.set_id(zone_name); + } + + cerr << "zone id " << zone.get_id(); + ret = zone.fix_pool_names(); + if (ret < 0) { + cerr << "ERROR: couldn't fix zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = zone.write(false); + if (ret < 0) { + cerr << "ERROR: couldn't create zone: " << cpp_strerror(-ret) << std::endl; + return 1; + } + + if (set_default) { + ret = zone.set_as_default(); + if (ret < 0) { + cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zone", zone, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_LIST: + { + list zones; + int ret = store->svc.zone->list_zones(zones); + if (ret < 0) { + cerr << "failed to list zones: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWZoneParams zone; + ret = zone.init(g_ceph_context, store->svc.sysobj, false); + if (ret < 0) { + cerr << "failed to init zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + string default_zone; + ret = zone.read_default_id(default_zone); + if (ret < 0 && ret != -ENOENT) { + cerr << "could not determine default zone: " << cpp_strerror(-ret) << std::endl; + } + 
formatter->open_object_section("zones_list"); + encode_json("default_info", default_zone, formatter); + encode_json("zones", zones, formatter); + formatter->close_section(); + formatter->flush(cout); + } + break; + case OPT_ZONE_MODIFY: + { + RGWZoneParams zone(zone_id, zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bool need_zone_update = false; + if (!access_key.empty()) { + zone.system_key.id = access_key; + need_zone_update = true; + } + + if (!secret_key.empty()) { + zone.system_key.key = secret_key; + need_zone_update = true; + } + + if (!realm_id.empty()) { + zone.realm_id = realm_id; + need_zone_update = true; + } else if (!realm_name.empty()) { + // get realm id from name + RGWRealm realm{g_ceph_context, store->svc.sysobj}; + ret = realm.read_id(realm_name, zone.realm_id); + if (ret < 0) { + cerr << "failed to find realm by name " << realm_name << std::endl; + return -ret; + } + need_zone_update = true; + } + + if (tier_config_add.size() > 0) { + for (auto add : tier_config_add) { + int r = zone.tier_config.set(add.first, add.second); + if (r < 0) { + cerr << "ERROR: failed to set configurable: " << add << std::endl; + return EINVAL; + } + } + need_zone_update = true; + } + + for (auto rm : tier_config_rm) { + if (!rm.first.empty()) { /* otherwise will remove the entire config */ + zone.tier_config.erase(rm.first); + need_zone_update = true; + } + } + + if (need_zone_update) { + ret = zone.update(); + if (ret < 0) { + cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + string *ptier_type = (tier_type_specified ? 
&tier_type : nullptr); + + bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr); + string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr); + + ret = zonegroup.add_zone(zone, + (is_master_set ? &is_master : NULL), + (is_read_only_set ? &read_only : NULL), + endpoints, ptier_type, + psync_from_all, sync_from, sync_from_rm, + predirect_zone, + store->svc.sync_modules->get_manager()); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = zonegroup.update(); + if (ret < 0) { + cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (set_default) { + ret = zone.set_as_default(); + if (ret < 0) { + cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl; + } + } + + encode_json("zone", zone, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_RENAME: + { + if (zone_new_name.empty()) { + cerr << " missing zone new name" << std::endl; + return EINVAL; + } + if (zone_id.empty() && zone_name.empty()) { + cerr << "no zonegroup name or id provided" << std::endl; + return EINVAL; + } + RGWZoneParams zone(zone_id,zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = zone.rename(zone_new_name); + if (ret < 0) { + cerr << "failed to rename zone " << zone_name << " to " << zone_new_name << ": " << cpp_strerror(-ret) + << std::endl; + return -ret; + } + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "WARNING: failed to initialize zonegroup " << zonegroup_name << std::endl; + } else { + ret = zonegroup.rename_zone(zone); + if (ret < 0) { + cerr << "Error in zonegroup rename for " << zone_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } 
+ } + } + break; + case OPT_ZONE_PLACEMENT_ADD: + case OPT_ZONE_PLACEMENT_MODIFY: + case OPT_ZONE_PLACEMENT_RM: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + // validate compression type + if (compression_type && *compression_type != "random" + && !Compressor::get_comp_alg_type(*compression_type)) { + std::cerr << "Unrecognized compression type" << std::endl; + return EINVAL; + } + + RGWZoneParams zone(zone_id, zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (opt_cmd == OPT_ZONE_PLACEMENT_ADD || + opt_cmd == OPT_ZONE_PLACEMENT_MODIFY) { + RGWZoneGroup zonegroup(zonegroup_id, zonegroup_name); + ret = zonegroup.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto ptiter = zonegroup.placement_targets.find(placement_id); + if (ptiter == zonegroup.placement_targets.end()) { + cerr << "ERROR: placement id '" << placement_id << "' is not configured in zonegroup placement targets" << std::endl; + return EINVAL; + } + + storage_class = rgw_placement_rule::get_canonical_storage_class(storage_class); + if (ptiter->second.storage_classes.find(storage_class) == ptiter->second.storage_classes.end()) { + cerr << "ERROR: storage class '" << storage_class << "' is not defined in zonegroup '" << placement_id << "' placement target" << std::endl; + return EINVAL; + } + + RGWZonePlacementInfo& info = zone.placement_pools[placement_id]; + + string opt_index_pool = index_pool.value_or(string()); + string opt_data_pool = data_pool.value_or(string()); + + if (!opt_index_pool.empty()) { + info.index_pool = opt_index_pool; + } + + if (info.index_pool.empty()) { + cerr << "ERROR: index pool not configured, need to specify --index-pool" << std::endl; + return EINVAL; + } + + if 
(opt_data_pool.empty()) { + const RGWZoneStorageClass *porig_sc{nullptr}; + if (info.storage_classes.find(storage_class, &porig_sc)) { + if (porig_sc->data_pool) { + opt_data_pool = porig_sc->data_pool->to_str(); + } + } + if (opt_data_pool.empty()) { + cerr << "ERROR: data pool not configured, need to specify --data-pool" << std::endl; + return EINVAL; + } + } + + rgw_pool dp = opt_data_pool; + info.storage_classes.set_storage_class(storage_class, &dp, compression_type.get_ptr()); + + if (data_extra_pool) { + info.data_extra_pool = *data_extra_pool; + } + if (index_type_specified) { + info.index_type = placement_index_type; + } + + ret = check_pool_support_omap(info.get_data_extra_pool()); + if (ret < 0) { + cerr << "ERROR: the data extra (non-ec) pool '" << info.get_data_extra_pool() + << "' does not support omap" << std::endl; + return ret; + } + } else if (opt_cmd == OPT_ZONE_PLACEMENT_RM) { + zone.placement_pools.erase(placement_id); + } + + ret = zone.update(); + if (ret < 0) { + cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("zone", zone, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_PLACEMENT_LIST: + { + RGWZoneParams zone(zone_id, zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("placement_pools", zone.placement_pools, formatter); + formatter->flush(cout); + } + break; + case OPT_ZONE_PLACEMENT_GET: + { + if (placement_id.empty()) { + cerr << "ERROR: --placement-id not specified" << std::endl; + return EINVAL; + } + + RGWZoneParams zone(zone_id, zone_name); + int ret = zone.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + auto p = zone.placement_pools.find(placement_id); + if (p == zone.placement_pools.end()) { + cerr << 
"ERROR: zone placement target '" << placement_id << "' not found" << std::endl; + return -ENOENT; + } + encode_json("placement_pools", p->second, formatter); + formatter->flush(cout); + } + break; + } + return 0; + } + + bool non_master_cmd = (!store->svc.zone->is_meta_master() && !yes_i_really_mean_it); + std::set non_master_ops_list = {OPT_USER_CREATE, OPT_USER_RM, + OPT_USER_MODIFY, OPT_USER_ENABLE, + OPT_USER_SUSPEND, OPT_SUBUSER_CREATE, + OPT_SUBUSER_MODIFY, OPT_SUBUSER_RM, + OPT_BUCKET_LINK, OPT_BUCKET_UNLINK, + OPT_BUCKET_RESHARD, OPT_BUCKET_RM, + OPT_METADATA_PUT, OPT_METADATA_RM, + OPT_RESHARD_CANCEL, OPT_RESHARD_ADD, + OPT_MFA_CREATE, OPT_MFA_REMOVE, + OPT_MFA_RESYNC, OPT_CAPS_ADD, + OPT_CAPS_RM}; + + bool print_warning_message = (non_master_ops_list.find(opt_cmd) != non_master_ops_list.end() && + non_master_cmd); + + if (print_warning_message) { + cerr << "Please run the command on master zone. Performing this operation on non-master zone leads to inconsistent metadata between zones" << std::endl; + cerr << "Are you sure you want to go ahead? 
(requires --yes-i-really-mean-it)" << std::endl; + return EINVAL; + } + + if (!user_id.empty()) { + user_op.set_user_id(user_id); + bucket_op.set_user_id(user_id); + } + + if (!display_name.empty()) + user_op.set_display_name(display_name); + + if (!user_email.empty()) + user_op.set_user_email(user_email); + + if (!access_key.empty()) + user_op.set_access_key(access_key); + + if (!secret_key.empty()) + user_op.set_secret_key(secret_key); + + if (!subuser.empty()) + user_op.set_subuser(subuser); + + if (!caps.empty()) + user_op.set_caps(caps); + + user_op.set_purge_data(purge_data); + + if (purge_keys) + user_op.set_purge_keys(); + + if (gen_access_key) + user_op.set_generate_key(); + + if (gen_secret_key) + user_op.set_gen_secret(); // assume that a key pair should be created + + if (max_buckets_specified) + user_op.set_max_buckets(max_buckets); + + if (admin_specified) + user_op.set_admin(admin); + + if (system_specified) + user_op.set_system(system); + + if (set_perm) + user_op.set_perm(perm_mask); + + if (set_temp_url_key) { + map::iterator iter = temp_url_keys.begin(); + for (; iter != temp_url_keys.end(); ++iter) { + user_op.set_temp_url_key(iter->second, iter->first); + } + } + + if (!op_mask_str.empty()) { + uint32_t op_mask; + int ret = rgw_parse_op_type_list(op_mask_str, &op_mask); + if (ret < 0) { + cerr << "failed to parse op_mask: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + user_op.set_op_mask(op_mask); + } + + if (key_type != KEY_TYPE_UNDEFINED) + user_op.set_key_type(key_type); + + // set suspension operation parameters + if (opt_cmd == OPT_USER_ENABLE) + user_op.set_suspension(false); + else if (opt_cmd == OPT_USER_SUSPEND) + user_op.set_suspension(true); + + // RGWUser to use for user operations + RGWUser user; + int ret = 0; + if (!(user_id.empty() && access_key.empty()) || !subuser.empty()) { + ret = user.init(store, user_op); + if (ret < 0) { + cerr << "user.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + 
} + + /* populate bucket operation */ + bucket_op.set_bucket_name(bucket_name); + bucket_op.set_object(object); + bucket_op.set_check_objects(check_objects); + bucket_op.set_delete_children(delete_child_objects); + bucket_op.set_fix_index(fix); + bucket_op.set_max_aio(max_concurrent_ios); + + // required to gather errors from operations + std::string err_msg; + + bool output_user_info = true; + + switch (opt_cmd) { + case OPT_USER_INFO: + if (user_id.empty() && access_key.empty()) { + cerr << "ERROR: --uid or --access-key required" << std::endl; + return EINVAL; + } + break; + case OPT_USER_CREATE: + if (!user_op.has_existing_user()) { + user_op.set_generate_key(); // generate a new key by default + } + ret = user.add(user_op, &err_msg); + if (ret < 0) { + cerr << "could not create user: " << err_msg << std::endl; + if (ret == -ERR_INVALID_TENANT_NAME) + ret = -EINVAL; + + return -ret; + } + if (!subuser.empty()) { + ret = user.subusers.add(user_op, &err_msg); + if (ret < 0) { + cerr << "could not create subuser: " << err_msg << std::endl; + return -ret; + } + } + break; + case OPT_USER_RM: + ret = user.remove(user_op, &err_msg); + if (ret < 0) { + cerr << "could not remove user: " << err_msg << std::endl; + return -ret; + } + + output_user_info = false; + break; + case OPT_USER_ENABLE: + case OPT_USER_SUSPEND: + case OPT_USER_MODIFY: + ret = user.modify(user_op, &err_msg); + if (ret < 0) { + cerr << "could not modify user: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_SUBUSER_CREATE: + ret = user.subusers.add(user_op, &err_msg); + if (ret < 0) { + cerr << "could not create subuser: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_SUBUSER_MODIFY: + ret = user.subusers.modify(user_op, &err_msg); + if (ret < 0) { + cerr << "could not modify subuser: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_SUBUSER_RM: + ret = user.subusers.remove(user_op, &err_msg); + if (ret < 0) { + cerr << "could not remove 
subuser: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_CAPS_ADD: + ret = user.caps.add(user_op, &err_msg); + if (ret < 0) { + cerr << "could not add caps: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_CAPS_RM: + ret = user.caps.remove(user_op, &err_msg); + if (ret < 0) { + cerr << "could not remove caps: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_KEY_CREATE: + ret = user.keys.add(user_op, &err_msg); + if (ret < 0) { + cerr << "could not create key: " << err_msg << std::endl; + return -ret; + } + + break; + case OPT_KEY_RM: + ret = user.keys.remove(user_op, &err_msg); + if (ret < 0) { + cerr << "could not remove key: " << err_msg << std::endl; + return -ret; + } + break; + case OPT_PERIOD_PUSH: + { + RGWEnv env; + req_info info(g_ceph_context, &env); + info.method = "POST"; + info.request_uri = "/admin/realm/period"; + + map ¶ms = info.args.get_params(); + if (!realm_id.empty()) + params["realm_id"] = realm_id; + if (!realm_name.empty()) + params["realm_name"] = realm_name; + if (!period_id.empty()) + params["period_id"] = period_id; + if (!period_epoch.empty()) + params["epoch"] = period_epoch; + + // load the period + RGWPeriod period(period_id); + int ret = period.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "period init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + // json format into a bufferlist + JSONFormatter jf(false); + encode_json("period", period, &jf); + bufferlist bl; + jf.flush(bl); + + JSONParser p; + ret = send_to_remote_or_url(nullptr, url, access_key, secret_key, + info, bl, p); + if (ret < 0) { + cerr << "request failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + return 0; + case OPT_PERIOD_UPDATE: + { + int ret = update_period(realm_id, realm_name, period_id, period_epoch, + commit, remote, url, access_key, secret_key, + formatter, yes_i_really_mean_it); + if (ret < 0) { + return -ret; + } + } + return 0; + case 
OPT_PERIOD_COMMIT: + { + // read realm and staging period + RGWRealm realm(realm_id, realm_name); + int ret = realm.init(g_ceph_context, store->svc.sysobj); + if (ret < 0) { + cerr << "Error initializing realm: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + RGWPeriod period(RGWPeriod::get_staging_id(realm.get_id()), 1); + ret = period.init(g_ceph_context, store->svc.sysobj, realm.get_id()); + if (ret < 0) { + cerr << "period init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = commit_period(realm, period, remote, url, access_key, secret_key, + yes_i_really_mean_it); + if (ret < 0) { + cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("period", period, formatter); + formatter->flush(cout); + } + return 0; + case OPT_ROLE_CREATE: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (assume_role_doc.empty()) { + cerr << "ERROR: assume role policy document is empty" << std::endl; + return -EINVAL; + } + bufferlist bl = bufferlist::static_from_string(assume_role_doc); + try { + const rgw::IAM::Policy p(g_ceph_context, tenant, bl); + } catch (rgw::IAM::PolicyParseException& e) { + cerr << "failed to parse policy: " << e.what() << std::endl; + return -EINVAL; + } + RGWRole role(g_ceph_context, store, role_name, path, assume_role_doc, tenant); + ret = role.create(true); + if (ret < 0) { + return -ret; + } + show_role_info(role, formatter); + return 0; + } + case OPT_ROLE_DELETE: + { + if (role_name.empty()) { + cerr << "ERROR: empty role name" << std::endl; + return -EINVAL; + } + RGWRole role(g_ceph_context, store, role_name, tenant); + ret = role.delete_obj(); + if (ret < 0) { + return -ret; + } + cout << "role: " << role_name << " successfully deleted" << std::endl; + return 0; + } + case OPT_ROLE_GET: + { + if (role_name.empty()) { + cerr << "ERROR: empty role name" << std::endl; + return -EINVAL; + } + RGWRole 
role(g_ceph_context, store, role_name, tenant); + ret = role.get(); + if (ret < 0) { + return -ret; + } + show_role_info(role, formatter); + return 0; + } + case OPT_ROLE_MODIFY: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (assume_role_doc.empty()) { + cerr << "ERROR: assume role policy document is empty" << std::endl; + return -EINVAL; + } + + bufferlist bl = bufferlist::static_from_string(assume_role_doc); + try { + const rgw::IAM::Policy p(g_ceph_context, tenant, bl); + } catch (rgw::IAM::PolicyParseException& e) { + cerr << "failed to parse policy: " << e.what() << std::endl; + return -EINVAL; + } + + RGWRole role(g_ceph_context, store, role_name, tenant); + ret = role.get(); + if (ret < 0) { + return -ret; + } + role.update_trust_policy(assume_role_doc); + ret = role.update(); + if (ret < 0) { + return -ret; + } + cout << "Assume role policy document updated successfully for role: " << role_name << std::endl; + return 0; + } + case OPT_ROLE_LIST: + { + vector result; + ret = RGWRole::get_roles_by_path_prefix(store, g_ceph_context, path_prefix, tenant, result); + if (ret < 0) { + return -ret; + } + show_roles_info(result, formatter); + return 0; + } + case OPT_ROLE_POLICY_PUT: + { + if (role_name.empty()) { + cerr << "role name is empty" << std::endl; + return -EINVAL; + } + + if (policy_name.empty()) { + cerr << "policy name is empty" << std::endl; + return -EINVAL; + } + + if (perm_policy_doc.empty()) { + cerr << "permission policy document is empty" << std::endl; + return -EINVAL; + } + + bufferlist bl = bufferlist::static_from_string(perm_policy_doc); + try { + const rgw::IAM::Policy p(g_ceph_context, tenant, bl); + } catch (rgw::IAM::PolicyParseException& e) { + cerr << "failed to parse perm policy: " << e.what() << std::endl; + return -EINVAL; + } + + RGWRole role(g_ceph_context, store, role_name, tenant); + ret = role.get(); + if (ret < 0) { + return -ret; + } + 
role.set_perm_policy(policy_name, perm_policy_doc); + ret = role.update(); + if (ret < 0) { + return -ret; + } + cout << "Permission policy attached successfully" << std::endl; + return 0; + } + case OPT_ROLE_POLICY_LIST: + { + if (role_name.empty()) { + cerr << "ERROR: Role name is empty" << std::endl; + return -EINVAL; + } + RGWRole role(g_ceph_context, store, role_name, tenant); + ret = role.get(); + if (ret < 0) { + return -ret; + } + std::vector policy_names = role.get_role_policy_names(); + show_policy_names(policy_names, formatter); + return 0; + } + case OPT_ROLE_POLICY_GET: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (policy_name.empty()) { + cerr << "ERROR: policy name is empty" << std::endl; + return -EINVAL; + } + RGWRole role(g_ceph_context, store, role_name, tenant); + int ret = role.get(); + if (ret < 0) { + return -ret; + } + string perm_policy; + ret = role.get_role_policy(policy_name, perm_policy); + if (ret < 0) { + return -ret; + } + show_perm_policy(perm_policy, formatter); + return 0; + } + case OPT_ROLE_POLICY_DELETE: + { + if (role_name.empty()) { + cerr << "ERROR: role name is empty" << std::endl; + return -EINVAL; + } + + if (policy_name.empty()) { + cerr << "ERROR: policy name is empty" << std::endl; + return -EINVAL; + } + RGWRole role(g_ceph_context, store, role_name, tenant); + ret = role.get(); + if (ret < 0) { + return -ret; + } + ret = role.delete_policy(policy_name); + if (ret < 0) { + return -ret; + } + ret = role.update(); + if (ret < 0) { + return -ret; + } + cout << "Policy: " << policy_name << " successfully deleted for role: " + << role_name << std::endl; + return 0; + } + default: + output_user_info = false; + } + + // output the result of a user operation + if (output_user_info) { + ret = user.info(info, &err_msg); + if (ret < 0) { + cerr << "could not fetch user info: " << err_msg << std::endl; + return -ret; + } + show_user_info(info, formatter); + } + + 
if (opt_cmd == OPT_POLICY) { + if (format == "xml") { + int ret = RGWBucketAdminOp::dump_s3_policy(store, bucket_op, cout); + if (ret < 0) { + cerr << "ERROR: failed to get policy: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + int ret = RGWBucketAdminOp::get_policy(store, bucket_op, f); + if (ret < 0) { + cerr << "ERROR: failed to get policy: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + if (opt_cmd == OPT_BUCKET_LIMIT_CHECK) { + void *handle; + std::list user_ids; + metadata_key = "user"; + int max = 1000; + + bool truncated; + + if (! user_id.empty()) { + user_ids.push_back(user_id.id); + ret = + RGWBucketAdminOp::limit_check(store, bucket_op, user_ids, f, + warnings_only); + } else { + /* list users in groups of max-keys, then perform user-bucket + * limit-check on each group */ + ret = store->meta_mgr->list_keys_init(metadata_key, &handle); + if (ret < 0) { + cerr << "ERROR: buckets limit check can't get user metadata_key: " + << cpp_strerror(-ret) << std::endl; + return -ret; + } + + do { + ret = store->meta_mgr->list_keys_next(handle, max, user_ids, + &truncated); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: buckets limit check lists_keys_next(): " + << cpp_strerror(-ret) << std::endl; + break; + } else { + /* ok, do the limit checks for this group */ + ret = + RGWBucketAdminOp::limit_check(store, bucket_op, user_ids, f, + warnings_only); + if (ret < 0) + break; + } + user_ids.clear(); + } while (truncated); + store->meta_mgr->list_keys_complete(handle); + } + return -ret; + } /* OPT_BUCKET_LIMIT_CHECK */ + + if (opt_cmd == OPT_BUCKETS_LIST) { + if (bucket_name.empty()) { + if (!user_id.empty()) { + if (!user_op.has_existing_user()) { + cerr << "ERROR: could not find user: " << user_id << std::endl; + return -ENOENT; + } + } + RGWBucketAdminOp::info(store, bucket_op, f); + } else { + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + 
cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_array_section("entries"); + + bool truncated = false; + int count = 0; + + static constexpr int MAX_PAGINATE_SIZE = 10000; + static constexpr int DEFAULT_MAX_ENTRIES = 1000; + + if (max_entries < 0) { + max_entries = DEFAULT_MAX_ENTRIES; + } + const int paginate_size = std::min(max_entries, MAX_PAGINATE_SIZE); + + string prefix; + string delim; + vector result; + map common_prefixes; + string ns; + + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = prefix; + list_op.params.delim = delim; + list_op.params.marker = rgw_obj_key(marker); + list_op.params.ns = ns; + list_op.params.enforce_ns = false; + list_op.params.list_versions = true; + list_op.params.allow_unordered = bool(allow_unordered); + + do { + const int remaining = max_entries - count; + ret = list_op.list_objects(std::min(remaining, paginate_size), + &result, &common_prefixes, &truncated); + if (ret < 0) { + cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += result.size(); + + for (const auto& entry : result) { + encode_json("entry", entry, formatter); + } + formatter->flush(cout); + } while (truncated && count < max_entries); + + formatter->close_section(); + formatter->flush(cout); + } /* have bucket_name */ + } /* OPT_BUCKETS_LIST */ + + if (opt_cmd == OPT_BUCKET_RADOS_LIST) { + RGWRadosList lister(store, + max_concurrent_ios, orphan_stale_secs, tenant); + if (bucket_name.empty()) { + ret = lister.run(); + } else { + ret = lister.run(bucket_name); + } + + if (ret < 0) { + std::cerr << + "ERROR: bucket radoslist failed to finish before " << + "encountering error: " << cpp_strerror(-ret) << std::endl; + std::cerr << "************************************" + "************************************" << std::endl; + std::cerr << "WARNING: THE RESULTS ARE NOT RELIABLE AND 
SHOULD NOT " << + "BE USED IN DELETING ORPHANS" << std::endl; + std::cerr << "************************************" + "************************************" << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_BUCKET_STATS) { + if (bucket_name.empty() && !bucket_id.empty()) { + rgw_bucket bucket; + if (!rgw_find_bucket_by_id(store->ctx(), store->meta_mgr, marker, bucket_id, &bucket)) { + cerr << "failure: no such bucket id" << std::endl; + return -ENOENT; + } + bucket_op.set_tenant(bucket.tenant); + bucket_op.set_bucket_name(bucket.name); + } + bucket_op.set_fetch_stats(true); + + int r = RGWBucketAdminOp::info(store, bucket_op, f); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + } + + if (opt_cmd == OPT_BUCKET_LINK) { + bucket_op.set_bucket_id(bucket_id); + string err; + int r = RGWBucketAdminOp::link(store, bucket_op, &err); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + } + + if (opt_cmd == OPT_BUCKET_UNLINK) { + int r = RGWBucketAdminOp::unlink(store, bucket_op); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << std::endl; + return -r; + } + } + + if (opt_cmd == OPT_LOG_LIST) { + // filter by date? + if (date.size() && date.size() != 10) { + cerr << "bad date format for '" << date << "', expect YYYY-MM-DD" << std::endl; + return EINVAL; + } + + formatter->reset(); + formatter->open_array_section("logs"); + RGWAccessHandle h; + int r = store->log_list_init(date, &h); + if (r == -ENOENT) { + // no logs. 
+ } else { + if (r < 0) { + cerr << "log list: error " << r << std::endl; + return -r; + } + while (true) { + string name; + int r = store->log_list_next(h, &name); + if (r == -ENOENT) + break; + if (r < 0) { + cerr << "log list: error " << r << std::endl; + return -r; + } + formatter->dump_string("object", name); + } + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } + + if (opt_cmd == OPT_LOG_SHOW || opt_cmd == OPT_LOG_RM) { + if (object.empty() && (date.empty() || bucket_name.empty() || bucket_id.empty())) { + cerr << "specify an object or a date, bucket and bucket-id" << std::endl; + exit(1); + } + + string oid; + if (!object.empty()) { + oid = object; + } else { + oid = date; + oid += "-"; + oid += bucket_id; + oid += "-"; + oid += bucket_name; + } + + if (opt_cmd == OPT_LOG_SHOW) { + RGWAccessHandle h; + + int r = store->log_show_init(oid, &h); + if (r < 0) { + cerr << "error opening log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + + formatter->reset(); + formatter->open_object_section("log"); + + struct rgw_log_entry entry; + + // peek at first entry to get bucket metadata + r = store->log_show_next(h, &entry); + if (r < 0) { + cerr << "error reading log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + formatter->dump_string("bucket_id", entry.bucket_id); + formatter->dump_string("bucket_owner", entry.bucket_owner.to_str()); + formatter->dump_string("bucket", entry.bucket); + + uint64_t agg_time = 0; + uint64_t agg_bytes_sent = 0; + uint64_t agg_bytes_received = 0; + uint64_t total_entries = 0; + + if (show_log_entries) + formatter->open_array_section("log_entries"); + + do { + using namespace std::chrono; + uint64_t total_time = duration_cast(entry.total_time).count(); + + agg_time += total_time; + agg_bytes_sent += entry.bytes_sent; + agg_bytes_received += entry.bytes_received; + total_entries++; + + if (skip_zero_entries && entry.bytes_sent == 0 && + entry.bytes_received == 0) 
+ goto next; + + if (show_log_entries) { + + rgw_format_ops_log_entry(entry, formatter); + formatter->flush(cout); + } +next: + r = store->log_show_next(h, &entry); + } while (r > 0); + + if (r < 0) { + cerr << "error reading log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + if (show_log_entries) + formatter->close_section(); + + if (show_log_sum) { + formatter->open_object_section("log_sum"); + formatter->dump_int("bytes_sent", agg_bytes_sent); + formatter->dump_int("bytes_received", agg_bytes_received); + formatter->dump_int("total_time", agg_time); + formatter->dump_int("total_entries", total_entries); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } + if (opt_cmd == OPT_LOG_RM) { + int r = store->log_remove(oid); + if (r < 0) { + cerr << "error removing log " << oid << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + } + } + + if (opt_cmd == OPT_POOL_ADD) { + if (pool_name.empty()) { + cerr << "need to specify pool to add!" << std::endl; + exit(1); + } + + int ret = store->svc.zone->add_bucket_placement(pool); + if (ret < 0) + cerr << "failed to add bucket placement: " << cpp_strerror(-ret) << std::endl; + } + + if (opt_cmd == OPT_POOL_RM) { + if (pool_name.empty()) { + cerr << "need to specify pool to remove!" 
<< std::endl; + exit(1); + } + + int ret = store->svc.zone->remove_bucket_placement(pool); + if (ret < 0) + cerr << "failed to remove bucket placement: " << cpp_strerror(-ret) << std::endl; + } + + if (opt_cmd == OPT_POOLS_LIST) { + set pools; + int ret = store->svc.zone->list_placement_set(pools); + if (ret < 0) { + cerr << "could not list placement set: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->reset(); + formatter->open_array_section("pools"); + for (auto siter = pools.begin(); siter != pools.end(); ++siter) { + formatter->open_object_section("pool"); + formatter->dump_string("name", siter->to_str()); + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } + + if (opt_cmd == OPT_USAGE_SHOW) { + uint64_t start_epoch = 0; + uint64_t end_epoch = (uint64_t)-1; + + int ret; + + if (!start_date.empty()) { + ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse start date" << std::endl; + return 1; + } + } + if (!end_date.empty()) { + ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse end date" << std::endl; + return 1; + } + } + + + ret = RGWUsage::show(store, user_id, bucket_name, start_epoch, end_epoch, + show_log_entries, show_log_sum, &categories, + f); + if (ret < 0) { + cerr << "ERROR: failed to show usage" << std::endl; + return 1; + } + } + + if (opt_cmd == OPT_USAGE_TRIM) { + if (user_id.empty() && bucket_name.empty() && + start_date.empty() && end_date.empty() && !yes_i_really_mean_it) { + cerr << "usage trim without user/date/bucket specified will remove *all* users data" << std::endl; + cerr << "do you really mean it? 
(requires --yes-i-really-mean-it)" << std::endl; + return 1; + } + int ret; + uint64_t start_epoch = 0; + uint64_t end_epoch = (uint64_t)-1; + + + if (!start_date.empty()) { + ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse start date" << std::endl; + return 1; + } + } + + if (!end_date.empty()) { + ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse end date" << std::endl; + return 1; + } + } + + ret = RGWUsage::trim(store, user_id, bucket_name, start_epoch, end_epoch); + if (ret < 0) { + cerr << "ERROR: read_usage() returned ret=" << ret << std::endl; + return 1; + } + } + + if (opt_cmd == OPT_USAGE_CLEAR) { + if (!yes_i_really_mean_it) { + cerr << "usage clear would remove *all* users usage data for all time" << std::endl; + cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl; + return 1; + } + + ret = RGWUsage::clear(store); + if (ret < 0) { + return ret; + } + } + + + if (opt_cmd == OPT_OLH_GET || opt_cmd == OPT_OLH_READLOG) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + } + + if (opt_cmd == OPT_OLH_GET) { + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + RGWOLHInfo olh; + rgw_obj obj(bucket, object); + ret = store->get_olh(bucket_info, obj, &olh); + if (ret < 0) { + cerr << "ERROR: failed reading olh: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("olh", olh, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_OLH_READLOG) { + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr 
<< "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + map > log; + bool is_truncated; + + RGWObjectCtx rctx(store); + rgw_obj obj(bucket, object); + + RGWObjState *state; + + ret = store->get_obj_state(&rctx, bucket_info, obj, &state, false); /* don't follow olh */ + if (ret < 0) { + return -ret; + } + + ret = store->bucket_index_read_olh_log(bucket_info, *state, obj, 0, &log, &is_truncated); + if (ret < 0) { + cerr << "ERROR: failed reading olh: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("result"); + encode_json("is_truncated", is_truncated, formatter); + encode_json("log", log, formatter); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_BI_GET) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + rgw_obj obj(bucket, object); + if (!object_version.empty()) { + obj.key.set_instance(object_version); + } + + rgw_cls_bi_entry entry; + + ret = store->bi_get(bucket_info, obj, bi_index_type, &entry); + if (ret < 0) { + cerr << "ERROR: bi_get(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + encode_json("entry", entry, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_BI_PUT) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + rgw_cls_bi_entry entry; + cls_rgw_obj_key 
key; + ret = read_decode_json(infile, entry, &key); + if (ret < 0) { + return 1; + } + + rgw_obj obj(bucket, key); + + ret = store->bi_put(bucket, obj, entry); + if (ret < 0) { + cerr << "ERROR: bi_put(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_BI_LIST) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + list entries; + bool is_truncated; + if (max_entries < 0) { + max_entries = 1000; + } + + int max_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1); + + formatter->open_array_section("entries"); + + for (int i = 0; i < max_shards; i++) { + RGWRados::BucketShard bs(store); + int shard_id = (bucket_info.num_shards > 0 ? i : -1); + int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */); + marker.clear(); + + if (ret < 0) { + cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << shard_id << "): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + do { + entries.clear(); + ret = store->bi_list(bs, object, marker, max_entries, &entries, &is_truncated); + if (ret < 0) { + cerr << "ERROR: bi_list(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + list::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_cls_bi_entry& entry = *iter; + encode_json("entry", entry, formatter); + marker = entry.idx; + } + formatter->flush(cout); + } while (is_truncated); + formatter->flush(cout); + } + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_BI_PURGE) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket name not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, 
bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWBucketInfo cur_bucket_info; + rgw_bucket cur_bucket; + ret = init_bucket(tenant, bucket_name, string(), cur_bucket_info, cur_bucket); + if (ret < 0) { + cerr << "ERROR: could not init current bucket info for bucket_name=" << bucket_name << ": " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + if (cur_bucket_info.bucket.bucket_id == bucket_info.bucket.bucket_id && !yes_i_really_mean_it) { + cerr << "specified bucket instance points to a current bucket instance" << std::endl; + cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl; + return EINVAL; + } + + int max_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1); + + for (int i = 0; i < max_shards; i++) { + RGWRados::BucketShard bs(store); + int shard_id = (bucket_info.num_shards > 0 ? i : -1); + int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */); + if (ret < 0) { + cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << shard_id << "): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = store->bi_remove(bs); + if (ret < 0) { + cerr << "ERROR: failed to remove bucket index object: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + if (opt_cmd == OPT_OBJECT_PUT) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + + RGWDataAccess data_access(store); + rgw_obj_key key(object, object_version); + + RGWDataAccess::BucketRef b; + RGWDataAccess::ObjectRef obj; + + int ret = data_access.get_bucket(tenant, bucket_name, bucket_id, &b); + if (ret < 0) { + cerr << "ERROR: failed to init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + ret = b->get_object(key, &obj); + if (ret < 0) { + cerr << "ERROR: failed to 
get object: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bufferlist bl; + ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + } + + map attrs; + ret = obj->put(bl, attrs); + if (ret < 0) { + cerr << "ERROR: put object returned error: " << cpp_strerror(-ret) << std::endl; + } + } + + if (opt_cmd == OPT_OBJECT_RM) { + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + rgw_obj_key key(object, object_version); + ret = rgw_remove_object(store, bucket_info, bucket, key); + + if (ret < 0) { + cerr << "ERROR: object remove returned: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_OBJECT_REWRITE) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + if (object.empty()) { + cerr << "ERROR: object not specified" << std::endl; + return EINVAL; + } + + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + rgw_obj obj(bucket, object); + obj.key.set_instance(object_version); + bool need_rewrite = true; + if (min_rewrite_stripe_size > 0) { + ret = check_min_obj_stripe_size(store, bucket_info, obj, min_rewrite_stripe_size, &need_rewrite); + if (ret < 0) { + ldout(store->ctx(), 0) << "WARNING: check_min_obj_stripe_size failed, r=" << ret << dendl; + } + } + if (need_rewrite) { + ret = store->rewrite_obj(bucket_info, obj); + if (ret < 0) { + cerr << "ERROR: object rewrite returned: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + ldout(store->ctx(), 20) << "skipped object" << dendl; + } + } + + if (opt_cmd == OPT_OBJECTS_EXPIRE) { + if 
(!store->process_expire_objects()) { + cerr << "ERROR: process_expire_objects() processing returned error." << std::endl; + return 1; + } + } + + if (opt_cmd == OPT_OBJECTS_EXPIRE_STALE_LIST) { + ret = RGWBucketAdminOp::fix_obj_expiry(store, bucket_op, f, true); + if (ret < 0) { + cerr << "ERROR: listing returned " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_OBJECTS_EXPIRE_STALE_RM) { + ret = RGWBucketAdminOp::fix_obj_expiry(store, bucket_op, f, false); + if (ret < 0) { + cerr << "ERROR: removing returned " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_BUCKET_REWRITE) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + uint64_t start_epoch = 0; + uint64_t end_epoch = 0; + + if (!end_date.empty()) { + int ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse end date" << std::endl; + return EINVAL; + } + } + if (!start_date.empty()) { + int ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (ret < 0) { + cerr << "ERROR: failed to parse start date" << std::endl; + return EINVAL; + } + } + + bool is_truncated = true; + + rgw_obj_index_key marker; + string prefix; + + formatter->open_object_section("result"); + formatter->dump_string("bucket", bucket_name); + formatter->open_array_section("objects"); + + constexpr uint32_t NUM_ENTRIES = 1000; + uint16_t expansion_factor = 1; + while (is_truncated) { + map result; + int r = + store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD, marker, + prefix, NUM_ENTRIES, true, expansion_factor, + result, &is_truncated, &marker, + bucket_object_check_filter); + if (r < 0 && r != -ENOENT) { + cerr << "ERROR: failed 
operation r=" << r << std::endl; + } else if (r == -ENOENT) { + break; + } + + if (result.size() < NUM_ENTRIES / 8) { + ++expansion_factor; + } else if (result.size() > NUM_ENTRIES * 7 / 8 && + expansion_factor > 1) { + --expansion_factor; + } + + map::iterator iter; + for (iter = result.begin(); iter != result.end(); ++iter) { + rgw_obj_key key = iter->second.key; + rgw_bucket_dir_entry& entry = iter->second; + + formatter->open_object_section("object"); + formatter->dump_string("name", key.name); + formatter->dump_string("instance", key.instance); + formatter->dump_int("size", entry.meta.size); + utime_t ut(entry.meta.mtime); + ut.gmtime(formatter->dump_stream("mtime")); + + if ((entry.meta.size < min_rewrite_size) || + (entry.meta.size > max_rewrite_size) || + (start_epoch > 0 && start_epoch > (uint64_t)ut.sec()) || + (end_epoch > 0 && end_epoch < (uint64_t)ut.sec())) { + formatter->dump_string("status", "Skipped"); + } else { + rgw_obj obj(bucket, key); + + bool need_rewrite = true; + if (min_rewrite_stripe_size > 0) { + r = check_min_obj_stripe_size(store, bucket_info, obj, min_rewrite_stripe_size, &need_rewrite); + if (r < 0) { + ldout(store->ctx(), 0) << "WARNING: check_min_obj_stripe_size failed, r=" << r << dendl; + } + } + if (!need_rewrite) { + formatter->dump_string("status", "Skipped"); + } else { + r = store->rewrite_obj(bucket_info, obj); + if (r == 0) { + formatter->dump_string("status", "Success"); + } else { + formatter->dump_string("status", cpp_strerror(-r)); + } + } + } + formatter->dump_int("flags", entry.flags); + + formatter->close_section(); + formatter->flush(cout); + } + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_BUCKET_RESHARD) { + rgw_bucket bucket; + RGWBucketInfo bucket_info; + map attrs; + + int ret = check_reshard_bucket_params(store, + bucket_name, + tenant, + bucket_id, + num_shards_specified, + num_shards, + yes_i_really_mean_it, + bucket, + bucket_info, + 
attrs); + if (ret < 0) { + return ret; + } + + RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */); + +#define DEFAULT_RESHARD_MAX_ENTRIES 1000 + if (max_entries < 1) { + max_entries = DEFAULT_RESHARD_MAX_ENTRIES; + } + + return br.execute(num_shards, max_entries, + verbose, &cout, formatter); + } + + if (opt_cmd == OPT_RESHARD_ADD) { + rgw_bucket bucket; + RGWBucketInfo bucket_info; + map attrs; + + int ret = check_reshard_bucket_params(store, + bucket_name, + tenant, + bucket_id, + num_shards_specified, + num_shards, + yes_i_really_mean_it, + bucket, + bucket_info, + attrs); + if (ret < 0) { + return ret; + } + + int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1); + + RGWReshard reshard(store); + cls_rgw_reshard_entry entry; + entry.time = real_clock::now(); + entry.tenant = tenant; + entry.bucket_name = bucket_name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.old_num_shards = num_source_shards; + entry.new_num_shards = num_shards; + + return reshard.add(entry); + } + + if (opt_cmd == OPT_RESHARD_LIST) { + list entries; + int ret; + int count = 0; + if (max_entries < 0) { + max_entries = 1000; + } + + int num_logshards = + store->ctx()->_conf.get_val("rgw_reshard_num_logs"); + + RGWReshard reshard(store); + + formatter->open_array_section("reshard"); + for (int i = 0; i < num_logshards; i++) { + bool is_truncated = true; + string marker; + do { + entries.clear(); + ret = reshard.list(i, marker, max_entries, entries, &is_truncated); + if (ret < 0) { + cerr << "Error listing resharding buckets: " << cpp_strerror(-ret) << std::endl; + return ret; + } + for (auto iter=entries.begin(); iter != entries.end(); ++iter) { + cls_rgw_reshard_entry& entry = *iter; + encode_json("entry", entry, formatter); + entry.get_key(&marker); + } + count += entries.size(); + formatter->flush(cout); + } while (is_truncated && count < max_entries); + + if (count >= max_entries) { + break; + } + } + + 
formatter->close_section(); + formatter->flush(cout); + return 0; + } + + if (opt_cmd == OPT_RESHARD_STATUS) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + rgw_bucket bucket; + RGWBucketInfo bucket_info; + map attrs; + ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, &attrs); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */); + list status; + int r = br.get_status(&status); + if (r < 0) { + cerr << "ERROR: could not get resharding status for bucket " << + bucket_name << std::endl; + return -r; + } + + show_reshard_status(status, formatter); + } + + if (opt_cmd == OPT_RESHARD_PROCESS) { + RGWReshard reshard(store, true, &cout); + + int ret = reshard.process_all_logshards(); + if (ret < 0) { + cerr << "ERROR: failed to process reshard logs, error=" << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_RESHARD_CANCEL) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + rgw_bucket bucket; + RGWBucketInfo bucket_info; + map attrs; + bool bucket_initable = true; + ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, + &attrs); + if (ret < 0) { + if (yes_i_really_mean_it) { + bucket_initable = false; + } else { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << + "; if you want to cancel the reshard request nonetheless, please " + "use the --yes-i-really-mean-it option" << std::endl; + return -ret; + } + } + + if (bucket_initable) { + // we did not encounter an error, so let's work with the bucket + RGWBucketReshard br(store, bucket_info, attrs, + nullptr /* no callback */); + int ret = br.cancel(); + if (ret < 0) { + if (ret == -EBUSY) { + cerr << "There is ongoing resharding, please retry after " << + 
store->ctx()->_conf.get_val( + "rgw_reshard_bucket_lock_duration") << + " seconds " << std::endl; + } else { + cerr << "Error canceling bucket " << bucket_name << + " resharding: " << cpp_strerror(-ret) << std::endl; + } + return ret; + } + } + + RGWReshard reshard(store); + + cls_rgw_reshard_entry entry; + entry.tenant = tenant; + entry.bucket_name = bucket_name; + //entry.bucket_id = bucket_id; + + ret = reshard.remove(entry); + if (ret < 0 && ret != -ENOENT) { + cerr << "Error in updating reshard log with bucket " << + bucket_name << ": " << cpp_strerror(-ret) << std::endl; + return ret; + } + } // OPT_RESHARD_CANCEL + + if (opt_cmd == OPT_OBJECT_UNLINK) { + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + list oid_list; + rgw_obj_key key(object, object_version); + rgw_obj_index_key index_key; + key.get_index_key(&index_key); + oid_list.push_back(index_key); + ret = store->remove_objs_from_index(bucket_info, oid_list); + if (ret < 0) { + cerr << "ERROR: remove_obj_from_index() returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (opt_cmd == OPT_OBJECT_STAT) { + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + rgw_obj obj(bucket, object); + obj.key.set_instance(object_version); + + uint64_t obj_size; + map attrs; + RGWObjectCtx obj_ctx(store); + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + read_op.params.obj_size = &obj_size; + + ret = read_op.prepare(); + if (ret < 0) { + cerr << "ERROR: failed to stat object, returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + 
formatter->open_object_section("object_metadata"); + formatter->dump_string("name", object); + formatter->dump_unsigned("size", obj_size); + + map::iterator iter; + map other_attrs; + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + bufferlist& bl = iter->second; + bool handled = false; + if (iter->first == RGW_ATTR_MANIFEST) { + handled = decode_dump("manifest", bl, formatter); + } else if (iter->first == RGW_ATTR_ACL) { + handled = decode_dump("policy", bl, formatter); + } else if (iter->first == RGW_ATTR_ID_TAG) { + handled = dump_string("tag", bl, formatter); + } else if (iter->first == RGW_ATTR_ETAG) { + handled = dump_string("etag", bl, formatter); + } else if (iter->first == RGW_ATTR_COMPRESSION) { + handled = decode_dump("compression", bl, formatter); + } else if (iter->first == RGW_ATTR_DELETE_AT) { + handled = decode_dump("delete_at", bl, formatter); + } + + if (!handled) + other_attrs[iter->first] = bl; + } + + formatter->open_object_section("attrs"); + for (iter = other_attrs.begin(); iter != other_attrs.end(); ++iter) { + dump_string(iter->first.c_str(), iter->second, formatter); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_BUCKET_CHECK) { + if (check_head_obj_locator) { + if (bucket_name.empty()) { + cerr << "ERROR: need to specify bucket name" << std::endl; + return EINVAL; + } + do_check_object_locator(tenant, bucket_name, fix, remove_bad, formatter); + } else { + RGWBucketAdminOp::check_index(store, bucket_op, f); + } + } + + if (opt_cmd == OPT_BUCKET_RM) { + if (!inconsistent_index) { + RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, true); + } else { + if (!yes_i_really_mean_it) { + cerr << "using --inconsistent_index can corrupt the bucket index " << std::endl + << "do you really mean it? 
(requires --yes-i-really-mean-it)" << std::endl; + return 1; + } + RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, false); + } + } + + if (opt_cmd == OPT_GC_LIST) { + int index = 0; + bool truncated; + formatter->open_array_section("entries"); + + do { + list result; + int ret = store->list_gc_objs(&index, marker, 1000, !include_all, result, &truncated); + if (ret < 0) { + cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl; + return 1; + } + + + list::iterator iter; + for (iter = result.begin(); iter != result.end(); ++iter) { + cls_rgw_gc_obj_info& info = *iter; + formatter->open_object_section("chain_info"); + formatter->dump_string("tag", info.tag); + formatter->dump_stream("time") << info.time; + formatter->open_array_section("objs"); + list::iterator liter; + cls_rgw_obj_chain& chain = info.chain; + for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) { + cls_rgw_obj& obj = *liter; + encode_json("obj", obj, formatter); + } + formatter->close_section(); // objs + formatter->close_section(); // obj_chain + formatter->flush(cout); + } + } while (truncated); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_GC_PROCESS) { + int ret = store->process_gc(!include_all); + if (ret < 0) { + cerr << "ERROR: gc processing returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + if (opt_cmd == OPT_LC_LIST) { + formatter->open_array_section("lifecycle_list"); + map bucket_lc_map; + string marker; +#define MAX_LC_LIST_ENTRIES 100 + if (max_entries < 0) { + max_entries = MAX_LC_LIST_ENTRIES; + } + do { + int ret = store->list_lc_progress(marker, max_entries, &bucket_lc_map); + if (ret < 0) { + cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl; + return 1; + } + map::iterator iter; + for (iter = bucket_lc_map.begin(); iter != bucket_lc_map.end(); ++iter) { + formatter->open_object_section("bucket_lc_info"); + formatter->dump_string("bucket", 
iter->first); + string lc_status = LC_STATUS[iter->second]; + formatter->dump_string("status", lc_status); + formatter->close_section(); // objs + formatter->flush(cout); + marker = iter->first; + } + } while (!bucket_lc_map.empty()); + + formatter->close_section(); //lifecycle list + formatter->flush(cout); + } + + + if (opt_cmd == OPT_LC_GET) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + rgw_bucket bucket; + RGWBucketInfo bucket_info; + map attrs; + RGWLifecycleConfiguration config; + ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, &attrs); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto aiter = attrs.find(RGW_ATTR_LC); + if (aiter == attrs.end()) { + return -ENOENT; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + cerr << "ERROR: decode life cycle config failed" << std::endl; + return -EIO; + } + + encode_json("result", config, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_LC_PROCESS) { + int ret = store->process_lc(); + if (ret < 0) { + cerr << "ERROR: lc processing returned error: " << cpp_strerror(-ret) << std::endl; + return 1; + } + } + + + if (opt_cmd == OPT_LC_RESHARD_FIX) { + ret = RGWBucketAdminOp::fix_lc_shards(store, bucket_op,f); + if (ret < 0) { + cerr << "ERROR: listing stale instances" << cpp_strerror(-ret) << std::endl; + } + + } + + if (opt_cmd == OPT_ORPHANS_FIND) { + if (!yes_i_really_mean_it) { + cerr << "accidental removal of active objects can not be reversed; " + << "do you really mean it? 
(requires --yes-i-really-mean-it)" + << std::endl; + return EINVAL; + } + + RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs); + + if (job_id.empty()) { + cerr << "ERROR: --job-id not specified" << std::endl; + return EINVAL; + } + if (pool_name.empty()) { + cerr << "ERROR: --pool not specified" << std::endl; + return EINVAL; + } + + RGWOrphanSearchInfo info; + + info.pool = pool; + info.job_name = job_id; + info.num_shards = num_shards; + + int ret = search.init(job_id, &info, detail); + if (ret < 0) { + cerr << "could not init search, ret=" << ret << std::endl; + return -ret; + } + ret = search.run(); + if (ret < 0) { + return -ret; + } + } + + if (opt_cmd == OPT_ORPHANS_FINISH) { + RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs); + + if (job_id.empty()) { + cerr << "ERROR: --job-id not specified" << std::endl; + return EINVAL; + } + int ret = search.init(job_id, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + cerr << "job not found" << std::endl; + } + return -ret; + } + ret = search.finish(); + if (ret < 0) { + return -ret; + } + } + + if (opt_cmd == OPT_ORPHANS_LIST_JOBS){ + RGWOrphanStore orphan_store(store); + int ret = orphan_store.init(); + if (ret < 0){ + cerr << "connection to cluster failed!" 
<< std::endl; + return -ret; + } + + map m; + ret = orphan_store.list_jobs(m); + if (ret < 0) { + cerr << "job list failed" << std::endl; + return -ret; + } + formatter->open_array_section("entries"); + for (const auto &it: m){ + if (!extra_info){ + formatter->dump_string("job-id",it.first); + } else { + encode_json("orphan_search_state", it.second, formatter); + } + } + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_USER_CHECK) { + check_bad_user_bucket_mapping(store, user_id, fix); + } + + if (opt_cmd == OPT_USER_STATS) { + if (user_id.empty()) { + cerr << "ERROR: uid not specified" << std::endl; + return EINVAL; + } + + string user_str = user_id.to_str(); + if (reset_stats) { + if (!bucket_name.empty()) { + cerr << "ERROR: --reset-stats does not work on buckets and " + "bucket specified" << std::endl; + return EINVAL; + } + if (sync_stats) { + cerr << "ERROR: sync-stats includes the reset-stats functionality, " + "so at most one of the two should be specified" << std::endl; + return EINVAL; + } + ret = store->cls_user_reset_stats(user_str); + if (ret < 0) { + cerr << "ERROR: could not reset user stats: " << cpp_strerror(-ret) << + std::endl; + return -ret; + } + } + + if (sync_stats) { + if (!bucket_name.empty()) { + int ret = rgw_bucket_sync_user_stats(store, tenant, bucket_name); + if (ret < 0) { + cerr << "ERROR: could not sync bucket stats: " << + cpp_strerror(-ret) << std::endl; + return -ret; + } + } else { + int ret = rgw_user_sync_all_stats(store, user_id); + if (ret < 0) { + cerr << "ERROR: could not sync user stats: " << + cpp_strerror(-ret) << std::endl; + return -ret; + } + } + } + + cls_user_header header; + int ret = store->cls_user_get_header(user_str, &header); + if (ret < 0) { + if (ret == -ENOENT) { /* in case of ENOENT */ + cerr << "User has not been initialized or user does not exist" << std::endl; + } else { + cerr << "ERROR: can't read user: " << cpp_strerror(ret) << std::endl; + } + return -ret; + } + + 
encode_json("header", header, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_METADATA_GET) { + int ret = store->meta_mgr->get(metadata_key, formatter); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + formatter->flush(cout); + } + + if (opt_cmd == OPT_METADATA_PUT) { + bufferlist bl; + int ret = read_input(infile, bl); + if (ret < 0) { + cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = store->meta_mgr->put(metadata_key, bl, RGWMetadataHandler::RGWMetadataHandler::APPLY_ALWAYS); + if (ret < 0) { + cerr << "ERROR: can't put key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_METADATA_RM) { + int ret = store->meta_mgr->remove(metadata_key); + if (ret < 0) { + cerr << "ERROR: can't remove key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_METADATA_LIST || opt_cmd == OPT_USER_LIST) { + if (opt_cmd == OPT_USER_LIST) { + metadata_key = "user"; + } + void *handle; + int max = 1000; + int ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + bool truncated; + uint64_t count = 0; + + if (max_entries_specified) { + formatter->open_object_section("result"); + } + formatter->open_array_section("keys"); + + uint64_t left; + do { + list keys; + left = (max_entries_specified ? 
max_entries - count : max); + ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } if (ret != -ENOENT) { + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + formatter->dump_string("key", *iter); + ++count; + } + formatter->flush(cout); + } + } while (truncated && left > 0); + + formatter->close_section(); + + if (max_entries_specified) { + encode_json("truncated", truncated, formatter); + encode_json("count", count, formatter); + if (truncated) { + encode_json("marker", store->meta_mgr->get_marker(handle), formatter); + } + formatter->close_section(); + } + formatter->flush(cout); + + store->meta_mgr->list_keys_complete(handle); + } + + if (opt_cmd == OPT_MDLOG_LIST) { + utime_t start_time, end_time; + + int ret = parse_date_str(start_date, start_time); + if (ret < 0) + return -ret; + + ret = parse_date_str(end_date, end_time); + if (ret < 0) + return -ret; + + int i = (specified_shard_id ? 
shard_id : 0); + + if (period_id.empty()) { + int ret = read_current_period_id(store, realm_id, realm_name, &period_id); + if (ret < 0) { + return -ret; + } + std::cerr << "No --period given, using current period=" + << period_id << std::endl; + } + RGWMetadataLog *meta_log = store->meta_mgr->get_log(period_id); + + formatter->open_array_section("entries"); + for (; i < g_ceph_context->_conf->rgw_md_log_max_shards; i++) { + void *handle; + list entries; + + + meta_log->init_list_entries(i, start_time.to_real_time(), end_time.to_real_time(), marker, &handle); + bool truncated; + do { + int ret = meta_log->list_entries(handle, 1000, entries, NULL, &truncated); + if (ret < 0) { + cerr << "ERROR: meta_log->list_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + cls_log_entry& entry = *iter; + store->meta_mgr->dump_log_entry(entry, formatter); + } + formatter->flush(cout); + } while (truncated); + + meta_log->complete_list_entries(handle); + + if (specified_shard_id) + break; + } + + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_MDLOG_STATUS) { + int i = (specified_shard_id ? 
shard_id : 0); + + if (period_id.empty()) { + int ret = read_current_period_id(store, realm_id, realm_name, &period_id); + if (ret < 0) { + return -ret; + } + std::cerr << "No --period given, using current period=" + << period_id << std::endl; + } + RGWMetadataLog *meta_log = store->meta_mgr->get_log(period_id); + + formatter->open_array_section("entries"); + + for (; i < g_ceph_context->_conf->rgw_md_log_max_shards; i++) { + RGWMetadataLogInfo info; + meta_log->get_info(i, &info); + + ::encode_json("info", info, formatter); + + if (specified_shard_id) + break; + } + + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_MDLOG_AUTOTRIM) { + // need a full history for purging old mdlog periods + store->meta_mgr->init_oldest_log_period(); + + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http(store->ctx(), crs.get_completion_mgr()); + int ret = http.start(); + if (ret < 0) { + cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + auto num_shards = g_conf()->rgw_md_log_max_shards; + ret = crs.run(create_admin_meta_log_trim_cr(dpp(), store, &http, num_shards)); + if (ret < 0) { + cerr << "automated mdlog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_MDLOG_TRIM) { + utime_t start_time, end_time; + + if (!specified_shard_id) { + cerr << "ERROR: shard-id must be specified for trim operation" << std::endl; + return EINVAL; + } + + int ret = parse_date_str(start_date, start_time); + if (ret < 0) + return -ret; + + ret = parse_date_str(end_date, end_time); + if (ret < 0) + return -ret; + + if (period_id.empty()) { + std::cerr << "missing --period argument" << std::endl; + return EINVAL; + } + RGWMetadataLog *meta_log = store->meta_mgr->get_log(period_id); + + // trim until -ENODATA + do { + ret = meta_log->trim(shard_id, start_time.to_real_time(), + end_time.to_real_time(), start_marker, end_marker); + } 
while (ret == 0); + if (ret < 0 && ret != -ENODATA) { + cerr << "ERROR: meta_log->trim(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_SYNC_STATUS) { + sync_status(formatter); + } + + if (opt_cmd == OPT_METADATA_SYNC_STATUS) { + RGWMetaSyncStatusManager sync(store, store->get_async_rados()); + + int ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + rgw_meta_sync_status sync_status; + ret = sync.read_sync_status(&sync_status); + if (ret < 0) { + cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + + formatter->open_object_section("summary"); + encode_json("sync_status", sync_status, formatter); + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) { + full_complete += marker_iter.second.pos; + } else { + full_complete += marker_iter.second.total_entries; + } + } + + formatter->open_object_section("full_sync"); + encode_json("total", full_total, formatter); + encode_json("complete", full_complete, formatter); + formatter->close_section(); + formatter->close_section(); + + formatter->flush(cout); + + } + + if (opt_cmd == OPT_METADATA_SYNC_INIT) { + RGWMetaSyncStatusManager sync(store, store->get_async_rados()); + + int ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + ret = sync.init_sync_status(); + if (ret < 0) { + cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + } + + + if (opt_cmd == OPT_METADATA_SYNC_RUN) { + RGWMetaSyncStatusManager sync(store, store->get_async_rados()); + + int ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + 
ret = sync.run(); + if (ret < 0) { + cerr << "ERROR: sync.run() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_DATA_SYNC_STATUS) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone, nullptr); + + int ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + rgw_data_sync_status sync_status; + if (specified_shard_id) { + set pending_buckets; + set recovering_buckets; + rgw_data_sync_marker sync_marker; + ret = sync.read_shard_status(shard_id, pending_buckets, recovering_buckets, &sync_marker, + max_entries_specified ? max_entries : 20); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: sync.read_shard_status() returned ret=" << ret << std::endl; + return -ret; + } + formatter->open_object_section("summary"); + encode_json("shard_id", shard_id, formatter); + encode_json("marker", sync_marker, formatter); + encode_json("pending_buckets", pending_buckets, formatter); + encode_json("recovering_buckets", recovering_buckets, formatter); + formatter->close_section(); + formatter->flush(cout); + } else { + ret = sync.read_sync_status(&sync_status); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + + formatter->open_object_section("summary"); + encode_json("sync_status", sync_status, formatter); + + uint64_t full_total = 0; + uint64_t full_complete = 0; + + for (auto marker_iter : sync_status.sync_markers) { + full_total += marker_iter.second.total_entries; + if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) { + full_complete += marker_iter.second.pos; + } else { + full_complete += marker_iter.second.total_entries; + } + } + + formatter->open_object_section("full_sync"); + encode_json("total", full_total, formatter); + 
encode_json("complete", full_complete, formatter); + formatter->close_section(); + formatter->close_section(); + + formatter->flush(cout); + } + } + + if (opt_cmd == OPT_DATA_SYNC_INIT) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + + RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone, nullptr); + + int ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + ret = sync.init_sync_status(); + if (ret < 0) { + cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_DATA_SYNC_RUN) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + + RGWSyncModuleInstanceRef sync_module; + int ret = store->svc.sync_modules->get_manager()->create_instance(g_ceph_context, store->svc.zone->get_zone().tier_type, + store->svc.zone->get_zone_params().tier_config, &sync_module); + if (ret < 0) { + lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl; + return ret; + } + + RGWDataSyncStatusManager sync(store, store->get_async_rados(), source_zone, nullptr, sync_module); + + ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + ret = sync.run(); + if (ret < 0) { + cerr << "ERROR: sync.run() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_BUCKET_SYNC_INIT) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + rgw_bucket bucket; + int ret = init_bucket_for_sync(tenant, bucket_name, bucket_id, bucket); + if (ret < 0) { + return -ret; + } + RGWBucketSyncStatusManager sync(store, source_zone, bucket); + + ret = sync.init(); + if 
(ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + ret = sync.init_sync_status(); + if (ret < 0) { + cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + } + + if ((opt_cmd == OPT_BUCKET_SYNC_DISABLE) || (opt_cmd == OPT_BUCKET_SYNC_ENABLE)) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + + if (ret < 0) { + cerr << "could not init realm " << ": " << cpp_strerror(-ret) << std::endl; + return ret; + } + RGWPeriod period; + ret = period.init(g_ceph_context, store->svc.sysobj, realm_id, realm_name, true); + if (ret < 0) { + cerr << "failed to init period " << ": " << cpp_strerror(-ret) << std::endl; + return ret; + } + + if (!store->svc.zone->is_meta_master()) { + cerr << "failed to update bucket sync: only allowed on meta master zone " << std::endl; + cerr << period.get_master_zone() << " | " << period.get_realm() << std::endl; + return EINVAL; + } + + rgw_obj obj(bucket, object); + ret = set_bucket_sync_enabled(store, opt_cmd, tenant, bucket_name); + if (ret < 0) + return -ret; + } + + if (opt_cmd == OPT_BUCKET_SYNC_STATUS) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + rgw_bucket bucket; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + return -ret; + } + bucket_sync_status(store, bucket_info, source_zone, std::cout); + } + + if (opt_cmd == OPT_BUCKET_SYNC_MARKERS) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + rgw_bucket bucket; + int ret = init_bucket_for_sync(tenant, bucket_name, bucket_id, bucket); + if (ret < 0) { + return -ret; + } + RGWBucketSyncStatusManager sync(store, source_zone, bucket); + + ret = 
sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + ret = sync.read_sync_status(); + if (ret < 0) { + cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl; + return -ret; + } + + map& sync_status = sync.get_sync_status(); + + encode_json("sync_status", sync_status, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_BUCKET_SYNC_RUN) { + if (source_zone.empty()) { + cerr << "ERROR: source zone not specified" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + rgw_bucket bucket; + int ret = init_bucket_for_sync(tenant, bucket_name, bucket_id, bucket); + if (ret < 0) { + return -ret; + } + RGWBucketSyncStatusManager sync(store, source_zone, bucket); + + ret = sync.init(); + if (ret < 0) { + cerr << "ERROR: sync.init() returned ret=" << ret << std::endl; + return -ret; + } + + ret = sync.run(); + if (ret < 0) { + cerr << "ERROR: sync.run() returned ret=" << ret << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_BILOG_LIST) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_array_section("entries"); + bool truncated; + int count = 0; + if (max_entries < 0) + max_entries = 1000; + + do { + list entries; + ret = store->list_bi_log_entries(bucket_info, shard_id, marker, max_entries - count, entries, &truncated); + if (ret < 0) { + cerr << "ERROR: list_bi_log_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += entries.size(); + + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_bi_log_entry& entry = *iter; + encode_json("entry", 
entry, formatter); + + marker = entry.id; + } + formatter->flush(cout); + } while (truncated && count < max_entries); + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_SYNC_ERROR_LIST) { + if (max_entries < 0) { + max_entries = 1000; + } + + bool truncated; + utime_t start_time, end_time; + + int ret = parse_date_str(start_date, start_time); + if (ret < 0) + return -ret; + + ret = parse_date_str(end_date, end_time); + if (ret < 0) + return -ret; + + if (shard_id < 0) { + shard_id = 0; + } + + formatter->open_array_section("entries"); + + for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) { + formatter->open_object_section("shard"); + encode_json("shard_id", shard_id, formatter); + formatter->open_array_section("entries"); + + int count = 0; + string oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX, shard_id); + + do { + list entries; + ret = store->time_log_list(oid, start_time.to_real_time(), end_time.to_real_time(), + max_entries - count, entries, marker, &marker, &truncated); + if (ret == -ENOENT) { + break; + } + if (ret < 0) { + cerr << "ERROR: store->time_log_list(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += entries.size(); + + for (auto& cls_entry : entries) { + rgw_sync_error_info log_entry; + + auto iter = cls_entry.data.cbegin(); + try { + decode(log_entry, iter); + } catch (buffer::error& err) { + cerr << "ERROR: failed to decode log entry" << std::endl; + continue; + } + formatter->open_object_section("entry"); + encode_json("id", cls_entry.id, formatter); + encode_json("section", cls_entry.section, formatter); + encode_json("name", cls_entry.name, formatter); + encode_json("timestamp", cls_entry.timestamp, formatter); + encode_json("info", log_entry, formatter); + formatter->close_section(); + formatter->flush(cout); + } + } while (truncated && count < max_entries); + + formatter->close_section(); + formatter->close_section(); + + if (specified_shard_id) { + break; + 
} + } + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_SYNC_ERROR_TRIM) { + utime_t start_time, end_time; + int ret = parse_date_str(start_date, start_time); + if (ret < 0) + return -ret; + + ret = parse_date_str(end_date, end_time); + if (ret < 0) + return -ret; + + if (shard_id < 0) { + shard_id = 0; + } + + for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) { + ret = trim_sync_error_log(shard_id, start_time.to_real_time(), + end_time.to_real_time(), start_marker, + end_marker, trim_delay_ms); + if (ret < 0) { + cerr << "ERROR: sync error trim: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + if (specified_shard_id) { + break; + } + } + } + + if (opt_cmd == OPT_BILOG_TRIM) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + ret = store->trim_bi_log_entries(bucket_info, shard_id, start_marker, end_marker); + if (ret < 0) { + cerr << "ERROR: trim_bi_log_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_BILOG_STATUS) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + map markers; + ret = store->get_bi_log_status(bucket_info, shard_id, markers); + if (ret < 0) { + cerr << "ERROR: get_bi_log_status(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("entries"); + encode_json("markers", markers, formatter); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == 
OPT_BILOG_AUTOTRIM) { + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http(store->ctx(), crs.get_completion_mgr()); + int ret = http.start(); + if (ret < 0) { + cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + rgw::BucketTrimConfig config; + configure_bucket_trim(store->ctx(), config); + + rgw::BucketTrimManager trim(store, config); + ret = trim.init(); + if (ret < 0) { + cerr << "trim manager init failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + ret = crs.run(trim.create_admin_bucket_trim_cr(&http)); + if (ret < 0) { + cerr << "automated bilog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_DATALOG_LIST) { + formatter->open_array_section("entries"); + bool truncated; + int count = 0; + if (max_entries < 0) + max_entries = 1000; + + utime_t start_time, end_time; + + int ret = parse_date_str(start_date, start_time); + if (ret < 0) + return -ret; + + ret = parse_date_str(end_date, end_time); + if (ret < 0) + return -ret; + + RGWDataChangesLog *log = store->data_log; + RGWDataChangesLog::LogMarker log_marker; + + do { + list entries; + if (specified_shard_id) { + ret = log->list_entries(shard_id, start_time.to_real_time(), end_time.to_real_time(), max_entries - count, entries, marker, &marker, &truncated); + } else { + ret = log->list_entries(start_time.to_real_time(), end_time.to_real_time(), max_entries - count, entries, log_marker, &truncated); + } + if (ret < 0) { + cerr << "ERROR: list_bi_log_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + count += entries.size(); + + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_data_change_log_entry& entry = *iter; + if (!extra_info) { + encode_json("entry", entry.entry, formatter); + } else { + encode_json("entry", entry, formatter); + } + } + formatter->flush(cout); + } while (truncated && count < 
max_entries); + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_DATALOG_STATUS) { + RGWDataChangesLog *log = store->data_log; + int i = (specified_shard_id ? shard_id : 0); + + formatter->open_array_section("entries"); + for (; i < g_ceph_context->_conf->rgw_data_log_num_shards; i++) { + list entries; + + RGWDataChangesLogInfo info; + log->get_info(i, &info); + + ::encode_json("info", info, formatter); + + if (specified_shard_id) + break; + } + + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_DATALOG_AUTOTRIM) { + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http(store->ctx(), crs.get_completion_mgr()); + int ret = http.start(); + if (ret < 0) { + cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + auto num_shards = g_conf()->rgw_data_log_num_shards; + std::vector markers(num_shards); + ret = crs.run(create_admin_data_log_trim_cr(store, &http, num_shards, markers)); + if (ret < 0) { + cerr << "automated datalog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_DATALOG_TRIM) { + utime_t start_time, end_time; + + int ret = parse_date_str(start_date, start_time); + if (ret < 0) + return -ret; + + ret = parse_date_str(end_date, end_time); + if (ret < 0) + return -ret; + + if (!specified_shard_id) { + cerr << "ERROR: requires a --shard-id" << std::endl; + return EINVAL; + } + + // loop until -ENODATA + do { + auto datalog = store->data_log; + ret = datalog->trim_entries(shard_id, start_time.to_real_time(), + end_time.to_real_time(), + start_marker, end_marker); + } while (ret == 0); + + if (ret < 0 && ret != -ENODATA) { + cerr << "ERROR: trim_entries(): " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + bool quota_op = (opt_cmd == OPT_QUOTA_SET || opt_cmd == OPT_QUOTA_ENABLE || opt_cmd == OPT_QUOTA_DISABLE); + + if (quota_op) { + if 
(bucket_name.empty() && user_id.empty()) { + cerr << "ERROR: bucket name or uid is required for quota operation" << std::endl; + return EINVAL; + } + + if (!bucket_name.empty()) { + if (!quota_scope.empty() && quota_scope != "bucket") { + cerr << "ERROR: invalid quota scope specification." << std::endl; + return EINVAL; + } + set_bucket_quota(store, opt_cmd, tenant, bucket_name, + max_size, max_objects, have_max_size, have_max_objects); + } else if (!user_id.empty()) { + if (quota_scope == "bucket") { + return set_user_bucket_quota(opt_cmd, user, user_op, max_size, max_objects, have_max_size, have_max_objects); + } else if (quota_scope == "user") { + return set_user_quota(opt_cmd, user, user_op, max_size, max_objects, have_max_size, have_max_objects); + } else { + cerr << "ERROR: invalid quota scope specification. Please specify either --quota-scope=bucket, or --quota-scope=user" << std::endl; + return EINVAL; + } + } + } + + if (opt_cmd == OPT_MFA_CREATE) { + rados::cls::otp::otp_info_t config; + + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + if (totp_seed.empty()) { + cerr << "ERROR: TOTP device seed was not provided (via --totp-seed)" << std::endl; + return EINVAL; + } + + + rados::cls::otp::SeedType seed_type; + if (totp_seed_type == "hex") { + seed_type = rados::cls::otp::OTP_SEED_HEX; + } else if (totp_seed_type == "base32") { + seed_type = rados::cls::otp::OTP_SEED_BASE32; + } else { + cerr << "ERROR: invalid seed type: " << totp_seed_type << std::endl; + return EINVAL; + } + + config.id = totp_serial; + config.seed = totp_seed; + config.seed_type = seed_type; + + if (totp_seconds > 0) { + config.step_size = totp_seconds; + } + + if (totp_window > 0) { + config.window = totp_window; + } + + real_time mtime = real_clock::now(); + string oid = 
store->get_mfa_oid(user_id); + + int ret = store->meta_mgr->mutate(rgw_otp_get_handler(), oid, mtime, &objv_tracker, + MDLOG_STATUS_WRITE, RGWMetadataHandler::APPLY_ALWAYS, + [&] { + return store->create_mfa(user_id, config, &objv_tracker, mtime); + }); + if (ret < 0) { + cerr << "MFA creation failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWUserInfo& user_info = user_op.get_user_info(); + user_info.mfa_ids.insert(totp_serial); + user_op.set_mfa_ids(user_info.mfa_ids); + string err; + ret = user.modify(user_op, &err); + if (ret < 0) { + cerr << "ERROR: failed storing user info, error: " << err << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_MFA_REMOVE) { + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + real_time mtime = real_clock::now(); + string oid = store->get_mfa_oid(user_id); + + int ret = store->meta_mgr->mutate(rgw_otp_get_handler(), oid, mtime, &objv_tracker, + MDLOG_STATUS_WRITE, RGWMetadataHandler::APPLY_ALWAYS, + [&] { + return store->remove_mfa(user_id, totp_serial, &objv_tracker, mtime); + }); + if (ret < 0) { + cerr << "MFA removal failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + RGWUserInfo& user_info = user_op.get_user_info(); + user_info.mfa_ids.erase(totp_serial); + user_op.set_mfa_ids(user_info.mfa_ids); + string err; + ret = user.modify(user_op, &err); + if (ret < 0) { + cerr << "ERROR: failed storing user info, error: " << err << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_MFA_GET) { + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + 
rados::cls::otp::otp_info_t result; + int ret = store->get_mfa(user_id, totp_serial, &result); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) { + cerr << "MFA serial id not found" << std::endl; + } else { + cerr << "MFA retrieval failed, error: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + formatter->open_object_section("result"); + encode_json("entry", result, formatter); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_MFA_LIST) { + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + list result; + int ret = store->list_mfa(user_id, &result); + if (ret < 0) { + cerr << "MFA listing failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + formatter->open_object_section("result"); + encode_json("entries", result, formatter); + formatter->close_section(); + formatter->flush(cout); + } + + if (opt_cmd == OPT_MFA_CHECK) { + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + if (totp_pin.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-pin)" << std::endl; + return EINVAL; + } + + list result; + int ret = store->check_mfa(user_id, totp_serial, totp_pin.front()); + if (ret < 0) { + cerr << "MFA check failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + cout << "ok" << std::endl; + } + + if (opt_cmd == OPT_MFA_RESYNC) { + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + + if (totp_serial.empty()) { + cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl; + return EINVAL; + } + + if (totp_pin.size() != 2) { + cerr << "ERROR: missing two --totp-pin params 
(--totp-pin= --totp-pin=)" << std::endl; + } + + rados::cls::otp::otp_info_t config; + int ret = store->get_mfa(user_id, totp_serial, &config); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) { + cerr << "MFA serial id not found" << std::endl; + } else { + cerr << "MFA retrieval failed, error: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + + ceph::real_time now; + + ret = store->otp_get_current_time(user_id, &now); + if (ret < 0) { + cerr << "ERROR: failed to fetch current time from osd: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + time_t time_ofs; + + ret = scan_totp(store->ctx(), now, config, totp_pin, &time_ofs); + if (ret < 0) { + if (ret == -ENOENT) { + cerr << "failed to resync, TOTP values not found in range" << std::endl; + } else { + cerr << "ERROR: failed to scan for TOTP values: " << cpp_strerror(-ret) << std::endl; + } + return -ret; + } + + config.time_ofs = time_ofs; + + /* now update the backend */ + real_time mtime = real_clock::now(); + string oid = store->get_mfa_oid(user_id); + + ret = store->meta_mgr->mutate(rgw_otp_get_handler(), oid, mtime, &objv_tracker, + MDLOG_STATUS_WRITE, RGWMetadataHandler::APPLY_ALWAYS, + [&] { + return store->create_mfa(user_id, config, &objv_tracker, mtime); + }); + if (ret < 0) { + cerr << "MFA update failed, error: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + } + + if (opt_cmd == OPT_RESHARD_STALE_INSTANCES_LIST) { + if (!store->svc.zone->can_reshard() && !yes_i_really_mean_it) { + cerr << "Resharding disabled in a multisite env, stale instances unlikely from resharding" << std::endl; + cerr << "These instances may not be safe to delete." << std::endl; + cerr << "Use --yes-i-really-mean-it to force displaying these instances." 
<< std::endl; + return EINVAL; + } + + ret = RGWBucketAdminOp::list_stale_instances(store, bucket_op,f); + if (ret < 0) { + cerr << "ERROR: listing stale instances" << cpp_strerror(-ret) << std::endl; + } + } + + if (opt_cmd == OPT_RESHARD_STALE_INSTANCES_DELETE) { + if (!store->svc.zone->can_reshard()) { + cerr << "Resharding disabled in a multisite env. Stale instances are not safe to be deleted." << std::endl; + return EINVAL; + } + + ret = RGWBucketAdminOp::clear_stale_instances(store, bucket_op,f); + if (ret < 0) { + cerr << "ERROR: deleting stale instances" << cpp_strerror(-ret) << std::endl; + } + } + + if (opt_cmd == OPT_PUBSUB_TOPICS_LIST) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + + RGWUserPubSub ups(store, user_info.user_id); + + rgw_bucket bucket; + + if (!bucket_name.empty()) { + rgw_pubsub_bucket_topics result; + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto b = ups.get_bucket(bucket_info.bucket); + ret = b->get_topics(&result); + if (ret < 0) { + cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("result", result, formatter); + } else { + rgw_pubsub_user_topics result; + int ret = ups.get_user_topics(&result); + if (ret < 0) { + cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("result", result, formatter); + } + formatter->flush(cout); + } + + if (opt_cmd == OPT_PUBSUB_TOPIC_CREATE) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << 
std::endl; + return EINVAL; + } + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + ret = ups.create_topic(topic_name); + if (ret < 0) { + cerr << "ERROR: could not create topic: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_PUBSUB_TOPIC_GET) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + rgw_pubsub_topic_subs topic; + ret = ups.get_topic(topic_name, &topic); + if (ret < 0) { + cerr << "ERROR: could not create topic: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("topic", topic, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_PUBSUB_NOTIFICATION_CREATE) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + rgw_bucket bucket; 
+ + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto b = ups.get_bucket(bucket_info.bucket); + ret = b->create_notification(topic_name, event_types); + if (ret < 0) { + cerr << "ERROR: could not publish bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_PUBSUB_NOTIFICATION_RM) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (bucket_name.empty()) { + cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + rgw_bucket bucket; + + RGWBucketInfo bucket_info; + int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket); + if (ret < 0) { + cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + auto b = ups.get_bucket(bucket_info.bucket); + ret = b->remove_notification(topic_name); + if (ret < 0) { + cerr << "ERROR: could not publish bucket: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_PUBSUB_TOPIC_RM) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + 
RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + ret = ups.remove_topic(topic_name); + if (ret < 0) { + cerr << "ERROR: could not remove topic: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_PUBSUB_SUB_GET) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (sub_name.empty()) { + cerr << "ERROR: subscription name was not provided (via --sub-name)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + rgw_pubsub_sub_config sub_conf; + + auto sub = ups.get_sub(sub_name); + ret = sub->get_conf(&sub_conf); + if (ret < 0) { + cerr << "ERROR: could not get subscription info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("sub", sub_conf, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_PUBSUB_SUB_CREATE) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (sub_name.empty()) { + cerr << "ERROR: subscription name was not provided (via --sub-name)" << std::endl; + return EINVAL; + } + if (topic_name.empty()) { + cerr << "ERROR: topic name was not provided (via --topic)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + rgw_pubsub_topic_subs topic; + int ret = ups.get_topic(topic_name, &topic); + if (ret < 0) { + cerr << "ERROR: topic not found" << std::endl; + return EINVAL; + } + + rgw_pubsub_sub_dest dest_config; + dest_config.bucket_name = sub_dest_bucket; + 
dest_config.oid_prefix = sub_oid_prefix; + dest_config.push_endpoint = sub_push_endpoint; + + auto psmodule = static_cast(store->get_sync_module().get()); + auto conf = psmodule->get_effective_conf(); + + if (dest_config.bucket_name.empty()) { + dest_config.bucket_name = string(conf["data_bucket_prefix"]) + user_info.user_id.to_str() + "-" + topic.topic.name; + } + if (dest_config.oid_prefix.empty()) { + dest_config.oid_prefix = conf["data_oid_prefix"]; + } + auto sub = ups.get_sub(sub_name); + ret = sub->subscribe(topic_name, dest_config); + if (ret < 0) { + cerr << "ERROR: could not store subscription info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_PUBSUB_SUB_RM) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (sub_name.empty()) { + cerr << "ERROR: subscription name was not provided (via --sub-name)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + auto sub = ups.get_sub(sub_name); + ret = sub->unsubscribe(topic_name); + if (ret < 0) { + cerr << "ERROR: could not get subscription info: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + if (opt_cmd == OPT_PUBSUB_SUB_PULL) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (sub_name.empty()) { + cerr << "ERROR: subscription name was not provided (via --sub-name)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + if (!max_entries_specified) { + max_entries = 
RGWUserPubSub::Sub::DEFAULT_MAX_EVENTS; + } + auto sub = ups.get_sub(sub_name); + ret = sub->list_events(marker, max_entries); + if (ret < 0) { + cerr << "ERROR: could not list events: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + encode_json("result", *sub, formatter); + formatter->flush(cout); + } + + if (opt_cmd == OPT_PUBSUB_EVENT_RM) { + if (get_tier_type(store) != "pubsub") { + cerr << "ERROR: only pubsub tier type supports this command" << std::endl; + return EINVAL; + } + if (user_id.empty()) { + cerr << "ERROR: user id was not provided (via --uid)" << std::endl; + return EINVAL; + } + if (sub_name.empty()) { + cerr << "ERROR: subscription name was not provided (via --sub-name)" << std::endl; + return EINVAL; + } + if (event_id.empty()) { + cerr << "ERROR: event id was not provided (via --event-id)" << std::endl; + return EINVAL; + } + RGWUserInfo& user_info = user_op.get_user_info(); + RGWUserPubSub ups(store, user_info.user_id); + + auto sub = ups.get_sub(sub_name); + ret = sub->remove_event(event_id); + if (ret < 0) { + cerr << "ERROR: could not remove event: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + } + + return 0; +} diff --git a/src/rgw/rgw_admin_user.cc b/src/rgw/rgw_admin_user.cc new file mode 100644 index 00000000..615c6b31 --- /dev/null +++ b/src/rgw/rgw_admin_user.cc @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/rgw/librgw_admin_user.h" +#include "rgw_admin_user.h" +#include "rgw_user.h" +#include "common/errno.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace rgw; + +namespace rgw { + + extern RGWLibAdmin rgw_lib_admin; + +} + +extern "C" { + + int rgw_admin_create_user(librgw_admin_user_t librgw_admin_user, const char *uid, + const char *display_name, const char *access_key, const char* secret_key, + const char *email, const char *caps, + const char *access, bool admin, bool 
system) + { + RGWUserAdminOpState user_op; + rgw_user user_id; + user_id.from_str(uid); + user_op.set_user_id(user_id); + user_op.set_display_name(display_name); + user_op.user_email = email; /* NOTE(review): unlike access below, email is used without a null check; confirm callers never pass NULL */ + user_op.user_email_specified=true; + user_op.set_access_key(access_key); + user_op.set_secret_key(secret_key); + user_op.set_caps(caps); + if (access) { /* the permission mask is only set when an access string was supplied */ + uint32_t perm_mask = rgw_str_to_perm(access); + user_op.set_perm(perm_mask); + } + user_op.set_admin(admin); + user_op.set_system(system); + /* initialise the user handle with the store returned by rgw_lib_admin.get_store() */ + RGWUser user; + int ret = 0; + ret = user.init(rgw_lib_admin.get_store(), user_op); + if (ret < 0) { + cerr << "user.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; /* ret is negative here, so the caller receives a positive code */ + } + + std::string err_msg; + ret = user.add(user_op, &err_msg); + if (ret < 0) { + cerr << "could not create user: " << err_msg << std::endl; + if (ret == -ERR_INVALID_TENANT_NAME) + ret = -EINVAL; /* an invalid tenant name is reported to the caller as EINVAL */ + + return -ret; + } + + return 0; + } + /* Initialises an RGWUser for uid and calls user.info(); returns 0 on success, otherwise the negated (positive) error from init()/info(). NOTE(review): the result stays in the local info and the user_info out-parameter is never written in this function; confirm whether callers expect it to be filled. The librgw_admin_user handle is likewise unused here. */ + int rgw_admin_user_info(librgw_admin_user_t librgw_admin_user, const char *uid, rgw_user_info* user_info) + { + RGWUserAdminOpState user_op; + rgw_user user_id; + user_id.from_str(uid); + user_op.set_user_id(user_id); + + RGWUser user; + int ret = 0; + ret = user.init(rgw_lib_admin.get_store(), user_op); + if (ret < 0) { + cerr << "user.init failed: " << cpp_strerror(-ret) << std::endl; + return -ret; + } + + std::string err_msg; + RGWUserInfo info; + ret = user.info(info, &err_msg); + if (ret < 0) { + cerr << "could not fetch user info: " << err_msg << std::endl; + return -ret; + } + + return 0; + } + +} diff --git a/src/rgw/rgw_admin_user.h b/src/rgw/rgw_admin_user.h new file mode 100644 index 00000000..68f8167f --- /dev/null +++ b/src/rgw/rgw_admin_user.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * create rgw admin user + * + * Copyright (C) 2015 Red Hat, Inc.
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RGW_ADMIN_USER_H +#define RGW_ADMIN_USER_H + +#include +#include "common/config.h" + +#include "rgw_rados.h" + +namespace rgw { + + class RGWLibAdmin + { + RGWRados *store; + boost::intrusive_ptr cct; + + public: + RGWRados* get_store() + { + return store; + } + + int init(); + int init(vector& args); + int stop(); + }; +} + +#endif /*RGW_ADMIN_USER_H */ diff --git a/src/rgw/rgw_aio.h b/src/rgw/rgw_aio.h new file mode 100644 index 00000000..0ca401da --- /dev/null +++ b/src/rgw/rgw_aio.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "include/rados/librados_fwd.hpp" +#include +#include "rgw_common.h" +#include "services/svc_rados.h" // cant forward declare RGWSI_RADOS::Obj + +namespace rgw { + +struct AioResult { + RGWSI_RADOS::Obj obj; + uint64_t id = 0; // id allows caller to associate a result with its request + bufferlist data; // result buffer for reads + int result = 0; +}; +struct AioResultEntry : AioResult, boost::intrusive::list_base_hook<> { + virtual ~AioResultEntry() {} +}; +// a list of polymorphic entries that frees them on destruction +template +struct OwningList : boost::intrusive::list { + OwningList() = default; + ~OwningList() { this->clear_and_dispose(std::default_delete{}); } + OwningList(OwningList&&) = default; + OwningList& operator=(OwningList&&) = default; + OwningList(const OwningList&) = delete; + OwningList& operator=(const OwningList&) = delete; +}; +using AioResultList = OwningList; + +// returns the first error code or 0 if all succeeded +inline int check_for_errors(const AioResultList& results) { + for (auto& e : results) { + if (e.result < 0) { + return e.result; + } + } + return 0; +} + +// interface to submit async librados operations and wait on their completions. +// each call returns a list of results from prior completions +class Aio { + public: + virtual ~Aio() {} + + virtual AioResultList submit(RGWSI_RADOS::Obj& obj, + librados::ObjectReadOperation *op, + uint64_t cost, uint64_t id) = 0; + + virtual AioResultList submit(RGWSI_RADOS::Obj& obj, + librados::ObjectWriteOperation *op, + uint64_t cost, uint64_t id) = 0; + + // poll for any ready completions without waiting + virtual AioResultList poll() = 0; + + // return any ready completions. 
if there are none, wait for the next + virtual AioResultList wait() = 0; + + // wait for all outstanding completions and return their results + virtual AioResultList drain() = 0; +}; + +} // namespace rgw diff --git a/src/rgw/rgw_aio_throttle.cc b/src/rgw/rgw_aio_throttle.cc new file mode 100644 index 00000000..79d095d2 --- /dev/null +++ b/src/rgw/rgw_aio_throttle.cc @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/rados/librados.hpp" + +#include "rgw_aio_throttle.h" +#include "rgw_rados.h" + +namespace rgw { + +void AioThrottle::aio_cb(void *cb, void *arg) +{ + Pending& p = *static_cast(arg); + p.result = p.completion->get_return_value(); + p.parent->put(p); +} + +bool AioThrottle::waiter_ready() const +{ + switch (waiter) { + case Wait::Available: return is_available(); + case Wait::Completion: return has_completion(); + case Wait::Drained: return is_drained(); + default: return false; + } +} + +AioResultList AioThrottle::submit(RGWSI_RADOS::Obj& obj, + librados::ObjectWriteOperation *op, + uint64_t cost, uint64_t id) +{ + auto p = std::make_unique(); + p->obj = obj; + p->id = id; + p->cost = cost; + + if (cost > window) { + p->result = -EDEADLK; // would never succeed + std::unique_lock lock{mutex}; + completed.push_back(*p); + } else { + get(*p); + p->result = obj.aio_operate(p->completion, op); + if (p->result < 0) { + put(*p); + } + } + p.release(); + std::unique_lock lock{mutex}; + return std::move(completed); +} + +AioResultList AioThrottle::submit(RGWSI_RADOS::Obj& obj, + librados::ObjectReadOperation *op, + uint64_t cost, uint64_t id) +{ + auto p = 
std::make_unique(); + p->obj = obj; + p->id = id; + p->cost = cost; + + if (cost > window) { + p->result = -EDEADLK; // would never succeed + std::unique_lock lock{mutex}; + completed.push_back(*p); + } else { + get(*p); + p->result = obj.aio_operate(p->completion, op, &p->data); + if (p->result < 0) { + put(*p); + } + } + p.release(); + std::unique_lock lock{mutex}; + return std::move(completed); +} + +void AioThrottle::get(Pending& p) +{ + std::unique_lock lock{mutex}; + + // wait for the write size to become available + pending_size += p.cost; + if (!is_available()) { + ceph_assert(waiter == Wait::None); + waiter = Wait::Available; + cond.wait(lock, [this] { return is_available(); }); + waiter = Wait::None; + } + + // register the pending write and attach a completion + p.parent = this; + p.completion = librados::Rados::aio_create_completion(&p, nullptr, aio_cb); + pending.push_back(p); +} + +void AioThrottle::put(Pending& p) +{ + p.completion->release(); + p.completion = nullptr; + + std::scoped_lock lock{mutex}; + + // move from pending to completed + pending.erase(pending.iterator_to(p)); + completed.push_back(p); + + pending_size -= p.cost; + + if (waiter_ready()) { + cond.notify_one(); + } +} + +AioResultList AioThrottle::poll() +{ + std::unique_lock lock{mutex}; + return std::move(completed); +} + +AioResultList AioThrottle::wait() +{ + std::unique_lock lock{mutex}; + if (completed.empty() && !pending.empty()) { + ceph_assert(waiter == Wait::None); + waiter = Wait::Completion; + cond.wait(lock, [this] { return has_completion(); }); + waiter = Wait::None; + } + return std::move(completed); +} + +AioResultList AioThrottle::drain() +{ + std::unique_lock lock{mutex}; + if (!pending.empty()) { + ceph_assert(waiter == Wait::None); + waiter = Wait::Drained; + cond.wait(lock, [this] { return is_drained(); }); + waiter = Wait::None; + } + return std::move(completed); +} + +} // namespace rgw diff --git a/src/rgw/rgw_aio_throttle.h b/src/rgw/rgw_aio_throttle.h new 
file mode 100644 index 00000000..751d7f98 --- /dev/null +++ b/src/rgw/rgw_aio_throttle.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "include/rados/librados_fwd.hpp" +#include +#include "common/ceph_mutex.h" +#include "services/svc_rados.h" +#include "rgw_aio.h" + +namespace rgw { + +// a throttle for aio operations that enforces a maximum window on outstanding +// bytes. only supports a single waiter, so all public functions must be called +// from the same thread +class AioThrottle : public Aio { + protected: + const uint64_t window; + uint64_t pending_size = 0; + + bool is_available() const { return pending_size <= window; } + bool has_completion() const { return !completed.empty(); } + bool is_drained() const { return pending.empty(); } + + struct Pending : AioResultEntry { + AioThrottle *parent = nullptr; + uint64_t cost = 0; + librados::AioCompletion *completion = nullptr; + }; + OwningList pending; + AioResultList completed; + + enum class Wait { None, Available, Completion, Drained }; + Wait waiter = Wait::None; + + bool waiter_ready() const; + + ceph::mutex mutex = ceph::make_mutex("AioThrottle"); + ceph::condition_variable cond; + + void get(Pending& p); + void put(Pending& p); + + static void aio_cb(void *cb, void *arg); + + public: + AioThrottle(uint64_t window) : window(window) {} + + virtual ~AioThrottle() { + // must drain before destructing + ceph_assert(pending.empty()); + ceph_assert(completed.empty()); + } + + AioResultList submit(RGWSI_RADOS::Obj& obj, + librados::ObjectReadOperation *op, + uint64_t cost, uint64_t id) override; + + 
AioResultList submit(RGWSI_RADOS::Obj& obj, + librados::ObjectWriteOperation *op, + uint64_t cost, uint64_t id) override; + + AioResultList poll() override; + + AioResultList wait() override; + + AioResultList drain() override; +}; + +} // namespace rgw diff --git a/src/rgw/rgw_amqp.cc b/src/rgw/rgw_amqp.cc new file mode 100644 index 00000000..45167a8e --- /dev/null +++ b/src/rgw/rgw_amqp.cc @@ -0,0 +1,1035 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_amqp.h" +#include +#include +#include +#include "include/ceph_assert.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/dout.h" + +#define dout_subsys ceph_subsys_rgw + +// TODO investigation, not necessarily issues: +// (1) in case of single threaded writer context use spsc_queue +// (2) support multiple channels +// (3) check performance of emptying queue to local list, and go over the list and publish +// (4) use std::shared_mutex (c++17) or equivalent for the connections lock + +namespace rgw::amqp { + +// RGW AMQP status codes for publishing +static const int RGW_AMQP_STATUS_BROKER_NACK = -0x1001; +static const int RGW_AMQP_STATUS_CONNECTION_CLOSED = -0x1002; +static const int RGW_AMQP_STATUS_QUEUE_FULL = -0x1003; +static const int RGW_AMQP_STATUS_MAX_INFLIGHT = -0x1004; +static const int RGW_AMQP_STATUS_MANAGER_STOPPED = -0x1005; +// RGW AMQP status code for connection opening +static const int RGW_AMQP_STATUS_CONN_ALLOC_FAILED = -0x2001; +static const int RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED = -0x2002; +static const int RGW_AMQP_STATUS_SOCKET_OPEN_FAILED = -0x2003; +static const int RGW_AMQP_STATUS_LOGIN_FAILED = -0x2004; +static const int RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED = -0x2005; +static const int RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED = -0x2006; +static const int RGW_AMQP_STATUS_Q_DECLARE_FAILED = -0x2007; +static const int RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED = -0x2008; 
+static const int RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED = -0x2009; + +static const int RGW_AMQP_RESPONSE_SOCKET_ERROR = -0x3008; +static const int RGW_AMQP_NO_REPLY_CODE = 0x0; + +// key class for the connection list +struct connection_id_t { + const std::string host; + const int port; + const std::string vhost; + // constructed from amqp_connection_info struct + connection_id_t(const amqp_connection_info& info) + : host(info.host), port(info.port), vhost(info.vhost) {} + + // equality operator and hasher functor are needed + // so that connection_id_t could be used as key in unordered_map + bool operator==(const connection_id_t& other) const { + return host == other.host && port == other.port && vhost == other.vhost; + } + + struct hasher { + std::size_t operator()(const connection_id_t& k) const { + return ((std::hash()(k.host) + ^ (std::hash()(k.port) << 1)) >> 1) + ^ (std::hash()(k.vhost) << 1); + } + }; +}; + +std::string to_string(const connection_id_t& id) { + return id.host+":"+std::to_string(id.port)+"/"+id.vhost; +} + +// connection_t state cleaner +// could be used for automatic cleanup when getting out of scope +class ConnectionCleaner { + private: + amqp_connection_state_t conn; + public: + ConnectionCleaner(amqp_connection_state_t _conn) : conn(_conn) {} + ~ConnectionCleaner() { + if (conn) { + amqp_destroy_connection(conn); + } + } + // call reset() if cleanup is not needed anymore + void reset() { + conn = nullptr; + } +}; + +// struct for holding the callback and its tag in the callback list +struct reply_callback_with_tag_t { + uint64_t tag; + reply_callback_t cb; + + reply_callback_with_tag_t(uint64_t _tag, reply_callback_t _cb) : tag(_tag), cb(_cb) {} + + bool operator==(uint64_t rhs) { + return tag == rhs; + } +}; + +typedef std::vector CallbackList; + +// struct for holding the connection state object as well as the exchange +// it is used inside an intrusive ref counted pointer (boost::intrusive_ptr) +// since references to deleted objects 
may still exist in the calling code +struct connection_t { + amqp_connection_state_t state; + std::string exchange; + std::string user; + std::string password; + amqp_bytes_t reply_to_queue; + bool marked_for_deletion; + uint64_t delivery_tag; + int status; + int reply_type; + int reply_code; + mutable std::atomic ref_count; + CephContext* cct; + CallbackList callbacks; + + // default ctor + connection_t() : + state(nullptr), + reply_to_queue(amqp_empty_bytes), + marked_for_deletion(false), + delivery_tag(1), + status(AMQP_STATUS_OK), + reply_type(AMQP_RESPONSE_NORMAL), + reply_code(RGW_AMQP_NO_REPLY_CODE), + ref_count(0), + cct(nullptr) {} + + // cleanup of all internal connection resource + // the object can still remain, and internal connection + // resources created again on successful reconnection + void destroy(int s) { + status = s; + ConnectionCleaner clean_state(state); + state = nullptr; + amqp_bytes_free(reply_to_queue); + reply_to_queue = amqp_empty_bytes; + // fire all remaining callbacks + std::for_each(callbacks.begin(), callbacks.end(), [this](auto& cb_tag) { + cb_tag.cb(status); + ldout(cct, 20) << "AMQP destroy: invoking callback with tag=" << cb_tag.tag << dendl; + }); + callbacks.clear(); + delivery_tag = 1; + } + + bool is_ok() const { + return (state != nullptr && !marked_for_deletion); + } + + // dtor also destroys the internals + ~connection_t() { + destroy(RGW_AMQP_STATUS_CONNECTION_CLOSED); + } + + friend void intrusive_ptr_add_ref(const connection_t* p); + friend void intrusive_ptr_release(const connection_t* p); +}; + +// these are required interfaces so that connection_t could be used inside boost::intrusive_ptr +void intrusive_ptr_add_ref(const connection_t* p) { + ++p->ref_count; +} +void intrusive_ptr_release(const connection_t* p) { + if (--p->ref_count == 0) { + delete p; + } +} + +// convert connection info to string +std::string to_string(const amqp_connection_info& info) { + std::stringstream ss; + ss << "connection info:" << + 
"\nHost: " << info.host << + "\nPort: " << info.port << + "\nUser: " << info.user << + "\nPassword: " << info.password << + "\nvhost: " << info.vhost << + "\nSSL support: " << info.ssl << std::endl; + return ss.str(); +} + +// convert reply to error code +int reply_to_code(const amqp_rpc_reply_t& reply) { + switch (reply.reply_type) { + case AMQP_RESPONSE_NONE: + case AMQP_RESPONSE_NORMAL: + return RGW_AMQP_NO_REPLY_CODE; + case AMQP_RESPONSE_LIBRARY_EXCEPTION: + return reply.library_error; + case AMQP_RESPONSE_SERVER_EXCEPTION: + if (reply.reply.decoded) { + const amqp_connection_close_t* m = (amqp_connection_close_t*)reply.reply.decoded; + return m->reply_code; + } + return reply.reply.id; + } + return RGW_AMQP_NO_REPLY_CODE; +} + +// convert reply to string +std::string to_string(const amqp_rpc_reply_t& reply) { + std::stringstream ss; + switch (reply.reply_type) { + case AMQP_RESPONSE_NORMAL: + return ""; + case AMQP_RESPONSE_NONE: + return "missing RPC reply type"; + case AMQP_RESPONSE_LIBRARY_EXCEPTION: + return amqp_error_string2(reply.library_error); + case AMQP_RESPONSE_SERVER_EXCEPTION: + { + switch (reply.reply.id) { + case AMQP_CONNECTION_CLOSE_METHOD: + ss << "server connection error: "; + break; + case AMQP_CHANNEL_CLOSE_METHOD: + ss << "server channel error: "; + break; + default: + ss << "server unknown error: "; + break; + } + if (reply.reply.decoded) { + amqp_connection_close_t* m = (amqp_connection_close_t*)reply.reply.decoded; + ss << m->reply_code << " text: " << std::string((char*)m->reply_text.bytes, m->reply_text.len); + } + return ss.str(); + } + default: + ss << "unknown error, method id: " << reply.reply.id; + return ss.str(); + } +} + +// convert status enum to string +std::string to_string(amqp_status_enum s) { + switch (s) { + case AMQP_STATUS_OK: + return "AMQP_STATUS_OK"; + case AMQP_STATUS_NO_MEMORY: + return "AMQP_STATUS_NO_MEMORY"; + case AMQP_STATUS_BAD_AMQP_DATA: + return "AMQP_STATUS_BAD_AMQP_DATA"; + case 
AMQP_STATUS_UNKNOWN_CLASS: + return "AMQP_STATUS_UNKNOWN_CLASS"; + case AMQP_STATUS_UNKNOWN_METHOD: + return "AMQP_STATUS_UNKNOWN_METHOD"; + case AMQP_STATUS_HOSTNAME_RESOLUTION_FAILED: + return "AMQP_STATUS_HOSTNAME_RESOLUTION_FAILED"; + case AMQP_STATUS_INCOMPATIBLE_AMQP_VERSION: + return "AMQP_STATUS_INCOMPATIBLE_AMQP_VERSION"; + case AMQP_STATUS_CONNECTION_CLOSED: + return "AMQP_STATUS_CONNECTION_CLOSED"; + case AMQP_STATUS_BAD_URL: + return "AMQP_STATUS_BAD_URL"; + case AMQP_STATUS_SOCKET_ERROR: + return "AMQP_STATUS_SOCKET_ERROR"; + case AMQP_STATUS_INVALID_PARAMETER: + return "AMQP_STATUS_INVALID_PARAMETER"; + case AMQP_STATUS_TABLE_TOO_BIG: + return "AMQP_STATUS_TABLE_TOO_BIG"; + case AMQP_STATUS_WRONG_METHOD: + return "AMQP_STATUS_WRONG_METHOD"; + case AMQP_STATUS_TIMEOUT: + return "AMQP_STATUS_TIMEOUT"; + case AMQP_STATUS_TIMER_FAILURE: + return "AMQP_STATUS_TIMER_FAILURE"; + case AMQP_STATUS_HEARTBEAT_TIMEOUT: + return "AMQP_STATUS_HEARTBEAT_TIMEOUT"; + case AMQP_STATUS_UNEXPECTED_STATE: + return "AMQP_STATUS_UNEXPECTED_STATE"; + case AMQP_STATUS_SOCKET_CLOSED: + return "AMQP_STATUS_SOCKET_CLOSED"; + case AMQP_STATUS_SOCKET_INUSE: + return "AMQP_STATUS_SOCKET_INUSE"; + case AMQP_STATUS_BROKER_UNSUPPORTED_SASL_METHOD: + return "AMQP_STATUS_BROKER_UNSUPPORTED_SASL_METHOD"; +#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 8, 0, 0) + case AMQP_STATUS_UNSUPPORTED: + return "AMQP_STATUS_UNSUPPORTED"; +#endif + case _AMQP_STATUS_NEXT_VALUE: + return "AMQP_STATUS_INTERNAL"; + case AMQP_STATUS_TCP_ERROR: + return "AMQP_STATUS_TCP_ERROR"; + case AMQP_STATUS_TCP_SOCKETLIB_INIT_ERROR: + return "AMQP_STATUS_TCP_SOCKETLIB_INIT_ERROR"; + case _AMQP_STATUS_TCP_NEXT_VALUE: + return "AMQP_STATUS_INTERNAL"; + case AMQP_STATUS_SSL_ERROR: + return "AMQP_STATUS_SSL_ERROR"; + case AMQP_STATUS_SSL_HOSTNAME_VERIFY_FAILED: + return "AMQP_STATUS_SSL_HOSTNAME_VERIFY_FAILED"; + case AMQP_STATUS_SSL_PEER_VERIFY_FAILED: + return "AMQP_STATUS_SSL_PEER_VERIFY_FAILED"; + case 
AMQP_STATUS_SSL_CONNECTION_FAILED: + return "AMQP_STATUS_SSL_CONNECTION_FAILED"; + case _AMQP_STATUS_SSL_NEXT_VALUE: + return "AMQP_STATUS_INTERNAL"; + } + return "AMQP_STATUS_UNKNOWN"; +} + +// TODO: add status_to_string on the connection object to prinf full status + +// convert int status to string - including RGW specific values +std::string status_to_string(int s) { + switch (s) { + case RGW_AMQP_STATUS_BROKER_NACK: + return "RGW_AMQP_STATUS_BROKER_NACK"; + case RGW_AMQP_STATUS_CONNECTION_CLOSED: + return "RGW_AMQP_STATUS_CONNECTION_CLOSED"; + case RGW_AMQP_STATUS_QUEUE_FULL: + return "RGW_AMQP_STATUS_QUEUE_FULL"; + case RGW_AMQP_STATUS_MAX_INFLIGHT: + return "RGW_AMQP_STATUS_MAX_INFLIGHT"; + case RGW_AMQP_STATUS_MANAGER_STOPPED: + return "RGW_AMQP_STATUS_MANAGER_STOPPED"; + case RGW_AMQP_STATUS_CONN_ALLOC_FAILED: + return "RGW_AMQP_STATUS_CONN_ALLOC_FAILED"; + case RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED: + return "RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED"; + case RGW_AMQP_STATUS_SOCKET_OPEN_FAILED: + return "RGW_AMQP_STATUS_SOCKET_OPEN_FAILED"; + case RGW_AMQP_STATUS_LOGIN_FAILED: + return "RGW_AMQP_STATUS_LOGIN_FAILED"; + case RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED: + return "RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED"; + case RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED: + return "RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED"; + case RGW_AMQP_STATUS_Q_DECLARE_FAILED: + return "RGW_AMQP_STATUS_Q_DECLARE_FAILED"; + case RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED: + return "RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED"; + case RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED: + return "RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED"; + } + return to_string((amqp_status_enum)s); +} + +// check the result from calls and return if error (=null) +#define RETURN_ON_ERROR(C, S, OK) \ + if (!OK) { \ + C->status = S; \ + return C; \ + } + +// in case of RPC calls, getting the RPC reply and return if an error is detected +#define RETURN_ON_REPLY_ERROR(C, ST, S) { \ + const auto reply = amqp_get_rpc_reply(ST); \ + if 
// utility function to create a connection, when the connection object already exists
//
// Runs the whole broker handshake on a fresh amqp connection state: socket
// open, login, opening of both channels, publisher confirms on the confirming
// channel, passive check of the exchange, declaration of a broker-named queue
// and registration of a consumer on it.
//
// conn: existing connection object; must be non-null and not marked for
//       deletion (asserted). Its exchange name is read, its status fields
//       are rewritten.
// info: host, port, vhost, user and password used for the socket and login.
//
// Returns the same reference that was passed in. Success is visible through
// conn->state being set; on any failure conn->status (and where available
// reply_type/reply_code) describe the failing step and conn->state is left
// untouched, while the temporary state is destroyed by the local guard.
connection_ptr_t& create_connection(connection_ptr_t& conn, const amqp_connection_info& info) {
  // pointer must be valid and not marked for deletion
  ceph_assert(conn && !conn->marked_for_deletion);

  // reset all status codes
  conn->status = AMQP_STATUS_OK;
  conn->reply_type = AMQP_RESPONSE_NORMAL;
  conn->reply_code = RGW_AMQP_NO_REPLY_CODE;

  auto state = amqp_new_connection();
  if (!state) {
    conn->status = RGW_AMQP_STATUS_CONN_ALLOC_FAILED;
    return conn;
  }
  // make sure that the connection state is cleaned up in case of error
  // (every early return below relies on this guard)
  ConnectionCleaner state_guard(state);

  // create and open socket
  auto socket = amqp_tcp_socket_new(state);
  if (!socket) {
    conn->status = RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED;
    return conn;
  }
  const auto s = amqp_socket_open(socket, info.host, info.port);
  if (s < 0) {
    conn->status = RGW_AMQP_STATUS_SOCKET_OPEN_FAILED;
    conn->reply_type = RGW_AMQP_RESPONSE_SOCKET_ERROR;
    // keep the library status of the failed open for diagnostics
    conn->reply_code = s;
    return conn;
  }

  // login to broker
  const auto reply = amqp_login(state,
      info.vhost,
      AMQP_DEFAULT_MAX_CHANNELS,
      AMQP_DEFAULT_FRAME_SIZE,
      0,                        // no heartbeat TODO: add conf
      AMQP_SASL_METHOD_PLAIN,   // TODO: add other types of security
      info.user,
      info.password);
  if (reply.reply_type != AMQP_RESPONSE_NORMAL) {
    conn->status = RGW_AMQP_STATUS_LOGIN_FAILED;
    conn->reply_type = reply.reply_type;
    conn->reply_code = reply_to_code(reply);
    return conn;
  }

  // open channels
  // (the RETURN_ON_* macros set conn->status and return conn on failure)
  {
    const auto ok = amqp_channel_open(state, CHANNEL_ID);
    RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED, ok);
    RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED);
  }
  {
    const auto ok = amqp_channel_open(state, CONFIRMING_CHANNEL_ID);
    RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED, ok);
    RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED);
  }
  {
    // switch the confirming channel to publisher-confirm mode
    const auto ok = amqp_confirm_select(state, CONFIRMING_CHANNEL_ID);
    RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED, ok);
    RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED);
  }

  // verify that the topic exchange is there
  // TODO: make this step optional
  {
    const auto ok = amqp_exchange_declare(state,
      CHANNEL_ID,
      amqp_cstring_bytes(conn->exchange.c_str()),
      amqp_cstring_bytes("topic"),
      1, // passive - exchange must already exist on broker
      1, // durable
      0, // dont auto-delete
      0, // not internal
      amqp_empty_table);
    RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED, ok);
    RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED);
  }
  {
    // create queue for confirmations
    const auto queue_ok = amqp_queue_declare(state,
      CHANNEL_ID,         // use the regular channel for this call
      amqp_empty_bytes,   // let broker allocate queue name
      0,                  // not passive - create the queue
      0,                  // not durable
      1,                  // exclusive
      1,                  // auto-delete
      amqp_empty_table    // not args TODO add args from conf: TTL, max length etc.
      );
    RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_Q_DECLARE_FAILED, queue_ok);
    RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_Q_DECLARE_FAILED);

    // define consumption for connection
    const auto consume_ok = amqp_basic_consume(state,
      CONFIRMING_CHANNEL_ID,
      queue_ok->queue,
      amqp_empty_bytes, // broker will generate consumer tag
      1,                // messages sent from client are never routed back
      1,                // client does not ack the acks
      1,                // exclusive access to queue
      amqp_empty_table  // no parameters
      );

    RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED, consume_ok);
    RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED);
    // broker generated consumer_tag could be used to cancel sending of n/acks from broker - not needed

    // everything succeeded: disarm the guard and hand the state over to conn
    state_guard.reset();
    conn->state = state;
    // keep a private copy of the broker-allocated queue name
    conn->reply_to_queue = amqp_bytes_malloc_dup(queue_ok->queue);
    return conn;
  }
}
+ --connection_count; \ + continue; + +class Manager { +public: + const size_t max_connections; + const size_t max_inflight; + const size_t max_queue; +private: + std::atomic connection_count; + bool stopped; + struct timeval read_timeout; + ConnectionList connections; + MessageQueue messages; + std::atomic queued; + std::atomic dequeued; + CephContext* const cct; + mutable std::mutex connections_lock; + std::thread runner; + + void publish_internal(message_wrapper_t* message) { + const std::unique_ptr msg_owner(message); + auto& conn = message->conn; + + if (!conn->is_ok()) { + // connection had an issue while message was in the queue + // TODO add error stats + ldout(conn->cct, 1) << "AMQP publish: connection had an issue while message was in the queue" << dendl; + if (message->cb) { + message->cb(RGW_AMQP_STATUS_CONNECTION_CLOSED); + } + return; + } + + if (message->cb == nullptr) { + // TODO add error stats + const auto rc = amqp_basic_publish(conn->state, + CHANNEL_ID, + amqp_cstring_bytes(conn->exchange.c_str()), + amqp_cstring_bytes(message->topic.c_str()), + 1, // mandatory, TODO: take from conf + 0, // not immediate + nullptr, + amqp_cstring_bytes(message->message.c_str())); + if (rc == AMQP_STATUS_OK) { + ldout(conn->cct, 20) << "AMQP publish (no callback): OK" << dendl; + return; + } + ldout(conn->cct, 1) << "AMQP publish (no callback): failed with error " << status_to_string(rc) << dendl; + // an error occurred, close connection + // it will be retied by the main loop + conn->destroy(rc); + return; + } + + amqp_basic_properties_t props; + props._flags = + AMQP_BASIC_DELIVERY_MODE_FLAG | + AMQP_BASIC_REPLY_TO_FLAG; + props.delivery_mode = 2; // persistent delivery TODO take from conf + props.reply_to = conn->reply_to_queue; + + const auto rc = amqp_basic_publish(conn->state, + CONFIRMING_CHANNEL_ID, + amqp_cstring_bytes(conn->exchange.c_str()), + amqp_cstring_bytes(message->topic.c_str()), + 1, // mandatory, TODO: take from conf + 0, // not immediate + 
&props, + amqp_cstring_bytes(message->message.c_str())); + + if (rc == AMQP_STATUS_OK) { + auto const q_len = conn->callbacks.size(); + if (q_len < max_inflight) { + ldout(conn->cct, 20) << "AMQP publish (with callback, tag=" << conn->delivery_tag << "): OK. Queue has: " << q_len << " callbacks" << dendl; + conn->callbacks.emplace_back(conn->delivery_tag++, message->cb); + } else { + // immediately invoke callback with error + ldout(conn->cct, 1) << "AMQP publish (with callback): failed with error: callback queue full" << dendl; + message->cb(RGW_AMQP_STATUS_MAX_INFLIGHT); + } + } else { + // an error occurred, close connection + // it will be retied by the main loop + ldout(conn->cct, 1) << "AMQP publish (with callback): failed with error: " << status_to_string(rc) << dendl; + conn->destroy(rc); + // immediately invoke callback with error + message->cb(rc); + } + } + + // the managers thread: + // (1) empty the queue of messages to be published + // (2) loop over all connections and read acks + // (3) manages deleted connections + // (4) TODO reconnect on connection errors + // (5) TODO cleanup timedout callbacks + void run() { + amqp_frame_t frame; + while (!stopped) { + + // publish all messages in the queue + const auto count = messages.consume_all(std::bind(&Manager::publish_internal, this, std::placeholders::_1)); + dequeued += count; + ConnectionList::iterator conn_it; + ConnectionList::const_iterator end_it; + { + // thread safe access to the connection list + // once the iterators are fetched they are guaranteed to remain valid + std::lock_guard lock(connections_lock); + conn_it = connections.begin(); + end_it = connections.end(); + } + auto incoming_message = false; + // loop over all connections to read acks + for (;conn_it != end_it;) { + + auto& conn = conn_it->second; + // delete the connection if marked for deletion + if (conn->marked_for_deletion) { + ldout(conn->cct, 10) << "AMQP run: connection is deleted" << dendl; + 
conn->destroy(RGW_AMQP_STATUS_CONNECTION_CLOSED); + std::lock_guard lock(connections_lock); + // erase is safe - does not invalidate any other iterator + // lock so no insertion happens at the same time + ERASE_AND_CONTINUE(conn_it, connections); + } + + // try to reconnect the connection if it has an error + if (!conn->is_ok()) { + // pointers are used temporarily inside the amqp_connection_info object + // as read-only values, hence the assignment, and const_cast are safe here + amqp_connection_info info; + info.host = const_cast(conn_it->first.host.c_str()); + info.port = conn_it->first.port; + info.vhost = const_cast(conn_it->first.vhost.c_str()); + info.user = const_cast(conn->user.c_str()); + info.password = const_cast(conn->password.c_str()); + ldout(conn->cct, 20) << "AMQP run: retry connection" << dendl; + if (create_connection(conn, info)->is_ok() == false) { + ldout(conn->cct, 10) << "AMQP run: connection (" << to_string(conn_it->first) << ") retry failed" << dendl; + // TODO: add error counter for failed retries + // TODO: add exponential backoff for retries + } else { + ldout(conn->cct, 10) << "AMQP run: connection (" << to_string(conn_it->first) << ") retry successfull" << dendl; + } + INCREMENT_AND_CONTINUE(conn_it); + } + + const auto rc = amqp_simple_wait_frame_noblock(conn->state, &frame, &read_timeout); + + if (rc == AMQP_STATUS_TIMEOUT) { + // TODO mark connection as idle + INCREMENT_AND_CONTINUE(conn_it); + } + + // this is just to prevent spinning idle, does not indicate that a message + // was successfully processed or not + incoming_message = true; + + // check if error occurred that require reopening the connection + if (rc != AMQP_STATUS_OK) { + // an error occurred, close connection + // it will be retied by the main loop + ldout(conn->cct, 1) << "AMQP run: connection read error: " << status_to_string(rc) << dendl; + conn->destroy(rc); + INCREMENT_AND_CONTINUE(conn_it); + } + + if (frame.frame_type != AMQP_FRAME_METHOD) { + 
ldout(conn->cct, 10) << "AMQP run: ignoring non n/ack messages" << dendl; + // handler is for publish confirmation only - handle only method frames + // TODO: add a counter + INCREMENT_AND_CONTINUE(conn_it); + } + + uint64_t tag; + bool multiple; + int result; + + switch (frame.payload.method.id) { + case AMQP_BASIC_ACK_METHOD: + { + result = AMQP_STATUS_OK; + const auto ack = (amqp_basic_ack_t*)frame.payload.method.decoded; + ceph_assert(ack); + tag = ack->delivery_tag; + multiple = ack->multiple; + break; + } + case AMQP_BASIC_NACK_METHOD: + { + result = RGW_AMQP_STATUS_BROKER_NACK; + const auto nack = (amqp_basic_nack_t*)frame.payload.method.decoded; + ceph_assert(nack); + tag = nack->delivery_tag; + multiple = nack->multiple; + break; + } + case AMQP_CONNECTION_CLOSE_METHOD: + // TODO on channel close, no need to reopen the connection + case AMQP_CHANNEL_CLOSE_METHOD: + { + // other side closed the connection, no need to continue + ldout(conn->cct, 10) << "AMQP run: connection was closed by broker" << dendl; + conn->destroy(rc); + INCREMENT_AND_CONTINUE(conn_it); + } + case AMQP_BASIC_RETURN_METHOD: + // message was not delivered, returned to sender + // TODO: add a counter + ldout(conn->cct, 10) << "AMQP run: message delivery error" << dendl; + INCREMENT_AND_CONTINUE(conn_it); + break; + default: + // unexpected method + // TODO: add a counter + ldout(conn->cct, 10) << "AMQP run: unexpected message" << dendl; + INCREMENT_AND_CONTINUE(conn_it); + } + + const auto& callbacks_end = conn->callbacks.end(); + const auto& callbacks_begin = conn->callbacks.begin(); + const auto tag_it = std::find(callbacks_begin, callbacks_end, tag); + if (tag_it != callbacks_end) { + if (multiple) { + // n/ack all up to (and including) the tag + ldout(conn->cct, 20) << "AMQP run: multiple n/acks received with tag=" << tag << " and result=" << result << dendl; + auto it = callbacks_begin; + while (it->tag <= tag && it != conn->callbacks.end()) { + ldout(conn->cct, 20) << "AMQP run: 
invoking callback with tag=" << it->tag << dendl; + it->cb(result); + it = conn->callbacks.erase(it); + } + } else { + // n/ack a specific tag + ldout(conn->cct, 20) << "AMQP run: n/ack received, invoking callback with tag=" << tag << " and result=" << result << dendl; + tag_it->cb(result); + conn->callbacks.erase(tag_it); + } + } else { + // TODO add counter for acks with no callback + ldout(conn->cct, 10) << "AMQP run: unsolicited n/ack received with tag=" << tag << dendl; + } + // just increment the iterator + ++conn_it; + } + // if no messages were received or published, sleep for 100ms + if (count == 0 && !incoming_message) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + } + + // used in the dtor for message cleanup + static void delete_message(const message_wrapper_t* message) { + delete message; + } + +public: + Manager(size_t _max_connections, + size_t _max_inflight, + size_t _max_queue, + long _usec_timeout, + CephContext* _cct) : + max_connections(_max_connections), + max_inflight(_max_inflight), + max_queue(_max_queue), + connection_count(0), + stopped(false), + read_timeout{0, _usec_timeout}, + connections(_max_connections), + messages(max_queue), + queued(0), + dequeued(0), + cct(_cct), + runner(&Manager::run, this) { + // The hashmap has "max connections" as the initial number of buckets, + // and allows for 10 collisions per bucket before rehash. + // This is to prevent rehashing so that iterators are not invalidated + // when a new connection is added. 
+ connections.max_load_factor(10.0); + // give the runner thread a name for easier debugging + const auto rc = ceph_pthread_setname(runner.native_handle(), "amqp_manager"); + ceph_assert(rc==0); + } + + // non copyable + Manager(const Manager&) = delete; + const Manager& operator=(const Manager&) = delete; + + // stop the main thread + void stop() { + stopped = true; + } + + // disconnect from a broker + bool disconnect(connection_ptr_t& conn) { + if (!conn || stopped) { + return false; + } + conn->marked_for_deletion = true; + return true; + } + + // connect to a broker, or reuse an existing connection if already connected + connection_ptr_t connect(const std::string& url, const std::string& exchange) { + if (stopped) { + // TODO: increment counter + ldout(cct, 1) << "AMQP connect: manager is stopped" << dendl; + return nullptr; + } + + struct amqp_connection_info info; + // cache the URL so that parsing could happen in-place + std::vector url_cache(url.c_str(), url.c_str()+url.size()+1); + if (AMQP_STATUS_OK != amqp_parse_url(url_cache.data(), &info)) { + // TODO: increment counter + ldout(cct, 1) << "AMQP connect: URL parsing failed" << dendl; + return nullptr; + } + + const connection_id_t id(info); + std::lock_guard lock(connections_lock); + const auto it = connections.find(id); + if (it != connections.end()) { + if (it->second->marked_for_deletion) { + // TODO: increment counter + ldout(cct, 1) << "AMQP connect: endpoint marked for deletion" << dendl; + return nullptr; + } else if (it->second->exchange != exchange) { + // TODO: increment counter + ldout(cct, 1) << "AMQP connect: exchange mismatch" << dendl; + return nullptr; + } + // connection found - return even if non-ok + ldout(cct, 20) << "AMQP connect: connection found" << dendl; + return it->second; + } + + // connection not found, creating a new one + if (connection_count >= max_connections) { + // TODO: increment counter + ldout(cct, 1) << "AMQP connect: max connections exceeded" << dendl; + return 
nullptr; + } + const auto conn = create_new_connection(info, exchange, cct); + // create_new_connection must always return a connection object + // even if error occurred during creation. + // in such a case the creation will be retried in the main thread + ceph_assert(conn); + ++connection_count; + ldout(cct, 10) << "AMQP connect: new connection is created. Total connections: " << connection_count << dendl; + ldout(cct, 10) << "AMQP connect: new connection status is: " << status_to_string(conn->status) << dendl; + return connections.emplace(id, conn).first->second; + } + + // TODO publish with confirm is needed in "none" case as well, cb should be invoked publish is ok (no ack) + int publish(connection_ptr_t& conn, + const std::string& topic, + const std::string& message) { + if (stopped) { + return RGW_AMQP_STATUS_MANAGER_STOPPED; + } + if (!conn || !conn->is_ok()) { + return RGW_AMQP_STATUS_CONNECTION_CLOSED; + } + if (messages.push(new message_wrapper_t(conn, topic, message, nullptr))) { + ++queued; + return AMQP_STATUS_OK; + } + return RGW_AMQP_STATUS_QUEUE_FULL; + } + + int publish_with_confirm(connection_ptr_t& conn, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (stopped) { + return RGW_AMQP_STATUS_MANAGER_STOPPED; + } + if (!conn || !conn->is_ok()) { + return RGW_AMQP_STATUS_CONNECTION_CLOSED; + } + if (messages.push(new message_wrapper_t(conn, topic, message, cb))) { + ++queued; + return AMQP_STATUS_OK; + } + return RGW_AMQP_STATUS_QUEUE_FULL; + } + + // dtor wait for thread to stop + // then connection are cleaned-up + ~Manager() { + stopped = true; + runner.join(); + messages.consume_all(delete_message); + } + + // get the number of connections + size_t get_connection_count() const { + return connection_count; + } + + // get the number of in-flight messages + size_t get_inflight() const { + size_t sum = 0; + std::lock_guard lock(connections_lock); + std::for_each(connections.begin(), connections.end(), 
[&sum](auto& conn_pair) { + sum += conn_pair.second->callbacks.size(); + }); + return sum; + } + + // running counter of the queued messages + size_t get_queued() const { + return queued; + } + + // running counter of the dequeued messages + size_t get_dequeued() const { + return dequeued; + } +}; + +// singleton manager +// note that the manager itself is not a singleton, and multiple instances may co-exist +// TODO make the pointer atomic in allocation and deallocation to avoid race conditions +static Manager* s_manager = nullptr; + +static const size_t MAX_CONNECTIONS_DEFAULT = 256; +static const size_t MAX_INFLIGHT_DEFAULT = 8192; +static const size_t MAX_QUEUE_DEFAULT = 8192; + +bool init(CephContext* cct) { + if (s_manager) { + return false; + } + // TODO: take conf from CephContext + s_manager = new Manager(MAX_CONNECTIONS_DEFAULT, MAX_INFLIGHT_DEFAULT, MAX_QUEUE_DEFAULT, 100, cct); + return true; +} + +void shutdown() { + delete s_manager; + s_manager = nullptr; +} + +connection_ptr_t connect(const std::string& url, const std::string& exchange) { + if (!s_manager) return nullptr; + return s_manager->connect(url, exchange); +} + +int publish(connection_ptr_t& conn, + const std::string& topic, + const std::string& message) { + if (!s_manager) return RGW_AMQP_STATUS_MANAGER_STOPPED; + return s_manager->publish(conn, topic, message); +} + +int publish_with_confirm(connection_ptr_t& conn, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (!s_manager) return RGW_AMQP_STATUS_MANAGER_STOPPED; + return s_manager->publish_with_confirm(conn, topic, message, cb); +} + +size_t get_connection_count() { + if (!s_manager) return 0; + return s_manager->get_connection_count(); +} + +size_t get_inflight() { + if (!s_manager) return 0; + return s_manager->get_inflight(); +} + +size_t get_queued() { + if (!s_manager) return 0; + return s_manager->get_queued(); +} + +size_t get_dequeued() { + if (!s_manager) return 0; + return 
s_manager->get_dequeued(); +} + +size_t get_max_connections() { + if (!s_manager) return MAX_CONNECTIONS_DEFAULT; + return s_manager->max_connections; +} + +size_t get_max_inflight() { + if (!s_manager) return MAX_INFLIGHT_DEFAULT; + return s_manager->max_inflight; +} + +size_t get_max_queue() { + if (!s_manager) return MAX_QUEUE_DEFAULT; + return s_manager->max_queue; +} + +bool disconnect(connection_ptr_t& conn) { + if (!s_manager) return false; + return s_manager->disconnect(conn); +} + +} // namespace amqp + diff --git a/src/rgw/rgw_amqp.h b/src/rgw/rgw_amqp.h new file mode 100644 index 00000000..938bdade --- /dev/null +++ b/src/rgw/rgw_amqp.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include + +class CephContext; + +namespace rgw::amqp { +// forward declaration of connection object +struct connection_t; + +typedef boost::intrusive_ptr connection_ptr_t; + +// required interfaces needed so that connection_t could be used inside boost::intrusive_ptr +void intrusive_ptr_add_ref(const connection_t* p); +void intrusive_ptr_release(const connection_t* p); + +// the reply callback is expected to get an integer parameter +// indicating the result, and not to return anything +typedef std::function reply_callback_t; + +// initialize the amqp manager +bool init(CephContext* cct); + +// shutdown the amqp manager +void shutdown(); + +// connect to an amqp endpoint +connection_ptr_t connect(const std::string& url, const std::string& exchange); + +// publish a message over a connection that was already created +int publish(connection_ptr_t& conn, + const std::string& topic, + const std::string& message); + +// publish a message over a connection that was already created +// and pass a callback that will be invoked (async) when broker confirms +// receiving the message +int publish_with_confirm(connection_ptr_t& conn, + const std::string& topic, + const 
std::string& message, + reply_callback_t cb); + +// convert the integer status returned from the "publish" function to a string +std::string status_to_string(int s); + +// number of connections +size_t get_connection_count(); + +// return the number of messages that were sent +// to broker, but were not yet acked/nacked/timedout +size_t get_inflight(); + +// running counter of successfully queued messages +size_t get_queued(); + +// running counter of dequeued messages +size_t get_dequeued(); + +// number of maximum allowed connections +size_t get_max_connections(); + +// number of maximum allowed inflight messages +size_t get_max_inflight(); + +// maximum number of messages in the queue +size_t get_max_queue(); + +// disconnect from an amqp broker +bool disconnect(connection_ptr_t& conn); + +} + diff --git a/src/rgw/rgw_arn.cc b/src/rgw/rgw_arn.cc new file mode 100644 index 00000000..d8b4ed39 --- /dev/null +++ b/src/rgw/rgw_arn.cc @@ -0,0 +1,385 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_arn.h" +#include "rgw_common.h" +#include + +namespace rgw { + +namespace { +boost::optional to_partition(const smatch::value_type& p, + bool wildcards) { + if (p == "aws") { + return Partition::aws; + } else if (p == "aws-cn") { + return Partition::aws_cn; + } else if (p == "aws-us-gov") { + return Partition::aws_us_gov; + } else if (p == "*" && wildcards) { + return Partition::wildcard; + } else { + return boost::none; + } + + ceph_abort(); +} + +boost::optional to_service(const smatch::value_type& s, + bool wildcards) { + static const unordered_map services = { + { "acm", Service::acm }, + { "apigateway", Service::apigateway }, + { "appstream", Service::appstream }, + { "artifact", Service::artifact }, + { "autoscaling", Service::autoscaling }, + { "aws-marketplace", Service::aws_marketplace }, + { "aws-marketplace-management", + Service::aws_marketplace_management }, + { "aws-portal", 
Service::aws_portal }, + { "cloudformation", Service::cloudformation }, + { "cloudfront", Service::cloudfront }, + { "cloudhsm", Service::cloudhsm }, + { "cloudsearch", Service::cloudsearch }, + { "cloudtrail", Service::cloudtrail }, + { "cloudwatch", Service::cloudwatch }, + { "codebuild", Service::codebuild }, + { "codecommit", Service::codecommit }, + { "codedeploy", Service::codedeploy }, + { "codepipeline", Service::codepipeline }, + { "cognito-identity", Service::cognito_identity }, + { "cognito-idp", Service::cognito_idp }, + { "cognito-sync", Service::cognito_sync }, + { "config", Service::config }, + { "datapipeline", Service::datapipeline }, + { "devicefarm", Service::devicefarm }, + { "directconnect", Service::directconnect }, + { "dms", Service::dms }, + { "ds", Service::ds }, + { "dynamodb", Service::dynamodb }, + { "ec2", Service::ec2 }, + { "ecr", Service::ecr }, + { "ecs", Service::ecs }, + { "elasticache", Service::elasticache }, + { "elasticbeanstalk", Service::elasticbeanstalk }, + { "elasticfilesystem", Service::elasticfilesystem }, + { "elasticloadbalancing", Service::elasticloadbalancing }, + { "elasticmapreduce", Service::elasticmapreduce }, + { "elastictranscoder", Service::elastictranscoder }, + { "es", Service::es }, + { "events", Service::events }, + { "firehose", Service::firehose }, + { "gamelift", Service::gamelift }, + { "glacier", Service::glacier }, + { "health", Service::health }, + { "iam", Service::iam }, + { "importexport", Service::importexport }, + { "inspector", Service::inspector }, + { "iot", Service::iot }, + { "kinesis", Service::kinesis }, + { "kinesisanalytics", Service::kinesisanalytics }, + { "kms", Service::kms }, + { "lambda", Service::lambda }, + { "lightsail", Service::lightsail }, + { "logs", Service::logs }, + { "machinelearning", Service::machinelearning }, + { "mobileanalytics", Service::mobileanalytics }, + { "mobilehub", Service::mobilehub }, + { "opsworks", Service::opsworks }, + { "opsworks-cm", 
Service::opsworks_cm }, + { "polly", Service::polly }, + { "rds", Service::rds }, + { "redshift", Service::redshift }, + { "route53", Service::route53 }, + { "route53domains", Service::route53domains }, + { "s3", Service::s3 }, + { "sdb", Service::sdb }, + { "servicecatalog", Service::servicecatalog }, + { "ses", Service::ses }, + { "sns", Service::sns }, + { "sqs", Service::sqs }, + { "ssm", Service::ssm }, + { "states", Service::states }, + { "storagegateway", Service::storagegateway }, + { "sts", Service::sts }, + { "support", Service::support }, + { "swf", Service::swf }, + { "trustedadvisor", Service::trustedadvisor }, + { "waf", Service::waf }, + { "workmail", Service::workmail }, + { "workspaces", Service::workspaces }}; + + if (wildcards && s == "*") { + return Service::wildcard; + } + + auto i = services.find(s); + if (i == services.end()) { + return boost::none; + } else { + return i->second; + } +} +} +ARN::ARN(const rgw_obj& o) + : partition(Partition::aws), + service(Service::s3), + region(), + account(o.bucket.tenant), + resource(o.bucket.name) +{ + resource.push_back('/'); + resource.append(o.key.name); +} + +ARN::ARN(const rgw_bucket& b) + : partition(Partition::aws), + service(Service::s3), + region(), + account(b.tenant), + resource(b.name) { } + +ARN::ARN(const rgw_bucket& b, const std::string& o) + : partition(Partition::aws), + service(Service::s3), + region(), + account(b.tenant), + resource(b.name) { + resource.push_back('/'); + resource.append(o); +} + +ARN::ARN(const std::string& resource_name, const std::string& type, const std::string& tenant, bool has_path) + : partition(Partition::aws), + service(Service::iam), + region(), + account(tenant), + resource(type) { + if (! 
has_path) + resource.push_back('/'); + resource.append(resource_name); +} + +boost::optional ARN::parse(const std::string& s, bool wildcards) { + static const std::regex rx_wild("arn:([^:]*):([^:]*):([^:]*):([^:]*):([^:]*)", + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + static const std::regex rx_no_wild( + "arn:([^:*]*):([^:*]*):([^:*]*):([^:*]*):(.*)", + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + + smatch match; + + if ((s == "*") && wildcards) { + return ARN(Partition::wildcard, Service::wildcard, "*", "*", "*"); + } else if (regex_match(s, match, wildcards ? rx_wild : rx_no_wild) && + match.size() == 6) { + if (auto p = to_partition(match[1], wildcards)) { + if (auto s = to_service(match[2], wildcards)) { + return ARN(*p, *s, match[3], match[4], match[5]); + } + } + } + return boost::none; +} + +std::string ARN::to_string() const { + std::string s{"arn:"}; + + if (partition == Partition::aws) { + s.append("aws:"); + } else if (partition == Partition::aws_cn) { + s.append("aws-cn:"); + } else if (partition == Partition::aws_us_gov) { + s.append("aws-us-gov:"); + } else { + s.append("*:"); + } + + static const std::unordered_map services = { + { Service::acm, "acm" }, + { Service::apigateway, "apigateway" }, + { Service::appstream, "appstream" }, + { Service::artifact, "artifact" }, + { Service::autoscaling, "autoscaling" }, + { Service::aws_marketplace, "aws-marketplace" }, + { Service::aws_marketplace_management, "aws-marketplace-management" }, + { Service::aws_portal, "aws-portal" }, + { Service::cloudformation, "cloudformation" }, + { Service::cloudfront, "cloudfront" }, + { Service::cloudhsm, "cloudhsm" }, + { Service::cloudsearch, "cloudsearch" }, + { Service::cloudtrail, "cloudtrail" }, + { Service::cloudwatch, "cloudwatch" }, + { Service::codebuild, "codebuild" }, + { Service::codecommit, "codecommit" }, + { Service::codedeploy, "codedeploy" }, + { Service::codepipeline, "codepipeline" }, + { 
Service::cognito_identity, "cognito-identity" }, + { Service::cognito_idp, "cognito-idp" }, + { Service::cognito_sync, "cognito-sync" }, + { Service::config, "config" }, + { Service::datapipeline, "datapipeline" }, + { Service::devicefarm, "devicefarm" }, + { Service::directconnect, "directconnect" }, + { Service::dms, "dms" }, + { Service::ds, "ds" }, + { Service::dynamodb, "dynamodb" }, + { Service::ec2, "ec2" }, + { Service::ecr, "ecr" }, + { Service::ecs, "ecs" }, + { Service::elasticache, "elasticache" }, + { Service::elasticbeanstalk, "elasticbeanstalk" }, + { Service::elasticfilesystem, "elasticfilesystem" }, + { Service::elasticloadbalancing, "elasticloadbalancing" }, + { Service::elasticmapreduce, "elasticmapreduce" }, + { Service::elastictranscoder, "elastictranscoder" }, + { Service::es, "es" }, + { Service::events, "events" }, + { Service::firehose, "firehose" }, + { Service::gamelift, "gamelift" }, + { Service::glacier, "glacier" }, + { Service::health, "health" }, + { Service::iam, "iam" }, + { Service::importexport, "importexport" }, + { Service::inspector, "inspector" }, + { Service::iot, "iot" }, + { Service::kinesis, "kinesis" }, + { Service::kinesisanalytics, "kinesisanalytics" }, + { Service::kms, "kms" }, + { Service::lambda, "lambda" }, + { Service::lightsail, "lightsail" }, + { Service::logs, "logs" }, + { Service::machinelearning, "machinelearning" }, + { Service::mobileanalytics, "mobileanalytics" }, + { Service::mobilehub, "mobilehub" }, + { Service::opsworks, "opsworks" }, + { Service::opsworks_cm, "opsworks-cm" }, + { Service::polly, "polly" }, + { Service::rds, "rds" }, + { Service::redshift, "redshift" }, + { Service::route53, "route53" }, + { Service::route53domains, "route53domains" }, + { Service::s3, "s3" }, + { Service::sdb, "sdb" }, + { Service::servicecatalog, "servicecatalog" }, + { Service::ses, "ses" }, + { Service::sns, "sns" }, + { Service::sqs, "sqs" }, + { Service::ssm, "ssm" }, + { Service::states, "states" }, + { 
Service::storagegateway, "storagegateway" }, + { Service::sts, "sts" }, + { Service::support, "support" }, + { Service::swf, "swf" }, + { Service::trustedadvisor, "trustedadvisor" }, + { Service::waf, "waf" }, + { Service::workmail, "workmail" }, + { Service::workspaces, "workspaces" }}; + + auto i = services.find(service); + if (i != services.end()) { + s.append(i->second); + } else { + s.push_back('*'); + } + s.push_back(':'); + + s.append(region); + s.push_back(':'); + + s.append(account); + s.push_back(':'); + + s.append(resource); + + return s; +} + +bool operator ==(const ARN& l, const ARN& r) { + return ((l.partition == r.partition) && + (l.service == r.service) && + (l.region == r.region) && + (l.account == r.account) && + (l.resource == r.resource)); +} +bool operator <(const ARN& l, const ARN& r) { + return ((l.partition < r.partition) || + (l.service < r.service) || + (l.region < r.region) || + (l.account < r.account) || + (l.resource < r.resource)); +} + +// The candidate is not allowed to have wildcards. The only way to +// do that sanely would be to use unification rather than matching. 
+bool ARN::match(const ARN& candidate) const { + if ((candidate.partition == Partition::wildcard) || + (partition != candidate.partition && partition + != Partition::wildcard)) { + return false; + } + + if ((candidate.service == Service::wildcard) || + (service != candidate.service && service != Service::wildcard)) { + return false; + } + + if (!match_policy(region, candidate.region, MATCH_POLICY_ARN)) { + return false; + } + + if (!match_policy(account, candidate.account, MATCH_POLICY_ARN)) { + return false; + } + + if (!match_policy(resource, candidate.resource, MATCH_POLICY_RESOURCE)) { + return false; + } + + return true; +} + +boost::optional ARNResource::parse(const std::string& s) { + static const std::regex rx("^([^:/]*)[:/]?([^:/]*)?[:/]?(.*)$", + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + std::smatch match; + if (!regex_match(s, match, rx)) { + return boost::none; + } + if (match[2].str().empty() && match[3].str().empty()) { + // only resource exist + return rgw::ARNResource("", match[1], ""); + } + + // resource type also exist, and cannot be wildcard + if (match[1] != std::string(wildcard)) { + // resource type cannot be wildcard + return rgw::ARNResource(match[1], match[2], match[3]); + } + + return boost::none; +} + +std::string ARNResource::to_string() const { + std::string s; + + if (!resource_type.empty()) { + s.append(resource_type); + s.push_back(':'); + + s.append(resource); + s.push_back(':'); + + s.append(qualifier); + } else { + s.append(resource); + } + + return s; +} + +} + diff --git a/src/rgw/rgw_arn.h b/src/rgw/rgw_arn.h new file mode 100644 index 00000000..406a9f42 --- /dev/null +++ b/src/rgw/rgw_arn.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#include +#include + +class rgw_obj; +class rgw_bucket; + +namespace rgw { + +enum struct Partition { + aws, aws_cn, aws_us_gov, wildcard + // If we wanted our own ARNs 
for principal type unique to us + // (maybe to integrate better with Swift) or for anything else we + // provide that doesn't map onto S3, we could add an 'rgw' + // partition type. +}; + +enum struct Service { + apigateway, appstream, artifact, autoscaling, aws_portal, acm, + cloudformation, cloudfront, cloudhsm, cloudsearch, cloudtrail, + cloudwatch, events, logs, codebuild, codecommit, codedeploy, + codepipeline, cognito_idp, cognito_identity, cognito_sync, + config, datapipeline, dms, devicefarm, directconnect, + ds, dynamodb, ec2, ecr, ecs, ssm, elasticbeanstalk, elasticfilesystem, + elasticloadbalancing, elasticmapreduce, elastictranscoder, elasticache, + es, gamelift, glacier, health, iam, importexport, inspector, iot, + kms, kinesisanalytics, firehose, kinesis, lambda, lightsail, + machinelearning, aws_marketplace, aws_marketplace_management, + mobileanalytics, mobilehub, opsworks, opsworks_cm, polly, + redshift, rds, route53, route53domains, sts, servicecatalog, + ses, sns, sqs, s3, swf, sdb, states, storagegateway, support, + trustedadvisor, waf, workmail, workspaces, wildcard +}; + +/* valid format: + * 'arn:partition:service:region:account-id:resource' + * The 'resource' part can be further broken down via ARNResource +*/ +struct ARN { + Partition partition; + Service service; + std::string region; + // Once we refit tenant, we should probably use that instead of a + // string. 
+ std::string account; + std::string resource; + + ARN() + : partition(Partition::wildcard), service(Service::wildcard) {} + ARN(Partition partition, Service service, std::string region, + std::string account, std::string resource) + : partition(partition), service(service), region(std::move(region)), + account(std::move(account)), resource(std::move(resource)) {} + ARN(const rgw_obj& o); + ARN(const rgw_bucket& b); + ARN(const rgw_bucket& b, const std::string& o); + ARN(const std::string& resource_name, const std::string& type, const std::string& tenant, bool has_path=false); + + static boost::optional parse(const std::string& s, + bool wildcard = false); + std::string to_string() const; + + // `this` is the pattern + bool match(const ARN& candidate) const; +}; + +inline std::string to_string(const ARN& a) { + return a.to_string(); +} + +inline std::ostream& operator <<(std::ostream& m, const ARN& a) { + return m << to_string(a); +} + +bool operator ==(const ARN& l, const ARN& r); +bool operator <(const ARN& l, const ARN& r); + +/* valid formats (only resource part): + * 'resource' + * 'resourcetype/resource' + * 'resourcetype/resource/qualifier' + * 'resourcetype/resource:qualifier' + * 'resourcetype:resource' + * 'resourcetype:resource:qualifier' + * Note that 'resourceType' cannot be wildcard +*/ +struct ARNResource { + constexpr static const char* const wildcard = "*"; + std::string resource_type; + std::string resource; + std::string qualifier; + + ARNResource() : resource_type(""), resource(wildcard), qualifier("") {} + + ARNResource(const std::string& _resource_type, const std::string& _resource, const std::string& _qualifier) : + resource_type(std::move(_resource_type)), resource(std::move(_resource)), qualifier(std::move(_qualifier)) {} + + static boost::optional parse(const std::string& s); + + std::string to_string() const; +}; + +inline std::string to_string(const ARNResource& r) { + return r.to_string(); +} + +} // namespace rgw + +namespace std { 
+template<> +struct hash<::rgw::Service> { + size_t operator()(const ::rgw::Service& s) const noexcept { + // Invoke a default-constructed hash object for int. + return hash()(static_cast(s)); + } +}; +} // namespace std + diff --git a/src/rgw/rgw_asio_client.cc b/src/rgw/rgw_asio_client.cc new file mode 100644 index 00000000..bea985a7 --- /dev/null +++ b/src/rgw/rgw_asio_client.cc @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "rgw_asio_client.h" +#include "rgw_perf_counters.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace rgw::asio; + +ClientIO::ClientIO(parser_type& parser, bool is_ssl, + const endpoint_type& local_endpoint, + const endpoint_type& remote_endpoint) + : parser(parser), is_ssl(is_ssl), + local_endpoint(local_endpoint), + remote_endpoint(remote_endpoint), + txbuf(*this) +{ +} + +ClientIO::~ClientIO() = default; + +int ClientIO::init_env(CephContext *cct) +{ + env.init(cct); + + perfcounter->inc(l_rgw_qlen); + perfcounter->inc(l_rgw_qactive); + + const auto& request = parser.get(); + const auto& headers = request; + for (auto header = headers.begin(); header != headers.end(); ++header) { + const auto& field = header->name(); // enum type for known headers + const auto& name = header->name_string(); + const auto& value = header->value(); + + if (field == beast::http::field::content_length) { + env.set("CONTENT_LENGTH", value.to_string()); + continue; + } + if (field == beast::http::field::content_type) { + env.set("CONTENT_TYPE", value.to_string()); + continue; + } + + static const boost::string_ref HTTP_{"HTTP_"}; + + char buf[name.size() + HTTP_.size() + 1]; + auto dest = std::copy(std::begin(HTTP_), std::end(HTTP_), buf); + for (auto src = name.begin(); src != name.end(); ++src, ++dest) { + if (*src == '-') { + *dest = '_'; + } else { + *dest = std::toupper(*src); + } + } + *dest = '\0'; + + 
env.set(buf, value.to_string()); + } + + int major = request.version() / 10; + int minor = request.version() % 10; + env.set("HTTP_VERSION", std::to_string(major) + '.' + std::to_string(minor)); + + env.set("REQUEST_METHOD", request.method_string().to_string()); + + // split uri from query + auto url = request.target(); + auto pos = url.find('?'); + if (pos != url.npos) { + auto query = url.substr(pos + 1); + env.set("QUERY_STRING", query.to_string()); + url = url.substr(0, pos); + } + env.set("REQUEST_URI", url.to_string()); + env.set("SCRIPT_URI", url.to_string()); /* FIXME */ + + char port_buf[16]; + snprintf(port_buf, sizeof(port_buf), "%d", local_endpoint.port()); + env.set("SERVER_PORT", port_buf); + if (is_ssl) { + env.set("SERVER_PORT_SECURE", port_buf); + } + env.set("REMOTE_ADDR", remote_endpoint.address().to_string()); + // TODO: set REMOTE_USER if authenticated + return 0; +} + +size_t ClientIO::complete_request() +{ + perfcounter->inc(l_rgw_qlen, -1); + perfcounter->inc(l_rgw_qactive, -1); + return 0; +} + +void ClientIO::flush() +{ + txbuf.pubsync(); +} + +size_t ClientIO::send_status(int status, const char* status_name) +{ + static constexpr size_t STATUS_BUF_SIZE = 128; + + char statusbuf[STATUS_BUF_SIZE]; + const auto statuslen = snprintf(statusbuf, sizeof(statusbuf), + "HTTP/1.1 %d %s\r\n", status, status_name); + + return txbuf.sputn(statusbuf, statuslen); +} + +size_t ClientIO::send_100_continue() +{ + const char HTTTP_100_CONTINUE[] = "HTTP/1.1 100 CONTINUE\r\n\r\n"; + const size_t sent = txbuf.sputn(HTTTP_100_CONTINUE, + sizeof(HTTTP_100_CONTINUE) - 1); + flush(); + return sent; +} + +static constexpr size_t TIME_BUF_SIZE = 128; +static size_t dump_date_header(char (×tr)[TIME_BUF_SIZE]) +{ + const time_t gtime = time(nullptr); + struct tm result; + struct tm const * const tmp = gmtime_r(>ime, &result); + if (tmp == nullptr) { + return 0; + } + return strftime(timestr, sizeof(timestr), + "Date: %a, %d %b %Y %H:%M:%S %Z\r\n", tmp); +} + +size_t 
ClientIO::complete_header() +{ + size_t sent = 0; + + char timestr[TIME_BUF_SIZE]; + if (dump_date_header(timestr)) { + sent += txbuf.sputn(timestr, strlen(timestr)); + } + + if (parser.keep_alive()) { + constexpr char CONN_KEEP_ALIVE[] = "Connection: Keep-Alive\r\n"; + sent += txbuf.sputn(CONN_KEEP_ALIVE, sizeof(CONN_KEEP_ALIVE) - 1); + } else { + constexpr char CONN_KEEP_CLOSE[] = "Connection: close\r\n"; + sent += txbuf.sputn(CONN_KEEP_CLOSE, sizeof(CONN_KEEP_CLOSE) - 1); + } + + constexpr char HEADER_END[] = "\r\n"; + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + flush(); + return sent; +} + +size_t ClientIO::send_header(const boost::string_ref& name, + const boost::string_ref& value) +{ + static constexpr char HEADER_SEP[] = ": "; + static constexpr char HEADER_END[] = "\r\n"; + + size_t sent = 0; + + sent += txbuf.sputn(name.data(), name.length()); + sent += txbuf.sputn(HEADER_SEP, sizeof(HEADER_SEP) - 1); + sent += txbuf.sputn(value.data(), value.length()); + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + return sent; +} + +size_t ClientIO::send_content_length(uint64_t len) +{ + static constexpr size_t CONLEN_BUF_SIZE = 128; + + char sizebuf[CONLEN_BUF_SIZE]; + const auto sizelen = snprintf(sizebuf, sizeof(sizebuf), + "Content-Length: %" PRIu64 "\r\n", len); + + return txbuf.sputn(sizebuf, sizelen); +} diff --git a/src/rgw/rgw_asio_client.h b/src/rgw/rgw_asio_client.h new file mode 100644 index 00000000..e99c3f7c --- /dev/null +++ b/src/rgw/rgw_asio_client.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_ASIO_CLIENT_H +#define RGW_ASIO_CLIENT_H + +#include +#include +#include +#include "include/ceph_assert.h" + +#include "rgw_client_io.h" + +namespace rgw { +namespace asio { + +namespace beast = boost::beast; +using parser_type = beast::http::request_parser; + +class ClientIO : public io::RestfulClient, + public io::BuffererSink { + protected: + 
parser_type& parser; + private: + const bool is_ssl; + using endpoint_type = boost::asio::ip::tcp::endpoint; + endpoint_type local_endpoint; + endpoint_type remote_endpoint; + + RGWEnv env; + + rgw::io::StaticOutputBufferer<> txbuf; + + public: + ClientIO(parser_type& parser, bool is_ssl, + const endpoint_type& local_endpoint, + const endpoint_type& remote_endpoint); + ~ClientIO() override; + + int init_env(CephContext *cct) override; + size_t complete_request() override; + void flush() override; + size_t send_status(int status, const char *status_name) override; + size_t send_100_continue() override; + size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) override; + size_t send_content_length(uint64_t len) override; + size_t complete_header() override; + + size_t send_body(const char* buf, size_t len) override { + return write_data(buf, len); + } + + RGWEnv& get_env() noexcept override { + return env; + } +}; + +} // namespace asio +} // namespace rgw + +#endif // RGW_ASIO_CLIENT_H diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc new file mode 100644 index 00000000..10e8d35a --- /dev/null +++ b/src/rgw/rgw_asio_frontend.cc @@ -0,0 +1,834 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include +#define BOOST_COROUTINES_NO_DEPRECATION_WARNING +#include +#include +#include +#include + +#include "common/async/shared_mutex.h" +#include "common/errno.h" +#include "common/strtol.h" + +#include "rgw_asio_client.h" +#include "rgw_asio_frontend.h" + +#ifdef WITH_RADOSGW_BEAST_OPENSSL +#include +#endif + +#include "rgw_dmclock_async_scheduler.h" + +#define dout_subsys ceph_subsys_rgw + +namespace { + +using tcp = boost::asio::ip::tcp; +namespace http = boost::beast::http; +#ifdef WITH_RADOSGW_BEAST_OPENSSL +namespace ssl = boost::asio::ssl; +#endif + +using parse_buffer = boost::beast::flat_static_buffer<65536>; + +template +class 
StreamIO : public rgw::asio::ClientIO { + CephContext* const cct; + Stream& stream; + parse_buffer& buffer; + public: + StreamIO(CephContext *cct, Stream& stream, rgw::asio::parser_type& parser, + parse_buffer& buffer, bool is_ssl, + const tcp::endpoint& local_endpoint, + const tcp::endpoint& remote_endpoint) + : ClientIO(parser, is_ssl, local_endpoint, remote_endpoint), + cct(cct), stream(stream), buffer(buffer) + {} + + size_t write_data(const char* buf, size_t len) override { + boost::system::error_code ec; + auto bytes = boost::asio::write(stream, boost::asio::buffer(buf, len), ec); + if (ec) { + ldout(cct, 4) << "write_data failed: " << ec.message() << dendl; + if (ec==boost::asio::error::broken_pipe) { + boost::system::error_code ec_ignored; + stream.lowest_layer().shutdown(tcp::socket::shutdown_both, ec_ignored); + } + throw rgw::io::Exception(ec.value(), std::system_category()); + } + return bytes; + } + + size_t recv_body(char* buf, size_t max) override { + auto& message = parser.get(); + auto& body_remaining = message.body(); + body_remaining.data = buf; + body_remaining.size = max; + + while (body_remaining.size && !parser.is_done()) { + boost::system::error_code ec; + http::read_some(stream, buffer, parser, ec); + if (ec == http::error::need_buffer) { + break; + } + if (ec) { + ldout(cct, 4) << "failed to read body: " << ec.message() << dendl; + throw rgw::io::Exception(ec.value(), std::system_category()); + } + } + return max - body_remaining.size; + } +}; + +// output the http version as a string, ie 'HTTP/1.1' +struct http_version { + unsigned major_ver; + unsigned minor_ver; + explicit http_version(unsigned version) + : major_ver(version / 10), minor_ver(version % 10) {} +}; +std::ostream& operator<<(std::ostream& out, const http_version& v) { + return out << "HTTP/" << v.major_ver << '.' 
<< v.minor_ver; +} + +// log an http header value or '-' if it's missing +struct log_header { + const http::fields& fields; + http::field field; + std::string_view quote; + log_header(const http::fields& fields, http::field field, + std::string_view quote = "") + : fields(fields), field(field), quote(quote) {} +}; +std::ostream& operator<<(std::ostream& out, const log_header& h) { + auto p = h.fields.find(h.field); + if (p == h.fields.end()) { + return out << '-'; + } + return out << h.quote << p->value() << h.quote; +} + +using SharedMutex = ceph::async::SharedMutex; + +template +void handle_connection(boost::asio::io_context& context, + RGWProcessEnv& env, Stream& stream, + parse_buffer& buffer, bool is_ssl, + SharedMutex& pause_mutex, + rgw::dmclock::Scheduler *scheduler, + boost::system::error_code& ec, + boost::asio::yield_context yield) +{ + // limit header to 4k, since we read it all into a single flat_buffer + static constexpr size_t header_limit = 4096; + // don't impose a limit on the body, since we read it in pieces + static constexpr size_t body_limit = std::numeric_limits::max(); + + auto cct = env.store->ctx(); + + // read messages from the stream until eof + for (;;) { + // configure the parser + rgw::asio::parser_type parser; + parser.header_limit(header_limit); + parser.body_limit(body_limit); + + // parse the header + http::async_read_header(stream, buffer, parser, yield[ec]); + if (ec == boost::asio::error::connection_reset || + ec == boost::asio::error::bad_descriptor || + ec == boost::asio::error::operation_aborted || +#ifdef WITH_RADOSGW_BEAST_OPENSSL + ec == ssl::error::stream_truncated || +#endif + ec == http::error::end_of_stream) { + ldout(cct, 20) << "failed to read header: " << ec.message() << dendl; + return; + } + auto& message = parser.get(); + if (ec) { + ldout(cct, 1) << "failed to read header: " << ec.message() << dendl; + http::response response; + response.result(http::status::bad_request); + response.version(message.version() == 
10 ? 10 : 11); + response.prepare_payload(); + http::async_write(stream, response, yield[ec]); + if (ec) { + ldout(cct, 5) << "failed to write response: " << ec.message() << dendl; + } + ldout(cct, 1) << "====== req done http_status=400 ======" << dendl; + return; + } + + { + auto lock = pause_mutex.async_lock_shared(yield[ec]); + if (ec == boost::asio::error::operation_aborted) { + return; + } else if (ec) { + ldout(cct, 1) << "failed to lock: " << ec.message() << dendl; + return; + } + + // process the request + RGWRequest req{env.store->get_new_req_id()}; + + auto& socket = stream.lowest_layer(); + const auto& remote_endpoint = socket.remote_endpoint(ec); + if (ec) { + ldout(cct, 1) << "failed to connect client: " << ec.message() << dendl; + return; + } + + StreamIO real_client{cct, stream, parser, buffer, is_ssl, + socket.local_endpoint(), + remote_endpoint}; + + auto real_client_io = rgw::io::add_reordering( + rgw::io::add_buffering(cct, + rgw::io::add_chunking( + rgw::io::add_conlen_controlling( + &real_client)))); + RGWRestfulIO client(cct, &real_client_io); + auto y = optional_yield{context, yield}; + int http_ret = 0; + process_request(env.store, env.rest, &req, env.uri_prefix, + *env.auth_registry, &client, env.olog, y, + scheduler, &http_ret); + + if (cct->_conf->subsys.should_gather(dout_subsys, 1)) { + // access log line elements begin per Apache Combined Log Format with additions following + const auto now = ceph::coarse_real_clock::now(); + using ceph::operator<<; // for coarse_real_time + ldout(cct, 1) << "beast: " << hex << &req << dec << ": " + << remote_endpoint.address() << " - - [" << now << "] \"" + << message.method_string() << ' ' << message.target() << ' ' + << http_version{message.version()} << "\" " << http_ret << ' ' + << client.get_bytes_sent() + client.get_bytes_received() << ' ' + << log_header{message, http::field::referer, "\""} << ' ' + << log_header{message, http::field::user_agent, "\""} << ' ' + << log_header{message, 
http::field::range} << dendl; + } + } + + if (!parser.keep_alive()) { + return; + } + + // if we failed before reading the entire message, discard any remaining + // bytes before reading the next + while (!parser.is_done()) { + static std::array discard_buffer; + + auto& body = parser.get().body(); + body.size = discard_buffer.size(); + body.data = discard_buffer.data(); + + http::async_read_some(stream, buffer, parser, yield[ec]); + if (ec == http::error::need_buffer) { + continue; + } + if (ec == boost::asio::error::connection_reset) { + return; + } + if (ec) { + ldout(cct, 5) << "failed to discard unread message: " + << ec.message() << dendl; + return; + } + } + } +} + +struct Connection : boost::intrusive::list_base_hook<> { + tcp::socket& socket; + Connection(tcp::socket& socket) : socket(socket) {} +}; + +class ConnectionList { + using List = boost::intrusive::list; + List connections; + std::mutex mutex; + + void remove(Connection& c) { + std::lock_guard lock{mutex}; + if (c.is_linked()) { + connections.erase(List::s_iterator_to(c)); + } + } + public: + class Guard { + ConnectionList *list; + Connection *conn; + public: + Guard(ConnectionList *list, Connection *conn) : list(list), conn(conn) {} + ~Guard() { list->remove(*conn); } + }; + [[nodiscard]] Guard add(Connection& conn) { + std::lock_guard lock{mutex}; + connections.push_back(conn); + return Guard{this, &conn}; + } + void close(boost::system::error_code& ec) { + std::lock_guard lock{mutex}; + for (auto& conn : connections) { + conn.socket.close(ec); + } + connections.clear(); + } +}; + +namespace dmc = rgw::dmclock; +class AsioFrontend { + RGWProcessEnv env; + RGWFrontendConfig* conf; + boost::asio::io_context context; +#ifdef WITH_RADOSGW_BEAST_OPENSSL + boost::optional ssl_context; + int init_ssl(); +#endif + SharedMutex pause_mutex; + std::unique_ptr scheduler; + + struct Listener { + tcp::endpoint endpoint; + tcp::acceptor acceptor; + tcp::socket socket; + bool use_ssl = false; + bool use_nodelay 
= false; + + explicit Listener(boost::asio::io_context& context) + : acceptor(context), socket(context) {} + }; + std::vector listeners; + + ConnectionList connections; + + // work guard to keep run() threads busy while listeners are paused + using Executor = boost::asio::io_context::executor_type; + std::optional> work; + + std::vector threads; + std::atomic going_down{false}; + + CephContext* ctx() const { return env.store->ctx(); } + std::optional client_counters; + std::unique_ptr client_config; + void accept(Listener& listener, boost::system::error_code ec); + + public: + AsioFrontend(const RGWProcessEnv& env, RGWFrontendConfig* conf, + dmc::SchedulerCtx& sched_ctx) + : env(env), conf(conf), pause_mutex(context.get_executor()) + { + auto sched_t = dmc::get_scheduler_t(ctx()); + switch(sched_t){ + case dmc::scheduler_t::dmclock: + scheduler.reset(new dmc::AsyncScheduler(ctx(), + context, + std::ref(sched_ctx.get_dmc_client_counters()), + sched_ctx.get_dmc_client_config(), + *sched_ctx.get_dmc_client_config(), + dmc::AtLimit::Reject)); + break; + case dmc::scheduler_t::none: + lderr(ctx()) << "Got invalid scheduler type for beast, defaulting to throttler" << dendl; + [[fallthrough]]; + case dmc::scheduler_t::throttler: + scheduler.reset(new dmc::SimpleThrottler(ctx())); + + } + } + + int init(); + int run(); + void stop(); + void join(); + void pause(); + void unpause(RGWRados* store, rgw_auth_registry_ptr_t); +}; + +unsigned short parse_port(const char *input, boost::system::error_code& ec) +{ + char *end = nullptr; + auto port = std::strtoul(input, &end, 10); + if (port > std::numeric_limits::max()) { + ec.assign(ERANGE, boost::system::system_category()); + } else if (port == 0 && end == input) { + ec.assign(EINVAL, boost::system::system_category()); + } + return port; +} + +tcp::endpoint parse_endpoint(boost::asio::string_view input, + unsigned short default_port, + boost::system::error_code& ec) +{ + tcp::endpoint endpoint; + + if (input.empty()) { + ec = 
boost::asio::error::invalid_argument; + return endpoint; + } + + if (input[0] == '[') { // ipv6 + const size_t addr_begin = 1; + const size_t addr_end = input.find(']'); + if (addr_end == input.npos) { // no matching ] + ec = boost::asio::error::invalid_argument; + return endpoint; + } + if (addr_end + 1 < input.size()) { + // :port must must follow [ipv6] + if (input[addr_end + 1] != ':') { + ec = boost::asio::error::invalid_argument; + return endpoint; + } else { + auto port_str = input.substr(addr_end + 2); + endpoint.port(parse_port(port_str.data(), ec)); + } + } else { + endpoint.port(default_port); + } + auto addr = input.substr(addr_begin, addr_end - addr_begin); + endpoint.address(boost::asio::ip::make_address_v6(addr, ec)); + } else { // ipv4 + auto colon = input.find(':'); + if (colon != input.npos) { + auto port_str = input.substr(colon + 1); + endpoint.port(parse_port(port_str.data(), ec)); + if (ec) { + return endpoint; + } + } else { + endpoint.port(default_port); + } + auto addr = input.substr(0, colon); + endpoint.address(boost::asio::ip::make_address_v4(addr, ec)); + } + return endpoint; +} + +static int drop_privileges(CephContext *ctx) +{ + uid_t uid = ctx->get_set_uid(); + gid_t gid = ctx->get_set_gid(); + std::string uid_string = ctx->get_set_uid_string(); + std::string gid_string = ctx->get_set_gid_string(); + if (gid && setgid(gid) != 0) { + int err = errno; + ldout(ctx, -1) << "unable to setgid " << gid << ": " << cpp_strerror(err) << dendl; + return -err; + } + if (uid && setuid(uid) != 0) { + int err = errno; + ldout(ctx, -1) << "unable to setuid " << uid << ": " << cpp_strerror(err) << dendl; + return -err; + } + if (uid && gid) { + ldout(ctx, 0) << "set uid:gid to " << uid << ":" << gid + << " (" << uid_string << ":" << gid_string << ")" << dendl; + } + return 0; +} + +int AsioFrontend::init() +{ + boost::system::error_code ec; + auto& config = conf->get_config_map(); + +#ifdef WITH_RADOSGW_BEAST_OPENSSL + int r = init_ssl(); + if (r < 
0) { + return r; + } +#endif + + // parse endpoints + auto ports = config.equal_range("port"); + for (auto i = ports.first; i != ports.second; ++i) { + auto port = parse_port(i->second.c_str(), ec); + if (ec) { + lderr(ctx()) << "failed to parse port=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint.port(port); + + listeners.emplace_back(context); + listeners.back().endpoint = tcp::endpoint(tcp::v6(), port); + } + + auto endpoints = config.equal_range("endpoint"); + for (auto i = endpoints.first; i != endpoints.second; ++i) { + auto endpoint = parse_endpoint(i->second, 80, ec); + if (ec) { + lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint = endpoint; + } + // parse tcp nodelay + auto nodelay = config.find("tcp_nodelay"); + if (nodelay != config.end()) { + for (auto& l : listeners) { + l.use_nodelay = (nodelay->second == "1"); + } + } + + + bool socket_bound = false; + // start listeners + for (auto& l : listeners) { + l.acceptor.open(l.endpoint.protocol(), ec); + if (ec) { + if (ec == boost::asio::error::address_family_not_supported) { + ldout(ctx(), 0) << "WARNING: cannot open socket for endpoint=" << l.endpoint + << ", " << ec.message() << dendl; + continue; + } + + lderr(ctx()) << "failed to open socket: " << ec.message() << dendl; + return -ec.value(); + } + + if (l.endpoint.protocol() == tcp::v6()) { + l.acceptor.set_option(boost::asio::ip::v6_only(true), ec); + if (ec) { + lderr(ctx()) << "failed to set v6_only socket option: " + << ec.message() << dendl; + return -ec.value(); + } + } + + l.acceptor.set_option(tcp::acceptor::reuse_address(true)); + l.acceptor.bind(l.endpoint, ec); + if (ec) { + lderr(ctx()) << "failed to bind address " << l.endpoint + << ": " << ec.message() << dendl; + return -ec.value(); + } + + auto it = config.find("max_connection_backlog"); + auto 
max_connection_backlog = boost::asio::socket_base::max_listen_connections; + if (it != config.end()) { + string err; + max_connection_backlog = strict_strtol(it->second.c_str(), 10, &err); + if (!err.empty()) { + ldout(ctx(), 0) << "WARNING: invalid value for max_connection_backlog=" << it->second << dendl; + max_connection_backlog = boost::asio::socket_base::max_listen_connections; + } + } + l.acceptor.listen(max_connection_backlog); + l.acceptor.async_accept(l.socket, + [this, &l] (boost::system::error_code ec) { + accept(l, ec); + }); + + ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl; + socket_bound = true; + } + if (!socket_bound) { + lderr(ctx()) << "Unable to listen at any endpoints" << dendl; + return -EINVAL; + } + + return drop_privileges(ctx()); +} + +#ifdef WITH_RADOSGW_BEAST_OPENSSL +int AsioFrontend::init_ssl() +{ + boost::system::error_code ec; + auto& config = conf->get_config_map(); + + // ssl configuration + auto cert = config.find("ssl_certificate"); + const bool have_cert = cert != config.end(); + if (have_cert) { + // only initialize the ssl context if it's going to be used + ssl_context = boost::in_place(ssl::context::tls); + } + + auto key = config.find("ssl_private_key"); + const bool have_private_key = key != config.end(); + if (have_private_key) { + if (!have_cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_private_key" << dendl; + return -EINVAL; + } + ssl_context->use_private_key_file(key->second, ssl::context::pem, ec); + if (ec) { + lderr(ctx()) << "failed to add ssl_private_key=" << key->second + << ": " << ec.message() << dendl; + return -ec.value(); + } + } + if (have_cert) { + ssl_context->use_certificate_chain_file(cert->second, ec); + if (ec) { + lderr(ctx()) << "failed to use ssl_certificate=" << cert->second + << ": " << ec.message() << dendl; + return -ec.value(); + } + if (!have_private_key) { + // attempt to use it as a private key if a separate one wasn't provided + 
ssl_context->use_private_key_file(cert->second, ssl::context::pem, ec); + if (ec) { + lderr(ctx()) << "failed to use ssl_certificate=" << cert->second + << " as a private key: " << ec.message() << dendl; + return -ec.value(); + } + } + } + + // parse ssl endpoints + auto ports = config.equal_range("ssl_port"); + for (auto i = ports.first; i != ports.second; ++i) { + if (!have_cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_port" << dendl; + return -EINVAL; + } + auto port = parse_port(i->second.c_str(), ec); + if (ec) { + lderr(ctx()) << "failed to parse ssl_port=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint.port(port); + listeners.back().use_ssl = true; + + listeners.emplace_back(context); + listeners.back().endpoint = tcp::endpoint(tcp::v6(), port); + listeners.back().use_ssl = true; + } + + auto endpoints = config.equal_range("ssl_endpoint"); + for (auto i = endpoints.first; i != endpoints.second; ++i) { + if (!have_cert) { + lderr(ctx()) << "no ssl_certificate configured for ssl_endpoint" << dendl; + return -EINVAL; + } + auto endpoint = parse_endpoint(i->second, 443, ec); + if (ec) { + lderr(ctx()) << "failed to parse ssl_endpoint=" << i->second << dendl; + return -ec.value(); + } + listeners.emplace_back(context); + listeners.back().endpoint = endpoint; + listeners.back().use_ssl = true; + } + return 0; +} +#endif // WITH_RADOSGW_BEAST_OPENSSL + +void AsioFrontend::accept(Listener& l, boost::system::error_code ec) +{ + if (!l.acceptor.is_open()) { + return; + } else if (ec == boost::asio::error::operation_aborted) { + return; + } else if (ec) { + ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl; + return; + } + auto socket = std::move(l.socket); + tcp::no_delay options(l.use_nodelay); + socket.set_option(options,ec); + l.acceptor.async_accept(l.socket, + [this, &l] (boost::system::error_code ec) { + accept(l, ec); + }); + + // spawn a coroutine to handle the 
connection +#ifdef WITH_RADOSGW_BEAST_OPENSSL + if (l.use_ssl) { + boost::asio::spawn(context, + [this, s=std::move(socket)] (boost::asio::yield_context yield) mutable { + Connection conn{s}; + auto c = connections.add(conn); + // wrap the socket in an ssl stream + ssl::stream stream{s, *ssl_context}; + auto buffer = std::make_unique(); + // do ssl handshake + boost::system::error_code ec; + auto bytes = stream.async_handshake(ssl::stream_base::server, + buffer->data(), yield[ec]); + if (ec) { + ldout(ctx(), 1) << "ssl handshake failed: " << ec.message() << dendl; + return; + } + buffer->consume(bytes); + handle_connection(context, env, stream, *buffer, true, pause_mutex, + scheduler.get(), ec, yield); + if (!ec) { + // ssl shutdown (ignoring errors) + stream.async_shutdown(yield[ec]); + } + s.shutdown(tcp::socket::shutdown_both, ec); + }); + } else { +#else + { +#endif // WITH_RADOSGW_BEAST_OPENSSL + boost::asio::spawn(context, + [this, s=std::move(socket)] (boost::asio::yield_context yield) mutable { + Connection conn{s}; + auto c = connections.add(conn); + auto buffer = std::make_unique(); + boost::system::error_code ec; + handle_connection(context, env, s, *buffer, false, pause_mutex, + scheduler.get(), ec, yield); + s.shutdown(tcp::socket::shutdown_both, ec); + }); + } +} + +int AsioFrontend::run() +{ + auto cct = ctx(); + const int thread_count = cct->_conf->rgw_thread_pool_size; + threads.reserve(thread_count); + + ldout(cct, 4) << "frontend spawning " << thread_count << " threads" << dendl; + + // the worker threads call io_context::run(), which will return when there's + // no work left. 
hold a work guard to keep these threads going until join() + work.emplace(boost::asio::make_work_guard(context)); + + for (int i = 0; i < thread_count; i++) { + threads.emplace_back([=] { + // request warnings on synchronous librados calls in this thread + is_asio_thread = true; + boost::system::error_code ec; + context.run(ec); + }); + } + return 0; +} + +void AsioFrontend::stop() +{ + ldout(ctx(), 4) << "frontend initiating shutdown..." << dendl; + + going_down = true; + + boost::system::error_code ec; + // close all listeners + for (auto& listener : listeners) { + listener.acceptor.close(ec); + } + // close all connections + connections.close(ec); + pause_mutex.cancel(); +} + +void AsioFrontend::join() +{ + if (!going_down) { + stop(); + } + work.reset(); + + ldout(ctx(), 4) << "frontend joining threads..." << dendl; + for (auto& thread : threads) { + thread.join(); + } + ldout(ctx(), 4) << "frontend done" << dendl; +} + +void AsioFrontend::pause() +{ + ldout(ctx(), 4) << "frontend pausing connections..." 
<< dendl; + + // cancel pending calls to accept(), but don't close the sockets + boost::system::error_code ec; + for (auto& l : listeners) { + l.acceptor.cancel(ec); + } + + // pause and wait for outstanding requests to complete + pause_mutex.lock(ec); + + if (ec) { + ldout(ctx(), 1) << "frontend failed to pause: " << ec.message() << dendl; + } else { + ldout(ctx(), 4) << "frontend paused" << dendl; + } +} + +void AsioFrontend::unpause(RGWRados* const store, + rgw_auth_registry_ptr_t auth_registry) +{ + env.store = store; + env.auth_registry = std::move(auth_registry); + + // unpause to unblock connections + pause_mutex.unlock(); + + // start accepting connections again + for (auto& l : listeners) { + l.acceptor.async_accept(l.socket, + [this, &l] (boost::system::error_code ec) { + accept(l, ec); + }); + } + + ldout(ctx(), 4) << "frontend unpaused" << dendl; +} + +} // anonymous namespace + +class RGWAsioFrontend::Impl : public AsioFrontend { + public: + Impl(const RGWProcessEnv& env, RGWFrontendConfig* conf, + rgw::dmclock::SchedulerCtx& sched_ctx) + : AsioFrontend(env, conf, sched_ctx) {} +}; + +RGWAsioFrontend::RGWAsioFrontend(const RGWProcessEnv& env, + RGWFrontendConfig* conf, + rgw::dmclock::SchedulerCtx& sched_ctx) + : impl(new Impl(env, conf, sched_ctx)) +{ +} + +RGWAsioFrontend::~RGWAsioFrontend() = default; + +int RGWAsioFrontend::init() +{ + return impl->init(); +} + +int RGWAsioFrontend::run() +{ + return impl->run(); +} + +void RGWAsioFrontend::stop() +{ + impl->stop(); +} + +void RGWAsioFrontend::join() +{ + impl->join(); +} + +void RGWAsioFrontend::pause_for_new_config() +{ + impl->pause(); +} + +void RGWAsioFrontend::unpause_with_new_config( + RGWRados* const store, + rgw_auth_registry_ptr_t auth_registry +) { + impl->unpause(store, std::move(auth_registry)); +} diff --git a/src/rgw/rgw_asio_frontend.h b/src/rgw/rgw_asio_frontend.h new file mode 100644 index 00000000..857910bb --- /dev/null +++ b/src/rgw/rgw_asio_frontend.h @@ -0,0 +1,28 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_ASIO_FRONTEND_H +#define RGW_ASIO_FRONTEND_H + +#include +#include "rgw_frontend.h" + +class RGWAsioFrontend : public RGWFrontend { + class Impl; + std::unique_ptr impl; +public: + RGWAsioFrontend(const RGWProcessEnv& env, RGWFrontendConfig* conf, + rgw::dmclock::SchedulerCtx& sched_ctx); + ~RGWAsioFrontend() override; + + int init() override; + int run() override; + void stop() override; + void join() override; + + void pause_for_new_config() override; + void unpause_with_new_config(RGWRados *store, + rgw_auth_registry_ptr_t auth_registry) override; +}; + +#endif // RGW_ASIO_FRONTEND_H diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc new file mode 100644 index 00000000..a6f84b22 --- /dev/null +++ b/src/rgw/rgw_auth.cc @@ -0,0 +1,722 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "rgw_common.h" +#include "rgw_auth.h" +#include "rgw_quota.h" +#include "rgw_user.h" +#include "rgw_http_client.h" +#include "rgw_keystone.h" + +#include "include/str_list.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + + +namespace rgw { +namespace auth { + +std::unique_ptr +transform_old_authinfo(const req_state* const s) +{ + /* This class is not intended for public use. Should be removed altogether + * with this function after moving all our APIs to the new authentication + * infrastructure. */ + class DummyIdentityApplier : public rgw::auth::Identity { + CephContext* const cct; + + /* For this particular case it's OK to use rgw_user structure to convey + * the identity info as this was the policy for doing that before the + * new auth. 
*/ + const rgw_user id; + const int perm_mask; + const bool is_admin; + const uint32_t type; + public: + DummyIdentityApplier(CephContext* const cct, + const rgw_user& auth_id, + const int perm_mask, + const bool is_admin, + const uint32_t type) + : cct(cct), + id(auth_id), + perm_mask(perm_mask), + is_admin(is_admin), + type(type) { + } + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return rgw_perms_from_aclspec_default_strategy(id, aclspec); + } + + bool is_admin_of(const rgw_user& acct_id) const override { + return is_admin; + } + + bool is_owner_of(const rgw_user& acct_id) const override { + return id == acct_id; + } + + bool is_identity(const idset_t& ids) const override { + for (auto& p : ids) { + if (p.is_wildcard()) { + return true; + } else if (p.is_tenant() && p.get_tenant() == id.tenant) { + return true; + } else if (p.is_user() && + (p.get_tenant() == id.tenant) && + (p.get_id() == id.id)) { + return true; + } + } + return false; + } + + uint32_t get_perm_mask() const override { + return perm_mask; + } + + uint32_t get_identity_type() const override { + return type; + } + + string get_acct_name() const override { + return {}; + } + + string get_subuser() const override { + return {}; + } + + void to_str(std::ostream& out) const override { + out << "RGWDummyIdentityApplier(auth_id=" << id + << ", perm_mask=" << perm_mask + << ", is_admin=" << is_admin << ")"; + } + }; + + return std::unique_ptr( + new DummyIdentityApplier(s->cct, + s->user->user_id, + s->perm_mask, + /* System user has admin permissions by default - it's supposed to pass + * through any security check. 
*/ + s->system_request, + s->user->type)); +} + +} /* namespace auth */ +} /* namespace rgw */ + + +uint32_t rgw_perms_from_aclspec_default_strategy( + const rgw_user& uid, + const rgw::auth::Identity::aclspec_t& aclspec) +{ + dout(5) << "Searching permissions for uid=" << uid << dendl; + + const auto iter = aclspec.find(uid.to_str()); + if (std::end(aclspec) != iter) { + dout(5) << "Found permission: " << iter->second << dendl; + return iter->second; + } + + dout(5) << "Permissions for user not found" << dendl; + return 0; +} + + +static inline const std::string make_spec_item(const std::string& tenant, + const std::string& id) +{ + return tenant + ":" + id; +} + + +static inline std::pair +strategy_handle_rejected(rgw::auth::Engine::result_t&& engine_result, + const rgw::auth::Strategy::Control policy, + rgw::auth::Engine::result_t&& strategy_result) +{ + using Control = rgw::auth::Strategy::Control; + switch (policy) { + case Control::REQUISITE: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::SUFFICIENT: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::FALLBACK: + /* Don't try next. */ + return std::make_pair(false, std::move(strategy_result)); + + default: + /* Huh, memory corruption? */ + ceph_abort(); + } +} + +static inline std::pair +strategy_handle_denied(rgw::auth::Engine::result_t&& engine_result, + const rgw::auth::Strategy::Control policy, + rgw::auth::Engine::result_t&& strategy_result) +{ + using Control = rgw::auth::Strategy::Control; + switch (policy) { + case Control::REQUISITE: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::SUFFICIENT: + /* Just try next. */ + return std::make_pair(true, std::move(engine_result)); + + case Control::FALLBACK: + return std::make_pair(true, std::move(strategy_result)); + + default: + /* Huh, memory corruption? 
*/ + ceph_abort(); + } +} + +static inline std::pair +strategy_handle_granted(rgw::auth::Engine::result_t&& engine_result, + const rgw::auth::Strategy::Control policy, + rgw::auth::Engine::result_t&& strategy_result) +{ + using Control = rgw::auth::Strategy::Control; + switch (policy) { + case Control::REQUISITE: + /* Try next. */ + return std::make_pair(true, std::move(engine_result)); + + case Control::SUFFICIENT: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + case Control::FALLBACK: + /* Don't try next. */ + return std::make_pair(false, std::move(engine_result)); + + default: + /* Huh, memory corruption? */ + ceph_abort(); + } +} + +rgw::auth::Engine::result_t +rgw::auth::Strategy::authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const +{ + result_t strategy_result = result_t::deny(); + + for (const stack_item_t& kv : auth_stack) { + const rgw::auth::Engine& engine = kv.first; + const auto& policy = kv.second; + + ldpp_dout(dpp, 20) << get_name() << ": trying " << engine.get_name() << dendl; + + result_t engine_result = result_t::deny(); + try { + engine_result = engine.authenticate(dpp, s); + } catch (const int err) { + engine_result = result_t::deny(err); + } + + bool try_next = true; + switch (engine_result.get_status()) { + case result_t::Status::REJECTED: { + ldpp_dout(dpp, 20) << engine.get_name() << " rejected with reason=" + << engine_result.get_reason() << dendl; + + std::tie(try_next, strategy_result) = \ + strategy_handle_rejected(std::move(engine_result), policy, + std::move(strategy_result)); + break; + } + case result_t::Status::DENIED: { + ldpp_dout(dpp, 20) << engine.get_name() << " denied with reason=" + << engine_result.get_reason() << dendl; + + std::tie(try_next, strategy_result) = \ + strategy_handle_denied(std::move(engine_result), policy, + std::move(strategy_result)); + break; + } + case result_t::Status::GRANTED: { + ldpp_dout(dpp, 20) << engine.get_name() << " granted access" 
<< dendl; + + std::tie(try_next, strategy_result) = \ + strategy_handle_granted(std::move(engine_result), policy, + std::move(strategy_result)); + break; + } + default: { + ceph_abort(); + } + } + + if (! try_next) { + break; + } + } + + return strategy_result; +} + +int +rgw::auth::Strategy::apply(const DoutPrefixProvider *dpp, const rgw::auth::Strategy& auth_strategy, + req_state* const s) noexcept +{ + try { + auto result = auth_strategy.authenticate(dpp, s); + if (result.get_status() != decltype(result)::Status::GRANTED) { + /* Access denied is acknowledged by returning a std::unique_ptr with + * nullptr inside. */ + ldpp_dout(dpp, 5) << "Failed the auth strategy, reason=" + << result.get_reason() << dendl; + return result.get_reason(); + } + + try { + rgw::auth::IdentityApplier::aplptr_t applier = result.get_applier(); + rgw::auth::Completer::cmplptr_t completer = result.get_completer(); + + /* Account used by a given RGWOp is decoupled from identity employed + * in the authorization phase (RGWOp::verify_permissions). */ + applier->load_acct_info(dpp, *s->user); + s->perm_mask = applier->get_perm_mask(); + + /* This is the single place where we pass req_state as a pointer + * to non-const and thus its modification is allowed. In the time + * of writing only RGWTempURLEngine needed that feature. */ + applier->modify_request_state(dpp, s); + if (completer) { + completer->modify_request_state(dpp, s); + } + + s->auth.identity = std::move(applier); + s->auth.completer = std::move(completer); + + return 0; + } catch (const int err) { + ldpp_dout(dpp, 5) << "applier throwed err=" << err << dendl; + return err; + } + } catch (const int err) { + ldpp_dout(dpp, 5) << "auth engine throwed err=" << err << dendl; + return err; + } + + /* We never should be here. 
*/ + return -EPERM; +} + +void +rgw::auth::Strategy::add_engine(const Control ctrl_flag, + const Engine& engine) noexcept +{ + auth_stack.push_back(std::make_pair(std::cref(engine), ctrl_flag)); +} + +void rgw::auth::WebIdentityApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::WebIdentityApplier(sub =" << token_claims.sub + << ", user_name=" << token_claims.user_name + << ", aud =" << token_claims.aud + << ", provider_id =" << token_claims.iss << ")"; +} + +string rgw::auth::WebIdentityApplier::get_idp_url() const +{ + string idp_url = token_claims.iss; + auto pos = idp_url.find("http://"); + if (pos == std::string::npos) { + pos = idp_url.find("https://"); + if (pos != std::string::npos) { + idp_url.erase(pos, 8); + } + } else { + idp_url.erase(pos, 7); + } + return idp_url; +} + +void rgw::auth::WebIdentityApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const +{ + s->info.args.append("sub", token_claims.sub); + s->info.args.append("aud", token_claims.aud); + s->info.args.append("provider_id", token_claims.iss); + + string idp_url = get_idp_url(); + string condition = idp_url + ":app_id"; + s->env.emplace(condition, token_claims.aud); +} + +bool rgw::auth::WebIdentityApplier::is_identity(const idset_t& ids) const +{ + if (ids.size() > 1) { + return false; + } + + for (auto id : ids) { + string idp_url = get_idp_url(); + if (id.is_oidc_provider() && id.get_idp_url() == idp_url) { + return true; + } + } + return false; +} + +/* rgw::auth::RemoteAuthApplier */ +uint32_t rgw::auth::RemoteApplier::get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const +{ + uint32_t perm = 0; + + /* For backward compatibility with ACLOwner. */ + perm |= rgw_perms_from_aclspec_default_strategy(info.acct_user, + aclspec); + + /* We also need to cover cases where rgw_keystone_implicit_tenants + * was enabled. 
*/ + if (info.acct_user.tenant.empty()) { + const rgw_user tenanted_acct_user(info.acct_user.id, info.acct_user.id); + + perm |= rgw_perms_from_aclspec_default_strategy(tenanted_acct_user, + aclspec); + } + + /* Now it's a time for invoking additional strategy that was supplied by + * a specific auth engine. */ + if (extra_acl_strategy) { + perm |= extra_acl_strategy(aclspec); + } + + ldpp_dout(dpp, 20) << "from ACL got perm=" << perm << dendl; + return perm; +} + +bool rgw::auth::RemoteApplier::is_admin_of(const rgw_user& uid) const +{ + return info.is_admin; +} + +bool rgw::auth::RemoteApplier::is_owner_of(const rgw_user& uid) const +{ + if (info.acct_user.tenant.empty()) { + const rgw_user tenanted_acct_user(info.acct_user.id, info.acct_user.id); + + if (tenanted_acct_user == uid) { + return true; + } + } + + return info.acct_user == uid; +} + +bool rgw::auth::RemoteApplier::is_identity(const idset_t& ids) const { + for (auto& id : ids) { + if (id.is_wildcard()) { + return true; + + // We also need to cover cases where rgw_keystone_implicit_tenants + // was enabled. */ + } else if (id.is_tenant() && + (info.acct_user.tenant.empty() ? + info.acct_user.id : + info.acct_user.tenant) == id.get_tenant()) { + return true; + } else if (id.is_user() && + info.acct_user.id == id.get_id() && + (info.acct_user.tenant.empty() ? 
+ info.acct_user.id : + info.acct_user.tenant) == id.get_tenant()) { + return true; + } + } + return false; +} + +void rgw::auth::RemoteApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::RemoteApplier(acct_user=" << info.acct_user + << ", acct_name=" << info.acct_name + << ", perm_mask=" << info.perm_mask + << ", is_admin=" << info.is_admin << ")"; +} + +void rgw::auth::ImplicitTenants::recompute_value(const ConfigProxy& c) +{ + std::string s = c.get_val("rgw_keystone_implicit_tenants"); + int v = 0; + if (boost::iequals(s, "both") + || boost::iequals(s, "true") + || boost::iequals(s, "1")) { + v = IMPLICIT_TENANTS_S3|IMPLICIT_TENANTS_SWIFT; + } else if (boost::iequals(s, "0") + || boost::iequals(s, "none") + || boost::iequals(s, "false")) { + v = 0; + } else if (boost::iequals(s, "s3")) { + v = IMPLICIT_TENANTS_S3; + } else if (boost::iequals(s, "swift")) { + v = IMPLICIT_TENANTS_SWIFT; + } else { /* "" (and anything else) */ + v = IMPLICIT_TENANTS_BAD; + // assert(0); + } + saved = v; +} + +const char **rgw::auth::ImplicitTenants::get_tracked_conf_keys() const +{ + static const char *keys[] = { + "rgw_keystone_implicit_tenants", + nullptr }; + return keys; +} + +void rgw::auth::ImplicitTenants::handle_conf_change(const ConfigProxy& c, + const std::set &changed) +{ + if (changed.count("rgw_keystone_implicit_tenants")) { + recompute_value(c); + } +} + +void rgw::auth::RemoteApplier::create_account(const DoutPrefixProvider* dpp, + const rgw_user& acct_user, + bool implicit_tenant, + RGWUserInfo& user_info) const /* out */ +{ + rgw_user new_acct_user = acct_user; + + if (info.acct_type) { + //ldap/keystone for s3 users + user_info.type = info.acct_type; + } + + /* An upper layer may enforce creating new accounts within their own + * tenants. 
*/ + if (new_acct_user.tenant.empty() && implicit_tenant) { + new_acct_user.tenant = new_acct_user.id; + } + + user_info.user_id = new_acct_user; + user_info.display_name = info.acct_name; + + user_info.max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + rgw_apply_default_bucket_quota(user_info.bucket_quota, cct->_conf); + rgw_apply_default_user_quota(user_info.user_quota, cct->_conf); + + int ret = rgw_store_user_info(store, user_info, nullptr, nullptr, + real_time(), true); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to store new user info: user=" + << user_info.user_id << " ret=" << ret << dendl; + throw ret; + } +} + +/* TODO(rzarzynski): we need to handle display_name changes. */ +void rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +{ + /* It's supposed that RGWRemoteAuthApplier tries to load account info + * that belongs to the authenticated identity. Another policy may be + * applied by using a RGWThirdPartyAccountAuthApplier decorator. */ + const rgw_user& acct_user = info.acct_user; + auto implicit_value = implicit_tenant_context.get_value(); + bool implicit_tenant = implicit_value.implicit_tenants_for_(implicit_tenant_bit); + bool split_mode = implicit_value.is_split_mode(); + + /* Normally, empty "tenant" field of acct_user means the authenticated + * identity has the legacy, global tenant. However, due to inclusion + * of multi-tenancy, we got some special compatibility kludge for remote + * backends like Keystone. + * If the global tenant is the requested one, we try the same tenant as + * the user name first. If that RGWUserInfo exists, we use it. This way, + * migrated OpenStack users can get their namespaced containers and nobody's + * the wiser. + * If that fails, we look up in the requested (possibly empty) tenant. + * If that fails too, we create the account within the global or separated + * namespace depending on rgw_keystone_implicit_tenants. 
+ * For compatibility with previous versions of ceph, it is possible + * to enable implicit_tenants for only s3 or only swift. + * in this mode ("split_mode"), we must constrain the id lookups to + * only use the identifier space that would be used if the id were + * to be created. */ + + if (split_mode && !implicit_tenant) + ; /* suppress lookup for id used by "other" protocol */ + else if (acct_user.tenant.empty()) { + const rgw_user tenanted_uid(acct_user.id, acct_user.id); + + if (rgw_get_user_info_by_uid(store, tenanted_uid, user_info) >= 0) { + /* Succeeded. */ + return; + } + } + + if (split_mode && implicit_tenant) + ; /* suppress lookup for id used by "other" protocol */ + else if (rgw_get_user_info_by_uid(store, acct_user, user_info) >= 0) { + /* Succeeded. */ + return; + } + + ldout(cct, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl; + create_account(dpp, acct_user, implicit_tenant, user_info); + + /* Succeeded if we are here (create_account() hasn't throwed). 
*/ +} + +/* rgw::auth::LocalApplier */ +/* static declaration */ +const std::string rgw::auth::LocalApplier::NO_SUBUSER; + +uint32_t rgw::auth::LocalApplier::get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const +{ + return rgw_perms_from_aclspec_default_strategy(user_info.user_id, aclspec); +} + +bool rgw::auth::LocalApplier::is_admin_of(const rgw_user& uid) const +{ + return user_info.admin || user_info.system; +} + +bool rgw::auth::LocalApplier::is_owner_of(const rgw_user& uid) const +{ + return uid == user_info.user_id; +} + +bool rgw::auth::LocalApplier::is_identity(const idset_t& ids) const { + for (auto& id : ids) { + if (id.is_wildcard()) { + return true; + } else if (id.is_tenant() && + id.get_tenant() == user_info.user_id.tenant) { + return true; + } else if (id.is_user() && + (id.get_tenant() == user_info.user_id.tenant)) { + if (id.get_id() == user_info.user_id.id) { + return true; + } + std::string wildcard_subuser = user_info.user_id.id; + wildcard_subuser.append(":*"); + if (wildcard_subuser == id.get_id()) { + return true; + } else if (subuser != NO_SUBUSER) { + std::string user = user_info.user_id.id; + user.append(":"); + user.append(subuser); + if (user == id.get_id()) { + return true; + } + } + } + } + return false; +} + +void rgw::auth::LocalApplier::to_str(std::ostream& out) const { + out << "rgw::auth::LocalApplier(acct_user=" << user_info.user_id + << ", acct_name=" << user_info.display_name + << ", subuser=" << subuser + << ", perm_mask=" << get_perm_mask() + << ", is_admin=" << static_cast(user_info.admin) << ")"; +} + +uint32_t rgw::auth::LocalApplier::get_perm_mask(const std::string& subuser_name, + const RGWUserInfo &uinfo) const +{ + if (! subuser_name.empty() && subuser_name != NO_SUBUSER) { + const auto iter = uinfo.subusers.find(subuser_name); + + if (iter != std::end(uinfo.subusers)) { + return iter->second.perm_mask; + } else { + /* Subuser specified but not found. 
*/ + return RGW_PERM_NONE; + } + } else { + /* Due to backward compatibility. */ + return RGW_PERM_FULL_CONTROL; + } +} + +void rgw::auth::LocalApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +{ + /* Load the account that belongs to the authenticated identity. An extra call + * to RADOS may be safely skipped in this case. */ + user_info = this->user_info; +} + +void rgw::auth::RoleApplier::to_str(std::ostream& out) const { + out << "rgw::auth::LocalApplier(role name =" << role_name; + for (auto policy : role_policies) { + out << ", role policy =" << policy; + } + out << ")"; +} + +bool rgw::auth::RoleApplier::is_identity(const idset_t& ids) const { + for (auto& p : ids) { + string name; + string tenant = p.get_tenant(); + if (tenant.empty()) { + name = p.get_id(); + } else { + name = tenant + "$" + p.get_id(); + } + if (p.is_wildcard()) { + return true; + } else if (p.is_role() && name == role_name) { + return true; + } + } + return false; +} + +void rgw::auth::RoleApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +{ + /* Load the user id */ + user_info.user_id = this->user_id; +} + +void rgw::auth::RoleApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const +{ + for (auto it : role_policies) { + try { + bufferlist bl = bufferlist::static_from_string(it); + const rgw::IAM::Policy p(s->cct, s->user->user_id.tenant, bl); + s->iam_user_policies.push_back(std::move(p)); + } catch (rgw::IAM::PolicyParseException& e) { + //Control shouldn't reach here as the policy has already been + //verified earlier + ldpp_dout(dpp, 20) << "failed to parse policy: " << e.what() << dendl; + } + } +} + +rgw::auth::Engine::result_t +rgw::auth::AnonymousEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const +{ + if (! 
is_applicable(s)) { + return result_t::deny(-EPERM); + } else { + RGWUserInfo user_info; + rgw_get_anon_user(user_info); + + auto apl = \ + apl_factory->create_apl_local(cct, s, user_info, + rgw::auth::LocalApplier::NO_SUBUSER, + boost::none); + return result_t::grant(std::move(apl)); + } +} diff --git a/src/rgw/rgw_auth.h b/src/rgw/rgw_auth.h new file mode 100644 index 00000000..be7a102a --- /dev/null +++ b/src/rgw/rgw_auth.h @@ -0,0 +1,696 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#ifndef CEPH_RGW_AUTH_H +#define CEPH_RGW_AUTH_H + +#include +#include +#include +#include +#include + +#include "rgw_common.h" +#include "rgw_keystone.h" +#include "rgw_web_idp.h" + +#define RGW_USER_ANON_ID "anonymous" + +namespace rgw { +namespace auth { + +using Exception = std::system_error; + + +/* Load information about identity that will be used by RGWOp to authorize + * any operation that comes from an authenticated user. */ +class Identity { +public: + typedef std::map aclspec_t; + using idset_t = boost::container::flat_set; + + virtual ~Identity() = default; + + /* Translate the ACL provided in @aclspec into concrete permission set that + * can be used during the authorization phase (RGWOp::verify_permission). + * On error throws rgw::auth::Exception storing the reason. + * + * NOTE: an implementation is responsible for giving the real semantic to + * the items in @aclspec. That is, their meaning may depend on particular + * applier that is being used. */ + virtual uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const = 0; + + /* Verify whether a given identity *can be treated as* an admin of rgw_user + * (account in Swift's terminology) specified in @uid. On error throws + * rgw::auth::Exception storing the reason. 
*/ + virtual bool is_admin_of(const rgw_user& uid) const = 0; + + /* Verify whether a given identity *is* the owner of the rgw_user (account + * in the Swift's terminology) specified in @uid. On internal error throws + * rgw::auth::Exception storing the reason. */ + virtual bool is_owner_of(const rgw_user& uid) const = 0; + + /* Return the permission mask that is used to narrow down the set of + * operations allowed for a given identity. This method reflects the idea + * of subuser tied to RGWUserInfo. On error throws rgw::auth::Exception + * with the reason. */ + virtual uint32_t get_perm_mask() const = 0; + + virtual bool is_anonymous() const { + /* If the identity owns the anonymous account (rgw_user), it's considered + * the anonymous identity. On error throws rgw::auth::Exception storing + * the reason. */ + return is_owner_of(rgw_user(RGW_USER_ANON_ID)); + } + + virtual void to_str(std::ostream& out) const = 0; + + /* Verify whether a given identity corresponds to an identity in the + provided set */ + virtual bool is_identity(const idset_t& ids) const = 0; + + /* Identity Type: RGW/ LDAP/ Keystone */ + virtual uint32_t get_identity_type() const = 0; + + /* Name of Account */ + virtual string get_acct_name() const = 0; + + /* Subuser of Account */ + virtual string get_subuser() const = 0; +}; + +inline std::ostream& operator<<(std::ostream& out, + const rgw::auth::Identity& id) { + id.to_str(out); + return out; +} + + +std::unique_ptr transform_old_authinfo(const req_state* const s); + + +/* Interface for classes applying changes to request state/RADOS store + * imposed by a particular rgw::auth::Engine. + * + * In contrast to rgw::auth::Engine, implementations of this interface + * are allowed to handle req_state or RGWRados in the read-write manner. + * + * It's expected that most (if not all) of implementations will also + * conform to rgw::auth::Identity interface to provide authorization + * policy (ACLs, account's ownership and entitlement). 
*/ +class IdentityApplier : public Identity { +public: + typedef std::unique_ptr aplptr_t; + + virtual ~IdentityApplier() {}; + + /* Fill provided RGWUserInfo with information about the account that + * RGWOp will operate on. Errors are handled solely through exceptions. + * + * XXX: be aware that the "account" term refers to rgw_user. The naming + * is legacy. */ + virtual void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const = 0; /* out */ + + /* Apply any changes to request state. This method will be most useful for + * TempURL of Swift API. */ + virtual void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const {} /* in/out */ +}; + + +/* Interface class for completing the two-step authentication process. + * Completer provides the second step - the complete() method that should + * be called after Engine::authenticate() but before *committing* results + * of an RGWOp (or sending a response in the case of non-mutating ops). + * + * The motivation driving the interface is to address those authentication + * schemas that require message integrity verification *without* in-memory + * data buffering. Typical examples are AWS Auth v4 and the auth mechanism + * of browser uploads facilities both in S3 and Swift APIs (see RGWPostObj). + * The workflow of request from the authentication point-of-view does look + * like following one: + * A. authenticate (Engine::authenticate), + * B. authorize (see RGWOp::verify_permissions), + * C. execute-prepare (init potential data modifications), + * D. authenticate-complete - (Completer::complete), + * E. execute-commit - commit the modifications from point C. */ +class Completer { +public: + /* It's expected that Completers would tend to implement many interfaces + * and be used not only in req_state::auth::completer. Ref counting their + * instances would be helpful. 
*/ + typedef std::shared_ptr cmplptr_t; + + virtual ~Completer() = default; + + /* Complete the authentication process. Return boolean indicating whether + * the completion succeeded. On error throws rgw::auth::Exception storing + * the reason. */ + virtual bool complete() = 0; + + /* Apply any changes to request state. The initial use case was injecting + * the AWSv4 filter over rgw::io::RestfulClient in req_state. */ + virtual void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) = 0; /* in/out */ +}; + + +/* Interface class for authentication backends (auth engines) in RadosGW. + * + * An engine is supposed only to authenticate (not authorize!) requests + * basing on their req_state and - if access has been granted - provide + * an upper layer with: + * - rgw::auth::IdentityApplier to commit all changes to the request state as + * well as to the RADOS store (creating an account, synchronizing + * user-related information with external databases and so on). + * - rgw::auth::Completer (optionally) to finish the authentication + * of the request. Typical use case is verifying message integrity + * in AWS Auth v4 and browser uploads (RGWPostObj). + * + * Both of them are supposed to be wrapped in Engine::AuthResult. + * + * The authentication process consists of two steps: + * - Engine::authenticate() which should be called before *initiating* + * any modifications to RADOS store that are related to an operation + * a client wants to perform (RGWOp::execute). + * - Completer::complete() supposed to be called, if completer has been + * returned, after the authenticate() step but before *committing* + * those modifications or sending a response (RGWOp::complete). + * + * An engine outlives both Applier and Completer. It's intended to live + * since RadosGW's initialization and handle multiple requests till + * a reconfiguration. + * + * Auth engine MUST NOT make any changes to req_state nor RADOS store. + * This is solely an Applier's responsibility! 
+ * + * Separation between authentication and global state modification has + * been introduced because many auth engines are orthogonal to appliers + * and thus they can be decoupled. Additional motivation is to clearly + * distinguish all portions of code modifying data structures. */ +class Engine { +public: + virtual ~Engine() = default; + + class AuthResult { + struct rejection_mark_t {}; + bool is_rejected = false; + int reason = 0; + + std::pair result_pair; + + explicit AuthResult(const int reason) + : reason(reason) { + } + + AuthResult(rejection_mark_t&&, const int reason) + : is_rejected(true), + reason(reason) { + } + + /* Allow only the reasonable combintations - returning just Completer + * without accompanying IdentityApplier is strictly prohibited! */ + explicit AuthResult(IdentityApplier::aplptr_t&& applier) + : result_pair(std::move(applier), nullptr) { + } + + AuthResult(IdentityApplier::aplptr_t&& applier, + Completer::cmplptr_t&& completer) + : result_pair(std::move(applier), std::move(completer)) { + } + + public: + enum class Status { + /* Engine doesn't grant the access but also doesn't reject it. */ + DENIED, + + /* Engine successfully authenicated requester. */ + GRANTED, + + /* Engine strictly indicates that a request should be rejected + * without trying any further engine. */ + REJECTED + }; + + Status get_status() const { + if (is_rejected) { + return Status::REJECTED; + } else if (! 
result_pair.first) { + return Status::DENIED; + } else { + return Status::GRANTED; + } + } + + int get_reason() const { + return reason; + } + + IdentityApplier::aplptr_t get_applier() { + return std::move(result_pair.first); + } + + Completer::cmplptr_t&& get_completer() { + return std::move(result_pair.second); + } + + static AuthResult reject(const int reason = -EACCES) { + return AuthResult(rejection_mark_t(), reason); + } + + static AuthResult deny(const int reason = -EACCES) { + return AuthResult(reason); + } + + static AuthResult grant(IdentityApplier::aplptr_t&& applier) { + return AuthResult(std::move(applier)); + } + + static AuthResult grant(IdentityApplier::aplptr_t&& applier, + Completer::cmplptr_t&& completer) { + return AuthResult(std::move(applier), std::move(completer)); + } + }; + + using result_t = AuthResult; + + /* Get name of the auth engine. */ + virtual const char* get_name() const noexcept = 0; + + /* Throwing method for identity verification. When the check is positive + * an implementation should return Engine::result_t containing: + * - a non-null pointer to an object conforming the Applier interface. + * Otherwise, the authentication is treated as failed. + * - a (potentially null) pointer to an object conforming the Completer + * interface. + * + * On error throws rgw::auth::Exception containing the reason. */ + virtual result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s) const = 0; +}; + + +/* Interface for extracting a token basing from data carried by req_state. */ +class TokenExtractor { +public: + virtual ~TokenExtractor() = default; + virtual std::string get_token(const req_state* s) const = 0; +}; + + +/* Abstract class for stacking sub-engines to expose them as a single + * Engine. It is responsible for ordering its sub-engines and managing + * fall-backs between them. 
Derivatee is supposed to encapsulate engine + * instances and add them using the add_engine() method in the order it + * wants to be tried during the call to authenticate(). + * + * Each new Strategy should be exposed to StrategyRegistry for handling + * the dynamic reconfiguration. */ +class Strategy : public Engine { +public: + /* Specifiers controlling what happens when an associated engine fails. + * The names and semantic has been borrowed mostly from libpam. */ + enum class Control { + /* Failure of an engine injected with the REQUISITE specifier aborts + * the strategy's authentication process immediately. No other engine + * will be tried. */ + REQUISITE, + + /* Success of an engine injected with the SUFFICIENT specifier ends + * strategy's authentication process successfully. However, denying + * doesn't abort it -- there will be fall-back to following engine + * if the one that failed wasn't the last one. */ + SUFFICIENT, + + /* Like SUFFICIENT with the exception that on failure the reason code + * is not overridden. Instead, it's taken directly from the last tried + * non-FALLBACK engine. If there was no previous non-FALLBACK engine + * in a Strategy, then the result_t::deny(reason = -EACCES) is used. */ + FALLBACK, + }; + + Engine::result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s) const override final; + + bool is_empty() const { + return auth_stack.empty(); + } + + static int apply(const DoutPrefixProvider* dpp, const Strategy& auth_strategy, req_state* s) noexcept; + +private: + /* Using the reference wrapper here to explicitly point out we are not + * interested in storing nulls while preserving the dynamic polymorphism. */ + using stack_item_t = std::pair, + Control>; + std::vector auth_stack; + +protected: + void add_engine(Control ctrl_flag, const Engine& engine) noexcept; +}; + + +/* A class aggregating the knowledge about all Strategies in RadosGW. It is + * responsible for handling the dynamic reconfiguration on e.g. 
realm update. + * The definition is in rgw/rgw_auth_registry.h, + * + * Each new Strategy should be exposed to it. */ +class StrategyRegistry; + +class WebIdentityApplier : public IdentityApplier { +protected: + CephContext* const cct; + RGWRados* const store; + rgw::web_idp::WebTokenClaims token_claims; + + string get_idp_url() const; + +public: + WebIdentityApplier( CephContext* const cct, + RGWRados* const store, + const rgw::web_idp::WebTokenClaims& token_claims) + : cct(cct), + store(store), + token_claims(token_claims) { + } + + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override { + user_info.user_id = rgw_user(token_claims.sub); + user_info.display_name = token_claims.user_name; + } + + void modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const override; + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return RGW_PERM_NONE; + } + + bool is_admin_of(const rgw_user& uid) const override { + return false; + } + + bool is_owner_of(const rgw_user& uid) const override { + return false; + } + + uint32_t get_perm_mask() const override { + return RGW_PERM_NONE; + } + + void to_str(std::ostream& out) const override; + + bool is_identity(const idset_t& ids) const override; + + uint32_t get_identity_type() const override { + return TYPE_WEB; + } + + string get_acct_name() const override { + return token_claims.user_name; + } + + string get_subuser() const override { + return {}; + } + + struct Factory { + virtual ~Factory() {} + + virtual aplptr_t create_apl_web_identity( CephContext* cct, + const req_state* s, + const rgw::web_idp::WebTokenClaims& token) const = 0; + }; +}; + +class ImplicitTenants: public md_config_obs_t { +public: + enum implicit_tenant_flag_bits {IMPLICIT_TENANTS_SWIFT=1, + IMPLICIT_TENANTS_S3=2, IMPLICIT_TENANTS_BAD = -1, }; +private: + int saved; + void recompute_value(const ConfigProxy& ); + class ImplicitTenantValue { + friend 
class ImplicitTenants; + int v; + ImplicitTenantValue(int v) : v(v) {}; + public: + bool inline is_split_mode() + { + assert(v != IMPLICIT_TENANTS_BAD); + return v == IMPLICIT_TENANTS_SWIFT || v == IMPLICIT_TENANTS_S3; + } + bool inline implicit_tenants_for_(const implicit_tenant_flag_bits bit) + { + assert(v != IMPLICIT_TENANTS_BAD); + return static_cast(v&bit); + } + }; +public: + ImplicitTenants(const ConfigProxy& c) { recompute_value(c);} + ImplicitTenantValue get_value() { + return ImplicitTenantValue(saved); + } +private: + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) override; +}; + +std::tuple implicit_tenants_enabled_for_swift(CephContext * const cct); +std::tuple implicit_tenants_enabled_for_s3(CephContext * const cct); + +/* rgw::auth::RemoteApplier targets those authentication engines which don't + * need to ask the RADOS store while performing the auth process. Instead, + * they obtain credentials from an external source like Keystone or LDAP. + * + * As the authenticated user may not have an account yet, RGWRemoteAuthApplier + * must be able to create it basing on data passed by an auth engine. Those + * data will be used to fill RGWUserInfo structure. 
*/ +class RemoteApplier : public IdentityApplier { +public: + class AuthInfo { + friend class RemoteApplier; + protected: + const rgw_user acct_user; + const std::string acct_name; + const uint32_t perm_mask; + const bool is_admin; + const uint32_t acct_type; + + public: + enum class acct_privilege_t { + IS_ADMIN_ACCT, + IS_PLAIN_ACCT + }; + + AuthInfo(const rgw_user& acct_user, + const std::string& acct_name, + const uint32_t perm_mask, + const acct_privilege_t level, + const uint32_t acct_type=TYPE_NONE) + : acct_user(acct_user), + acct_name(acct_name), + perm_mask(perm_mask), + is_admin(acct_privilege_t::IS_ADMIN_ACCT == level), + acct_type(acct_type) { + } + }; + + using aclspec_t = rgw::auth::Identity::aclspec_t; + typedef std::function acl_strategy_t; + +protected: + CephContext* const cct; + + /* Read-write is intensional here due to RGWUserInfo creation process. */ + RGWRados* const store; + + /* Supplemental strategy for extracting permissions from ACLs. Its results + * will be combined (ORed) with a default strategy that is responsible for + * handling backward compatibility. 
*/ + const acl_strategy_t extra_acl_strategy; + + const AuthInfo info; + rgw::auth::ImplicitTenants& implicit_tenant_context; + const rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit; + + virtual void create_account(const DoutPrefixProvider* dpp, + const rgw_user& acct_user, + bool implicit_tenant, + RGWUserInfo& user_info) const; /* out */ + +public: + RemoteApplier(CephContext* const cct, + RGWRados* const store, + acl_strategy_t&& extra_acl_strategy, + const AuthInfo& info, + rgw::auth::ImplicitTenants& implicit_tenant_context, + rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit) + : cct(cct), + store(store), + extra_acl_strategy(std::move(extra_acl_strategy)), + info(info), + implicit_tenant_context(implicit_tenant_context), + implicit_tenant_bit(implicit_tenant_bit) { + } + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override; + bool is_admin_of(const rgw_user& uid) const override; + bool is_owner_of(const rgw_user& uid) const override; + bool is_identity(const idset_t& ids) const override; + + uint32_t get_perm_mask() const override { return info.perm_mask; } + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + uint32_t get_identity_type() const override { return info.acct_type; } + string get_acct_name() const override { return info.acct_name; } + string get_subuser() const override { return {}; } + + struct Factory { + virtual ~Factory() {} + /* Providing r-value reference here is required intensionally. Callee is + * thus disallowed to handle std::function in a way that could inhibit + * the move behaviour (like forgetting about std::moving a l-value). 
*/ + virtual aplptr_t create_apl_remote(CephContext* cct, + const req_state* s, + acl_strategy_t&& extra_acl_strategy, + const AuthInfo &info) const = 0; + }; +}; + + +/* rgw::auth::LocalApplier targets those auth engines that base on the data + * enclosed in the RGWUserInfo control structure. As a side effect of doing + * the authentication process, they must have it loaded. Leveraging this is + * a way to avoid unnecessary calls to underlying RADOS store. */ +class LocalApplier : public IdentityApplier { + using aclspec_t = rgw::auth::Identity::aclspec_t; + +protected: + const RGWUserInfo user_info; + const std::string subuser; + uint32_t perm_mask; + + uint32_t get_perm_mask(const std::string& subuser_name, + const RGWUserInfo &uinfo) const; + +public: + static const std::string NO_SUBUSER; + + LocalApplier(CephContext* const cct, + const RGWUserInfo& user_info, + std::string subuser, + const boost::optional& perm_mask) + : user_info(user_info), + subuser(std::move(subuser)) { + if (perm_mask) { + this->perm_mask = perm_mask.get(); + } else { + this->perm_mask = RGW_PERM_INVALID; + } + } + + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override; + bool is_admin_of(const rgw_user& uid) const override; + bool is_owner_of(const rgw_user& uid) const override; + bool is_identity(const idset_t& ids) const override; + uint32_t get_perm_mask() const override { + if (this->perm_mask == RGW_PERM_INVALID) { + return get_perm_mask(subuser, user_info); + } else { + return this->perm_mask; + } + } + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + uint32_t get_identity_type() const override { return TYPE_RGW; } + string get_acct_name() const override { return {}; } + string get_subuser() const override { return subuser; } + + struct Factory { + virtual ~Factory() {} + virtual aplptr_t create_apl_local(CephContext* cct, + 
const req_state* s, + const RGWUserInfo& user_info, + const std::string& subuser, + const boost::optional& perm_mask) const = 0; + }; +}; + +class RoleApplier : public IdentityApplier { +protected: + const string role_name; + const rgw_user user_id; + vector role_policies; + +public: + + RoleApplier(CephContext* const cct, + const string& role_name, + const rgw_user& user_id, + const vector& role_policies) + : role_name(role_name), + user_id(user_id), + role_policies(role_policies) {} + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return 0; + } + bool is_admin_of(const rgw_user& uid) const override { + return false; + } + bool is_owner_of(const rgw_user& uid) const override { + return false; + } + bool is_identity(const idset_t& ids) const override; + uint32_t get_perm_mask() const override { + return RGW_PERM_NONE; + } + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + uint32_t get_identity_type() const override { return TYPE_ROLE; } + string get_acct_name() const override { return {}; } + string get_subuser() const override { return {}; } + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; + + struct Factory { + virtual ~Factory() {} + virtual aplptr_t create_apl_role( CephContext* cct, + const req_state* s, + const string& role_name, + const rgw_user& user_id, + const vector& role_policies) const = 0; + }; +}; + +/* The anonymous abstract engine. 
*/ +class AnonymousEngine : public Engine { + CephContext* const cct; + const rgw::auth::LocalApplier::Factory* const apl_factory; + +public: + AnonymousEngine(CephContext* const cct, + const rgw::auth::LocalApplier::Factory* const apl_factory) + : cct(cct), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::AnonymousEngine"; + } + + Engine::result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s) const override final; + +protected: + virtual bool is_applicable(const req_state*) const noexcept { + return true; + } +}; + +} /* namespace auth */ +} /* namespace rgw */ + + +uint32_t rgw_perms_from_aclspec_default_strategy( + const rgw_user& uid, + const rgw::auth::Identity::aclspec_t& aclspec); + +#endif /* CEPH_RGW_AUTH_H */ diff --git a/src/rgw/rgw_auth_filters.h b/src/rgw/rgw_auth_filters.h new file mode 100644 index 00000000..58436022 --- /dev/null +++ b/src/rgw/rgw_auth_filters.h @@ -0,0 +1,290 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_AUTH_FILTERS_H +#define CEPH_RGW_AUTH_FILTERS_H + +#include + +#include +#include + +#include "rgw_common.h" +#include "rgw_auth.h" + +namespace rgw { +namespace auth { + +/* Abstract decorator over any implementation of rgw::auth::IdentityApplier + * which could be provided both as a pointer-to-object or the object itself. */ +template +class DecoratedApplier : public rgw::auth::IdentityApplier { + typedef typename std::remove_pointer::type DerefedDecorateeT; + + static_assert(std::is_base_of::value, + "DecorateeT must be a subclass of rgw::auth::IdentityApplier"); + + DecorateeT decoratee; + + /* There is an indirection layer over accessing decoratee to share the same + * code base between dynamic and static decorators. The difference is about + * what we store internally: pointer to a decorated object versus the whole + * object itself. 
Googling for "SFINAE" can help to understand the code. */ + template ::value, T>::type* = nullptr> + DerefedDecorateeT& get_decoratee() { + return *decoratee; + } + + template ::value, T>::type* = nullptr> + DerefedDecorateeT& get_decoratee() { + return decoratee; + } + + template ::value, T>::type* = nullptr> + const DerefedDecorateeT& get_decoratee() const { + return *decoratee; + } + + template ::value, T>::type* = nullptr> + const DerefedDecorateeT& get_decoratee() const { + return decoratee; + } + +public: + explicit DecoratedApplier(DecorateeT&& decoratee) + : decoratee(std::forward(decoratee)) { + } + + uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override { + return get_decoratee().get_perms_from_aclspec(dpp, aclspec); + } + + bool is_admin_of(const rgw_user& uid) const override { + return get_decoratee().is_admin_of(uid); + } + + bool is_owner_of(const rgw_user& uid) const override { + return get_decoratee().is_owner_of(uid); + } + + bool is_anonymous() const override { + return get_decoratee().is_anonymous(); + } + + uint32_t get_perm_mask() const override { + return get_decoratee().get_perm_mask(); + } + + uint32_t get_identity_type() const override { + return get_decoratee().get_identity_type(); + } + + string get_acct_name() const override { + return get_decoratee().get_acct_name(); + } + + string get_subuser() const override { + return get_decoratee().get_subuser(); + } + + bool is_identity( + const boost::container::flat_set& ids) const override { + return get_decoratee().is_identity(ids); + } + + void to_str(std::ostream& out) const override { + get_decoratee().to_str(out); + } + + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override { /* out */ + return get_decoratee().load_acct_info(dpp, user_info); + } + + void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override { /* in/out */ + return get_decoratee().modify_request_state(dpp, s); 
+ } +}; + + +template +class ThirdPartyAccountApplier : public DecoratedApplier { + /* const */RGWRados* const store; + const rgw_user acct_user_override; + +public: + /* A value representing situations where there is no requested account + * override. In other words, acct_user_override will be equal to this + * constant where the request isn't a cross-tenant one. */ + static const rgw_user UNKNOWN_ACCT; + + template + ThirdPartyAccountApplier(RGWRados* const store, + const rgw_user &acct_user_override, + U&& decoratee) + : DecoratedApplier(std::move(decoratee)), + store(store), + acct_user_override(acct_user_override) { + } + + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ +}; + +/* static declaration: UNKNOWN_ACCT will be an empty rgw_user that is a result + * of the default construction. */ +template +const rgw_user ThirdPartyAccountApplier::UNKNOWN_ACCT; + +template +void ThirdPartyAccountApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::ThirdPartyAccountApplier(" + acct_user_override.to_str() + ")" + << " -> "; + DecoratedApplier::to_str(out); +} + +template +void ThirdPartyAccountApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const +{ + if (UNKNOWN_ACCT == acct_user_override) { + /* There is no override specified by the upper layer. This means that we'll + * load the account owned by the authenticated identity (aka auth_user). */ + DecoratedApplier::load_acct_info(dpp, user_info); + } else if (DecoratedApplier::is_owner_of(acct_user_override)) { + /* The override has been specified but the account belongs to the authenticated + * identity. We may safely forward the call to a next stage. 
*/ + DecoratedApplier::load_acct_info(dpp, user_info); + } else if (this->is_anonymous()) { + /* If the user was authed by the anonymous engine then scope the ANON user + * to the correct tenant */ + if (acct_user_override.tenant.empty()) + user_info.user_id = rgw_user(acct_user_override.id, RGW_USER_ANON_ID); + else + user_info.user_id = rgw_user(acct_user_override.tenant, RGW_USER_ANON_ID); + } else { + /* Compatibility mechanism for multi-tenancy. For more details refer to + * load_acct_info method of rgw::auth::RemoteApplier. */ + if (acct_user_override.tenant.empty()) { + const rgw_user tenanted_uid(acct_user_override.id, acct_user_override.id); + + if (rgw_get_user_info_by_uid(store, tenanted_uid, user_info) >= 0) { + /* Succeeded. */ + return; + } + } + + const int ret = rgw_get_user_info_by_uid(store, acct_user_override, user_info); + if (ret < 0) { + /* We aren't trying to recover from ENOENT here. It's supposed that creating + * someone else's account isn't a thing we want to support in this filter. 
*/ + if (ret == -ENOENT) { + throw -EACCES; + } else { + throw ret; + } + } + + } +} + +template static inline +ThirdPartyAccountApplier add_3rdparty(RGWRados* const store, + const rgw_user &acct_user_override, + T&& t) { + return ThirdPartyAccountApplier(store, acct_user_override, + std::forward(t)); +} + + +template +class SysReqApplier : public DecoratedApplier { + CephContext* const cct; + /*const*/ RGWRados* const store; + const RGWHTTPArgs& args; + mutable boost::tribool is_system; + +public: + template + SysReqApplier(CephContext* const cct, + /*const*/ RGWRados* const store, + const req_state* const s, + U&& decoratee) + : DecoratedApplier(std::forward(decoratee)), + cct(cct), + store(store), + args(s->info.args), + is_system(boost::logic::indeterminate) { + } + + void to_str(std::ostream& out) const override; + void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; /* in/out */ +}; + +template +void SysReqApplier::to_str(std::ostream& out) const +{ + out << "rgw::auth::SysReqApplier" << " -> "; + DecoratedApplier::to_str(out); +} + +template +void SysReqApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const +{ + DecoratedApplier::load_acct_info(dpp, user_info); + is_system = user_info.system; + + if (is_system) { + //ldpp_dout(dpp, 20) << "system request" << dendl; + + rgw_user effective_uid(args.sys_get(RGW_SYS_PARAM_PREFIX "uid")); + if (! effective_uid.empty()) { + /* We aren't writing directly to user_info for consistency and security + * reasons. rgw_get_user_info_by_uid doesn't trigger the operator=() but + * calls ::decode instead. */ + RGWUserInfo euser_info; + if (rgw_get_user_info_by_uid(store, effective_uid, euser_info) < 0) { + //ldpp_dout(dpp, 0) << "User lookup failed!" 
<< dendl; + throw -EACCES; + } + user_info = euser_info; + } + } +} + +template +void SysReqApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s) const +{ + if (boost::logic::indeterminate(is_system)) { + RGWUserInfo unused_info; + load_acct_info(dpp, unused_info); + } + + if (is_system) { + s->info.args.set_system(); + s->system_request = true; + } + DecoratedApplier::modify_request_state(dpp, s); +} + +template static inline +SysReqApplier add_sysreq(CephContext* const cct, + /* const */ RGWRados* const store, + const req_state* const s, + T&& t) { + return SysReqApplier(cct, store, s, std::forward(t)); +} + +} /* namespace auth */ +} /* namespace rgw */ + +#endif /* CEPH_RGW_AUTH_FILTERS_H */ diff --git a/src/rgw/rgw_auth_keystone.cc b/src/rgw/rgw_auth_keystone.cc new file mode 100644 index 00000000..5a325425 --- /dev/null +++ b/src/rgw/rgw_auth_keystone.cc @@ -0,0 +1,491 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include +#include + +#include "rgw_b64.h" + +#include "common/errno.h" +#include "common/ceph_json.h" +#include "include/types.h" +#include "include/str_list.h" + +#include "rgw_common.h" +#include "rgw_keystone.h" +#include "rgw_auth_keystone.h" +#include "rgw_rest_s3.h" +#include "rgw_auth_s3.h" + +#include "common/ceph_crypto_cms.h" +#include "common/armor.h" +#include "common/Cond.h" + +#define dout_subsys ceph_subsys_rgw + + +namespace rgw { +namespace auth { +namespace keystone { + +bool +TokenEngine::is_applicable(const std::string& token) const noexcept +{ + return ! token.empty() && ! 
cct->_conf->rgw_keystone_url.empty(); +} + +TokenEngine::token_envelope_t +TokenEngine::decode_pki_token(const DoutPrefixProvider* dpp, const std::string& token) const +{ + ceph::buffer::list token_body_bl; + int ret = rgw_decode_b64_cms(cct, token, token_body_bl); + if (ret < 0) { + ldpp_dout(dpp, 20) << "cannot decode pki token" << dendl; + throw ret; + } else { + ldpp_dout(dpp, 20) << "successfully decoded pki token" << dendl; + } + + TokenEngine::token_envelope_t token_body; + ret = token_body.parse(cct, token, token_body_bl, config.get_api_version()); + if (ret < 0) { + throw ret; + } + + return token_body; +} + +boost::optional +TokenEngine::get_from_keystone(const DoutPrefixProvider* dpp, const std::string& token) const +{ + /* Unfortunately, we can't use the short form of "using" here. It's because + * we're aliasing a class' member, not namespace. */ + using RGWValidateKeystoneToken = \ + rgw::keystone::Service::RGWValidateKeystoneToken; + + /* The container for plain response obtained from Keystone. It will be + * parsed token_envelope_t::parse method. */ + ceph::bufferlist token_body_bl; + RGWValidateKeystoneToken validate(cct, "GET", "", &token_body_bl); + + std::string url = config.get_endpoint_url(); + if (url.empty()) { + throw -EINVAL; + } + + const auto keystone_version = config.get_api_version(); + if (keystone_version == rgw::keystone::ApiVersion::VER_2) { + url.append("v2.0/tokens/" + token); + } else if (keystone_version == rgw::keystone::ApiVersion::VER_3) { + url.append("v3/auth/tokens"); + validate.append_header("X-Subject-Token", token); + } + + std::string admin_token; + if (rgw::keystone::Service::get_admin_token(cct, token_cache, config, + admin_token) < 0) { + throw -EINVAL; + } + + validate.append_header("X-Auth-Token", admin_token); + validate.set_send_length(0); + + validate.set_url(url); + + int ret = validate.process(); + if (ret < 0) { + throw ret; + } + + /* NULL terminate for debug output. 
*/ + token_body_bl.append(static_cast(0)); + + /* Detect Keystone rejection earlier than during the token parsing. + * Although failure at the parsing phase doesn't impose a threat, + * this allows to return proper error code (EACCESS instead of EINVAL + * or similar) and thus improves logging. */ + if (validate.get_http_status() == + /* Most likely: wrong admin credentials or admin token. */ + RGWValidateKeystoneToken::HTTP_STATUS_UNAUTHORIZED || + validate.get_http_status() == + /* Most likely: non-existent token supplied by the client. */ + RGWValidateKeystoneToken::HTTP_STATUS_NOTFOUND) { + ldpp_dout(dpp, 5) << "Failed keystone auth from " << url << " with " + << validate.get_http_status() << dendl; + return boost::none; + } + + ldpp_dout(dpp, 20) << "received response status=" << validate.get_http_status() + << ", body=" << token_body_bl.c_str() << dendl; + + TokenEngine::token_envelope_t token_body; + ret = token_body.parse(cct, token, token_body_bl, config.get_api_version()); + if (ret < 0) { + throw ret; + } + + return token_body; +} + +TokenEngine::auth_info_t +TokenEngine::get_creds_info(const TokenEngine::token_envelope_t& token, + const std::vector& admin_roles + ) const noexcept +{ + using acct_privilege_t = rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + /* Check whether the user has an admin status. */ + acct_privilege_t level = acct_privilege_t::IS_PLAIN_ACCT; + for (const auto& admin_role : admin_roles) { + if (token.has_role(admin_role)) { + level = acct_privilege_t::IS_ADMIN_ACCT; + break; + } + } + + return auth_info_t { + /* Suggested account name for the authenticated user. */ + rgw_user(token.get_project_id()), + /* User's display name (aka real name). */ + token.get_project_name(), + /* Keystone doesn't support RGW's subuser concept, so we cannot cut down + * the access rights through the perm_mask. At least at this layer. 
*/ + RGW_PERM_FULL_CONTROL, + level, + TYPE_KEYSTONE, + }; +} + +static inline const std::string +make_spec_item(const std::string& tenant, const std::string& id) +{ + return tenant + ":" + id; +} + +TokenEngine::acl_strategy_t +TokenEngine::get_acl_strategy(const TokenEngine::token_envelope_t& token) const +{ + /* The primary identity is constructed upon UUIDs. */ + const auto& tenant_uuid = token.get_project_id(); + const auto& user_uuid = token.get_user_id(); + + /* For Keystone v2 an alias may be also used. */ + const auto& tenant_name = token.get_project_name(); + const auto& user_name = token.get_user_name(); + + /* Construct all possible combinations including Swift's wildcards. */ + const std::array allowed_items = { + make_spec_item(tenant_uuid, user_uuid), + make_spec_item(tenant_name, user_name), + + /* Wildcards. */ + make_spec_item(tenant_uuid, "*"), + make_spec_item(tenant_name, "*"), + make_spec_item("*", user_uuid), + make_spec_item("*", user_name), + }; + + /* Lambda will obtain a copy of (not a reference to!) allowed_items. */ + return [allowed_items](const rgw::auth::Identity::aclspec_t& aclspec) { + uint32_t perm = 0; + + for (const auto& allowed_item : allowed_items) { + const auto iter = aclspec.find(allowed_item); + + if (std::end(aclspec) != iter) { + perm |= iter->second; + } + } + + return perm; + }; +} + +TokenEngine::result_t +TokenEngine::authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s) const +{ + boost::optional t; + + /* This will be initialized on the first call to this method. In C++11 it's + * also thread-safe. */ + static const struct RolesCacher { + explicit RolesCacher(CephContext* const cct) { + get_str_vec(cct->_conf->rgw_keystone_accepted_roles, plain); + get_str_vec(cct->_conf->rgw_keystone_accepted_admin_roles, admin); + + /* Let's suppose that having an admin role implies also a regular one. 
*/ + plain.insert(std::end(plain), std::begin(admin), std::end(admin)); + } + + std::vector plain; + std::vector admin; + } roles(cct); + + if (! is_applicable(token)) { + return result_t::deny(); + } + + /* Token ID is a concept that makes dealing with PKI tokens more effective. + * Instead of storing several kilobytes, a short hash can be burried. */ + const auto& token_id = rgw_get_token_id(token); + ldpp_dout(dpp, 20) << "token_id=" << token_id << dendl; + + /* Check cache first. */ + t = token_cache.find(token_id); + if (t) { + ldpp_dout(dpp, 20) << "cached token.project.id=" << t->get_project_id() + << dendl; + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t), + get_creds_info(*t, roles.admin)); + return result_t::grant(std::move(apl)); + } + + /* Retrieve token. */ + if (rgw_is_pki_token(token)) { + try { + t = decode_pki_token(dpp, token); + } catch (...) { + /* Last resort. */ + t = get_from_keystone(dpp, token); + } + } else { + /* Can't decode, just go to the Keystone server for validation. */ + t = get_from_keystone(dpp, token); + } + + if (! t) { + return result_t::deny(-EACCES); + } + + /* Verify expiration. */ + if (t->expired()) { + ldpp_dout(dpp, 0) << "got expired token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expired: " << t->get_expires() << dendl; + return result_t::deny(-EPERM); + } + + /* Check for necessary roles. 
*/ + for (const auto& role : roles.plain) { + if (t->has_role(role) == true) { + ldpp_dout(dpp, 0) << "validated token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expires: " << t->get_expires() << dendl; + token_cache.add(token_id, *t); + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t), + get_creds_info(*t, roles.admin)); + return result_t::grant(std::move(apl)); + } + } + + ldpp_dout(dpp, 0) << "user does not hold a matching role; required roles: " + << g_conf()->rgw_keystone_accepted_roles << dendl; + + return result_t::deny(-EPERM); +} + + +/* + * Try to validate S3 auth against keystone s3token interface + */ +std::pair, int> +EC2Engine::get_from_keystone(const DoutPrefixProvider* dpp, const boost::string_view& access_key_id, + const std::string& string_to_sign, + const boost::string_view& signature) const +{ + /* prepare keystone url */ + std::string keystone_url = config.get_endpoint_url(); + if (keystone_url.empty()) { + throw -EINVAL; + } + + const auto api_version = config.get_api_version(); + if (config.get_api_version() == rgw::keystone::ApiVersion::VER_3) { + keystone_url.append("v3/s3tokens"); + } else { + keystone_url.append("v2.0/s3tokens"); + } + + /* get authentication token for Keystone. */ + std::string admin_token; + int ret = rgw::keystone::Service::get_admin_token(cct, token_cache, config, + admin_token); + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: cannot get token for keystone access" + << dendl; + throw ret; + } + + using RGWValidateKeystoneToken + = rgw::keystone::Service::RGWValidateKeystoneToken; + + /* The container for plain response obtained from Keystone. It will be + * parsed token_envelope_t::parse method. 
*/ + ceph::bufferlist token_body_bl; + RGWValidateKeystoneToken validate(cct, "POST", keystone_url, &token_body_bl); + + /* set required headers for keystone request */ + validate.append_header("X-Auth-Token", admin_token); + validate.append_header("Content-Type", "application/json"); + + /* check if we want to verify keystone's ssl certs */ + validate.set_verify_ssl(cct->_conf->rgw_keystone_verify_ssl); + + /* create json credentials request body */ + JSONFormatter credentials(false); + credentials.open_object_section(""); + credentials.open_object_section("credentials"); + credentials.dump_string("access", sview2cstr(access_key_id).data()); + credentials.dump_string("token", rgw::to_base64(string_to_sign)); + credentials.dump_string("signature", sview2cstr(signature).data()); + credentials.close_section(); + credentials.close_section(); + + std::stringstream os; + credentials.flush(os); + validate.set_post_data(os.str()); + validate.set_send_length(os.str().length()); + + /* send request */ + ret = validate.process(); + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: token validation ERROR: " + << token_body_bl.c_str() << dendl; + throw ret; + } + + /* if the supplied signature is wrong, we will get 401 from Keystone */ + if (validate.get_http_status() == + decltype(validate)::HTTP_STATUS_UNAUTHORIZED) { + return std::make_pair(boost::none, -ERR_SIGNATURE_NO_MATCH); + } else if (validate.get_http_status() == + decltype(validate)::HTTP_STATUS_NOTFOUND) { + return std::make_pair(boost::none, -ERR_INVALID_ACCESS_KEY); + } + + /* now parse response */ + rgw::keystone::TokenEnvelope token_envelope; + ret = token_envelope.parse(cct, std::string(), token_body_bl, api_version); + if (ret < 0) { + ldpp_dout(dpp, 2) << "s3 keystone: token parsing failed, ret=0" << ret + << dendl; + throw ret; + } + + return std::make_pair(std::move(token_envelope), 0); +} + +EC2Engine::acl_strategy_t +EC2Engine::get_acl_strategy(const EC2Engine::token_envelope_t&) const +{ + /* This is 
based on the assumption that the default acl strategy in + * get_perms_from_aclspec, will take care. Extra acl spec is not required. */ + return nullptr; +} + +EC2Engine::auth_info_t +EC2Engine::get_creds_info(const EC2Engine::token_envelope_t& token, + const std::vector& admin_roles + ) const noexcept +{ + using acct_privilege_t = \ + rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + /* Check whether the user has an admin status. */ + acct_privilege_t level = acct_privilege_t::IS_PLAIN_ACCT; + for (const auto& admin_role : admin_roles) { + if (token.has_role(admin_role)) { + level = acct_privilege_t::IS_ADMIN_ACCT; + break; + } + } + + return auth_info_t { + /* Suggested account name for the authenticated user. */ + rgw_user(token.get_project_id()), + /* User's display name (aka real name). */ + token.get_project_name(), + /* Keystone doesn't support RGW's subuser concept, so we cannot cut down + * the access rights through the perm_mask. At least at this layer. */ + RGW_PERM_FULL_CONTROL, + level, + TYPE_KEYSTONE, + }; +} + +rgw::auth::Engine::result_t EC2Engine::authenticate( + const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t&, + const completer_factory_t& completer_factory, + /* Passthorugh only! */ + const req_state* s) const +{ + /* This will be initialized on the first call to this method. In C++11 it's + * also thread-safe. */ + static const struct RolesCacher { + explicit RolesCacher(CephContext* const cct) { + get_str_vec(cct->_conf->rgw_keystone_accepted_roles, plain); + get_str_vec(cct->_conf->rgw_keystone_accepted_admin_roles, admin); + + /* Let's suppose that having an admin role implies also a regular one. 
*/ + plain.insert(std::end(plain), std::begin(admin), std::end(admin)); + } + + std::vector plain; + std::vector admin; + } accepted_roles(cct); + + boost::optional t; + int failure_reason; + std::tie(t, failure_reason) = \ + get_from_keystone(dpp, access_key_id, string_to_sign, signature); + if (! t) { + return result_t::deny(failure_reason); + } + + /* Verify expiration. */ + if (t->expired()) { + ldpp_dout(dpp, 0) << "got expired token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expired: " << t->get_expires() << dendl; + return result_t::deny(); + } + + /* check if we have a valid role */ + bool found = false; + for (const auto& role : accepted_roles.plain) { + if (t->has_role(role) == true) { + found = true; + break; + } + } + + if (! found) { + ldpp_dout(dpp, 5) << "s3 keystone: user does not hold a matching role;" + " required roles: " + << cct->_conf->rgw_keystone_accepted_roles << dendl; + return result_t::deny(); + } else { + /* everything seems fine, continue with this user */ + ldpp_dout(dpp, 5) << "s3 keystone: validated token: " << t->get_project_name() + << ":" << t->get_user_name() + << " expires: " << t->get_expires() << dendl; + + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t), + get_creds_info(*t, accepted_roles.admin)); + return result_t::grant(std::move(apl), completer_factory(boost::none)); + } +} + +}; /* namespace keystone */ +}; /* namespace auth */ +}; /* namespace rgw */ diff --git a/src/rgw/rgw_auth_keystone.h b/src/rgw/rgw_auth_keystone.h new file mode 100644 index 00000000..e63ba1e3 --- /dev/null +++ b/src/rgw/rgw_auth_keystone.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#ifndef CEPH_RGW_AUTH_KEYSTONE_H +#define CEPH_RGW_AUTH_KEYSTONE_H + +#include +#include +#include + +#include "rgw_auth.h" +#include "rgw_rest_s3.h" +#include "rgw_common.h" +#include "rgw_keystone.h" + +namespace rgw { +namespace auth { 
+namespace keystone { + +/* Dedicated namespace for Keystone-related auth engines. We need it because + * Keystone offers three different authentication mechanisms (token, EC2 and + * regular user/pass). RadosGW actually does support the first two. */ + +class TokenEngine : public rgw::auth::Engine { + CephContext* const cct; + + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + using auth_info_t = rgw::auth::RemoteApplier::AuthInfo; + using result_t = rgw::auth::Engine::result_t; + using token_envelope_t = rgw::keystone::TokenEnvelope; + + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::RemoteApplier::Factory* const apl_factory; + rgw::keystone::Config& config; + rgw::keystone::TokenCache& token_cache; + + /* Helper methods. */ + bool is_applicable(const std::string& token) const noexcept; + token_envelope_t decode_pki_token(const DoutPrefixProvider* dpp, const std::string& token) const; + + boost::optional + get_from_keystone(const DoutPrefixProvider* dpp, const std::string& token) const; + + acl_strategy_t get_acl_strategy(const token_envelope_t& token) const; + auth_info_t get_creds_info(const token_envelope_t& token, + const std::vector& admin_roles + ) const noexcept; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s) const; + +public: + TokenEngine(CephContext* const cct, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::RemoteApplier::Factory* const apl_factory, + rgw::keystone::Config& config, + rgw::keystone::TokenCache& token_cache) + : cct(cct), + extractor(extractor), + apl_factory(apl_factory), + config(config), + token_cache(token_cache) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::keystone::TokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const override { + return authenticate(dpp, extractor->get_token(s), s); + } +}; /* class TokenEngine */ + + +class 
EC2Engine : public rgw::auth::s3::AWSEngine { + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + using auth_info_t = rgw::auth::RemoteApplier::AuthInfo; + using result_t = rgw::auth::Engine::result_t; + using token_envelope_t = rgw::keystone::TokenEnvelope; + + const rgw::auth::RemoteApplier::Factory* const apl_factory; + rgw::keystone::Config& config; + rgw::keystone::TokenCache& token_cache; + + /* Helper methods. */ + acl_strategy_t get_acl_strategy(const token_envelope_t& token) const; + auth_info_t get_creds_info(const token_envelope_t& token, + const std::vector& admin_roles + ) const noexcept; + std::pair, int> + get_from_keystone(const DoutPrefixProvider* dpp, const boost::string_view& access_key_id, + const std::string& string_to_sign, + const boost::string_view& signature) const; + result_t authenticate(const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t&, + const completer_factory_t& completer_factory, + const req_state* s) const override; +public: + EC2Engine(CephContext* const cct, + const rgw::auth::s3::AWSEngine::VersionAbstractor* const ver_abstractor, + const rgw::auth::RemoteApplier::Factory* const apl_factory, + rgw::keystone::Config& config, + /* The token cache is used ONLY for the retrieving admin token. + * Due to the architecture of AWS Auth S3 credentials cannot be + * cached at all. 
*/ + rgw::keystone::TokenCache& token_cache) + : AWSEngine(cct, *ver_abstractor), + apl_factory(apl_factory), + config(config), + token_cache(token_cache) { + } + + using AWSEngine::authenticate; + + const char* get_name() const noexcept override { + return "rgw::auth::keystone::EC2Engine"; + } + +}; /* class EC2Engine */ + +}; /* namespace keystone */ +}; /* namespace auth */ +}; /* namespace rgw */ + +#endif /* CEPH_RGW_AUTH_KEYSTONE_H */ diff --git a/src/rgw/rgw_auth_registry.h b/src/rgw/rgw_auth_registry.h new file mode 100644 index 00000000..696f40cd --- /dev/null +++ b/src/rgw/rgw_auth_registry.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#ifndef CEPH_RGW_AUTH_REGISTRY_H +#define CEPH_RGW_AUTH_REGISTRY_H + +#include +#include +#include +#include +#include + +#include "rgw_auth.h" +#include "rgw_auth_s3.h" +#include "rgw_swift_auth.h" +#include "rgw_rest_sts.h" + +namespace rgw { +namespace auth { + +/* A class aggregating the knowledge about all Strategies in RadosGW. It is + * responsible for handling the dynamic reconfiguration on e.g. realm update. 
*/ +class StrategyRegistry { + template + using s3_strategy_t = \ + rgw::auth::s3::AWSAuthStrategy; + + struct s3_main_strategy_t : public Strategy { + using s3_main_strategy_plain_t = \ + s3_strategy_t; + using s3_main_strategy_boto2_t = \ + s3_strategy_t; + + s3_main_strategy_plain_t s3_main_strategy_plain; + s3_main_strategy_boto2_t s3_main_strategy_boto2; + + s3_main_strategy_t(CephContext* const cct, + ImplicitTenants& implicit_tenant_context, + RGWRados* const store) + : s3_main_strategy_plain(cct, implicit_tenant_context, store), + s3_main_strategy_boto2(cct, implicit_tenant_context, store) { + add_engine(Strategy::Control::SUFFICIENT, s3_main_strategy_plain); + add_engine(Strategy::Control::FALLBACK, s3_main_strategy_boto2); + } + + const char* get_name() const noexcept override { + return "rgw::auth::StrategyRegistry::s3_main_strategy_t"; + } + } s3_main_strategy; + + using s3_post_strategy_t = \ + s3_strategy_t; + s3_post_strategy_t s3_post_strategy; + + rgw::auth::swift::DefaultStrategy swift_strategy; + + rgw::auth::sts::DefaultStrategy sts_strategy; + +public: + StrategyRegistry(CephContext* const cct, + ImplicitTenants& implicit_tenant_context, + RGWRados* const store) + : s3_main_strategy(cct, implicit_tenant_context, store), + s3_post_strategy(cct, implicit_tenant_context, store), + swift_strategy(cct, implicit_tenant_context, store), + sts_strategy(cct, store) { + } + + const s3_main_strategy_t& get_s3_main() const { + return s3_main_strategy; + } + + const s3_post_strategy_t& get_s3_post() const { + return s3_post_strategy; + } + + const rgw::auth::swift::DefaultStrategy& get_swift() const { + return swift_strategy; + } + + const rgw::auth::sts::DefaultStrategy& get_sts() const { + return sts_strategy; + } + + static std::shared_ptr + create(CephContext* const cct, + ImplicitTenants& implicit_tenant_context, + RGWRados* const store) { + return std::make_shared(cct, implicit_tenant_context, store); + } +}; + +} /* namespace auth */ +} /* namespace 
rgw */ + +using rgw_auth_registry_t = rgw::auth::StrategyRegistry; +using rgw_auth_registry_ptr_t = std::shared_ptr; + +#endif /* CEPH_RGW_AUTH_REGISTRY_H */ diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc new file mode 100644 index 00000000..f7a8af67 --- /dev/null +++ b/src/rgw/rgw_auth_s3.cc @@ -0,0 +1,1135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include + +#include "common/armor.h" +#include "common/utf8.h" +#include "rgw_rest_s3.h" +#include "rgw_auth_s3.h" +#include "rgw_common.h" +#include "rgw_client_io.h" +#include "rgw_rest.h" +#include "rgw_crypt_sanitize.h" + +#include +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +static const auto signed_subresources = { + "acl", + "cors", + "delete", + "lifecycle", + "location", + "logging", + "notification", + "partNumber", + "policy", + "requestPayment", + "response-cache-control", + "response-content-disposition", + "response-content-encoding", + "response-content-language", + "response-content-type", + "response-expires", + "tagging", + "torrent", + "uploadId", + "uploads", + "versionId", + "versioning", + "versions", + "website", + "object-lock" +}; + +/* + * ?get the canonical amazon-style header for something? 
+ */ + +static std::string +get_canon_amz_hdr(const meta_map_t& meta_map) +{ + std::string dest; + + for (const auto& kv : meta_map) { + dest.append(kv.first); + dest.append(":"); + dest.append(kv.second); + dest.append("\n"); + } + + return dest; +} + +/* + * ?get the canonical representation of the object's location + */ +static std::string +get_canon_resource(const char* const request_uri, + const std::map& sub_resources) +{ + std::string dest; + + if (request_uri) { + dest.append(request_uri); + } + + bool initial = true; + for (const auto& subresource : signed_subresources) { + const auto iter = sub_resources.find(subresource); + if (iter == std::end(sub_resources)) { + continue; + } + + if (initial) { + dest.append("?"); + initial = false; + } else { + dest.append("&"); + } + + dest.append(iter->first); + if (! iter->second.empty()) { + dest.append("="); + dest.append(iter->second); + } + } + + dout(10) << "get_canon_resource(): dest=" << dest << dendl; + return dest; +} + +/* + * get the header authentication information required to + * compute a request's signature + */ +void rgw_create_s3_canonical_header( + const char* const method, + const char* const content_md5, + const char* const content_type, + const char* const date, + const meta_map_t& meta_map, + const meta_map_t& qs_map, + const char* const request_uri, + const std::map& sub_resources, + std::string& dest_str) +{ + std::string dest; + + if (method) { + dest = method; + } + dest.append("\n"); + + if (content_md5) { + dest.append(content_md5); + } + dest.append("\n"); + + if (content_type) { + dest.append(content_type); + } + dest.append("\n"); + + if (date) { + dest.append(date); + } + dest.append("\n"); + + dest.append(get_canon_amz_hdr(meta_map)); + dest.append(get_canon_amz_hdr(qs_map)); + dest.append(get_canon_resource(request_uri, sub_resources)); + + dest_str = dest; +} + +static inline bool is_base64_for_content_md5(unsigned char c) { + return (isalnum(c) || isspace(c) || (c == '+') || (c 
== '/') || (c == '=')); +} + +static inline void get_v2_qs_map(const req_info& info, + meta_map_t& qs_map) { + const auto& params = const_cast(info.args).get_params(); + for (const auto& elt : params) { + std::string k = boost::algorithm::to_lower_copy(elt.first); + if (k.find("x-amz-meta-") == /* offset */ 0) { + add_amz_meta_header(qs_map, k, elt.second); + } + } +} + +/* + * get the header authentication information required to + * compute a request's signature + */ +bool rgw_create_s3_canonical_header(const req_info& info, + utime_t* const header_time, + std::string& dest, + const bool qsr) +{ + const char* const content_md5 = info.env->get("HTTP_CONTENT_MD5"); + if (content_md5) { + for (const char *p = content_md5; *p; p++) { + if (!is_base64_for_content_md5(*p)) { + dout(0) << "NOTICE: bad content-md5 provided (not base64)," + << " aborting request p=" << *p << " " << (int)*p << dendl; + return false; + } + } + } + + const char *content_type = info.env->get("CONTENT_TYPE"); + + std::string date; + meta_map_t qs_map; + + if (qsr) { + get_v2_qs_map(info, qs_map); // handle qs metadata + date = info.args.get("Expires"); + } else { + const char *str = info.env->get("HTTP_X_AMZ_DATE"); + const char *req_date = str; + if (str == NULL) { + req_date = info.env->get("HTTP_DATE"); + if (!req_date) { + dout(0) << "NOTICE: missing date for auth header" << dendl; + return false; + } + date = req_date; + } + + if (header_time) { + struct tm t; + if (!parse_rfc2616(req_date, &t)) { + dout(0) << "NOTICE: failed to parse date for auth header" << dendl; + return false; + } + if (t.tm_year < 70) { + dout(0) << "NOTICE: bad date (predates epoch): " << req_date << dendl; + return false; + } + *header_time = utime_t(internal_timegm(&t), 0); + } + } + + const auto& meta_map = info.x_meta_map; + const auto& sub_resources = info.args.get_sub_resources(); + + std::string request_uri; + if (info.effective_uri.empty()) { + request_uri = info.request_uri; + } else { + request_uri = 
info.effective_uri; + } + + rgw_create_s3_canonical_header(info.method, content_md5, content_type, + date.c_str(), meta_map, qs_map, + request_uri.c_str(), sub_resources, dest); + return true; +} + + +namespace rgw { +namespace auth { +namespace s3 { + +bool is_time_skew_ok(time_t t) +{ + auto req_tp = ceph::coarse_real_clock::from_time_t(t); + auto cur_tp = ceph::coarse_real_clock::now(); + + if (std::chrono::abs(cur_tp - req_tp) > RGW_AUTH_GRACE) { + dout(10) << "NOTICE: request time skew too big." << dendl; + using ceph::operator<<; + dout(10) << "req_tp=" << req_tp << ", cur_tp=" << cur_tp << dendl; + return false; + } + + return true; +} + +static inline int parse_v4_query_string(const req_info& info, /* in */ + boost::string_view& credential, /* out */ + boost::string_view& signedheaders, /* out */ + boost::string_view& signature, /* out */ + boost::string_view& date, /* out */ + boost::string_view& sessiontoken) /* out */ +{ + /* auth ships with req params ... */ + + /* look for required params */ + credential = info.args.get("X-Amz-Credential"); + if (credential.size() == 0) { + return -EPERM; + } + + date = info.args.get("X-Amz-Date"); + struct tm date_t; + if (!parse_iso8601(sview2cstr(date).data(), &date_t, nullptr, false)) { + return -EPERM; + } + + boost::string_view expires = info.args.get("X-Amz-Expires"); + if (expires.empty()) { + return -EPERM; + } + /* X-Amz-Expires provides the time period, in seconds, for which + the generated presigned URL is valid. 
The minimum value + you can set is 1, and the maximum is 604800 (seven days) */ + time_t exp = atoll(expires.data()); + if ((exp < 1) || (exp > 7*24*60*60)) { + dout(10) << "NOTICE: exp out of range, exp = " << exp << dendl; + return -EPERM; + } + /* handle expiration in epoch time */ + uint64_t req_sec = (uint64_t)internal_timegm(&date_t); + uint64_t now = ceph_clock_now(); + if (now >= req_sec + exp) { + dout(10) << "NOTICE: now = " << now << ", req_sec = " << req_sec << ", exp = " << exp << dendl; + return -EPERM; + } + + signedheaders = info.args.get("X-Amz-SignedHeaders"); + if (signedheaders.size() == 0) { + return -EPERM; + } + + signature = info.args.get("X-Amz-Signature"); + if (signature.size() == 0) { + return -EPERM; + } + + if (info.args.exists("X-Amz-Security-Token")) { + sessiontoken = info.args.get("X-Amz-Security-Token"); + if (sessiontoken.size() == 0) { + return -EPERM; + } + } + + return 0; +} + +static bool get_next_token(const boost::string_view& s, + size_t& pos, + const char* const delims, + boost::string_view& token) +{ + const size_t start = s.find_first_not_of(delims, pos); + if (start == boost::string_view::npos) { + pos = s.size(); + return false; + } + + size_t end = s.find_first_of(delims, start); + if (end != boost::string_view::npos) + pos = end + 1; + else { + pos = end = s.size(); + } + + token = s.substr(start, end - start); + return true; +} + +template +boost::container::small_vector +get_str_vec(const boost::string_view& str, const char* const delims) +{ + boost::container::small_vector str_vec; + + size_t pos = 0; + boost::string_view token; + while (pos < str.size()) { + if (get_next_token(str, pos, delims, token)) { + if (token.size() > 0) { + str_vec.push_back(token); + } + } + } + + return str_vec; +} + +template +boost::container::small_vector +get_str_vec(const boost::string_view& str) +{ + const char delims[] = ";,= \t"; + return get_str_vec(str, delims); +} + +static inline int parse_v4_auth_header(const req_info& 
info, /* in */ + boost::string_view& credential, /* out */ + boost::string_view& signedheaders, /* out */ + boost::string_view& signature, /* out */ + boost::string_view& date, /* out */ + boost::string_view& sessiontoken) /* out */ +{ + boost::string_view input(info.env->get("HTTP_AUTHORIZATION", "")); + try { + input = input.substr(::strlen(AWS4_HMAC_SHA256_STR) + 1); + } catch (std::out_of_range&) { + /* We should never ever run into this situation as the presence of + * AWS4_HMAC_SHA256_STR had been verified earlier. */ + dout(10) << "credentials string is too short" << dendl; + return -EINVAL; + } + + std::map kv; + for (const auto& s : get_str_vec<4>(input, ",")) { + const auto parsed_pair = parse_key_value(s); + if (parsed_pair) { + kv[parsed_pair->first] = parsed_pair->second; + } else { + dout(10) << "NOTICE: failed to parse auth header (s=" << s << ")" + << dendl; + return -EINVAL; + } + } + + static const std::array required_keys = { + "Credential", + "SignedHeaders", + "Signature" + }; + + /* Ensure that the presigned required keys are really there. 
*/ + for (const auto& k : required_keys) { + if (kv.find(k) == std::end(kv)) { + dout(10) << "NOTICE: auth header missing key: " << k << dendl; + return -EINVAL; + } + } + + credential = kv["Credential"]; + signedheaders = kv["SignedHeaders"]; + signature = kv["Signature"]; + + /* sig hex str */ + dout(10) << "v4 signature format = " << signature << dendl; + + /* ------------------------- handle x-amz-date header */ + + /* grab date */ + + const char *d = info.env->get("HTTP_X_AMZ_DATE"); + struct tm t; + if (!parse_iso8601(d, &t, NULL, false)) { + dout(10) << "error reading date via http_x_amz_date" << dendl; + return -EACCES; + } + date = d; + + if (!is_time_skew_ok(internal_timegm(&t))) { + return -ERR_REQUEST_TIME_SKEWED; + } + + if (info.env->exists("HTTP_X_AMZ_SECURITY_TOKEN")) { + sessiontoken = info.env->get("HTTP_X_AMZ_SECURITY_TOKEN"); + } + + return 0; +} + +int parse_v4_credentials(const req_info& info, /* in */ + boost::string_view& access_key_id, /* out */ + boost::string_view& credential_scope, /* out */ + boost::string_view& signedheaders, /* out */ + boost::string_view& signature, /* out */ + boost::string_view& date, /* out */ + boost::string_view& session_token, /* out */ + const bool using_qs) /* in */ +{ + boost::string_view credential; + int ret; + if (using_qs) { + ret = parse_v4_query_string(info, credential, signedheaders, + signature, date, session_token); + } else { + ret = parse_v4_auth_header(info, credential, signedheaders, + signature, date, session_token); + } + + if (ret < 0) { + return ret; + } + + /* access_key/YYYYMMDD/region/service/aws4_request */ + dout(10) << "v4 credential format = " << credential << dendl; + + if (std::count(credential.begin(), credential.end(), '/') != 4) { + return -EINVAL; + } + + /* credential must end with 'aws4_request' */ + if (credential.find("aws4_request") == std::string::npos) { + return -EINVAL; + } + + /* grab access key id */ + const size_t pos = credential.find("/"); + access_key_id = 
credential.substr(0, pos); + dout(10) << "access key id = " << access_key_id << dendl; + + /* grab credential scope */ + credential_scope = credential.substr(pos + 1); + dout(10) << "credential scope = " << credential_scope << dendl; + + return 0; +} + +std::string get_v4_canonical_qs(const req_info& info, const bool using_qs) +{ + const std::string *params = &info.request_params; + std::string copy_params; + if (params->empty()) { + /* Optimize the typical flow. */ + return std::string(); + } + if (params->find_first_of('+') != std::string::npos) { + copy_params = *params; + boost::replace_all(copy_params, "+", "%20"); + params = &copy_params; + } + + /* Handle case when query string exists. Step 3 described in: http://docs. + * aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html */ + std::map canonical_qs_map; + for (const auto& s : get_str_vec<5>(*params, "&")) { + boost::string_view key, val; + const auto parsed_pair = parse_key_value(s); + if (parsed_pair) { + std::tie(key, val) = *parsed_pair; + } else { + /* Handling a parameter without any value (even the empty one). That's + * it, we've encountered something like "this_param&other_param=val" + * which is used by S3 for subresources. */ + key = s; + } + + if (using_qs && key == "X-Amz-Signature") { + /* Preserving the original behaviour of get_v4_canonical_qs() here. */ + continue; + } + + // while awsv4 specs ask for all slashes to be encoded, s3 itself is relaxed + // in its implementation allowing non-url-encoded slashes to be present in + // presigned urls for instance + canonical_qs_map[aws4_uri_recode(key, true)] = aws4_uri_recode(val, true); + } + + /* The early exit above only covers an empty query string. Parameters can + * still all be dropped (separators only, or just X-Amz-Signature when + * using_qs is set), so bail out before dereferencing begin(). */ + if (canonical_qs_map.empty()) { + return std::string(); + } + + /* Thanks to the check above we have the guarantee that canonical_qs_map has + * at least one element. 
*/ + auto iter = std::begin(canonical_qs_map); + std::string canonical_qs; + canonical_qs.append(iter->first) + .append("=", ::strlen("=")) + .append(iter->second); + + for (iter++; iter != std::end(canonical_qs_map); iter++) { + canonical_qs.append("&", ::strlen("&")) + .append(iter->first) + .append("=", ::strlen("=")) + .append(iter->second); + } + + return canonical_qs; +} + +boost::optional +get_v4_canonical_headers(const req_info& info, + const boost::string_view& signedheaders, + const bool using_qs, + const bool force_boto2_compat) +{ + std::map canonical_hdrs_map; + for (const auto& token : get_str_vec<5>(signedheaders, ";")) { + /* TODO(rzarzynski): we'd like to switch to sstring here but it should + * get push_back() and reserve() first. */ + std::string token_env = "HTTP_"; + token_env.reserve(token.length() + std::strlen("HTTP_") + 1); + + std::transform(std::begin(token), std::end(token), + std::back_inserter(token_env), [](const int c) { + return c == '-' ? '_' : std::toupper(c); + }); + + if (token_env == "HTTP_CONTENT_LENGTH") { + token_env = "CONTENT_LENGTH"; + } else if (token_env == "HTTP_CONTENT_TYPE") { + token_env = "CONTENT_TYPE"; + } + const char* const t = info.env->get(token_env.c_str()); + if (!t) { + dout(10) << "warning env var not available" << dendl; + continue; + } + + std::string token_value(t); + if (token_env == "HTTP_CONTENT_MD5" && + !std::all_of(std::begin(token_value), std::end(token_value), + is_base64_for_content_md5)) { + dout(0) << "NOTICE: bad content-md5 provided (not base64)" + << ", aborting request" << dendl; + return boost::none; + } + + if (force_boto2_compat && using_qs && token == "host") { + boost::string_view port = info.env->get("SERVER_PORT", ""); + boost::string_view secure_port = info.env->get("SERVER_PORT_SECURE", ""); + + if (!secure_port.empty()) { + if (secure_port != "443") + token_value.append(":", std::strlen(":")) + .append(secure_port.data(), secure_port.length()); + } else if (!port.empty()) { + 
if (port != "80") + token_value.append(":", std::strlen(":")) + .append(port.data(), port.length()); + } + } + + canonical_hdrs_map[token] = rgw_trim_whitespace(token_value); + } + + std::string canonical_hdrs; + for (const auto& header : canonical_hdrs_map) { + const boost::string_view& name = header.first; + std::string value = header.second; + boost::trim_all(value); + + canonical_hdrs.append(name.data(), name.length()) + .append(":", std::strlen(":")) + .append(value) + .append("\n", std::strlen("\n")); + } + + return canonical_hdrs; +} + +/* + * create canonical request for signature version 4 + * + * http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + */ +sha256_digest_t +get_v4_canon_req_hash(CephContext* cct, + const boost::string_view& http_verb, + const std::string& canonical_uri, + const std::string& canonical_qs, + const std::string& canonical_hdrs, + const boost::string_view& signed_hdrs, + const boost::string_view& request_payload_hash) +{ + ldout(cct, 10) << "payload request hash = " << request_payload_hash << dendl; + + const auto canonical_req = string_join_reserve("\n", + http_verb, + canonical_uri, + canonical_qs, + canonical_hdrs, + signed_hdrs, + request_payload_hash); + + const auto canonical_req_hash = calc_hash_sha256(canonical_req); + + using sanitize = rgw::crypt_sanitize::log_content; + ldout(cct, 10) << "canonical request = " << sanitize{canonical_req} << dendl; + ldout(cct, 10) << "canonical request hash = " + << canonical_req_hash << dendl; + + return canonical_req_hash; +} + +/* + * create string to sign for signature version 4 + * + * http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html + */ +AWSEngine::VersionAbstractor::string_to_sign_t +get_v4_string_to_sign(CephContext* const cct, + const boost::string_view& algorithm, + const boost::string_view& request_date, + const boost::string_view& credential_scope, + const sha256_digest_t& canonreq_hash) +{ + const auto hexed_cr_hash 
= canonreq_hash.to_str(); + const boost::string_view hexed_cr_hash_str(hexed_cr_hash); + + const auto string_to_sign = string_join_reserve("\n", + algorithm, + request_date, + credential_scope, + hexed_cr_hash_str); + + ldout(cct, 10) << "string to sign = " + << rgw::crypt_sanitize::log_content{string_to_sign} + << dendl; + + return string_to_sign; +} + + +static inline std::tuple /* service */ +parse_cred_scope(boost::string_view credential_scope) +{ + /* date cred */ + size_t pos = credential_scope.find("/"); + const auto date_cs = credential_scope.substr(0, pos); + credential_scope = credential_scope.substr(pos + 1); + + /* region cred */ + pos = credential_scope.find("/"); + const auto region_cs = credential_scope.substr(0, pos); + credential_scope = credential_scope.substr(pos + 1); + + /* service cred */ + pos = credential_scope.find("/"); + const auto service_cs = credential_scope.substr(0, pos); + + return std::make_tuple(date_cs, region_cs, service_cs); +} + +static inline std::vector +transform_secret_key(const boost::string_view& secret_access_key) +{ + /* TODO(rzarzynski): switch to constexpr when C++14 becomes available. */ + static const std::initializer_list AWS4 { 'A', 'W', 'S', '4' }; + + /* boost::container::small_vector might be used here if someone wants to + * optimize out even more dynamic allocations. 
*/ + std::vector secret_key_utf8; + secret_key_utf8.reserve(AWS4.size() + secret_access_key.size()); + secret_key_utf8.assign(AWS4); + + for (const auto c : secret_access_key) { + std::array buf; + const size_t n = encode_utf8(c, buf.data()); + secret_key_utf8.insert(std::end(secret_key_utf8), + std::begin(buf), std::begin(buf) + n); + } + + return secret_key_utf8; +} + +/* + * calculate the SigningKey of AWS auth version 4 + */ +static sha256_digest_t +get_v4_signing_key(CephContext* const cct, + const boost::string_view& credential_scope, + const boost::string_view& secret_access_key) +{ + boost::string_view date, region, service; + std::tie(date, region, service) = parse_cred_scope(credential_scope); + + const auto utfed_sec_key = transform_secret_key(secret_access_key); + const auto date_k = calc_hmac_sha256(utfed_sec_key, date); + const auto region_k = calc_hmac_sha256(date_k, region); + const auto service_k = calc_hmac_sha256(region_k, service); + + /* aws4_request */ + const auto signing_key = calc_hmac_sha256(service_k, + boost::string_view("aws4_request")); + + ldout(cct, 10) << "date_k = " << date_k << dendl; + ldout(cct, 10) << "region_k = " << region_k << dendl; + ldout(cct, 10) << "service_k = " << service_k << dendl; + ldout(cct, 10) << "signing_k = " << signing_key << dendl; + + return signing_key; +} + +/* + * calculate the AWS signature version 4 + * + * http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html + * + * srv_signature_t is an alias over Ceph's basic_sstring. We're using + * it to keep everything within the stack boundaries instead of doing + * dynamic allocations. 
+ */ +AWSEngine::VersionAbstractor::server_signature_t +get_v4_signature(const boost::string_view& credential_scope, + CephContext* const cct, + const boost::string_view& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign) +{ + auto signing_key = get_v4_signing_key(cct, credential_scope, secret_key); + + /* The server-side generated digest for comparison. */ + const auto digest = calc_hmac_sha256(signing_key, string_to_sign); + + /* TODO(rzarzynski): I would love to see our sstring having reserve() and + * the non-const data() variant like C++17's std::string. */ + using srv_signature_t = AWSEngine::VersionAbstractor::server_signature_t; + srv_signature_t signature(srv_signature_t::initialized_later(), + digest.SIZE * 2); + buf_to_hex(digest.v, digest.SIZE, signature.begin()); + + ldout(cct, 10) << "generated signature = " << signature << dendl; + + return signature; +} + +AWSEngine::VersionAbstractor::server_signature_t +get_v2_signature(CephContext* const cct, + const std::string& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign) +{ + if (secret_key.empty()) { + throw -EINVAL; + } + + const auto digest = calc_hmac_sha1(secret_key, string_to_sign); + + /* 64 is really enough */; + char buf[64]; + const int ret = ceph_armor(std::begin(buf), + std::begin(buf) + 64, + reinterpret_cast(digest.v), + reinterpret_cast(digest.v + digest.SIZE)); + if (ret < 0) { + ldout(cct, 10) << "ceph_armor failed" << dendl; + throw ret; + } else { + buf[ret] = '\0'; + using srv_signature_t = AWSEngine::VersionAbstractor::server_signature_t; + return srv_signature_t(buf, ret); + } +} + +bool AWSv4ComplMulti::ChunkMeta::is_new_chunk_in_stream(size_t stream_pos) const +{ + return stream_pos >= (data_offset_in_stream + data_length); +} + +size_t AWSv4ComplMulti::ChunkMeta::get_data_size(size_t stream_pos) const +{ + if (stream_pos > (data_offset_in_stream + data_length)) { + /* Data in parsing_buf. 
*/ + return data_length; + } else { + return data_offset_in_stream + data_length - stream_pos; + } +} + + +/* AWSv4 completers begin. */ +std::pair +AWSv4ComplMulti::ChunkMeta::create_next(CephContext* const cct, + ChunkMeta&& old, + const char* const metabuf, + const size_t metabuf_len) +{ + boost::string_ref metastr(metabuf, metabuf_len); + + const size_t semicolon_pos = metastr.find(";"); + if (semicolon_pos == boost::string_ref::npos) { + ldout(cct, 20) << "AWSv4ComplMulti cannot find the ';' separator" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + char* data_field_end; + /* strtoull ignores the "\r\n" sequence after each non-first chunk. */ + const size_t data_length = std::strtoull(metabuf, &data_field_end, 16); + if (data_length == 0 && data_field_end == metabuf) { + ldout(cct, 20) << "AWSv4ComplMulti: cannot parse the data size" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + /* Parse the chunk_signature=... part. */ + const auto signature_part = metastr.substr(semicolon_pos + 1); + const size_t eq_sign_pos = signature_part.find("="); + if (eq_sign_pos == boost::string_ref::npos) { + ldout(cct, 20) << "AWSv4ComplMulti: cannot find the '=' separator" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + /* OK, we have at least the beginning of a signature. 
*/ + const size_t data_sep_pos = signature_part.find("\r\n"); + if (data_sep_pos == boost::string_ref::npos) { + ldout(cct, 20) << "AWSv4ComplMulti: no new line at signature end" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + const auto signature = \ + signature_part.substr(eq_sign_pos + 1, data_sep_pos - 1 - eq_sign_pos); + if (signature.length() != SIG_SIZE) { + ldout(cct, 20) << "AWSv4ComplMulti: signature.length() != 64" + << dendl; + throw rgw::io::Exception(EINVAL, std::system_category()); + } + + /* Number of bytes taken by the chunk header itself, measured from the + * start of metabuf up to and including the "\r\n" that ends the + * signature. data_sep_pos is relative to signature_part, which begins + * right after the ';'. */ + const size_t meta_length = \ + semicolon_pos + strlen(";") + data_sep_pos + strlen("\r\n"); + + const size_t data_starts_in_stream = \ + meta_length + old.data_offset_in_stream + old.data_length; + + ldout(cct, 20) << "parsed new chunk; signature=" << signature + << ", data_length=" << data_length + << ", data_starts_in_stream=" << data_starts_in_stream + << dendl; + + /* Report the measured header length as the consumed byte count. The + * previous hard-coded semicolon_pos + 83 was only right for a 15-character + * key name with a 64-character signature, and could point past the end of + * metabuf otherwise. */ + return std::make_pair(ChunkMeta(data_starts_in_stream, + data_length, + signature), + meta_length); +} + +std::string +AWSv4ComplMulti::calc_chunk_signature(const std::string& payload_hash) const +{ + const auto string_to_sign = string_join_reserve("\n", + AWS4_HMAC_SHA256_PAYLOAD_STR, + date, + credential_scope, + prev_chunk_signature, + AWS4_EMPTY_PAYLOAD_HASH, + payload_hash); + + ldout(cct, 20) << "AWSv4ComplMulti: string_to_sign=\n" << string_to_sign + << dendl; + + /* new chunk signature */ + const auto sig = calc_hmac_sha256(signing_key, string_to_sign); + /* FIXME(rzarzynski): std::string here is really unnecessary. */ + return sig.to_str(); +} + + +bool AWSv4ComplMulti::is_signature_mismatched() +{ + /* The validity of previous chunk can be verified only after getting meta- + * data of the next one. 
*/ + const auto payload_hash = calc_hash_sha256_restart_stream(&sha256_hash); + const auto calc_signature = calc_chunk_signature(payload_hash); + + if (chunk_meta.get_signature() != calc_signature) { + ldout(cct, 20) << "AWSv4ComplMulti: ERROR: chunk signature mismatch" + << dendl; + ldout(cct, 20) << "AWSv4ComplMulti: declared signature=" + << chunk_meta.get_signature() << dendl; + ldout(cct, 20) << "AWSv4ComplMulti: calculated signature=" + << calc_signature << dendl; + + return true; + } else { + prev_chunk_signature = chunk_meta.get_signature(); + return false; + } +} + +size_t AWSv4ComplMulti::recv_body(char* const buf, const size_t buf_max) +{ + /* Buffer stores only parsed stream. Raw values reflect the stream + * we're getting from a client. */ + size_t buf_pos = 0; + + if (chunk_meta.is_new_chunk_in_stream(stream_pos)) { + /* Verify signature of the previous chunk. We aren't doing that for new + * one as the procedure requires calculation of payload hash. This code + * won't be triggered for the last, zero-length chunk. Instead, is will + * be checked in the complete() method. */ + if (stream_pos >= ChunkMeta::META_MAX_SIZE && is_signature_mismatched()) { + throw rgw::io::Exception(ERR_SIGNATURE_NO_MATCH, std::system_category()); + } + + /* We don't have metadata for this range. This means a new chunk, so we + * need to parse a fresh portion of the stream. Let's start. 
*/ + size_t to_extract = parsing_buf.capacity() - parsing_buf.size(); + do { + const size_t orig_size = parsing_buf.size(); + parsing_buf.resize(parsing_buf.size() + to_extract); + const size_t received = io_base_t::recv_body(parsing_buf.data() + orig_size, + to_extract); + parsing_buf.resize(parsing_buf.size() - (to_extract - received)); + if (received == 0) { + break; + } + + stream_pos += received; + to_extract -= received; + } while (to_extract > 0); + + size_t consumed; + std::tie(chunk_meta, consumed) = \ + ChunkMeta::create_next(cct, std::move(chunk_meta), + parsing_buf.data(), parsing_buf.size()); + + /* We can drop the bytes consumed during metadata parsing. The remainder + * can be chunk's data plus possibly beginning of next chunks' metadata. */ + parsing_buf.erase(std::begin(parsing_buf), + std::begin(parsing_buf) + consumed); + } + + size_t stream_pos_was = stream_pos - parsing_buf.size(); + + size_t to_extract = \ + std::min(chunk_meta.get_data_size(stream_pos_was), buf_max); + dout(30) << "AWSv4ComplMulti: stream_pos_was=" << stream_pos_was << ", to_extract=" << to_extract << dendl; + + /* It's quite probable we have a couple of real data bytes stored together + * with meta-data in the parsing_buf. We need to extract them and move to + * the final buffer. This is a trade-off between frontend's read overhead + * and memcpy. */ + if (to_extract > 0 && parsing_buf.size() > 0) { + const auto data_len = std::min(to_extract, parsing_buf.size()); + const auto data_end_iter = std::begin(parsing_buf) + data_len; + dout(30) << "AWSv4ComplMulti: to_extract=" << to_extract << ", data_len=" << data_len << dendl; + + std::copy(std::begin(parsing_buf), data_end_iter, buf); + parsing_buf.erase(std::begin(parsing_buf), data_end_iter); + + calc_hash_sha256_update_stream(sha256_hash, buf, data_len); + + to_extract -= data_len; + buf_pos += data_len; + } + + /* Now we can do the bulk read directly from RestfulClient without any extra + * buffering. 
*/ + while (to_extract > 0) { + const size_t received = io_base_t::recv_body(buf + buf_pos, to_extract); + dout(30) << "AWSv4ComplMulti: to_extract=" << to_extract << ", received=" << received << dendl; + + if (received == 0) { + break; + } + + calc_hash_sha256_update_stream(sha256_hash, buf + buf_pos, received); + + buf_pos += received; + stream_pos += received; + to_extract -= received; + } + + dout(20) << "AWSv4ComplMulti: filled=" << buf_pos << dendl; + return buf_pos; +} + +void AWSv4ComplMulti::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s_rw) +{ + const char* const decoded_length = \ + s_rw->info.env->get("HTTP_X_AMZ_DECODED_CONTENT_LENGTH"); + + if (!decoded_length) { + throw -EINVAL; + } else { + s_rw->length = decoded_length; + s_rw->content_length = parse_content_length(decoded_length); + + if (s_rw->content_length < 0) { + ldpp_dout(dpp, 10) << "negative AWSv4's content length, aborting" << dendl; + throw -EINVAL; + } + } + + /* Install the filter over rgw::io::RestfulClient. */ + AWS_AUTHv4_IO(s_rw)->add_filter( + std::static_pointer_cast(shared_from_this())); +} + +bool AWSv4ComplMulti::complete() +{ + /* Now it's time to verify the signature of the last, zero-length chunk. */ + if (is_signature_mismatched()) { + ldout(cct, 10) << "ERROR: signature of last chunk does not match" + << dendl; + return false; + } else { + return true; + } +} + +rgw::auth::Completer::cmplptr_t +AWSv4ComplMulti::create(const req_state* const s, + boost::string_view date, + boost::string_view credential_scope, + boost::string_view seed_signature, + const boost::optional& secret_key) +{ + if (!secret_key) { + /* Some external authorizers (like Keystone) aren't fully compliant with + * AWSv4. They do not provide the secret_key which is necessary to handle + * the streamed upload. 
*/ + throw -ERR_NOT_IMPLEMENTED; + } + + const auto signing_key = \ + rgw::auth::s3::get_v4_signing_key(s->cct, credential_scope, *secret_key); + + return std::make_shared(s, + std::move(date), + std::move(credential_scope), + std::move(seed_signature), + signing_key); +} + +size_t AWSv4ComplSingle::recv_body(char* const buf, const size_t max) +{ + const auto received = io_base_t::recv_body(buf, max); + calc_hash_sha256_update_stream(sha256_hash, buf, received); + + return received; +} + +void AWSv4ComplSingle::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s_rw) +{ + /* Install the filter over rgw::io::RestfulClient. */ + AWS_AUTHv4_IO(s_rw)->add_filter( + std::static_pointer_cast(shared_from_this())); +} + +bool AWSv4ComplSingle::complete() +{ + /* The completer is only for the cases where signed payload has been + * requested. It won't be used, for instance, during the query string-based + * authentication. */ + const auto payload_hash = calc_hash_sha256_close_stream(&sha256_hash); + + /* Validate x-amz-sha256 */ + if (payload_hash.compare(expected_request_payload_hash) == 0) { + return true; + } else { + ldout(cct, 10) << "ERROR: x-amz-content-sha256 does not match" + << dendl; + ldout(cct, 10) << "ERROR: grab_aws4_sha256_hash()=" + << payload_hash << dendl; + ldout(cct, 10) << "ERROR: expected_request_payload_hash=" + << expected_request_payload_hash << dendl; + return false; + } +} + +AWSv4ComplSingle::AWSv4ComplSingle(const req_state* const s) + : io_base_t(nullptr), + cct(s->cct), + expected_request_payload_hash(get_v4_exp_payload_hash(s->info)), + sha256_hash(calc_hash_sha256_open_stream()) { +} + +rgw::auth::Completer::cmplptr_t +AWSv4ComplSingle::create(const req_state* const s, + const boost::optional&) +{ + return std::make_shared(s); +} + +} /* namespace s3 */ +} /* namespace auth */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h new file mode 100644 index 00000000..519f8395 --- /dev/null +++ 
b/src/rgw/rgw_auth_s3.h @@ -0,0 +1,615 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_AUTH_S3_H +#define CEPH_RGW_AUTH_S3_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/sstring.hh" +#include "rgw_common.h" +#include "rgw_rest_s3.h" +#include "rgw_auth.h" +#include "rgw_auth_filters.h" +#include "rgw_auth_keystone.h" + + +namespace rgw { +namespace auth { +namespace s3 { + +static constexpr auto RGW_AUTH_GRACE = std::chrono::minutes{15}; + +// returns true if the request time is within RGW_AUTH_GRACE of the current time +bool is_time_skew_ok(time_t t); + +class STSAuthStrategy : public rgw::auth::Strategy, + public rgw::auth::RemoteApplier::Factory, + public rgw::auth::LocalApplier::Factory, + public rgw::auth::RoleApplier::Factory { + typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t; + RGWRados* const store; + rgw::auth::ImplicitTenants& implicit_tenant_context; + + STSEngine sts_engine; + + aplptr_t create_apl_remote(CephContext* const cct, + const req_state* const s, + rgw::auth::RemoteApplier::acl_strategy_t&& acl_alg, + const rgw::auth::RemoteApplier::AuthInfo &info + ) const override { + auto apl = rgw::auth::add_sysreq(cct, store, s, + rgw::auth::RemoteApplier(cct, store, std::move(acl_alg), info, + implicit_tenant_context, + rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_local(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info, + const std::string& subuser, + const boost::optional& perm_mask) const override { + auto apl = rgw::auth::add_sysreq(cct, store, s, + rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_role(CephContext* const cct, + const req_state* const s, + const string& role_name, + const rgw_user& 
user_id, + const vector& role_policies) const override { + auto apl = rgw::auth::add_sysreq(cct, store, s, + rgw::auth::RoleApplier(cct, role_name, user_id, role_policies)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + STSAuthStrategy(CephContext* const cct, + RGWRados* const store, + rgw::auth::ImplicitTenants& implicit_tenant_context, + AWSEngine::VersionAbstractor* const ver_abstractor) + : store(store), + implicit_tenant_context(implicit_tenant_context), + sts_engine(cct, store, *ver_abstractor, + static_cast(this), + static_cast(this), + static_cast(this)) { + if (cct->_conf->rgw_s3_auth_use_sts) { + add_engine(Control::SUFFICIENT, sts_engine); + } + } + + const char* get_name() const noexcept override { + return "rgw::auth::s3::STSAuthStrategy"; + } +}; + +class ExternalAuthStrategy : public rgw::auth::Strategy, + public rgw::auth::RemoteApplier::Factory { + typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t; + RGWRados* const store; + rgw::auth::ImplicitTenants& implicit_tenant_context; + + using keystone_config_t = rgw::keystone::CephCtxConfig; + using keystone_cache_t = rgw::keystone::TokenCache; + using EC2Engine = rgw::auth::keystone::EC2Engine; + + boost::optional keystone_engine; + LDAPEngine ldap_engine; + + aplptr_t create_apl_remote(CephContext* const cct, + const req_state* const s, + rgw::auth::RemoteApplier::acl_strategy_t&& acl_alg, + const rgw::auth::RemoteApplier::AuthInfo &info + ) const override { + auto apl = rgw::auth::add_sysreq(cct, store, s, + rgw::auth::RemoteApplier(cct, store, std::move(acl_alg), info, + implicit_tenant_context, + rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3)); + /* TODO(rzarzynski): replace with static_ptr. 
*/ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + ExternalAuthStrategy(CephContext* const cct, + RGWRados* const store, + rgw::auth::ImplicitTenants& implicit_tenant_context, + AWSEngine::VersionAbstractor* const ver_abstractor) + : store(store), + implicit_tenant_context(implicit_tenant_context), + ldap_engine(cct, store, *ver_abstractor, + static_cast(this)) { + + if (cct->_conf->rgw_s3_auth_use_keystone && + ! cct->_conf->rgw_keystone_url.empty()) { + + keystone_engine.emplace(cct, ver_abstractor, + static_cast(this), + keystone_config_t::get_instance(), + keystone_cache_t::get_instance()); + add_engine(Control::SUFFICIENT, *keystone_engine); + + } + + if (ldap_engine.valid()) { + add_engine(Control::SUFFICIENT, ldap_engine); + } + } + + const char* get_name() const noexcept override { + return "rgw::auth::s3::AWSv2ExternalAuthStrategy"; + } +}; + + +template +class AWSAuthStrategy : public rgw::auth::Strategy, + public rgw::auth::LocalApplier::Factory { + typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t; + + static_assert(std::is_base_of::value, + "AbstractorT must be a subclass of rgw::auth::s3::VersionAbstractor"); + + RGWRados* const store; + AbstractorT ver_abstractor; + + S3AnonymousEngine anonymous_engine; + ExternalAuthStrategy external_engines; + STSAuthStrategy sts_engine; + LocalEngine local_engine; + + aplptr_t create_apl_local(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info, + const std::string& subuser, + const boost::optional& perm_mask) const override { + auto apl = rgw::auth::add_sysreq(cct, store, s, + rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask)); + /* TODO(rzarzynski): replace with static_ptr. 
*/ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + using engine_map_t = std::map >; + void add_engines(const std::vector & auth_order, + engine_map_t eng_map) + { + auto ctrl_flag = Control::SUFFICIENT; + for (const auto &eng : auth_order) { + // fallback to the last engine, in case of multiple engines, since ctrl + // flag is sufficient for others, error from earlier engine is returned + if (&eng == &auth_order.back() && eng_map.size() > 1) { + ctrl_flag = Control::FALLBACK; + } + if (const auto kv = eng_map.find(eng); + kv != eng_map.end()) { + add_engine(ctrl_flag, kv->second); + } + } + } + + auto parse_auth_order(CephContext* const cct) + { + std::vector result; + + const std::set allowed_auth = { "sts", "external", "local" }; + std::vector default_order = { "sts", "external", "local" }; + // supplied strings may contain a space, so let's bypass that + boost::split(result, cct->_conf->rgw_s3_auth_order, + boost::is_any_of(", "), boost::token_compress_on); + + if (std::any_of(result.begin(), result.end(), + [allowed_auth](std::string_view s) + { return allowed_auth.find(s) == allowed_auth.end();})){ + return default_order; + } + return result; + } + + AWSAuthStrategy(CephContext* const cct, + rgw::auth::ImplicitTenants& implicit_tenant_context, + RGWRados* const store) + : store(store), + ver_abstractor(cct), + anonymous_engine(cct, + static_cast(this)), + external_engines(cct, store, implicit_tenant_context, &ver_abstractor), + sts_engine(cct, store, implicit_tenant_context, &ver_abstractor), + local_engine(cct, store, ver_abstractor, + static_cast(this)) { + /* The anonymous auth. */ + if (AllowAnonAccessT) { + add_engine(Control::SUFFICIENT, anonymous_engine); + } + + auto auth_order = parse_auth_order(cct); + engine_map_t engine_map; + + /* STS Auth*/ + if (! sts_engine.is_empty()) { + engine_map.insert(std::make_pair("sts", std::cref(sts_engine))); + } + + /* The external auth. */ + if (! 
external_engines.is_empty()) { + engine_map.insert(std::make_pair("external", std::cref(external_engines))); + } + /* The local auth. */ + if (cct->_conf->rgw_s3_auth_use_rados) { + engine_map.insert(std::make_pair("local", std::cref(local_engine))); + } + + add_engines(auth_order, engine_map); + } + + const char* get_name() const noexcept override { + return "rgw::auth::s3::AWSAuthStrategy"; + } +}; + + +class AWSv4ComplMulti : public rgw::auth::Completer, + public rgw::io::DecoratedRestfulClient, + public std::enable_shared_from_this { + using io_base_t = rgw::io::DecoratedRestfulClient; + using signing_key_t = sha256_digest_t; + + CephContext* const cct; + + const boost::string_view date; + const boost::string_view credential_scope; + const signing_key_t signing_key; + + class ChunkMeta { + size_t data_offset_in_stream = 0; + size_t data_length = 0; + std::string signature; + + ChunkMeta(const size_t data_starts_in_stream, + const size_t data_length, + const boost::string_ref signature) + : data_offset_in_stream(data_starts_in_stream), + data_length(data_length), + signature(signature.to_string()) { + } + + explicit ChunkMeta(const boost::string_view& signature) + : signature(signature.to_string()) { + } + + public: + static constexpr size_t SIG_SIZE = 64; + + /* Let's suppose the data length fields can't exceed uint64_t. */ + static constexpr size_t META_MAX_SIZE = \ + sarrlen("\r\nffffffffffffffff;chunk-signature=") + SIG_SIZE + sarrlen("\r\n"); + + /* The metadata size of for the last, empty chunk. */ + static constexpr size_t META_MIN_SIZE = \ + sarrlen("0;chunk-signature=") + SIG_SIZE + sarrlen("\r\n"); + + /* Detect whether a given stream_pos fits in boundaries of a chunk. */ + bool is_new_chunk_in_stream(size_t stream_pos) const; + + /* Get the remaining data size. 
/* Construct a completer for a chunked (multi-chunk) AWSv4 upload.
 *
 * We need the constructor to be public because of the std::make_shared that
 * is employed by the create() method.
 *
 * s                - request state; only its cct is read here
 * date             - request date, stored as given
 * credential_scope - credential scope, stored as given
 * seed_signature   - signature used to create the first ChunkMeta and to
 *                    initialise prev_chunk_signature
 * signing_key      - signing key, copied into the object
 *
 * NOTE(review): date and credential_scope are taken as boost::string_view;
 * if the corresponding members are views as well, the referenced characters
 * must outlive this object -- confirm against the callers. */
AWSv4ComplMulti(const req_state* const s,
                boost::string_view date,
                boost::string_view credential_scope,
                boost::string_view seed_signature,
                const signing_key_t& signing_key)
  : io_base_t(nullptr),  /* no decoratee is supplied at construction time */
    cct(s->cct),
    date(std::move(date)),
    credential_scope(std::move(credential_scope)),
    signing_key(signing_key),

    /* The evolving state. */
    chunk_meta(ChunkMeta::create_first(seed_signature)),
    stream_pos(0),
    /* Opens the running SHA-256 stream; the destructor closes it if it is
     * still open. */
    sha256_hash(calc_hash_sha256_open_stream()),
    prev_chunk_signature(std::move(seed_signature)) {
}
*/ + static cmplptr_t create(const req_state* s, + boost::string_view date, + boost::string_view credential_scope, + boost::string_view seed_signature, + const boost::optional& secret_key); + +}; + +class AWSv4ComplSingle : public rgw::auth::Completer, + public rgw::io::DecoratedRestfulClient, + public std::enable_shared_from_this { + using io_base_t = rgw::io::DecoratedRestfulClient; + + CephContext* const cct; + const char* const expected_request_payload_hash; + ceph::crypto::SHA256* sha256_hash = nullptr; + +public: + /* Defined in rgw_auth_s3.cc because of get_v4_exp_payload_hash(). We need + * the constructor to be public because of the std::make_shared employed by + * the create() method. */ + explicit AWSv4ComplSingle(const req_state* const s); + + ~AWSv4ComplSingle() { + if (sha256_hash) { + calc_hash_sha256_close_stream(&sha256_hash); + } + } + + /* rgw::io::DecoratedRestfulClient. */ + size_t recv_body(char* buf, size_t max) override; + + /* rgw::auth::Completer. */ + void modify_request_state(const DoutPrefixProvider* dpp, req_state* s_rw) override; + bool complete() override; + + /* Factories. 
/* Decide whether a single byte has to be percent-escaped for AWSv4.
 *
 * ASCII letters, digits and the characters '-', '_', '.', '~' are never
 * escaped. A '/' is escaped only when encode_slash is true. Every other
 * byte is escaped. */
static inline bool char_needs_aws4_escaping(const char c, bool encode_slash)
{
  const bool is_lower = ('a' <= c && c <= 'z');
  const bool is_upper = ('A' <= c && c <= 'Z');
  const bool is_digit = ('0' <= c && c <= '9');
  if (is_lower || is_upper || is_digit) {
    return false;
  }

  if (c == '-' || c == '_' || c == '.' || c == '~') {
    return false;
  }

  /* The slash is the only character whose treatment depends on the caller. */
  if (c == '/') {
    return encode_slash;
  }

  return true;
}
*/ + const char *expected_request_payload_hash = \ + info.env->get("HTTP_X_AMZ_CONTENT_SHA256"); + + if (!expected_request_payload_hash) { + /* An HTTP client MUST send x-amz-content-sha256. The single exception + * is the case of using the Query Parameters where "UNSIGNED-PAYLOAD" + * literals are used for crafting Canonical Request: + * + * You don't include a payload hash in the Canonical Request, because + * when you create a presigned URL, you don't know the payload content + * because the URL is used to upload an arbitrary payload. Instead, you + * use a constant string UNSIGNED-PAYLOAD. */ + expected_request_payload_hash = AWS4_UNSIGNED_PAYLOAD_HASH; + } + + return expected_request_payload_hash; +} + +static inline bool is_v4_payload_unsigned(const char* const exp_payload_hash) +{ + return boost::equals(exp_payload_hash, AWS4_UNSIGNED_PAYLOAD_HASH); +} + +static inline bool is_v4_payload_empty(const req_state* const s) +{ + /* from rfc2616 - 4.3 Message Body + * + * "The presence of a message-body in a request is signaled by the inclusion + * of a Content-Length or Transfer-Encoding header field in the request's + * message-headers." 
*/ + return s->content_length == 0 && + s->info.env->get("HTTP_TRANSFER_ENCODING") == nullptr; +} + +static inline bool is_v4_payload_streamed(const char* const exp_payload_hash) +{ + return boost::equals(exp_payload_hash, AWS4_STREAMING_PAYLOAD_HASH); +} + +std::string get_v4_canonical_qs(const req_info& info, bool using_qs); + +boost::optional +get_v4_canonical_headers(const req_info& info, + const boost::string_view& signedheaders, + bool using_qs, + bool force_boto2_compat); + +extern sha256_digest_t +get_v4_canon_req_hash(CephContext* cct, + const boost::string_view& http_verb, + const std::string& canonical_uri, + const std::string& canonical_qs, + const std::string& canonical_hdrs, + const boost::string_view& signed_hdrs, + const boost::string_view& request_payload_hash); + +AWSEngine::VersionAbstractor::string_to_sign_t +get_v4_string_to_sign(CephContext* cct, + const boost::string_view& algorithm, + const boost::string_view& request_date, + const boost::string_view& credential_scope, + const sha256_digest_t& canonreq_hash); + +extern AWSEngine::VersionAbstractor::server_signature_t +get_v4_signature(const boost::string_view& credential_scope, + CephContext* const cct, + const boost::string_view& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign); + +extern AWSEngine::VersionAbstractor::server_signature_t +get_v2_signature(CephContext*, + const std::string& secret_key, + const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign); +} /* namespace s3 */ +} /* namespace auth */ +} /* namespace rgw */ + +#endif diff --git a/src/rgw/rgw_b64.h b/src/rgw/rgw_b64.h new file mode 100644 index 00000000..c4ad9880 --- /dev/null +++ b/src/rgw/rgw_b64.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_B64_H +#define RGW_B64_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rgw { + + /* + * A 
header-only Base64 encoder built on boost::archive. The + * formula is based on a class poposed for inclusion in boost in + * 2011 by Denis Shevchenko (abandoned), updated slightly + * (e.g., uses boost::string_view). + * + * Also, wrap_width added as template argument, based on + * feedback from Marcus. + */ + + template::max()> + inline std::string to_base64(boost::string_view sview) + { + using namespace boost::archive::iterators; + + // output must be =padded modulo 3 + auto psize = sview.size(); + while ((psize % 3) != 0) { + ++psize; + } + + /* RFC 2045 requires linebreaks to be present in the output + * sequence every at-most 76 characters (MIME-compliance), + * but we could likely omit it. */ + typedef + insert_linebreaks< + base64_from_binary< + transform_width< + boost::string_view::const_iterator + ,6,8> + > + ,wrap_width + > b64_iter; + + std::string outstr(b64_iter(sview.data()), + b64_iter(sview.data() + sview.size())); + + // pad outstr with '=' to a length that is a multiple of 3 + for (size_t ix = 0; ix < (psize-sview.size()); ++ix) + outstr.push_back('='); + + return outstr; + } + + inline std::string from_base64(boost::string_view sview) + { + using namespace boost::archive::iterators; + if (sview.empty()) + return std::string(); + /* MIME-compliant input will have line-breaks, so we have to + * filter WS */ + typedef + transform_width< + binary_from_base64< + remove_whitespace< + boost::string_view::const_iterator>> + ,8,6 + > b64_iter; + + while (sview.back() == '=') + sview.remove_suffix(1); + + std::string outstr(b64_iter(sview.data()), + b64_iter(sview.data() + sview.size())); + + return outstr; + } +} /* namespace */ + +#endif /* RGW_B64_H */ diff --git a/src/rgw/rgw_basic_types.cc b/src/rgw/rgw_basic_types.cc new file mode 100644 index 00000000..b1db690b --- /dev/null +++ b/src/rgw/rgw_basic_types.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include 
+#include + +#include "rgw_basic_types.h" +#include "rgw_xml.h" +#include "common/ceph_json.h" + +using std::string; +using std::stringstream; + +void decode_json_obj(rgw_user& val, JSONObj *obj) +{ + val.from_str(obj->get_data()); +} + +void encode_json(const char *name, const rgw_user& val, Formatter *f) +{ + f->dump_string(name, val.to_str()); +} + +void encode_xml(const char *name, const rgw_user& val, Formatter *f) +{ + encode_xml(name, val.to_str(), f); +} + +namespace rgw { +namespace auth { +ostream& operator <<(ostream& m, const Principal& p) { + if (p.is_wildcard()) { + return m << "*"; + } + + m << "arn:aws:iam:" << p.get_tenant() << ":"; + if (p.is_tenant()) { + return m << "root"; + } + return m << (p.is_user() ? "user/" : "role/") << p.get_id(); +} +} +} diff --git a/src/rgw/rgw_basic_types.h b/src/rgw/rgw_basic_types.h new file mode 100644 index 00000000..c8d3abb7 --- /dev/null +++ b/src/rgw/rgw_basic_types.h @@ -0,0 +1,213 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_BASIC_TYPES_H +#define CEPH_RGW_BASIC_TYPES_H + +#include + +#include "include/types.h" + +struct rgw_user { + std::string tenant; + std::string id; + + rgw_user() {} + // cppcheck-suppress noExplicitConstructor + rgw_user(const std::string& s) { + from_str(s); + } + rgw_user(const std::string& tenant, const std::string& id) + : tenant(tenant), + id(id) { + } + rgw_user(std::string&& tenant, std::string&& id) + : tenant(std::move(tenant)), + id(std::move(id)) { + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(tenant, bl); + encode(id, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(tenant, bl); + decode(id, bl); + DECODE_FINISH(bl); + } + + void to_str(std::string& str) const { + if (!tenant.empty()) { + str = tenant + '$' + id; + } else { + str = id; + } + } + + void clear() { + tenant.clear(); + id.clear(); + } + + 
bool empty() const { + return id.empty(); + } + + string to_str() const { + string s; + to_str(s); + return s; + } + + void from_str(const std::string& str) { + size_t pos = str.find('$'); + if (pos != std::string::npos) { + tenant = str.substr(0, pos); + id = str.substr(pos + 1); + } else { + tenant.clear(); + id = str; + } + } + + rgw_user& operator=(const string& str) { + from_str(str); + return *this; + } + + int compare(const rgw_user& u) const { + int r = tenant.compare(u.tenant); + if (r != 0) + return r; + + return id.compare(u.id); + } + int compare(const string& str) const { + rgw_user u(str); + return compare(u); + } + + bool operator!=(const rgw_user& rhs) const { + return (compare(rhs) != 0); + } + bool operator==(const rgw_user& rhs) const { + return (compare(rhs) == 0); + } + bool operator<(const rgw_user& rhs) const { + if (tenant < rhs.tenant) { + return true; + } else if (tenant > rhs.tenant) { + return false; + } + return (id < rhs.id); + } +}; +WRITE_CLASS_ENCODER(rgw_user) + +// Represents an identity. This is more wide-ranging than a +// 'User'. Its purposes is to be matched against by an +// IdentityApplier. The internal representation will doubtless change as +// more types are added. We may want to expose the type enum and make +// the member public so people can switch/case on it. 
+ +namespace rgw { +namespace auth { +class Principal { + enum types { User, Role, Tenant, Wildcard, OidcProvider }; + types t; + rgw_user u; + string idp_url; + + explicit Principal(types t) + : t(t) {} + + Principal(types t, std::string&& n, std::string i) + : t(t), u(std::move(n), std::move(i)) {} + + Principal(string&& idp_url) + : t(OidcProvider), idp_url(std::move(idp_url)) {} + +public: + + static Principal wildcard() { + return Principal(Wildcard); + } + + static Principal user(std::string&& t, std::string&& u) { + return Principal(User, std::move(t), std::move(u)); + } + + static Principal role(std::string&& t, std::string&& u) { + return Principal(Role, std::move(t), std::move(u)); + } + + static Principal tenant(std::string&& t) { + return Principal(Tenant, std::move(t), {}); + } + + static Principal oidc_provider(string&& idp_url) { + return Principal(std::move(idp_url)); + } + + bool is_wildcard() const { + return t == Wildcard; + } + + bool is_user() const { + return t == User; + } + + bool is_role() const { + return t == Role; + } + + bool is_tenant() const { + return t == Tenant; + } + + bool is_oidc_provider() const { + return t == OidcProvider; + } + + const std::string& get_tenant() const { + return u.tenant; + } + + const std::string& get_id() const { + return u.id; + } + + const string& get_idp_url() const { + return idp_url; + } + + bool operator ==(const Principal& o) const { + return (t == o.t) && (u == o.u); + } + + bool operator <(const Principal& o) const { + return (t < o.t) || ((t == o.t) && (u < o.u)); + } +}; + +std::ostream& operator <<(std::ostream& m, const Principal& p); +} +} + +class JSONObj; + +void decode_json_obj(rgw_user& val, JSONObj *obj); +void encode_json(const char *name, const rgw_user& val, Formatter *f); +void encode_xml(const char *name, const rgw_user& val, Formatter *f); + +inline ostream& operator<<(ostream& out, const rgw_user &u) { + string s; + u.to_str(s); + return out << s; +} + + +#endif diff --git 
a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc new file mode 100644 index 00000000..f022222f --- /dev/null +++ b/src/rgw/rgw_bucket.cc @@ -0,0 +1,3178 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include +#include + +#include +#include + +#include "common/errno.h" +#include "common/ceph_json.h" +#include "include/scope_guard.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" + +#include "include/types.h" +#include "rgw_bucket.h" +#include "rgw_user.h" +#include "rgw_string.h" +#include "rgw_multi.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#include "include/rados/librados.hpp" +// until everything is moved from rgw_common +#include "rgw_common.h" +#include "rgw_reshard.h" +#include "rgw_lc.h" +#include "cls/user/cls_user_types.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +#define BUCKET_TAG_TIMEOUT 30 + +// default number of entries to list with each bucket listing call +// (use marker to bridge between calls) +static constexpr size_t listing_max_entries = 1000; + + +static RGWMetadataHandler *bucket_meta_handler = NULL; +static RGWMetadataHandler *bucket_instance_meta_handler = NULL; + +// define as static when RGWBucket implementation completes +void rgw_get_buckets_obj(const rgw_user& user_id, string& buckets_obj_id) +{ + buckets_obj_id = user_id.to_str(); + buckets_obj_id += RGW_BUCKETS_OBJ_SUFFIX; +} + +/* + * Note that this is not a reversal of parse_bucket(). That one deals + * with the syntax we need in metadata and such. This one deals with + * the representation in RADOS pools. We chose '/' because it's not + * acceptable in bucket names and thus qualified buckets cannot conflict + * with the legacy or S3 buckets. 
/*
 * Tenants are separated from buckets in URLs by a colon in S3.
 * This function is not to be used on Swift URLs, not even for COPY arguments.
 *
 * bucket       - bucket reference as found in the URL, "[tenant]:name" or
 *                just "name"
 * auth_tenant  - tenant to fall back to when the URL names none
 * tenant_name  - out: the tenant part (may be empty for ":name")
 * bucket_name  - out: everything after the first colon, or the whole input
 *                when there is no colon
 */
void rgw_parse_url_bucket(const string &bucket, const string& auth_tenant,
                          string &tenant_name, string &bucket_name) {

  // Keep the position in the string's own size type and compare against
  // npos; squeezing it into an int relies on a narrowing conversion to
  // signal "not found" and misreads positions beyond INT_MAX.
  const string::size_type colon = bucket.find(':');
  if (colon == string::npos) {
    tenant_name = auth_tenant;
    bucket_name = bucket;
    return;
  }

  /*
   * N.B.: We allow ":bucket" syntax with explicit empty tenant in order
   * to refer to the legacy tenant, in case users in new named tenants
   * want to access old global buckets.
   */
  tenant_name = bucket.substr(0, colon);
  bucket_name = bucket.substr(colon + 1);
}
+ */ +int rgw_read_user_buckets(RGWRados * store, + const rgw_user& user_id, + RGWUserBuckets& buckets, + const string& marker, + const string& end_marker, + uint64_t max, + bool need_stats, + bool *is_truncated, + uint64_t default_amount) +{ + int ret; + buckets.clear(); + std::string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + + bool truncated = false; + string m = marker; + + uint64_t total = 0; + + if (!max) { + max = default_amount; + } + + do { + std::list entries; + ret = store->cls_user_list_buckets(obj, m, end_marker, max - total, entries, &m, &truncated); + if (ret == -ENOENT) { + ret = 0; + } + + if (ret < 0) { + return ret; + } + + for (auto& entry : entries) { + buckets.add(RGWBucketEnt(user_id, std::move(entry))); + total++; + } + + } while (truncated && total < max); + + if (is_truncated != nullptr) { + *is_truncated = truncated; + } + + if (need_stats) { + map& m = buckets.get_buckets(); + ret = store->update_containers_stats(m); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: could not get stats for buckets" << dendl; + return ret; + } + } + return 0; +} + +int rgw_bucket_sync_user_stats(RGWRados *store, const rgw_user& user_id, const RGWBucketInfo& bucket_info) +{ + string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + + return store->cls_user_sync_bucket_stats(obj, bucket_info); +} + +int rgw_bucket_sync_user_stats(RGWRados *store, const string& tenant_name, const string& bucket_name) +{ + RGWBucketInfo bucket_info; + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + int ret = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: could not fetch bucket info: ret=" << ret << dendl; + return ret; + } + + ret = 
/**
 * Link a bucket to a user: add it to the user's bucket-list object and,
 * when update_entrypoint is set, mark the bucket entrypoint as linked and
 * owned by user_id.
 *
 * creation_time is recorded in the list entry; a zero value is replaced by
 * the current time.
 *
 * Returns: 0 on success, a negative error code on failure. On failure the
 * bucket is unlinked again as a best-effort cleanup and the original error
 * is returned.
 */
int rgw_link_bucket(RGWRados* const store,
                    const rgw_user& user_id,
                    rgw_bucket& bucket,
                    ceph::real_time creation_time,
                    bool update_entrypoint)
{
  int ret;
  string& tenant_name = bucket.tenant;
  string& bucket_name = bucket.name;

  cls_user_bucket_entry new_bucket;

  RGWBucketEntryPoint ep;
  RGWObjVersionTracker ot;

  bucket.convert(&new_bucket.bucket);
  new_bucket.size = 0;
  if (real_clock::is_zero(creation_time))
    new_bucket.creation_time = real_clock::now();
  else
    new_bucket.creation_time = creation_time;

  // NOTE(review): the template arguments of this map appear to be missing
  // in this copy of the source -- confirm against upstream.
  map attrs;
  RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx();

  if (update_entrypoint) {
    // Read the current entrypoint (and its version/attrs) so it can be
    // rewritten below. A failure other than ENOENT is only logged; the
    // function carries on regardless.
    ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, ep, &ot, NULL, &attrs);
    if (ret < 0 && ret != -ENOENT) {
      ldout(store->ctx(), 0) << "ERROR: store->get_bucket_entrypoint_info() returned: "
                             << cpp_strerror(-ret) << dendl;
    }
  }

  string buckets_obj_id;
  rgw_get_buckets_obj(user_id, buckets_obj_id);

  rgw_raw_obj obj(store->svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
  ret = store->cls_user_add_bucket(obj, new_bucket);
  if (ret < 0) {
    ldout(store->ctx(), 0) << "ERROR: error adding bucket to directory: "
                           << cpp_strerror(-ret) << dendl;
    goto done_err;
  }

  // Without an entrypoint update the list entry is all there is to do.
  if (!update_entrypoint)
    return 0;

  ep.linked = true;
  ep.owner = user_id;
  ep.bucket = bucket;
  ret = store->put_bucket_entrypoint_info(tenant_name, bucket_name, ep, false, ot, real_time(), &attrs);
  if (ret < 0)
    goto done_err;

  return 0;
done_err:
  // Roll back the link. A failure of the rollback itself is logged, but the
  // error that brought us here is what the caller gets.
  // NOTE(review): update_entrypoint is left to rgw_unlink_bucket's default
  // argument, which is declared outside this view -- confirm its value.
  int r = rgw_unlink_bucket(store, user_id, bucket.tenant, bucket.name);
  if (r < 0) {
    ldout(store->ctx(), 0) << "ERROR: failed unlinking bucket on error cleanup: "
                           << cpp_strerror(-r) << dendl;
  }
  return ret;
}
+int rgw_unlink_bucket(RGWRados *store, const rgw_user& user_id, const string& tenant_name, const string& bucket_name, bool update_entrypoint) +{ + int ret; + + string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + + cls_user_bucket bucket; + bucket.name = bucket_name; + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + ret = store->cls_user_remove_bucket(obj, bucket); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: error removing bucket from directory: " + << cpp_strerror(-ret)<< dendl; + } + + if (!update_entrypoint) + return 0; + + RGWBucketEntryPoint ep; + RGWObjVersionTracker ot; + map attrs; + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, ep, &ot, NULL, &attrs); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + + if (!ep.linked) + return 0; + + if (ep.owner != user_id) { + ldout(store->ctx(), 0) << "bucket entry point user mismatch, can't unlink bucket: " << ep.owner << " != " << user_id << dendl; + return -EINVAL; + } + + ep.linked = false; + return store->put_bucket_entrypoint_info(tenant_name, bucket_name, ep, false, ot, real_time(), &attrs); +} + +int rgw_bucket_store_info(RGWRados *store, const string& bucket_name, bufferlist& bl, bool exclusive, + map *pattrs, RGWObjVersionTracker *objv_tracker, + real_time mtime) { + return store->meta_mgr->put_entry(bucket_meta_handler, bucket_name, bl, exclusive, objv_tracker, mtime, pattrs); +} + +int rgw_bucket_instance_store_info(RGWRados *store, string& entry, bufferlist& bl, bool exclusive, + map *pattrs, RGWObjVersionTracker *objv_tracker, + real_time mtime) { + return store->meta_mgr->put_entry(bucket_instance_meta_handler, entry, bl, exclusive, objv_tracker, mtime, pattrs); +} + +int rgw_bucket_instance_remove_entry(RGWRados *store, const string& entry, + RGWObjVersionTracker *objv_tracker) { + return 
store->meta_mgr->remove_entry(bucket_instance_meta_handler, entry, objv_tracker); +} + +// 'tenant/' is used in bucket instance keys for sync to avoid parsing ambiguity +// with the existing instance[:shard] format. once we parse the shard, the / is +// replaced with a : to match the [tenant:]instance format +void rgw_bucket_instance_key_to_oid(string& key) +{ + // replace tenant/ with tenant: + auto c = key.find('/'); + if (c != string::npos) { + key[c] = ':'; + } +} + +// convert bucket instance oids back to the tenant/ format for metadata keys. +// it's safe to parse 'tenant:' only for oids, because they won't contain the +// optional :shard at the end +void rgw_bucket_instance_oid_to_key(string& oid) +{ + // find first : (could be tenant:bucket or bucket:instance) + auto c = oid.find(':'); + if (c != string::npos) { + // if we find another :, the first one was for tenant + if (oid.find(':', c + 1) != string::npos) { + oid[c] = '/'; + } + } +} + +int rgw_bucket_parse_bucket_instance(const string& bucket_instance, string *target_bucket_instance, int *shard_id) +{ + ssize_t pos = bucket_instance.rfind(':'); + if (pos < 0) { + return -EINVAL; + } + + string first = bucket_instance.substr(0, pos); + string second = bucket_instance.substr(pos + 1); + + if (first.find(':') == string::npos) { + *shard_id = -1; + *target_bucket_instance = bucket_instance; + return 0; + } + + *target_bucket_instance = first; + string err; + *shard_id = strict_strtol(second.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + + return 0; +} + +// parse key in format: [tenant/]name:instance[:shard_id] +int rgw_bucket_parse_bucket_key(CephContext *cct, const string& key, + rgw_bucket *bucket, int *shard_id) +{ + boost::string_ref name{key}; + boost::string_ref instance; + + // split tenant/name + auto pos = name.find('/'); + if (pos != boost::string_ref::npos) { + auto tenant = name.substr(0, pos); + bucket->tenant.assign(tenant.begin(), tenant.end()); + name = name.substr(pos 
+ 1); + } else { + bucket->tenant.clear(); + } + + // split name:instance + pos = name.find(':'); + if (pos != boost::string_ref::npos) { + instance = name.substr(pos + 1); + name = name.substr(0, pos); + } + bucket->name.assign(name.begin(), name.end()); + + // split instance:shard + pos = instance.find(':'); + if (pos == boost::string_ref::npos) { + bucket->bucket_id.assign(instance.begin(), instance.end()); + *shard_id = -1; + return 0; + } + + // parse shard id + auto shard = instance.substr(pos + 1); + string err; + auto id = strict_strtol(shard.data(), 10, &err); + if (!err.empty()) { + ldout(cct, 0) << "ERROR: failed to parse bucket shard '" + << instance.data() << "': " << err << dendl; + return -EINVAL; + } + + *shard_id = id; + instance = instance.substr(0, pos); + bucket->bucket_id.assign(instance.begin(), instance.end()); + return 0; +} + +int rgw_bucket_set_attrs(RGWRados *store, RGWBucketInfo& bucket_info, + map& attrs, + RGWObjVersionTracker *objv_tracker) +{ + rgw_bucket& bucket = bucket_info.bucket; + + if (!bucket_info.has_instance_obj) { + /* an old bucket object, need to convert it */ + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + int ret = store->convert_old_bucket_info(obj_ctx, bucket.tenant, bucket.name); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: failed converting old bucket info: " << ret << dendl; + return ret; + } + } + + /* we want the bucket instance name without the oid prefix cruft */ + string key = bucket.get_key(); + bufferlist bl; + + encode(bucket_info, bl); + + return rgw_bucket_instance_store_info(store, key, bl, false, &attrs, objv_tracker, real_time()); +} + +static void dump_mulipart_index_results(list& objs_to_unlink, + Formatter *f) +{ + for (const auto& o : objs_to_unlink) { + f->dump_string("object", o.name); + } +} + +void check_bad_user_bucket_mapping(RGWRados *store, const rgw_user& user_id, + bool fix) +{ + RGWUserBuckets user_buckets; + bool is_truncated = false; + string marker; + + 
CephContext *cct = store->ctx(); + + size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk; + + do { + int ret = rgw_read_user_buckets(store, user_id, user_buckets, marker, + string(), max_entries, false, + &is_truncated); + if (ret < 0) { + ldout(store->ctx(), 0) << "failed to read user buckets: " + << cpp_strerror(-ret) << dendl; + return; + } + + map& buckets = user_buckets.get_buckets(); + for (map::iterator i = buckets.begin(); + i != buckets.end(); + ++i) { + marker = i->first; + + RGWBucketEnt& bucket_ent = i->second; + rgw_bucket& bucket = bucket_ent.bucket; + + RGWBucketInfo bucket_info; + real_time mtime; + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_info(obj_ctx, user_id.tenant, bucket.name, bucket_info, &mtime); + if (r < 0) { + ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl; + continue; + } + + rgw_bucket& actual_bucket = bucket_info.bucket; + + if (actual_bucket.name.compare(bucket.name) != 0 || + actual_bucket.tenant.compare(bucket.tenant) != 0 || + actual_bucket.marker.compare(bucket.marker) != 0 || + actual_bucket.bucket_id.compare(bucket.bucket_id) != 0) { + cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl; + if (fix) { + cout << "fixing" << std::endl; + r = rgw_link_bucket(store, user_id, actual_bucket, + bucket_info.creation_time); + if (r < 0) { + cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl; + } + } + } + } + } while (is_truncated); +} + +static bool bucket_object_check_filter(const string& oid) +{ + rgw_obj_key key; + string ns; + return rgw_obj_key::oid_to_key_in_ns(oid, &key, ns); +} + +int rgw_remove_object(RGWRados *store, const RGWBucketInfo& bucket_info, const rgw_bucket& bucket, rgw_obj_key& key) +{ + RGWObjectCtx rctx(store); + + if (key.instance.empty()) { + key.instance = "null"; + } + + rgw_obj obj(bucket, key); + + return store->delete_obj(rctx, bucket_info, obj, 
bucket_info.versioning_status()); +} + +int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children) +{ + int ret; + map stats; + std::vector objs; + map common_prefixes; + RGWBucketInfo info; + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + + string bucket_ver, master_ver; + + ret = store->get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL); + if (ret < 0) + return ret; + + ret = store->get_bucket_stats(info, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL); + if (ret < 0) + return ret; + + RGWRados::Bucket target(store, info); + RGWRados::Bucket::List list_op(&target); + CephContext *cct = store->ctx(); + int max = 1000; + + list_op.params.list_versions = true; + list_op.params.allow_unordered = true; + + bool is_truncated = false; + do { + objs.clear(); + + ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated); + if (ret < 0) + return ret; + + if (!objs.empty() && !delete_children) { + lderr(store->ctx()) << "ERROR: could not remove non-empty bucket " << bucket.name << dendl; + return -ENOTEMPTY; + } + + for (const auto& obj : objs) { + rgw_obj_key key(obj.key); + ret = rgw_remove_object(store, info, bucket, key); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + } + } while(is_truncated); + + string prefix, delimiter; + + ret = abort_bucket_multiparts(store, cct, info, prefix, delimiter); + if (ret < 0) { + return ret; + } + + ret = rgw_bucket_sync_user_stats(store, info.owner, info); + if ( ret < 0) { + dout(1) << "WARNING: failed sync user stats before bucket delete. 
ret=" << ret << dendl; + } + + RGWObjVersionTracker objv_tracker; + + // if we deleted children above we will force delete, as any that + // remain is detrius from a prior bug + ret = store->delete_bucket(info, objv_tracker, !delete_children); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not remove bucket " << + bucket.name << dendl; + return ret; + } + + ret = rgw_unlink_bucket(store, info.owner, bucket.tenant, bucket.name, false); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: unable to remove user bucket information" << dendl; + } + + return ret; +} + +static int aio_wait(librados::AioCompletion *handle) +{ + librados::AioCompletion *c = (librados::AioCompletion *)handle; + c->wait_for_safe(); + int ret = c->get_return_value(); + c->release(); + return ret; +} + +static int drain_handles(list& pending) +{ + int ret = 0; + while (!pending.empty()) { + librados::AioCompletion *handle = pending.front(); + pending.pop_front(); + int r = aio_wait(handle); + if (r < 0) { + ret = r; + } + } + return ret; +} + +int rgw_remove_bucket_bypass_gc(RGWRados *store, rgw_bucket& bucket, + int concurrent_max, bool keep_index_consistent) +{ + int ret; + map stats; + std::vector objs; + map common_prefixes; + RGWBucketInfo info; + RGWObjectCtx obj_ctx(store); + RGWSysObjectCtx sysobj_ctx = store->svc.sysobj->init_obj_ctx(); + CephContext *cct = store->ctx(); + + string bucket_ver, master_ver; + + ret = store->get_bucket_info(sysobj_ctx, bucket.tenant, bucket.name, info, NULL); + if (ret < 0) + return ret; + + ret = store->get_bucket_stats(info, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL); + if (ret < 0) + return ret; + + string prefix, delimiter; + + ret = abort_bucket_multiparts(store, cct, info, prefix, delimiter); + if (ret < 0) { + return ret; + } + + RGWRados::Bucket target(store, info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.list_versions = true; + list_op.params.allow_unordered = true; + + std::list handles; + + int max = 1000; 
+ int max_aio = concurrent_max; + bool is_truncated = true; + + while (is_truncated) { + objs.clear(); + ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated); + if (ret < 0) + return ret; + + std::vector::iterator it = objs.begin(); + for (; it != objs.end(); ++it) { + RGWObjState *astate = NULL; + rgw_obj obj(bucket, (*it).key); + + ret = store->get_obj_state(&obj_ctx, info, obj, &astate, false); + if (ret == -ENOENT) { + dout(1) << "WARNING: cannot find obj state for obj " << obj.get_oid() << dendl; + continue; + } + if (ret < 0) { + lderr(store->ctx()) << "ERROR: get obj state returned with error " << ret << dendl; + return ret; + } + + if (astate->has_manifest) { + RGWObjManifest& manifest = astate->manifest; + RGWObjManifest::obj_iterator miter = manifest.obj_begin(); + rgw_obj head_obj = manifest.get_obj(); + rgw_raw_obj raw_head_obj; + store->obj_to_raw(info.placement_rule, head_obj, &raw_head_obj); + + + for (; miter != manifest.obj_end() && max_aio--; ++miter) { + if (!max_aio) { + ret = drain_handles(handles); + if (ret < 0 && ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + max_aio = concurrent_max; + } + + rgw_raw_obj last_obj = miter.get_location().get_raw_obj(store); + if (last_obj == raw_head_obj) { + // have the head obj deleted at the end + continue; + } + + ret = store->delete_raw_obj_aio(last_obj, handles); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: delete obj aio failed with " << ret << dendl; + return ret; + } + } // for all shadow objs + + ret = store->delete_obj_aio(head_obj, info, astate, handles, keep_index_consistent); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: delete obj aio failed with " << ret << dendl; + return ret; + } + } + + if (!max_aio) { + ret = drain_handles(handles); + if (ret < 0 && ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << 
dendl; + return ret; + } + max_aio = concurrent_max; + } + obj_ctx.invalidate(obj); + } // for all RGW objects + } + + ret = drain_handles(handles); + if (ret < 0 && ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + + ret = rgw_bucket_sync_user_stats(store, info.owner, info); + if (ret < 0) { + dout(1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl; + } + + RGWObjVersionTracker objv_tracker; + + // this function can only be run if caller wanted children to be + // deleted, so we can ignore the check for children as any that + // remain are detritus from a prior bug + ret = store->delete_bucket(info, objv_tracker, false); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not remove bucket " << bucket.name << dendl; + return ret; + } + + ret = rgw_unlink_bucket(store, info.owner, bucket.tenant, bucket.name, false); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: unable to remove user bucket information" << dendl; + } + + return ret; +} + +int rgw_bucket_delete_bucket_obj(RGWRados *store, + const string& tenant_name, + const string& bucket_name, + RGWObjVersionTracker& objv_tracker) +{ + string key; + + rgw_make_bucket_entry_name(tenant_name, bucket_name, key); + return store->meta_mgr->remove_entry(bucket_meta_handler, key, &objv_tracker); +} + +static void set_err_msg(std::string *sink, std::string msg) +{ + if (sink && !msg.empty()) + *sink = msg; +} + +int RGWBucket::init(RGWRados *storage, RGWBucketAdminOpState& op_state) +{ + if (!storage) + return -EINVAL; + + store = storage; + + rgw_user user_id = op_state.get_user_id(); + tenant = user_id.tenant; + bucket_name = op_state.get_bucket_name(); + RGWUserBuckets user_buckets; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + if (bucket_name.empty() && user_id.empty()) + return -EINVAL; + + if (!bucket_name.empty()) { + int r = store->get_bucket_info(obj_ctx, tenant, 
bucket_name, bucket_info, NULL); + if (r < 0) { + ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket_name << dendl; + return r; + } + + op_state.set_bucket(bucket_info.bucket); + } + + if (!user_id.empty()) { + int r = rgw_get_user_info_by_uid(store, user_id, user_info); + if (r < 0) + return r; + + op_state.display_name = user_info.display_name; + } + + clear_failure(); + return 0; +} + +bool rgw_find_bucket_by_id(CephContext *cct, RGWMetadataManager *mgr, + const string& marker, const string& bucket_id, rgw_bucket* bucket_out) +{ + void *handle = NULL; + bool truncated = false; + int shard_id; + string s; + + int ret = mgr->list_keys_init("bucket.instance", marker, &handle); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + mgr->list_keys_complete(handle); + return -ret; + } + do { + list keys; + ret = mgr->list_keys_next(handle, 1000, keys, &truncated); + if (ret < 0) { + cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + mgr->list_keys_complete(handle); + return -ret; + } + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + s = *iter; + ret = rgw_bucket_parse_bucket_key(cct, s, bucket_out, &shard_id); + if (ret < 0) { + continue; + } + if (bucket_id == bucket_out->bucket_id) { + mgr->list_keys_complete(handle); + return true; + } + } + } while (truncated); + mgr->list_keys_complete(handle); + return false; +} + +int RGWBucket::link(RGWBucketAdminOpState& op_state, std::string *err_msg) +{ + if (!op_state.is_user_op()) { + set_err_msg(err_msg, "empty user id"); + return -EINVAL; + } + + string bucket_id = op_state.get_bucket_id(); + if (bucket_id.empty()) { + set_err_msg(err_msg, "empty bucket instance id"); + return -EINVAL; + } + + std::string display_name = op_state.get_user_display_name(); + rgw_bucket bucket = op_state.get_bucket(); + + const rgw_pool& root_pool = store->svc.zone->get_zone_params().domain_root; + std::string bucket_entry; + 
rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry); + rgw_raw_obj obj(root_pool, bucket_entry); + RGWObjVersionTracker objv_tracker; + + map attrs; + RGWBucketInfo bucket_info; + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, &attrs); + if (r < 0) { + return r; + } + + map::iterator aiter = attrs.find(RGW_ATTR_ACL); + if (aiter != attrs.end()) { + bufferlist aclbl = aiter->second; + RGWAccessControlPolicy policy; + ACLOwner owner; + try { + auto iter = aclbl.cbegin(); + decode(policy, iter); + owner = policy.get_owner(); + } catch (buffer::error& err) { + set_err_msg(err_msg, "couldn't decode policy"); + return -EIO; + } + + r = rgw_unlink_bucket(store, owner.get_id(), bucket.tenant, bucket.name, false); + if (r < 0) { + set_err_msg(err_msg, "could not unlink policy from user " + owner.get_id().to_str()); + return r; + } + + // now update the user for the bucket... + if (display_name.empty()) { + ldout(store->ctx(), 0) << "WARNING: user " << user_info.user_id << " has no display name set" << dendl; + } + policy.create_default(user_info.user_id, display_name); + + owner = policy.get_owner(); + r = store->set_bucket_owner(bucket_info.bucket, owner); + if (r < 0) { + set_err_msg(err_msg, "failed to set bucket owner: " + cpp_strerror(-r)); + return r; + } + + // ...and encode the acl + aclbl.clear(); + policy.encode(aclbl); + + auto sysobj = obj_ctx.get_obj(obj); + r = sysobj.wop() + .set_objv_tracker(&objv_tracker) + .write_attr(RGW_ATTR_ACL, aclbl); + if (r < 0) { + return r; + } + + RGWAccessControlPolicy policy_instance; + policy_instance.create_default(user_info.user_id, display_name); + aclbl.clear(); + policy_instance.encode(aclbl); + + rgw_raw_obj obj_bucket_instance; + store->get_bucket_instance_obj(bucket, obj_bucket_instance); + auto inst_sysobj = obj_ctx.get_obj(obj_bucket_instance); + r = inst_sysobj.wop() + .set_objv_tracker(&objv_tracker) + 
.write_attr(RGW_ATTR_ACL, aclbl); + if (r < 0) { + return r; + } + + r = rgw_link_bucket(store, user_info.user_id, bucket_info.bucket, + ceph::real_time()); + if (r < 0) { + return r; + } + } + + return 0; +} + +int RGWBucket::unlink(RGWBucketAdminOpState& op_state, std::string *err_msg) +{ + rgw_bucket bucket = op_state.get_bucket(); + + if (!op_state.is_user_op()) { + set_err_msg(err_msg, "could not fetch user or user bucket info"); + return -EINVAL; + } + + int r = rgw_unlink_bucket(store, user_info.user_id, bucket.tenant, bucket.name); + if (r < 0) { + set_err_msg(err_msg, "error unlinking bucket" + cpp_strerror(-r)); + } + + return r; +} + +int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, std::string *err_msg) +{ + rgw_bucket bucket = op_state.get_bucket(); + RGWBucketInfo bucket_info; + map attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL, &attrs); + if (r < 0) { + set_err_msg(err_msg, "could not get bucket info for bucket=" + bucket.name + ": " + cpp_strerror(-r)); + return r; + } + + bucket_info.quota = op_state.quota; + r = store->put_bucket_instance_info(bucket_info, false, real_time(), &attrs); + if (r < 0) { + set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r)); + return r; + } + return r; +} + +int RGWBucket::remove(RGWBucketAdminOpState& op_state, bool bypass_gc, + bool keep_index_consistent, std::string *err_msg) +{ + bool delete_children = op_state.will_delete_children(); + rgw_bucket bucket = op_state.get_bucket(); + int ret; + + if (bypass_gc) { + if (delete_children) { + ret = rgw_remove_bucket_bypass_gc(store, bucket, op_state.get_max_aio(), keep_index_consistent); + } else { + set_err_msg(err_msg, "purge objects should be set for gc to be bypassed"); + return -EINVAL; + } + } else { + ret = rgw_remove_bucket(store, bucket, delete_children); + } + + if (ret < 0) { + set_err_msg(err_msg, "unable to remove 
bucket" + cpp_strerror(-ret)); + return ret; + } + + return 0; +} + +int RGWBucket::remove_object(RGWBucketAdminOpState& op_state, std::string *err_msg) +{ + rgw_bucket bucket = op_state.get_bucket(); + std::string object_name = op_state.get_object_name(); + + rgw_obj_key key(object_name); + + int ret = rgw_remove_object(store, bucket_info, bucket, key); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove object" + cpp_strerror(-ret)); + return ret; + } + + return 0; +} + +static void dump_bucket_index(map result, Formatter *f) +{ + map::iterator iter; + for (iter = result.begin(); iter != result.end(); ++iter) { + f->dump_string("object", iter->first); + } +} + +static void dump_bucket_usage(map& stats, Formatter *formatter) +{ + map::iterator iter; + + formatter->open_object_section("usage"); + for (iter = stats.begin(); iter != stats.end(); ++iter) { + RGWStorageStats& s = iter->second; + const char *cat_name = rgw_obj_category_name(iter->first); + formatter->open_object_section(cat_name); + s.dump(formatter); + formatter->close_section(); + } + formatter->close_section(); +} + +static void dump_index_check(map existing_stats, + map calculated_stats, + Formatter *formatter) +{ + formatter->open_object_section("check_result"); + formatter->open_object_section("existing_header"); + dump_bucket_usage(existing_stats, formatter); + formatter->close_section(); + formatter->open_object_section("calculated_header"); + dump_bucket_usage(calculated_stats, formatter); + formatter->close_section(); + formatter->close_section(); +} + +int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher ,std::string *err_msg) +{ + bool fix_index = op_state.will_fix_index(); + rgw_bucket bucket = op_state.get_bucket(); + + size_t max = 1000; + + map common_prefixes; + + bool is_truncated; + map meta_objs; + map all_objs; + + RGWBucketInfo bucket_info; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = 
store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): get_bucket_instance_info(bucket=" << bucket << ") returned r=" << r << dendl; + return r; + } + + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.list_versions = true; + list_op.params.ns = RGW_OBJ_NS_MULTIPART; + + do { + vector result; + int r = list_op.list_objects(max, &result, &common_prefixes, &is_truncated); + if (r < 0) { + set_err_msg(err_msg, "failed to list objects in bucket=" + bucket.name + + " err=" + cpp_strerror(-r)); + + return r; + } + + vector::iterator iter; + for (iter = result.begin(); iter != result.end(); ++iter) { + rgw_obj_index_key key = iter->key; + rgw_obj obj(bucket, key); + string oid = obj.get_oid(); + + int pos = oid.find_last_of('.'); + if (pos < 0) { + /* obj has no suffix */ + all_objs[key] = oid; + } else { + /* obj has suffix */ + string name = oid.substr(0, pos); + string suffix = oid.substr(pos + 1); + + if (suffix.compare("meta") == 0) { + meta_objs[name] = true; + } else { + all_objs[key] = name; + } + } + } + + } while (is_truncated); + + list objs_to_unlink; + Formatter *f = flusher.get_formatter(); + + f->open_array_section("invalid_multipart_entries"); + + for (auto aiter = all_objs.begin(); aiter != all_objs.end(); ++aiter) { + string& name = aiter->second; + + if (meta_objs.find(name) == meta_objs.end()) { + objs_to_unlink.push_back(aiter->first); + } + + if (objs_to_unlink.size() > max) { + if (fix_index) { + int r = store->remove_objs_from_index(bucket_info, objs_to_unlink); + if (r < 0) { + set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " + + cpp_strerror(-r)); + return r; + } + } + + dump_mulipart_index_results(objs_to_unlink, flusher.get_formatter()); + flusher.flush(); + objs_to_unlink.clear(); + } + } + + if (fix_index) { + int r = store->remove_objs_from_index(bucket_info, 
objs_to_unlink); + if (r < 0) { + set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " + + cpp_strerror(-r)); + + return r; + } + } + + dump_mulipart_index_results(objs_to_unlink, f); + f->close_section(); + flusher.flush(); + + return 0; +} + +int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + std::string *err_msg) +{ + + bool fix_index = op_state.will_fix_index(); + + if (!fix_index) { + set_err_msg(err_msg, "check-objects flag requires fix index enabled"); + return -EINVAL; + } + + store->cls_obj_set_bucket_tag_timeout(bucket_info, BUCKET_TAG_TIMEOUT); + + string prefix; + rgw_obj_index_key marker; + bool is_truncated = true; + + Formatter *formatter = flusher.get_formatter(); + formatter->open_object_section("objects"); + uint16_t expansion_factor = 1; + while (is_truncated) { + map result; + + int r = store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD, + marker, prefix, + listing_max_entries, true, + expansion_factor, + result, &is_truncated, &marker, + bucket_object_check_filter); + if (r == -ENOENT) { + break; + } else if (r < 0 && r != -ENOENT) { + set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r)); + } + + if (result.size() < listing_max_entries / 8) { + ++expansion_factor; + } else if (result.size() > listing_max_entries * 7 / 8 && + expansion_factor > 1) { + --expansion_factor; + } + + dump_bucket_index(result, formatter); + flusher.flush(); + } + + formatter->close_section(); + + store->cls_obj_set_bucket_tag_timeout(bucket_info, 0); + + return 0; +} + + +int RGWBucket::check_index(RGWBucketAdminOpState& op_state, + map& existing_stats, + map& calculated_stats, + std::string *err_msg) +{ + bool fix_index = op_state.will_fix_index(); + + int r = store->bucket_check_index(bucket_info, &existing_stats, &calculated_stats); + if (r < 0) { + set_err_msg(err_msg, "failed to check index error=" + cpp_strerror(-r)); + return r; + } + + if (fix_index) { + r = 
store->bucket_rebuild_index(bucket_info); + if (r < 0) { + set_err_msg(err_msg, "failed to rebuild index err=" + cpp_strerror(-r)); + return r; + } + } + + return 0; +} + +int RGWBucket::policy_bl_to_stream(bufferlist& bl, ostream& o) +{ + RGWAccessControlPolicy_S3 policy(g_ceph_context); + int ret = decode_bl(bl, policy); + if (ret < 0) { + ldout(store->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl; + } + policy.to_xml(o); + return 0; +} + +int rgw_object_get_attr(RGWRados* store, const RGWBucketInfo& bucket_info, + const rgw_obj& obj, const char* attr_name, + bufferlist& out_bl) +{ + RGWObjectCtx obj_ctx(store); + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Read rop(&op_target); + + return rop.get_attr(attr_name, out_bl); +} + +int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy) +{ + std::string object_name = op_state.get_object_name(); + rgw_bucket bucket = op_state.get_bucket(); + auto sysobj_ctx = store->svc.sysobj->init_obj_ctx(); + + RGWBucketInfo bucket_info; + map attrs; + int ret = store->get_bucket_info(sysobj_ctx, bucket.tenant, bucket.name, bucket_info, NULL, &attrs); + if (ret < 0) { + return ret; + } + + if (!object_name.empty()) { + bufferlist bl; + rgw_obj obj(bucket, object_name); + + ret = rgw_object_get_attr(store, bucket_info, obj, RGW_ATTR_ACL, bl); + if (ret < 0){ + return ret; + } + + ret = decode_bl(bl, policy); + if (ret < 0) { + ldout(store->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl; + } + return ret; + } + + map::iterator aiter = attrs.find(RGW_ATTR_ACL); + if (aiter == attrs.end()) { + return -ENOENT; + } + + ret = decode_bl(aiter->second, policy); + if (ret < 0) { + ldout(store->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl; + } + + return ret; +} + + +int RGWBucketAdminOp::get_policy(RGWRados *store, RGWBucketAdminOpState& op_state, + RGWAccessControlPolicy& policy) +{ + RGWBucket bucket; + + int ret = 
bucket.init(store, op_state); + if (ret < 0) + return ret; + + ret = bucket.get_policy(op_state, policy); + if (ret < 0) + return ret; + + return 0; +} + +/* Wrappers to facilitate RESTful interface */ + + +int RGWBucketAdminOp::get_policy(RGWRados *store, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWAccessControlPolicy policy(store->ctx()); + + int ret = get_policy(store, op_state, policy); + if (ret < 0) + return ret; + + Formatter *formatter = flusher.get_formatter(); + + flusher.start(0); + + formatter->open_object_section("policy"); + policy.dump(formatter); + formatter->close_section(); + + flusher.flush(); + + return 0; +} + +int RGWBucketAdminOp::dump_s3_policy(RGWRados *store, RGWBucketAdminOpState& op_state, + ostream& os) +{ + RGWAccessControlPolicy_S3 policy(store->ctx()); + + int ret = get_policy(store, op_state, policy); + if (ret < 0) + return ret; + + policy.to_xml(os); + + return 0; +} + +int RGWBucketAdminOp::unlink(RGWRados *store, RGWBucketAdminOpState& op_state) +{ + RGWBucket bucket; + + int ret = bucket.init(store, op_state); + if (ret < 0) + return ret; + + return bucket.unlink(op_state); +} + +int RGWBucketAdminOp::link(RGWRados *store, RGWBucketAdminOpState& op_state, string *err) +{ + RGWBucket bucket; + + int ret = bucket.init(store, op_state); + if (ret < 0) + return ret; + + return bucket.link(op_state, err); + +} + +int RGWBucketAdminOp::check_index(RGWRados *store, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + int ret; + map existing_stats; + map calculated_stats; + + + RGWBucket bucket; + + ret = bucket.init(store, op_state); + if (ret < 0) + return ret; + + Formatter *formatter = flusher.get_formatter(); + flusher.start(0); + + ret = bucket.check_bad_index_multipart(op_state, flusher); + if (ret < 0) + return ret; + + ret = bucket.check_object_index(op_state, flusher); + if (ret < 0) + return ret; + + ret = bucket.check_index(op_state, existing_stats, calculated_stats); + if (ret 
< 0) + return ret; + + dump_index_check(existing_stats, calculated_stats, formatter); + flusher.flush(); + + return 0; +} + +int RGWBucketAdminOp::remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state, + bool bypass_gc, bool keep_index_consistent) +{ + RGWBucket bucket; + + int ret = bucket.init(store, op_state); + if (ret < 0) + return ret; + + std::string err_msg; + ret = bucket.remove(op_state, bypass_gc, keep_index_consistent, &err_msg); + if (!err_msg.empty()) { + lderr(store->ctx()) << "ERROR: " << err_msg << dendl; + } + return ret; +} + +int RGWBucketAdminOp::remove_object(RGWRados *store, RGWBucketAdminOpState& op_state) +{ + RGWBucket bucket; + + int ret = bucket.init(store, op_state); + if (ret < 0) + return ret; + + return bucket.remove_object(op_state); +} + +static int bucket_stats(RGWRados *store, const std::string& tenant_name, const std::string& bucket_name, Formatter *formatter) +{ + RGWBucketInfo bucket_info; + map stats; + + real_time mtime; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, &mtime); + if (r < 0) { + return r; + } + + rgw_bucket& bucket = bucket_info.bucket; + + string bucket_ver, master_ver; + string max_marker; + int ret = store->get_bucket_stats(bucket_info, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, &max_marker); + if (ret < 0) { + cerr << "error getting bucket stats bucket=" << bucket.name << " ret=" << ret << std::endl; + return ret; + } + + utime_t ut(mtime); + + formatter->open_object_section("stats"); + formatter->dump_string("bucket", bucket.name); + formatter->dump_int("num_shards", bucket_info.num_shards); + formatter->dump_string("tenant", bucket.tenant); + formatter->dump_string("zonegroup", bucket_info.zonegroup); + formatter->dump_string("placement_rule", bucket_info.placement_rule.to_str()); + ::encode_json("explicit_placement", bucket.explicit_placement, formatter); + formatter->dump_string("id", bucket.bucket_id); + 
formatter->dump_string("marker", bucket.marker); + formatter->dump_stream("index_type") << bucket_info.index_type; + ::encode_json("owner", bucket_info.owner, formatter); + formatter->dump_string("ver", bucket_ver); + formatter->dump_string("master_ver", master_ver); + ut.gmtime(formatter->dump_stream("mtime")); + formatter->dump_string("max_marker", max_marker); + dump_bucket_usage(stats, formatter); + encode_json("bucket_quota", bucket_info.quota, formatter); + formatter->close_section(); + + return 0; +} + +int RGWBucketAdminOp::limit_check(RGWRados *store, + RGWBucketAdminOpState& op_state, + const std::list& user_ids, + RGWFormatterFlusher& flusher, + bool warnings_only) +{ + int ret = 0; + const size_t max_entries = + store->ctx()->_conf->rgw_list_buckets_max_chunk; + + const size_t safe_max_objs_per_shard = + store->ctx()->_conf->rgw_safe_max_objects_per_shard; + + uint16_t shard_warn_pct = + store->ctx()->_conf->rgw_shard_warning_threshold; + if (shard_warn_pct > 100) + shard_warn_pct = 90; + + Formatter *formatter = flusher.get_formatter(); + flusher.start(0); + + formatter->open_array_section("users"); + + for (const auto& user_id : user_ids) { + + formatter->open_object_section("user"); + formatter->dump_string("user_id", user_id); + formatter->open_array_section("buckets"); + + string marker; + bool is_truncated{false}; + do { + RGWUserBuckets buckets; + + ret = rgw_read_user_buckets(store, user_id, buckets, + marker, string(), max_entries, false, + &is_truncated); + if (ret < 0) + return ret; + + map& m_buckets = buckets.get_buckets(); + + for (const auto& iter : m_buckets) { + auto& bucket = iter.second.bucket; + uint32_t num_shards = 1; + uint64_t num_objects = 0; + + /* need info for num_shards */ + RGWBucketInfo info; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + marker = bucket.name; /* Casey's location for marker update, + * as we may now not reach the end of + * the loop body */ + + ret = store->get_bucket_info(obj_ctx, bucket.tenant, 
bucket.name, + info, nullptr); + if (ret < 0) + continue; + + /* need stats for num_entries */ + string bucket_ver, master_ver; + std::map stats; + ret = store->get_bucket_stats(info, RGW_NO_SHARD, &bucket_ver, + &master_ver, stats, nullptr); + + if (ret < 0) + continue; + + for (const auto& s : stats) { + num_objects += s.second.num_objects; + } + + num_shards = info.num_shards; + uint64_t objs_per_shard = + (num_shards) ? num_objects/num_shards : num_objects; + { + bool warn; + stringstream ss; + uint64_t fill_pct = objs_per_shard * 100 / safe_max_objs_per_shard; + if (fill_pct > 100) { + ss << "OVER " << fill_pct << "%"; + warn = true; + } else if (fill_pct >= shard_warn_pct) { + ss << "WARN " << fill_pct << "%"; + warn = true; + } else { + ss << "OK"; + warn = false; + } + + if (warn || !warnings_only) { + formatter->open_object_section("bucket"); + formatter->dump_string("bucket", bucket.name); + formatter->dump_string("tenant", bucket.tenant); + formatter->dump_int("num_objects", num_objects); + formatter->dump_int("num_shards", num_shards); + formatter->dump_int("objects_per_shard", objs_per_shard); + formatter->dump_string("fill_status", ss.str()); + formatter->close_section(); + } + } + } + formatter->flush(cout); + } while (is_truncated); /* foreach: bucket */ + + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + + } /* foreach: user_id */ + + formatter->close_section(); + formatter->flush(cout); + + return ret; +} /* RGWBucketAdminOp::limit_check */ + +int RGWBucketAdminOp::info(RGWRados *store, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWBucket bucket; + int ret = 0; + const std::string& bucket_name = op_state.get_bucket_name(); + if (!bucket_name.empty()) { + ret = bucket.init(store, op_state); + if (-ENOENT == ret) + return -ERR_NO_SUCH_BUCKET; + else if (ret < 0) + return ret; + } + + Formatter *formatter = flusher.get_formatter(); + flusher.start(0); + + CephContext *cct = 
store->ctx(); + + const size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk; + + const bool show_stats = op_state.will_fetch_stats(); + const rgw_user& user_id = op_state.get_user_id(); + if (op_state.is_user_op()) { + formatter->open_array_section("buckets"); + + RGWUserBuckets buckets; + string marker; + const std::string empty_end_marker; + constexpr bool no_need_stats = false; // set need_stats to false + + bool is_truncated = false; + do { + buckets.clear(); + ret = rgw_read_user_buckets(store, op_state.get_user_id(), buckets, + marker, empty_end_marker, max_entries, no_need_stats, + &is_truncated); + if (ret < 0) { + return ret; + } + + const std::string* marker_cursor = nullptr; + map& m = buckets.get_buckets(); + + for (const auto& i : m) { + const std::string& obj_name = i.first; + if (!bucket_name.empty() && bucket_name != obj_name) { + continue; + } + + if (show_stats) { + bucket_stats(store, user_id.tenant, obj_name, formatter); + } else { + formatter->dump_string("bucket", obj_name); + } + + marker_cursor = &obj_name; + } // for loop + if (marker_cursor) { + marker = *marker_cursor; + } + + flusher.flush(); + } while (is_truncated); + + formatter->close_section(); + } else if (!bucket_name.empty()) { + ret = bucket_stats(store, user_id.tenant, bucket_name, formatter); + if (ret < 0) { + return ret; + } + } else { + void *handle = nullptr; + bool truncated = true; + + formatter->open_array_section("buckets"); + ret = store->meta_mgr->list_keys_init("bucket", &handle); + while (ret == 0 && truncated) { + std::list buckets; + constexpr int max_keys = 1000; + ret = store->meta_mgr->list_keys_next(handle, max_keys, buckets, + &truncated); + for (auto& bucket_name : buckets) { + if (show_stats) { + bucket_stats(store, user_id.tenant, bucket_name, formatter); + } else { + formatter->dump_string("bucket", bucket_name); + } + } + } + store->meta_mgr->list_keys_complete(handle); + + formatter->close_section(); + } + + flusher.flush(); + + return 0; +} + 
+int RGWBucketAdminOp::set_quota(RGWRados *store, RGWBucketAdminOpState& op_state) +{ + RGWBucket bucket; + + int ret = bucket.init(store, op_state); + if (ret < 0) + return ret; + return bucket.set_quota(op_state); +} + +static int purge_bucket_instance(RGWRados *store, const RGWBucketInfo& bucket_info) +{ + int max_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1); + for (int i = 0; i < max_shards; i++) { + RGWRados::BucketShard bs(store); + int shard_id = (bucket_info.num_shards > 0 ? i : -1); + int ret = bs.init(bucket_info.bucket, shard_id, nullptr); + if (ret < 0) { + cerr << "ERROR: bs.init(bucket=" << bucket_info.bucket << ", shard=" << shard_id + << "): " << cpp_strerror(-ret) << std::endl; + return ret; + } + ret = store->bi_remove(bs); + if (ret < 0) { + cerr << "ERROR: failed to remove bucket index object: " + << cpp_strerror(-ret) << std::endl; + return ret; + } + } + return 0; +} + +inline auto split_tenant(const std::string& bucket_name){ + auto p = bucket_name.find('/'); + if(p != std::string::npos) { + return std::make_pair(bucket_name.substr(0,p), bucket_name.substr(p+1)); + } + return std::make_pair(std::string(), bucket_name); +} + +using bucket_instance_ls = std::vector; +void get_stale_instances(RGWRados *store, const std::string& bucket_name, + const vector& lst, + bucket_instance_ls& stale_instances) +{ + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + bucket_instance_ls other_instances; +// first iterate over the entries, and pick up the done buckets; these +// are guaranteed to be stale + for (const auto& bucket_instance : lst){ + RGWBucketInfo binfo; + int r = store->get_bucket_instance_info(obj_ctx, bucket_instance, + binfo, nullptr,nullptr); + if (r < 0){ + // this can only happen if someone deletes us right when we're processing + lderr(store->ctx()) << "Bucket instance is invalid: " << bucket_instance + << cpp_strerror(-r) << dendl; + continue; + } + if (binfo.reshard_status == CLS_RGW_RESHARD_DONE) + 
stale_instances.emplace_back(std::move(binfo)); + else { + other_instances.emplace_back(std::move(binfo)); + } + } + + // Read the cur bucket info, if the bucket doesn't exist we can simply return + // all the instances + auto [tenant, bucket] = split_tenant(bucket_name); + RGWBucketInfo cur_bucket_info; + int r = store->get_bucket_info(obj_ctx, tenant, bucket, cur_bucket_info, nullptr); + if (r < 0) { + if (r == -ENOENT) { + // bucket doesn't exist, everything is stale then + stale_instances.insert(std::end(stale_instances), + std::make_move_iterator(other_instances.begin()), + std::make_move_iterator(other_instances.end())); + } else { + // all bets are off if we can't read the bucket, just return the sureshot stale instances + lderr(store->ctx()) << "error: reading bucket info for bucket: " + << bucket << cpp_strerror(-r) << dendl; + } + return; + } + + // Don't process further in this round if bucket is resharding + if (cur_bucket_info.reshard_status == CLS_RGW_RESHARD_IN_PROGRESS) + return; + + other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(), + [&cur_bucket_info](const RGWBucketInfo& b){ + return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id || + b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id); + }), + other_instances.end()); + + // check if there are still instances left + if (other_instances.empty()) { + return; + } + + // Now we have a bucket with instances where the reshard status is none, this + // usually happens when the reshard process couldn't complete, lockdown the + // bucket and walk through these instances to make sure no one else interferes + // with these + { + RGWBucketReshardLock reshard_lock(store, cur_bucket_info, true); + r = reshard_lock.lock(); + if (r < 0) { + // most likely bucket is under reshard, return the sureshot stale instances + ldout(store->ctx(), 5) << __func__ + << "failed to take reshard lock; reshard underway likey" << dendl; + return; + } + auto sg = 
make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} ); + // this should be fast enough that we may not need to renew locks and check + // exit status?, should we read the values of the instances again? + stale_instances.insert(std::end(stale_instances), + std::make_move_iterator(other_instances.begin()), + std::make_move_iterator(other_instances.end())); + } + + return; +} + +static int process_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, + std::function process_f) +{ + std::string marker; + void *handle; + Formatter *formatter = flusher.get_formatter(); + static constexpr auto default_max_keys = 1000; + + int ret = store->meta_mgr->list_keys_init("bucket.instance", marker, &handle); + if (ret < 0) { + cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + bool truncated; + + formatter->open_array_section("keys"); + auto g = make_scope_guard([&store, &handle, &formatter]() { + store->meta_mgr->list_keys_complete(handle); + formatter->close_section(); // keys + formatter->flush(cout); + }); + + do { + list keys; + + ret = store->meta_mgr->list_keys_next(handle, default_max_keys, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + return ret; + } if (ret != -ENOENT) { + // partition the list of buckets by buckets as the listing is un sorted, + // since it would minimize the reads to bucket_info + std::unordered_map> bucket_instance_map; + for (auto &key: keys) { + auto pos = key.find(':'); + if(pos != std::string::npos) + bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key)); + } + for (const auto& kv: bucket_instance_map) { + bucket_instance_ls stale_lst; + get_stale_instances(store, kv.first, kv.second, stale_lst); + process_f(stale_lst, formatter, store); + } + } + } while (truncated); + + return 0; +} + +int RGWBucketAdminOp::list_stale_instances(RGWRados *store, + 
RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + auto process_f = [](const bucket_instance_ls& lst, + Formatter *formatter, + RGWRados*){ + for (const auto& binfo: lst) + formatter->dump_string("key", binfo.bucket.get_key()); + }; + return process_stale_instances(store, op_state, flusher, process_f); +} + + +int RGWBucketAdminOp::clear_stale_instances(RGWRados *store, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + auto process_f = [](const bucket_instance_ls& lst, + Formatter *formatter, + RGWRados *store){ + for (const auto &binfo: lst) { + int ret = purge_bucket_instance(store, binfo); + if (ret == 0){ + auto md_key = "bucket.instance:" + binfo.bucket.get_key(); + ret = store->meta_mgr->remove(md_key); + } + formatter->open_object_section("delete_status"); + formatter->dump_string("bucket_instance", binfo.bucket.get_key()); + formatter->dump_int("status", -ret); + formatter->close_section(); + } + }; + + return process_stale_instances(store, op_state, flusher, process_f); +} + +static int fix_single_bucket_lc(RGWRados *store, + const std::string& tenant_name, + const std::string& bucket_name) +{ + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + RGWBucketInfo bucket_info; + map bucket_attrs; + int ret = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, + bucket_info, nullptr, &bucket_attrs); + if (ret < 0) { + // TODO: Should we handle the case where the bucket could've been removed between + // listing and fetching? + return ret; + } + + return rgw::lc::fix_lc_shard_entry(store, bucket_info, bucket_attrs); +} + +static void format_lc_status(Formatter* formatter, + const std::string& tenant_name, + const std::string& bucket_name, + int status) +{ + formatter->open_object_section("bucket_entry"); + std::string entry = tenant_name.empty() ? 
bucket_name : tenant_name + "/" + bucket_name; + formatter->dump_string("bucket", entry); + formatter->dump_int("status", status); + formatter->close_section(); // bucket_entry +} + +static void process_single_lc_entry(RGWRados *store, Formatter *formatter, + const std::string& tenant_name, + const std::string& bucket_name) +{ + int ret = fix_single_bucket_lc(store, tenant_name, bucket_name); + format_lc_status(formatter, tenant_name, bucket_name, -ret); +} + +int RGWBucketAdminOp::fix_lc_shards(RGWRados *store, + RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + std::string marker; + void *handle; + Formatter *formatter = flusher.get_formatter(); + static constexpr auto default_max_keys = 1000; + + bool truncated; + if (const std::string& bucket_name = op_state.get_bucket_name(); + ! bucket_name.empty()) { + const rgw_user user_id = op_state.get_user_id(); + process_single_lc_entry(store, formatter, user_id.tenant, bucket_name); + formatter->flush(cout); + } else { + int ret = store->meta_mgr->list_keys_init("bucket", marker, &handle); + if (ret < 0) { + std::cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + { + formatter->open_array_section("lc_fix_status"); + auto sg = make_scope_guard([&store, &handle, &formatter](){ + store->meta_mgr->list_keys_complete(handle); + formatter->close_section(); // lc_fix_status + formatter->flush(cout); + }); + do { + list keys; + ret = store->meta_mgr->list_keys_next(handle, default_max_keys, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + std::cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl; + return ret; + } if (ret != -ENOENT) { + for (const auto &key:keys) { + auto [tenant_name, bucket_name] = split_tenant(key); + process_single_lc_entry(store, formatter, tenant_name, bucket_name); + } + } + formatter->flush(cout); // regularly flush every 1k entries + } while (truncated); + } + + } + return 0; + +} + +static bool 
has_object_expired(RGWRados *store, const RGWBucketInfo& bucket_info, + const rgw_obj_key& key, utime_t& delete_at) +{ + rgw_obj obj(bucket_info.bucket, key); + bufferlist delete_at_bl; + + int ret = rgw_object_get_attr(store, bucket_info, obj, RGW_ATTR_DELETE_AT, delete_at_bl); + if (ret < 0) { + return false; // no delete at attr, proceed + } + + ret = decode_bl(delete_at_bl, delete_at); + if (ret < 0) { + return false; // failed to parse + } + + if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) { + return true; + } + + return false; +} + +static int fix_bucket_obj_expiry(RGWRados *store, const RGWBucketInfo& bucket_info, + RGWFormatterFlusher& flusher, bool dry_run) +{ + if (bucket_info.bucket.bucket_id == bucket_info.bucket.marker) { + lderr(store->ctx()) << "Not a resharded bucket skipping" << dendl; + return 0; // not a resharded bucket, move along + } + + Formatter *formatter = flusher.get_formatter(); + formatter->open_array_section("expired_deletion_status"); + auto sg = make_scope_guard([&formatter] { + formatter->close_section(); + formatter->flush(std::cout); + }); + + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.list_versions = bucket_info.versioned(); + list_op.params.allow_unordered = true; + + constexpr auto max_objects = 1000; + bool is_truncated {false}; + do { + std::vector objs; + + int ret = list_op.list_objects(max_objects, &objs, nullptr, &is_truncated); + if (ret < 0) { + lderr(store->ctx()) << "ERROR failed to list objects in the bucket" << dendl; + return ret; + } + for (const auto& obj : objs) { + rgw_obj_key key(obj.key); + utime_t delete_at; + if (has_object_expired(store, bucket_info, key, delete_at)) { + formatter->open_object_section("object_status"); + formatter->dump_string("object", key.name); + formatter->dump_stream("delete_at") << delete_at; + + if (!dry_run) { + ret = rgw_remove_object(store, bucket_info, bucket_info.bucket, key); + 
formatter->dump_int("status", ret); + } + + formatter->close_section(); // object_status + } + } + formatter->flush(cout); // regularly flush every 1k entries + } while (is_truncated); + + return 0; +} + +int RGWBucketAdminOp::fix_obj_expiry(RGWRados *store, RGWBucketAdminOpState& op_state, + RGWFormatterFlusher& flusher, bool dry_run) +{ + RGWBucket admin_bucket; + int ret = admin_bucket.init(store, op_state); + if (ret < 0) { + lderr(store->ctx()) << "failed to initialize bucket" << dendl; + return ret; + } + + return fix_bucket_obj_expiry(store, admin_bucket.get_bucket_info(), flusher, dry_run); +} + +void rgw_data_change::dump(Formatter *f) const +{ + string type; + switch (entity_type) { + case ENTITY_TYPE_BUCKET: + type = "bucket"; + break; + default: + type = "unknown"; + } + encode_json("entity_type", type, f); + encode_json("key", key, f); + utime_t ut(timestamp); + encode_json("timestamp", ut, f); +} + +void rgw_data_change::decode_json(JSONObj *obj) { + string s; + JSONDecoder::decode_json("entity_type", s, obj); + if (s == "bucket") { + entity_type = ENTITY_TYPE_BUCKET; + } else { + entity_type = ENTITY_TYPE_UNKNOWN; + } + JSONDecoder::decode_json("key", key, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); +} + +void rgw_data_change_log_entry::dump(Formatter *f) const +{ + encode_json("log_id", log_id, f); + utime_t ut(log_timestamp); + encode_json("log_timestamp", ut, f); + encode_json("entry", entry, f); +} + +void rgw_data_change_log_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("log_id", log_id, obj); + utime_t ut; + JSONDecoder::decode_json("log_timestamp", ut, obj); + log_timestamp = ut.to_real_time(); + JSONDecoder::decode_json("entry", entry, obj); +} + +int RGWDataChangesLog::choose_oid(const rgw_bucket_shard& bs) { + const string& name = bs.bucket.name; + int shard_shift = (bs.shard_id > 0 ? 
bs.shard_id : 0); + uint32_t r = (ceph_str_hash_linux(name.c_str(), name.size()) + shard_shift) % num_shards; + + return (int)r; +} + +int RGWDataChangesLog::renew_entries() +{ + if (!store->svc.zone->need_to_log_data()) + return 0; + + /* we can't keep the bucket name as part of the cls_log_entry, and we need + * it later, so we keep two lists under the map */ + map, list > > m; + + lock.Lock(); + map entries; + entries.swap(cur_cycle); + lock.Unlock(); + + map::iterator iter; + string section; + real_time ut = real_clock::now(); + for (iter = entries.begin(); iter != entries.end(); ++iter) { + const rgw_bucket_shard& bs = iter->first; + + int index = choose_oid(bs); + + cls_log_entry entry; + + rgw_data_change change; + bufferlist bl; + change.entity_type = ENTITY_TYPE_BUCKET; + change.key = bs.get_key(); + change.timestamp = ut; + encode(change, bl); + + store->time_log_prepare_entry(entry, ut, section, change.key, bl); + + m[index].first.push_back(bs); + m[index].second.emplace_back(std::move(entry)); + } + + map, list > >::iterator miter; + for (miter = m.begin(); miter != m.end(); ++miter) { + list& entries = miter->second.second; + + real_time now = real_clock::now(); + + int ret = store->time_log_add(oids[miter->first], entries, NULL); + if (ret < 0) { + /* we don't really need to have a special handling for failed cases here, + * as this is just an optimization. 
*/ + lderr(cct) << "ERROR: store->time_log_add() returned " << ret << dendl; + return ret; + } + + real_time expiration = now; + expiration += make_timespan(cct->_conf->rgw_data_log_window); + + list& buckets = miter->second.first; + list::iterator liter; + for (liter = buckets.begin(); liter != buckets.end(); ++liter) { + update_renewed(*liter, expiration); + } + } + + return 0; +} + +void RGWDataChangesLog::_get_change(const rgw_bucket_shard& bs, ChangeStatusPtr& status) +{ + ceph_assert(lock.is_locked()); + if (!changes.find(bs, status)) { + status = ChangeStatusPtr(new ChangeStatus); + changes.add(bs, status); + } +} + +void RGWDataChangesLog::register_renew(rgw_bucket_shard& bs) +{ + Mutex::Locker l(lock); + cur_cycle[bs] = true; +} + +void RGWDataChangesLog::update_renewed(rgw_bucket_shard& bs, real_time& expiration) +{ + Mutex::Locker l(lock); + ChangeStatusPtr status; + _get_change(bs, status); + + ldout(cct, 20) << "RGWDataChangesLog::update_renewd() bucket_name=" << bs.bucket.name << " shard_id=" << bs.shard_id << " expiration=" << expiration << dendl; + status->cur_expiration = expiration; +} + +int RGWDataChangesLog::get_log_shard_id(rgw_bucket& bucket, int shard_id) { + rgw_bucket_shard bs(bucket, shard_id); + + return choose_oid(bs); +} + +int RGWDataChangesLog::add_entry(rgw_bucket& bucket, int shard_id) { + if (!store->svc.zone->need_to_log_data()) + return 0; + + if (observer) { + observer->on_bucket_changed(bucket.get_key()); + } + + rgw_bucket_shard bs(bucket, shard_id); + + int index = choose_oid(bs); + mark_modified(index, bs); + + lock.Lock(); + + ChangeStatusPtr status; + _get_change(bs, status); + + lock.Unlock(); + + real_time now = real_clock::now(); + + status->lock->Lock(); + + ldout(cct, 20) << "RGWDataChangesLog::add_entry() bucket.name=" << bucket.name << " shard_id=" << shard_id << " now=" << now << " cur_expiration=" << status->cur_expiration << dendl; + + if (now < status->cur_expiration) { + /* no need to send, recently completed 
*/ + status->lock->Unlock(); + + register_renew(bs); + return 0; + } + + RefCountedCond *cond; + + if (status->pending) { + cond = status->cond; + + ceph_assert(cond); + + status->cond->get(); + status->lock->Unlock(); + + int ret = cond->wait(); + cond->put(); + if (!ret) { + register_renew(bs); + } + return ret; + } + + status->cond = new RefCountedCond; + status->pending = true; + + string& oid = oids[index]; + real_time expiration; + + int ret; + + do { + status->cur_sent = now; + + expiration = now; + expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window); + + status->lock->Unlock(); + + bufferlist bl; + rgw_data_change change; + change.entity_type = ENTITY_TYPE_BUCKET; + change.key = bs.get_key(); + change.timestamp = now; + encode(change, bl); + string section; + + ldout(cct, 20) << "RGWDataChangesLog::add_entry() sending update with now=" << now << " cur_expiration=" << expiration << dendl; + + ret = store->time_log_add(oid, now, section, change.key, bl); + + now = real_clock::now(); + + status->lock->Lock(); + + } while (!ret && real_clock::now() > expiration); + + cond = status->cond; + + status->pending = false; + status->cur_expiration = status->cur_sent; /* time of when operation started, not completed */ + status->cur_expiration += make_timespan(cct->_conf->rgw_data_log_window); + status->cond = NULL; + status->lock->Unlock(); + + cond->done(ret); + cond->put(); + + return ret; +} + +int RGWDataChangesLog::list_entries(int shard, const real_time& start_time, const real_time& end_time, int max_entries, + list& entries, + const string& marker, + string *out_marker, + bool *truncated) { + if (shard >= num_shards) + return -EINVAL; + + list log_entries; + + int ret = store->time_log_list(oids[shard], start_time, end_time, + max_entries, log_entries, marker, + out_marker, truncated); + if (ret < 0) + return ret; + + list::iterator iter; + for (iter = log_entries.begin(); iter != log_entries.end(); ++iter) { + rgw_data_change_log_entry 
log_entry; + log_entry.log_id = iter->id; + real_time rt = iter->timestamp.to_real_time(); + log_entry.log_timestamp = rt; + auto liter = iter->data.cbegin(); + try { + decode(log_entry.entry, liter); + } catch (buffer::error& err) { + lderr(cct) << "ERROR: failed to decode data changes log entry" << dendl; + return -EIO; + } + entries.push_back(log_entry); + } + + return 0; +} + +int RGWDataChangesLog::list_entries(const real_time& start_time, const real_time& end_time, int max_entries, + list& entries, LogMarker& marker, bool *ptruncated) { + bool truncated; + entries.clear(); + + for (; marker.shard < num_shards && (int)entries.size() < max_entries; + marker.shard++, marker.marker.clear()) { + int ret = list_entries(marker.shard, start_time, end_time, max_entries - entries.size(), entries, + marker.marker, NULL, &truncated); + if (ret == -ENOENT) { + continue; + } + if (ret < 0) { + return ret; + } + if (truncated) { + *ptruncated = true; + return 0; + } + } + + *ptruncated = (marker.shard < num_shards); + + return 0; +} + +int RGWDataChangesLog::get_info(int shard_id, RGWDataChangesLogInfo *info) +{ + if (shard_id >= num_shards) + return -EINVAL; + + string oid = oids[shard_id]; + + cls_log_header header; + + int ret = store->time_log_info(oid, &header); + if ((ret < 0) && (ret != -ENOENT)) + return ret; + + info->marker = header.max_marker; + info->last_update = header.max_time.to_real_time(); + + return 0; +} + +int RGWDataChangesLog::trim_entries(int shard_id, const real_time& start_time, const real_time& end_time, + const string& start_marker, const string& end_marker) +{ + if (shard_id > num_shards) + return -EINVAL; + + return store->time_log_trim(oids[shard_id], start_time, end_time, + start_marker, end_marker, nullptr); +} + +int RGWDataChangesLog::lock_exclusive(int shard_id, timespan duration, string& zone_id, string& owner_id) { + return store->lock_exclusive(store->svc.zone->get_zone_params().log_pool, oids[shard_id], duration, zone_id, owner_id); 
+} + +int RGWDataChangesLog::unlock(int shard_id, string& zone_id, string& owner_id) { + return store->unlock(store->svc.zone->get_zone_params().log_pool, oids[shard_id], zone_id, owner_id); +} + +bool RGWDataChangesLog::going_down() +{ + return down_flag; +} + +RGWDataChangesLog::~RGWDataChangesLog() { + down_flag = true; + renew_thread->stop(); + renew_thread->join(); + delete renew_thread; + delete[] oids; +} + +void *RGWDataChangesLog::ChangesRenewThread::entry() { + do { + dout(2) << "RGWDataChangesLog::ChangesRenewThread: start" << dendl; + int r = log->renew_entries(); + if (r < 0) { + dout(0) << "ERROR: RGWDataChangesLog::renew_entries returned error r=" << r << dendl; + } + + if (log->going_down()) + break; + + int interval = cct->_conf->rgw_data_log_window * 3 / 4; + lock.Lock(); + cond.WaitInterval(lock, utime_t(interval, 0)); + lock.Unlock(); + } while (!log->going_down()); + + return NULL; +} + +void RGWDataChangesLog::ChangesRenewThread::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} + +void RGWDataChangesLog::mark_modified(int shard_id, const rgw_bucket_shard& bs) +{ + auto key = bs.get_key(); + modified_lock.get_read(); + map >::iterator iter = modified_shards.find(shard_id); + if (iter != modified_shards.end()) { + set& keys = iter->second; + if (keys.find(key) != keys.end()) { + modified_lock.unlock(); + return; + } + } + modified_lock.unlock(); + + RWLock::WLocker wl(modified_lock); + modified_shards[shard_id].insert(key); +} + +void RGWDataChangesLog::read_clear_modified(map > &modified) +{ + RWLock::WLocker wl(modified_lock); + modified.swap(modified_shards); + modified_shards.clear(); +} + +void RGWBucketCompleteInfo::dump(Formatter *f) const { + encode_json("bucket_info", info, f); + encode_json("attrs", attrs, f); +} + +void RGWBucketCompleteInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket_info", info, obj); + JSONDecoder::decode_json("attrs", attrs, obj); +} + +class RGWBucketMetadataHandler : public 
RGWMetadataHandler { + +public: + string get_type() override { return "bucket"; } + + int get(RGWRados *store, string& entry, RGWMetadataObject **obj) override { + RGWObjVersionTracker ot; + RGWBucketEntryPoint be; + + real_time mtime; + map attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + string tenant_name, bucket_name; + parse_bucket(entry, &tenant_name, &bucket_name); + int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, be, &ot, &mtime, &attrs); + if (ret < 0) + return ret; + + RGWBucketEntryMetadataObject *mdo = new RGWBucketEntryMetadataObject(be, ot.read_version, mtime); + + *obj = mdo; + + return 0; + } + + int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t sync_type) override { + RGWBucketEntryPoint be, old_be; + try { + decode_json_obj(be, obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + real_time orig_mtime; + map attrs; + + RGWObjVersionTracker old_ot; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + string tenant_name, bucket_name; + parse_bucket(entry, &tenant_name, &bucket_name); + int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, old_be, &old_ot, &orig_mtime, &attrs); + if (ret < 0 && ret != -ENOENT) + return ret; + + // are we actually going to perform this put, or is it too old? 
+ if (ret != -ENOENT && + !check_versions(old_ot.read_version, orig_mtime, + objv_tracker.write_version, mtime, sync_type)) { + return STATUS_NO_APPLY; + } + + objv_tracker.read_version = old_ot.read_version; /* maintain the obj version we just read */ + + ret = store->put_bucket_entrypoint_info(tenant_name, bucket_name, be, false, objv_tracker, mtime, &attrs); + if (ret < 0) + return ret; + + /* link bucket */ + if (be.linked) { + ret = rgw_link_bucket(store, be.owner, be.bucket, be.creation_time, false); + } else { + ret = rgw_unlink_bucket(store, be.owner, be.bucket.tenant, + be.bucket.name, false); + } + + return ret; + } + + struct list_keys_info { + RGWRados *store; + RGWListRawObjsCtx ctx; + }; + + int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { + RGWBucketEntryPoint be; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + string tenant_name, bucket_name; + parse_bucket(entry, &tenant_name, &bucket_name); + int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, be, &objv_tracker, NULL, NULL); + if (ret < 0) + return ret; + + /* + * We're unlinking the bucket but we don't want to update the entrypoint here - we're removing + * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal + * will incorrectly fail. 
+ */ + ret = rgw_unlink_bucket(store, be.owner, tenant_name, bucket_name, false); + if (ret < 0) { + lderr(store->ctx()) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl; + } + + ret = rgw_bucket_delete_bucket_obj(store, tenant_name, bucket_name, objv_tracker); + if (ret < 0) { + lderr(store->ctx()) << "could not delete bucket=" << entry << dendl; + } + /* idempotent */ + return 0; + } + + void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override { + oid = key; + pool = store->svc.zone->get_zone_params().domain_root; + } + + int list_keys_init(RGWRados *store, const string& marker, void **phandle) override { + auto info = std::make_unique(); + + info->store = store; + + int ret = store->list_raw_objects_init(store->svc.zone->get_zone_params().domain_root, marker, + &info->ctx); + if (ret < 0) { + return ret; + } + *phandle = (void *)info.release(); + + return 0; + } + + int list_keys_next(void *handle, int max, list& keys, bool *truncated) override { + list_keys_info *info = static_cast(handle); + + string no_filter; + + keys.clear(); + + RGWRados *store = info->store; + + list unfiltered_keys; + + int ret = store->list_raw_objects_next(no_filter, max, info->ctx, + unfiltered_keys, truncated); + if (ret < 0 && ret != -ENOENT) + return ret; + if (ret == -ENOENT) { + if (truncated) + *truncated = false; + return 0; + } + + // now filter out the system entries + list::iterator iter; + for (iter = unfiltered_keys.begin(); iter != unfiltered_keys.end(); ++iter) { + string& k = *iter; + + if (k[0] != '.') { + keys.push_back(k); + } + } + + return 0; + } + + void list_keys_complete(void *handle) override { + list_keys_info *info = static_cast(handle); + delete info; + } + + string get_marker(void *handle) override { + list_keys_info *info = static_cast(handle); + return info->store->list_raw_objs_get_cursor(info->ctx); + } +}; + +void get_md5_digest(const RGWBucketEntryPoint *be, string& md5_digest) { + + char 
md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + bufferlist bl; + + Formatter *f = new JSONFormatter(false); + be->dump(f); + f->flush(bl); + + MD5 hash; + hash.Update((const unsigned char *)bl.c_str(), bl.length()); + hash.Final(m); + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, md5); + + delete f; + + md5_digest = md5; +} + +#define ARCHIVE_META_ATTR RGW_ATTR_PREFIX "zone.archive.info" + +struct archive_meta_info { + rgw_bucket orig_bucket; + + bool from_attrs(CephContext *cct, map& attrs) { + auto iter = attrs.find(ARCHIVE_META_ATTR); + if (iter == attrs.end()) { + return false; + } + + auto bliter = iter->second.cbegin(); + try { + decode(bliter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode archive meta info" << dendl; + return false; + } + + return true; + } + + void store_in_attrs(map& attrs) const { + encode(attrs[ARCHIVE_META_ATTR]); + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(orig_bucket, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(orig_bucket, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(archive_meta_info) + +class RGWArchiveBucketMetadataHandler : public RGWBucketMetadataHandler { +public: + int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { + ldout(store->ctx(), 5) << "SKIP: bucket removal is not allowed on archive zone: bucket:" << entry << " ... 
proceeding to rename" << dendl; + + string tenant_name, bucket_name; + parse_bucket(entry, &tenant_name, &bucket_name); + + real_time mtime; + + /* read original entrypoint */ + + RGWBucketEntryPoint be; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + map attrs; + int ret = store->get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, be, &objv_tracker, &mtime, &attrs); + if (ret < 0) { + return ret; + } + + string meta_name = bucket_name + ":" + be.bucket.bucket_id; + + /* read original bucket instance info */ + + map attrs_m; + ceph::real_time orig_mtime; + RGWBucketInfo old_bi; + + ret = store->get_bucket_instance_info(obj_ctx, be.bucket, old_bi, &orig_mtime, &attrs_m); + if (ret < 0) { + return ret; + } + + archive_meta_info ami; + + if (!ami.from_attrs(store->ctx(), attrs_m)) { + ami.orig_bucket = old_bi.bucket; + ami.store_in_attrs(attrs_m); + } + + /* generate a new bucket instance. We could have avoided this if we could just point a new + * bucket entry point to the old bucket instance, however, due to limitation in the way + * we index buckets under the user, bucket entrypoint and bucket instance of the same + * bucket need to have the same name, so we need to copy the old bucket instance into + * to a new entry with the new name + */ + + string new_bucket_name; + + RGWBucketInfo new_bi = old_bi; + RGWBucketEntryPoint new_be = be; + + string md5_digest; + + get_md5_digest(&new_be, md5_digest); + new_bucket_name = ami.orig_bucket.name + "-deleted-" + md5_digest; + + new_bi.bucket.name = new_bucket_name; + new_bi.objv_tracker.clear(); + + new_be.bucket.name = new_bucket_name; + + ret = store->put_bucket_instance_info(new_bi, false, orig_mtime, &attrs_m); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: failed to put new bucket instance info for bucket=" << new_bi.bucket << " ret=" << ret << dendl; + return ret; + } + + /* store a new entrypoint */ + + RGWObjVersionTracker ot; + ot.generate_new_write_ver(store->ctx()); + + ret = 
store->put_bucket_entrypoint_info(tenant_name, new_bucket_name, new_be, true, ot, mtime, &attrs); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl; + return ret; + } + + /* link new bucket */ + + ret = rgw_link_bucket(store, new_be.owner, new_be.bucket, new_be.creation_time, false); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: failed to link new bucket for bucket=" << new_be.bucket << " ret=" << ret << dendl; + return ret; + } + + /* clean up old stuff */ + + ret = rgw_unlink_bucket(store, be.owner, tenant_name, bucket_name, false); + if (ret < 0) { + lderr(store->ctx()) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl; + } + + // if (ret == -ECANCELED) it means that there was a race here, and someone + // wrote to the bucket entrypoint just before we removed it. The question is + // whether it was a newly created bucket entrypoint ... in which case we + // should ignore the error and move forward, or whether it is a higher version + // of the same bucket instance ... 
in which we should retry + ret = rgw_bucket_delete_bucket_obj(store, tenant_name, bucket_name, objv_tracker); + if (ret < 0) { + lderr(store->ctx()) << "could not delete bucket=" << entry << dendl; + } + + ret = rgw_delete_system_obj(store, store->svc.zone->get_zone_params().domain_root, RGW_BUCKET_INSTANCE_MD_PREFIX + meta_name, NULL); + + /* idempotent */ + + return 0; + } + + int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t sync_type) override { + if (entry.find("-deleted-") != string::npos) { + RGWObjVersionTracker ot; + RGWMetadataObject *robj; + int ret = get(store, entry, &robj); + if (ret != -ENOENT) { + if (ret < 0) { + return ret; + } + ot.read_version = robj->get_version(); + delete robj; + + ret = remove(store, entry, ot); + if (ret < 0) { + return ret; + } + } + } + + return RGWBucketMetadataHandler::put(store, entry, objv_tracker, + mtime, obj, sync_type); + } + +}; + +class RGWBucketInstanceMetadataHandler : public RGWMetadataHandler { + +public: + string get_type() override { return "bucket.instance"; } + + int get(RGWRados *store, string& oid, RGWMetadataObject **obj) override { + RGWBucketCompleteInfo bci; + + real_time mtime; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = store->get_bucket_instance_info(obj_ctx, oid, bci.info, &mtime, &bci.attrs); + if (ret < 0) + return ret; + + RGWBucketInstanceMetadataObject *mdo = new RGWBucketInstanceMetadataObject(bci, bci.info.objv_tracker.read_version, mtime); + + *obj = mdo; + + return 0; + } + + int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t sync_type) override { + RGWBucketCompleteInfo bci, old_bci; + try { + decode_json_obj(bci, obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + real_time orig_mtime; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = store->get_bucket_instance_info(obj_ctx, entry, 
old_bci.info,
+                                              &orig_mtime, &old_bci.attrs);
+    bool exists = (ret != -ENOENT); /* every result except -ENOENT is treated as "instance exists" */
+    if (ret < 0 && exists)
+      return ret;
+    /* No stored instance, or a stored instance with a different bucket_id:
+     * take name/tenant/id from the entry key and select a placement.
+     * Otherwise the stored placement is carried over. */
+    if (!exists || old_bci.info.bucket.bucket_id != bci.info.bucket.bucket_id) {
+      /* a new bucket, we need to select a new bucket placement for it */
+      auto key(entry);
+      rgw_bucket_instance_oid_to_key(key);
+      string tenant_name;
+      string bucket_name;
+      string bucket_instance;
+      parse_bucket(key, &tenant_name, &bucket_name, &bucket_instance);
+
+      RGWZonePlacementInfo rule_info;
+      bci.info.bucket.name = bucket_name;
+      bci.info.bucket.bucket_id = bucket_instance;
+      bci.info.bucket.tenant = tenant_name;
+      ret = store->svc.zone->select_bucket_location_by_rule(bci.info.placement_rule, &rule_info);
+      if (ret < 0) {
+        ldout(store->ctx(), 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
+        return ret;
+      }
+      bci.info.index_type = rule_info.index_type;
+    } else {
+      /* existing bucket, keep its placement */
+      bci.info.bucket.explicit_placement = old_bci.info.bucket.explicit_placement;
+      bci.info.placement_rule = old_bci.info.placement_rule;
+    }
+    /* The data-sync flag differs between the stored and the incoming info:
+     * write a stop or resync bilog entry, then add one data log entry per
+     * shard. */
+    if (exists && old_bci.info.datasync_flag_enabled() != bci.info.datasync_flag_enabled()) {
+      int shards_num = bci.info.num_shards? bci.info.num_shards : 1;
+      int shard_id = bci.info.num_shards? 0 : -1; /* num_shards == 0: a single pass with shard id -1 */
+
+      if (!bci.info.datasync_flag_enabled()) {
+        ret = store->stop_bi_log_entries(bci.info, -1);
+        if (ret < 0) {
+          lderr(store->ctx()) << "ERROR: failed writing bilog" << dendl;
+          return ret;
+        }
+      } else {
+        ret = store->resync_bi_log_entries(bci.info, -1);
+        if (ret < 0) {
+          lderr(store->ctx()) << "ERROR: failed writing bilog" << dendl;
+          return ret;
+        }
+      }
+
+      for (int i = 0; i < shards_num; ++i, ++shard_id) {
+        ret = store->data_log->add_entry(bci.info.bucket, shard_id);
+        if (ret < 0) {
+          lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+          return ret;
+        }
+      }
+    }
+
+    // are we actually going to perform this put, or is it too old? 
+ if (exists && + !check_versions(old_bci.info.objv_tracker.read_version, orig_mtime, + objv_tracker.write_version, mtime, sync_type)) { + objv_tracker.read_version = old_bci.info.objv_tracker.read_version; + return STATUS_NO_APPLY; + } + + /* record the read version (if any), store the new version */ + bci.info.objv_tracker.read_version = old_bci.info.objv_tracker.read_version; + bci.info.objv_tracker.write_version = objv_tracker.write_version; + + ret = store->put_bucket_instance_info(bci.info, false, mtime, &bci.attrs); + if (ret < 0) + return ret; + + objv_tracker = bci.info.objv_tracker; + + ret = store->init_bucket_index(bci.info, bci.info.num_shards); + if (ret < 0) + return ret; + + return STATUS_APPLIED; + } + + struct list_keys_info { + RGWRados *store; + RGWListRawObjsCtx ctx; + }; + + int remove(RGWRados *store, string& entry, + RGWObjVersionTracker& objv_tracker) override { + RGWBucketInfo info; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = store->get_bucket_instance_info(obj_ctx, entry, info, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + return ret; + + return rgw_bucket_instance_remove_entry(store, entry, + &info.objv_tracker); + } + + void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override { + oid = RGW_BUCKET_INSTANCE_MD_PREFIX + key; + rgw_bucket_instance_key_to_oid(oid); + pool = store->svc.zone->get_zone_params().domain_root; + } + + int list_keys_init(RGWRados *store, const string& marker, void **phandle) override { + auto info = std::make_unique(); + + info->store = store; + + int ret = store->list_raw_objects_init(store->svc.zone->get_zone_params().domain_root, marker, + &info->ctx); + if (ret < 0) { + return ret; + } + *phandle = (void *)info.release(); + + return 0; + } + + int list_keys_next(void *handle, int max, list& keys, bool *truncated) override { + list_keys_info *info = static_cast(handle); + + string no_filter; + + keys.clear(); + + RGWRados *store = info->store; + + 
list unfiltered_keys; + + int ret = store->list_raw_objects_next(no_filter, max, info->ctx, + unfiltered_keys, truncated); + if (ret < 0 && ret != -ENOENT) + return ret; + if (ret == -ENOENT) { + if (truncated) + *truncated = false; + return 0; + } + + constexpr int prefix_size = sizeof(RGW_BUCKET_INSTANCE_MD_PREFIX) - 1; + // now filter in the relevant entries + list::iterator iter; + for (iter = unfiltered_keys.begin(); iter != unfiltered_keys.end(); ++iter) { + string& k = *iter; + + if (k.compare(0, prefix_size, RGW_BUCKET_INSTANCE_MD_PREFIX) == 0) { + auto oid = k.substr(prefix_size); + rgw_bucket_instance_oid_to_key(oid); + keys.emplace_back(std::move(oid)); + } + } + + return 0; + } + + void list_keys_complete(void *handle) override { + list_keys_info *info = static_cast(handle); + delete info; + } + + string get_marker(void *handle) override { + list_keys_info *info = static_cast(handle); + return info->store->list_raw_objs_get_cursor(info->ctx); + } + + /* + * hash entry for mdlog placement. 
Use the same hash key we'd have for the bucket entry + * point, so that the log entries end up at the same log shard, so that we process them + * in order + */ + void get_hash_key(const string& section, const string& key, string& hash_key) override { + string k; + int pos = key.find(':'); + if (pos < 0) + k = key; + else + k = key.substr(0, pos); + hash_key = "bucket:" + k; + } +}; + +class RGWArchiveBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandler { +public: + + int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { + ldout(store->ctx(), 0) << "SKIP: bucket instance removal is not allowed on archive zone: bucket.instance:" << entry << dendl; + return 0; + } +}; + +RGWMetadataHandler *RGWBucketMetaHandlerAllocator::alloc() { + return new RGWBucketMetadataHandler; +} + +RGWMetadataHandler *RGWBucketInstanceMetaHandlerAllocator::alloc() { + return new RGWBucketInstanceMetadataHandler; +} + +RGWMetadataHandler *RGWArchiveBucketMetaHandlerAllocator::alloc() { + return new RGWArchiveBucketMetadataHandler; +} + +RGWMetadataHandler *RGWArchiveBucketInstanceMetaHandlerAllocator::alloc() { + return new RGWArchiveBucketInstanceMetadataHandler; +} + +void rgw_bucket_init(RGWMetadataManager *mm) +{ + auto sync_module = mm->get_store()->get_sync_module(); + if (sync_module) { + bucket_meta_handler = sync_module->alloc_bucket_meta_handler(); + bucket_instance_meta_handler = sync_module->alloc_bucket_instance_meta_handler(); + } else { + bucket_meta_handler = RGWBucketMetaHandlerAllocator::alloc(); + bucket_instance_meta_handler = RGWBucketInstanceMetaHandlerAllocator::alloc(); + } + mm->register_handler(bucket_meta_handler); + mm->register_handler(bucket_instance_meta_handler); +} diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h new file mode 100644 index 00000000..11623b85 --- /dev/null +++ b/src/rgw/rgw_bucket.h @@ -0,0 +1,575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: 
ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_BUCKET_H
+#define CEPH_RGW_BUCKET_H
+
+#include /* NOTE(review): header name is missing in this copy — TODO restore */
+#include /* NOTE(review): header name is missing in this copy — TODO restore */
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "rgw_rados.h"
+
+#include "rgw_string.h"
+
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/ceph_time.h"
+#include "rgw_formats.h"
+
+
+// define as static when RGWBucket implementation completes
+extern void rgw_get_buckets_obj(const rgw_user& user_id, string& buckets_obj_id);
+
+extern int rgw_bucket_store_info(RGWRados *store, const string& bucket_name, bufferlist& bl, bool exclusive,
+                                 map *pattrs, RGWObjVersionTracker *objv_tracker, /* NOTE(review): `map` appears to have lost its template arguments — TODO confirm */
+                                 real_time mtime);
+extern int rgw_bucket_instance_store_info(RGWRados *store, string& oid, bufferlist& bl, bool exclusive,
+                                 map *pattrs, RGWObjVersionTracker *objv_tracker, /* NOTE(review): same missing template arguments as above */
+                                 real_time mtime);
+
+extern int rgw_bucket_parse_bucket_instance(const string& bucket_instance, string *target_bucket_instance, int *shard_id);
+extern int rgw_bucket_parse_bucket_key(CephContext *cct, const string& key,
+                                       rgw_bucket* bucket, int *shard_id);
+
+extern int rgw_bucket_instance_remove_entry(RGWRados *store, const string& entry,
+                                            RGWObjVersionTracker *objv_tracker);
+extern void rgw_bucket_instance_key_to_oid(string& key); /* converts in place */
+extern void rgw_bucket_instance_oid_to_key(string& oid); /* converts in place */
+
+extern int rgw_bucket_delete_bucket_obj(RGWRados *store,
+                                        const string& tenant_name,
+                                        const string& bucket_name,
+                                        RGWObjVersionTracker& objv_tracker);
+
+extern int rgw_bucket_sync_user_stats(RGWRados *store, const rgw_user& user_id, const RGWBucketInfo& bucket_info);
+extern int rgw_bucket_sync_user_stats(RGWRados *store, const string& tenant_name, const string& bucket_name);
+
+extern std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
+                                              const std::string& bucket_name);
+static inline void rgw_make_bucket_entry_name(const string& tenant_name, /* out-parameter form of the overload declared above */
+                                              const string& bucket_name,
+                                              std::string& bucket_entry) {
+  bucket_entry = 
rgw_make_bucket_entry_name(tenant_name, bucket_name); +} + +extern void rgw_parse_url_bucket(const string& bucket, + const string& auth_tenant, + string &tenant_name, string &bucket_name); + +struct RGWBucketCompleteInfo { + RGWBucketInfo info; + map attrs; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +class RGWBucketEntryMetadataObject : public RGWMetadataObject { + RGWBucketEntryPoint ep; +public: + RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, obj_version& v, real_time m) : ep(_ep) { + objv = v; + mtime = m; + } + + void dump(Formatter *f) const override { + ep.dump(f); + } +}; + +class RGWBucketInstanceMetadataObject : public RGWMetadataObject { + RGWBucketCompleteInfo info; +public: + RGWBucketInstanceMetadataObject() {} + RGWBucketInstanceMetadataObject(RGWBucketCompleteInfo& i, obj_version& v, real_time m) : info(i) { + objv = v; + mtime = m; + } + + void dump(Formatter *f) const override { + info.dump(f); + } + + void decode_json(JSONObj *obj) { + info.decode_json(obj); + } + + RGWBucketInfo& get_bucket_info() { return info.info; } +}; + +/** + * Store a list of the user's buckets, with associated functinos. + */ +class RGWUserBuckets +{ + std::map buckets; + +public: + RGWUserBuckets() = default; + RGWUserBuckets(RGWUserBuckets&&) = default; + + RGWUserBuckets& operator=(const RGWUserBuckets&) = default; + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(buckets, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(buckets, bl); + } + /** + * Check if the user owns a bucket by the given name. + */ + bool owns(string& name) { + map::iterator iter; + iter = buckets.find(name); + return (iter != buckets.end()); + } + + /** + * Add a (created) bucket to the user's bucket list. + */ + void add(const RGWBucketEnt& bucket) { + buckets[bucket.bucket.name] = bucket; + } + + /** + * Remove a bucket from the user's list by name. 
+ */ + void remove(const string& name) { + map::iterator iter; + iter = buckets.find(name); + if (iter != buckets.end()) { + buckets.erase(iter); + } + } + + /** + * Get the user's buckets as a map. + */ + map& get_buckets() { return buckets; } + + /** + * Cleanup data structure + */ + void clear() { buckets.clear(); } + + size_t count() { return buckets.size(); } +}; +WRITE_CLASS_ENCODER(RGWUserBuckets) + +class RGWMetadataManager; +class RGWMetadataHandler; + +class RGWBucketMetaHandlerAllocator { +public: + static RGWMetadataHandler *alloc(); +}; + +class RGWBucketInstanceMetaHandlerAllocator { +public: + static RGWMetadataHandler *alloc(); +}; + +class RGWArchiveBucketMetaHandlerAllocator { +public: + static RGWMetadataHandler *alloc(); +}; + +class RGWArchiveBucketInstanceMetaHandlerAllocator { +public: + static RGWMetadataHandler *alloc(); +}; + +extern void rgw_bucket_init(RGWMetadataManager *mm); +/** + * Get all the buckets owned by a user and fill up an RGWUserBuckets with them. + * Returns: 0 on success, -ERR# on failure. 
+ */
+extern int rgw_read_user_buckets(RGWRados *store,
+                                 const rgw_user& user_id,
+                                 RGWUserBuckets& buckets,
+                                 const string& marker,
+                                 const string& end_marker,
+                                 uint64_t max,
+                                 bool need_stats,
+                                 bool* is_truncated,
+                                 uint64_t default_amount = 1000);
+
+extern int rgw_link_bucket(RGWRados* store,
+                           const rgw_user& user_id,
+                           rgw_bucket& bucket,
+                           ceph::real_time creation_time,
+                           bool update_entrypoint = true);
+extern int rgw_unlink_bucket(RGWRados *store, const rgw_user& user_id,
+                             const string& tenant_name, const string& bucket_name, bool update_entrypoint = true);
+
+extern int rgw_remove_object(RGWRados *store, const RGWBucketInfo& bucket_info, const rgw_bucket& bucket, rgw_obj_key& key);
+extern int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children);
+extern int rgw_remove_bucket_bypass_gc(RGWRados *store, rgw_bucket& bucket, int concurrent_max);
+
+extern int rgw_bucket_set_attrs(RGWRados *store, RGWBucketInfo& bucket_info,
+                                map& attrs, /* NOTE(review): `map` appears to have lost its template arguments — TODO confirm */
+                                RGWObjVersionTracker *objv_tracker);
+extern int rgw_object_get_attr(RGWRados* store, const RGWBucketInfo& bucket_info,
+                               const rgw_obj& obj, const char* attr_name,
+                               bufferlist& out_bl);
+
+extern void check_bad_user_bucket_mapping(RGWRados *store, const rgw_user& user_id, bool fix);
+/* Parameter bundle for the administrative bucket operations declared in
+ * this header: target user/bucket/object plus option flags. */
+struct RGWBucketAdminOpState {
+  rgw_user uid;
+  std::string display_name;
+  std::string bucket_name;
+  std::string bucket_id;
+  std::string object_name;
+
+  bool list_buckets;
+  bool stat_buckets;
+  bool check_objects;
+  bool fix_index;
+  bool delete_child_objects;
+  bool bucket_stored; /* set by set_bucket() */
+  int max_aio = 0;
+
+  rgw_bucket bucket;
+
+  RGWQuotaInfo quota;
+
+  void set_fetch_stats(bool value) { stat_buckets = value; }
+  void set_check_objects(bool value) { check_objects = value; }
+  void set_fix_index(bool value) { fix_index = value; }
+  void set_delete_children(bool value) { delete_child_objects = value; }
+
+  void set_max_aio(int value) { max_aio = value; }
+
+  void set_user_id(const rgw_user& user_id) {
+    if 
(!user_id.empty()) /* an empty id leaves uid unchanged */
+      uid = user_id;
+  }
+  void set_tenant(const std::string& tenant_str) {
+    uid.tenant = tenant_str;
+  }
+  void set_bucket_name(const std::string& bucket_str) {
+    bucket_name = bucket_str;
+  }
+  void set_object(std::string& object_str) {
+    object_name = object_str;
+  }
+  void set_quota(RGWQuotaInfo& value) {
+    quota = value;
+  }
+
+  /* accessors; the non-const references returned below alias the members */
+  rgw_user& get_user_id() { return uid; }
+  std::string& get_user_display_name() { return display_name; }
+  std::string& get_bucket_name() { return bucket_name; }
+  std::string& get_object_name() { return object_name; }
+  std::string& get_tenant() { return uid.tenant; }
+
+  rgw_bucket& get_bucket() { return bucket; }
+  void set_bucket(rgw_bucket& _bucket) {
+    bucket = _bucket;
+    bucket_stored = true; /* reported by has_bucket_stored() */
+  }
+
+  void set_bucket_id(const string& bi) {
+    bucket_id = bi;
+  }
+  const string& get_bucket_id() { return bucket_id; }
+
+  bool will_fetch_stats() { return stat_buckets; }
+  bool will_fix_index() { return fix_index; }
+  bool will_delete_children() { return delete_child_objects; }
+  bool will_check_objects() { return check_objects; }
+  bool is_user_op() { return !uid.empty(); } /* a user id was given */
+  bool is_system_op() { return uid.empty(); } /* no user id was given */
+  bool has_bucket_stored() { return bucket_stored; }
+  int get_max_aio() { return max_aio; }
+  /* all option flags start out false; max_aio keeps its in-class default of 0 */
+  RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false),
+                            fix_index(false), delete_child_objects(false),
+                            bucket_stored(false) {}
+};
+
+/*
+ * A simple wrapper class for administrative bucket operations
+ */
+
+class RGWBucket
+{
+  RGWUserBuckets buckets;
+  RGWRados *store;
+  RGWAccessHandle handle;
+
+  RGWUserInfo user_info;
+  std::string tenant;
+  std::string bucket_name;
+
+  bool failure; /* reset by clear_failure() */
+
+  RGWBucketInfo bucket_info;
+
+public:
+  RGWBucket() : store(NULL), handle(NULL), failure(false) {}
+  int init(RGWRados *storage, RGWBucketAdminOpState& op_state);
+
+  int check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+              RGWFormatterFlusher& flusher, std::string 
*err_msg = NULL);
+
+  int check_object_index(RGWBucketAdminOpState& op_state,
+                         RGWFormatterFlusher& flusher,
+                         std::string *err_msg = NULL);
+
+  int check_index(RGWBucketAdminOpState& op_state,
+          map& existing_stats, /* NOTE(review): `map` appears to have lost its template arguments — TODO confirm */
+          map& calculated_stats, /* NOTE(review): same as above */
+          std::string *err_msg = NULL);
+  /* err_msg is optional (defaults to NULL) in the operations below */
+  int remove(RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true, std::string *err_msg = NULL);
+  int link(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+  int unlink(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+  int set_quota(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+
+  int remove_object(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+  int policy_bl_to_stream(bufferlist& bl, ostream& o);
+  int get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy);
+
+  void clear_failure() { failure = false; }
+
+  const RGWBucketInfo& get_bucket_info() const { return bucket_info; }
+};
+/* Static entry points for administrative bucket operations; each takes the
+ * store and a RGWBucketAdminOpState describing the request. */
+class RGWBucketAdminOp
+{
+public:
+  static int get_policy(RGWRados *store, RGWBucketAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher);
+  static int get_policy(RGWRados *store, RGWBucketAdminOpState& op_state,
+                  RGWAccessControlPolicy& policy);
+  static int dump_s3_policy(RGWRados *store, RGWBucketAdminOpState& op_state,
+                  ostream& os);
+
+  static int unlink(RGWRados *store, RGWBucketAdminOpState& op_state);
+  static int link(RGWRados *store, RGWBucketAdminOpState& op_state, string *err_msg = NULL);
+
+  static int check_index(RGWRados *store, RGWBucketAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher);
+
+  static int remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true);
+  static int remove_object(RGWRados *store, RGWBucketAdminOpState& op_state);
+  static int info(RGWRados *store, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher);
+  static int limit_check(RGWRados *store, RGWBucketAdminOpState& 
op_state,
+                         const std::list& user_ids, /* NOTE(review): `std::list` appears to have lost its template argument — TODO confirm */
+                         RGWFormatterFlusher& flusher,
+                         bool warnings_only = false);
+  static int set_quota(RGWRados *store, RGWBucketAdminOpState& op_state);
+
+  static int list_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher);
+
+  static int clear_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+                                   RGWFormatterFlusher& flusher);
+  static int fix_lc_shards(RGWRados *store, RGWBucketAdminOpState& op_state,
+                           RGWFormatterFlusher& flusher);
+  static int fix_obj_expiry(RGWRados *store, RGWBucketAdminOpState& op_state,
+                            RGWFormatterFlusher& flusher, bool dry_run = false);
+};
+
+/* Kind of entity a data-change record refers to; encoded as one byte. */
+enum DataLogEntityType {
+  ENTITY_TYPE_UNKNOWN = 0,
+  ENTITY_TYPE_BUCKET = 1,
+};
+/* One data-change record: entity type, key and timestamp. */
+struct rgw_data_change {
+  DataLogEntityType entity_type;
+  string key;
+  real_time timestamp;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    uint8_t t = (uint8_t)entity_type; /* the enum is written as a single uint8_t */
+    encode(t, bl);
+    encode(key, bl);
+    encode(timestamp, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+     uint8_t t;
+     decode(t, bl);
+     entity_type = (DataLogEntityType)t; /* cast back without range checking */
+     decode(key, bl);
+     decode(timestamp, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_data_change)
+/* A data-change record together with the id and timestamp of its log entry. */
+struct rgw_data_change_log_entry {
+  string log_id;
+  real_time log_timestamp;
+  rgw_data_change entry;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(log_id, bl);
+    encode(log_timestamp, bl);
+    encode(entry, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+     decode(log_id, bl);
+     decode(log_timestamp, bl);
+     decode(entry, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_data_change_log_entry)
+
+struct RGWDataChangesLogInfo {
+  string marker;
+  real_time last_update;
+
+  
void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +namespace rgw { +struct BucketChangeObserver; +} + +class RGWDataChangesLog { + CephContext *cct; + RGWRados *store; + rgw::BucketChangeObserver *observer = nullptr; + + int num_shards; + string *oids; + + Mutex lock; + RWLock modified_lock; + map > modified_shards; + + std::atomic down_flag = { false }; + + struct ChangeStatus { + real_time cur_expiration; + real_time cur_sent; + bool pending; + RefCountedCond *cond; + Mutex *lock; + + ChangeStatus() : pending(false), cond(NULL) { + lock = new Mutex("RGWDataChangesLog::ChangeStatus"); + } + + ~ChangeStatus() { + delete lock; + } + }; + + typedef std::shared_ptr ChangeStatusPtr; + + lru_map changes; + + map cur_cycle; + + void _get_change(const rgw_bucket_shard& bs, ChangeStatusPtr& status); + void register_renew(rgw_bucket_shard& bs); + void update_renewed(rgw_bucket_shard& bs, real_time& expiration); + + class ChangesRenewThread : public Thread { + CephContext *cct; + RGWDataChangesLog *log; + Mutex lock; + Cond cond; + + public: + ChangesRenewThread(CephContext *_cct, RGWDataChangesLog *_log) : cct(_cct), log(_log), lock("ChangesRenewThread::lock") {} + void *entry() override; + void stop(); + }; + + ChangesRenewThread *renew_thread; + +public: + + RGWDataChangesLog(CephContext *_cct, RGWRados *_store) : cct(_cct), store(_store), + lock("RGWDataChangesLog::lock"), modified_lock("RGWDataChangesLog::modified_lock"), + changes(cct->_conf->rgw_data_log_changes_size) { + num_shards = cct->_conf->rgw_data_log_num_shards; + + oids = new string[num_shards]; + + string prefix = cct->_conf->rgw_data_log_obj_prefix; + + if (prefix.empty()) { + prefix = "data_log"; + } + + for (int i = 0; i < num_shards; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%s.%d", prefix.c_str(), i); + oids[i] = buf; + } + + renew_thread = new ChangesRenewThread(cct, this); + renew_thread->create("rgw_dt_lg_renew"); + } + + ~RGWDataChangesLog(); + + int choose_oid(const 
rgw_bucket_shard& bs);
+  const std::string& get_oid(int shard_id) const { return oids[shard_id]; } /* no bounds check on shard_id */
+  int add_entry(rgw_bucket& bucket, int shard_id);
+  int get_log_shard_id(rgw_bucket& bucket, int shard_id);
+  int renew_entries();
+  int list_entries(int shard, const real_time& start_time, const real_time& end_time, int max_entries,
+		   list& entries, /* NOTE(review): `list` appears to have lost its template argument — TODO confirm */
+		   const string& marker,
+		   string *out_marker,
+		   bool *truncated);
+  int trim_entries(int shard_id, const real_time& start_time, const real_time& end_time,
+                   const string& start_marker, const string& end_marker);
+  int get_info(int shard_id, RGWDataChangesLogInfo *info);
+  int lock_exclusive(int shard_id, timespan duration, string& zone_id, string& owner_id);
+  int unlock(int shard_id, string& zone_id, string& owner_id);
+  struct LogMarker { /* position used by the list_entries() overload below: shard number plus marker string */
+    int shard;
+    string marker;
+
+    LogMarker() : shard(0) {}
+  };
+  int list_entries(const real_time& start_time, const real_time& end_time, int max_entries,
+               list& entries, LogMarker& marker, bool *ptruncated); /* NOTE(review): `list` template argument missing here too */
+
+  void mark_modified(int shard_id, const rgw_bucket_shard& bs);
+  void read_clear_modified(map > &modified); /* NOTE(review): `map` template arguments missing — TODO confirm */
+
+  void set_observer(rgw::BucketChangeObserver *observer) {
+    this->observer = observer;
+  }
+
+  bool going_down();
+};
+
+bool rgw_find_bucket_by_id(CephContext *cct, RGWMetadataManager *mgr, const string& marker,
+                           const string& bucket_id, rgw_bucket* bucket_out);
+
+#endif
diff --git a/src/rgw/rgw_cache.cc b/src/rgw/rgw_cache.cc
new file mode 100644
index 00000000..df992b59
--- /dev/null
+++ b/src/rgw/rgw_cache.cc
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_cache.h"
+#include "rgw_perf_counters.h"
+
+#include /* NOTE(review): header name is missing in this copy — TODO restore */
+
+#define dout_subsys ceph_subsys_rgw
+
+/* Looks `name` up in the cache. Return values visible in this function:
+ * 0 on a hit (info is copied out), -ENODATA for a cached negative entry,
+ * -ENOENT for every kind of miss (cache disabled, absent, expired, or the
+ * cached flags do not cover `mask`). */
+int ObjectCache::get(const string& name, ObjectCacheInfo& info, uint32_t mask, rgw_cache_entry_info *cache_info)
+{
+  RWLock::RLocker l(lock);
+
+  if (!enabled) {
+    return -ENOENT;
+  }
+
+  auto iter = cache_map.find(name);
+  
if (iter == cache_map.end()) { + ldout(cct, 10) << "cache get: name=" << name << " : miss" << dendl; + if (perfcounter) + perfcounter->inc(l_rgw_cache_miss); + return -ENOENT; + } + if (expiry.count() && + (ceph::coarse_mono_clock::now() - iter->second.info.time_added) > expiry) { + ldout(cct, 10) << "cache get: name=" << name << " : expiry miss" << dendl; + lock.unlock(); + lock.get_write(); + // check that wasn't already removed by other thread + iter = cache_map.find(name); + if (iter != cache_map.end()) { + for (auto &kv : iter->second.chained_entries) + kv.first->invalidate(kv.second); + remove_lru(name, iter->second.lru_iter); + cache_map.erase(iter); + } + if(perfcounter) + perfcounter->inc(l_rgw_cache_miss); + return -ENOENT; + } + + ObjectCacheEntry *entry = &iter->second; + + if (lru_counter - entry->lru_promotion_ts > lru_window) { + ldout(cct, 20) << "cache get: touching lru, lru_counter=" << lru_counter + << " promotion_ts=" << entry->lru_promotion_ts << dendl; + lock.unlock(); + lock.get_write(); /* promote lock to writer */ + + /* need to redo this because entry might have dropped off the cache */ + iter = cache_map.find(name); + if (iter == cache_map.end()) { + ldout(cct, 10) << "lost race! 
cache get: name=" << name << " : miss" << dendl; + if(perfcounter) perfcounter->inc(l_rgw_cache_miss); + return -ENOENT; + } + + entry = &iter->second; + /* check again, we might have lost a race here */ + if (lru_counter - entry->lru_promotion_ts > lru_window) { + touch_lru(name, *entry, iter->second.lru_iter); + } + } + + ObjectCacheInfo& src = iter->second.info; + if(src.status == -ENOENT) { + ldout(cct, 10) << "cache get: name=" << name << " : hit (negative entry)" << dendl; + if (perfcounter) perfcounter->inc(l_rgw_cache_hit); + return -ENODATA; + } + if ((src.flags & mask) != mask) { + ldout(cct, 10) << "cache get: name=" << name << " : type miss (requested=0x" + << std::hex << mask << ", cached=0x" << src.flags + << std::dec << ")" << dendl; + if(perfcounter) perfcounter->inc(l_rgw_cache_miss); + return -ENOENT; + } + ldout(cct, 10) << "cache get: name=" << name << " : hit (requested=0x" + << std::hex << mask << ", cached=0x" << src.flags + << std::dec << ")" << dendl; + + info = src; + if (cache_info) { + cache_info->cache_locator = name; + cache_info->gen = entry->gen; + } + if(perfcounter) perfcounter->inc(l_rgw_cache_hit); + + return 0; +} + +bool ObjectCache::chain_cache_entry(std::initializer_list cache_info_entries, + RGWChainedCache::Entry *chained_entry) +{ + RWLock::WLocker l(lock); + + if (!enabled) { + return false; + } + + std::vector entries; + entries.reserve(cache_info_entries.size()); + /* first verify that all entries are still valid */ + for (auto cache_info : cache_info_entries) { + ldout(cct, 10) << "chain_cache_entry: cache_locator=" + << cache_info->cache_locator << dendl; + auto iter = cache_map.find(cache_info->cache_locator); + if (iter == cache_map.end()) { + ldout(cct, 20) << "chain_cache_entry: couldn't find cache locator" << dendl; + return false; + } + + auto entry = &iter->second; + + if (entry->gen != cache_info->gen) { + ldout(cct, 20) << "chain_cache_entry: entry.gen (" << entry->gen + << ") != cache_info.gen (" << 
cache_info->gen << ")" + << dendl; + return false; + } + entries.push_back(entry); + } + + + chained_entry->cache->chain_cb(chained_entry->key, chained_entry->data); + + for (auto entry : entries) { + entry->chained_entries.push_back(make_pair(chained_entry->cache, + chained_entry->key)); + } + + return true; +} + +void ObjectCache::put(const string& name, ObjectCacheInfo& info, rgw_cache_entry_info *cache_info) +{ + RWLock::WLocker l(lock); + + if (!enabled) { + return; + } + + ldout(cct, 10) << "cache put: name=" << name << " info.flags=0x" + << std::hex << info.flags << std::dec << dendl; + + auto [iter, inserted] = cache_map.emplace(name, ObjectCacheEntry{}); + ObjectCacheEntry& entry = iter->second; + entry.info.time_added = ceph::coarse_mono_clock::now(); + if (inserted) { + entry.lru_iter = lru.end(); + } + ObjectCacheInfo& target = entry.info; + + invalidate_lru(entry); + + entry.chained_entries.clear(); + entry.gen++; + + touch_lru(name, entry, entry.lru_iter); + + target.status = info.status; + + if (info.status < 0) { + target.flags = 0; + target.xattrs.clear(); + target.data.clear(); + return; + } + + if (cache_info) { + cache_info->cache_locator = name; + cache_info->gen = entry.gen; + } + + // put() must include the latest version if we're going to keep caching it + target.flags &= ~CACHE_FLAG_OBJV; + + target.flags |= info.flags; + + if (info.flags & CACHE_FLAG_META) + target.meta = info.meta; + else if (!(info.flags & CACHE_FLAG_MODIFY_XATTRS)) + target.flags &= ~CACHE_FLAG_META; // non-meta change should reset meta + + if (info.flags & CACHE_FLAG_XATTRS) { + target.xattrs = info.xattrs; + map::iterator iter; + for (iter = target.xattrs.begin(); iter != target.xattrs.end(); ++iter) { + ldout(cct, 10) << "updating xattr: name=" << iter->first << " bl.length()=" << iter->second.length() << dendl; + } + } else if (info.flags & CACHE_FLAG_MODIFY_XATTRS) { + map::iterator iter; + for (iter = info.rm_xattrs.begin(); iter != info.rm_xattrs.end(); ++iter) { 
+ ldout(cct, 10) << "removing xattr: name=" << iter->first << dendl; + target.xattrs.erase(iter->first); + } + for (iter = info.xattrs.begin(); iter != info.xattrs.end(); ++iter) { + ldout(cct, 10) << "appending xattr: name=" << iter->first << " bl.length()=" << iter->second.length() << dendl; + target.xattrs[iter->first] = iter->second; + } + } + + if (info.flags & CACHE_FLAG_DATA) + target.data = info.data; + + if (info.flags & CACHE_FLAG_OBJV) + target.version = info.version; +} + +bool ObjectCache::remove(const string& name) +{ + RWLock::WLocker l(lock); + + if (!enabled) { + return false; + } + + auto iter = cache_map.find(name); + if (iter == cache_map.end()) + return false; + + ldout(cct, 10) << "removing " << name << " from cache" << dendl; + ObjectCacheEntry& entry = iter->second; + + for (auto& kv : entry.chained_entries) { + kv.first->invalidate(kv.second); + } + + remove_lru(name, iter->second.lru_iter); + cache_map.erase(iter); + return true; +} + +void ObjectCache::touch_lru(const string& name, ObjectCacheEntry& entry, + std::list::iterator& lru_iter) +{ + while (lru_size > (size_t)cct->_conf->rgw_cache_lru_size) { + auto iter = lru.begin(); + if ((*iter).compare(name) == 0) { + /* + * if the entry we're touching happens to be at the lru end, don't remove it, + * lru shrinking can wait for next time + */ + break; + } + auto map_iter = cache_map.find(*iter); + ldout(cct, 10) << "removing entry: name=" << *iter << " from cache LRU" << dendl; + if (map_iter != cache_map.end()) { + ObjectCacheEntry& entry = map_iter->second; + invalidate_lru(entry); + cache_map.erase(map_iter); + } + lru.pop_front(); + lru_size--; + } + + if (lru_iter == lru.end()) { + lru.push_back(name); + lru_size++; + lru_iter--; + ldout(cct, 10) << "adding " << name << " to cache LRU end" << dendl; + } else { + ldout(cct, 10) << "moving " << name << " to cache LRU end" << dendl; + lru.erase(lru_iter); + lru.push_back(name); + lru_iter = lru.end(); + --lru_iter; + } + + lru_counter++; 
+ entry.lru_promotion_ts = lru_counter; +} + +void ObjectCache::remove_lru(const string& name, + std::list::iterator& lru_iter) +{ + if (lru_iter == lru.end()) + return; + + lru.erase(lru_iter); + lru_size--; + lru_iter = lru.end(); +} + +void ObjectCache::invalidate_lru(ObjectCacheEntry& entry) +{ + for (auto iter = entry.chained_entries.begin(); + iter != entry.chained_entries.end(); ++iter) { + RGWChainedCache *chained_cache = iter->first; + chained_cache->invalidate(iter->second); + } +} + +void ObjectCache::set_enabled(bool status) +{ + RWLock::WLocker l(lock); + + enabled = status; + + if (!enabled) { + do_invalidate_all(); + } +} + +void ObjectCache::invalidate_all() +{ + RWLock::WLocker l(lock); + + do_invalidate_all(); +} + +void ObjectCache::do_invalidate_all() +{ + cache_map.clear(); + lru.clear(); + + lru_size = 0; + lru_counter = 0; + lru_window = 0; + + for (auto& cache : chained_cache) { + cache->invalidate_all(); + } +} + +void ObjectCache::chain_cache(RGWChainedCache *cache) { + RWLock::WLocker l(lock); + chained_cache.push_back(cache); +} + +void ObjectCache::unchain_cache(RGWChainedCache *cache) { + RWLock::WLocker l(lock); + + auto iter = chained_cache.begin(); + for (; iter != chained_cache.end(); ++iter) { + if (cache == *iter) { + chained_cache.erase(iter); + cache->unregistered(); + return; + } + } +} + +ObjectCache::~ObjectCache() +{ + for (auto cache : chained_cache) { + cache->unregistered(); + } +} + diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h new file mode 100644 index 00000000..b0696237 --- /dev/null +++ b/src/rgw/rgw_cache.h @@ -0,0 +1,219 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGWCACHE_H +#define CEPH_RGWCACHE_H + +#include "rgw_rados.h" +#include +#include +#include +#include "include/types.h" +#include "include/utime.h" +#include "include/ceph_assert.h" +#include "common/RWLock.h" + +enum { + UPDATE_OBJ, + REMOVE_OBJ, +}; + +#define 
CACHE_FLAG_DATA 0x01 +#define CACHE_FLAG_XATTRS 0x02 +#define CACHE_FLAG_META 0x04 +#define CACHE_FLAG_MODIFY_XATTRS 0x08 +#define CACHE_FLAG_OBJV 0x10 + +#define mydout(v) lsubdout(T::cct, rgw, v) + +struct ObjectMetaInfo { + uint64_t size; + real_time mtime; + + ObjectMetaInfo() : size(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(size, bl); + encode(mtime, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(size, bl); + decode(mtime, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(ObjectMetaInfo) + +struct ObjectCacheInfo { + int status = 0; + uint32_t flags = 0; + uint64_t epoch = 0; + bufferlist data; + map xattrs; + map rm_xattrs; + ObjectMetaInfo meta; + obj_version version = {}; + ceph::coarse_mono_time time_added; + + ObjectCacheInfo() = default; + + void encode(bufferlist& bl) const { + ENCODE_START(5, 3, bl); + encode(status, bl); + encode(flags, bl); + encode(data, bl); + encode(xattrs, bl); + encode(meta, bl); + encode(rm_xattrs, bl); + encode(epoch, bl); + encode(version, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl); + decode(status, bl); + decode(flags, bl); + decode(data, bl); + decode(xattrs, bl); + decode(meta, bl); + if (struct_v >= 2) + decode(rm_xattrs, bl); + if (struct_v >= 4) + decode(epoch, bl); + if (struct_v >= 5) + decode(version, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(ObjectCacheInfo) + +struct RGWCacheNotifyInfo { + uint32_t op; + rgw_raw_obj obj; + ObjectCacheInfo obj_info; + off_t ofs; + string ns; + + RGWCacheNotifyInfo() : op(0), ofs(0) {} + + void encode(bufferlist& obl) const { + ENCODE_START(2, 2, obl); + encode(op, obl); + encode(obj, obl); + 
encode(obj_info, obl); + encode(ofs, obl); + encode(ns, obl); + ENCODE_FINISH(obl); + } + void decode(bufferlist::const_iterator& ibl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ibl); + decode(op, ibl); + decode(obj, ibl); + decode(obj_info, ibl); + decode(ofs, ibl); + decode(ns, ibl); + DECODE_FINISH(ibl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(RGWCacheNotifyInfo) + +class RGWChainedCache { +public: + virtual ~RGWChainedCache() {} + virtual void chain_cb(const string& key, void *data) = 0; + virtual void invalidate(const string& key) = 0; + virtual void invalidate_all() = 0; + virtual void unregistered() {} + + struct Entry { + RGWChainedCache *cache; + const string& key; + void *data; + + Entry(RGWChainedCache *_c, const string& _k, void *_d) : cache(_c), key(_k), data(_d) {} + }; +}; + + +struct ObjectCacheEntry { + ObjectCacheInfo info; + std::list::iterator lru_iter; + uint64_t lru_promotion_ts; + uint64_t gen; + std::vector > chained_entries; + + ObjectCacheEntry() : lru_promotion_ts(0), gen(0) {} +}; + +class ObjectCache { + std::unordered_map cache_map; + std::list lru; + unsigned long lru_size; + unsigned long lru_counter; + unsigned long lru_window; + RWLock lock; + CephContext *cct; + + vector chained_cache; + + bool enabled; + ceph::timespan expiry; + + void touch_lru(const string& name, ObjectCacheEntry& entry, + std::list::iterator& lru_iter); + void remove_lru(const string& name, std::list::iterator& lru_iter); + void invalidate_lru(ObjectCacheEntry& entry); + + void do_invalidate_all(); + +public: + ObjectCache() : lru_size(0), lru_counter(0), lru_window(0), lock("ObjectCache"), cct(NULL), enabled(false) { } + ~ObjectCache(); + int get(const std::string& name, ObjectCacheInfo& bl, uint32_t mask, rgw_cache_entry_info *cache_info); + std::optional get(const std::string& name) { + std::optional info{std::in_place}; + auto r = get(name, *info, 0, nullptr); + return r < 0 ? 
std::nullopt : info; + } + + template + void for_each(const F& f) { + RWLock::RLocker l(lock); + if (enabled) { + auto now = ceph::coarse_mono_clock::now(); + for (const auto& [name, entry] : cache_map) { + if (expiry.count() && (now - entry.info.time_added) < expiry) { + f(name, entry); + } + } + } + } + + void put(const std::string& name, ObjectCacheInfo& bl, rgw_cache_entry_info *cache_info); + bool remove(const std::string& name); + void set_ctx(CephContext *_cct) { + cct = _cct; + lru_window = cct->_conf->rgw_cache_lru_size / 2; + expiry = std::chrono::seconds(cct->_conf.get_val( + "rgw_cache_expiry_interval")); + } + bool chain_cache_entry(std::initializer_list cache_info_entries, + RGWChainedCache::Entry *chained_entry); + + void set_enabled(bool status); + + void chain_cache(RGWChainedCache *cache); + void unchain_cache(RGWChainedCache *cache); + void invalidate_all(); +}; + +#endif diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc new file mode 100644 index 00000000..f0f8ef5d --- /dev/null +++ b/src/rgw/rgw_civetweb.cc @@ -0,0 +1,248 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include "civetweb/civetweb.h" +#include "rgw_civetweb.h" +#include "rgw_perf_counters.h" + + +#define dout_subsys ceph_subsys_rgw + +size_t RGWCivetWeb::write_data(const char *buf, const size_t len) +{ + size_t off = 0; + auto to_sent = len; + while (to_sent) { + const int ret = mg_write(conn, buf + off, to_sent); + if (ret < 0 || ! ret) { + /* According to the documentation of mg_write() it always returns -1 on + * error. The details aren't available, so we will just throw EIO. Same + * goes to 0 that is associated with writing to a closed connection. 
*/ + throw rgw::io::Exception(EIO, std::system_category()); + } else { + off += static_cast(ret); + to_sent -= static_cast(ret); + } + } + return len; +} + +RGWCivetWeb::RGWCivetWeb(mg_connection* const conn) + : conn(conn), + explicit_keepalive(false), + explicit_conn_close(false), + got_eof_on_read(false), + txbuf(*this) +{ + sockaddr *lsa = mg_get_local_addr(conn); + switch(lsa->sa_family) { + case AF_INET: + port = ntohs(((struct sockaddr_in*)lsa)->sin_port); + break; + case AF_INET6: + port = ntohs(((struct sockaddr_in6*)lsa)->sin6_port); + break; + default: + port = -1; + } +} + +size_t RGWCivetWeb::read_data(char *buf, size_t len) +{ + size_t c; + int ret; + if (got_eof_on_read) { + return 0; + } + for (c = 0; c < len; c += ret) { + ret = mg_read(conn, buf+c, len-c); + if (ret < 0) { + throw rgw::io::Exception(EIO, std::system_category()); + } + if (!ret) { + got_eof_on_read = true; + break; + } + } + return c; +} + +void RGWCivetWeb::flush() +{ + txbuf.pubsync(); +} + +size_t RGWCivetWeb::complete_request() +{ + perfcounter->inc(l_rgw_qlen, -1); + perfcounter->inc(l_rgw_qactive, -1); + return 0; +} + +int RGWCivetWeb::init_env(CephContext *cct) +{ + env.init(cct); + const struct mg_request_info* info = mg_get_request_info(conn); + + if (! 
info) { + // request info is NULL; we have no info about the connection + return -EINVAL; + } + + for (int i = 0; i < info->num_headers; i++) { + const auto header = &info->http_headers[i]; + + if (header->name == nullptr || header->value==nullptr) { + lderr(cct) << "client supplied malformatted headers" << dendl; + return -EINVAL; + } + + const boost::string_ref name(header->name); + const auto& value = header->value; + + if (boost::algorithm::iequals(name, "content-length")) { + env.set("CONTENT_LENGTH", value); + continue; + } + if (boost::algorithm::iequals(name, "content-type")) { + env.set("CONTENT_TYPE", value); + continue; + } + if (boost::algorithm::iequals(name, "connection")) { + explicit_keepalive = boost::algorithm::iequals(value, "keep-alive"); + explicit_conn_close = boost::algorithm::iequals(value, "close"); + } + + static const boost::string_ref HTTP_{"HTTP_"}; + + char buf[name.size() + HTTP_.size() + 1]; + auto dest = std::copy(std::begin(HTTP_), std::end(HTTP_), buf); + for (auto src = name.begin(); src != name.end(); ++src, ++dest) { + if (*src == '-') { + *dest = '_'; + } else { + *dest = std::toupper(*src); + } + } + *dest = '\0'; + + env.set(buf, value); + } + + perfcounter->inc(l_rgw_qlen); + perfcounter->inc(l_rgw_qactive); + + env.set("REMOTE_ADDR", info->remote_addr); + env.set("REQUEST_METHOD", info->request_method); + env.set("HTTP_VERSION", info->http_version); + env.set("REQUEST_URI", info->request_uri); // get the full uri, we anyway handle abs uris later + env.set("SCRIPT_URI", info->local_uri); + if (info->query_string) { + env.set("QUERY_STRING", info->query_string); + } + if (info->remote_user) { + env.set("REMOTE_USER", info->remote_user); + } + + if (port <= 0) + lderr(cct) << "init_env: bug: invalid port number" << dendl; + char port_buf[16]; + snprintf(port_buf, sizeof(port_buf), "%d", port); + env.set("SERVER_PORT", port_buf); + if (info->is_ssl) { + env.set("SERVER_PORT_SECURE", port_buf); + } + return 0; +} + +size_t 
RGWCivetWeb::send_status(int status, const char *status_name) +{ + mg_set_http_status(conn, status); + + static constexpr size_t STATUS_BUF_SIZE = 128; + + char statusbuf[STATUS_BUF_SIZE]; + const auto statuslen = snprintf(statusbuf, sizeof(statusbuf), + "HTTP/1.1 %d %s\r\n", status, status_name); + + return txbuf.sputn(statusbuf, statuslen); +} + +size_t RGWCivetWeb::send_100_continue() +{ + const char HTTTP_100_CONTINUE[] = "HTTP/1.1 100 CONTINUE\r\n\r\n"; + const size_t sent = txbuf.sputn(HTTTP_100_CONTINUE, + sizeof(HTTTP_100_CONTINUE) - 1); + flush(); + return sent; +} + +size_t RGWCivetWeb::send_header(const boost::string_ref& name, + const boost::string_ref& value) +{ + static constexpr char HEADER_SEP[] = ": "; + static constexpr char HEADER_END[] = "\r\n"; + + size_t sent = 0; + + sent += txbuf.sputn(name.data(), name.length()); + sent += txbuf.sputn(HEADER_SEP, sizeof(HEADER_SEP) - 1); + sent += txbuf.sputn(value.data(), value.length()); + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + return sent; +} + +size_t RGWCivetWeb::dump_date_header() +{ + char timestr[TIME_BUF_SIZE]; + + const time_t gtime = time(nullptr); + struct tm result; + struct tm const* const tmp = gmtime_r(>ime, &result); + + if (nullptr == tmp) { + return 0; + } + + if (! 
strftime(timestr, sizeof(timestr), + "Date: %a, %d %b %Y %H:%M:%S %Z\r\n", tmp)) { + return 0; + } + + return txbuf.sputn(timestr, strlen(timestr)); +} + +size_t RGWCivetWeb::complete_header() +{ + size_t sent = dump_date_header(); + + if (explicit_keepalive) { + constexpr char CONN_KEEP_ALIVE[] = "Connection: Keep-Alive\r\n"; + sent += txbuf.sputn(CONN_KEEP_ALIVE, sizeof(CONN_KEEP_ALIVE) - 1); + } else if (explicit_conn_close) { + constexpr char CONN_KEEP_CLOSE[] = "Connection: close\r\n"; + sent += txbuf.sputn(CONN_KEEP_CLOSE, sizeof(CONN_KEEP_CLOSE) - 1); + } + + static constexpr char HEADER_END[] = "\r\n"; + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + flush(); + return sent; +} + +size_t RGWCivetWeb::send_content_length(uint64_t len) +{ + static constexpr size_t CONLEN_BUF_SIZE = 128; + + char sizebuf[CONLEN_BUF_SIZE]; + const auto sizelen = snprintf(sizebuf, sizeof(sizebuf), + "Content-Length: %" PRIu64 "\r\n", len); + return txbuf.sputn(sizebuf, sizelen); +} diff --git a/src/rgw/rgw_civetweb.h b/src/rgw/rgw_civetweb.h new file mode 100644 index 00000000..6a6acd58 --- /dev/null +++ b/src/rgw/rgw_civetweb.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_MONGOOSE_H +#define CEPH_RGW_MONGOOSE_H +#define TIME_BUF_SIZE 128 + +#include "rgw_client_io.h" + + +struct mg_connection; + +class RGWCivetWeb : public rgw::io::RestfulClient, + public rgw::io::BuffererSink { + RGWEnv env; + mg_connection *conn; + + int port; + + bool explicit_keepalive; + bool explicit_conn_close; + bool got_eof_on_read; + + rgw::io::StaticOutputBufferer<> txbuf; + + size_t write_data(const char *buf, size_t len) override; + size_t read_data(char *buf, size_t len); + size_t dump_date_header(); + +public: + [[nodiscard]] int init_env(CephContext *cct) override; + + size_t send_status(int status, const char *status_name) override; + size_t send_100_continue() override; + size_t 
send_header(const boost::string_ref& name, + const boost::string_ref& value) override; + size_t send_content_length(uint64_t len) override; + size_t complete_header() override; + + size_t recv_body(char* buf, size_t max) override { + return read_data(buf, max); + } + + size_t send_body(const char* buf, size_t len) override { + return write_data(buf, len); + } + + size_t complete_request() override; + + void flush() override; + + RGWEnv& get_env() noexcept override { + return env; + } + + explicit RGWCivetWeb(mg_connection *_conn); +}; + +#endif diff --git a/src/rgw/rgw_civetweb_frontend.cc b/src/rgw/rgw_civetweb_frontend.cc new file mode 100644 index 00000000..4e9d1ce7 --- /dev/null +++ b/src/rgw/rgw_civetweb_frontend.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include + +#include "rgw_frontend.h" +#include "rgw_client_io_filters.h" +#include "rgw_dmclock_sync_scheduler.h" + +#define dout_subsys ceph_subsys_rgw + +namespace dmc = rgw::dmclock; + +RGWCivetWebFrontend::RGWCivetWebFrontend(RGWProcessEnv& env, + RGWFrontendConfig *conf, + dmc::SchedulerCtx& sched_ctx) + : conf(conf), + ctx(nullptr), + env(env) +{ + + auto sched_t = dmc::get_scheduler_t(cct()); + switch(sched_t){ + case dmc::scheduler_t::none: [[fallthrough]]; + case dmc::scheduler_t::throttler: + break; + case dmc::scheduler_t::dmclock: + // TODO: keep track of server ready state and use that here civetweb + // internally tracks in the ctx the threads used and free, while it is + // expected with the current implementation that the threads waiting on the + // queue would still show up in the "used" queue, it might be a useful thing + // to make decisions on in the future. 
Also while reconfiguring we should + // probably set this to false + auto server_ready_f = []() -> bool { return true; }; + + scheduler.reset(new dmc::SyncScheduler(cct(), + std::ref(sched_ctx.get_dmc_client_counters()), + *sched_ctx.get_dmc_client_config(), + server_ready_f, + std::ref(dmc::SyncScheduler::handle_request_cb), + dmc::AtLimit::Reject)); + } + +} + +static int civetweb_callback(struct mg_connection* conn) +{ + const struct mg_request_info* const req_info = mg_get_request_info(conn); + return static_cast(req_info->user_data)->process(conn); +} + +int RGWCivetWebFrontend::process(struct mg_connection* const conn) +{ + /* Hold a read lock over access to env.store for reconfiguration. */ + RWLock::RLocker lock(env.mutex); + + RGWCivetWeb cw_client(conn); + auto real_client_io = rgw::io::add_reordering( + rgw::io::add_buffering(dout_context, + rgw::io::add_chunking( + rgw::io::add_conlen_controlling( + &cw_client)))); + RGWRestfulIO client_io(dout_context, &real_client_io); + + RGWRequest req(env.store->get_new_req_id()); + int http_ret = 0; + //assert (scheduler != nullptr); + int ret = process_request(env.store, env.rest, &req, env.uri_prefix, + *env.auth_registry, &client_io, env.olog, + null_yield, scheduler.get() ,&http_ret); + if (ret < 0) { + /* We don't really care about return code. */ + dout(20) << "process_request() returned " << ret << dendl; + } + + if (http_ret <= 0) { + /* Mark as processed. 
*/ + return 1; + } + + return http_ret; +} + +int RGWCivetWebFrontend::run() +{ + auto& conf_map = conf->get_config_map(); + + set_conf_default(conf_map, "num_threads", + std::to_string(g_conf()->rgw_thread_pool_size)); + set_conf_default(conf_map, "decode_url", "no"); + set_conf_default(conf_map, "enable_keep_alive", "yes"); + set_conf_default(conf_map, "validate_http_method", "no"); + set_conf_default(conf_map, "canonicalize_url_path", "no"); + set_conf_default(conf_map, "enable_auth_domain_check", "no"); + set_conf_default(conf_map, "allow_unicode_in_urls", "yes"); + + std::string listening_ports; + // support multiple port= entries + auto range = conf_map.equal_range("port"); + for (auto p = range.first; p != range.second; ++p) { + std::string port_str = p->second; + // support port= entries with multiple values + std::replace(port_str.begin(), port_str.end(), '+', ','); + if (!listening_ports.empty()) { + listening_ports.append(1, ','); + } + listening_ports.append(port_str); + } + if (listening_ports.empty()) { + listening_ports = "80"; + } + conf_map.emplace("listening_ports", std::move(listening_ports)); + + /* Set run_as_user. This will cause civetweb to invoke setuid() and setgid() + * based on pw_uid and pw_gid obtained from pw_name. */ + std::string uid_string = g_ceph_context->get_set_uid_string(); + if (! uid_string.empty()) { + conf_map.emplace("run_as_user", std::move(uid_string)); + } + + /* Prepare options for CivetWeb. */ + const std::set rgw_opts = { "port", "prefix" }; + + std::vector options; + + for (const auto& pair : conf_map) { + if (! rgw_opts.count(pair.first)) { + /* CivetWeb doesn't understand configurables of the glue layer between + * it and RadosGW. We need to strip them out. Otherwise CivetWeb would + * signalise an error. 
*/ + options.push_back(pair.first.c_str()); + options.push_back(pair.second.c_str()); + + dout(20) << "civetweb config: " << pair.first + << ": " << pair.second << dendl; + } + } + + options.push_back(nullptr); + /* Initialize the CivetWeb right now. */ + struct mg_callbacks cb; + // FIPS zeroization audit 20191115: this memset is not security related. + memset((void *)&cb, 0, sizeof(cb)); + cb.begin_request = civetweb_callback; + cb.log_message = rgw_civetweb_log_callback; + cb.log_access = rgw_civetweb_log_access_callback; + ctx = mg_start(&cb, this, options.data()); + + return ! ctx ? -EIO : 0; +} /* RGWCivetWebFrontend::run */ diff --git a/src/rgw/rgw_civetweb_log.cc b/src/rgw/rgw_civetweb_log.cc new file mode 100644 index 00000000..d8a89453 --- /dev/null +++ b/src/rgw/rgw_civetweb_log.cc @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/config.h" +#include "rgw_common.h" + +#include "civetweb/civetweb.h" +#include "rgw_crypt_sanitize.h" + +#define dout_subsys ceph_subsys_civetweb + + +#define dout_context g_ceph_context +int rgw_civetweb_log_callback(const struct mg_connection *conn, const char *buf) { + dout(0) << "civetweb: " << (void *)conn << ": " << rgw::crypt_sanitize::log_content(buf) << dendl; + return 0; +} + +int rgw_civetweb_log_access_callback(const struct mg_connection *conn, const char *buf) { + dout(1) << "civetweb: " << (void *)conn << ": " << rgw::crypt_sanitize::log_content(buf) << dendl; + return 0; +} + + diff --git a/src/rgw/rgw_civetweb_log.h b/src/rgw/rgw_civetweb_log.h new file mode 100644 index 00000000..2fbd517c --- /dev/null +++ b/src/rgw/rgw_civetweb_log.h @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_CIVETWEB_LOG_H +#define CEPH_RGW_CIVETWEB_LOG_H + +int rgw_civetweb_log_callback(const struct mg_connection *conn, const char *buf); +int 
rgw_civetweb_log_access_callback(const struct mg_connection *conn, const char *buf); + +#endif diff --git a/src/rgw/rgw_client_io.cc b/src/rgw/rgw_client_io.cc new file mode 100644 index 00000000..9528ab6f --- /dev/null +++ b/src/rgw/rgw_client_io.cc @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include "rgw_client_io.h" +#include "rgw_crypt.h" +#include "rgw_crypt_sanitize.h" +#define dout_subsys ceph_subsys_rgw + +namespace rgw { +namespace io { + +[[nodiscard]] int BasicClient::init(CephContext *cct) { + int init_error = init_env(cct); + + if (init_error != 0) + return init_error; + + if (cct->_conf->subsys.should_gather()) { + const auto& env_map = get_env().get_map(); + + for (const auto& iter: env_map) { + rgw::crypt_sanitize::env x{iter.first, iter.second}; + ldout(cct, 20) << iter.first << "=" << (x) << dendl; + } + } + return init_error; +} + +} /* namespace io */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_client_io.h b/src/rgw/rgw_client_io.h new file mode 100644 index 00000000..1f5af676 --- /dev/null +++ b/src/rgw/rgw_client_io.h @@ -0,0 +1,439 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_CLIENT_IO_H +#define CEPH_RGW_CLIENT_IO_H + +#include +#include +#include +#include +#include +#include + +#include + +#include "include/types.h" +#include "rgw_common.h" + + +class RGWRestfulIO; + +namespace rgw { +namespace io { + +using Exception = std::system_error; + +/* The minimal and simplest subset of methods that a client of RadosGW can be + * interacted with. */ +class BasicClient { +protected: + virtual int init_env(CephContext *cct) = 0; + +public: + virtual ~BasicClient() = default; + + /* Initialize the BasicClient and inject CephContext. */ + int init(CephContext *cct); + + /* Return the RGWEnv describing the environment that a given request lives in. 
+ * The method does not throw exceptions. */ + virtual RGWEnv& get_env() noexcept = 0; + + /* Complete request. + * On success returns number of bytes generated for a direct client of RadosGW. + * On failure throws rgw::io::Exception containing errno. */ + virtual size_t complete_request() = 0; +}; /* rgw::io::Client */ + + +class Accounter { +public: + virtual ~Accounter() = default; + + /* Enable or disable the accounting of both sent and received data. Changing + * the state does not affect the counters. */ + virtual void set_account(bool enabled) = 0; + + /* Return number of bytes sent to a direct client of RadosGW (direct means + * eg. a web server instance in the case of using FastCGI front-end) when + * the accounting was enabled. */ + virtual uint64_t get_bytes_sent() const = 0; + + /* Return number of bytes received from a direct client of RadosGW (direct + * means eg. a web server instance in the case of using FastCGI front-end) + * when the accounting was enabled. */ + virtual uint64_t get_bytes_received() const = 0; +}; /* rgw::io::Accounter */ + + +/* Interface abstracting restful interactions with clients, usually through + * the HTTP protocol. The methods participating in the response generation + * process should be called in the specific order: + * 1. send_100_continue() - at most once, + * 2. send_status() - exactly once, + * 3. Any of: + * a. send_header(), + * b. send_content_length() XOR send_chunked_transfer_encoding() + * Please note that only one of those two methods must be called + at most once. + * 4. complete_header() - exactly once, + * 5. send_body() + * 6. complete_request() - exactly once. + * There are no restrictions on flush() - it may be called in any moment. + * + * Receiving data from a client isn't a subject to any further call order + * restrictions besides those imposed by BasicClient. That is, get_env() + * and recv_body can be mixed. 
*/ +class RestfulClient : public BasicClient { + template friend class DecoratedRestfulClient; + +public: + /* Generate the 100 Continue message. + * On success returns number of bytes generated for a direct client of RadosGW. + * On failure throws rgw::io::Exception containing errno. */ + virtual size_t send_100_continue() = 0; + + /* Generate the response's status part taking the HTTP status code as @status + * and its name pointed in @status_name. + * On success returns number of bytes generated for a direct client of RadosGW. + * On failure throws rgw::io::Exception containing errno. */ + virtual size_t send_status(int status, const char *status_name) = 0; + + /* Generate header. On success returns number of bytes generated for a direct + * client of RadosGW. On failure throws rgw::io::Exception containing errno. + * + * boost::string_ref is being used because of length it internally carries. */ + virtual size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) = 0; + + /* Inform a client about a content length. Takes number of bytes as @len. + * On success returns number of bytes generated for a direct client of RadosGW. + * On failure throws rgw::io::Exception containing errno. + * + * CALL LIMITATIONS: + * - The method must be called EXACTLY ONCE. + * - The method is interchangeable with send_chunked_transfer_encoding(). */ + virtual size_t send_content_length(uint64_t len) = 0; + + /* Inform a client that the chunked transfer encoding will be used. + * On success returns number of bytes generated for a direct client of RadosGW. + * On failure throws rgw::io::Exception containing errno. + * + * CALL LIMITATIONS: + * - The method must be called EXACTLY ONCE. + * - The method is interchangeable with send_content_length(). */ + virtual size_t send_chunked_transfer_encoding() { + /* This is a null implementation. We don't send anything here, even the HTTP + * header. 
The intended behaviour should be provided through a decorator or + * directly by a given front-end. */ + return 0; + } + + /* Generate completion (the CRLF sequence separating headers and body in + * the case of HTTP) of headers. On success returns number of generated bytes + * for a direct client of RadosGW. On failure throws rgw::io::Exception with + * errno. */ + virtual size_t complete_header() = 0; + + /* Receive no more than @max bytes from a request's body and store it in + * buffer pointed by @buf. On success returns number of bytes received from + * a direct client of RadosGW that has been stored in @buf. On failure throws + * rgw::io::Exception containing errno. */ + virtual size_t recv_body(char* buf, size_t max) = 0; + + /* Generate a part of response's body by taking exactly @len bytes from + * the buffer pointed by @buf. On success returns number of generated bytes + * of response's body. On failure throws rgw::io::Exception. */ + virtual size_t send_body(const char* buf, size_t len) = 0; + + /* Flushes all already generated data to a direct client of RadosGW. + * On failure throws rgw::io::Exception containing errno. */ + virtual void flush() = 0; +} /* rgw::io::RestfulClient */; + + +/* Abstract decorator over any implementation of rgw::io::RestfulClient + * which could be provided both as a pointer-to-object or the object itself. */ +template +class DecoratedRestfulClient : public RestfulClient { + template friend class DecoratedRestfulClient; + friend RGWRestfulIO; + + typedef typename std::remove_pointer::type DerefedDecorateeT; + + static_assert(std::is_base_of::value, + "DecorateeT must be a subclass of rgw::io::RestfulClient"); + + DecorateeT decoratee; + + /* There is an indirection layer over accessing decoratee to share the same + * code base between dynamic and static decorators. The difference is about + * what we store internally: pointer to a decorated object versus the whole + * object itself. 
*/ + template ::value, T>::type* = nullptr> + DerefedDecorateeT& get_decoratee() { + return decoratee; + } + +protected: + template ::value, T>::type* = nullptr> + DerefedDecorateeT& get_decoratee() { + return *decoratee; + } + + /* Dynamic decorators (those storing a pointer instead of the decorated + * object itself) can be reconfigured on-the-fly. HOWEVER: there are no + * facilities for orchestrating such changes. Callers must take care of + * atomicity and thread-safety. */ + template ::value, T>::type* = nullptr> + void set_decoratee(DerefedDecorateeT& new_dec) { + decoratee = &new_dec; + } + + int init_env(CephContext *cct) override { + return get_decoratee().init_env(cct); + } + +public: + explicit DecoratedRestfulClient(DecorateeT&& decoratee) + : decoratee(std::forward(decoratee)) { + } + + size_t send_status(const int status, + const char* const status_name) override { + return get_decoratee().send_status(status, status_name); + } + + size_t send_100_continue() override { + return get_decoratee().send_100_continue(); + } + + size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) override { + return get_decoratee().send_header(name, value); + } + + size_t send_content_length(const uint64_t len) override { + return get_decoratee().send_content_length(len); + } + + size_t send_chunked_transfer_encoding() override { + return get_decoratee().send_chunked_transfer_encoding(); + } + + size_t complete_header() override { + return get_decoratee().complete_header(); + } + + size_t recv_body(char* const buf, const size_t max) override { + return get_decoratee().recv_body(buf, max); + } + + size_t send_body(const char* const buf, + const size_t len) override { + return get_decoratee().send_body(buf, len); + } + + void flush() override { + return get_decoratee().flush(); + } + + RGWEnv& get_env() noexcept override { + return get_decoratee().get_env(); + } + + size_t complete_request() override { + return 
get_decoratee().complete_request(); + } +} /* rgw::io::DecoratedRestfulClient */; + + +/* Interface that should be provided by a front-end class wanting to to use + * the low-level buffering offered by i.e. StaticOutputBufferer. */ +class BuffererSink { +public: + virtual ~BuffererSink() = default; + + /* Send exactly @len bytes from the memory location pointed by @buf. + * On success returns @len. On failure throws rgw::io::Exception. */ + virtual size_t write_data(const char *buf, size_t len) = 0; +}; + +/* Utility class providing RestfulClient's implementations with facilities + * for low-level buffering without relying on dynamic memory allocations. + * The buffer is carried entirely on stack. This narrows down applicability + * to these situations where buffers are relatively small. This perfectly + * fits the needs of composing an HTTP header. Without that a front-end + * might need to issue a lot of small IO operations leading to increased + * overhead on syscalls and fragmentation of a message if the Nagle's + * algorithm won't be able to form a single TCP segment (usually when + * running on extremely fast network interfaces like the loopback). */ +template +class StaticOutputBufferer : public std::streambuf { + static_assert(BufferSizeV >= sizeof(std::streambuf::char_type), + "Buffer size must be bigger than a single char_type."); + + using std::streambuf::int_type; + + int_type overflow(const int_type c) override { + *pptr() = c; + pbump(sizeof(std::streambuf::char_type)); + + if (! sync()) { + /* No error, the buffer has been successfully synchronized. */ + return c; + } else { + return std::streambuf::traits_type::eof(); + } + } + + int sync() override { + const auto len = static_cast(std::streambuf::pptr() - + std::streambuf::pbase()); + std::streambuf::pbump(-len); + sink.write_data(std::streambuf::pbase(), len); + /* Always return success here. In case of failure write_data() will throw + * rgw::io::Exception. 
*/ + return 0; + } + + BuffererSink& sink; + std::streambuf::char_type buffer[BufferSizeV]; + +public: + explicit StaticOutputBufferer(BuffererSink& sink) + : sink(sink) { + constexpr size_t len = sizeof(buffer) - sizeof(std::streambuf::char_type); + std::streambuf::setp(buffer, buffer + len); + } +}; + +} /* namespace io */ +} /* namespace rgw */ + + +/* We're doing this nasty thing only because of extensive usage of templates + * to implement the static decorator pattern. C++ templates de facto enforce + * mixing interfaces with implementation. Additionally, those classes derive + * from RGWRestfulIO defined here. I believe that including in the middle of + * file is still better than polluting it directly. */ +#include "rgw_client_io_filters.h" + + +/* RGWRestfulIO: high level interface to interact with RESTful clients. What + * differentiates it from rgw::io::RestfulClient is providing more specific APIs + * like rgw::io::Accounter or the AWS Auth v4 stuff implemented by filters + * while hiding the pipelined architecture from clients. + * + * rgw::io::Accounter came in as a part of rgw::io::AccountingFilter. */ +class RGWRestfulIO : public rgw::io::AccountingFilter { + std::vector> filters; + +public: + ~RGWRestfulIO() override = default; + + RGWRestfulIO(CephContext *_cx, rgw::io::RestfulClient* engine) + : AccountingFilter(_cx, std::move(engine)) { + } + + void add_filter(std::shared_ptr new_filter) { + new_filter->set_decoratee(this->get_decoratee()); + this->set_decoratee(*new_filter); + filters.emplace_back(std::move(new_filter)); + } +}; /* RGWRestfulIO */ + + +/* Type conversions to work around lack of req_state type hierarchy matching + * (e.g.) REST backends (may be replaced w/dynamic typed req_state). 
*/
+static inline rgw::io::RestfulClient* RESTFUL_IO(struct req_state* s) {
+  /* Verify the dynamic type before the unchecked static_cast below. */
+  ceph_assert(dynamic_cast<rgw::io::RestfulClient*>(s->cio) != nullptr);
+
+  return static_cast<rgw::io::RestfulClient*>(s->cio);
+}
+
+static inline rgw::io::Accounter* ACCOUNTING_IO(struct req_state* s) {
+  /* Accounter is reached through a cross-cast, hence dynamic_cast. */
+  auto ptr = dynamic_cast<rgw::io::Accounter*>(s->cio);
+  ceph_assert(ptr != nullptr);
+
+  return ptr;
+}
+
+static inline RGWRestfulIO* AWS_AUTHv4_IO(const req_state* const s) {
+  /* Verify the dynamic type before the unchecked static_cast below. */
+  ceph_assert(dynamic_cast<RGWRestfulIO*>(s->cio) != nullptr);
+
+  return static_cast<RGWRestfulIO*>(s->cio);
+}
+
+
+/* Input stream buffer that pulls a request's body out of RGWRestfulIO.
+ *
+ * The internal buffer is laid out as [put-back area | read window]: up to
+ * @putback_size already consumed bytes are kept in front of the freshly
+ * received data so that unget()/putback() keep working across refills, and
+ * no more than @window_size bytes are requested from the client per refill. */
+class RGWClientIOStreamBuf : public std::streambuf {
+protected:
+  RGWRestfulIO &rio;
+  size_t const window_size;
+  size_t const putback_size;
+  std::vector<char> buffer;
+
+public:
+  RGWClientIOStreamBuf(RGWRestfulIO &rio, size_t ws, size_t ps = 1)
+    : rio(rio),
+      window_size(ws),
+      putback_size(ps),
+      buffer(ws + ps)
+  {
+    /* An all-null get area marks "nothing has been read yet". */
+    setg(nullptr, nullptr, nullptr);
+  }
+
+  /* Refill the get area from the client.
+   * Returns the next available character without consuming it, or
+   * traits_type::eof() when recv_body() yields no data or throws
+   * rgw::io::Exception. */
+  std::streambuf::int_type underflow() override {
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    char * const base = buffer.data();
+
+    /* Number of already consumed bytes to preserve for unget(). There is
+     * nothing to preserve on the first underflow, and a previous short read
+     * may have left fewer than putback_size bytes in the get area, so never
+     * move more than what is really there. */
+    size_t keep = 0;
+    if (nullptr != eback()) {
+      const size_t avail = static_cast<size_t>(egptr() - eback());
+      keep = (avail < putback_size) ? avail : putback_size;
+      std::memmove(base, egptr() - keep, keep);
+    }
+    char * const start = base + keep;
+
+    size_t read_len = 0;
+    try {
+      /* Receive right behind the preserved bytes; keep <= putback_size, so
+       * start + window_size never exceeds the buffer. */
+      read_len = rio.recv_body(start, window_size);
+    } catch (rgw::io::Exception&) {
+      return traits_type::eof();
+    }
+    if (0 == read_len) {
+      return traits_type::eof();
+    }
+
+    setg(base, start, start + read_len);
+
+    return traits_type::to_int_type(*gptr());
+  }
+};
+
+class RGWClientIOStream : private RGWClientIOStreamBuf, public std::istream {
+/* Inheritance from RGWClientIOStreamBuf is a kind of shadow, indirect
+ * form of composition here. We cannot do that explicitly because istream
+ * ctor is being called prior to construction of any member of this class.
*/ + +public: + explicit RGWClientIOStream(RGWRestfulIO &s) + : RGWClientIOStreamBuf(s, 1, 2), + istream(static_cast(this)) { + } +}; + +#endif /* CEPH_RGW_CLIENT_IO_H */ diff --git a/src/rgw/rgw_client_io_filters.h b/src/rgw/rgw_client_io_filters.h new file mode 100644 index 00000000..9ce83a93 --- /dev/null +++ b/src/rgw/rgw_client_io_filters.h @@ -0,0 +1,456 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_CLIENT_IO_DECOIMPL_H +#define CEPH_RGW_CLIENT_IO_DECOIMPL_H + +#include + +#include + +#include "rgw_common.h" +#include "rgw_client_io.h" + +namespace rgw { +namespace io { + +template +class AccountingFilter : public DecoratedRestfulClient, + public Accounter { + bool enabled; + uint64_t total_sent; + uint64_t total_received; + CephContext *cct; + +public: + template + AccountingFilter(CephContext *cct, U&& decoratee) + : DecoratedRestfulClient(std::forward(decoratee)), + enabled(false), + total_sent(0), + total_received(0), cct(cct) { + } + + size_t send_status(const int status, + const char* const status_name) override { + const auto sent = DecoratedRestfulClient::send_status(status, + status_name); + lsubdout(cct, rgw, 30) << "AccountingFilter::send_status: e=" + << (enabled ? "1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t send_100_continue() override { + const auto sent = DecoratedRestfulClient::send_100_continue(); + lsubdout(cct, rgw, 30) << "AccountingFilter::send_100_continue: e=" + << (enabled ? "1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) override { + const auto sent = DecoratedRestfulClient::send_header(name, value); + lsubdout(cct, rgw, 30) << "AccountingFilter::send_header: e=" + << (enabled ? 
"1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t send_content_length(const uint64_t len) override { + const auto sent = DecoratedRestfulClient::send_content_length(len); + lsubdout(cct, rgw, 30) << "AccountingFilter::send_content_length: e=" + << (enabled ? "1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t send_chunked_transfer_encoding() override { + const auto sent = DecoratedRestfulClient::send_chunked_transfer_encoding(); + lsubdout(cct, rgw, 30) << "AccountingFilter::send_chunked_transfer_encoding: e=" + << (enabled ? "1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t complete_header() override { + const auto sent = DecoratedRestfulClient::complete_header(); + lsubdout(cct, rgw, 30) << "AccountingFilter::complete_header: e=" + << (enabled ? "1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t recv_body(char* buf, size_t max) override { + const auto received = DecoratedRestfulClient::recv_body(buf, max); + lsubdout(cct, rgw, 30) << "AccountingFilter::recv_body: e=" + << (enabled ? "1" : "0") << ", received=" << received << dendl; + if (enabled) { + total_received += received; + } + return received; + } + + size_t send_body(const char* const buf, + const size_t len) override { + const auto sent = DecoratedRestfulClient::send_body(buf, len); + lsubdout(cct, rgw, 30) << "AccountingFilter::send_body: e=" + << (enabled ? 
"1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + size_t complete_request() override { + const auto sent = DecoratedRestfulClient::complete_request(); + lsubdout(cct, rgw, 30) << "AccountingFilter::complete_request: e=" + << (enabled ? "1" : "0") << ", sent=" << sent << ", total=" + << total_sent << dendl; + if (enabled) { + total_sent += sent; + } + return sent; + } + + uint64_t get_bytes_sent() const override { + return total_sent; + } + + uint64_t get_bytes_received() const override { + return total_received; + } + + void set_account(bool enabled) override { + this->enabled = enabled; + lsubdout(cct, rgw, 30) << "AccountingFilter::set_account: e=" + << (enabled ? "1" : "0") << dendl; + } +}; + + +/* Filter for in-memory buffering incoming data and calculating the content + * length header if it isn't present. */ +template +class BufferingFilter : public DecoratedRestfulClient { + template friend class DecoratedRestfulClient; +protected: + ceph::bufferlist data; + + bool has_content_length; + bool buffer_data; + CephContext *cct; + +public: + template + BufferingFilter(CephContext *cct, U&& decoratee) + : DecoratedRestfulClient(std::forward(decoratee)), + has_content_length(false), + buffer_data(false), cct(cct) { + } + + size_t send_content_length(const uint64_t len) override; + size_t send_chunked_transfer_encoding() override; + size_t complete_header() override; + size_t send_body(const char* buf, size_t len) override; + size_t complete_request() override; +}; + +template +size_t BufferingFilter::send_body(const char* const buf, + const size_t len) +{ + if (buffer_data) { + data.append(buf, len); + + lsubdout(cct, rgw, 30) << "BufferingFilter::send_body: defer count = " + << len << dendl; + return 0; + } + + return DecoratedRestfulClient::send_body(buf, len); +} + +template +size_t BufferingFilter::send_content_length(const uint64_t len) +{ + has_content_length = true; + 
return DecoratedRestfulClient::send_content_length(len); +} + +template +size_t BufferingFilter::send_chunked_transfer_encoding() +{ + has_content_length = true; + return DecoratedRestfulClient::send_chunked_transfer_encoding(); +} + +template +size_t BufferingFilter::complete_header() +{ + if (! has_content_length) { + /* We will dump everything in complete_request(). */ + buffer_data = true; + lsubdout(cct, rgw, 30) << "BufferingFilter::complete_header: has_content_length=" + << (has_content_length ? "1" : "0") << dendl; + return 0; + } + + return DecoratedRestfulClient::complete_header(); +} + +template +size_t BufferingFilter::complete_request() +{ + size_t sent = 0; + + if (! has_content_length) { + /* It is not correct to count these bytes here, + * because they can only be part of the header. + * Therefore force count to 0. + */ + sent += DecoratedRestfulClient::send_content_length(data.length()); + sent += DecoratedRestfulClient::complete_header(); + lsubdout(cct, rgw, 30) << + "BufferingFilter::complete_request: !has_content_length: IGNORE: sent=" + << sent << dendl; + sent = 0; + } + + if (buffer_data) { + /* We are sending each buffer separately to avoid extra memory shuffling + * that would occur on data.c_str() to provide a continuous memory area. 
*/ + for (const auto& ptr : data.buffers()) { + sent += DecoratedRestfulClient::send_body(ptr.c_str(), + ptr.length()); + } + data.clear(); + buffer_data = false; + lsubdout(cct, rgw, 30) << "BufferingFilter::complete_request: buffer_data: sent=" + << sent << dendl; + } + + return sent + DecoratedRestfulClient::complete_request(); +} + +template static inline +BufferingFilter add_buffering( +CephContext *cct, +T&& t) { + return BufferingFilter(cct, std::forward(t)); +} + + +template +class ChunkingFilter : public DecoratedRestfulClient { + template friend class DecoratedRestfulClient; +protected: + bool chunking_enabled; + +public: + template + explicit ChunkingFilter(U&& decoratee) + : DecoratedRestfulClient(std::forward(decoratee)), + chunking_enabled(false) { + } + + size_t send_chunked_transfer_encoding() override { + chunking_enabled = true; + return DecoratedRestfulClient::send_header("Transfer-Encoding", + "chunked"); + } + + size_t send_body(const char* buf, + const size_t len) override { + if (! chunking_enabled) { + return DecoratedRestfulClient::send_body(buf, len); + } else { + static constexpr char HEADER_END[] = "\r\n"; + /* https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.6.1 */ + // TODO: we have no support for sending chunked-encoding + // extensions/trailing headers. 
+ char chunk_size[32]; + const auto chunk_size_len = snprintf(chunk_size, sizeof(chunk_size), + "%" PRIx64 "\r\n", len); + size_t sent = 0; + + sent += DecoratedRestfulClient::send_body(chunk_size, chunk_size_len); + sent += DecoratedRestfulClient::send_body(buf, len); + sent += DecoratedRestfulClient::send_body(HEADER_END, + sizeof(HEADER_END) - 1); + return sent; + } + } + + size_t complete_request() override { + size_t sent = 0; + + if (chunking_enabled) { + static constexpr char CHUNKED_RESP_END[] = "0\r\n\r\n"; + sent += DecoratedRestfulClient::send_body(CHUNKED_RESP_END, + sizeof(CHUNKED_RESP_END) - 1); + } + + return sent + DecoratedRestfulClient::complete_request(); + } +}; + +template static inline +ChunkingFilter add_chunking(T&& t) { + return ChunkingFilter(std::forward(t)); +} + + +/* Class that controls and inhibits the process of sending Content-Length HTTP + * header where RFC 7230 requests so. The cases worth our attention are 204 No + * Content as well as 304 Not Modified. */ +template +class ConLenControllingFilter : public DecoratedRestfulClient { +protected: + enum class ContentLengthAction { + FORWARD, + INHIBIT, + UNKNOWN + } action; + +public: + template + explicit ConLenControllingFilter(U&& decoratee) + : DecoratedRestfulClient(std::forward(decoratee)), + action(ContentLengthAction::UNKNOWN) { + } + + size_t send_status(const int status, + const char* const status_name) override { + if ((204 == status || 304 == status) && + ! 
g_conf()->rgw_print_prohibited_content_length) { + action = ContentLengthAction::INHIBIT; + } else { + action = ContentLengthAction::FORWARD; + } + + return DecoratedRestfulClient::send_status(status, status_name); + } + + size_t send_content_length(const uint64_t len) override { + switch(action) { + case ContentLengthAction::FORWARD: + return DecoratedRestfulClient::send_content_length(len); + case ContentLengthAction::INHIBIT: + return 0; + case ContentLengthAction::UNKNOWN: + default: + return -EINVAL; + } + } +}; + +template static inline +ConLenControllingFilter add_conlen_controlling(T&& t) { + return ConLenControllingFilter(std::forward(t)); +} + + +/* Filter that rectifies the wrong behaviour of some clients of the RGWRestfulIO + * interface. Should be removed after fixing those clients. */ +template +class ReorderingFilter : public DecoratedRestfulClient { +protected: + enum class ReorderState { + RGW_EARLY_HEADERS, /* Got headers sent before calling send_status. */ + RGW_STATUS_SEEN, /* Status has been seen. */ + RGW_DATA /* Header has been completed. 
*/ + } phase; + + boost::optional content_length; + + std::vector> headers; + + size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) override { + switch (phase) { + case ReorderState::RGW_EARLY_HEADERS: + case ReorderState::RGW_STATUS_SEEN: + headers.emplace_back(std::make_pair(std::string(name.data(), name.size()), + std::string(value.data(), value.size()))); + return 0; + case ReorderState::RGW_DATA: + return DecoratedRestfulClient::send_header(name, value); + } + + return -EIO; + } + +public: + template + explicit ReorderingFilter(U&& decoratee) + : DecoratedRestfulClient(std::forward(decoratee)), + phase(ReorderState::RGW_EARLY_HEADERS) { + } + + size_t send_status(const int status, + const char* const status_name) override { + phase = ReorderState::RGW_STATUS_SEEN; + + return DecoratedRestfulClient::send_status(status, status_name); + } + + size_t send_content_length(const uint64_t len) override { + if (ReorderState::RGW_EARLY_HEADERS == phase) { + /* Oh great, someone tries to send content length before status. */ + content_length = len; + return 0; + } else { + return DecoratedRestfulClient::send_content_length(len); + } + } + + size_t complete_header() override { + size_t sent = 0; + + /* Change state in order to immediately send everything we get. */ + phase = ReorderState::RGW_DATA; + + /* Sent content length if necessary. */ + if (content_length) { + sent += DecoratedRestfulClient::send_content_length(*content_length); + } + + /* Header data in buffers are already counted. 
*/ + for (const auto& kv : headers) { + sent += DecoratedRestfulClient::send_header(kv.first, kv.second); + } + headers.clear(); + + return sent + DecoratedRestfulClient::complete_header(); + } +}; + +template static inline +ReorderingFilter add_reordering(T&& t) { + return ReorderingFilter(std::forward(t)); +} + +} /* namespace io */ +} /* namespace rgw */ +#endif /* CEPH_RGW_CLIENT_IO_DECOIMPL_H */ diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc new file mode 100644 index 00000000..567b80ca --- /dev/null +++ b/src/rgw/rgw_common.cc @@ -0,0 +1,1921 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include + +#include "json_spirit/json_spirit.h" +#include "common/ceph_json.h" + +#include "rgw_op.h" +#include "rgw_common.h" +#include "rgw_acl.h" +#include "rgw_string.h" +#include "rgw_rados.h" +#include "rgw_http_errors.h" +#include "rgw_arn.h" + +#include "common/ceph_crypto.h" +#include "common/armor.h" +#include "common/errno.h" +#include "common/Clock.h" +#include "common/Formatter.h" +#include "common/convenience.h" +#include "common/strtol.h" +#include "include/str_list.h" +#include "rgw_crypt_sanitize.h" + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using rgw::ARN; +using rgw::IAM::Effect; +using rgw::IAM::op_to_perm; +using rgw::IAM::Policy; + +const uint32_t RGWBucketInfo::NUM_SHARDS_BLIND_BUCKET(UINT32_MAX); + +rgw_http_errors rgw_http_s3_errors({ + { 0, {200, "" }}, + { STATUS_CREATED, {201, "Created" }}, + { STATUS_ACCEPTED, {202, "Accepted" }}, + { STATUS_NO_CONTENT, {204, "NoContent" }}, + { STATUS_PARTIAL_CONTENT, {206, "" }}, + { ERR_PERMANENT_REDIRECT, {301, "PermanentRedirect" }}, + { ERR_WEBSITE_REDIRECT, {301, "WebsiteRedirect" }}, + { STATUS_REDIRECT, {303, "" }}, + { ERR_NOT_MODIFIED, {304, "NotModified" }}, + { EINVAL, {400, "InvalidArgument" }}, + { ERR_INVALID_REQUEST, {400, "InvalidRequest" 
}}, + { ERR_INVALID_DIGEST, {400, "InvalidDigest" }}, + { ERR_BAD_DIGEST, {400, "BadDigest" }}, + { ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }}, + { ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }}, + { ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }}, + { ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }}, + { ERR_UNRESOLVABLE_EMAIL, {400, "UnresolvableGrantByEmailAddress" }}, + { ERR_INVALID_PART, {400, "InvalidPart" }}, + { ERR_INVALID_PART_ORDER, {400, "InvalidPartOrder" }}, + { ERR_REQUEST_TIMEOUT, {400, "RequestTimeout" }}, + { ERR_TOO_LARGE, {400, "EntityTooLarge" }}, + { ERR_TOO_SMALL, {400, "EntityTooSmall" }}, + { ERR_TOO_MANY_BUCKETS, {400, "TooManyBuckets" }}, + { ERR_MALFORMED_XML, {400, "MalformedXML" }}, + { ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }}, + { ERR_MALFORMED_DOC, {400, "MalformedPolicyDocument"}}, + { ERR_INVALID_TAG, {400, "InvalidTag"}}, + { ERR_MALFORMED_ACL_ERROR, {400, "MalformedACLError" }}, + { ERR_INVALID_CORS_RULES_ERROR, {400, "InvalidRequest" }}, + { ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR, {400, "InvalidRequest" }}, + { ERR_INVALID_ENCRYPTION_ALGORITHM, {400, "InvalidEncryptionAlgorithmError" }}, + { ERR_INVALID_RETENTION_PERIOD,{400, "InvalidRetentionPeriod"}}, + { ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }}, + { EACCES, {403, "AccessDenied" }}, + { EPERM, {403, "AccessDenied" }}, + { ERR_SIGNATURE_NO_MATCH, {403, "SignatureDoesNotMatch" }}, + { ERR_INVALID_ACCESS_KEY, {403, "InvalidAccessKeyId" }}, + { ERR_USER_SUSPENDED, {403, "UserSuspended" }}, + { ERR_REQUEST_TIME_SKEWED, {403, "RequestTimeTooSkewed" }}, + { ERR_QUOTA_EXCEEDED, {403, "QuotaExceeded" }}, + { ERR_MFA_REQUIRED, {403, "AccessDenied" }}, + { ENOENT, {404, "NoSuchKey" }}, + { ERR_NO_SUCH_BUCKET, {404, "NoSuchBucket" }}, + { ERR_NO_SUCH_WEBSITE_CONFIGURATION, {404, "NoSuchWebsiteConfiguration" }}, + { ERR_NO_SUCH_UPLOAD, {404, 
"NoSuchUpload" }}, + { ERR_NOT_FOUND, {404, "Not Found"}}, + { ERR_NO_SUCH_LC, {404, "NoSuchLifecycleConfiguration"}}, + { ERR_NO_SUCH_BUCKET_POLICY, {404, "NoSuchBucketPolicy"}}, + { ERR_NO_SUCH_USER, {404, "NoSuchUser"}}, + { ERR_NO_ROLE_FOUND, {404, "NoSuchEntity"}}, + { ERR_NO_CORS_FOUND, {404, "NoSuchCORSConfiguration"}}, + { ERR_NO_SUCH_SUBUSER, {404, "NoSuchSubUser"}}, + { ERR_NO_SUCH_ENTITY, {404, "NoSuchEntity"}}, + { ERR_NO_SUCH_CORS_CONFIGURATION, {404, "NoSuchCORSConfiguration"}}, + { ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION, {404, "ObjectLockConfigurationNotFoundError"}}, + { ERR_METHOD_NOT_ALLOWED, {405, "MethodNotAllowed" }}, + { ETIMEDOUT, {408, "RequestTimeout" }}, + { EEXIST, {409, "BucketAlreadyExists" }}, + { ERR_USER_EXIST, {409, "UserAlreadyExists" }}, + { ERR_EMAIL_EXIST, {409, "EmailExists" }}, + { ERR_KEY_EXIST, {409, "KeyExists"}}, + { ERR_TAG_CONFLICT, {409, "OperationAborted"}}, + { ERR_POSITION_NOT_EQUAL_TO_LENGTH, {409, "PositionNotEqualToLength"}}, + { ERR_OBJECT_NOT_APPENDABLE, {409, "ObjectNotAppendable"}}, + { ERR_INVALID_BUCKET_STATE, {409, "InvalidBucketState"}}, + { ERR_INVALID_SECRET_KEY, {400, "InvalidSecretKey"}}, + { ERR_INVALID_KEY_TYPE, {400, "InvalidKeyType"}}, + { ERR_INVALID_CAP, {400, "InvalidCapability"}}, + { ERR_INVALID_TENANT_NAME, {400, "InvalidTenantName" }}, + { ENOTEMPTY, {409, "BucketNotEmpty" }}, + { ERR_PRECONDITION_FAILED, {412, "PreconditionFailed" }}, + { ERANGE, {416, "InvalidRange" }}, + { ERR_UNPROCESSABLE_ENTITY, {422, "UnprocessableEntity" }}, + { ERR_LOCKED, {423, "Locked" }}, + { ERR_INTERNAL_ERROR, {500, "InternalError" }}, + { ERR_NOT_IMPLEMENTED, {501, "NotImplemented" }}, + { ERR_SERVICE_UNAVAILABLE, {503, "ServiceUnavailable"}}, + { ERR_RATE_LIMITED, {503, "SlowDown"}}, + { ERR_ZERO_IN_URL, {400, "InvalidRequest" }}, +}); + +rgw_http_errors rgw_http_swift_errors({ + { EACCES, {403, "AccessDenied" }}, + { EPERM, {401, "AccessDenied" }}, + { ENAMETOOLONG, {400, "Metadata name too long" }}, + { 
ERR_USER_SUSPENDED, {401, "UserSuspended" }}, + { ERR_INVALID_UTF8, {412, "Invalid UTF8" }}, + { ERR_BAD_URL, {412, "Bad URL" }}, + { ERR_NOT_SLO_MANIFEST, {400, "Not an SLO manifest" }}, + { ERR_QUOTA_EXCEEDED, {413, "QuotaExceeded" }}, + { ENOTEMPTY, {409, "There was a conflict when trying " + "to complete your request." }}, + /* FIXME(rzarzynski): we need to find a way to apply Swift's error handling + * procedures also for ERR_ZERO_IN_URL. This make a problem as the validation + * is performed very early, even before setting the req_state::proto_flags. */ + { ERR_ZERO_IN_URL, {412, "Invalid UTF8 or contains NULL"}}, + { ERR_RATE_LIMITED, {498, "Rate Limited"}}, +}); + +rgw_http_errors rgw_http_sts_errors({ + { ERR_PACKED_POLICY_TOO_LARGE, {400, "PackedPolicyTooLarge" }}, + { ERR_INVALID_IDENTITY_TOKEN, {400, "InvalidIdentityToken" }}, +}); + +rgw_http_errors rgw_http_iam_errors({ + { ERR_ROLE_EXISTS, {409, "EntityAlreadyExists"}}, + { ERR_DELETE_CONFLICT, {409, "DeleteConflict"}}, +}); + +using namespace ceph::crypto; + +rgw_err:: +rgw_err() +{ + clear(); +} + +void rgw_err:: +clear() +{ + http_ret = 200; + ret = 0; + err_code.clear(); +} + +bool rgw_err:: +is_clear() const +{ + return (http_ret == 200); +} + +bool rgw_err:: +is_err() const +{ + return !(http_ret >= 200 && http_ret <= 399); +} + +// The requestURI transferred from the frontend can be abs_path or absoluteURI +// If it is absoluteURI, we should adjust it to abs_path for the following +// S3 authorization and some other processes depending on the requestURI +// The absoluteURI can start with "http://", "https://", "ws://" or "wss://" +static string get_abs_path(const string& request_uri) { + const static string ABS_PREFIXS[] = {"http://", "https://", "ws://", "wss://"}; + bool isAbs = false; + for (int i = 0; i < 4; ++i) { + if (boost::algorithm::starts_with(request_uri, ABS_PREFIXS[i])) { + isAbs = true; + break; + } + } + if (!isAbs) { // it is not a valid absolute uri + return request_uri; + } 
+ size_t beg_pos = request_uri.find("://") + 3; + size_t len = request_uri.size(); + beg_pos = request_uri.find('/', beg_pos); + if (beg_pos == string::npos) return request_uri; + return request_uri.substr(beg_pos, len - beg_pos); +} + +req_info::req_info(CephContext *cct, const class RGWEnv *env) : env(env) { + method = env->get("REQUEST_METHOD", ""); + script_uri = env->get("SCRIPT_URI", cct->_conf->rgw_script_uri.c_str()); + request_uri = env->get("REQUEST_URI", cct->_conf->rgw_request_uri.c_str()); + if (request_uri[0] != '/') { + request_uri = get_abs_path(request_uri); + } + auto pos = request_uri.find('?'); + if (pos != string::npos) { + request_params = request_uri.substr(pos + 1); + request_uri = request_uri.substr(0, pos); + } else { + request_params = env->get("QUERY_STRING", ""); + } + host = env->get("HTTP_HOST", ""); + + // strip off any trailing :port from host (added by CrossFTP and maybe others) + size_t colon_offset = host.find_last_of(':'); + if (colon_offset != string::npos) { + bool all_digits = true; + for (unsigned i = colon_offset + 1; i < host.size(); ++i) { + if (!isdigit(host[i])) { + all_digits = false; + break; + } + } + if (all_digits) { + host.resize(colon_offset); + } + } +} + +void req_info::rebuild_from(req_info& src) +{ + method = src.method; + script_uri = src.script_uri; + args = src.args; + if (src.effective_uri.empty()) { + request_uri = src.request_uri; + } else { + request_uri = src.effective_uri; + } + effective_uri.clear(); + host = src.host; + + x_meta_map = src.x_meta_map; + x_meta_map.erase("x-amz-date"); +} + + +req_state::req_state(CephContext* _cct, RGWEnv* e, RGWUserInfo* u, uint64_t id) + : cct(_cct), user(u), + info(_cct, e), id(id) +{ + enable_ops_log = e->get_enable_ops_log(); + enable_usage_log = e->get_enable_usage_log(); + defer_to_bucket_acls = e->get_defer_to_bucket_acls(); + + time = Clock::now(); +} + +req_state::~req_state() { + delete formatter; +} + +std::ostream& req_state::gen_prefix(std::ostream& 
out) const +{ + auto p = out.precision(); + return out << "req " << id << ' ' + << std::setprecision(3) << std::fixed << time_elapsed() // '0.123s' + << std::setprecision(p) << std::defaultfloat << ' '; +} + +bool search_err(rgw_http_errors& errs, int err_no, int& http_ret, string& code) +{ + auto r = errs.find(err_no); + if (r != errs.end()) { + http_ret = r->second.first; + code = r->second.second; + return true; + } + return false; +} + +void set_req_state_err(struct rgw_err& err, /* out */ + int err_no, /* in */ + const int prot_flags) /* in */ +{ + if (err_no < 0) + err_no = -err_no; + + err.ret = -err_no; + + if (prot_flags & RGW_REST_SWIFT) { + if (search_err(rgw_http_swift_errors, err_no, err.http_ret, err.err_code)) + return; + } + + if (prot_flags & RGW_REST_STS) { + if (search_err(rgw_http_sts_errors, err_no, err.http_ret, err.err_code)) + return; + } + + if (prot_flags & RGW_REST_IAM) { + if (search_err(rgw_http_iam_errors, err_no, err.http_ret, err.err_code)) + return; + } + + //Default to searching in s3 errors + if (search_err(rgw_http_s3_errors, err_no, err.http_ret, err.err_code)) + return; + dout(0) << "WARNING: set_req_state_err err_no=" << err_no + << " resorting to 500" << dendl; + + err.http_ret = 500; + err.err_code = "UnknownError"; +} + +void set_req_state_err(struct req_state* s, int err_no, const string& err_msg) +{ + if (s) { + set_req_state_err(s, err_no); + if (s->prot_flags & RGW_REST_SWIFT && !err_msg.empty()) { + /* TODO(rzarzynski): there never ever should be a check like this one. + * It's here only for the sake of the patch's backportability. Further + * commits will move the logic to a per-RGWHandler replacement of + * the end_header() function. Alternativaly, we might consider making + * that just for the dump(). Please take a look on @cbodley's comments + * in PR #10690 (https://github.com/ceph/ceph/pull/10690). 
*/ + s->err.err_code = err_msg; + } else { + s->err.message = err_msg; + } + } +} + +void set_req_state_err(struct req_state* s, int err_no) +{ + if (s) { + set_req_state_err(s->err, err_no, s->prot_flags); + } +} + +void dump(struct req_state* s) +{ + if (s->format != RGW_FORMAT_HTML) + s->formatter->open_object_section("Error"); + if (!s->err.err_code.empty()) + s->formatter->dump_string("Code", s->err.err_code); + if (!s->err.message.empty()) + s->formatter->dump_string("Message", s->err.message); + if (!s->bucket_name.empty()) // TODO: connect to expose_bucket + s->formatter->dump_string("BucketName", s->bucket_name); + if (!s->trans_id.empty()) // TODO: connect to expose_bucket or another toggle + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->dump_string("HostId", s->host_id); + if (s->format != RGW_FORMAT_HTML) + s->formatter->close_section(); +} + +struct str_len { + const char *str; + int len; +}; + +#define STR_LEN_ENTRY(s) { s, sizeof(s) - 1 } + +struct str_len meta_prefixes[] = { STR_LEN_ENTRY("HTTP_X_AMZ"), + STR_LEN_ENTRY("HTTP_X_GOOG"), + STR_LEN_ENTRY("HTTP_X_DHO"), + STR_LEN_ENTRY("HTTP_X_RGW"), + STR_LEN_ENTRY("HTTP_X_OBJECT"), + STR_LEN_ENTRY("HTTP_X_CONTAINER"), + STR_LEN_ENTRY("HTTP_X_ACCOUNT"), + {NULL, 0} }; + +void req_info::init_meta_info(bool *found_bad_meta) +{ + x_meta_map.clear(); + + for (const auto& kv: env->get_map()) { + const char *prefix; + const string& header_name = kv.first; + const string& val = kv.second; + for (int prefix_num = 0; (prefix = meta_prefixes[prefix_num].str) != NULL; prefix_num++) { + int len = meta_prefixes[prefix_num].len; + const char *p = header_name.c_str(); + if (strncmp(p, prefix, len) == 0) { + dout(10) << "meta>> " << p << dendl; + const char *name = p+len; /* skip the prefix */ + int name_len = header_name.size() - len; + + if (found_bad_meta && strncmp(name, "_META_", name_len) == 0) + *found_bad_meta = true; + + char name_low[meta_prefixes[0].len + name_len + 1]; + 
snprintf(name_low, meta_prefixes[0].len - 5 + name_len + 1, "%s%s", meta_prefixes[0].str + 5 /* skip HTTP_ */, name); // normalize meta prefix + int j; + for (j = 0; name_low[j]; j++) { + if (name_low[j] != '_') + name_low[j] = tolower(name_low[j]); + else + name_low[j] = '-'; + } + name_low[j] = 0; + + auto it = x_meta_map.find(name_low); + if (it != x_meta_map.end()) { + string old = it->second; + boost::algorithm::trim_right(old); + old.append(","); + old.append(val); + x_meta_map[name_low] = old; + } else { + x_meta_map[name_low] = val; + } + } + } + } + for (const auto& kv: x_meta_map) { + dout(10) << "x>> " << kv.first << ":" << rgw::crypt_sanitize::x_meta_map{kv.first, kv.second} << dendl; + } +} + +std::ostream& operator<<(std::ostream& oss, const rgw_err &err) +{ + oss << "rgw_err(http_ret=" << err.http_ret << ", err_code='" << err.err_code << "') "; + return oss; +} + +string rgw_string_unquote(const string& s) +{ + if (s[0] != '"' || s.size() < 2) + return s; + + int len; + for (len = s.size(); len > 2; --len) { + if (s[len - 1] != ' ') + break; + } + + if (s[len-1] != '"') + return s; + + return s.substr(1, len - 2); +} + +static bool check_str_end(const char *s) +{ + if (!s) + return false; + + while (*s) { + if (!isspace(*s)) + return false; + s++; + } + return true; +} + +static bool check_gmt_end(const char *s) +{ + if (!s || !*s) + return false; + + while (isspace(*s)) { + ++s; + } + + /* check for correct timezone */ + if ((strncmp(s, "GMT", 3) != 0) && + (strncmp(s, "UTC", 3) != 0)) { + return false; + } + + return true; +} + +static bool parse_rfc850(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + return check_gmt_end(strptime(s, "%A, %d-%b-%y %H:%M:%S ", t)); +} + +static bool parse_asctime(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. 
+ memset(t, 0, sizeof(*t)); + return check_str_end(strptime(s, "%a %b %d %H:%M:%S %Y", t)); +} + +static bool parse_rfc1123(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + return check_gmt_end(strptime(s, "%a, %d %b %Y %H:%M:%S ", t)); +} + +static bool parse_rfc1123_alt(const char *s, struct tm *t) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + return check_str_end(strptime(s, "%a, %d %b %Y %H:%M:%S %z", t)); +} + +bool parse_rfc2616(const char *s, struct tm *t) +{ + return parse_rfc850(s, t) || parse_asctime(s, t) || parse_rfc1123(s, t) || parse_rfc1123_alt(s,t); +} + +bool parse_iso8601(const char *s, struct tm *t, uint32_t *pns, bool extended_format) +{ + // FIPS zeroization audit 20191115: this memset is not security related. + memset(t, 0, sizeof(*t)); + const char *p; + + if (!s) + s = ""; + + if (extended_format) { + p = strptime(s, "%Y-%m-%dT%T", t); + if (!p) { + p = strptime(s, "%Y-%m-%d %T", t); + } + } else { + p = strptime(s, "%Y%m%dT%H%M%S", t); + } + if (!p) { + dout(0) << "parse_iso8601 failed" << dendl; + return false; + } + const boost::string_view str = rgw_trim_whitespace(boost::string_view(p)); + int len = str.size(); + + if (len == 0 || (len == 1 && str[0] == 'Z')) + return true; + + if (str[0] != '.' 
|| + str[len - 1] != 'Z') + return false; + + uint32_t ms; + boost::string_view nsstr = str.substr(1, len - 2); + int r = stringtoul(nsstr.to_string(), &ms); + if (r < 0) + return false; + + if (!pns) { + return true; + } + + if (nsstr.size() > 9) { + nsstr = nsstr.substr(0, 9); + } + + uint64_t mul_table[] = { 0, + 100000000LL, + 10000000LL, + 1000000LL, + 100000LL, + 10000LL, + 1000LL, + 100LL, + 10LL, + 1 }; + + + *pns = ms * mul_table[nsstr.size()]; + + return true; +} + +int parse_key_value(string& in_str, const char *delim, string& key, string& val) +{ + if (delim == NULL) + return -EINVAL; + + auto pos = in_str.find(delim); + if (pos == string::npos) + return -EINVAL; + + key = rgw_trim_whitespace(in_str.substr(0, pos)); + val = rgw_trim_whitespace(in_str.substr(pos + 1)); + + return 0; +} + +int parse_key_value(string& in_str, string& key, string& val) +{ + return parse_key_value(in_str, "=", key,val); +} + +boost::optional> +parse_key_value(const boost::string_view& in_str, + const boost::string_view& delim) +{ + const size_t pos = in_str.find(delim); + if (pos == boost::string_view::npos) { + return boost::none; + } + + const auto key = rgw_trim_whitespace(in_str.substr(0, pos)); + const auto val = rgw_trim_whitespace(in_str.substr(pos + 1)); + + return std::make_pair(key, val); +} + +boost::optional> +parse_key_value(const boost::string_view& in_str) +{ + return parse_key_value(in_str, "="); +} + +int parse_time(const char *time_str, real_time *time) +{ + struct tm tm; + uint32_t ns = 0; + + if (!parse_rfc2616(time_str, &tm) && !parse_iso8601(time_str, &tm, &ns)) { + return -EINVAL; + } + + time_t sec = internal_timegm(&tm); + *time = utime_t(sec, ns).to_real_time(); + + return 0; +} + +#define TIME_BUF_SIZE 128 + +void rgw_to_iso8601(const real_time& t, char *dest, int buf_size) +{ + utime_t ut(t); + + char buf[TIME_BUF_SIZE]; + struct tm result; + time_t epoch = ut.sec(); + struct tm *tmp = gmtime_r(&epoch, &result); + if (tmp == NULL) + return; + + if 
(strftime(buf, sizeof(buf), "%Y-%m-%dT%T", tmp) == 0) + return; + + snprintf(dest, buf_size, "%s.%03dZ", buf, (int)(ut.usec() / 1000)); +} + +void rgw_to_iso8601(const real_time& t, string *dest) +{ + char buf[TIME_BUF_SIZE]; + rgw_to_iso8601(t, buf, sizeof(buf)); + *dest = buf; +} + + +string rgw_to_asctime(const utime_t& t) +{ + stringstream s; + t.asctime(s); + return s.str(); +} + +/* + * calculate the sha1 value of a given msg and key + */ +void calc_hmac_sha1(const char *key, int key_len, + const char *msg, int msg_len, char *dest) +/* destination should be CEPH_CRYPTO_HMACSHA1_DIGESTSIZE bytes long */ +{ + HMACSHA1 hmac((const unsigned char *)key, key_len); + hmac.Update((const unsigned char *)msg, msg_len); + hmac.Final((unsigned char *)dest); +} + +/* + * calculate the sha256 value of a given msg and key + */ +void calc_hmac_sha256(const char *key, int key_len, + const char *msg, int msg_len, char *dest) +{ + char hash_sha256[CEPH_CRYPTO_HMACSHA256_DIGESTSIZE]; + + HMACSHA256 hmac((const unsigned char *)key, key_len); + hmac.Update((const unsigned char *)msg, msg_len); + hmac.Final((unsigned char *)hash_sha256); + + memcpy(dest, hash_sha256, CEPH_CRYPTO_HMACSHA256_DIGESTSIZE); +} + +using ceph::crypto::SHA256; + +/* + * calculate the sha256 hash value of a given msg + */ +sha256_digest_t calc_hash_sha256(const boost::string_view& msg) +{ + sha256_digest_t hash; + + SHA256 hasher; + hasher.Update(reinterpret_cast(msg.data()), msg.size()); + hasher.Final(hash.v); + + return hash; +} + +SHA256* calc_hash_sha256_open_stream() +{ + return new SHA256; +} + +void calc_hash_sha256_update_stream(SHA256 *hash, const char *msg, int len) +{ + hash->Update((const unsigned char *)msg, len); +} + +string calc_hash_sha256_close_stream(SHA256 **phash) +{ + SHA256 *hash = *phash; + if (!hash) { + hash = calc_hash_sha256_open_stream(); + } + char hash_sha256[CEPH_CRYPTO_HMACSHA256_DIGESTSIZE]; + + hash->Final((unsigned char *)hash_sha256); + + char 
hex_str[(CEPH_CRYPTO_SHA256_DIGESTSIZE * 2) + 1]; + buf_to_hex((unsigned char *)hash_sha256, CEPH_CRYPTO_SHA256_DIGESTSIZE, hex_str); + + delete hash; + *phash = NULL; + + return std::string(hex_str); +} + +std::string calc_hash_sha256_restart_stream(SHA256 **phash) +{ + const auto hash = calc_hash_sha256_close_stream(phash); + *phash = calc_hash_sha256_open_stream(); + + return hash; +} + +int NameVal::parse() +{ + auto delim_pos = str.find('='); + int ret = 0; + + if (delim_pos == string::npos) { + name = str; + val = ""; + ret = 1; + } else { + name = str.substr(0, delim_pos); + val = str.substr(delim_pos + 1); + } + + return ret; +} + +int RGWHTTPArgs::parse() +{ + int pos = 0; + bool end = false; + + if (str.empty()) + return 0; + + if (str[pos] == '?') + pos++; + + while (!end) { + int fpos = str.find('&', pos); + if (fpos < pos) { + end = true; + fpos = str.size(); + } + std::string nameval = url_decode(str.substr(pos, fpos - pos), true); + NameVal nv(std::move(nameval)); + int ret = nv.parse(); + if (ret >= 0) { + string& name = nv.get_name(); + string& val = nv.get_val(); + + append(name, val); + } + + pos = fpos + 1; + } + + return 0; +} + +void RGWHTTPArgs::append(const string& name, const string& val) +{ + if (name.compare(0, sizeof(RGW_SYS_PARAM_PREFIX) - 1, RGW_SYS_PARAM_PREFIX) == 0) { + sys_val_map[name] = val; + } else { + val_map[name] = val; + } + + if ((name.compare("acl") == 0) || + (name.compare("cors") == 0) || + (name.compare("notification") == 0) || + (name.compare("location") == 0) || + (name.compare("logging") == 0) || + (name.compare("usage") == 0) || + (name.compare("lifecycle") == 0) || + (name.compare("delete") == 0) || + (name.compare("uploads") == 0) || + (name.compare("partNumber") == 0) || + (name.compare("uploadId") == 0) || + (name.compare("versionId") == 0) || + (name.compare("start-date") == 0) || + (name.compare("end-date") == 0) || + (name.compare("versions") == 0) || + (name.compare("versioning") == 0) || + 
(name.compare("website") == 0) || + (name.compare("requestPayment") == 0) || + (name.compare("torrent") == 0) || + (name.compare("tagging") == 0) || + (name.compare("append") == 0) || + (name.compare("position") == 0)) { + sub_resources[name] = val; + } else if (name[0] == 'r') { // root of all evil + if ((name.compare("response-content-type") == 0) || + (name.compare("response-content-language") == 0) || + (name.compare("response-expires") == 0) || + (name.compare("response-cache-control") == 0) || + (name.compare("response-content-disposition") == 0) || + (name.compare("response-content-encoding") == 0)) { + sub_resources[name] = val; + has_resp_modifier = true; + } + } else if ((name.compare("subuser") == 0) || + (name.compare("key") == 0) || + (name.compare("caps") == 0) || + (name.compare("index") == 0) || + (name.compare("policy") == 0) || + (name.compare("quota") == 0) || + (name.compare("list") == 0) || + (name.compare("object") == 0)) { + + if (!admin_subresource_added) { + sub_resources[name] = ""; + admin_subresource_added = true; + } + } +} + +const string& RGWHTTPArgs::get(const string& name, bool *exists) const +{ + auto iter = val_map.find(name); + bool e = (iter != std::end(val_map)); + if (exists) + *exists = e; + if (e) + return iter->second; + return empty_str; +} + +boost::optional +RGWHTTPArgs::get_optional(const std::string& name) const +{ + bool exists; + const std::string& value = get(name, &exists); + if (exists) { + return value; + } else { + return boost::none; + } +} + +int RGWHTTPArgs::get_bool(const string& name, bool *val, bool *exists) +{ + map::iterator iter; + iter = val_map.find(name); + bool e = (iter != val_map.end()); + if (exists) + *exists = e; + + if (e) { + const char *s = iter->second.c_str(); + + if (strcasecmp(s, "false") == 0) { + *val = false; + } else if (strcasecmp(s, "true") == 0) { + *val = true; + } else { + return -EINVAL; + } + } + + return 0; +} + +int RGWHTTPArgs::get_bool(const char *name, bool *val, bool 
*exists) +{ + string s(name); + return get_bool(s, val, exists); +} + +void RGWHTTPArgs::get_bool(const char *name, bool *val, bool def_val) +{ + bool exists = false; + if ((get_bool(name, val, &exists) < 0) || + !exists) { + *val = def_val; + } +} + +int RGWHTTPArgs::get_int(const char *name, int *val, int def_val) +{ + bool exists = false; + string val_str; + val_str = get(name, &exists); + if (!exists) { + *val = def_val; + return 0; + } + + string err; + + *val = (int)strict_strtol(val_str.c_str(), 10, &err); + if (!err.empty()) { + *val = def_val; + return -EINVAL; + } + return 0; +} + +string RGWHTTPArgs::sys_get(const string& name, bool * const exists) const +{ + const auto iter = sys_val_map.find(name); + const bool e = (iter != sys_val_map.end()); + + if (exists) { + *exists = e; + } + + return e ? iter->second : string(); +} + +bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env) +{ + const auto& m = env.get_map(); + // frontend connected with ssl + if (m.count("SERVER_PORT_SECURE")) { + return true; + } + // ignore proxy headers unless explicitly enabled + if (!cct->_conf->rgw_trust_forwarded_https) { + return false; + } + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded + // Forwarded: by=; for=; host=; proto= + auto i = m.find("HTTP_FORWARDED"); + if (i != m.end() && i->second.find("proto=https") != std::string::npos) { + return true; + } + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto + i = m.find("HTTP_X_FORWARDED_PROTO"); + if (i != m.end() && i->second == "https") { + return true; + } + return false; +} + +namespace { +Effect eval_or_pass(const boost::optional& policy, + const rgw::IAM::Environment& env, + boost::optional id, + const uint64_t op, + const ARN& arn) { + if (!policy) + return Effect::Pass; + else + return policy->eval(env, id, op, arn); +} + +} + +Effect eval_user_policies(const vector& user_policies, + const rgw::IAM::Environment& env, + boost::optional id, + const 
uint64_t op, + const ARN& arn) { + auto usr_policy_res = Effect::Pass, prev_res = Effect::Pass; + for (auto& user_policy : user_policies) { + if (usr_policy_res = eval_or_pass(user_policy, env, id, op, arn); usr_policy_res == Effect::Deny) + return usr_policy_res; + else if (usr_policy_res == Effect::Allow) + prev_res = Effect::Allow; + else if (usr_policy_res == Effect::Pass && prev_res == Effect::Allow) + usr_policy_res = Effect::Allow; + } + return usr_policy_res; +} + +bool verify_user_permission(const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + const vector& user_policies, + const rgw::ARN& res, + const uint64_t op) +{ + auto usr_policy_res = eval_user_policies(user_policies, s->env, boost::none, op, res); + if (usr_policy_res == Effect::Deny) { + return false; + } + + if (usr_policy_res == Effect::Allow) { + return true; + } + + if (op == rgw::IAM::s3CreateBucket || op == rgw::IAM::s3ListAllMyBuckets) { + auto perm = op_to_perm(op); + + return verify_user_permission_no_policy(dpp, s, user_acl, perm); + } + + return false; +} + +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + const int perm) +{ + if (s->auth.identity->get_identity_type() == TYPE_ROLE) + return false; + + /* S3 doesn't support account ACLs. 
*/ + if (!user_acl) + return true; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + return user_acl->verify_permission(dpp, *s->auth.identity, perm, perm); +} + +bool verify_user_permission(const DoutPrefixProvider* dpp, + struct req_state * const s, + const rgw::ARN& res, + const uint64_t op) +{ + return verify_user_permission(dpp, s, s->user_acl.get(), s->iam_user_policies, res, op); +} + +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + struct req_state * const s, + const int perm) +{ + return verify_user_permission_no_policy(dpp, s, s->user_acl.get(), perm); +} + +bool verify_requester_payer_permission(struct req_state *s) +{ + if (!s->bucket_info.requester_pays) + return true; + + if (s->auth.identity->is_owner_of(s->bucket_info.owner)) + return true; + + if (s->auth.identity->is_anonymous()) { + return false; + } + + const char *request_payer = s->info.env->get("HTTP_X_AMZ_REQUEST_PAYER"); + if (!request_payer) { + bool exists; + request_payer = s->info.args.get("x-amz-request-payer", &exists).c_str(); + if (!exists) { + return false; + } + } + + if (strcasecmp(request_payer, "requester") == 0) { + return true; + } + + return false; +} + +bool verify_bucket_permission(const DoutPrefixProvider* dpp, + struct req_state * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const vector& user_policies, + const uint64_t op) +{ + if (!verify_requester_payer_permission(s)) + return false; + + auto usr_policy_res = eval_user_policies(user_policies, s->env, boost::none, op, ARN(bucket)); + if (usr_policy_res == Effect::Deny) + return false; + + auto r = eval_or_pass(bucket_policy, s->env, *s->auth.identity, + op, ARN(bucket)); + if (r == Effect::Allow) + // It looks like S3 ACLs only GRANT permissions rather than + // denying them, so this should be safe. 
+ return true; + else if (r == Effect::Deny) + return false; + else if (usr_policy_res == Effect::Allow) // r is Effect::Pass at this point + return true; + + const auto perm = op_to_perm(op); + + return verify_bucket_permission_no_policy(dpp, s, user_acl, bucket_acl, perm); +} + +bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const int perm) +{ + if (!bucket_acl) + return false; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + if (bucket_acl->verify_permission(dpp, *s->auth.identity, perm, perm, + s->info.env->get("HTTP_REFERER"))) + return true; + + if (!user_acl) + return false; + + return user_acl->verify_permission(dpp, *s->auth.identity, perm, perm); +} + +bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, struct req_state * const s, const int perm) +{ + if (!verify_requester_payer_permission(s)) + return false; + + return verify_bucket_permission_no_policy(dpp, + s, + s->user_acl.get(), + s->bucket_acl.get(), + perm); +} + +bool verify_bucket_permission(const DoutPrefixProvider* dpp, struct req_state * const s, const uint64_t op) +{ + return verify_bucket_permission(dpp, + s, + s->bucket, + s->user_acl.get(), + s->bucket_acl.get(), + s->iam_policy, + s->iam_user_policies, + op); +} + +// Authorize anyone permitted by the policy and the bucket owner +// unless explicitly denied by the policy. 
+ +int verify_bucket_owner_or_policy(struct req_state* const s, + const uint64_t op) +{ + auto e = eval_or_pass(s->iam_policy, + s->env, *s->auth.identity, + op, ARN(s->bucket)); + if (e == Effect::Allow || + (e == Effect::Pass && + s->auth.identity->is_owner_of(s->bucket_owner.get_id()))) { + return 0; + } else { + return -EACCES; + } +} + + +static inline bool check_deferred_bucket_perms(const DoutPrefixProvider* dpp, + struct req_state * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const vector& user_policies, + const uint8_t deferred_check, + const uint64_t op) +{ + return (s->defer_to_bucket_acls == deferred_check \ + && verify_bucket_permission(dpp, s, bucket, user_acl, bucket_acl, bucket_policy, user_policies,op)); +} + +static inline bool check_deferred_bucket_only_acl(const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const uint8_t deferred_check, + const int perm) +{ + return (s->defer_to_bucket_acls == deferred_check \ + && verify_bucket_permission_no_policy(dpp, s, user_acl, bucket_acl, perm)); +} + +bool verify_object_permission(const DoutPrefixProvider* dpp, struct req_state * const s, + const rgw_obj& obj, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const boost::optional& bucket_policy, + const vector& user_policies, + const uint64_t op) +{ + if (!verify_requester_payer_permission(s)) + return false; + + auto usr_policy_res = eval_user_policies(user_policies, s->env, boost::none, op, ARN(obj)); + if (usr_policy_res == Effect::Deny) + return false; + + auto r = eval_or_pass(bucket_policy, s->env, *s->auth.identity, op, ARN(obj)); + if (r == Effect::Allow) + // It looks like S3 ACLs only GRANT permissions rather than + // denying them, so this 
should be safe. + return true; + else if (r == Effect::Deny) + return false; + else if (usr_policy_res == Effect::Allow) + return true; + + const auto perm = op_to_perm(op); + + if (check_deferred_bucket_perms(dpp, s, obj.bucket, user_acl, bucket_acl, bucket_policy, + user_policies, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, op) || + check_deferred_bucket_perms(dpp, s, obj.bucket, user_acl, bucket_acl, bucket_policy, + user_policies, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, rgw::IAM::s3All)) { + return true; + } + + if (!object_acl) { + return false; + } + + bool ret = object_acl->verify_permission(dpp, *s->auth.identity, s->perm_mask, perm); + if (ret) { + return true; + } + + if (!s->cct->_conf->rgw_enforce_swift_acls) + return ret; + + if ((perm & (int)s->perm_mask) != perm) + return false; + + int swift_perm = 0; + if (perm & (RGW_PERM_READ | RGW_PERM_READ_ACP)) + swift_perm |= RGW_PERM_READ_OBJS; + if (perm & RGW_PERM_WRITE) + swift_perm |= RGW_PERM_WRITE_OBJS; + + if (!swift_perm) + return false; + + /* we already verified the user mask above, so we pass swift_perm as the mask here, + otherwise the mask might not cover the swift permissions bits */ + if (bucket_acl->verify_permission(dpp, *s->auth.identity, swift_perm, swift_perm, + s->info.env->get("HTTP_REFERER"))) + return true; + + if (!user_acl) + return false; + + return user_acl->verify_permission(dpp, *s->auth.identity, swift_perm, swift_perm); +} + +bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const int perm) +{ + if (check_deferred_bucket_only_acl(dpp, s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) || + check_deferred_bucket_only_acl(dpp, s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) { + return true; + } + + if (!object_acl) { + return false; + } + + bool ret = 
/*
 * Lookup table mapping a byte to the value of the hexadecimal digit it
 * represents ('0'-'9', 'A'-'F', 'a'-'f' -> 0..15). Every other byte maps
 * to the filler value the table is memset with (-1 stored into a char).
 */
class HexTable
{
  char table[256];

public:
  HexTable() {
    // FIPS zeroization audit 20191115: this memset is not security related.
    memset(table, -1, sizeof(table));
    int i;
    for (i = '0'; i<='9'; i++)
      table[i] = i - '0';
    for (i = 'A'; i<='F'; i++)
      table[i] = i - 'A' + 0xa;
    for (i = 'a'; i<='f'; i++)
      table[i] = i - 'a' + 0xa;
  }

  /*
   * Return the digit value of `c`, or the filler value if `c` is not a
   * hex digit. The index goes through unsigned char: with a signed plain
   * char, bytes >= 0x80 would otherwise become negative indices and read
   * in front of the table.
   */
  char to_num(char c) const {
    return table[static_cast<unsigned char>(c)];
  }
};
/*
 * Return a copy of `src` without leading and trailing whitespace (as
 * classified by isspace()). An empty or all-whitespace input yields an
 * empty string; interior whitespace is kept.
 */
string rgw_trim_whitespace(const string& src)
{
  /* isspace() is only defined for values representable as unsigned char,
   * so bytes >= 0x80 must not be passed as a (possibly negative) char. */
  const auto is_ws = [](char c) {
    return isspace(static_cast<unsigned char>(c)) != 0;
  };

  const string::size_type n = src.size();

  string::size_type first = 0;
  while (first < n && is_ws(src[first]))
    ++first;

  if (first == n)
    return string();

  /* src[first] is not whitespace, so this loop stops at `first` at the latest */
  string::size_type last = n - 1;
  while (last > first && is_ws(src[last]))
    --last;

  return src.substr(first, last - first + 1);
}
string& cap, string& type, uint32_t *pperm) +{ + int pos = cap.find('='); + if (pos >= 0) { + type = rgw_trim_whitespace(cap.substr(0, pos)); + } + + if (!is_valid_cap_type(type)) + return -ERR_INVALID_CAP; + + string cap_perm; + uint32_t perm = 0; + if (pos < (int)cap.size() - 1) { + cap_perm = cap.substr(pos + 1); + int r = RGWUserCaps::parse_cap_perm(cap_perm, &perm); + if (r < 0) + return r; + } + + *pperm = perm; + + return 0; +} + +int RGWUserCaps::add_cap(const string& cap) +{ + uint32_t perm; + string type; + + int r = get_cap(cap, type, &perm); + if (r < 0) + return r; + + caps[type] |= perm; + + return 0; +} + +int RGWUserCaps::remove_cap(const string& cap) +{ + uint32_t perm; + string type; + + int r = get_cap(cap, type, &perm); + if (r < 0) + return r; + + map::iterator iter = caps.find(type); + if (iter == caps.end()) + return 0; + + uint32_t& old_perm = iter->second; + old_perm &= ~perm; + if (!old_perm) + caps.erase(iter); + + return 0; +} + +int RGWUserCaps::add_from_string(const string& str) +{ + int start = 0; + do { + auto end = str.find(';', start); + if (end == string::npos) + end = str.size(); + + int r = add_cap(str.substr(start, end - start)); + if (r < 0) + return r; + + start = end + 1; + } while (start < (int)str.size()); + + return 0; +} + +int RGWUserCaps::remove_from_string(const string& str) +{ + int start = 0; + do { + auto end = str.find(';', start); + if (end == string::npos) + end = str.size(); + + int r = remove_cap(str.substr(start, end - start)); + if (r < 0) + return r; + + start = end + 1; + } while (start < (int)str.size()); + + return 0; +} + +void RGWUserCaps::dump(Formatter *f) const +{ + dump(f, "caps"); +} + +void RGWUserCaps::dump(Formatter *f, const char *name) const +{ + f->open_array_section(name); + map::const_iterator iter; + for (iter = caps.begin(); iter != caps.end(); ++iter) + { + f->open_object_section("cap"); + f->dump_string("type", iter->first); + uint32_t perm = iter->second; + string perm_str; + for (int 
i=0; cap_names[i].type_name; i++) { + if ((perm & cap_names[i].flag) == cap_names[i].flag) { + if (perm_str.size()) + perm_str.append(", "); + + perm_str.append(cap_names[i].type_name); + perm &= ~cap_names[i].flag; + } + } + if (perm_str.empty()) + perm_str = ""; + + f->dump_string("perm", perm_str); + f->close_section(); + } + + f->close_section(); +} + +struct RGWUserCap { + string type; + uint32_t perm; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("type", type, obj); + string perm_str; + JSONDecoder::decode_json("perm", perm_str, obj); + if (RGWUserCaps::parse_cap_perm(perm_str, &perm) < 0) { + throw JSONDecoder::err("failed to parse permissions"); + } + } +}; + +void RGWUserCaps::decode_json(JSONObj *obj) +{ + list caps_list; + decode_json_obj(caps_list, obj); + + list::iterator iter; + for (iter = caps_list.begin(); iter != caps_list.end(); ++iter) { + RGWUserCap& cap = *iter; + caps[cap.type] = cap.perm; + } +} + +int RGWUserCaps::check_cap(const string& cap, uint32_t perm) +{ + map::iterator iter = caps.find(cap); + + if ((iter == caps.end()) || + (iter->second & perm) != perm) { + return -EPERM; + } + + return 0; +} + +bool RGWUserCaps::is_valid_cap_type(const string& tp) +{ + static const char *cap_type[] = { "user", + "users", + "buckets", + "metadata", + "usage", + "zone", + "bilog", + "mdlog", + "datalog", + "roles", + "user-policy"}; + + for (unsigned int i = 0; i < sizeof(cap_type) / sizeof(char *); ++i) { + if (tp.compare(cap_type[i]) == 0) { + return true; + } + } + + return false; +} + +void rgw_pool::from_str(const string& s) +{ + size_t pos = rgw_unescape_str(s, 0, '\\', ':', &name); + if (pos != string::npos) { + pos = rgw_unescape_str(s, pos, '\\', ':', &ns); + /* ignore return; if pos != string::npos it means that we had a colon + * in the middle of ns that wasn't escaped, we're going to stop there + */ + } +} + +string rgw_pool::to_str() const +{ + string esc_name; + rgw_escape_str(name, '\\', ':', &esc_name); + if 
(ns.empty()) { + return esc_name; + } + string esc_ns; + rgw_escape_str(ns, '\\', ':', &esc_ns); + return esc_name + ":" + esc_ns; +} + +void rgw_raw_obj::decode_from_rgw_obj(bufferlist::const_iterator& bl) +{ + using ceph::decode; + rgw_obj old_obj; + decode(old_obj, bl); + + get_obj_bucket_and_oid_loc(old_obj, oid, loc); + pool = old_obj.get_explicit_data_pool(); +} + +std::string rgw_bucket::get_key(char tenant_delim, char id_delim, size_t reserve) const +{ + const size_t max_len = tenant.size() + sizeof(tenant_delim) + + name.size() + sizeof(id_delim) + bucket_id.size() + reserve; + + std::string key; + key.reserve(max_len); + if (!tenant.empty() && tenant_delim) { + key.append(tenant); + key.append(1, tenant_delim); + } + key.append(name); + if (!bucket_id.empty() && id_delim) { + key.append(1, id_delim); + key.append(bucket_id); + } + return key; +} + +std::string rgw_bucket_shard::get_key(char tenant_delim, char id_delim, + char shard_delim) const +{ + static constexpr size_t shard_len{12}; // ":4294967295\0" + auto key = bucket.get_key(tenant_delim, id_delim, shard_len); + if (shard_id >= 0 && shard_delim) { + key.append(1, shard_delim); + key.append(std::to_string(shard_id)); + } + return key; +} + +static struct rgw_name_to_flag op_type_mapping[] = { {"*", RGW_OP_TYPE_ALL}, + {"read", RGW_OP_TYPE_READ}, + {"write", RGW_OP_TYPE_WRITE}, + {"delete", RGW_OP_TYPE_DELETE}, + {NULL, 0} }; + + +int rgw_parse_op_type_list(const string& str, uint32_t *perm) +{ + return parse_list_of_flags(op_type_mapping, str, perm); +} + +bool match_policy(boost::string_view pattern, boost::string_view input, + uint32_t flag) +{ + const uint32_t flag2 = flag & (MATCH_POLICY_ACTION|MATCH_POLICY_ARN) ? 
/*
 * make attrs look-like-this
 * converts underscores to dashes
 *
 * Every '_' in orig becomes '-'; every other byte is passed through
 * tolower(). The result always has the same length as orig.
 *
 * The output is built in a std::string rather than a stack array sized by
 * the input (a non-standard VLA whose size would be caller-controlled), and
 * it is not cut short at an embedded NUL byte.
 */
std::string lowercase_dash_http_attr(const std::string& orig)
{
  std::string out;
  out.reserve(orig.size());

  for (const char c : orig) {
    if (c == '_') {
      out.push_back('-');
    } else {
      // the <cctype> functions are undefined for negative arguments other
      // than EOF, so convert through unsigned char first
      out.push_back(static_cast<char>(tolower(static_cast<unsigned char>(c))));
    }
  }
  return out;
}

/*
 * make attrs Look-Like-This
 * converts underscores to dashes
 *
 * '_' and '-' are both emitted as '-' and mark a word boundary. The first
 * byte of the string and the first byte after a boundary go through
 * toupper(); all remaining bytes go through tolower(). The result always
 * has the same length as orig.
 */
std::string camelcase_dash_http_attr(const std::string& orig)
{
  std::string out;
  out.reserve(orig.size());

  bool last_sep = true;  // start of string counts as a word boundary

  for (const char c : orig) {
    if (c == '_' || c == '-') {
      out.push_back('-');
      last_sep = true;
      continue;
    }

    // unsigned char cast: see lowercase_dash_http_attr()
    const unsigned char uc = static_cast<unsigned char>(c);
    out.push_back(static_cast<char>(last_sep ? toupper(uc) : tolower(uc)));
    last_sep = false;
  }
  return out;
}
vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2009 Sage Weil + * Copyright (C) 2015 Yehuda Sadeh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RGW_COMMON_H +#define CEPH_RGW_COMMON_H + +#include + +#include +#include + +#include "common/ceph_crypto.h" +#include "common/random_string.h" +#include "rgw_acl.h" +#include "rgw_cors.h" +#include "rgw_iam_policy.h" +#include "rgw_quota.h" +#include "rgw_string.h" +#include "common/async/yield_context.h" +#include "rgw_website.h" +#include "rgw_object_lock.h" +#include "rgw_tag.h" +#include "cls/version/cls_version_types.h" +#include "cls/user/cls_user_types.h" +#include "cls/rgw/cls_rgw_types.h" +#include "include/rados/librados.hpp" + +namespace ceph { + class Formatter; +} + +using ceph::crypto::MD5; + + +#define RGW_ATTR_PREFIX "user.rgw." 
+ +#define RGW_HTTP_RGWX_ATTR_PREFIX "RGWX_ATTR_" +#define RGW_HTTP_RGWX_ATTR_PREFIX_OUT "Rgwx-Attr-" + +#define RGW_AMZ_PREFIX "x-amz-" +#define RGW_AMZ_META_PREFIX RGW_AMZ_PREFIX "meta-" +#define RGW_AMZ_WEBSITE_REDIRECT_LOCATION RGW_AMZ_PREFIX "website-redirect-location" +#define RGW_AMZ_TAG_COUNT RGW_AMZ_PREFIX "tagging-count" + +#define RGW_SYS_PARAM_PREFIX "rgwx-" + +#define RGW_ATTR_ACL RGW_ATTR_PREFIX "acl" +#define RGW_ATTR_LC RGW_ATTR_PREFIX "lc" +#define RGW_ATTR_CORS RGW_ATTR_PREFIX "cors" +#define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag" +#define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets" +#define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX +#define RGW_ATTR_CONTENT_TYPE RGW_ATTR_PREFIX "content_type" +#define RGW_ATTR_CACHE_CONTROL RGW_ATTR_PREFIX "cache_control" +#define RGW_ATTR_CONTENT_DISP RGW_ATTR_PREFIX "content_disposition" +#define RGW_ATTR_CONTENT_ENC RGW_ATTR_PREFIX "content_encoding" +#define RGW_ATTR_CONTENT_LANG RGW_ATTR_PREFIX "content_language" +#define RGW_ATTR_EXPIRES RGW_ATTR_PREFIX "expires" +#define RGW_ATTR_DELETE_AT RGW_ATTR_PREFIX "delete_at" +#define RGW_ATTR_ID_TAG RGW_ATTR_PREFIX "idtag" +#define RGW_ATTR_TAIL_TAG RGW_ATTR_PREFIX "tail_tag" +#define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name" +#define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest" +#define RGW_ATTR_USER_MANIFEST RGW_ATTR_PREFIX "user_manifest" +#define RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION RGW_ATTR_PREFIX RGW_AMZ_WEBSITE_REDIRECT_LOCATION +#define RGW_ATTR_SLO_MANIFEST RGW_ATTR_PREFIX "slo_manifest" +/* Information whether an object is SLO or not must be exposed to + * user through custom HTTP header named X-Static-Large-Object. 
*/ +#define RGW_ATTR_SLO_UINDICATOR RGW_ATTR_META_PREFIX "static-large-object" +#define RGW_ATTR_X_ROBOTS_TAG RGW_ATTR_PREFIX "x-robots-tag" +#define RGW_ATTR_STORAGE_CLASS RGW_ATTR_PREFIX "storage_class" + +/* S3 Object Lock*/ +#define RGW_ATTR_OBJECT_LOCK RGW_ATTR_PREFIX "object-lock" +#define RGW_ATTR_OBJECT_RETENTION RGW_ATTR_PREFIX "object-retention" +#define RGW_ATTR_OBJECT_LEGAL_HOLD RGW_ATTR_PREFIX "object-legal-hold" + + +#define RGW_ATTR_PG_VER RGW_ATTR_PREFIX "pg_ver" +#define RGW_ATTR_SOURCE_ZONE RGW_ATTR_PREFIX "source_zone" +#define RGW_ATTR_TAGS RGW_ATTR_PREFIX RGW_AMZ_PREFIX "tagging" + +#define RGW_ATTR_TEMPURL_KEY1 RGW_ATTR_META_PREFIX "temp-url-key" +#define RGW_ATTR_TEMPURL_KEY2 RGW_ATTR_META_PREFIX "temp-url-key-2" + +/* Account/container quota of the Swift API. */ +#define RGW_ATTR_QUOTA_NOBJS RGW_ATTR_META_PREFIX "quota-count" +#define RGW_ATTR_QUOTA_MSIZE RGW_ATTR_META_PREFIX "quota-bytes" + +/* Static Web Site of Swift API. */ +#define RGW_ATTR_WEB_INDEX RGW_ATTR_META_PREFIX "web-index" +#define RGW_ATTR_WEB_ERROR RGW_ATTR_META_PREFIX "web-error" +#define RGW_ATTR_WEB_LISTINGS RGW_ATTR_META_PREFIX "web-listings" +#define RGW_ATTR_WEB_LIST_CSS RGW_ATTR_META_PREFIX "web-listings-css" +#define RGW_ATTR_SUBDIR_MARKER RGW_ATTR_META_PREFIX "web-directory-type" + +#define RGW_ATTR_OLH_PREFIX RGW_ATTR_PREFIX "olh." + +#define RGW_ATTR_OLH_INFO RGW_ATTR_OLH_PREFIX "info" +#define RGW_ATTR_OLH_VER RGW_ATTR_OLH_PREFIX "ver" +#define RGW_ATTR_OLH_ID_TAG RGW_ATTR_OLH_PREFIX "idtag" +#define RGW_ATTR_OLH_PENDING_PREFIX RGW_ATTR_OLH_PREFIX "pending." 
+ +#define RGW_ATTR_COMPRESSION RGW_ATTR_PREFIX "compression" + +#define RGW_ATTR_APPEND_PART_NUM RGW_ATTR_PREFIX "append_part_num" + +/* IAM Policy */ +#define RGW_ATTR_IAM_POLICY RGW_ATTR_PREFIX "iam-policy" +#define RGW_ATTR_USER_POLICY RGW_ATTR_PREFIX "user-policy" + +/* RGW File Attributes */ +#define RGW_ATTR_UNIX_KEY1 RGW_ATTR_PREFIX "unix-key1" +#define RGW_ATTR_UNIX1 RGW_ATTR_PREFIX "unix1" + +#define RGW_ATTR_CRYPT_PREFIX RGW_ATTR_PREFIX "crypt." +#define RGW_ATTR_CRYPT_MODE RGW_ATTR_CRYPT_PREFIX "mode" +#define RGW_ATTR_CRYPT_KEYMD5 RGW_ATTR_CRYPT_PREFIX "keymd5" +#define RGW_ATTR_CRYPT_KEYID RGW_ATTR_CRYPT_PREFIX "keyid" +#define RGW_ATTR_CRYPT_KEYSEL RGW_ATTR_CRYPT_PREFIX "keysel" + +#define RGW_BUCKETS_OBJ_SUFFIX ".buckets" + +#define RGW_FORMAT_PLAIN 0 +#define RGW_FORMAT_XML 1 +#define RGW_FORMAT_JSON 2 +#define RGW_FORMAT_HTML 3 + +#define RGW_CAP_READ 0x1 +#define RGW_CAP_WRITE 0x2 +#define RGW_CAP_ALL (RGW_CAP_READ | RGW_CAP_WRITE) + +#define RGW_REST_SWIFT 0x1 +#define RGW_REST_SWIFT_AUTH 0x2 +#define RGW_REST_S3 0x4 +#define RGW_REST_WEBSITE 0x8 +#define RGW_REST_STS 0x10 +#define RGW_REST_IAM 0x20 + +#define RGW_SUSPENDED_USER_AUID (uint64_t)-2 + +#define RGW_OP_TYPE_READ 0x01 +#define RGW_OP_TYPE_WRITE 0x02 +#define RGW_OP_TYPE_DELETE 0x04 + +#define RGW_OP_TYPE_MODIFY (RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE) +#define RGW_OP_TYPE_ALL (RGW_OP_TYPE_READ | RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE) + +#define RGW_DEFAULT_MAX_BUCKETS 1000 + +#define RGW_DEFER_TO_BUCKET_ACLS_RECURSE 1 +#define RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL 2 + +#define STATUS_CREATED 1900 +#define STATUS_ACCEPTED 1901 +#define STATUS_NO_CONTENT 1902 +#define STATUS_PARTIAL_CONTENT 1903 +#define STATUS_REDIRECT 1904 +#define STATUS_NO_APPLY 1905 +#define STATUS_APPLIED 1906 + +#define ERR_INVALID_BUCKET_NAME 2000 +#define ERR_INVALID_OBJECT_NAME 2001 +#define ERR_NO_SUCH_BUCKET 2002 +#define ERR_METHOD_NOT_ALLOWED 2003 +#define ERR_INVALID_DIGEST 2004 +#define ERR_BAD_DIGEST 
2005 +#define ERR_UNRESOLVABLE_EMAIL 2006 +#define ERR_INVALID_PART 2007 +#define ERR_INVALID_PART_ORDER 2008 +#define ERR_NO_SUCH_UPLOAD 2009 +#define ERR_REQUEST_TIMEOUT 2010 +#define ERR_LENGTH_REQUIRED 2011 +#define ERR_REQUEST_TIME_SKEWED 2012 +#define ERR_BUCKET_EXISTS 2013 +#define ERR_BAD_URL 2014 +#define ERR_PRECONDITION_FAILED 2015 +#define ERR_NOT_MODIFIED 2016 +#define ERR_INVALID_UTF8 2017 +#define ERR_UNPROCESSABLE_ENTITY 2018 +#define ERR_TOO_LARGE 2019 +#define ERR_TOO_MANY_BUCKETS 2020 +#define ERR_INVALID_REQUEST 2021 +#define ERR_TOO_SMALL 2022 +#define ERR_NOT_FOUND 2023 +#define ERR_PERMANENT_REDIRECT 2024 +#define ERR_LOCKED 2025 +#define ERR_QUOTA_EXCEEDED 2026 +#define ERR_SIGNATURE_NO_MATCH 2027 +#define ERR_INVALID_ACCESS_KEY 2028 +#define ERR_MALFORMED_XML 2029 +#define ERR_USER_EXIST 2030 +#define ERR_NOT_SLO_MANIFEST 2031 +#define ERR_EMAIL_EXIST 2032 +#define ERR_KEY_EXIST 2033 +#define ERR_INVALID_SECRET_KEY 2034 +#define ERR_INVALID_KEY_TYPE 2035 +#define ERR_INVALID_CAP 2036 +#define ERR_INVALID_TENANT_NAME 2037 +#define ERR_WEBSITE_REDIRECT 2038 +#define ERR_NO_SUCH_WEBSITE_CONFIGURATION 2039 +#define ERR_AMZ_CONTENT_SHA256_MISMATCH 2040 +#define ERR_NO_SUCH_LC 2041 +#define ERR_NO_SUCH_USER 2042 +#define ERR_NO_SUCH_SUBUSER 2043 +#define ERR_MFA_REQUIRED 2044 +#define ERR_NO_SUCH_CORS_CONFIGURATION 2045 +#define ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION 2046 +#define ERR_INVALID_RETENTION_PERIOD 2047 +#define ERR_USER_SUSPENDED 2100 +#define ERR_INTERNAL_ERROR 2200 +#define ERR_NOT_IMPLEMENTED 2201 +#define ERR_SERVICE_UNAVAILABLE 2202 +#define ERR_ROLE_EXISTS 2203 +#define ERR_MALFORMED_DOC 2204 +#define ERR_NO_ROLE_FOUND 2205 +#define ERR_DELETE_CONFLICT 2206 +#define ERR_NO_SUCH_BUCKET_POLICY 2207 +#define ERR_INVALID_LOCATION_CONSTRAINT 2208 +#define ERR_TAG_CONFLICT 2209 +#define ERR_INVALID_TAG 2210 +#define ERR_ZERO_IN_URL 2211 +#define ERR_MALFORMED_ACL_ERROR 2212 +#define ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION 2213 
/*
 * Small holder used while parsing RGWHTTPArgs: keeps the raw text it was
 * constructed with plus a name and a value string. Both start out empty;
 * parse() is only declared here (defined elsewhere) and is presumably what
 * fills them in from the raw text.
 */
class NameVal
{
  const std::string str;  // raw text, fixed at construction
  std::string name;
  std::string val;

public:
  explicit NameVal(const std::string& nv)
    : str(nv)
  {
  }

  int parse();

  /* mutable access to the stored name */
  std::string& get_name()
  {
    return name;
  }

  /* mutable access to the stored value */
  std::string& get_val()
  {
    return val;
  }
};
std::string& name, const string& val); + /** Get the value for a specific argument parameter */ + const string& get(const std::string& name, bool *exists = NULL) const; + boost::optional + get_optional(const std::string& name) const; + int get_bool(const std::string& name, bool *val, bool *exists); + int get_bool(const char *name, bool *val, bool *exists); + void get_bool(const char *name, bool *val, bool def_val); + int get_int(const char *name, int *val, int def_val); + + /** Get the value for specific system argument parameter */ + std::string sys_get(const std::string& name, bool *exists = nullptr) const; + + /** see if a parameter is contained in this RGWHTTPArgs */ + bool exists(const char *name) const { + return (val_map.find(name) != std::end(val_map)); + } + bool sub_resource_exists(const char *name) const { + return (sub_resources.find(name) != std::end(sub_resources)); + } + std::map& get_params() { + return val_map; + } + const std::map& get_sub_resources() const { + return sub_resources; + } + unsigned get_num_params() const { + return val_map.size(); + } + bool has_response_modifier() const { + return has_resp_modifier; + } + void set_system() { /* make all system params visible */ + std::map::iterator iter; + for (iter = sys_val_map.begin(); iter != sys_val_map.end(); ++iter) { + val_map[iter->first] = iter->second; + } + } + const std::string& get_str() { + return str; + } +}; // RGWHTTPArgs + +const char *rgw_conf_get(const map& conf_map, const char *name, const char *def_val); +int rgw_conf_get_int(const map& conf_map, const char *name, int def_val); +bool rgw_conf_get_bool(const map& conf_map, const char *name, bool def_val); + +class RGWEnv; + +class RGWConf { + friend class RGWEnv; + int enable_ops_log; + int enable_usage_log; + uint8_t defer_to_bucket_acls; + void init(CephContext *cct); +public: + RGWConf() + : enable_ops_log(1), + enable_usage_log(1), + defer_to_bucket_acls(0) { + } +}; + +class RGWEnv { + std::map env_map; + RGWConf conf; 
+public: + void init(CephContext *cct); + void init(CephContext *cct, char **envp); + void set(std::string name, std::string val); + const char *get(const char *name, const char *def_val = nullptr) const; + int get_int(const char *name, int def_val = 0) const; + bool get_bool(const char *name, bool def_val = 0); + size_t get_size(const char *name, size_t def_val = 0) const; + bool exists(const char *name) const; + bool exists_prefix(const char *prefix) const; + void remove(const char *name); + const std::map& get_map() const { return env_map; } + int get_enable_ops_log() const { + return conf.enable_ops_log; + } + + int get_enable_usage_log() const { + return conf.enable_usage_log; + } + + int get_defer_to_bucket_acls() const { + return conf.defer_to_bucket_acls; + } +}; + +// return true if the connection is secure. this either means that the +// connection arrived via ssl, or was forwarded as https by a trusted proxy +bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env); + +enum http_op { + OP_GET, + OP_PUT, + OP_DELETE, + OP_HEAD, + OP_POST, + OP_COPY, + OP_OPTIONS, + OP_UNKNOWN, +}; + +enum RGWOpType { + RGW_OP_UNKNOWN = 0, + RGW_OP_GET_OBJ, + RGW_OP_LIST_BUCKETS, + RGW_OP_STAT_ACCOUNT, + RGW_OP_LIST_BUCKET, + RGW_OP_GET_BUCKET_LOGGING, + RGW_OP_GET_BUCKET_LOCATION, + RGW_OP_GET_BUCKET_VERSIONING, + RGW_OP_SET_BUCKET_VERSIONING, + RGW_OP_GET_BUCKET_WEBSITE, + RGW_OP_SET_BUCKET_WEBSITE, + RGW_OP_STAT_BUCKET, + RGW_OP_CREATE_BUCKET, + RGW_OP_DELETE_BUCKET, + RGW_OP_PUT_OBJ, + RGW_OP_STAT_OBJ, + RGW_OP_POST_OBJ, + RGW_OP_PUT_METADATA_ACCOUNT, + RGW_OP_PUT_METADATA_BUCKET, + RGW_OP_PUT_METADATA_OBJECT, + RGW_OP_SET_TEMPURL, + RGW_OP_DELETE_OBJ, + RGW_OP_COPY_OBJ, + RGW_OP_GET_ACLS, + RGW_OP_PUT_ACLS, + RGW_OP_GET_CORS, + RGW_OP_PUT_CORS, + RGW_OP_DELETE_CORS, + RGW_OP_OPTIONS_CORS, + RGW_OP_GET_REQUEST_PAYMENT, + RGW_OP_SET_REQUEST_PAYMENT, + RGW_OP_INIT_MULTIPART, + RGW_OP_COMPLETE_MULTIPART, + RGW_OP_ABORT_MULTIPART, + RGW_OP_LIST_MULTIPART, + 
RGW_OP_LIST_BUCKET_MULTIPARTS, + RGW_OP_DELETE_MULTI_OBJ, + RGW_OP_BULK_DELETE, + RGW_OP_SET_ATTRS, + RGW_OP_GET_CROSS_DOMAIN_POLICY, + RGW_OP_GET_HEALTH_CHECK, + RGW_OP_GET_INFO, + RGW_OP_CREATE_ROLE, + RGW_OP_DELETE_ROLE, + RGW_OP_GET_ROLE, + RGW_OP_MODIFY_ROLE, + RGW_OP_LIST_ROLES, + RGW_OP_PUT_ROLE_POLICY, + RGW_OP_GET_ROLE_POLICY, + RGW_OP_LIST_ROLE_POLICIES, + RGW_OP_DELETE_ROLE_POLICY, + RGW_OP_PUT_BUCKET_POLICY, + RGW_OP_GET_BUCKET_POLICY, + RGW_OP_DELETE_BUCKET_POLICY, + RGW_OP_PUT_OBJ_TAGGING, + RGW_OP_GET_OBJ_TAGGING, + RGW_OP_DELETE_OBJ_TAGGING, + RGW_OP_PUT_LC, + RGW_OP_GET_LC, + RGW_OP_DELETE_LC, + RGW_OP_PUT_USER_POLICY, + RGW_OP_GET_USER_POLICY, + RGW_OP_LIST_USER_POLICIES, + RGW_OP_DELETE_USER_POLICY, + RGW_OP_PUT_BUCKET_OBJ_LOCK, + RGW_OP_GET_BUCKET_OBJ_LOCK, + RGW_OP_PUT_OBJ_RETENTION, + RGW_OP_GET_OBJ_RETENTION, + RGW_OP_PUT_OBJ_LEGAL_HOLD, + RGW_OP_GET_OBJ_LEGAL_HOLD, + /* rgw specific */ + RGW_OP_ADMIN_SET_METADATA, + RGW_OP_GET_OBJ_LAYOUT, + RGW_OP_BULK_UPLOAD, + RGW_OP_METADATA_SEARCH, + RGW_OP_CONFIG_BUCKET_META_SEARCH, + RGW_OP_GET_BUCKET_META_SEARCH, + RGW_OP_DEL_BUCKET_META_SEARCH, + /* sts specific*/ + RGW_STS_ASSUME_ROLE, + RGW_STS_GET_SESSION_TOKEN, + RGW_STS_ASSUME_ROLE_WEB_IDENTITY, + /* pubsub */ + RGW_OP_PUBSUB_TOPIC_CREATE, + RGW_OP_PUBSUB_TOPICS_LIST, + RGW_OP_PUBSUB_TOPIC_GET, + RGW_OP_PUBSUB_TOPIC_DELETE, + RGW_OP_PUBSUB_SUB_CREATE, + RGW_OP_PUBSUB_SUB_GET, + RGW_OP_PUBSUB_SUB_DELETE, + RGW_OP_PUBSUB_SUB_PULL, + RGW_OP_PUBSUB_SUB_ACK, + RGW_OP_PUBSUB_NOTIF_CREATE, + RGW_OP_PUBSUB_NOTIF_DELETE, + RGW_OP_PUBSUB_NOTIF_LIST, +}; + +class RGWAccessControlPolicy; +class JSONObj; + +struct RGWAccessKey { + string id; // AccessKey + string key; // SecretKey + string subuser; + + RGWAccessKey() {} + RGWAccessKey(std::string _id, std::string _key) + : id(std::move(_id)), key(std::move(_key)) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(id, bl); + encode(key, bl); + encode(subuser, bl); + 
ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(id, bl); + decode(key, bl); + decode(subuser, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void dump_plain(Formatter *f) const; + void dump(Formatter *f, const string& user, bool swift) const; + static void generate_test_instances(list& o); + + void decode_json(JSONObj *obj); + void decode_json(JSONObj *obj, bool swift); +}; +WRITE_CLASS_ENCODER(RGWAccessKey) + +struct RGWSubUser { + string name; + uint32_t perm_mask; + + RGWSubUser() : perm_mask(0) {} + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(name, bl); + encode(perm_mask, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(name, bl); + decode(perm_mask, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void dump(Formatter *f, const string& user) const; + static void generate_test_instances(list& o); + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWSubUser) + +class RGWUserCaps +{ + map caps; + + int get_cap(const string& cap, string& type, uint32_t *perm); + int add_cap(const string& cap); + int remove_cap(const string& cap); +public: + static int parse_cap_perm(const string& str, uint32_t *perm); + int add_from_string(const string& str); + int remove_from_string(const string& str); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(caps, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(caps, bl); + DECODE_FINISH(bl); + } + int check_cap(const string& cap, uint32_t perm); + bool is_valid_cap_type(const string& tp); + void dump(Formatter *f) const; + void dump(Formatter *f, const char *name) const; + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWUserCaps) + +void encode_json(const char *name, const obj_version& v, 
Formatter *f); +void encode_json(const char *name, const RGWUserCaps& val, Formatter *f); + +void decode_json_obj(obj_version& v, JSONObj *obj); + + + +enum RGWIdentityType +{ + TYPE_NONE=0, + TYPE_RGW=1, + TYPE_KEYSTONE=2, + TYPE_LDAP=3, + TYPE_ROLE=4, + TYPE_WEB=5, +}; + +static string RGW_STORAGE_CLASS_STANDARD = "STANDARD"; + +struct rgw_placement_rule { + std::string name; + std::string storage_class; + + rgw_placement_rule() {} + rgw_placement_rule(const string& _n, const string& _sc) : name(_n), storage_class(_sc) {} + rgw_placement_rule(const rgw_placement_rule& _r, const string& _sc) : name(_r.name) { + if (!_sc.empty()) { + storage_class = _sc; + } else { + storage_class = _r.storage_class; + } + } + + bool empty() const { + return name.empty() && storage_class.empty(); + } + + void inherit_from(const rgw_placement_rule& r) { + if (name.empty()) { + name = r.name; + } + if (storage_class.empty()) { + storage_class = r.storage_class; + } + } + + void clear() { + name.clear(); + storage_class.clear(); + } + + void init(const string& n, const string& c) { + name = n; + storage_class = c; + } + + static const string& get_canonical_storage_class(const string& storage_class) { + if (storage_class.empty()) { + return RGW_STORAGE_CLASS_STANDARD; + } + return storage_class; + } + + const string& get_storage_class() const { + return get_canonical_storage_class(storage_class); + } + + int compare(const rgw_placement_rule& r) const { + int c = name.compare(r.name); + if (c != 0) { + return c; + } + return get_storage_class().compare(r.get_storage_class()); + } + + bool operator==(const rgw_placement_rule& r) const { + return (name == r.name && + get_storage_class() == r.get_storage_class()); + } + + bool operator!=(const rgw_placement_rule& r) const { + return !(*this == r); + } + + void encode(bufferlist& bl) const { + /* no ENCODE_START/END due to backward compatibility */ + std::string s = to_str(); + ceph::encode(s, bl); + } + + void 
decode(bufferlist::const_iterator& bl) { + std::string s; + ceph::decode(s, bl); + from_str(s); + } + + std::string to_str() const { + if (standard_storage_class()) { + return name; + } + return to_str_explicit(); + } + + std::string to_str_explicit() const { + return name + "/" + storage_class; + } + + void from_str(const std::string& s) { + size_t pos = s.find("/"); + if (pos == std::string::npos) { + name = s; + storage_class.clear(); + return; + } + name = s.substr(0, pos); + storage_class = s.substr(pos + 1); + } + + bool standard_storage_class() const { + return storage_class.empty() || storage_class == RGW_STORAGE_CLASS_STANDARD; + } +}; +WRITE_CLASS_ENCODER(rgw_placement_rule) + +void encode_json(const char *name, const rgw_placement_rule& val, ceph::Formatter *f); +void decode_json_obj(rgw_placement_rule& v, JSONObj *obj); + +inline ostream& operator<<(ostream& out, const rgw_placement_rule& rule) { + return out << rule.to_str(); +} +struct RGWUserInfo +{ + rgw_user user_id; + string display_name; + string user_email; + map access_keys; + map swift_keys; + map subusers; + __u8 suspended; + int32_t max_buckets; + uint32_t op_mask; + RGWUserCaps caps; + __u8 admin; + __u8 system; + rgw_placement_rule default_placement; + list placement_tags; + RGWQuotaInfo bucket_quota; + map temp_url_keys; + RGWQuotaInfo user_quota; + uint32_t type; + set mfa_ids; + string assumed_role_arn; + + RGWUserInfo() + : suspended(0), + max_buckets(RGW_DEFAULT_MAX_BUCKETS), + op_mask(RGW_OP_TYPE_ALL), + admin(0), + system(0), + type(TYPE_NONE) { + } + + RGWAccessKey* get_key(const string& access_key) { + if (access_keys.empty()) + return nullptr; + + auto k = access_keys.find(access_key); + if (k == access_keys.end()) + return nullptr; + else + return &(k->second); + } + + void encode(bufferlist& bl) const { + ENCODE_START(21, 9, bl); + encode((uint64_t)0, bl); // old auid + string access_key; + string secret_key; + if (!access_keys.empty()) { + map::const_iterator iter = 
access_keys.begin(); + const RGWAccessKey& k = iter->second; + access_key = k.id; + secret_key = k.key; + } + encode(access_key, bl); + encode(secret_key, bl); + encode(display_name, bl); + encode(user_email, bl); + string swift_name; + string swift_key; + if (!swift_keys.empty()) { + map::const_iterator iter = swift_keys.begin(); + const RGWAccessKey& k = iter->second; + swift_name = k.id; + swift_key = k.key; + } + encode(swift_name, bl); + encode(swift_key, bl); + encode(user_id.id, bl); + encode(access_keys, bl); + encode(subusers, bl); + encode(suspended, bl); + encode(swift_keys, bl); + encode(max_buckets, bl); + encode(caps, bl); + encode(op_mask, bl); + encode(system, bl); + encode(default_placement, bl); + encode(placement_tags, bl); + encode(bucket_quota, bl); + encode(temp_url_keys, bl); + encode(user_quota, bl); + encode(user_id.tenant, bl); + encode(admin, bl); + encode(type, bl); + encode(mfa_ids, bl); + encode(assumed_role_arn, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(20, 9, 9, bl); + if (struct_v >= 2) { + uint64_t old_auid; + decode(old_auid, bl); + } + string access_key; + string secret_key; + decode(access_key, bl); + decode(secret_key, bl); + if (struct_v < 6) { + RGWAccessKey k; + k.id = access_key; + k.key = secret_key; + access_keys[access_key] = k; + } + decode(display_name, bl); + decode(user_email, bl); + /* We populate swift_keys map later nowadays, but we have to decode. 
*/ + string swift_name; + string swift_key; + if (struct_v >= 3) decode(swift_name, bl); + if (struct_v >= 4) decode(swift_key, bl); + if (struct_v >= 5) + decode(user_id.id, bl); + else + user_id.id = access_key; + if (struct_v >= 6) { + decode(access_keys, bl); + decode(subusers, bl); + } + suspended = 0; + if (struct_v >= 7) { + decode(suspended, bl); + } + if (struct_v >= 8) { + decode(swift_keys, bl); + } + if (struct_v >= 10) { + decode(max_buckets, bl); + } else { + max_buckets = RGW_DEFAULT_MAX_BUCKETS; + } + if (struct_v >= 11) { + decode(caps, bl); + } + if (struct_v >= 12) { + decode(op_mask, bl); + } else { + op_mask = RGW_OP_TYPE_ALL; + } + if (struct_v >= 13) { + decode(system, bl); + decode(default_placement, bl); + decode(placement_tags, bl); /* tags of allowed placement rules */ + } + if (struct_v >= 14) { + decode(bucket_quota, bl); + } + if (struct_v >= 15) { + decode(temp_url_keys, bl); + } + if (struct_v >= 16) { + decode(user_quota, bl); + } + if (struct_v >= 17) { + decode(user_id.tenant, bl); + } else { + user_id.tenant.clear(); + } + if (struct_v >= 18) { + decode(admin, bl); + } + if (struct_v >= 19) { + decode(type, bl); + } + if (struct_v >= 20) { + decode(mfa_ids, bl); + } + if (struct_v >= 21) { + decode(assumed_role_arn, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWUserInfo) + +struct rgw_pool { + std::string name; + std::string ns; + + rgw_pool() = default; + rgw_pool(const rgw_pool& _p) : name(_p.name), ns(_p.ns) {} + rgw_pool(rgw_pool&&) = default; + rgw_pool(const string& _s) { + from_str(_s); + } + rgw_pool(const string& _name, const string& _ns) : name(_name), ns(_ns) {} + + string to_str() const; + void from_str(const string& s); + + void init(const string& _s) { + from_str(_s); + } + + bool empty() const { + return name.empty(); + } + + int compare(const rgw_pool& p) const { + int r = 
name.compare(p.name); + if (r != 0) { + return r; + } + return ns.compare(p.ns); + } + + void encode(bufferlist& bl) const { + ENCODE_START(10, 10, bl); + encode(name, bl); + encode(ns, bl); + ENCODE_FINISH(bl); + } + + void decode_from_bucket(bufferlist::const_iterator& bl); + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(10, 3, 3, bl); + + decode(name, bl); + + if (struct_v < 10) { + + /* + * note that rgw_pool can be used where rgw_bucket was used before + * therefore we inherit rgw_bucket's old versions. However, we only + * need the first field from rgw_bucket. unless we add more fields + * in which case we'll need to look at struct_v, and check the actual + * version. Anything older than 10 needs to be treated as old rgw_bucket + */ + + } else { + decode(ns, bl); + } + + DECODE_FINISH(bl); + } + + rgw_pool& operator=(const rgw_pool&) = default; + + bool operator==(const rgw_pool& p) const { + return (compare(p) == 0); + } + bool operator!=(const rgw_pool& p) const { + return !(*this == p); + } + bool operator<(const rgw_pool& p) const { + int r = name.compare(p.name); + if (r == 0) { + return (ns.compare(p.ns) < 0); + } + return (r < 0); + } +}; +WRITE_CLASS_ENCODER(rgw_pool) + +struct rgw_data_placement_target { + rgw_pool data_pool; + rgw_pool data_extra_pool; + rgw_pool index_pool; + + rgw_data_placement_target() = default; + rgw_data_placement_target(const rgw_data_placement_target&) = default; + rgw_data_placement_target(rgw_data_placement_target&&) = default; + + rgw_data_placement_target(const rgw_pool& data_pool, + const rgw_pool& data_extra_pool, + const rgw_pool& index_pool) + : data_pool(data_pool), + data_extra_pool(data_extra_pool), + index_pool(index_pool) { + } + + rgw_data_placement_target& + operator=(const rgw_data_placement_target&) = default; + + const rgw_pool& get_data_extra_pool() const { + if (data_extra_pool.empty()) { + return data_pool; + } + return data_extra_pool; + } + + int compare(const 
rgw_data_placement_target& t) { + int c = data_pool.compare(t.data_pool); + if (c != 0) { + return c; + } + c = data_extra_pool.compare(t.data_extra_pool); + if (c != 0) { + return c; + } + return index_pool.compare(t.index_pool); + }; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +inline ostream& operator<<(ostream& out, const rgw_pool& p) { + out << p.to_str(); + return out; +} + +struct rgw_raw_obj { + rgw_pool pool; + std::string oid; + std::string loc; + + rgw_raw_obj() {} + rgw_raw_obj(const rgw_pool& _pool, const std::string& _oid) { + init(_pool, _oid); + } + rgw_raw_obj(const rgw_pool& _pool, const std::string& _oid, const string& _loc) : loc(_loc) { + init(_pool, _oid); + } + + void init(const rgw_pool& _pool, const std::string& _oid) { + pool = _pool; + oid = _oid; + } + + bool empty() const { + return oid.empty(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(6, 6, bl); + encode(pool, bl); + encode(oid, bl); + encode(loc, bl); + ENCODE_FINISH(bl); + } + + void decode_from_rgw_obj(bufferlist::const_iterator& bl); + + void decode(bufferlist::const_iterator& bl) { + unsigned ofs = bl.get_off(); + DECODE_START(6, bl); + if (struct_v < 6) { + /* + * this object was encoded as rgw_obj, prior to rgw_raw_obj been split out of it, + * let's decode it as rgw_obj and convert it + */ + bl.seek(ofs); + decode_from_rgw_obj(bl); + return; + } + decode(pool, bl); + decode(oid, bl); + decode(loc, bl); + DECODE_FINISH(bl); + } + + bool operator<(const rgw_raw_obj& o) const { + int r = pool.compare(o.pool); + if (r == 0) { + r = oid.compare(o.oid); + if (r == 0) { + r = loc.compare(o.loc); + } + } + return (r < 0); + } + + bool operator==(const rgw_raw_obj& o) const { + return (pool == o.pool && oid == o.oid && loc == o.loc); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_raw_obj) + +inline ostream& operator<<(ostream& out, const rgw_raw_obj& o) { + out << o.pool << ":" << 
o.oid; + return out; +} + +struct rgw_bucket { + std::string tenant; + std::string name; + std::string marker; + std::string bucket_id; + rgw_data_placement_target explicit_placement; + + std::string oid; /* + * runtime in-memory only info. If not empty, points to the bucket instance object + */ + + rgw_bucket() { } + // cppcheck-suppress noExplicitConstructor + explicit rgw_bucket(const rgw_user& u, const cls_user_bucket& b) : + tenant(u.tenant), + name(b.name), + marker(b.marker), + bucket_id(b.bucket_id), + explicit_placement(b.explicit_placement.data_pool, + b.explicit_placement.data_extra_pool, + b.explicit_placement.index_pool) {} + rgw_bucket(const rgw_bucket&) = default; + rgw_bucket(rgw_bucket&&) = default; + + void convert(cls_user_bucket *b) const { + b->name = name; + b->marker = marker; + b->bucket_id = bucket_id; + b->explicit_placement.data_pool = explicit_placement.data_pool.to_str(); + b->explicit_placement.data_extra_pool = explicit_placement.data_extra_pool.to_str(); + b->explicit_placement.index_pool = explicit_placement.index_pool.to_str(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(10, 10, bl); + encode(name, bl); + encode(marker, bl); + encode(bucket_id, bl); + encode(tenant, bl); + bool encode_explicit = !explicit_placement.data_pool.empty(); + encode(encode_explicit, bl); + if (encode_explicit) { + encode(explicit_placement.data_pool, bl); + encode(explicit_placement.data_extra_pool, bl); + encode(explicit_placement.index_pool, bl); + } + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(10, 3, 3, bl); + decode(name, bl); + if (struct_v < 10) { + decode(explicit_placement.data_pool.name, bl); + } + if (struct_v >= 2) { + decode(marker, bl); + if (struct_v <= 3) { + uint64_t id; + decode(id, bl); + char buf[16]; + snprintf(buf, sizeof(buf), "%" PRIu64, id); + bucket_id = buf; + } else { + decode(bucket_id, bl); + } + } + if (struct_v < 10) { + if (struct_v >= 5) { + 
decode(explicit_placement.index_pool.name, bl); + } else { + explicit_placement.index_pool = explicit_placement.data_pool; + } + if (struct_v >= 7) { + decode(explicit_placement.data_extra_pool.name, bl); + } + } + if (struct_v >= 8) { + decode(tenant, bl); + } + if (struct_v >= 10) { + bool decode_explicit = !explicit_placement.data_pool.empty(); + decode(decode_explicit, bl); + if (decode_explicit) { + decode(explicit_placement.data_pool, bl); + decode(explicit_placement.data_extra_pool, bl); + decode(explicit_placement.index_pool, bl); + } + } + DECODE_FINISH(bl); + } + + void update_bucket_id(const string& new_bucket_id) { + bucket_id = new_bucket_id; + oid.clear(); + } + + // format a key for the bucket/instance. pass delim=0 to skip a field + std::string get_key(char tenant_delim = '/', + char id_delim = ':', + size_t reserve = 0) const; + + const rgw_pool& get_data_extra_pool() const { + return explicit_placement.get_data_extra_pool(); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(list& o); + + rgw_bucket& operator=(const rgw_bucket&) = default; + + bool operator<(const rgw_bucket& b) const { + if (tenant == b.tenant) { + return name < b.name; + } else { + return tenant < b.tenant; + } + } + + bool operator==(const rgw_bucket& b) const { + return (tenant == b.tenant) && (name == b.name) && \ + (bucket_id == b.bucket_id); + } +}; +WRITE_CLASS_ENCODER(rgw_bucket) + +inline ostream& operator<<(ostream& out, const rgw_bucket &b) { + out << b.name << "[" << b.marker << "]"; + return out; +} + +struct rgw_bucket_shard { + rgw_bucket bucket; + int shard_id; + + rgw_bucket_shard() : shard_id(-1) {} + rgw_bucket_shard(const rgw_bucket& _b, int _sid) : bucket(_b), shard_id(_sid) {} + + std::string get_key(char tenant_delim = '/', char id_delim = ':', + char shard_delim = ':') const; + + bool operator<(const rgw_bucket_shard& b) const { + if (bucket < b.bucket) { + return true; + } + if (b.bucket < 
bucket) { + return false; + } + return shard_id < b.shard_id; + } +}; + + +struct RGWObjVersionTracker { + obj_version read_version; + obj_version write_version; + + obj_version *version_for_read() { + return &read_version; + } + + obj_version *version_for_write() { + if (write_version.ver == 0) + return NULL; + + return &write_version; + } + + obj_version *version_for_check() { + if (read_version.ver == 0) + return NULL; + + return &read_version; + } + + void prepare_op_for_read(librados::ObjectReadOperation *op); + void prepare_op_for_write(librados::ObjectWriteOperation *op); + + void apply_write(); + + void clear() { + read_version = obj_version(); + write_version = obj_version(); + } + + void generate_new_write_ver(CephContext *cct); +}; + +inline ostream& operator<<(ostream& out, const obj_version &v) +{ + out << v.tag << ":" << v.ver; + return out; +} + +inline ostream& operator<<(ostream& out, const RGWObjVersionTracker &ot) +{ + out << "{r=" << ot.read_version << ",w=" << ot.write_version << "}"; + return out; +} + +enum RGWBucketFlags { + BUCKET_SUSPENDED = 0x1, + BUCKET_VERSIONED = 0x2, + BUCKET_VERSIONS_SUSPENDED = 0x4, + BUCKET_DATASYNC_DISABLED = 0X8, + BUCKET_MFA_ENABLED = 0X10, + BUCKET_OBJ_LOCK_ENABLED = 0X20, +}; + +enum RGWBucketIndexType { + RGWBIType_Normal = 0, + RGWBIType_Indexless = 1, +}; + +inline ostream& operator<<(ostream& out, const RGWBucketIndexType &index_type) +{ + switch (index_type) { + case RGWBIType_Normal: + return out << "Normal"; + case RGWBIType_Indexless: + return out << "Indexless"; + default: + return out << "Unknown"; + } +} + +struct RGWBucketInfo { + enum BIShardsHashType { + MOD = 0 + }; + + rgw_bucket bucket; + rgw_user owner; + uint32_t flags; + string zonegroup; + ceph::real_time creation_time; + rgw_placement_rule placement_rule; + bool has_instance_obj; + RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */ + obj_version ep_objv; /* entry point object version, for 
runtime tracking only */ + RGWQuotaInfo quota; + + // Represents the number of bucket index object shards: + // - value of 0 indicates there is no sharding (this is by default before this + // feature is implemented). + // - value of UINT32_T::MAX indicates this is a blind bucket. + uint32_t num_shards; + + // Represents the bucket index shard hash type. + uint8_t bucket_index_shard_hash_type; + + // Represents the shard number for blind bucket. + const static uint32_t NUM_SHARDS_BLIND_BUCKET; + + bool requester_pays; + + bool has_website; + RGWBucketWebsiteConf website_conf; + + RGWBucketIndexType index_type = RGWBIType_Normal; + + bool swift_versioning; + string swift_ver_location; + + map mdsearch_config; + + + + /* resharding */ + uint8_t reshard_status; + string new_bucket_instance_id; + + RGWObjectLock obj_lock; + + void encode(bufferlist& bl) const { + ENCODE_START(20, 4, bl); + encode(bucket, bl); + encode(owner.id, bl); + encode(flags, bl); + encode(zonegroup, bl); + uint64_t ct = real_clock::to_time_t(creation_time); + encode(ct, bl); + encode(placement_rule, bl); + encode(has_instance_obj, bl); + encode(quota, bl); + encode(num_shards, bl); + encode(bucket_index_shard_hash_type, bl); + encode(requester_pays, bl); + encode(owner.tenant, bl); + encode(has_website, bl); + if (has_website) { + encode(website_conf, bl); + } + encode((uint32_t)index_type, bl); + encode(swift_versioning, bl); + if (swift_versioning) { + encode(swift_ver_location, bl); + } + encode(creation_time, bl); + encode(mdsearch_config, bl); + encode(reshard_status, bl); + encode(new_bucket_instance_id, bl); + if (obj_lock_enabled()) { + encode(obj_lock, bl); + } + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(20, 4, 4, bl); + decode(bucket, bl); + if (struct_v >= 2) { + string s; + decode(s, bl); + owner.from_str(s); + } + if (struct_v >= 3) + decode(flags, bl); + if (struct_v >= 5) + decode(zonegroup, bl); + if (struct_v >= 6) 
{ + uint64_t ct; + decode(ct, bl); + if (struct_v < 17) + creation_time = ceph::real_clock::from_time_t((time_t)ct); + } + if (struct_v >= 7) + decode(placement_rule, bl); + if (struct_v >= 8) + decode(has_instance_obj, bl); + if (struct_v >= 9) + decode(quota, bl); + if (struct_v >= 10) + decode(num_shards, bl); + if (struct_v >= 11) + decode(bucket_index_shard_hash_type, bl); + if (struct_v >= 12) + decode(requester_pays, bl); + if (struct_v >= 13) + decode(owner.tenant, bl); + if (struct_v >= 14) { + decode(has_website, bl); + if (has_website) { + decode(website_conf, bl); + } else { + website_conf = RGWBucketWebsiteConf(); + } + } + if (struct_v >= 15) { + uint32_t it; + decode(it, bl); + index_type = (RGWBucketIndexType)it; + } else { + index_type = RGWBIType_Normal; + } + swift_versioning = false; + swift_ver_location.clear(); + if (struct_v >= 16) { + decode(swift_versioning, bl); + if (swift_versioning) { + decode(swift_ver_location, bl); + } + } + if (struct_v >= 17) { + decode(creation_time, bl); + } + if (struct_v >= 18) { + decode(mdsearch_config, bl); + } + if (struct_v >= 19) { + decode(reshard_status, bl); + decode(new_bucket_instance_id, bl); + } + if (struct_v >= 20 && obj_lock_enabled()) { + decode(obj_lock, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + void decode_json(JSONObj *obj); + + bool versioned() const { return (flags & BUCKET_VERSIONED) != 0; } + int versioning_status() const { return flags & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED | BUCKET_MFA_ENABLED); } + bool versioning_enabled() const { return (versioning_status() & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED)) == BUCKET_VERSIONED; } + bool mfa_enabled() const { return (versioning_status() & BUCKET_MFA_ENABLED) != 0; } + bool datasync_flag_enabled() const { return (flags & BUCKET_DATASYNC_DISABLED) == 0; } + bool obj_lock_enabled() const { return (flags & BUCKET_OBJ_LOCK_ENABLED) != 0; } + + bool 
has_swift_versioning() const { + /* A bucket may be versioned through one mechanism only. */ + return swift_versioning && !versioned(); + } + + RGWBucketInfo() : flags(0), has_instance_obj(false), num_shards(0), bucket_index_shard_hash_type(MOD), requester_pays(false), + has_website(false), swift_versioning(false), reshard_status(0) {} +}; +WRITE_CLASS_ENCODER(RGWBucketInfo) + +struct RGWBucketEntryPoint +{ + rgw_bucket bucket; + rgw_user owner; + ceph::real_time creation_time; + bool linked; + + bool has_bucket_info; + RGWBucketInfo old_bucket_info; + + RGWBucketEntryPoint() : linked(false), has_bucket_info(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(10, 8, bl); + encode(bucket, bl); + encode(owner.id, bl); + encode(linked, bl); + uint64_t ctime = (uint64_t)real_clock::to_time_t(creation_time); + encode(ctime, bl); + encode(owner, bl); + encode(creation_time, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + auto orig_iter = bl; + DECODE_START_LEGACY_COMPAT_LEN_32(10, 4, 4, bl); + if (struct_v < 8) { + /* ouch, old entry, contains the bucket info itself */ + old_bucket_info.decode(orig_iter); + has_bucket_info = true; + return; + } + has_bucket_info = false; + decode(bucket, bl); + decode(owner.id, bl); + decode(linked, bl); + uint64_t ctime; + decode(ctime, bl); + if (struct_v < 10) { + creation_time = real_clock::from_time_t((time_t)ctime); + } + if (struct_v >= 9) { + decode(owner, bl); + } + if (struct_v >= 10) { + decode(creation_time, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWBucketEntryPoint) + +struct RGWStorageStats +{ + RGWObjCategory category; + uint64_t size; + uint64_t size_rounded; + uint64_t size_utilized{0}; //< size after compression, encryption + uint64_t num_objects; + + RGWStorageStats() + : category(RGWObjCategory::None), + size(0), + size_rounded(0), + num_objects(0) {} + + void dump(Formatter *f) const; 
+}; + +class RGWEnv; + +/* Namespaced forward declarations. */ +namespace rgw { + namespace auth { + namespace s3 { + class AWSBrowserUploadAbstractor; + } + class Completer; + } + namespace io { + class BasicClient; + } +} + +using meta_map_t = boost::container::flat_map ; + +struct req_info { + const RGWEnv *env; + RGWHTTPArgs args; + meta_map_t x_meta_map; + + string host; + const char *method; + string script_uri; + string request_uri; + string request_uri_aws4; + string effective_uri; + string request_params; + string domain; + string storage_class; + + req_info(CephContext *cct, const RGWEnv *env); + void rebuild_from(req_info& src); + void init_meta_info(bool *found_bad_meta); +}; + +typedef cls_rgw_obj_key rgw_obj_index_key; + +struct rgw_obj_key { + string name; + string instance; + string ns; + + rgw_obj_key() {} + // cppcheck-suppress noExplicitConstructor + rgw_obj_key(const string& n) : name(n) {} + rgw_obj_key(const string& n, const string& i) : name(n), instance(i) {} + rgw_obj_key(const string& n, const string& i, const string& _ns) : name(n), instance(i), ns(_ns) {} + + rgw_obj_key(const rgw_obj_index_key& k) { + parse_index_key(k.name, &name, &ns); + instance = k.instance; + } + +// Since bucket index entries are stored in sequence, and the elements +// with namespaces can be between those without, we need a way to skip +// past namespaced elements; this returns a marker that will do so. +// +// Consider the following sequence: ASP, _BAT_cat, __DOG, _eel_FOX, +// goat; the 2nd and 4th entries are namespaced, but the 3rd is not, +// it's just an entry that begins with an underscore, which will be +// quoted with another underscore putting it between two potential +// namespaced blocks + static const rgw_obj_index_key& after_namespace_marker(const std::string& after) { + // this is just before "__", so will allow finding non-namespaced + // entries that begin with an underscore (and therefore are entered + // as starting with "__". 
+ static const rgw_obj_index_key result1(std::string("_^") + char(255)); + + // this is just before entries that do not begin with an + // underscore and will allow skipping past the second namespace + // block + static const rgw_obj_index_key result2(std::string("_") + char(255)); + + if (after < result1.name) { + return result1; + } else { + return result2; + } + } + + static void parse_index_key(const string& key, string *name, string *ns) { + if (key[0] != '_') { + *name = key; + ns->clear(); + return; + } + if (key[1] == '_') { + *name = key.substr(1); + ns->clear(); + return; + } + ssize_t pos = key.find('_', 1); + if (pos < 0) { + /* shouldn't happen, just use key */ + *name = key; + ns->clear(); + return; + } + + *name = key.substr(pos + 1); + *ns = key.substr(1, pos -1); + } + + void set(const string& n) { + name = n; + instance.clear(); + ns.clear(); + } + + void set(const string& n, const string& i) { + name = n; + instance = i; + ns.clear(); + } + + void set(const string& n, const string& i, const string& _ns) { + name = n; + instance = i; + ns = _ns; + } + + bool set(const rgw_obj_index_key& index_key) { + if (!parse_raw_oid(index_key.name, this)) { + return false; + } + instance = index_key.instance; + return true; + } + + void set_instance(const string& i) { + instance = i; + } + + const string& get_instance() const { + return instance; + } + + void set_ns(const std::string& _ns) { + ns = _ns; + } + + const std::string& get_ns() const { + return ns; + } + + string get_index_key_name() const { + if (ns.empty()) { + if (name.size() < 1 || name[0] != '_') { + return name; + } + return string("_") + name; + }; + + char buf[ns.size() + 16]; + snprintf(buf, sizeof(buf), "_%s_", ns.c_str()); + return string(buf) + name; + }; + + void get_index_key(rgw_obj_index_key *key) const { + key->name = get_index_key_name(); + key->instance = instance; + } + + string get_loc() const { + /* + * For backward compatibility. 
Older versions used to have object locator on all objects, + * however, the name was the effective object locator. This had the same effect as not + * having object locator at all for most objects but the ones that started with underscore as + * these were escaped. + */ + if (name[0] == '_' && ns.empty()) { + return name; + } + + return string(); + } + + bool empty() const { + return name.empty(); + } + + bool have_null_instance() const { + return instance == "null"; + } + + bool have_instance() const { + return !instance.empty(); + } + + bool need_to_encode_instance() const { + return have_instance() && !have_null_instance(); + } + + string get_oid() const { + if (ns.empty() && !need_to_encode_instance()) { + if (name.size() < 1 || name[0] != '_') { + return name; + } + return string("_") + name; + } + + string oid = "_"; + oid.append(ns); + if (need_to_encode_instance()) { + oid.append(string(":") + instance); + } + oid.append("_"); + oid.append(name); + return oid; + } + + bool operator==(const rgw_obj_key& k) const { + return (name.compare(k.name) == 0) && + (instance.compare(k.instance) == 0); + } + + bool operator<(const rgw_obj_key& k) const { + int r = name.compare(k.name); + if (r == 0) { + r = instance.compare(k.instance); + } + return (r < 0); + } + + bool operator<=(const rgw_obj_key& k) const { + return !(k < *this); + } + + static void parse_ns_field(string& ns, string& instance) { + int pos = ns.find(':'); + if (pos >= 0) { + instance = ns.substr(pos + 1); + ns = ns.substr(0, pos); + } else { + instance.clear(); + } + } + + // takes an oid and parses out the namespace (ns), name, and + // instance + static bool parse_raw_oid(const string& oid, rgw_obj_key *key) { + key->instance.clear(); + key->ns.clear(); + if (oid[0] != '_') { + key->name = oid; + return true; + } + + if (oid.size() >= 2 && oid[1] == '_') { + key->name = oid.substr(1); + return true; + } + + if (oid.size() < 3) // for namespace, min size would be 3: _x_ + return false; + + size_t 
pos = oid.find('_', 2); // oid must match ^_[^_].+$ + if (pos == string::npos) + return false; + + key->ns = oid.substr(1, pos - 1); + parse_ns_field(key->ns, key->instance); + + key->name = oid.substr(pos + 1); + return true; + } + + /** + * Translate a namespace-mangled object name to the user-facing name + * existing in the given namespace. + * + * If the object is part of the given namespace, it returns true + * and cuts down the name to the unmangled version. If it is not + * part of the given namespace, it returns false. + */ + static bool oid_to_key_in_ns(const string& oid, rgw_obj_key *key, const string& ns) { + bool ret = parse_raw_oid(oid, key); + if (!ret) { + return ret; + } + + return (ns == key->ns); + } + + /** + * Given a mangled object name and an empty namespace string, this + * function extracts the namespace into the string and sets the object + * name to be the unmangled version. + * + * It returns true after successfully doing so, or + * false if it fails. + */ + static bool strip_namespace_from_name(string& name, string& ns, string& instance) { + ns.clear(); + instance.clear(); + if (name[0] != '_') { + return true; + } + + size_t pos = name.find('_', 1); + if (pos == string::npos) { + return false; + } + + if (name[1] == '_') { + name = name.substr(1); + return true; + } + + size_t period_pos = name.find('.'); + if (period_pos < pos) { + return false; + } + + ns = name.substr(1, pos-1); + name = name.substr(pos+1, string::npos); + + parse_ns_field(ns, instance); + return true; + } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(name, bl); + encode(instance, bl); + encode(ns, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(name, bl); + decode(instance, bl); + if (struct_v >= 2) { + decode(ns, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + string to_str() const { + if (instance.empty()) { + return 
name; + } + char buf[name.size() + instance.size() + 16]; + snprintf(buf, sizeof(buf), "%s[%s]", name.c_str(), instance.c_str()); + return buf; + } +}; +WRITE_CLASS_ENCODER(rgw_obj_key) + +inline ostream& operator<<(ostream& out, const rgw_obj_key &o) { + return out << o.to_str(); +} + +inline ostream& operator<<(ostream& out, const rgw_obj_index_key &o) { + if (o.instance.empty()) { + return out << o.name; + } else { + return out << o.name << "[" << o.instance << "]"; + } +} + +struct req_init_state { + /* Keeps [[tenant]:]bucket until we parse the token. */ + string url_bucket; + string src_bucket; +}; + +#include "rgw_auth.h" + +class RGWObjectCtx; +class RGWSysObjectCtx; + +/** Store all the state necessary to complete and respond to an HTTP request*/ +struct req_state : DoutPrefixProvider { + CephContext *cct; + rgw::io::BasicClient *cio{nullptr}; + http_op op{OP_UNKNOWN}; + RGWOpType op_type{}; + bool content_started{false}; + int format{0}; + ceph::Formatter *formatter{nullptr}; + string decoded_uri; + string relative_uri; + const char *length{nullptr}; + int64_t content_length{0}; + map generic_attrs; + rgw_err err; + bool expect_cont{false}; + uint64_t obj_size{0}; + bool enable_ops_log; + bool enable_usage_log; + uint8_t defer_to_bucket_acls; + uint32_t perm_mask{0}; + + /* Set once when url_bucket is parsed and not violated thereafter. 
*/ + string account_name; + + string bucket_tenant; + string bucket_name; + + rgw_bucket bucket; + rgw_obj_key object; + string src_tenant_name; + string src_bucket_name; + rgw_obj_key src_object; + ACLOwner bucket_owner; + ACLOwner owner; + + string zonegroup_name; + string zonegroup_endpoint; + string bucket_instance_id; + int bucket_instance_shard_id{-1}; + string redirect_zone_endpoint; + + string redirect; + + RGWBucketInfo bucket_info; + real_time bucket_mtime; + std::map bucket_attrs; + bool bucket_exists{false}; + rgw_placement_rule dest_placement; + + bool has_bad_meta{false}; + + RGWUserInfo *user; + + struct { + /* TODO(rzarzynski): switch out to the static_ptr for both members. */ + + /* Object having the knowledge about an authenticated identity and allowing + * to apply it during the authorization phase (verify_permission() methods + * of a given RGWOp). Thus, it bounds authentication and authorization steps + * through a well-defined interface. For more details, see rgw_auth.h. */ + std::unique_ptr identity; + + std::shared_ptr completer; + + /* A container for credentials of the S3's browser upload. It's necessary + * because: 1) the ::authenticate() method of auth engines and strategies + * take req_state only; 2) auth strategies live much longer than RGWOps - + * there is no way to pass additional data dependencies through ctors. */ + class { + /* Writer. */ + friend class RGWPostObj_ObjStore_S3; + /* Reader. */ + friend class rgw::auth::s3::AWSBrowserUploadAbstractor; + + std::string access_key; + std::string signature; + std::string x_amz_algorithm; + std::string x_amz_credential; + std::string x_amz_date; + std::string x_amz_security_token; + ceph::bufferlist encoded_policy; + } s3_postobj_creds; + } auth; + + std::unique_ptr user_acl; + std::unique_ptr bucket_acl; + std::unique_ptr object_acl; + + rgw::IAM::Environment env; + boost::optional iam_policy; + vector iam_user_policies; + + /* Is the request made by an user marked as a system one? 
+ * Being system user means we also have the admin status. */ + bool system_request{false}; + + string canned_acl; + bool has_acl_header{false}; + bool local_source{false}; /* source is local */ + + int prot_flags{0}; + + /* Content-Disposition override for TempURL of Swift API. */ + struct { + string override; + string fallback; + } content_disp; + + string host_id; + + req_info info; + req_init_state init_state; + + using Clock = ceph::coarse_real_clock; + Clock::time_point time; + + Clock::duration time_elapsed() const { return Clock::now() - time; } + + RGWObjectCtx *obj_ctx{nullptr}; + RGWSysObjectCtx *sysobj_ctx{nullptr}; + string dialect; + string req_id; + string trans_id; + uint64_t id; + + RGWObjTags tagset; + + bool mfa_verified{false}; + + /// optional coroutine context + optional_yield yield{null_yield}; + + req_state(CephContext* _cct, RGWEnv* e, RGWUserInfo* u, uint64_t id); + ~req_state(); + + bool is_err() const { return err.is_err(); } + + // implements DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const override; + CephContext* get_cct() const override { return cct; } + unsigned get_subsys() const override { return ceph_subsys_rgw; } +}; + +void set_req_state_err(struct req_state*, int); +void set_req_state_err(struct req_state*, int, const string&); +void set_req_state_err(struct rgw_err&, int, const int); +void dump(struct req_state*); + +/** Store basic data on bucket */ +struct RGWBucketEnt { + rgw_bucket bucket; + size_t size; + size_t size_rounded; + ceph::real_time creation_time; + uint64_t count; + + /* The placement_rule is necessary to calculate per-storage-policy statics + * of the Swift API. Although the info available in RGWBucketInfo, we need + * to duplicate it here to not affect the performance of buckets listing. 
*/ + rgw_placement_rule placement_rule; + + RGWBucketEnt() + : size(0), + size_rounded(0), + count(0) { + } + RGWBucketEnt(const RGWBucketEnt&) = default; + RGWBucketEnt(RGWBucketEnt&&) = default; + explicit RGWBucketEnt(const rgw_user& u, cls_user_bucket_entry&& e) + : bucket(u, std::move(e.bucket)), + size(e.size), + size_rounded(e.size_rounded), + creation_time(e.creation_time), + count(e.count) { + } + + RGWBucketEnt& operator=(const RGWBucketEnt&) = default; + + void convert(cls_user_bucket_entry *b) const { + bucket.convert(&b->bucket); + b->size = size; + b->size_rounded = size_rounded; + b->creation_time = creation_time; + b->count = count; + } + + void encode(bufferlist& bl) const { + ENCODE_START(7, 5, bl); + uint64_t s = size; + __u32 mt = ceph::real_clock::to_time_t(creation_time); + string empty_str; // originally had the bucket name here, but we encode bucket later + encode(empty_str, bl); + encode(s, bl); + encode(mt, bl); + encode(count, bl); + encode(bucket, bl); + s = size_rounded; + encode(s, bl); + encode(creation_time, bl); + encode(placement_rule, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + __u32 mt; + uint64_t s; + string empty_str; // backward compatibility + decode(empty_str, bl); + decode(s, bl); + decode(mt, bl); + size = s; + if (struct_v < 6) { + creation_time = ceph::real_clock::from_time_t(mt); + } + if (struct_v >= 2) + decode(count, bl); + if (struct_v >= 3) + decode(bucket, bl); + if (struct_v >= 4) + decode(s, bl); + size_rounded = s; + if (struct_v >= 6) + decode(creation_time, bl); + if (struct_v >= 7) + decode(placement_rule, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(RGWBucketEnt) + +struct rgw_obj { + rgw_bucket bucket; + rgw_obj_key key; + + bool in_extra_data{false}; /* in-memory only member, does not serialize */ + + // Represents the hash index 
source for this object once it is set (non-empty) + std::string index_hash_source; + + rgw_obj() {} + rgw_obj(const rgw_bucket& b, const std::string& name) : bucket(b), key(name) {} + rgw_obj(const rgw_bucket& b, const rgw_obj_key& k) : bucket(b), key(k) {} + rgw_obj(const rgw_bucket& b, const rgw_obj_index_key& k) : bucket(b), key(k) {} + + void init(const rgw_bucket& b, const std::string& name) { + bucket = b; + key.set(name); + } + void init(const rgw_bucket& b, const std::string& name, const string& i, const string& n) { + bucket = b; + key.set(name, i, n); + } + void init_ns(const rgw_bucket& b, const std::string& name, const string& n) { + bucket = b; + key.name = name; + key.instance.clear(); + key.ns = n; + } + + bool empty() const { + return key.empty(); + } + + void set_key(const rgw_obj_key& k) { + key = k; + } + + string get_oid() const { + return key.get_oid(); + } + + const string& get_hash_object() const { + return index_hash_source.empty() ? key.name : index_hash_source; + } + + void set_in_extra_data(bool val) { + in_extra_data = val; + } + + bool is_in_extra_data() const { + return in_extra_data; + } + + void encode(bufferlist& bl) const { + ENCODE_START(6, 6, bl); + encode(bucket, bl); + encode(key.ns, bl); + encode(key.name, bl); + encode(key.instance, bl); +// encode(placement_id, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); + if (struct_v < 6) { + string s; + decode(bucket.name, bl); /* bucket.name */ + decode(s, bl); /* loc */ + decode(key.ns, bl); + decode(key.name, bl); + if (struct_v >= 2) + decode(bucket, bl); + if (struct_v >= 4) + decode(key.instance, bl); + if (key.ns.empty() && key.instance.empty()) { + if (key.name[0] == '_') { + key.name = key.name.substr(1); + } + } else { + if (struct_v >= 5) { + decode(key.name, bl); + } else { + ssize_t pos = key.name.find('_', 1); + if (pos < 0) { + throw buffer::error(); + } + key.name = key.name.substr(pos + 1); + 
} + } + } else { + decode(bucket, bl); + decode(key.ns, bl); + decode(key.name, bl); + decode(key.instance, bl); +// decode(placement_id, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + bool operator==(const rgw_obj& o) const { + return (key == o.key) && + (bucket == o.bucket); + } + bool operator<(const rgw_obj& o) const { + int r = key.name.compare(o.key.name); + if (r == 0) { + r = bucket.bucket_id.compare(o.bucket.bucket_id); /* not comparing bucket.name, if bucket_id is equal so will be bucket.name */ + if (r == 0) { + r = key.ns.compare(o.key.ns); + if (r == 0) { + r = key.instance.compare(o.key.instance); + } + } + } + + return (r < 0); + } + + const rgw_pool& get_explicit_data_pool() { + if (!in_extra_data || bucket.explicit_placement.data_extra_pool.empty()) { + return bucket.explicit_placement.data_pool; + } + return bucket.explicit_placement.data_extra_pool; + } +}; +WRITE_CLASS_ENCODER(rgw_obj) + +struct rgw_cache_entry_info { + string cache_locator; + uint64_t gen; + + rgw_cache_entry_info() : gen(0) {} +}; + +inline ostream& operator<<(ostream& out, const rgw_obj &o) { + return out << o.bucket.name << ":" << o.get_oid(); +} + +static inline void buf_to_hex(const unsigned char* const buf, + const size_t len, + char* const str) +{ + str[0] = '\0'; + for (size_t i = 0; i < len; i++) { + ::sprintf(&str[i*2], "%02x", static_cast(buf[i])); + } +} + +template static inline std::array +buf_to_hex(const std::array& buf) +{ + static_assert(N > 0, "The input array must be at least one element long"); + + std::array hex_dest; + buf_to_hex(buf.data(), N, hex_dest.data()); + return hex_dest; +} + +static inline int hexdigit(char c) +{ + if (c >= '0' && c <= '9') + return (c - '0'); + c = toupper(c); + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xa; + return -EINVAL; +} + +static inline int hex_to_buf(const char *hex, char *buf, int len) +{ + int i = 0; + const char *p = hex; + while (*p) { + 
if (i >= len) + return -EINVAL; + buf[i] = 0; + int d = hexdigit(*p); + if (d < 0) + return d; + buf[i] = d << 4; + p++; + if (!*p) + return -EINVAL; + d = hexdigit(*p); + if (d < 0) + return d; + buf[i] += d; + i++; + p++; + } + return i; +} + +static inline int rgw_str_to_bool(const char *s, int def_val) +{ + if (!s) + return def_val; + + return (strcasecmp(s, "true") == 0 || + strcasecmp(s, "on") == 0 || + strcasecmp(s, "yes") == 0 || + strcasecmp(s, "1") == 0); +} + +static inline void append_rand_alpha(CephContext *cct, const string& src, string& dest, int len) +{ + dest = src; + char buf[len + 1]; + gen_rand_alphanumeric(cct, buf, len); + dest.append("_"); + dest.append(buf); +} + +static inline const char *rgw_obj_category_name(RGWObjCategory category) +{ + switch (category) { + case RGWObjCategory::None: + return "rgw.none"; + case RGWObjCategory::Main: + return "rgw.main"; + case RGWObjCategory::Shadow: + return "rgw.shadow"; + case RGWObjCategory::MultiMeta: + return "rgw.multimeta"; + } + + return "unknown"; +} + +static inline uint64_t rgw_rounded_kb(uint64_t bytes) +{ + return (bytes + 1023) / 1024; +} + +static inline uint64_t rgw_rounded_objsize(uint64_t bytes) +{ + return ((bytes + 4095) & ~4095); +} + +static inline uint64_t rgw_rounded_objsize_kb(uint64_t bytes) +{ + return ((bytes + 4095) & ~4095) / 1024; +} + +/* implement combining step, S3 header canonicalization; k is a + * valid header and in lc form */ +static inline void add_amz_meta_header( + meta_map_t& x_meta_map, + const std::string& k, + const std::string& v) +{ + auto it = x_meta_map.find(k); + if (it != x_meta_map.end()) { + std::string old = it->second; + boost::algorithm::trim_right(old); + old.append(","); + old.append(v); + x_meta_map[k] = old; + } else { + x_meta_map[k] = v; + } +} /* add_amz_meta_header */ + +extern string rgw_string_unquote(const string& s); +extern void parse_csv_string(const string& ival, vector& ovals); +extern int parse_key_value(string& in_str, string& 
key, string& val); +extern int parse_key_value(string& in_str, const char *delim, string& key, string& val); + +extern boost::optional> +parse_key_value(const boost::string_view& in_str, + const boost::string_view& delim); +extern boost::optional> +parse_key_value(const boost::string_view& in_str); + + +/** time parsing */ +extern int parse_time(const char *time_str, real_time *time); +extern bool parse_rfc2616(const char *s, struct tm *t); +extern bool parse_iso8601(const char *s, struct tm *t, uint32_t *pns = NULL, bool extended_format = true); +extern string rgw_trim_whitespace(const string& src); +extern boost::string_view rgw_trim_whitespace(const boost::string_view& src); +extern string rgw_trim_quotes(const string& val); + +extern void rgw_to_iso8601(const real_time& t, char *dest, int buf_size); +extern void rgw_to_iso8601(const real_time& t, string *dest); +extern std::string rgw_to_asctime(const utime_t& t); + +/** Check if the req_state's user has the necessary permissions + * to do the requested action */ +rgw::IAM::Effect eval_user_policies(const vector& user_policies, + const rgw::IAM::Environment& env, + boost::optional id, + const uint64_t op, + const rgw::ARN& arn); +bool verify_user_permission(const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + const vector& user_policies, + const rgw::ARN& res, + const uint64_t op); +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + const int perm); +bool verify_user_permission(const DoutPrefixProvider* dpp, + struct req_state * const s, + const rgw::ARN& res, + const uint64_t op); +bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, + struct req_state * const s, + int perm); +bool verify_bucket_permission( + const DoutPrefixProvider* dpp, + struct req_state * const s, + const rgw_bucket& bucket, + RGWAccessControlPolicy * const user_acl, + 
RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const vector& user_policies, + const uint64_t op); +bool verify_bucket_permission(const DoutPrefixProvider* dpp, struct req_state * const s, const uint64_t op); +bool verify_bucket_permission_no_policy( + const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + const int perm); +bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, + struct req_state * const s, + const int perm); +int verify_bucket_owner_or_policy(struct req_state* const s, + const uint64_t op); +extern bool verify_object_permission( + const DoutPrefixProvider* dpp, + struct req_state * const s, + const rgw_obj& obj, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + const boost::optional& bucket_policy, + const vector& user_policies, + const uint64_t op); +extern bool verify_object_permission(const DoutPrefixProvider* dpp, struct req_state *s, uint64_t op); +extern bool verify_object_permission_no_policy( + const DoutPrefixProvider* dpp, + struct req_state * const s, + RGWAccessControlPolicy * const user_acl, + RGWAccessControlPolicy * const bucket_acl, + RGWAccessControlPolicy * const object_acl, + int perm); +extern bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp, struct req_state *s, + int perm); +/** Convert an input URL into a sane object name + * by converting %-escaped strings into characters, etc*/ +extern void rgw_uri_escape_char(char c, string& dst); +extern std::string url_decode(const boost::string_view& src_str, + bool in_query = false); +extern void url_encode(const std::string& src, string& dst, + bool encode_slash = true); +extern std::string url_encode(const std::string& src, bool encode_slash = true); +/* destination should be CEPH_CRYPTO_HMACSHA1_DIGESTSIZE bytes long */ +extern 
/**
 * Convenience overload: compute the HMAC-SHA1 of @p msg keyed with @p key and
 * return the digest by value.
 *
 * Forwards to the pointer/length overload declared just above, writing the
 * result into the `v` member of a local sha1_digest_t.
 *
 * @param key  key bytes (data()/size() are passed through as-is)
 * @param msg  message bytes (data()/size() are passed through as-is)
 * @return the digest object filled in by the pointer/length overload
 *
 * The callee takes both lengths as `int`, so the size() values are narrowed
 * on the way in.
 * NOTE(review): the cast's target type is missing in this copy of the patch;
 * the callee's last parameter is declared `char *dest` -- confirm against the
 * real header.
 */
static inline sha1_digest_t
calc_hmac_sha1(const boost::string_view& key, const boost::string_view& msg) {
  sha1_digest_t dest;
  calc_hmac_sha1(key.data(), key.size(), msg.data(), msg.size(),
                 reinterpret_cast(dest.v));
  return dest;
}
/**
 * Escape a string so it can be embedded in a delimiter-separated record.
 *
 * Every occurrence of @p esc_char or @p special_char in @p s is prefixed with
 * @p esc_char; all other bytes are copied unchanged.
 *
 * @param s             input string
 * @param esc_char      escape character to insert
 * @param special_char  delimiter that must not appear unescaped in the output
 * @param dest          receives the escaped string (previous content is
 *                      replaced; may alias @p s)
 *
 * The result is built in a std::string rather than a variable-length stack
 * array: VLAs are not standard C++ and put an input-sized buffer on the
 * stack. As a consequence embedded NUL bytes are now preserved instead of
 * truncating the output.
 */
static inline void rgw_escape_str(const std::string& s, char esc_char,
                                  char special_char, std::string *dest)
{
  std::string escaped;
  escaped.reserve(s.size() * 2);  // worst case: every byte needs an escape

  for (const char c : s) {
    if (c == esc_char || c == special_char) {
      escaped.push_back(esc_char);
    }
    escaped.push_back(c);
  }
  // build-then-swap keeps the function correct when dest == &s
  dest->swap(escaped);
}

/**
 * Inverse of rgw_escape_str(): extract one unescaped field from @p s.
 *
 * Scanning starts at @p ofs and stops at the first @p special_char that is
 * not preceded by @p esc_char. An @p esc_char that is not itself escaped is
 * dropped and makes the following byte literal; a trailing lone escape
 * character is dropped.
 *
 * @param s             escaped input
 * @param ofs           offset in @p s to start from (an offset outside the
 *                      string yields an empty field)
 * @param esc_char      escape character
 * @param special_char  field delimiter
 * @param dest          receives the unescaped field (may alias @p s)
 * @return offset just past the delimiter that ended the field, or
 *         (ssize_t)string::npos when the end of @p s was reached first
 */
static inline ssize_t rgw_unescape_str(const std::string& s, ssize_t ofs,
                                       char esc_char, char special_char,
                                       std::string *dest)
{
  std::string field;
  bool esc = false;

  // a negative ofs converts to a huge size_t and simply skips the loop
  for (size_t i = ofs; i < s.size(); i++) {
    const char c = s[i];
    if (!esc && c == esc_char) {
      esc = true;
      continue;
    }
    if (!esc && c == special_char) {
      dest->swap(field);
      return (ssize_t)i + 1;
    }
    field.push_back(c);
    esc = false;
  }
  dest->swap(field);
  return static_cast<ssize_t>(std::string::npos);
}
s.resize(len); + } + return s; +} + +template +int decode_bl(bufferlist& bl, T& t) +{ + auto iter = bl.cbegin(); + try { + decode(t, iter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +#endif diff --git a/src/rgw/rgw_compression.cc b/src/rgw/rgw_compression.cc new file mode 100644 index 00000000..b70f51ad --- /dev/null +++ b/src/rgw/rgw_compression.cc @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_compression.h" + +#define dout_subsys ceph_subsys_rgw + +int rgw_compression_info_from_attr(const bufferlist& attr, + bool& need_decompress, + RGWCompressionInfo& cs_info) +{ + auto bliter = attr.cbegin(); + try { + decode(cs_info, bliter); + } catch (buffer::error& err) { + return -EIO; + } + if (cs_info.blocks.size() == 0) { + return -EIO; + } + if (cs_info.compression_type != "none") + need_decompress = true; + else + need_decompress = false; + return 0; +} + +int rgw_compression_info_from_attrset(const map& attrs, + bool& need_decompress, + RGWCompressionInfo& cs_info) +{ + auto value = attrs.find(RGW_ATTR_COMPRESSION); + if (value == attrs.end()) { + need_decompress = false; + return 0; + } + return rgw_compression_info_from_attr(value->second, need_decompress, cs_info); +} + +//------------RGWPutObj_Compress--------------- + +int RGWPutObj_Compress::process(bufferlist&& in, uint64_t logical_offset) +{ + bufferlist out; + if (in.length() > 0) { + // compression stuff + if ((logical_offset > 0 && compressed) || // if previous part was compressed + (logical_offset == 0)) { // or it's the first part + ldout(cct, 10) << "Compression for rgw is enabled, compress part " << in.length() << dendl; + int cr = compressor->compress(in, out); + if (cr < 0) { + if (logical_offset > 0) { + lderr(cct) << "Compression failed with exit code " << cr + << " for next part, compression process failed" << dendl; + return -EIO; + } + compressed = false; + ldout(cct, 5) << 
"Compression failed with exit code " << cr + << " for first part, storing uncompressed" << dendl; + out.claim(in); + } else { + compressed = true; + + compression_block newbl; + size_t bs = blocks.size(); + newbl.old_ofs = logical_offset; + newbl.new_ofs = bs > 0 ? blocks[bs-1].len + blocks[bs-1].new_ofs : 0; + newbl.len = out.length(); + blocks.push_back(newbl); + } + } else { + compressed = false; + out.claim(in); + } + // end of compression stuff + } + return Pipe::process(std::move(out), logical_offset); +} + +//----------------RGWGetObj_Decompress--------------------- +RGWGetObj_Decompress::RGWGetObj_Decompress(CephContext* cct_, + RGWCompressionInfo* cs_info_, + bool partial_content_, + RGWGetObj_Filter* next): RGWGetObj_Filter(next), + cct(cct_), + cs_info(cs_info_), + partial_content(partial_content_), + q_ofs(0), + q_len(0), + cur_ofs(0) +{ + compressor = Compressor::create(cct, cs_info->compression_type); + if (!compressor.get()) + lderr(cct) << "Cannot load compressor of type " << cs_info->compression_type << dendl; +} + +int RGWGetObj_Decompress::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + ldout(cct, 10) << "Compression for rgw is enabled, decompress part " + << "bl_ofs="<< bl_ofs << bl_len << dendl; + + if (!compressor.get()) { + // if compressor isn't available - error, because cannot return decompressed data? 
/**
 * Translate a requested byte range of the logical (uncompressed) object into
 * the byte range of stored (compressed) data that has to be read for it.
 *
 * Each entry of cs_info->blocks maps a logical offset (old_ofs) to the offset
 * and length of its compressed form (new_ofs, len).
 *
 * @param ofs  in: first requested logical byte; out: new_ofs of the first
 *             block needed
 * @param end  in: last requested logical byte (inclusive); out: last stored
 *             byte of the last block needed
 * @return whatever the next filter's fixup_range() returns for the rewritten
 *         range
 *
 * Side effects: selects first_block/last_block, records in q_ofs/q_len where
 * the requested data starts inside the first block and how many bytes were
 * asked for, resets cur_ofs to the new start offset and clears `waiting`.
 *
 * NOTE(review): first_block/last_block are dereferenced unconditionally, so
 * this assumes cs_info->blocks is non-empty.
 * NOTE(review): the iterator declaration below has lost its template argument
 * in this copy of the patch.
 */
int RGWGetObj_Decompress::fixup_range(off_t& ofs, off_t& end)
{
  if (partial_content) {
    // a range was requested: find the blocks that cover [ofs, end]
    first_block = cs_info->blocks.begin(); last_block = cs_info->blocks.begin();
    if (cs_info->blocks.size() > 1) {
      vector::iterator fb, lb;
      // comparators for the two searches below, keyed on the logical offset
      auto cmp_u = [] (off_t ofs, const compression_block& e) { return (uint64_t)ofs < e.old_ofs; };
      auto cmp_l = [] (const compression_block& e, off_t ofs) { return e.old_ofs <= (uint64_t)ofs; };
      // fb: first block (searching from the second one) that starts after ofs,
      // so the block just before it is the one containing ofs
      fb = upper_bound(cs_info->blocks.begin()+1,
                       cs_info->blocks.end(),
                       ofs,
                       cmp_u);
      first_block = fb - 1;
      // lb: first block at or after fb that starts after end, so the block
      // just before it is the last one that starts at or before end
      lb = lower_bound(fb,
                       cs_info->blocks.end(),
                       end,
                       cmp_l);
      last_block = lb - 1;
    }
  } else {
    // whole object: every block is needed
    first_block = cs_info->blocks.begin(); last_block = cs_info->blocks.end() - 1;
  }

  // offset of the requested data inside the first block once decompressed
  q_ofs = ofs - first_block->old_ofs;
  // number of logical bytes requested (end is inclusive)
  q_len = end + 1 - ofs;

  // rewrite the range in terms of stored (compressed) offsets
  ofs = first_block->new_ofs;
  end = last_block->new_ofs + last_block->len - 1;

  cur_ofs = ofs;
  waiting.clear();

  return next->fixup_range(ofs, end);
}
/**
 * Timer callback used by RGWCompletionManager::wait_interval(): one instance
 * is allocated per timed wait and handed to the manager's timer.
 *
 * When it fires it calls the manager's _wakeup() for the waiter registered
 * under `opaque`; the completion code passed to finish() is ignored.
 *
 * NOTE(review): finish() uses _wakeup(), the variant that does not take the
 * manager lock itself (wakeup() locks and then calls _wakeup()). Presumably
 * the timer invokes callbacks with that lock already held, since the timer is
 * constructed with it -- confirm.
 */
class RGWCompletionManager::WaitContext : public Context {
  RGWCompletionManager *manager;  // manager whose waiter table holds `opaque`
  void *opaque;                   // key the waiter was registered under
public:
  WaitContext(RGWCompletionManager *_cm, void *_opaque) : manager(_cm), opaque(_opaque) {}
  void finish(int r) override {
    manager->_wakeup(opaque);
  }
};
/**
 * Queue a completion so that get_next()/try_get_next() can hand it out.
 *
 * Does not take `lock` itself; the callers visible in this file (complete(),
 * and wakeup() via _wakeup()) acquire it first.
 *
 * @param cn         notifier that produced the completion, or NULL; when set
 *                   it is dropped from the registered-notifier set
 * @param io_id      id of the IO that finished
 * @param user_info  opaque pointer returned to the consumer together with
 *                   io_id
 */
void RGWCompletionManager::_complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info)
{
  if (cn) {
    cns.erase(cn);
  }

  // NOTE(review): nothing in the visible part of this file inserts into
  // complete_reqs_set (get_next()/try_get_next() only erase from it), so this
  // duplicate check may never trigger. Timer wakeups all arrive with
  // rgw_io_id{0, -1}; verify they would not be dropped before populating it.
  if (complete_reqs_set.find(io_id) != complete_reqs_set.end()) {
    /* already have completion for this io_id, don't allow multiple completions for it */
    return;
  }
  complete_reqs.push_back(io_completion{io_id, user_info});
  // get_next() waits on this condition while the queue is empty
  cond.Signal();
}
RGWCoroutine::init_new_io(RGWIOProvider *io_provider) +{ + stack->init_new_io(io_provider); +} + +void RGWCoroutine::set_io_blocked(bool flag) { + stack->set_io_blocked(flag); +} + +void RGWCoroutine::set_sleeping(bool flag) { + stack->set_sleeping(flag); +} + +int RGWCoroutine::io_block(int ret, int64_t io_id) { + return io_block(ret, rgw_io_id{io_id, -1}); +} + +int RGWCoroutine::io_block(int ret, const rgw_io_id& io_id) { + if (stack->consume_io_finish(io_id)) { + return 0; + } + set_io_blocked(true); + stack->set_io_blocked_id(io_id); + return ret; +} + +void RGWCoroutine::io_complete(const rgw_io_id& io_id) { + stack->io_complete(io_id); +} + +void RGWCoroutine::StatusItem::dump(Formatter *f) const { + ::encode_json("timestamp", timestamp, f); + ::encode_json("status", status, f); +} + +stringstream& RGWCoroutine::Status::set_status() +{ + RWLock::WLocker l(lock); + string s = status.str(); + status.str(string()); + if (!timestamp.is_zero()) { + history.push_back(StatusItem(timestamp, s)); + } + if (history.size() > (size_t)max_history) { + history.pop_front(); + } + timestamp = ceph_clock_now(); + + return status; +} + +int64_t RGWCoroutinesManager::get_next_io_id() +{ + return (int64_t)++max_io_id; +} + +RGWCoroutinesStack::RGWCoroutinesStack(CephContext *_cct, RGWCoroutinesManager *_ops_mgr, RGWCoroutine *start) : cct(_cct), ops_mgr(_ops_mgr), + done_flag(false), error_flag(false), blocked_flag(false), + sleep_flag(false), interval_wait_flag(false), is_scheduled(false), is_waiting_for_child(false), + retcode(0), run_count(0), + env(NULL), parent(NULL) +{ + if (start) { + ops.push_back(start); + } + pos = ops.begin(); +} + +RGWCoroutinesStack::~RGWCoroutinesStack() +{ + for (auto op : ops) { + op->put(); + } + + for (auto stack : spawned.entries) { + stack->put(); + } +} + +int RGWCoroutinesStack::operate(RGWCoroutinesEnv *_env) +{ + env = _env; + RGWCoroutine *op = *pos; + op->stack = this; + ldout(cct, 20) << *op << ": operate()" << dendl; + int r = 
op->operate_wrapper(); + if (r < 0) { + ldout(cct, 20) << *op << ": operate() returned r=" << r << dendl; + } + + error_flag = op->is_error(); + + if (op->is_done()) { + int op_retcode = r; + r = unwind(op_retcode); + op->put(); + done_flag = (pos == ops.end()); + blocked_flag &= !done_flag; + if (done_flag) { + retcode = op_retcode; + } + return r; + } + + /* should r ever be negative at this point? */ + ceph_assert(r >= 0); + + return 0; +} + +string RGWCoroutinesStack::error_str() +{ + if (pos != ops.end()) { + return (*pos)->error_str(); + } + return string(); +} + +void RGWCoroutinesStack::call(RGWCoroutine *next_op) { + if (!next_op) { + return; + } + ops.push_back(next_op); + if (pos != ops.end()) { + ++pos; + } else { + pos = ops.begin(); + } +} + +void RGWCoroutinesStack::schedule() +{ + env->manager->schedule(env, this); +} + +void RGWCoroutinesStack::_schedule() +{ + env->manager->_schedule(env, this); +} + +RGWCoroutinesStack *RGWCoroutinesStack::spawn(RGWCoroutine *source_op, RGWCoroutine *op, bool wait) +{ + if (!op) { + return NULL; + } + + rgw_spawned_stacks *s = (source_op ? 
/**
 * Remove the coroutine at the current position from the stack and pass its
 * result to the coroutine before it.
 *
 * @param retcode  result of the coroutine being unwound
 * @return @p retcode when the coroutine at the bottom of the stack was
 *         unwound (the op list is then empty and pos == ops.end()); 0 when
 *         control returned to a previous coroutine
 *
 * The unwound coroutine's spawned stacks are inherited by whoever takes over:
 * the previous coroutine, or the stack itself when nothing is left.
 *
 * This function does not drop the reference on the unwound coroutine; the
 * callers in this file (operate(), cancel()) call put() on it afterwards.
 */
int RGWCoroutinesStack::unwind(int retcode)
{
  // keep a handle on the finishing op's children before pos moves
  rgw_spawned_stacks *src_spawned = &(*pos)->spawned;

  if (pos == ops.begin()) {
    // bottom of the stack: the whole stack is finished
    ldout(cct, 15) << "stack " << (void *)this << " end" << dendl;
    spawned.inherit(src_spawned);
    ops.clear();
    pos = ops.end();
    return retcode;
  }

  // step back to the previous op, then drop the last list entry
  // NOTE(review): pop_back() removes the *last* entry, so this presumably
  // relies on pos always designating the last op -- confirm
  --pos;
  ops.pop_back();
  RGWCoroutine *op = *pos;
  op->set_retcode(retcode);
  op->spawned.inherit(src_spawned);
  return 0;
}
&op->spawned : &spawned); + *ret = 0; + vector new_list; + + for (vector::iterator iter = s->entries.begin(); iter != s->entries.end(); ++iter) { + RGWCoroutinesStack *stack = *iter; + if (stack == skip_stack || !stack->is_done()) { + new_list.push_back(stack); + if (!stack->is_done()) { + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " is still running" << dendl; + } else if (stack == skip_stack) { + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " explicitly skipping stack" << dendl; + } + continue; + } + int r = stack->get_ret_status(); + stack->put(); + if (r < 0) { + *ret = r; + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " encountered error (r=" << r << "), skipping next stacks" << dendl; + new_list.insert(new_list.end(), ++iter, s->entries.end()); + need_retry = (iter != s->entries.end()); + break; + } + + ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " is complete" << dendl; + } + + s->entries.swap(new_list); + return need_retry; +} + +bool RGWCoroutinesStack::collect_next(RGWCoroutine *op, int *ret, RGWCoroutinesStack **collected_stack) /* returns true if found a stack to collect */ +{ + rgw_spawned_stacks *s = (op ? 
&op->spawned : &spawned); + *ret = 0; + + if (collected_stack) { + *collected_stack = NULL; + } + + for (vector::iterator iter = s->entries.begin(); iter != s->entries.end(); ++iter) { + RGWCoroutinesStack *stack = *iter; + if (!stack->is_done()) { + continue; + } + int r = stack->get_ret_status(); + if (r < 0) { + *ret = r; + } + + if (collected_stack) { + *collected_stack = stack; + } + stack->put(); + + s->entries.erase(iter); + return true; + } + + return false; +} + +bool RGWCoroutinesStack::collect(int *ret, RGWCoroutinesStack *skip_stack) /* returns true if needs to be called again */ +{ + return collect(NULL, ret, skip_stack); +} + +static void _aio_completion_notifier_cb(librados::completion_t cb, void *arg) +{ + (static_cast(arg))->cb(); +} + +RGWAioCompletionNotifier::RGWAioCompletionNotifier(RGWCompletionManager *_mgr, const rgw_io_id& _io_id, void *_user_data) : completion_mgr(_mgr), + io_id(_io_id), + user_data(_user_data), lock("RGWAioCompletionNotifier"), registered(true) { + c = librados::Rados::aio_create_completion((void *)this, NULL, + _aio_completion_notifier_cb); +} + +RGWAioCompletionNotifier *RGWCoroutinesStack::create_completion_notifier() +{ + return ops_mgr->create_completion_notifier(this); +} + +RGWCompletionManager *RGWCoroutinesStack::get_completion_mgr() +{ + return ops_mgr->get_completion_mgr(); +} + +bool RGWCoroutinesStack::unblock_stack(RGWCoroutinesStack **s) +{ + if (blocking_stacks.empty()) { + return false; + } + + set::iterator iter = blocking_stacks.begin(); + *s = *iter; + blocking_stacks.erase(iter); + (*s)->blocked_by_stack.erase(this); + + return true; +} + +void RGWCoroutinesManager::report_error(RGWCoroutinesStack *op) +{ + if (!op) { + return; + } + string err = op->error_str(); + if (err.empty()) { + return; + } + lderr(cct) << "ERROR: failed operation: " << op->error_str() << dendl; +} + +void RGWCoroutinesStack::dump(Formatter *f) const { + stringstream ss; + ss << (void *)this; + ::encode_json("stack", ss.str(), 
f); + ::encode_json("run_count", run_count, f); + f->open_array_section("ops"); + for (auto& i : ops) { + encode_json("op", *i, f); + } + f->close_section(); +} + +void RGWCoroutinesStack::init_new_io(RGWIOProvider *io_provider) +{ + io_provider->set_io_user_info((void *)this); + io_provider->assign_io(env->manager->get_io_id_provider()); +} + +bool RGWCoroutinesStack::try_io_unblock(const rgw_io_id& io_id) +{ + if (!can_io_unblock(io_id)) { + auto p = io_finish_ids.emplace(io_id.id, io_id); + auto& iter = p.first; + bool inserted = p.second; + if (!inserted) { /* could not insert, entry already existed, add channel to completion mask */ + iter->second.channels |= io_id.channels; + } + return false; + } + + return true; +} + +bool RGWCoroutinesStack::consume_io_finish(const rgw_io_id& io_id) +{ + auto iter = io_finish_ids.find(io_id.id); + if (iter == io_finish_ids.end()) { + return false; + } + int finish_mask = iter->second.channels; + bool found = (finish_mask & io_id.channels) != 0; + + finish_mask &= ~(finish_mask & io_id.channels); + + if (finish_mask == 0) { + io_finish_ids.erase(iter); + } + return found; +} + + +void RGWCoroutinesManager::handle_unblocked_stack(set& context_stacks, list& scheduled_stacks, + RGWCompletionManager::io_completion& io, int *blocked_count) +{ + ceph_assert(lock.is_wlocked()); + RGWCoroutinesStack *stack = static_cast(io.user_info); + if (context_stacks.find(stack) == context_stacks.end()) { + return; + } + if (!stack->try_io_unblock(io.io_id)) { + return; + } + if (stack->is_io_blocked()) { + --(*blocked_count); + stack->set_io_blocked(false); + } + stack->set_interval_wait(false); + if (!stack->is_done()) { + if (!stack->is_scheduled) { + scheduled_stacks.push_back(stack); + stack->set_is_scheduled(true); + } + } else { + context_stacks.erase(stack); + stack->put(); + } +} + +void RGWCoroutinesManager::schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack) +{ + RWLock::WLocker wl(lock); + _schedule(env, stack); +} + +void 
/**
 * Drive a set of coroutine stacks until all of them have finished, or until
 * the manager is going down.
 *
 * The given stacks are registered under a fresh run context and put on a
 * local scheduling queue. Each round pops one stack, runs one step of it with
 * the manager lock released, and then classifies it: IO-blocked, blocked on
 * another stack / sleeping, done, or runnable (re-queued). Completions
 * delivered by the completion manager put the stacks they belong to back on
 * the queue.
 *
 * @param stacks  stacks to run
 * @return the last value stored in `ret`: the result of the most recent
 *         stack->operate() or completion_mgr->get_next() call, or -ECANCELED
 *         when the loop stopped because of going_down while waiting
 *
 * On exit every stack still registered in the run context is cancelled and
 * the run context is removed. If stacks remain although the manager is not
 * going down, their state is dumped and the function asserts (deadlock).
 *
 * NOTE(review): the template arguments of the list/set types below are
 * missing in this copy of the patch.
 */
int RGWCoroutinesManager::run(list& stacks)
{
  int ret = 0;
  int blocked_count = 0;        // stacks currently counted as waiting for a completion
  int interval_wait_count = 0;  // of those, stacks in a timed wait (no real IO)
  bool canceled = false; // set on going_down
  RGWCoroutinesEnv env;
  bool op_not_blocked;

  uint64_t run_context = ++run_context_count;

  lock.get_write();
  set& context_stacks = run_contexts[run_context];
  list scheduled_stacks;
  for (auto& st : stacks) {
    context_stacks.insert(st);
    scheduled_stacks.push_back(st);
    st->set_is_scheduled(true);
  }
  env.run_context = run_context;
  env.manager = this;
  env.scheduled_stacks = &scheduled_stacks;

  for (list::iterator iter = scheduled_stacks.begin(); iter != scheduled_stacks.end() && !going_down;) {
    RGWCompletionManager::io_completion io;
    RGWCoroutinesStack *stack = *iter;
    // advance before popping so iter never refers to the removed front entry
    ++iter;
    scheduled_stacks.pop_front();

    if (context_stacks.find(stack) == context_stacks.end()) {
      /* stack was probably scheduled more than once due to IO, but has since completed */
      goto next;
    }
    env.stack = stack;

    // run one step of the stack without holding the manager lock
    lock.unlock();

    ret = stack->operate(&env);

    lock.get_write();

    stack->set_is_scheduled(false);
    if (ret < 0) {
      ldout(cct, 20) << "stack->operate() returned ret=" << ret << dendl;
    }

    if (stack->is_error()) {
      report_error(stack);
    }

    op_not_blocked = false;

    if (stack->is_io_blocked()) {
      ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is io blocked" << dendl;
      // NOTE(review): interval_wait_count is only ever incremented in this
      // function, and handle_unblocked_stack() receives only blocked_count --
      // confirm the difference used for the window check below stays accurate
      if (stack->is_interval_waiting()) {
        interval_wait_count++;
      }
      blocked_count++;
    } else if (stack->is_blocked()) {
      /* do nothing, we'll re-add the stack when the blocking stack is done,
       * or when we're awakened
       */
      ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is_blocked_by_stack()=" << stack->is_blocked_by_stack()
                     << " is_sleeping=" << stack->is_sleeping() << " waiting_for_child()=" << stack->waiting_for_child() << dendl;
    } else if (stack->is_done()) {
      ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is done" << dendl;
      // release every stack that was blocked on the one that just finished
      RGWCoroutinesStack *s;
      while (stack->unblock_stack(&s)) {
        if (!s->is_blocked_by_stack() && !s->is_done()) {
          if (s->is_io_blocked()) {
            // NOTE(review): this tests `stack` (the finished stack) rather
            // than `s` (the stack being released) -- confirm which was meant
            if (stack->is_interval_waiting()) {
              interval_wait_count++;
            }
            blocked_count++;
          } else {
            s->_schedule();
          }
        }
      }
      if (stack->parent && stack->parent->waiting_for_child()) {
        stack->parent->set_wait_for_child(false);
        stack->parent->_schedule();
      }
      context_stacks.erase(stack);
      stack->put();
      stack = NULL;
    } else {
      // still runnable: queue it again
      op_not_blocked = true;
      stack->run_count++;
      stack->_schedule();
    }

    if (!op_not_blocked && stack) {
      stack->run_count = 0;
    }

    // pick up every completion that is already available, without blocking
    while (completion_mgr->try_get_next(&io)) {
      handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count);
    }

    /*
     * only account blocked operations that are not in interval_wait, these are stacks that
     * were put on a wait without any real IO operations. While we mark these as io_blocked,
     * these aren't really waiting for IOs
     */
    while (blocked_count - interval_wait_count >= ops_window) {
      lock.unlock();
      ret = completion_mgr->get_next(&io);
      lock.get_write();
      if (ret < 0) {
        ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
      }
      // NOTE(review): when get_next() fails, `io` was not filled in by that
      // call, yet it is still passed on here; unlike the loop after `next:`
      // there is no going_down check -- confirm this cannot spin or act on a
      // stale completion
      handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count);
    }

next:
    // nothing runnable but stacks are waiting: block until a completion arrives
    while (scheduled_stacks.empty() && blocked_count > 0) {
      lock.unlock();
      ret = completion_mgr->get_next(&io);
      lock.get_write();
      if (ret < 0) {
        ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
      }
      if (going_down) {
        ldout(cct, 5) << __func__ << "(): was stopped, exiting" << dendl;
        ret = -ECANCELED;
        canceled = true;
        break;
      }
      handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count);
      iter = scheduled_stacks.begin();
    }
    if (canceled) {
      break;
    }

    // entries may have been queued after iter reached the end; restart from the front
    if (iter == scheduled_stacks.end()) {
      iter = scheduled_stacks.begin();
    }
  }

  if (!context_stacks.empty() && !going_down) {
    JSONFormatter formatter(true);
    formatter.open_array_section("context_stacks");
    for (auto& s : context_stacks) {
      ::encode_json("entry", *s, &formatter);
    }
    formatter.close_section();
    lderr(cct) << __func__ << "(): ERROR: deadlock detected, dumping remaining coroutines:\n";
    formatter.flush(*_dout);
    *_dout << dendl;
    ceph_assert(context_stacks.empty() || going_down); // assert on deadlock
  }

  for (auto stack : context_stacks) {
    ldout(cct, 20) << "clearing stack on run() exit: stack=" << (void *)stack << " nref=" << stack->get_nref() << dendl;
    stack->cancel();
  }
  run_contexts.erase(run_context);
  lock.unlock();

  return ret;
}
"run(stacks) returned r=" << r << dendl; + } else { + r = op->get_ret_status(); + } + op->put(); + + return r; +} + +RGWAioCompletionNotifier *RGWCoroutinesManager::create_completion_notifier(RGWCoroutinesStack *stack) +{ + rgw_io_id io_id{get_next_io_id(), -1}; + RGWAioCompletionNotifier *cn = new RGWAioCompletionNotifier(completion_mgr, io_id, (void *)stack); + completion_mgr->register_completion_notifier(cn); + return cn; +} + +void RGWCoroutinesManager::dump(Formatter *f) const { + RWLock::RLocker rl(lock); + + f->open_array_section("run_contexts"); + for (auto& i : run_contexts) { + f->open_object_section("context"); + ::encode_json("id", i.first, f); + f->open_array_section("entries"); + for (auto& s : i.second) { + ::encode_json("entry", *s, f); + } + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +RGWCoroutinesStack *RGWCoroutinesManager::allocate_stack() { + return new RGWCoroutinesStack(cct, this); +} + +string RGWCoroutinesManager::get_id() +{ + if (!id.empty()) { + return id; + } + stringstream ss; + ss << (void *)this; + return ss.str(); +} + +void RGWCoroutinesManagerRegistry::add(RGWCoroutinesManager *mgr) +{ + RWLock::WLocker wl(lock); + if (managers.find(mgr) == managers.end()) { + managers.insert(mgr); + get(); + } +} + +void RGWCoroutinesManagerRegistry::remove(RGWCoroutinesManager *mgr) +{ + RWLock::WLocker wl(lock); + if (managers.find(mgr) != managers.end()) { + managers.erase(mgr); + put(); + } +} + +RGWCoroutinesManagerRegistry::~RGWCoroutinesManagerRegistry() +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + if (!admin_command.empty()) { + admin_socket->unregister_command(admin_command); + } +} + +int RGWCoroutinesManagerRegistry::hook_to_admin_command(const string& command) +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + if (!admin_command.empty()) { + admin_socket->unregister_command(admin_command); + } + admin_command = command; + int r = admin_socket->register_command(admin_command, 
admin_command, this, + "dump current coroutines stack state"); + if (r < 0) { + lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl; + return r; + } + return 0; +} + +bool RGWCoroutinesManagerRegistry::call(std::string_view command, + const cmdmap_t& cmdmap, + std::string_view format, + bufferlist& out) { + RWLock::RLocker rl(lock); + stringstream ss; + JSONFormatter f; + ::encode_json("cr_managers", *this, &f); + f.flush(ss); + out.append(ss); + return true; +} + +void RGWCoroutinesManagerRegistry::dump(Formatter *f) const { + f->open_array_section("coroutine_managers"); + for (auto m : managers) { + ::encode_json("entry", *m, f); + } + f->close_section(); +} + +void RGWCoroutine::call(RGWCoroutine *op) +{ + if (op) { + stack->call(op); + } else { + // the call()er expects this to set a retcode + retcode = 0; + } +} + +RGWCoroutinesStack *RGWCoroutine::spawn(RGWCoroutine *op, bool wait) +{ + return stack->spawn(this, op, wait); +} + +bool RGWCoroutine::collect(int *ret, RGWCoroutinesStack *skip_stack) /* returns true if needs to be called again */ +{ + return stack->collect(this, ret, skip_stack); +} + +bool RGWCoroutine::collect_next(int *ret, RGWCoroutinesStack **collected_stack) /* returns true if found a stack to collect */ +{ + return stack->collect_next(this, ret, collected_stack); +} + +int RGWCoroutine::wait(const utime_t& interval) +{ + return stack->wait(interval); +} + +void RGWCoroutine::wait_for_child() +{ + /* should only wait for child if there is a child that is not done yet, and no complete children */ + if (spawned.entries.empty()) { + return; + } + for (vector::iterator iter = spawned.entries.begin(); iter != spawned.entries.end(); ++iter) { + if ((*iter)->is_done()) { + return; + } + } + stack->set_wait_for_child(true); +} + +string RGWCoroutine::to_str() const +{ + return typeid(*this).name(); +} + +ostream& operator<<(ostream& out, const RGWCoroutine& cr) +{ + out << "cr:s=" << (void *)cr.get_stack() << 
":op=" << (void *)&cr << ":" << typeid(cr).name(); + return out; +} + +bool RGWCoroutine::drain_children(int num_cr_left, RGWCoroutinesStack *skip_stack) +{ + bool done = false; + ceph_assert(num_cr_left >= 0); + if (num_cr_left == 0 && skip_stack) { + num_cr_left = 1; + } + reenter(&drain_cr) { + while (num_spawned() > (size_t)num_cr_left) { + yield wait_for_child(); + int ret; + while (collect(&ret, skip_stack)) { + if (ret < 0) { + ldout(cct, 10) << "collect() returned ret=" << ret << dendl; + /* we should have reported this error */ + log_error() << "ERROR: collect() returned error (ret=" << ret << ")"; + } + } + } + done = true; + } + return done; +} + +void RGWCoroutine::wakeup() +{ + stack->wakeup(); +} + +RGWCoroutinesEnv *RGWCoroutine::get_env() const +{ + return stack->get_env(); +} + +void RGWCoroutine::dump(Formatter *f) const { + if (!description.str().empty()) { + encode_json("description", description.str(), f); + } + encode_json("type", to_str(), f); + if (!spawned.entries.empty()) { + f->open_array_section("spawned"); + for (auto& i : spawned.entries) { + char buf[32]; + snprintf(buf, sizeof(buf), "%p", (void *)i); + encode_json("stack", string(buf), f); + } + f->close_section(); + } + if (!status.history.empty()) { + encode_json("history", status.history, f); + } + + if (!status.status.str().empty()) { + f->open_object_section("status"); + encode_json("status", status.status.str(), f); + encode_json("timestamp", status.timestamp, f); + f->close_section(); + } +} + +RGWSimpleCoroutine::~RGWSimpleCoroutine() +{ + if (!called_cleanup) { + request_cleanup(); + } +} + +void RGWSimpleCoroutine::call_cleanup() +{ + called_cleanup = true; + request_cleanup(); +} + +int RGWSimpleCoroutine::operate() +{ + int ret = 0; + reenter(this) { + yield return state_init(); + yield return state_send_request(); + yield return state_request_complete(); + yield return state_all_complete(); + drain_all(); + call_cleanup(); + return set_state(RGWCoroutine_Done, ret); + } 
+ return 0; +} + +int RGWSimpleCoroutine::state_init() +{ + int ret = init(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return 0; +} + +int RGWSimpleCoroutine::state_send_request() +{ + int ret = send_request(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return io_block(0); +} + +int RGWSimpleCoroutine::state_request_complete() +{ + int ret = request_complete(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return 0; +} + +int RGWSimpleCoroutine::state_all_complete() +{ + int ret = finish(); + if (ret < 0) { + call_cleanup(); + return set_state(RGWCoroutine_Error, ret); + } + return 0; +} + + diff --git a/src/rgw/rgw_coroutine.h b/src/rgw/rgw_coroutine.h new file mode 100644 index 00000000..e8173b3f --- /dev/null +++ b/src/rgw/rgw_coroutine.h @@ -0,0 +1,674 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_COROUTINE_H +#define CEPH_RGW_COROUTINE_H + +#ifdef _ASSERT_H +#define NEED_ASSERT_H +#pragma push_macro("_ASSERT_H") +#endif + +#include +#include + +#ifdef NEED_ASSERT_H +#pragma pop_macro("_ASSERT_H") +#endif + +#include "include/utime.h" +#include "common/RefCountedObj.h" +#include "common/debug.h" +#include "common/Timer.h" +#include "common/admin_socket.h" + +#include "rgw_common.h" +#include + +#include + +#define RGW_ASYNC_OPS_MGR_WINDOW 100 + +class RGWCoroutinesStack; +class RGWCoroutinesManager; +class RGWAioCompletionNotifier; + +class RGWCompletionManager : public RefCountedObject { + friend class RGWCoroutinesManager; + + CephContext *cct; + + struct io_completion { + rgw_io_id io_id; + void *user_info; + }; + list complete_reqs; + set complete_reqs_set; + using NotifierRef = boost::intrusive_ptr; + set cns; + + Mutex lock; + Cond cond; + + SafeTimer timer; + + std::atomic going_down = { false }; + + map waiters; + + class WaitContext; + +protected: + 
void _wakeup(void *opaque); + void _complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info); +public: + explicit RGWCompletionManager(CephContext *_cct); + ~RGWCompletionManager() override; + + void complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info); + int get_next(io_completion *io); + bool try_get_next(io_completion *io); + + void go_down(); + + /* + * wait for interval length to complete user_info + */ + void wait_interval(void *opaque, const utime_t& interval, void *user_info); + void wakeup(void *opaque); + + void register_completion_notifier(RGWAioCompletionNotifier *cn); + void unregister_completion_notifier(RGWAioCompletionNotifier *cn); +}; + +/* a single use librados aio completion notifier that hooks into the RGWCompletionManager */ +class RGWAioCompletionNotifier : public RefCountedObject { + librados::AioCompletion *c; + RGWCompletionManager *completion_mgr; + rgw_io_id io_id; + void *user_data; + Mutex lock; + bool registered; + +public: + RGWAioCompletionNotifier(RGWCompletionManager *_mgr, const rgw_io_id& _io_id, void *_user_data); + ~RGWAioCompletionNotifier() override { + c->release(); + lock.Lock(); + bool need_unregister = registered; + if (registered) { + completion_mgr->get(); + } + registered = false; + lock.Unlock(); + if (need_unregister) { + completion_mgr->unregister_completion_notifier(this); + completion_mgr->put(); + } + } + + librados::AioCompletion *completion() { + return c; + } + + void unregister() { + Mutex::Locker l(lock); + if (!registered) { + return; + } + registered = false; + } + + void cb() { + lock.Lock(); + if (!registered) { + lock.Unlock(); + put(); + return; + } + completion_mgr->get(); + registered = false; + lock.Unlock(); + completion_mgr->complete(this, io_id, user_data); + completion_mgr->put(); + put(); + } +}; + +// completion notifier with opaque payload (ie a reference-counted pointer) +template +class RGWAioCompletionNotifierWith : public 
RGWAioCompletionNotifier { + T value; +public: + RGWAioCompletionNotifierWith(RGWCompletionManager *mgr, + const rgw_io_id& io_id, void *user_data, + T value) + : RGWAioCompletionNotifier(mgr, io_id, user_data), value(std::move(value)) + {} +}; + +struct RGWCoroutinesEnv { + uint64_t run_context; + RGWCoroutinesManager *manager; + list *scheduled_stacks; + RGWCoroutinesStack *stack; + + RGWCoroutinesEnv() : run_context(0), manager(NULL), scheduled_stacks(NULL), stack(NULL) {} +}; + +enum RGWCoroutineState { + RGWCoroutine_Error = -2, + RGWCoroutine_Done = -1, + RGWCoroutine_Run = 0, +}; + +struct rgw_spawned_stacks { + vector entries; + + rgw_spawned_stacks() {} + + void add_pending(RGWCoroutinesStack *s) { + entries.push_back(s); + } + + void inherit(rgw_spawned_stacks *source) { + for (vector::iterator iter = source->entries.begin(); + iter != source->entries.end(); ++iter) { + add_pending(*iter); + } + source->entries.clear(); + } +}; + + + +class RGWCoroutine : public RefCountedObject, public boost::asio::coroutine { + friend class RGWCoroutinesStack; + + struct StatusItem { + utime_t timestamp; + string status; + + StatusItem(utime_t& t, const string& s) : timestamp(t), status(s) {} + + void dump(Formatter *f) const; + }; + +#define MAX_COROUTINE_HISTORY 10 + + struct Status { + CephContext *cct; + RWLock lock; + int max_history; + + utime_t timestamp; + stringstream status; + + explicit Status(CephContext *_cct) : cct(_cct), lock("RGWCoroutine::Status::lock"), max_history(MAX_COROUTINE_HISTORY) {} + + deque history; + + stringstream& set_status(); + } status; + + stringstream description; + +protected: + bool _yield_ret; + boost::asio::coroutine drain_cr; + + CephContext *cct; + + RGWCoroutinesStack *stack; + int retcode; + int state; + + rgw_spawned_stacks spawned; + + stringstream error_stream; + + int set_state(int s, int ret = 0) { + retcode = ret; + state = s; + return ret; + } + int set_cr_error(int ret) { + return set_state(RGWCoroutine_Error, ret); + 
} + int set_cr_done() { + return set_state(RGWCoroutine_Done, 0); + } + void set_io_blocked(bool flag); + + void reset_description() { + description.str(string()); + } + + stringstream& set_description() { + return description; + } + stringstream& set_status() { + return status.set_status(); + } + + stringstream& set_status(const string& s) { + stringstream& status = set_status(); + status << s; + return status; + } + + virtual int operate_wrapper() { + return operate(); + } +public: + RGWCoroutine(CephContext *_cct) : status(_cct), _yield_ret(false), cct(_cct), stack(NULL), retcode(0), state(RGWCoroutine_Run) {} + ~RGWCoroutine() override; + + virtual int operate() = 0; + + bool is_done() { return (state == RGWCoroutine_Done || state == RGWCoroutine_Error); } + bool is_error() { return (state == RGWCoroutine_Error); } + + stringstream& log_error() { return error_stream; } + string error_str() { + return error_stream.str(); + } + + void set_retcode(int r) { + retcode = r; + } + + int get_ret_status() { + return retcode; + } + + void call(RGWCoroutine *op); /* call at the same stack we're in */ + RGWCoroutinesStack *spawn(RGWCoroutine *op, bool wait); /* execute on a different stack */ + bool collect(int *ret, RGWCoroutinesStack *skip_stack); /* returns true if needs to be called again */ + bool collect_next(int *ret, RGWCoroutinesStack **collected_stack = NULL); /* returns true if found a stack to collect */ + + int wait(const utime_t& interval); + bool drain_children(int num_cr_left, RGWCoroutinesStack *skip_stack = NULL); /* returns true if needed to be called again */ + void wakeup(); + void set_sleeping(bool flag); /* put in sleep, or wakeup from sleep */ + + size_t num_spawned() { + return spawned.entries.size(); + } + + void wait_for_child(); + + virtual string to_str() const; + + RGWCoroutinesStack *get_stack() const { + return stack; + } + + RGWCoroutinesEnv *get_env() const; + + void dump(Formatter *f) const; + + void init_new_io(RGWIOProvider 
*io_provider); /* only links the default io id */ + + int io_block(int ret = 0) { + return io_block(ret, -1); + } + int io_block(int ret, int64_t io_id); + int io_block(int ret, const rgw_io_id& io_id); + void io_complete() { + io_complete(rgw_io_id{}); + } + void io_complete(const rgw_io_id& io_id); +}; + +ostream& operator<<(ostream& out, const RGWCoroutine& cr); + +#define yield_until_true(x) \ +do { \ + do { \ + yield _yield_ret = x; \ + } while (!_yield_ret); \ + _yield_ret = false; \ +} while (0) + +#define drain_all() \ + drain_cr = boost::asio::coroutine(); \ + yield_until_true(drain_children(0)) + +#define drain_all_but(n) \ + drain_cr = boost::asio::coroutine(); \ + yield_until_true(drain_children(n)) + +#define drain_all_but_stack(stack) \ + drain_cr = boost::asio::coroutine(); \ + yield_until_true(drain_children(1, stack)) + +template +class RGWConsumerCR : public RGWCoroutine { + list product; + +public: + explicit RGWConsumerCR(CephContext *_cct) : RGWCoroutine(_cct) {} + + bool has_product() { + return !product.empty(); + } + + void wait_for_product() { + if (!has_product()) { + set_sleeping(true); + } + } + + bool consume(T *p) { + if (product.empty()) { + return false; + } + *p = product.front(); + product.pop_front(); + return true; + } + + void receive(const T& p, bool wakeup = true); + void receive(list& l, bool wakeup = true); +}; + +class RGWCoroutinesStack : public RefCountedObject { + friend class RGWCoroutine; + friend class RGWCoroutinesManager; + + CephContext *cct; + + RGWCoroutinesManager *ops_mgr; + + list ops; + list::iterator pos; + + rgw_spawned_stacks spawned; + + set blocked_by_stack; + set blocking_stacks; + + map io_finish_ids; + rgw_io_id io_blocked_id; + + bool done_flag; + bool error_flag; + bool blocked_flag; + bool sleep_flag; + bool interval_wait_flag; + + bool is_scheduled; + + bool is_waiting_for_child; + + int retcode; + + uint64_t run_count; + +protected: + RGWCoroutinesEnv *env; + RGWCoroutinesStack *parent; + + 
RGWCoroutinesStack *spawn(RGWCoroutine *source_op, RGWCoroutine *next_op, bool wait); + bool collect(RGWCoroutine *op, int *ret, RGWCoroutinesStack *skip_stack); /* returns true if needs to be called again */ + bool collect_next(RGWCoroutine *op, int *ret, RGWCoroutinesStack **collected_stack); /* returns true if found a stack to collect */ +public: + RGWCoroutinesStack(CephContext *_cct, RGWCoroutinesManager *_ops_mgr, RGWCoroutine *start = NULL); + ~RGWCoroutinesStack() override; + + int operate(RGWCoroutinesEnv *env); + + bool is_done() { + return done_flag; + } + bool is_error() { + return error_flag; + } + bool is_blocked_by_stack() { + return !blocked_by_stack.empty(); + } + void set_io_blocked(bool flag) { + blocked_flag = flag; + } + void set_io_blocked_id(const rgw_io_id& io_id) { + io_blocked_id = io_id; + } + bool is_io_blocked() { + return blocked_flag && !done_flag; + } + bool can_io_unblock(const rgw_io_id& io_id) { + return ((io_blocked_id.id < 0) || + io_blocked_id.intersects(io_id)); + } + bool try_io_unblock(const rgw_io_id& io_id); + bool consume_io_finish(const rgw_io_id& io_id); + void set_interval_wait(bool flag) { + interval_wait_flag = flag; + } + bool is_interval_waiting() { + return interval_wait_flag; + } + void set_sleeping(bool flag) { + bool wakeup = sleep_flag & !flag; + sleep_flag = flag; + if (wakeup) { + schedule(); + } + } + bool is_sleeping() { + return sleep_flag; + } + void set_is_scheduled(bool flag) { + is_scheduled = flag; + } + + bool is_blocked() { + return is_blocked_by_stack() || is_sleeping() || + is_io_blocked() || waiting_for_child() ; + } + + void schedule(); + void _schedule(); + + int get_ret_status() { + return retcode; + } + + string error_str(); + + void call(RGWCoroutine *next_op); + RGWCoroutinesStack *spawn(RGWCoroutine *next_op, bool wait); + int unwind(int retcode); + + int wait(const utime_t& interval); + void wakeup(); + void io_complete() { + io_complete(rgw_io_id{}); + } + void io_complete(const 
rgw_io_id& io_id); + + bool collect(int *ret, RGWCoroutinesStack *skip_stack); /* returns true if needs to be called again */ + + void cancel(); + + RGWAioCompletionNotifier *create_completion_notifier(); + template + RGWAioCompletionNotifier *create_completion_notifier(T value); + RGWCompletionManager *get_completion_mgr(); + + void set_blocked_by(RGWCoroutinesStack *s) { + blocked_by_stack.insert(s); + s->blocking_stacks.insert(this); + } + + void set_wait_for_child(bool flag) { + is_waiting_for_child = flag; + } + + bool waiting_for_child() { + return is_waiting_for_child; + } + + bool unblock_stack(RGWCoroutinesStack **s); + + RGWCoroutinesEnv *get_env() const { return env; } + + void dump(Formatter *f) const; + + void init_new_io(RGWIOProvider *io_provider); +}; + +template +void RGWConsumerCR::receive(list& l, bool wakeup) +{ + product.splice(product.end(), l); + if (wakeup) { + set_sleeping(false); + } +} + + +template +void RGWConsumerCR::receive(const T& p, bool wakeup) +{ + product.push_back(p); + if (wakeup) { + set_sleeping(false); + } +} + +class RGWCoroutinesManagerRegistry : public RefCountedObject, public AdminSocketHook { + CephContext *cct; + + set managers; + RWLock lock; + + string admin_command; + +public: + explicit RGWCoroutinesManagerRegistry(CephContext *_cct) : cct(_cct), lock("RGWCoroutinesRegistry::lock") {} + ~RGWCoroutinesManagerRegistry() override; + + void add(RGWCoroutinesManager *mgr); + void remove(RGWCoroutinesManager *mgr); + + int hook_to_admin_command(const string& command); + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override; + + void dump(Formatter *f) const; +}; + +class RGWCoroutinesManager { + CephContext *cct; + std::atomic going_down = { false }; + + std::atomic run_context_count = { 0 }; + map > run_contexts; + + std::atomic max_io_id = { 0 }; + + RWLock lock; + + RGWIOIDProvider io_id_provider; + + void handle_unblocked_stack(set& context_stacks, list& 
scheduled_stacks, + RGWCompletionManager::io_completion& io, int *waiting_count); +protected: + RGWCompletionManager *completion_mgr; + RGWCoroutinesManagerRegistry *cr_registry; + + int ops_window; + + string id; + + void put_completion_notifier(RGWAioCompletionNotifier *cn); +public: + RGWCoroutinesManager(CephContext *_cct, RGWCoroutinesManagerRegistry *_cr_registry) : cct(_cct), lock("RGWCoroutinesManager::lock"), + cr_registry(_cr_registry), ops_window(RGW_ASYNC_OPS_MGR_WINDOW) { + completion_mgr = new RGWCompletionManager(cct); + if (cr_registry) { + cr_registry->add(this); + } + } + virtual ~RGWCoroutinesManager() { + stop(); + completion_mgr->put(); + if (cr_registry) { + cr_registry->remove(this); + } + } + + int run(list& ops); + int run(RGWCoroutine *op); + void stop() { + bool expected = false; + if (going_down.compare_exchange_strong(expected, true)) { + completion_mgr->go_down(); + } + } + + virtual void report_error(RGWCoroutinesStack *op); + + RGWAioCompletionNotifier *create_completion_notifier(RGWCoroutinesStack *stack); + template + RGWAioCompletionNotifier *create_completion_notifier(RGWCoroutinesStack *stack, T value); + RGWCompletionManager *get_completion_mgr() { return completion_mgr; } + + void schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack); + void _schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack); + RGWCoroutinesStack *allocate_stack(); + + int64_t get_next_io_id(); + + void set_sleeping(RGWCoroutine *cr, bool flag); + void io_complete(RGWCoroutine *cr, const rgw_io_id& io_id); + + virtual string get_id(); + void dump(Formatter *f) const; + + RGWIOIDProvider& get_io_id_provider() { + return io_id_provider; + } +}; + +template +RGWAioCompletionNotifier *RGWCoroutinesManager::create_completion_notifier(RGWCoroutinesStack *stack, T value) +{ + rgw_io_id io_id{get_next_io_id(), -1}; + RGWAioCompletionNotifier *cn = new RGWAioCompletionNotifierWith(completion_mgr, io_id, (void *)stack, std::move(value)); + 
completion_mgr->register_completion_notifier(cn); + return cn; +} + +template +RGWAioCompletionNotifier *RGWCoroutinesStack::create_completion_notifier(T value) +{ + return ops_mgr->create_completion_notifier(this, std::move(value)); +} + +class RGWSimpleCoroutine : public RGWCoroutine { + bool called_cleanup; + + int operate() override; + + int state_init(); + int state_send_request(); + int state_request_complete(); + int state_all_complete(); + + void call_cleanup(); + +public: + RGWSimpleCoroutine(CephContext *_cct) : RGWCoroutine(_cct), called_cleanup(false) {} + ~RGWSimpleCoroutine() override; + + virtual int init() { return 0; } + virtual int send_request() = 0; + virtual int request_complete() = 0; + virtual int finish() { return 0; } + virtual void request_cleanup() {} +}; + +#endif diff --git a/src/rgw/rgw_cors.cc b/src/rgw/rgw_cors.cc new file mode 100644 index 00000000..bfe83d64 --- /dev/null +++ b/src/rgw/rgw_cors.cc @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include + +#include +#include + +#include + +#include "include/types.h" +#include "common/debug.h" +#include "include/str_list.h" +#include "common/Formatter.h" + +#include "rgw_cors.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +void RGWCORSRule::dump_origins() { + unsigned num_origins = allowed_origins.size(); + dout(10) << "Allowed origins : " << num_origins << dendl; + for(set::iterator it = allowed_origins.begin(); + it != allowed_origins.end(); + ++it) { + dout(10) << *it << "," << dendl; + } +} + +void RGWCORSRule::erase_origin_if_present(string& origin, bool *rule_empty) { + set::iterator it = allowed_origins.find(origin); + if (!rule_empty) + return; + *rule_empty = false; + if (it != allowed_origins.end()) { + dout(10) << "Found origin " << origin << ", set size:" << + allowed_origins.size() << dendl; + allowed_origins.erase(it); + *rule_empty = (allowed_origins.empty()); + } +} + +/* + * make attrs look-like-this + * does not convert underscores or dashes + * + * Per CORS specification, section 3: + * === + * "Converting a string to ASCII lowercase" means replacing all characters in the + * range U+0041 LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z with + * the corresponding characters in the range U+0061 LATIN SMALL LETTER A to + * U+007A LATIN SMALL LETTER Z). 
+ * === + * + * @todo When UTF-8 is allowed in HTTP headers, this function will need to change + */ +string lowercase_http_attr(const string& orig) +{ + const char *s = orig.c_str(); + char buf[orig.size() + 1]; + buf[orig.size()] = '\0'; + + for (size_t i = 0; i < orig.size(); ++i, ++s) { + buf[i] = tolower(*s); + } + return string(buf); +} + + +static bool is_string_in_set(set& s, string h) { + if ((s.find("*") != s.end()) || + (s.find(h) != s.end())) { + return true; + } + /* The header can be Content-*-type, or Content-* */ + for(set::iterator it = s.begin(); + it != s.end(); ++it) { + size_t off; + if ((off = (*it).find("*"))!=string::npos) { + list ssplit; + unsigned flen = 0; + + get_str_list((*it), "* \t", ssplit); + if (off != 0) { + string sl = ssplit.front(); + flen = sl.length(); + dout(10) << "Finding " << sl << ", in " << h << ", at offset 0" << dendl; + if (!boost::algorithm::starts_with(h,sl)) + continue; + ssplit.pop_front(); + } + if (off != ((*it).length() - 1)) { + string sl = ssplit.front(); + dout(10) << "Finding " << sl << ", in " << h + << ", at offset not less than " << flen << dendl; + if (h.size() < sl.size() || + h.compare((h.size() - sl.size()), sl.size(), sl) != 0) + continue; + ssplit.pop_front(); + } + if (!ssplit.empty()) + continue; + return true; + } + } + return false; +} + +bool RGWCORSRule::has_wildcard_origin() { + if (allowed_origins.find("*") != allowed_origins.end()) + return true; + + return false; +} + +bool RGWCORSRule::is_origin_present(const char *o) { + string origin = o; + return is_string_in_set(allowed_origins, origin); +} + +bool RGWCORSRule::is_header_allowed(const char *h, size_t len) { + string hdr(h, len); + if(lowercase_allowed_hdrs.empty()) { + set::iterator iter; + for (iter = allowed_hdrs.begin(); iter != allowed_hdrs.end(); ++iter) { + lowercase_allowed_hdrs.insert(lowercase_http_attr(*iter)); + } + } + return is_string_in_set(lowercase_allowed_hdrs, lowercase_http_attr(hdr)); +} + +void 
RGWCORSRule::format_exp_headers(string& s) { + s = ""; + for (const auto& header : exposable_hdrs) { + if (s.length() > 0) + s.append(","); + // these values are sent to clients in a 'Access-Control-Expose-Headers' + // response header, so we escape '\n' and '\r' to avoid header injection + std::string tmp = boost::replace_all_copy(header, "\n", "\\n"); + boost::replace_all_copy(std::back_inserter(s), tmp, "\r", "\\r"); + } +} + +RGWCORSRule * RGWCORSConfiguration::host_name_rule(const char *origin) { + for(list::iterator it_r = rules.begin(); + it_r != rules.end(); ++it_r) { + RGWCORSRule& r = (*it_r); + if (r.is_origin_present(origin)) + return &r; + } + return NULL; +} + +void RGWCORSConfiguration::erase_host_name_rule(string& origin) { + bool rule_empty; + unsigned loop = 0; + /*Erase the host name from that rule*/ + dout(10) << "Num of rules : " << rules.size() << dendl; + for(list::iterator it_r = rules.begin(); + it_r != rules.end(); ++it_r, loop++) { + RGWCORSRule& r = (*it_r); + r.erase_origin_if_present(origin, &rule_empty); + dout(10) << "Origin:" << origin << ", rule num:" + << loop << ", emptying now:" << rule_empty << dendl; + if (rule_empty) { + rules.erase(it_r); + break; + } + } +} + +void RGWCORSConfiguration::dump() { + unsigned loop = 1; + unsigned num_rules = rules.size(); + dout(10) << "Number of rules: " << num_rules << dendl; + for(list::iterator it = rules.begin(); + it!= rules.end(); ++it, loop++) { + dout(10) << " <<<<<<< Rule " << loop << " >>>>>>> " << dendl; + (*it).dump_origins(); + } +} diff --git a/src/rgw/rgw_cors.h b/src/rgw/rgw_cors.h new file mode 100644 index 00000000..62e34d45 --- /dev/null +++ b/src/rgw/rgw_cors.h @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser 
General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RGW_CORS_H +#define CEPH_RGW_CORS_H + +#include +#include +#include + +#define RGW_CORS_GET 0x1 +#define RGW_CORS_PUT 0x2 +#define RGW_CORS_HEAD 0x4 +#define RGW_CORS_POST 0x8 +#define RGW_CORS_DELETE 0x10 +#define RGW_CORS_COPY 0x20 +#define RGW_CORS_ALL (RGW_CORS_GET | \ + RGW_CORS_PUT | \ + RGW_CORS_HEAD | \ + RGW_CORS_POST | \ + RGW_CORS_DELETE | \ + RGW_CORS_COPY) + +#define CORS_MAX_AGE_INVALID ((uint32_t)-1) + +class RGWCORSRule +{ +protected: + uint32_t max_age; + uint8_t allowed_methods; + std::string id; + std::set allowed_hdrs; /* If you change this, you need to discard lowercase_allowed_hdrs */ + std::set lowercase_allowed_hdrs; /* Not built until needed in RGWCORSRule::is_header_allowed */ + std::set allowed_origins; + std::list exposable_hdrs; + +public: + RGWCORSRule() : max_age(CORS_MAX_AGE_INVALID),allowed_methods(0) {} + RGWCORSRule(std::set& o, std::set& h, + std::list& e, uint8_t f, uint32_t a) + :max_age(a), + allowed_methods(f), + allowed_hdrs(h), + allowed_origins(o), + exposable_hdrs(e) {} + virtual ~RGWCORSRule() {} + + std::string& get_id() { return id; } + uint32_t get_max_age() { return max_age; } + uint8_t get_allowed_methods() { return allowed_methods; } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(max_age, bl); + encode(allowed_methods, bl); + encode(id, bl); + encode(allowed_hdrs, bl); + encode(allowed_origins, bl); + encode(exposable_hdrs, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(max_age, bl); + decode(allowed_methods, bl); + decode(id, bl); + decode(allowed_hdrs, bl); + decode(allowed_origins, bl); + decode(exposable_hdrs, bl); + DECODE_FINISH(bl); + } + bool has_wildcard_origin(); + bool is_origin_present(const char *o); + void format_exp_headers(std::string& s); + void 
erase_origin_if_present(std::string& origin, bool *rule_empty); + void dump_origins(); + void dump(Formatter *f) const; + bool is_header_allowed(const char *hdr, size_t len); +}; +WRITE_CLASS_ENCODER(RGWCORSRule) + +class RGWCORSConfiguration +{ + protected: + std::list rules; + public: + RGWCORSConfiguration() {} + ~RGWCORSConfiguration() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rules, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(rules, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + std::list& get_rules() { + return rules; + } + bool is_empty() { + return rules.empty(); + } + void get_origins_list(const char *origin, std::list& origins); + RGWCORSRule * host_name_rule(const char *origin); + void erase_host_name_rule(std::string& origin); + void dump(); + void stack_rule(RGWCORSRule& r) { + rules.push_front(r); + } +}; +WRITE_CLASS_ENCODER(RGWCORSConfiguration) + +static inline int validate_name_string(string& o) { + if (o.length() == 0) + return -1; + if (o.find_first_of("*") != o.find_last_of("*")) + return -1; + return 0; +} +#endif /*CEPH_RGW_CORS_H*/ diff --git a/src/rgw/rgw_cors_s3.cc b/src/rgw/rgw_cors_s3.cc new file mode 100644 index 00000000..fe7bd438 --- /dev/null +++ b/src/rgw/rgw_cors_s3.cc @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_cors_s3.h" +#include "rgw_user.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + + +void RGWCORSRule_S3::to_xml(XMLFormatter& f) { + + f.open_object_section("CORSRule"); + /*ID if present*/ + if (id.length() > 0) { + f.dump_string("ID", id); + } + /*AllowedMethods*/ + if (allowed_methods & RGW_CORS_GET) + f.dump_string("AllowedMethod", "GET"); + if (allowed_methods & RGW_CORS_PUT) + f.dump_string("AllowedMethod", "PUT"); + if (allowed_methods & RGW_CORS_DELETE) + f.dump_string("AllowedMethod", "DELETE"); + if (allowed_methods & RGW_CORS_HEAD) + f.dump_string("AllowedMethod", "HEAD"); + if (allowed_methods & RGW_CORS_POST) + f.dump_string("AllowedMethod", "POST"); + if (allowed_methods & RGW_CORS_COPY) + f.dump_string("AllowedMethod", "COPY"); + /*AllowedOrigins*/ + for(set::iterator it = allowed_origins.begin(); + it != allowed_origins.end(); + ++it) { + string host = *it; + f.dump_string("AllowedOrigin", host); + } + /*AllowedHeader*/ + for(set::iterator it = allowed_hdrs.begin(); + it != allowed_hdrs.end(); ++it) { + f.dump_string("AllowedHeader", *it); + } + /*MaxAgeSeconds*/ + if (max_age != CORS_MAX_AGE_INVALID) { + f.dump_unsigned("MaxAgeSeconds", max_age); + } + /*ExposeHeader*/ + for(list::iterator it = exposable_hdrs.begin(); + it != exposable_hdrs.end(); ++it) { + f.dump_string("ExposeHeader", *it); + } + f.close_section(); +} + +bool RGWCORSRule_S3::xml_end(const char *el) { + XMLObjIter iter = find("AllowedMethod"); + XMLObj *obj; + /*Check all the allowedmethods*/ + obj = iter.get_next(); + if (obj) { + for( ; obj; obj = iter.get_next()) { + const char *s = obj->get_data().c_str(); + dout(10) << "RGWCORSRule::xml_end, el : " << el << ", data : " << s << dendl; + if (strcasecmp(s, "GET") == 0) { + allowed_methods |= RGW_CORS_GET; + } else if (strcasecmp(s, "POST") == 0) { + allowed_methods |= RGW_CORS_POST; + } else if 
(strcasecmp(s, "DELETE") == 0) { + allowed_methods |= RGW_CORS_DELETE; + } else if (strcasecmp(s, "HEAD") == 0) { + allowed_methods |= RGW_CORS_HEAD; + } else if (strcasecmp(s, "PUT") == 0) { + allowed_methods |= RGW_CORS_PUT; + } else if (strcasecmp(s, "COPY") == 0) { + allowed_methods |= RGW_CORS_COPY; + } else { + return false; + } + } + } + /*Check the id's len, it should be less than 255*/ + XMLObj *xml_id = find_first("ID"); + if (xml_id != NULL) { + string data = xml_id->get_data(); + if (data.length() > 255) { + dout(0) << "RGWCORSRule has id of length greater than 255" << dendl; + return false; + } + dout(10) << "RGWCORRule id : " << data << dendl; + id = data; + } + /*Check if there is atleast one AllowedOrigin*/ + iter = find("AllowedOrigin"); + if (!(obj = iter.get_next())) { + dout(0) << "RGWCORSRule does not have even one AllowedOrigin" << dendl; + return false; + } + for( ; obj; obj = iter.get_next()) { + dout(10) << "RGWCORSRule - origin : " << obj->get_data() << dendl; + /*Just take the hostname*/ + string host = obj->get_data(); + if (validate_name_string(host) != 0) + return false; + allowed_origins.insert(allowed_origins.end(), host); + } + /*Check of max_age*/ + iter = find("MaxAgeSeconds"); + if ((obj = iter.get_next())) { + char *end = NULL; + + unsigned long long ull = strtoull(obj->get_data().c_str(), &end, 10); + if (*end != '\0') { + dout(0) << "RGWCORSRule's MaxAgeSeconds " << obj->get_data() << " is an invalid integer" << dendl; + return false; + } + if (ull >= 0x100000000ull) { + max_age = CORS_MAX_AGE_INVALID; + } else { + max_age = (uint32_t)ull; + } + dout(10) << "RGWCORSRule : max_age : " << max_age << dendl; + } + /*Check and update ExposeHeader*/ + iter = find("ExposeHeader"); + if ((obj = iter.get_next())) { + for(; obj; obj = iter.get_next()) { + dout(10) << "RGWCORSRule - exp_hdr : " << obj->get_data() << dendl; + exposable_hdrs.push_back(obj->get_data()); + } + } + /*Check and update AllowedHeader*/ + iter = 
find("AllowedHeader"); + if ((obj = iter.get_next())) { + for(; obj; obj = iter.get_next()) { + dout(10) << "RGWCORSRule - allowed_hdr : " << obj->get_data() << dendl; + string s = obj->get_data(); + if (validate_name_string(s) != 0) + return false; + allowed_hdrs.insert(allowed_hdrs.end(), s); + } + } + return true; +} + +void RGWCORSConfiguration_S3::to_xml(ostream& out) { + XMLFormatter f; + f.open_object_section_in_ns("CORSConfiguration", XMLNS_AWS_S3); + for(list::iterator it = rules.begin(); + it != rules.end(); ++it) { + (static_cast(*it)).to_xml(f); + } + f.close_section(); + f.flush(out); +} + +bool RGWCORSConfiguration_S3::xml_end(const char *el) { + XMLObjIter iter = find("CORSRule"); + RGWCORSRule_S3 *obj; + if (!(obj = static_cast(iter.get_next()))) { + dout(0) << "CORSConfiguration should have atleast one CORSRule" << dendl; + return false; + } + for(; obj; obj = static_cast(iter.get_next())) { + rules.push_back(*obj); + } + return true; +} + +class CORSRuleID_S3 : public XMLObj { + public: + CORSRuleID_S3() {} + ~CORSRuleID_S3() override {} +}; + +class CORSRuleAllowedOrigin_S3 : public XMLObj { + public: + CORSRuleAllowedOrigin_S3() {} + ~CORSRuleAllowedOrigin_S3() override {} +}; + +class CORSRuleAllowedMethod_S3 : public XMLObj { + public: + CORSRuleAllowedMethod_S3() {} + ~CORSRuleAllowedMethod_S3() override {} +}; + +class CORSRuleAllowedHeader_S3 : public XMLObj { + public: + CORSRuleAllowedHeader_S3() {} + ~CORSRuleAllowedHeader_S3() override {} +}; + +class CORSRuleMaxAgeSeconds_S3 : public XMLObj { + public: + CORSRuleMaxAgeSeconds_S3() {} + ~CORSRuleMaxAgeSeconds_S3() override {} +}; + +class CORSRuleExposeHeader_S3 : public XMLObj { + public: + CORSRuleExposeHeader_S3() {} + ~CORSRuleExposeHeader_S3() override {} +}; + +XMLObj *RGWCORSXMLParser_S3::alloc_obj(const char *el) { + if (strcmp(el, "CORSConfiguration") == 0) { + return new RGWCORSConfiguration_S3; + } else if (strcmp(el, "CORSRule") == 0) { + return new RGWCORSRule_S3; + } else 
if (strcmp(el, "ID") == 0) { + return new CORSRuleID_S3; + } else if (strcmp(el, "AllowedOrigin") == 0) { + return new CORSRuleAllowedOrigin_S3; + } else if (strcmp(el, "AllowedMethod") == 0) { + return new CORSRuleAllowedMethod_S3; + } else if (strcmp(el, "AllowedHeader") == 0) { + return new CORSRuleAllowedHeader_S3; + } else if (strcmp(el, "MaxAgeSeconds") == 0) { + return new CORSRuleMaxAgeSeconds_S3; + } else if (strcmp(el, "ExposeHeader") == 0) { + return new CORSRuleExposeHeader_S3; + } + return NULL; +} + diff --git a/src/rgw/rgw_cors_s3.h b/src/rgw/rgw_cors_s3.h new file mode 100644 index 00000000..9097e5f1 --- /dev/null +++ b/src/rgw/rgw_cors_s3.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_RGW_CORS_S3_H +#define CEPH_RGW_CORS_S3_H + +#include +#include +#include + +#include +#include +#include "rgw_xml.h" +#include "rgw_cors.h" + +class RGWCORSRule_S3 : public RGWCORSRule, public XMLObj +{ + public: + RGWCORSRule_S3() {} + ~RGWCORSRule_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(XMLFormatter& f); +}; + +class RGWCORSConfiguration_S3 : public RGWCORSConfiguration, public XMLObj +{ + public: + RGWCORSConfiguration_S3() {} + ~RGWCORSConfiguration_S3() override {} + + bool xml_end(const char *el) override; + void to_xml(ostream& out); +}; + +class RGWCORSXMLParser_S3 : public RGWXMLParser +{ + CephContext *cct; + + XMLObj *alloc_obj(const char *el) override; +public: + explicit RGWCORSXMLParser_S3(CephContext *_cct) : cct(_cct) {} +}; +#endif /*CEPH_RGW_CORS_S3_H*/ diff --git a/src/rgw/rgw_cors_swift.h b/src/rgw/rgw_cors_swift.h new file mode 100644 index 00000000..da5a2afc --- /dev/null +++ b/src/rgw/rgw_cors_swift.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_RGW_CORS_SWIFT3_H +#define CEPH_RGW_CORS_SWIFT3_H + +#include +#include +#include +#include +#include + +#include "rgw_cors.h" + +class RGWCORSConfiguration_SWIFT : public RGWCORSConfiguration +{ + public: + RGWCORSConfiguration_SWIFT() {} + ~RGWCORSConfiguration_SWIFT() {} + int create_update(const char *allow_origins, const char *allow_headers, + const char *expose_headers, const char *max_age) { + set o, h, oc; + list e; + unsigned long a = CORS_MAX_AGE_INVALID; + uint8_t flags = RGW_CORS_ALL; + + string ao = allow_origins; + get_str_set(ao, oc); + if (oc.empty()) + return -EINVAL; + for(set::iterator it = oc.begin(); it != oc.end(); ++it) { + string host = *it; + if (validate_name_string(host) != 0) + return -EINVAL; + o.insert(o.end(), host); + } + if (allow_headers) { + string ah = allow_headers; + get_str_set(ah, h); + for(set::iterator it = h.begin(); + it != h.end(); ++it) { + string s = (*it); + if (validate_name_string(s) != 0) + return -EINVAL; + } + } + + if (expose_headers) { + string eh = expose_headers; + get_str_list(eh, e); + } + if (max_age) { + char *end = NULL; + a = strtoul(max_age, &end, 10); + if (a == ULONG_MAX) + a = CORS_MAX_AGE_INVALID; + } + + RGWCORSRule rule(o, h, e, flags, a); + stack_rule(rule); + return 0; + } +}; +#endif /*CEPH_RGW_CORS_SWIFT3_H*/ diff --git a/src/rgw/rgw_cr_rados.cc b/src/rgw/rgw_cr_rados.cc new file mode 100644 index 00000000..66d05e08 --- /dev/null +++ b/src/rgw/rgw_cr_rados.cc @@ -0,0 +1,916 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_sync_counters.h" + +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_sys_obj.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rgw/cls_rgw_client.h" + +#include + +#define dout_context g_ceph_context 
+#define dout_subsys ceph_subsys_rgw
+
+/* Append a request to the processor queue, taking a reference on it.
+ * Refused (false) once the processor is going down. */
+bool RGWAsyncRadosProcessor::RGWWQ::_enqueue(RGWAsyncRadosRequest *req) {
+  if (processor->is_going_down()) {
+    return false;
+  }
+  req->get();
+  processor->m_req_queue.push_back(req);
+  dout(20) << "enqueued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  return true;
+}
+
+bool RGWAsyncRadosProcessor::RGWWQ::_empty() {
+  return processor->m_req_queue.empty();
+}
+
+/* Pop the oldest queued request, or NULL when nothing is queued. */
+RGWAsyncRadosRequest *RGWAsyncRadosProcessor::RGWWQ::_dequeue() {
+  auto& pending = processor->m_req_queue;
+  if (pending.empty())
+    return NULL;
+  RGWAsyncRadosRequest *next = pending.front();
+  pending.pop_front();
+  dout(20) << "dequeued request req=" << hex << next << dec << dendl;
+  _dump_queue();
+  return next;
+}
+
+/* Run one request, then give its slot back to the throttle. */
+void RGWAsyncRadosProcessor::RGWWQ::_process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) {
+  processor->handle_request(req);
+  processor->req_throttle.put(1);
+}
+
+/* Log every queued request pointer at level 20 (no-op when that level
+ * is not gathered). */
+void RGWAsyncRadosProcessor::RGWWQ::_dump_queue() {
+  if (!g_conf()->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    return;
+  }
+  const auto& pending = processor->m_req_queue;
+  if (pending.empty()) {
+    dout(20) << "RGWWQ: empty" << dendl;
+    return;
+  }
+  dout(20) << "RGWWQ:" << dendl;
+  for (const auto& queued : pending) {
+    dout(20) << "req: " << hex << queued << dec << dendl;
+  }
+}
+
+/* The throttle admits twice as many requests as there are threads. */
+RGWAsyncRadosProcessor::RGWAsyncRadosProcessor(RGWRados *_store, int num_threads)
+  : store(_store), m_tp(store->ctx(), "RGWAsyncRadosProcessor::m_tp", "rados_async", num_threads),
+    req_throttle(store->ctx(), "rgw_async_rados_ops", num_threads * 2),
+    req_wq(this, g_conf()->rgw_op_thread_timeout,
+           g_conf()->rgw_op_thread_suicide_timeout, &m_tp) {
+}
+
+void RGWAsyncRadosProcessor::start() {
+  m_tp.start();
+}
+
+/* Flag shutdown, drain and stop the pool, then drop the reference that
+ * _enqueue() took on every request still sitting in the queue. */
+void RGWAsyncRadosProcessor::stop() {
+  going_down = true;
+  m_tp.drain(&req_wq);
+  m_tp.stop();
+  for (auto *leftover : m_req_queue) {
+    leftover->put();
+  }
+}
+
+/* Execute the request and release the queue's reference. */
+void RGWAsyncRadosProcessor::handle_request(RGWAsyncRadosRequest *req) {
+  req->send_request();
+  req->put();
+}
+
+/* Take one throttle slot, then hand the request to the work queue. */
+void RGWAsyncRadosProcessor::queue(RGWAsyncRadosRequest *req) {
+  req_throttle.get(1);
+  req_wq.queue(req);
+}
+
+/* Read the object into bl; attributes are fetched only if want_attrs. */
+int RGWAsyncGetSystemObj::_send_request()
+{
+  map<string, bufferlist> *pattrs = nullptr;
+  if (want_attrs) {
+    pattrs = &attrs;
+  }
+
+  auto sysobj = obj_ctx.get_obj(obj);
+  return sysobj.rop()
+               .set_objv_tracker(&objv_tracker)
+               .set_attrs(pattrs)
+               .set_raw_attrs(raw_attrs)
+               .read(&bl);
+}
+
+/* The version tracker, when given, is copied rather than referenced. */
+RGWAsyncGetSystemObj::RGWAsyncGetSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       bool want_attrs, bool raw_attrs)
+  : RGWAsyncRadosRequest(caller, cn), obj_ctx(_svc),
+    obj(_obj), want_attrs(want_attrs), raw_attrs(raw_attrs)
+{
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+/* Queue an attribute-fetching read (want_attrs = true, no tracker). */
+int RGWSimpleRadosReadAttrsCR::send_request()
+{
+  auto *notifier = stack->create_completion_notifier();
+  req = new RGWAsyncGetSystemObj(this, notifier,
+                                 svc, nullptr, obj, true, raw_attrs);
+  async_rados->queue(req);
+  return 0;
+}
+
+/* Move the fetched attributes to the caller, if it asked for them. */
+int RGWSimpleRadosReadAttrsCR::request_complete()
+{
+  if (pattrs) {
+    *pattrs = std::move(req->attrs);
+  }
+  return req->get_ret_status();
+}
+
+/* Write bl as the object's data. */
+int RGWAsyncPutSystemObj::_send_request()
+{
+  auto obj_ctx = svc->init_obj_ctx();
+  auto sysobj = obj_ctx.get_obj(obj);
+  return sysobj.wop()
+               .set_objv_tracker(&objv_tracker)
+               .set_exclusive(exclusive)
+               .write_data(bl);
+}
+
+RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                     RGWSI_SysObj *_svc,
+                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                     bool _exclusive, bufferlist _bl)
+  : RGWAsyncRadosRequest(caller, cn), svc(_svc),
+    obj(_obj), exclusive(_exclusive), bl(std::move(_bl))
+{
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+/* Write the attribute map; always a non-exclusive write. */
+int RGWAsyncPutSystemObjAttrs::_send_request()
+{
+  auto obj_ctx = svc->init_obj_ctx();
+  auto sysobj = obj_ctx.get_obj(obj);
+  return sysobj.wop()
+               .set_objv_tracker(&objv_tracker)
+               .set_exclusive(false)
+               .set_attrs(attrs)
+               .write_attrs();
+}
+
+RGWAsyncPutSystemObjAttrs::RGWAsyncPutSystemObjAttrs(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                     RGWSI_SysObj *_svc,
+                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                     map<string, bufferlist> _attrs)
+  : RGWAsyncRadosRequest(caller, cn), svc(_svc),
+    obj(_obj), attrs(std::move(_attrs))
+{
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+
+RGWOmapAppend::RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, const rgw_raw_obj& _obj,
+                             uint64_t _window_size)
+                      : RGWConsumerCR<string>(_store->ctx()), async_rados(_async_rados),
+                        store(_store), obj(_obj), going_down(false), num_pending_entries(0), window_size(_window_size), total_entries(0)
+{
+}
+
+/* Take a renewable exclusive cls lock on the object for duration_secs. */
+int RGWAsyncLockSystemObj::_send_request()
+{
+  rgw_rados_ref ref;
+  int r = store->get_raw_obj_ref(obj, &ref);
+  if (r < 0) {
+    lderr(store->ctx()) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  rados::cls::lock::Lock excl_lock(lock_name);
+  excl_lock.set_duration(utime_t(duration_secs, 0));
+  excl_lock.set_cookie(cookie);
+  excl_lock.set_may_renew(true);
+
+  return excl_lock.lock_exclusive(&ref.ioctx, ref.obj.oid);
+}
+
+/* _objv_tracker is accepted but not stored. */
+RGWAsyncLockSystemObj::RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
+                      RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       const string& _name, const string& _cookie, uint32_t _duration_secs) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                                              obj(_obj),
+                                                              lock_name(_name),
+                                                              cookie(_cookie),
+                                                              duration_secs(_duration_secs)
+{
+}
+
+/* Release the cls lock identified by lock_name and cookie. */
+int RGWAsyncUnlockSystemObj::_send_request()
+{
+  rgw_rados_ref ref;
+  int r = store->get_raw_obj_ref(obj, &ref);
+  if (r < 0) {
+    lderr(store->ctx()) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  rados::cls::lock::Lock held_lock(lock_name);
+
+  held_lock.set_cookie(cookie);
+
+  return held_lock.unlock(&ref.ioctx, ref.obj.oid);
+}
+
+/* _objv_tracker is accepted but not stored. */
+RGWAsyncUnlockSystemObj::RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
+                                                 RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                                 const string& _name, const string& _cookie) : RGWAsyncRadosRequest(caller, cn), store(_store),
+  obj(_obj),
+  lock_name(_name), cookie(_cookie)
+{
+}
+
+/*
+ * Coroutine that sets omap keys on a raw object.  _entries is copied.
+ * The description reads "set omap keys dest=<obj> keys=[k1, k2, ...]".
+ */
+RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(RGWRados *_store,
+                                             const rgw_raw_obj& _obj,
+                                             map<string, bufferlist>& _entries) : RGWSimpleCoroutine(_store->ctx()),
+                                                store(_store),
+                                                entries(_entries),
+                                                obj(_obj), cn(NULL)
+{
+  stringstream& s = set_description();
+  /* Open the bracket only; the previous code also streamed s.str() and a
+   * "]" here, which duplicated the prefix and closed the list before any
+   * key had been written. */
+  s << "set omap keys dest=" << obj << " keys=[";
+  for (auto i = entries.begin(); i != entries.end(); ++i) {
+    if (i != entries.begin()) {
+      s << ", ";
+    }
+    s << i->first;
+  }
+  s << "]";
+}
+
+/* Issue the asynchronous omap_set; the result is read in request_complete(). */
+int RGWRadosSetOmapKeysCR::send_request()
+{
+  int r = store->get_raw_obj_ref(obj, &ref);
+  if (r < 0) {
+    lderr(store->ctx()) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "sending request";
+
+  librados::ObjectWriteOperation op;
+  op.omap_set(entries);
+
+  cn = stack->create_completion_notifier();
+  return ref.ioctx.aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+/* Return the completion's result code and record it in the status. */
+int RGWRadosSetOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* Coroutine that lists up to _max_entries omap keys after _marker into
+ * the caller-allocated *_result. */
+RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(RGWRados *_store,
+                      const rgw_raw_obj& _obj,
+                      const string& _marker,
+                      int _max_entries,
+                      ResultPtr _result)
+  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+    marker(_marker), max_entries(_max_entries),
+    result(std::move(_result))
+{
+  ceph_assert(result); // must be allocated
+  set_description() << "get omap keys dest=" << obj << " marker=" << marker;
+}
+
+int RGWRadosGetOmapKeysCR::send_request() {
+  int r = store->get_raw_obj_ref(obj, &result->ref);
+  if (r < 0) {
+    lderr(store->ctx()) << "ERROR: failed to get ref for ("
<< obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "send request"; + + librados::ObjectReadOperation op; + op.omap_get_keys2(marker, max_entries, &result->entries, &result->more, nullptr); + + cn = stack->create_completion_notifier(result); + return result->ref.ioctx.aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL); +} + +int RGWRadosGetOmapKeysCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(RGWRados *_store, + const rgw_raw_obj& _obj, + const set& _keys) : RGWSimpleCoroutine(_store->ctx()), + store(_store), + keys(_keys), + obj(_obj), cn(NULL) +{ + set_description() << "remove omap keys dest=" << obj << " keys=" << keys; +} + +int RGWRadosRemoveOmapKeysCR::send_request() { + int r = store->get_raw_obj_ref(obj, &ref); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "send request"; + + librados::ObjectWriteOperation op; + op.omap_rm_keys(keys); + + cn = stack->create_completion_notifier(); + return ref.ioctx.aio_operate(ref.obj.oid, cn->completion(), &op); +} + +int RGWRadosRemoveOmapKeysCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosRemoveCR::RGWRadosRemoveCR(RGWRados *store, const rgw_raw_obj& obj) + : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj) +{ + set_description() << "remove dest=" << obj; +} + +int RGWRadosRemoveCR::send_request() +{ + auto rados = store->get_rados_handle(); + int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx); + if (r < 0) { + lderr(cct) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << dendl; + return r; + } + ioctx.locator_set_key(obj.loc); + + set_status() << "send request"; + + librados::ObjectWriteOperation op; + 
op.remove(); + + cn = stack->create_completion_notifier(); + return ioctx.aio_operate(obj.oid, cn->completion(), &op); +} + +int RGWRadosRemoveCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_raw_obj& _obj, + const string& _lock_name, + const string& _cookie, + uint32_t _duration) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + lock_name(_lock_name), + cookie(_cookie), + duration(_duration), + obj(_obj), + req(NULL) +{ + set_description() << "rados lock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie << " duration=" << duration; +} + +void RGWSimpleRadosLockCR::request_cleanup() +{ + if (req) { + req->finish(); + req = NULL; + } +} + +int RGWSimpleRadosLockCR::send_request() +{ + set_status() << "sending request"; + req = new RGWAsyncLockSystemObj(this, stack->create_completion_notifier(), + store, NULL, obj, lock_name, cookie, duration); + async_rados->queue(req); + return 0; +} + +int RGWSimpleRadosLockCR::request_complete() +{ + set_status() << "request complete; ret=" << req->get_ret_status(); + return req->get_ret_status(); +} + +RGWSimpleRadosUnlockCR::RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_raw_obj& _obj, + const string& _lock_name, + const string& _cookie) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + lock_name(_lock_name), + cookie(_cookie), + obj(_obj), + req(NULL) +{ + set_description() << "rados unlock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie; +} + +void RGWSimpleRadosUnlockCR::request_cleanup() +{ + if (req) { + req->finish(); + req = NULL; + } +} + +int RGWSimpleRadosUnlockCR::send_request() +{ + set_status() << "sending request"; + + req = new RGWAsyncUnlockSystemObj(this, 
stack->create_completion_notifier(), + store, NULL, obj, lock_name, cookie); + async_rados->queue(req); + return 0; +} + +int RGWSimpleRadosUnlockCR::request_complete() +{ + set_status() << "request complete; ret=" << req->get_ret_status(); + return req->get_ret_status(); +} + +int RGWOmapAppend::operate() { + reenter(this) { + for (;;) { + if (!has_product() && going_down) { + set_status() << "going down"; + break; + } + set_status() << "waiting for product"; + yield wait_for_product(); + yield { + string entry; + while (consume(&entry)) { + set_status() << "adding entry: " << entry; + entries[entry] = bufferlist(); + if (entries.size() >= window_size) { + break; + } + } + if (entries.size() >= window_size || going_down) { + set_status() << "flushing to omap"; + call(new RGWRadosSetOmapKeysCR(store, obj, entries)); + entries.clear(); + } + } + if (get_ret_status() < 0) { + ldout(cct, 0) << "ERROR: failed to store entries in omap" << dendl; + return set_state(RGWCoroutine_Error); + } + } + /* done with coroutine */ + return set_state(RGWCoroutine_Done); + } + return 0; +} + +void RGWOmapAppend::flush_pending() { + receive(pending_entries); + num_pending_entries = 0; +} + +bool RGWOmapAppend::append(const string& s) { + if (is_done()) { + return false; + } + ++total_entries; + pending_entries.push_back(s); + if (++num_pending_entries >= (int)window_size) { + flush_pending(); + } + return true; +} + +bool RGWOmapAppend::finish() { + going_down = true; + flush_pending(); + set_sleeping(false); + return (!is_done()); +} + +int RGWAsyncGetBucketInstanceInfo::_send_request() +{ + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_instance_from_oid(obj_ctx, oid, bucket_info, NULL, NULL); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: failed to get bucket instance info for " + << oid << dendl; + return r; + } + + return 0; +} + +RGWRadosBILogTrimCR::RGWRadosBILogTrimCR(RGWRados *store, + const RGWBucketInfo& bucket_info, + int 
shard_id,
+                                         const std::string& start_marker,
+                                         const std::string& end_marker)
+  : RGWSimpleCoroutine(store->ctx()), bs(store),
+    start_marker(BucketIndexShardsManager::get_shard_marker(start_marker)),
+    end_marker(BucketIndexShardsManager::get_shard_marker(end_marker))
+{
+  bs.init(bucket_info, shard_id);
+}
+
+/* Send an asynchronous RGW_BI_LOG_TRIM class call to the bucket index
+ * shard.  The two markers are moved into the request. */
+int RGWRadosBILogTrimCR::send_request()
+{
+  bufferlist in;
+  cls_rgw_bi_log_trim_op call;
+  call.start_marker = std::move(start_marker);
+  call.end_marker = std::move(end_marker);
+  encode(call, in);
+
+  librados::ObjectWriteOperation op;
+  op.exec(RGW_CLASS, RGW_BI_LOG_TRIM, in);
+
+  cn = stack->create_completion_notifier();
+  return bs.index_ctx.aio_operate(bs.bucket_obj, cn->completion(), &op);
+}
+
+int RGWRadosBILogTrimCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << r;
+  return r;
+}
+
+/*
+ * Fetch `key` from source_zone into the same bucket, stored under
+ * dest_key when one is set.  Updates the sync counters when present:
+ * l_fetch_err on failure, l_fetch with the byte count on a transfer,
+ * l_fetch_not_modified when nothing was transferred.
+ */
+int RGWAsyncFetchRemoteObj::_send_request()
+{
+  RGWObjectCtx obj_ctx(store);
+
+  string user_id;
+  char buf[16];
+  snprintf(buf, sizeof(buf), ".%lld", (long long)store->instance_id());
+  map<string, bufferlist> attrs;
+
+  rgw_obj src_obj(bucket_info.bucket, key);
+
+  rgw_obj dest_obj(bucket_info.bucket, dest_key.value_or(key));
+
+  std::optional<uint64_t> bytes_transferred;
+  int r = store->fetch_remote_obj(obj_ctx,
+                       user_id,
+                       NULL, /* req_info */
+                       source_zone,
+                       dest_obj,
+                       src_obj,
+                       bucket_info, /* dest */
+                       bucket_info, /* source */
+                       dest_placement_rule,
+                       NULL, /* real_time* src_mtime, */
+                       NULL, /* real_time* mtime, */
+                       NULL, /* const real_time* mod_ptr, */
+                       NULL, /* const real_time* unmod_ptr, */
+                       false, /* high precision time */
+                       NULL, /* const char *if_match, */
+                       NULL, /* const char *if_nomatch, */
+                       RGWRados::ATTRSMOD_NONE,
+                       copy_if_newer,
+                       attrs,
+                       RGWObjCategory::Main,
+                       versioned_epoch,
+                       real_time(), /* delete_at */
+                       NULL, /* string *ptag, */
+                       NULL, /* string *petag, */
+                       NULL, /* void (*progress_cb)(off_t, void *), */
+                       NULL, /* void *progress_data*); */
+                       &zones_trace,
+                       &bytes_transferred);
+
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "store->fetch_remote_obj() returned r=" << r << dendl;
+    if (counters) {
+      counters->inc(sync_counters::l_fetch_err, 1);
+    }
+  } else if (counters) {
+    if (bytes_transferred) {
+      counters->inc(sync_counters::l_fetch, *bytes_transferred);
+    } else {
+      counters->inc(sync_counters::l_fetch_not_modified);
+    }
+  }
+  return r;
+}
+
+/*
+ * Stat `key` in source_zone; results are written through pmtime, psize,
+ * pattrs, pheaders and petag.  Returns the stat_remote_obj() result.
+ */
+int RGWAsyncStatRemoteObj::_send_request()
+{
+  RGWObjectCtx obj_ctx(store);
+
+  string user_id;
+  char buf[16];
+  snprintf(buf, sizeof(buf), ".%lld", (long long)store->instance_id());
+
+  rgw_obj src_obj(bucket_info.bucket, key);
+
+  int r = store->stat_remote_obj(obj_ctx,
+                       user_id,
+                       nullptr, /* req_info */
+                       source_zone,
+                       src_obj,
+                       bucket_info, /* source */
+                       pmtime, /* real_time* src_mtime, */
+                       psize, /* uint64_t * */
+                       nullptr, /* const real_time* mod_ptr, */
+                       nullptr, /* const real_time* unmod_ptr, */
+                       true, /* high precision time */
+                       nullptr, /* const char *if_match, */
+                       nullptr, /* const char *if_nomatch, */
+                       pattrs,
+                       pheaders,
+                       nullptr,
+                       nullptr, /* string *ptag, */
+                       petag); /* string *petag, */
+
+  if (r < 0) {
+    /* name the call that actually failed (the log used to say fetch_remote_obj) */
+    ldout(store->ctx(), 0) << "store->stat_remote_obj() returned r=" << r << dendl;
+  }
+  return r;
+}
+
+
+int RGWAsyncRemoveObj::_send_request()
+{
+  RGWObjectCtx obj_ctx(store);
+
+  rgw_obj obj(bucket_info.bucket, key);
+
+  ldout(store->ctx(), 0) << __func__ << "(): deleting obj=" << obj << dendl;
+
+  obj_ctx.set_atomic(obj);
+
+  RGWObjState *state;
+
+  int ret = store->get_obj_state(&obj_ctx, bucket_info, obj, &state);
+  if (ret < 0) {
+    ldout(store->ctx(), 20) << __func__ << "(): get_obj_state() obj=" << obj << " returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  /* has there been any racing object write?
*/ + if (del_if_older && (state->mtime > timestamp)) { + ldout(store->ctx(), 20) << __func__ << "(): skipping object removal obj=" << obj << " (obj mtime=" << state->mtime << ", request timestamp=" << timestamp << ")" << dendl; + return 0; + } + + RGWAccessControlPolicy policy; + + /* decode policy */ + map::iterator iter = state->attrset.find(RGW_ATTR_ACL); + if (iter != state->attrset.end()) { + auto bliter = iter->second.cbegin(); + try { + policy.decode(bliter); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + } + + RGWRados::Object del_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = bucket_info.owner; + del_op.params.obj_owner = policy.get_owner(); + if (del_if_older) { + del_op.params.unmod_since = timestamp; + } + if (versioned) { + del_op.params.versioning_status = BUCKET_VERSIONED; + } + del_op.params.olh_epoch = versioned_epoch; + del_op.params.marker_version_id = marker_version_id; + del_op.params.obj_owner.set_id(owner); + del_op.params.obj_owner.set_name(owner_display_name); + del_op.params.mtime = timestamp; + del_op.params.high_precision_time = true; + del_op.params.zones_trace = &zones_trace; + + ret = del_op.delete_obj(); + if (ret < 0) { + ldout(store->ctx(), 20) << __func__ << "(): delete_obj() obj=" << obj << " returned ret=" << ret << dendl; + } + return ret; +} + +int RGWContinuousLeaseCR::operate() +{ + if (aborted) { + caller->set_sleeping(false); + return set_cr_done(); + } + reenter(this) { + while (!going_down) { + yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval)); + + caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */ + if (retcode < 0) { + set_locked(false); + ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << 
dendl; + return set_state(RGWCoroutine_Error, retcode); + } + set_locked(true); + yield wait(utime_t(interval / 2, 0)); + } + set_locked(false); /* moot at this point anyway */ + yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie)); + return set_state(RGWCoroutine_Done); + } + return 0; +} + +RGWRadosTimelogAddCR::RGWRadosTimelogAddCR(RGWRados *_store, const string& _oid, + const cls_log_entry& entry) : RGWSimpleCoroutine(_store->ctx()), + store(_store), + oid(_oid), cn(NULL) +{ + stringstream& s = set_description(); + s << "timelog add entry oid=" << oid << "entry={id=" << entry.id << ", section=" << entry.section << ", name=" << entry.name << "}"; + entries.push_back(entry); +} + +int RGWRadosTimelogAddCR::send_request() +{ + set_status() << "sending request"; + + cn = stack->create_completion_notifier(); + return store->time_log_add(oid, entries, cn->completion(), true); +} + +int RGWRadosTimelogAddCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} + +RGWRadosTimelogTrimCR::RGWRadosTimelogTrimCR(RGWRados *store, + const std::string& oid, + const real_time& start_time, + const real_time& end_time, + const std::string& from_marker, + const std::string& to_marker) + : RGWSimpleCoroutine(store->ctx()), store(store), oid(oid), + start_time(start_time), end_time(end_time), + from_marker(from_marker), to_marker(to_marker) +{ + set_description() << "timelog trim oid=" << oid + << " start_time=" << start_time << " end_time=" << end_time + << " from_marker=" << from_marker << " to_marker=" << to_marker; +} + +int RGWRadosTimelogTrimCR::send_request() +{ + set_status() << "sending request"; + + cn = stack->create_completion_notifier(); + return store->time_log_trim(oid, start_time, end_time, from_marker, + to_marker, cn->completion()); +} + +int RGWRadosTimelogTrimCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() 
<< "request complete; ret=" << r; + + return r; +} + + +RGWSyncLogTrimCR::RGWSyncLogTrimCR(RGWRados *store, const std::string& oid, + const std::string& to_marker, + std::string *last_trim_marker) + : RGWRadosTimelogTrimCR(store, oid, real_time{}, real_time{}, + std::string{}, to_marker), + cct(store->ctx()), last_trim_marker(last_trim_marker) +{ +} + +int RGWSyncLogTrimCR::request_complete() +{ + int r = RGWRadosTimelogTrimCR::request_complete(); + if (r != -ENODATA) { + return r; + } + // nothing left to trim, update last_trim_marker + if (*last_trim_marker < to_marker && to_marker != max_marker) { + *last_trim_marker = to_marker; + } + return 0; +} + + +int RGWAsyncStatObj::_send_request() +{ + rgw_raw_obj raw_obj; + store->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj); + return store->raw_obj_stat(raw_obj, psize, pmtime, pepoch, + nullptr, nullptr, objv_tracker); +} + +RGWStatObjCR::RGWStatObjCR(RGWAsyncRadosProcessor *async_rados, RGWRados *store, + const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize, + real_time* pmtime, uint64_t *pepoch, + RGWObjVersionTracker *objv_tracker) + : RGWSimpleCoroutine(store->ctx()), store(store), async_rados(async_rados), + bucket_info(_bucket_info), obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch), + objv_tracker(objv_tracker) +{ +} + +void RGWStatObjCR::request_cleanup() +{ + if (req) { + req->finish(); + req = NULL; + } +} + +int RGWStatObjCR::send_request() +{ + req = new RGWAsyncStatObj(this, stack->create_completion_notifier(), + store, bucket_info, obj, psize, pmtime, pepoch, objv_tracker); + async_rados->queue(req); + return 0; +} + +int RGWStatObjCR::request_complete() +{ + return req->get_ret_status(); +} + +RGWRadosNotifyCR::RGWRadosNotifyCR(RGWRados *store, const rgw_raw_obj& obj, + bufferlist& request, uint64_t timeout_ms, + bufferlist *response) + : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj), + request(request), timeout_ms(timeout_ms), response(response) +{ + 
set_description() << "notify dest=" << obj; +} + +int RGWRadosNotifyCR::send_request() +{ + int r = store->get_raw_obj_ref(obj, &ref); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl; + return r; + } + + set_status() << "sending request"; + + cn = stack->create_completion_notifier(); + return ref.ioctx.aio_notify(ref.obj.oid, cn->completion(), request, + timeout_ms, response); +} + +int RGWRadosNotifyCR::request_complete() +{ + int r = cn->completion()->get_return_value(); + + set_status() << "request complete; ret=" << r; + + return r; +} diff --git a/src/rgw/rgw_cr_rados.h b/src/rgw/rgw_cr_rados.h new file mode 100644 index 00000000..70b52f35 --- /dev/null +++ b/src/rgw/rgw_cr_rados.h @@ -0,0 +1,1351 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_CR_RADOS_H +#define CEPH_RGW_CR_RADOS_H + +#include +#include "include/ceph_assert.h" +#include "rgw_coroutine.h" +#include "rgw_rados.h" +#include "common/WorkQueue.h" +#include "common/Throttle.h" + +#include + +#include "services/svc_sys_obj.h" + +class RGWAsyncRadosRequest : public RefCountedObject { + RGWCoroutine *caller; + RGWAioCompletionNotifier *notifier; + + int retcode; + + Mutex lock; + +protected: + virtual int _send_request() = 0; +public: + RGWAsyncRadosRequest(RGWCoroutine *_caller, RGWAioCompletionNotifier *_cn) : caller(_caller), notifier(_cn), retcode(0), + lock("RGWAsyncRadosRequest::lock") { + } + ~RGWAsyncRadosRequest() override { + if (notifier) { + notifier->put(); + } + } + + void send_request() { + get(); + retcode = _send_request(); + { + Mutex::Locker l(lock); + if (notifier) { + notifier->cb(); // drops its own ref + notifier = nullptr; + } + } + put(); + } + + int get_ret_status() { return retcode; } + + void finish() { + { + Mutex::Locker l(lock); + if (notifier) { + // we won't call notifier->cb() to drop its ref, so drop it here + notifier->put(); + 
notifier = nullptr; + } + } + put(); + } +}; + + +class RGWAsyncRadosProcessor { + deque m_req_queue; + std::atomic going_down = { false }; +protected: + RGWRados *store; + ThreadPool m_tp; + Throttle req_throttle; + + struct RGWWQ : public ThreadPool::WorkQueue { + RGWAsyncRadosProcessor *processor; + RGWWQ(RGWAsyncRadosProcessor *p, time_t timeout, time_t suicide_timeout, ThreadPool *tp) + : ThreadPool::WorkQueue("RGWWQ", timeout, suicide_timeout, tp), processor(p) {} + + bool _enqueue(RGWAsyncRadosRequest *req) override; + void _dequeue(RGWAsyncRadosRequest *req) override { + ceph_abort(); + } + bool _empty() override; + RGWAsyncRadosRequest *_dequeue() override; + using ThreadPool::WorkQueue::_process; + void _process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) override; + void _dump_queue(); + void _clear() override { + ceph_assert(processor->m_req_queue.empty()); + } + } req_wq; + +public: + RGWAsyncRadosProcessor(RGWRados *_store, int num_threads); + ~RGWAsyncRadosProcessor() {} + void start(); + void stop(); + void handle_request(RGWAsyncRadosRequest *req); + void queue(RGWAsyncRadosRequest *req); + + bool is_going_down() { + return going_down; + } +}; + +template +class RGWSimpleWriteOnlyAsyncCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + + P params; + + class Request : public RGWAsyncRadosRequest { + RGWRados *store; + P params; + protected: + int _send_request() override; + public: + Request(RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, + RGWRados *store, + const P& _params) : RGWAsyncRadosRequest(caller, cn), + store(store), + params(_params) {} + } *req{nullptr}; + + public: + RGWSimpleWriteOnlyAsyncCR(RGWAsyncRadosProcessor *_async_rados, + RGWRados *_store, + const P& _params) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + params(_params) {} + + ~RGWSimpleWriteOnlyAsyncCR() override { + request_cleanup(); + } + void request_cleanup() override { + 
if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new Request(this, + stack->create_completion_notifier(), + store, + params); + + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } +}; + + +template +class RGWSimpleAsyncCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + + P params; + std::shared_ptr result; + + class Request : public RGWAsyncRadosRequest { + RGWRados *store; + P params; + std::shared_ptr result; + protected: + int _send_request() override; + public: + Request(RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, + RGWRados *_store, + const P& _params, + std::shared_ptr& _result) : RGWAsyncRadosRequest(caller, cn), + store(_store), + params(_params), + result(_result) {} + } *req{nullptr}; + + public: + RGWSimpleAsyncCR(RGWAsyncRadosProcessor *_async_rados, + RGWRados *_store, + const P& _params, + std::shared_ptr& _result) : RGWSimpleCoroutine(_store->ctx()), + async_rados(_async_rados), + store(_store), + params(_params), + result(_result) {} + + ~RGWSimpleAsyncCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new Request(this, + stack->create_completion_notifier(), + store, + params, + result); + + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } +}; + + +class RGWAsyncGetSystemObj : public RGWAsyncRadosRequest { + RGWSysObjectCtx obj_ctx; + RGWObjVersionTracker objv_tracker; + rgw_raw_obj obj; + const bool want_attrs; + const bool raw_attrs; +protected: + int _send_request() override; +public: + RGWAsyncGetSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + bool want_attrs, bool raw_attrs); + + bufferlist bl; + map attrs; +}; 
+ +class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest { + RGWSI_SysObj *svc; + rgw_raw_obj obj; + bool exclusive; + bufferlist bl; + +protected: + int _send_request() override; +public: + RGWAsyncPutSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + bool _exclusive, bufferlist _bl); + + RGWObjVersionTracker objv_tracker; +}; + +class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest { + RGWSI_SysObj *svc; + rgw_raw_obj obj; + map attrs; + +protected: + int _send_request() override; +public: + RGWAsyncPutSystemObjAttrs(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + map _attrs); + + RGWObjVersionTracker objv_tracker; +}; + +class RGWAsyncLockSystemObj : public RGWAsyncRadosRequest { + RGWRados *store; + rgw_raw_obj obj; + string lock_name; + string cookie; + uint32_t duration_secs; + +protected: + int _send_request() override; +public: + RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + const string& _name, const string& _cookie, uint32_t _duration_secs); +}; + +class RGWAsyncUnlockSystemObj : public RGWAsyncRadosRequest { + RGWRados *store; + rgw_raw_obj obj; + string lock_name; + string cookie; + +protected: + int _send_request() override; +public: + RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj, + const string& _name, const string& _cookie); +}; + +template +class RGWSimpleRadosReadCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWSI_SysObj *svc; + + rgw_raw_obj obj; + T *result; + /// on ENOENT, call handle_data() with an empty object instead of failing + const bool empty_on_enoent; + RGWObjVersionTracker *objv_tracker; + 
RGWAsyncGetSystemObj *req{nullptr}; + +public: + RGWSimpleRadosReadCR(RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc, + const rgw_raw_obj& _obj, + T *_result, bool empty_on_enoent = true, + RGWObjVersionTracker *objv_tracker = nullptr) + : RGWSimpleCoroutine(_svc->ctx()), async_rados(_async_rados), svc(_svc), + obj(_obj), result(_result), + empty_on_enoent(empty_on_enoent), objv_tracker(objv_tracker) {} + ~RGWSimpleRadosReadCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override; + int request_complete() override; + + virtual int handle_data(T& data) { + return 0; + } +}; + +template +int RGWSimpleRadosReadCR::send_request() +{ + req = new RGWAsyncGetSystemObj(this, stack->create_completion_notifier(), svc, + objv_tracker, obj, false, false); + async_rados->queue(req); + return 0; +} + +template +int RGWSimpleRadosReadCR::request_complete() +{ + int ret = req->get_ret_status(); + retcode = ret; + if (ret == -ENOENT && empty_on_enoent) { + *result = T(); + } else { + if (ret < 0) { + return ret; + } + try { + auto iter = req->bl.cbegin(); + if (iter.end()) { + // allow successful reads with empty buffers. 
ReadSyncStatus coroutines + // depend on this to be able to read without locking, because the + // cls lock from InitSyncStatus will create an empty object if it didn't + // exist + *result = T(); + } else { + decode(*result, iter); + } + } catch (buffer::error& err) { + return -EIO; + } + } + + return handle_data(*result); +} + +class RGWSimpleRadosReadAttrsCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWSI_SysObj *svc; + + rgw_raw_obj obj; + map *pattrs; + bool raw_attrs; + RGWAsyncGetSystemObj *req; + +public: + RGWSimpleRadosReadAttrsCR(RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc, + const rgw_raw_obj& _obj, + map *_pattrs, bool _raw_attrs) : RGWSimpleCoroutine(_svc->ctx()), + async_rados(_async_rados), svc(_svc), + obj(_obj), + pattrs(_pattrs), + raw_attrs(_raw_attrs), + req(NULL) {} + ~RGWSimpleRadosReadAttrsCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override; + int request_complete() override; +}; + +template +class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWSI_SysObj *svc; + bufferlist bl; + rgw_raw_obj obj; + RGWObjVersionTracker *objv_tracker; + RGWAsyncPutSystemObj *req{nullptr}; + +public: + RGWSimpleRadosWriteCR(RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc, + const rgw_raw_obj& _obj, + const T& _data, RGWObjVersionTracker *objv_tracker = nullptr) + : RGWSimpleCoroutine(_svc->ctx()), async_rados(_async_rados), + svc(_svc), obj(_obj), objv_tracker(objv_tracker) { + encode(_data, bl); + } + + ~RGWSimpleRadosWriteCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new RGWAsyncPutSystemObj(this, stack->create_completion_notifier(), + svc, objv_tracker, obj, false, std::move(bl)); + async_rados->queue(req); + return 0; + } + + 
int request_complete() override { + if (objv_tracker) { // copy the updated version + *objv_tracker = req->objv_tracker; + } + return req->get_ret_status(); + } +}; + +class RGWSimpleRadosWriteAttrsCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWSI_SysObj *svc; + RGWObjVersionTracker *objv_tracker; + + rgw_raw_obj obj; + map attrs; + RGWAsyncPutSystemObjAttrs *req = nullptr; + +public: + RGWSimpleRadosWriteAttrsCR(RGWAsyncRadosProcessor *_async_rados, + RGWSI_SysObj *_svc, const rgw_raw_obj& _obj, + map _attrs, + RGWObjVersionTracker *objv_tracker = nullptr) + : RGWSimpleCoroutine(_svc->ctx()), async_rados(_async_rados), + svc(_svc), objv_tracker(objv_tracker), obj(_obj), + attrs(std::move(_attrs)) { + } + ~RGWSimpleRadosWriteAttrsCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new RGWAsyncPutSystemObjAttrs(this, stack->create_completion_notifier(), + svc, objv_tracker, obj, std::move(attrs)); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + if (objv_tracker) { // copy the updated version + *objv_tracker = req->objv_tracker; + } + return req->get_ret_status(); + } +}; + +class RGWRadosSetOmapKeysCR : public RGWSimpleCoroutine { + RGWRados *store; + map entries; + + rgw_rados_ref ref; + + rgw_raw_obj obj; + + boost::intrusive_ptr cn; + +public: + RGWRadosSetOmapKeysCR(RGWRados *_store, + const rgw_raw_obj& _obj, + map& _entries); + + int send_request() override; + int request_complete() override; +}; + +class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine { + public: + struct Result { + rgw_rados_ref ref; + std::set entries; + bool more = false; + }; + using ResultPtr = std::shared_ptr; + + RGWRadosGetOmapKeysCR(RGWRados *_store, const rgw_raw_obj& _obj, + const string& _marker, int _max_entries, + ResultPtr result); + + int send_request() override; + int request_complete() 
override; + + private: + RGWRados *store; + rgw_raw_obj obj; + string marker; + int max_entries; + ResultPtr result; + boost::intrusive_ptr cn; +}; + +class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine { + RGWRados *store; + + rgw_rados_ref ref; + + set keys; + + rgw_raw_obj obj; + + boost::intrusive_ptr cn; + +public: + RGWRadosRemoveOmapKeysCR(RGWRados *_store, + const rgw_raw_obj& _obj, + const set& _keys); + + int send_request() override; + + int request_complete() override; +}; + +class RGWRadosRemoveCR : public RGWSimpleCoroutine { + RGWRados *store; + librados::IoCtx ioctx; + const rgw_raw_obj obj; + boost::intrusive_ptr cn; + +public: + RGWRadosRemoveCR(RGWRados *store, const rgw_raw_obj& obj); + + int send_request() override; + int request_complete() override; +}; + +class RGWSimpleRadosLockCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + string lock_name; + string cookie; + uint32_t duration; + + rgw_raw_obj obj; + + RGWAsyncLockSystemObj *req; + +public: + RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_raw_obj& _obj, + const string& _lock_name, + const string& _cookie, + uint32_t _duration); + ~RGWSimpleRadosLockCR() override { + request_cleanup(); + } + void request_cleanup() override; + + int send_request() override; + int request_complete() override; + + static std::string gen_random_cookie(CephContext* cct) { +#define COOKIE_LEN 16 + char buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1); + return buf; + } +}; + +class RGWSimpleRadosUnlockCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + string lock_name; + string cookie; + + rgw_raw_obj obj; + + RGWAsyncUnlockSystemObj *req; + +public: + RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_raw_obj& _obj, + const string& _lock_name, + const string& _cookie); + ~RGWSimpleRadosUnlockCR() override { + 
request_cleanup(); + } + void request_cleanup() override; + + int send_request() override; + int request_complete() override; +}; + +#define OMAP_APPEND_MAX_ENTRIES_DEFAULT 100 + +class RGWOmapAppend : public RGWConsumerCR { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + + rgw_raw_obj obj; + + bool going_down; + + int num_pending_entries; + list pending_entries; + + map entries; + + uint64_t window_size; + uint64_t total_entries; +public: + RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_raw_obj& _obj, + uint64_t _window_size = OMAP_APPEND_MAX_ENTRIES_DEFAULT); + int operate() override; + void flush_pending(); + bool append(const string& s); + bool finish(); + + uint64_t get_total_entries() { + return total_entries; + } + + const rgw_raw_obj& get_obj() { + return obj; + } +}; + +class RGWAsyncWait : public RGWAsyncRadosRequest { + CephContext *cct; + Mutex *lock; + Cond *cond; + utime_t interval; +protected: + int _send_request() override { + Mutex::Locker l(*lock); + return cond->WaitInterval(*lock, interval); + } +public: + RGWAsyncWait(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, CephContext *_cct, + Mutex *_lock, Cond *_cond, int _secs) : RGWAsyncRadosRequest(caller, cn), + cct(_cct), + lock(_lock), cond(_cond), interval(_secs, 0) {} + + void wakeup() { + Mutex::Locker l(*lock); + cond->Signal(); + } +}; + +class RGWWaitCR : public RGWSimpleCoroutine { + CephContext *cct; + RGWAsyncRadosProcessor *async_rados; + Mutex *lock; + Cond *cond; + int secs; + + RGWAsyncWait *req; + +public: + RGWWaitCR(RGWAsyncRadosProcessor *_async_rados, CephContext *_cct, + Mutex *_lock, Cond *_cond, + int _secs) : RGWSimpleCoroutine(_cct), cct(_cct), + async_rados(_async_rados), lock(_lock), cond(_cond), secs(_secs), req(NULL) { + } + ~RGWWaitCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + wakeup(); + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new 
RGWAsyncWait(this, stack->create_completion_notifier(), cct, lock, cond, secs); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } + + void wakeup() { + req->wakeup(); + } +}; + +class RGWShardedOmapCRManager { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + RGWCoroutine *op; + + int num_shards; + + vector shards; +public: + RGWShardedOmapCRManager(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, RGWCoroutine *_op, int _num_shards, const rgw_pool& pool, const string& oid_prefix) + : async_rados(_async_rados), + store(_store), op(_op), num_shards(_num_shards) { + shards.reserve(num_shards); + for (int i = 0; i < num_shards; ++i) { + char buf[oid_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), i); + RGWOmapAppend *shard = new RGWOmapAppend(async_rados, store, rgw_raw_obj(pool, buf)); + shard->get(); + shards.push_back(shard); + op->spawn(shard, false); + } + } + + ~RGWShardedOmapCRManager() { + for (auto shard : shards) { + shard->put(); + } + } + + bool append(const string& entry, int shard_id) { + return shards[shard_id]->append(entry); + } + bool finish() { + bool success = true; + for (vector::iterator iter = shards.begin(); iter != shards.end(); ++iter) { + success &= ((*iter)->finish() && (!(*iter)->is_error())); + } + return success; + } + + uint64_t get_total_entries(int shard_id) { + return shards[shard_id]->get_total_entries(); + } +}; + +class RGWAsyncGetBucketInstanceInfo : public RGWAsyncRadosRequest { + RGWRados *store; + const std::string oid; + +protected: + int _send_request() override; +public: + RGWAsyncGetBucketInstanceInfo(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, + RGWRados *_store, const std::string& oid) + : RGWAsyncRadosRequest(caller, cn), store(_store), oid(oid) {} + + RGWBucketInfo bucket_info; +}; + +class RGWGetBucketInstanceInfoCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWRados 
*store; + const std::string oid; + RGWBucketInfo *bucket_info; + + RGWAsyncGetBucketInstanceInfo *req{nullptr}; + +public: + // metadata key constructor + RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const std::string& meta_key, RGWBucketInfo *_bucket_info) + : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store), + oid(RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key), + bucket_info(_bucket_info) {} + // rgw_bucket constructor + RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_bucket& bucket, RGWBucketInfo *_bucket_info) + : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store), + oid(RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':')), + bucket_info(_bucket_info) {} + ~RGWGetBucketInstanceInfoCR() override { + request_cleanup(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new RGWAsyncGetBucketInstanceInfo(this, stack->create_completion_notifier(), store, oid); + async_rados->queue(req); + return 0; + } + int request_complete() override { + if (bucket_info) { + *bucket_info = std::move(req->bucket_info); + } + return req->get_ret_status(); + } +}; + +class RGWRadosBILogTrimCR : public RGWSimpleCoroutine { + RGWRados::BucketShard bs; + std::string start_marker; + std::string end_marker; + boost::intrusive_ptr cn; + public: + RGWRadosBILogTrimCR(RGWRados *store, const RGWBucketInfo& bucket_info, + int shard_id, const std::string& start_marker, + const std::string& end_marker); + + int send_request() override; + int request_complete() override; +}; + +class RGWAsyncFetchRemoteObj : public RGWAsyncRadosRequest { + RGWRados *store; + string source_zone; + + RGWBucketInfo bucket_info; + std::optional dest_placement_rule; + + rgw_obj_key key; + std::optional dest_key; + std::optional versioned_epoch; + + real_time src_mtime; + + bool copy_if_newer; + 
rgw_zone_set zones_trace; + PerfCounters* counters; + +protected: + int _send_request() override; +public: + RGWAsyncFetchRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + const string& _source_zone, + RGWBucketInfo& _bucket_info, + std::optional _dest_placement_rule, + const rgw_obj_key& _key, + const std::optional& _dest_key, + std::optional _versioned_epoch, + bool _if_newer, rgw_zone_set *_zones_trace, + PerfCounters* counters) + : RGWAsyncRadosRequest(caller, cn), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + dest_placement_rule(_dest_placement_rule), + key(_key), + dest_key(_dest_key), + versioned_epoch(_versioned_epoch), + copy_if_newer(_if_newer), counters(counters) + { + if (_zones_trace) { + zones_trace = *_zones_trace; + } + } +}; + +class RGWFetchRemoteObjCR : public RGWSimpleCoroutine { + CephContext *cct; + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + string source_zone; + + RGWBucketInfo bucket_info; + std::optional dest_placement_rule; + + rgw_obj_key key; + std::optional dest_key; + std::optional versioned_epoch; + + real_time src_mtime; + + bool copy_if_newer; + + RGWAsyncFetchRemoteObj *req; + rgw_zone_set *zones_trace; + PerfCounters* counters; + +public: + RGWFetchRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const string& _source_zone, + RGWBucketInfo& _bucket_info, + std::optional _dest_placement_rule, + const rgw_obj_key& _key, + const std::optional& _dest_key, + std::optional _versioned_epoch, + bool _if_newer, rgw_zone_set *_zones_trace, + PerfCounters* counters) + : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()), + async_rados(_async_rados), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + dest_placement_rule(_dest_placement_rule), + key(_key), + dest_key(_dest_key), + versioned_epoch(_versioned_epoch), + copy_if_newer(_if_newer), req(NULL), + zones_trace(_zones_trace), counters(counters) {} + + + 
~RGWFetchRemoteObjCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new RGWAsyncFetchRemoteObj(this, stack->create_completion_notifier(), store, + source_zone, bucket_info, dest_placement_rule, + key, dest_key, versioned_epoch, copy_if_newer, + zones_trace, counters); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWAsyncStatRemoteObj : public RGWAsyncRadosRequest { + RGWRados *store; + string source_zone; + + RGWBucketInfo bucket_info; + + rgw_obj_key key; + + ceph::real_time *pmtime; + uint64_t *psize; + string *petag; + map *pattrs; + map *pheaders; + +protected: + int _send_request() override; +public: + RGWAsyncStatRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + const string& _source_zone, + RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, + ceph::real_time *_pmtime, + uint64_t *_psize, + string *_petag, + map *_pattrs, + map *_pheaders) : RGWAsyncRadosRequest(caller, cn), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + key(_key), + pmtime(_pmtime), + psize(_psize), + petag(_petag), + pattrs(_pattrs), + pheaders(_pheaders) {} +}; + +class RGWStatRemoteObjCR : public RGWSimpleCoroutine { + CephContext *cct; + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + string source_zone; + + RGWBucketInfo bucket_info; + + rgw_obj_key key; + + ceph::real_time *pmtime; + uint64_t *psize; + string *petag; + map *pattrs; + map *pheaders; + + RGWAsyncStatRemoteObj *req; + +public: + RGWStatRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const string& _source_zone, + RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, + ceph::real_time *_pmtime, + uint64_t *_psize, + string *_petag, + map *_pattrs, + map *_pheaders) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()), + 
async_rados(_async_rados), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + key(_key), + pmtime(_pmtime), + psize(_psize), + petag(_petag), + pattrs(_pattrs), + pheaders(_pheaders), + req(NULL) {} + + + ~RGWStatRemoteObjCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new RGWAsyncStatRemoteObj(this, stack->create_completion_notifier(), store, source_zone, + bucket_info, key, pmtime, psize, petag, pattrs, pheaders); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWAsyncRemoveObj : public RGWAsyncRadosRequest { + RGWRados *store; + string source_zone; + + RGWBucketInfo bucket_info; + + rgw_obj_key key; + string owner; + string owner_display_name; + bool versioned; + uint64_t versioned_epoch; + string marker_version_id; + + bool del_if_older; + ceph::real_time timestamp; + rgw_zone_set zones_trace; + +protected: + int _send_request() override; +public: + RGWAsyncRemoveObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + const string& _source_zone, + RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, + const string& _owner, + const string& _owner_display_name, + bool _versioned, + uint64_t _versioned_epoch, + bool _delete_marker, + bool _if_older, + real_time& _timestamp, + rgw_zone_set* _zones_trace) : RGWAsyncRadosRequest(caller, cn), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + key(_key), + owner(_owner), + owner_display_name(_owner_display_name), + versioned(_versioned), + versioned_epoch(_versioned_epoch), + del_if_older(_if_older), + timestamp(_timestamp) { + if (_delete_marker) { + marker_version_id = key.instance; + } + + if (_zones_trace) { + zones_trace = *_zones_trace; + } + } +}; + +class RGWRemoveObjCR : public RGWSimpleCoroutine { + CephContext *cct; + 
RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + string source_zone; + + RGWBucketInfo bucket_info; + + rgw_obj_key key; + bool versioned; + uint64_t versioned_epoch; + bool delete_marker; + string owner; + string owner_display_name; + + bool del_if_older; + real_time timestamp; + + RGWAsyncRemoveObj *req; + + rgw_zone_set *zones_trace; + +public: + RGWRemoveObjCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const string& _source_zone, + RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, + bool _versioned, + uint64_t _versioned_epoch, + string *_owner, + string *_owner_display_name, + bool _delete_marker, + real_time *_timestamp, + rgw_zone_set *_zones_trace) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()), + async_rados(_async_rados), store(_store), + source_zone(_source_zone), + bucket_info(_bucket_info), + key(_key), + versioned(_versioned), + versioned_epoch(_versioned_epoch), + delete_marker(_delete_marker), req(NULL), zones_trace(_zones_trace) { + del_if_older = (_timestamp != NULL); + if (_timestamp) { + timestamp = *_timestamp; + } + + if (_owner) { + owner = *_owner; + } + + if (_owner_display_name) { + owner_display_name = *_owner_display_name; + } + } + ~RGWRemoveObjCR() override { + request_cleanup(); + } + + void request_cleanup() override { + if (req) { + req->finish(); + req = NULL; + } + } + + int send_request() override { + req = new RGWAsyncRemoveObj(this, stack->create_completion_notifier(), store, source_zone, bucket_info, + key, owner, owner_display_name, versioned, versioned_epoch, + delete_marker, del_if_older, timestamp, zones_trace); + async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } +}; + +class RGWContinuousLeaseCR : public RGWCoroutine { + RGWAsyncRadosProcessor *async_rados; + RGWRados *store; + + const rgw_raw_obj obj; + + const string lock_name; + const string cookie; + + int interval; + + Mutex lock; + std::atomic going_down = { false 
}; + bool locked{false}; + + RGWCoroutine *caller; + + bool aborted{false}; + +public: + RGWContinuousLeaseCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store, + const rgw_raw_obj& _obj, + const string& _lock_name, int _interval, RGWCoroutine *_caller) + : RGWCoroutine(_store->ctx()), async_rados(_async_rados), store(_store), + obj(_obj), lock_name(_lock_name), + cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)), + interval(_interval), lock("RGWContinuousLeaseCR"), caller(_caller) + {} + + int operate() override; + + bool is_locked() { + Mutex::Locker l(lock); + return locked; + } + + void set_locked(bool status) { + Mutex::Locker l(lock); + locked = status; + } + + void go_down() { + going_down = true; + wakeup(); + } + + void abort() { + aborted = true; + } +}; + +class RGWRadosTimelogAddCR : public RGWSimpleCoroutine { + RGWRados *store; + list entries; + + string oid; + + boost::intrusive_ptr cn; + +public: + RGWRadosTimelogAddCR(RGWRados *_store, const string& _oid, + const cls_log_entry& entry); + + int send_request() override; + int request_complete() override; +}; + +class RGWRadosTimelogTrimCR : public RGWSimpleCoroutine { + RGWRados *store; + boost::intrusive_ptr cn; + protected: + std::string oid; + real_time start_time; + real_time end_time; + std::string from_marker; + std::string to_marker; + + public: + RGWRadosTimelogTrimCR(RGWRados *store, const std::string& oid, + const real_time& start_time, const real_time& end_time, + const std::string& from_marker, + const std::string& to_marker); + + int send_request() override; + int request_complete() override; +}; + +// wrapper to update last_trim_marker on success +class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR { + CephContext *cct; + std::string *last_trim_marker; + public: + // a marker that compares greater than any timestamp-based index + static constexpr const char* max_marker = "99999999"; + + RGWSyncLogTrimCR(RGWRados *store, const std::string& oid, + const std::string& to_marker, 
std::string *last_trim_marker); + int request_complete() override; +}; + +class RGWAsyncStatObj : public RGWAsyncRadosRequest { + RGWRados *store; + RGWBucketInfo bucket_info; + rgw_obj obj; + uint64_t *psize; + real_time *pmtime; + uint64_t *pepoch; + RGWObjVersionTracker *objv_tracker; +protected: + int _send_request() override; +public: + RGWAsyncStatObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *store, + const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr, + real_time *pmtime = nullptr, uint64_t *pepoch = nullptr, + RGWObjVersionTracker *objv_tracker = nullptr) + : RGWAsyncRadosRequest(caller, cn), store(store), obj(obj), psize(psize), + pmtime(pmtime), pepoch(pepoch), objv_tracker(objv_tracker) {} +}; + +class RGWStatObjCR : public RGWSimpleCoroutine { + RGWRados *store; + RGWAsyncRadosProcessor *async_rados; + RGWBucketInfo bucket_info; + rgw_obj obj; + uint64_t *psize; + real_time *pmtime; + uint64_t *pepoch; + RGWObjVersionTracker *objv_tracker; + RGWAsyncStatObj *req = nullptr; + public: + RGWStatObjCR(RGWAsyncRadosProcessor *async_rados, RGWRados *store, + const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr, + real_time* pmtime = nullptr, uint64_t *pepoch = nullptr, + RGWObjVersionTracker *objv_tracker = nullptr); + ~RGWStatObjCR() override { + request_cleanup(); + } + void request_cleanup() override; + + int send_request() override; + int request_complete() override; +}; + +/// coroutine wrapper for IoCtx::aio_notify() +class RGWRadosNotifyCR : public RGWSimpleCoroutine { + RGWRados *const store; + const rgw_raw_obj obj; + bufferlist request; + const uint64_t timeout_ms; + bufferlist *response; + rgw_rados_ref ref; + boost::intrusive_ptr cn; + +public: + RGWRadosNotifyCR(RGWRados *store, const rgw_raw_obj& obj, + bufferlist& request, uint64_t timeout_ms, + bufferlist *response); + + int send_request() override; + int request_complete() override; +}; + +#endif diff --git 
a/src/rgw/rgw_cr_rest.cc b/src/rgw/rgw_cr_rest.cc new file mode 100644 index 00000000..6a5e38a2 --- /dev/null +++ b/src/rgw/rgw_cr_rest.cc @@ -0,0 +1,349 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_cr_rest.h" + +#include "rgw_coroutine.h" + +// re-include our assert to clobber the system one; fix dout: +#include "include/ceph_assert.h" + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +RGWCRHTTPGetDataCB::RGWCRHTTPGetDataCB(RGWCoroutinesEnv *_env, RGWCoroutine *_cr, RGWHTTPStreamRWRequest *_req) : lock("RGWCRHTTPGetDataCB"), env(_env), cr(_cr), req(_req) { + io_id = req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_READ |RGWHTTPClient::HTTPCLIENT_IO_CONTROL); + req->set_in_cb(this); +} + +#define GET_DATA_WINDOW_SIZE 2 * 1024 * 1024 + +int RGWCRHTTPGetDataCB::handle_data(bufferlist& bl, bool *pause) { + if (data.length() < GET_DATA_WINDOW_SIZE / 2) { + notified = false; + } + + { + uint64_t bl_len = bl.length(); + + Mutex::Locker l(lock); + + if (!got_all_extra_data) { + uint64_t max = extra_data_len - extra_data.length(); + if (max > bl_len) { + max = bl_len; + } + bl.splice(0, max, &extra_data); + bl_len -= max; + got_all_extra_data = extra_data.length() == extra_data_len; + } + + data.append(bl); + } + + uint64_t data_len = data.length(); + if (data_len >= GET_DATA_WINDOW_SIZE && !notified) { + notified = true; + env->manager->io_complete(cr, io_id); + } + if (data_len >= 2 * GET_DATA_WINDOW_SIZE) { + *pause = true; + paused = true; + } + return 0; +} + +void RGWCRHTTPGetDataCB::claim_data(bufferlist *dest, uint64_t max) { + bool need_to_unpause = false; + + { + Mutex::Locker l(lock); + + if (data.length() == 0) { + return; + } + + if (data.length() < max) { + max = data.length(); + } + + data.splice(0, max, dest); + need_to_unpause = (paused && data.length() <= GET_DATA_WINDOW_SIZE); + } + + if (need_to_unpause) { + req->unpause_receive(); + } +} + 
+RGWStreamReadHTTPResourceCRF::~RGWStreamReadHTTPResourceCRF() +{ + if (req) { + req->cancel(); + req->wait(); + delete req; + } +} + +int RGWStreamReadHTTPResourceCRF::init() +{ + env->stack->init_new_io(req); + + in_cb.emplace(env, caller, req); + + int r = http_manager->add_request(req); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWStreamWriteHTTPResourceCRF::send() +{ + env->stack->init_new_io(req); + + req->set_write_drain_cb(&write_drain_notify_cb); + + int r = http_manager->add_request(req); + if (r < 0) { + return r; + } + + return 0; +} + +bool RGWStreamReadHTTPResourceCRF::has_attrs() +{ + return got_attrs; +} + +void RGWStreamReadHTTPResourceCRF::get_attrs(std::map *attrs) +{ + req->get_out_headers(attrs); +} + +int RGWStreamReadHTTPResourceCRF::decode_rest_obj(map& headers, bufferlist& extra_data) { + /* basic generic implementation */ + for (auto header : headers) { + const string& val = header.second; + + rest_obj.attrs[header.first] = val; + } + + return 0; +} + +int RGWStreamReadHTTPResourceCRF::read(bufferlist *out, uint64_t max_size, bool *io_pending) +{ + reenter(&read_state) { + io_read_mask = req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_READ | RGWHTTPClient::HTTPCLIENT_IO_CONTROL); + while (!req->is_done() || + in_cb->has_data()) { + *io_pending = true; + if (!in_cb->has_data()) { + yield caller->io_block(0, io_read_mask); + } + got_attrs = true; + if (need_extra_data() && !got_extra_data) { + if (!in_cb->has_all_extra_data()) { + continue; + } + extra_data.claim_append(in_cb->get_extra_data()); + map attrs; + req->get_out_headers(&attrs); + int ret = decode_rest_obj(attrs, extra_data); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << " decode_rest_obj() returned ret=" << ret << dendl; + return ret; + } + got_extra_data = true; + } + *io_pending = false; + in_cb->claim_data(out, max_size); + if (out->length() == 0) { + /* this may happen if we just read the prepended extra_data and didn't have any data + * after. 
In that case, retry reading, so that caller doesn't assume it's EOF. + */ + continue; + } + if (!req->is_done() || out->length() >= max_size) { + yield; + } + } + } + return 0; +} + +bool RGWStreamReadHTTPResourceCRF::is_done() +{ + return req->is_done(); +} + +RGWStreamWriteHTTPResourceCRF::~RGWStreamWriteHTTPResourceCRF() +{ + if (req) { + req->cancel(); + req->wait(); + delete req; + } +} + +void RGWStreamWriteHTTPResourceCRF::send_ready(const rgw_rest_obj& rest_obj) +{ + req->set_send_length(rest_obj.content_len); + for (auto h : rest_obj.attrs) { + req->append_header(h.first, h.second); + } +} + +#define PENDING_WRITES_WINDOW (1 * 1024 * 1024) + +void RGWStreamWriteHTTPResourceCRF::write_drain_notify(uint64_t pending_size) +{ + lock_guard l(blocked_lock); + if (is_blocked && (pending_size < PENDING_WRITES_WINDOW / 2)) { + env->manager->io_complete(caller, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_WRITE | RGWHTTPClient::HTTPCLIENT_IO_CONTROL)); + is_blocked = false; + } +} + +void RGWStreamWriteHTTPResourceCRF::WriteDrainNotify::notify(uint64_t pending_size) +{ + crf->write_drain_notify(pending_size); +} + +int RGWStreamWriteHTTPResourceCRF::write(bufferlist& data, bool *io_pending) +{ + reenter(&write_state) { + while (!req->is_done()) { + *io_pending = false; + if (req->get_pending_send_size() >= PENDING_WRITES_WINDOW) { + *io_pending = true; + { + lock_guard l(blocked_lock); + is_blocked = true; + + /* it's ok to unlock here, even if io_complete() arrives before io_block(), it'll wakeup + * correctly */ + } + yield caller->io_block(0, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_WRITE | RGWHTTPClient::HTTPCLIENT_IO_CONTROL)); + } + yield req->add_send_data(data); + } + return req->get_status(); + } + return 0; +} + +int RGWStreamWriteHTTPResourceCRF::drain_writes(bool *need_retry) +{ + reenter(&drain_state) { + *need_retry = true; + yield req->finish_write(); + *need_retry = !req->is_done(); + while (!req->is_done()) { + yield caller->io_block(0, 
req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_CONTROL)); + *need_retry = !req->is_done(); + } + + map headers; + req->get_out_headers(&headers); + handle_headers(headers); + + return req->get_req_retcode(); + } + return 0; +} + +RGWStreamSpliceCR::RGWStreamSpliceCR(CephContext *_cct, RGWHTTPManager *_mgr, + shared_ptr& _in_crf, + shared_ptr& _out_crf) : RGWCoroutine(_cct), cct(_cct), http_manager(_mgr), + in_crf(_in_crf), out_crf(_out_crf) {} +RGWStreamSpliceCR::~RGWStreamSpliceCR() { } + +int RGWStreamSpliceCR::operate() { + reenter(this) { + { + int ret = in_crf->init(); + if (ret < 0) { + return set_cr_error(ret); + } + } + + do { + + bl.clear(); + + do { + yield { + ret = in_crf->read(&bl, 4 * 1024 * 1024, &need_retry); + if (ret < 0) { + return set_cr_error(ret); + } + } + + if (retcode < 0) { + ldout(cct, 20) << __func__ << ": in_crf->read() retcode=" << retcode << dendl; + return set_cr_error(ret); + } + } while (need_retry); + + ldout(cct, 20) << "read " << bl.length() << " bytes" << dendl; + + if (!in_crf->has_attrs()) { + assert (bl.length() == 0); + continue; + } + + if (!sent_attrs) { + int ret = out_crf->init(); + if (ret < 0) { + return set_cr_error(ret); + } + out_crf->send_ready(in_crf->get_rest_obj()); + ret = out_crf->send(); + if (ret < 0) { + return set_cr_error(ret); + } + sent_attrs = true; + } + + if (bl.length() == 0 && in_crf->is_done()) { + break; + } + + total_read += bl.length(); + + do { + yield { + ldout(cct, 20) << "writing " << bl.length() << " bytes" << dendl; + ret = out_crf->write(bl, &need_retry); + if (ret < 0) { + return set_cr_error(ret); + } + } + + if (retcode < 0) { + ldout(cct, 20) << __func__ << ": out_crf->write() retcode=" << retcode << dendl; + return set_cr_error(ret); + } + } while (need_retry); + } while (true); + + do { + yield { + int ret = out_crf->drain_writes(&need_retry); + if (ret < 0) { + return set_cr_error(ret); + } + } + } while (need_retry); + + return set_cr_done(); + } + return 0; +} + diff --git 
a/src/rgw/rgw_cr_rest.h b/src/rgw/rgw_cr_rest.h new file mode 100644 index 00000000..a73828b3 --- /dev/null +++ b/src/rgw/rgw_cr_rest.h @@ -0,0 +1,593 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_CR_REST_H +#define CEPH_RGW_CR_REST_H + +#include +#include +#include "include/ceph_assert.h" // boost header clobbers our assert.h + +#include "rgw_coroutine.h" +#include "rgw_rest_conn.h" + + +struct rgw_rest_obj { + rgw_obj_key key; + uint64_t content_len; + std::map attrs; + std::map custom_attrs; + RGWAccessControlPolicy acls; + + void init(const rgw_obj_key& _key) { + key = _key; + } +}; + +class RGWReadRawRESTResourceCR : public RGWSimpleCoroutine { + bufferlist *result; + protected: + RGWRESTConn *conn; + RGWHTTPManager *http_manager; + string path; + param_vec_t params; + param_vec_t extra_headers; +public: + boost::intrusive_ptr http_op; + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params, bufferlist *_result) + : RGWSimpleCoroutine(_cct), result(_result), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)) + {} + + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)) + {} + + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params, param_vec_t &hdrs) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)), + extra_headers(hdrs) + {} + + RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params, + 
std::map *hdrs) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)), + extra_headers(make_param_list(hdrs)) + {} + + + ~RGWReadRawRESTResourceCR() override { + request_cleanup(); + } + + int send_request() override { + auto op = boost::intrusive_ptr( + new RGWRESTReadResource(conn, path, params, &extra_headers, http_manager)); + + init_new_io(op.get()); + + int ret = op->aio_read(); + if (ret < 0) { + log_error() << "failed to send http operation: " << op->to_str() + << " ret=" << ret << std::endl; + op->put(); + return ret; + } + std::swap(http_op, op); // store reference in http_op on success + return 0; + } + + + + virtual int wait_result() { + return http_op->wait(result); + } + + int request_complete() override { + int ret; + + ret = wait_result(); + + auto op = std::move(http_op); // release ref on return + if (ret < 0) { + error_stream << "http operation failed: " << op->to_str() + << " status=" << op->get_http_status() << std::endl; + op->put(); + return ret; + } + op->put(); + return 0; + } + + void request_cleanup() override { + if (http_op) { + http_op->put(); + http_op = NULL; + } + } + +}; + + +template +class RGWReadRESTResourceCR : public RGWReadRawRESTResourceCR { + T *result; + public: + RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params, T *_result) + : RGWReadRawRESTResourceCR(_cct, _conn, _http_manager, _path, params), result(_result) + {} + + RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params, + std::map *hdrs, + T *_result) + : RGWReadRawRESTResourceCR(_cct, _conn, _http_manager, _path, params, hdrs), result(_result) + {} + + int wait_result() override { + return http_op->wait(result); + } + +}; + +template +class RGWSendRawRESTResourceCR: public RGWSimpleCoroutine { + protected: + RGWRESTConn 
*conn; + RGWHTTPManager *http_manager; + string method; + string path; + param_vec_t params; + param_vec_t headers; + map *attrs; + T *result; + E *err_result; + bufferlist input_bl; + bool send_content_length=false; + boost::intrusive_ptr http_op; + + public: + RGWSendRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _method, const string& _path, + rgw_http_param_pair *_params, + map *_attrs, + bufferlist& _input, T *_result, + bool _send_content_length, + E *_err_result = nullptr) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + method(_method), path(_path), params(make_param_list(_params)), + headers(make_param_list(_attrs)), attrs(_attrs), + result(_result), err_result(_err_result), + input_bl(_input), send_content_length(_send_content_length) {} + + RGWSendRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _method, const string& _path, + rgw_http_param_pair *_params, map *_attrs, + T *_result, E *_err_result = nullptr) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + method(_method), path(_path), params(make_param_list(_params)), headers(make_param_list(_attrs)), attrs(_attrs), result(_result), + err_result(_err_result) {} + + ~RGWSendRawRESTResourceCR() override { + request_cleanup(); + } + + int send_request() override { + auto op = boost::intrusive_ptr( + new RGWRESTSendResource(conn, method, path, params, &headers, http_manager)); + + init_new_io(op.get()); + + int ret = op->aio_send(input_bl); + if (ret < 0) { + lsubdout(cct, rgw, 0) << "ERROR: failed to send request" << dendl; + op->put(); + return ret; + } + std::swap(http_op, op); // store reference in http_op on success + return 0; + } + + int request_complete() override { + int ret; + if (result || err_result) { + ret = http_op->wait(result, err_result); + } else { + bufferlist bl; + ret = http_op->wait(&bl); + } + auto op = 
std::move(http_op); // release ref on return + if (ret < 0) { + error_stream << "http operation failed: " << op->to_str() + << " status=" << op->get_http_status() << std::endl; + lsubdout(cct, rgw, 5) << "failed to wait for op, ret=" << ret + << ": " << op->to_str() << dendl; + op->put(); + return ret; + } + op->put(); + return 0; + } + + void request_cleanup() override { + if (http_op) { + http_op->put(); + http_op = NULL; + } + } +}; + +template +class RGWSendRESTResourceCR : public RGWSendRawRESTResourceCR { + public: + RGWSendRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _method, const string& _path, + rgw_http_param_pair *_params, map *_attrs, + S& _input, T *_result, E *_err_result = nullptr) + : RGWSendRawRESTResourceCR(_cct, _conn, _http_manager, _method, _path, _params, _attrs, _result, _err_result) { + + JSONFormatter jf; + encode_json("data", _input, &jf); + std::stringstream ss; + jf.flush(ss); + //bufferlist bl; + this->input_bl.append(ss.str()); + } + +}; + +template +class RGWPostRESTResourceCR : public RGWSendRESTResourceCR { +public: + RGWPostRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params, S& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "POST", _path, + _params, nullptr, _input, + _result, _err_result) {} +}; + +template +class RGWPutRawRESTResourceCR: public RGWSendRawRESTResourceCR { + public: + RGWPutRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params, bufferlist& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRawRESTResourceCR(_cct, _conn, _http_manager, "PUT", _path, + _params, nullptr, _input, _result, true, _err_result) {} + +}; + +template +class RGWPostRawRESTResourceCR: public RGWSendRawRESTResourceCR { + public: + 
RGWPostRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params, + map * _attrs, + bufferlist& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRawRESTResourceCR(_cct, _conn, _http_manager, "POST", _path, + _params, _attrs, _input, _result, true, _err_result) {} + +}; + + +template +class RGWPutRESTResourceCR : public RGWSendRESTResourceCR { +public: + RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params, S& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "PUT", _path, + _params, nullptr, _input, + _result, _err_result) {} + + RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params, + map *_attrs, + S& _input, T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "PUT", _path, + _params, _attrs, _input, + _result, _err_result) {} + +}; + +class RGWDeleteRESTResourceCR : public RGWSimpleCoroutine { + RGWRESTConn *conn; + RGWHTTPManager *http_manager; + string path; + param_vec_t params; + + boost::intrusive_ptr http_op; + +public: + RGWDeleteRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(_params)) + {} + + ~RGWDeleteRESTResourceCR() override { + request_cleanup(); + } + + int send_request() override { + auto op = boost::intrusive_ptr( + new RGWRESTDeleteResource(conn, path, params, nullptr, http_manager)); + + init_new_io(op.get()); + + bufferlist bl; + + int ret = op->aio_send(bl); + if (ret < 0) { + lsubdout(cct, rgw, 0) << "ERROR: failed to send DELETE request" << dendl; + op->put(); + 
return ret; + } + std::swap(http_op, op); // store reference in http_op on success + return 0; + } + + int request_complete() override { + int ret; + bufferlist bl; + ret = http_op->wait(&bl); + auto op = std::move(http_op); // release ref on return + if (ret < 0) { + error_stream << "http operation failed: " << op->to_str() + << " status=" << op->get_http_status() << std::endl; + lsubdout(cct, rgw, 5) << "failed to wait for op, ret=" << ret + << ": " << op->to_str() << dendl; + op->put(); + return ret; + } + op->put(); + return 0; + } + + void request_cleanup() override { + if (http_op) { + http_op->put(); + http_op = NULL; + } + } +}; + +class RGWCRHTTPGetDataCB : public RGWHTTPStreamRWRequest::ReceiveCB { + Mutex lock; + RGWCoroutinesEnv *env; + RGWCoroutine *cr; + RGWHTTPStreamRWRequest *req; + rgw_io_id io_id; + bufferlist data; + bufferlist extra_data; + bool got_all_extra_data{false}; + bool paused{false}; + bool notified{false}; +public: + RGWCRHTTPGetDataCB(RGWCoroutinesEnv *_env, RGWCoroutine *_cr, RGWHTTPStreamRWRequest *_req); + + int handle_data(bufferlist& bl, bool *pause) override; + + void claim_data(bufferlist *dest, uint64_t max); + + bufferlist& get_extra_data() { + return extra_data; + } + + bool has_data() { + return (data.length() > 0); + } + + bool has_all_extra_data() { + return got_all_extra_data; + } +}; + + +class RGWStreamReadResourceCRF { +protected: + boost::asio::coroutine read_state; + +public: + virtual int init() = 0; + virtual int read(bufferlist *data, uint64_t max, bool *need_retry) = 0; /* reentrant */ + virtual int decode_rest_obj(map& headers, bufferlist& extra_data) = 0; + virtual bool has_attrs() = 0; + virtual void get_attrs(std::map *attrs) = 0; + virtual ~RGWStreamReadResourceCRF() = default; +}; + +class RGWStreamWriteResourceCRF { +protected: + boost::asio::coroutine write_state; + boost::asio::coroutine drain_state; + +public: + virtual int init() = 0; + virtual void send_ready(const rgw_rest_obj& rest_obj) = 0; + 
virtual int send() = 0; + virtual int write(bufferlist& data, bool *need_retry) = 0; /* reentrant */ + virtual int drain_writes(bool *need_retry) = 0; /* reentrant */ + + virtual ~RGWStreamWriteResourceCRF() = default; +}; + +class RGWStreamReadHTTPResourceCRF : public RGWStreamReadResourceCRF { + CephContext *cct; + RGWCoroutinesEnv *env; + RGWCoroutine *caller; + RGWHTTPManager *http_manager; + + RGWHTTPStreamRWRequest *req{nullptr}; + + std::optional in_cb; + + bufferlist extra_data; + + bool got_attrs{false}; + bool got_extra_data{false}; + + rgw_io_id io_read_mask; + +protected: + rgw_rest_obj rest_obj; + + struct range_info { + bool is_set{false}; + uint64_t ofs; + uint64_t size; + } range; + + ceph::real_time mtime; + string etag; + +public: + RGWStreamReadHTTPResourceCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWHTTPManager *_http_manager, + const rgw_obj_key& _src_key) : cct(_cct), + env(_env), + caller(_caller), + http_manager(_http_manager) { + rest_obj.init(_src_key); + } + ~RGWStreamReadHTTPResourceCRF(); + + int init() override; + int read(bufferlist *data, uint64_t max, bool *need_retry) override; /* reentrant */ + int decode_rest_obj(map& headers, bufferlist& extra_data) override; + bool has_attrs() override; + void get_attrs(std::map *attrs) override; + bool is_done(); + virtual bool need_extra_data() { return false; } + + void set_req(RGWHTTPStreamRWRequest *r) { + req = r; + } + + rgw_rest_obj& get_rest_obj() { + return rest_obj; + } + + void set_range(uint64_t ofs, uint64_t size) { + range.is_set = true; + range.ofs = ofs; + range.size = size; + } +}; + +class RGWStreamWriteHTTPResourceCRF : public RGWStreamWriteResourceCRF { +protected: + RGWCoroutinesEnv *env; + RGWCoroutine *caller; + RGWHTTPManager *http_manager; + + using lock_guard = std::lock_guard; + + std::mutex blocked_lock; + bool is_blocked; + + RGWHTTPStreamRWRequest *req{nullptr}; + + struct multipart_info { + bool is_multipart{false}; + string 
upload_id; + int part_num{0}; + uint64_t part_size; + } multipart; + + class WriteDrainNotify : public RGWWriteDrainCB { + RGWStreamWriteHTTPResourceCRF *crf; + public: + explicit WriteDrainNotify(RGWStreamWriteHTTPResourceCRF *_crf) : crf(_crf) {} + void notify(uint64_t pending_size) override; + } write_drain_notify_cb; + +public: + RGWStreamWriteHTTPResourceCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWHTTPManager *_http_manager) : env(_env), + caller(_caller), + http_manager(_http_manager), + write_drain_notify_cb(this) {} + virtual ~RGWStreamWriteHTTPResourceCRF(); + + int init() override { + return 0; + } + void send_ready(const rgw_rest_obj& rest_obj) override; + int send() override; + int write(bufferlist& data, bool *need_retry) override; /* reentrant */ + void write_drain_notify(uint64_t pending_size); + int drain_writes(bool *need_retry) override; /* reentrant */ + + virtual void handle_headers(const std::map& headers) {} + + void set_req(RGWHTTPStreamRWRequest *r) { + req = r; + } + + void set_multipart(const string& upload_id, int part_num, uint64_t part_size) { + multipart.is_multipart = true; + multipart.upload_id = upload_id; + multipart.part_num = part_num; + multipart.part_size = part_size; + } +}; + +class RGWStreamSpliceCR : public RGWCoroutine { + CephContext *cct; + RGWHTTPManager *http_manager; + string url; + std::shared_ptr in_crf; + std::shared_ptr out_crf; + bufferlist bl; + bool need_retry{false}; + bool sent_attrs{false}; + uint64_t total_read{0}; + int ret{0}; +public: + RGWStreamSpliceCR(CephContext *_cct, RGWHTTPManager *_mgr, + std::shared_ptr& _in_crf, + std::shared_ptr& _out_crf); + ~RGWStreamSpliceCR(); + + int operate() override; +}; + +#endif diff --git a/src/rgw/rgw_cr_tools.cc b/src/rgw/rgw_cr_tools.cc new file mode 100644 index 00000000..85654cb7 --- /dev/null +++ b/src/rgw/rgw_cr_tools.cc @@ -0,0 +1,275 @@ +#include "common/errno.h" + +#include "rgw_cr_tools.h" +#include "rgw_bucket.h" 
+#include "rgw_user.h" +#include "rgw_op.h" +#include "rgw_acl_s3.h" +#include "rgw_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +template<> +int RGWUserCreateCR::Request::_send_request() +{ + CephContext *cct = store->ctx(); + + const int32_t default_max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + + RGWUserAdminOpState op_state; + + auto& user = params.user; + + op_state.set_user_id(user); + op_state.set_display_name(params.display_name); + op_state.set_user_email(params.email); + op_state.set_caps(params.caps); + op_state.set_access_key(params.access_key); + op_state.set_secret_key(params.secret_key); + + if (!params.key_type.empty()) { + int32_t key_type = KEY_TYPE_S3; + if (params.key_type == "swift") { + key_type = KEY_TYPE_SWIFT; + } + + op_state.set_key_type(key_type); + } + + op_state.set_max_buckets(params.max_buckets.value_or(default_max_buckets)); + op_state.set_suspension(params.suspended); + op_state.set_system(params.system); + op_state.set_exclusive(params.exclusive); + + if (params.generate_key) { + op_state.set_generate_key(); + } + + + if (params.apply_quota) { + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + + if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) { + bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects; + bucket_quota.enabled = true; + } + + if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) { + bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size; + bucket_quota.enabled = true; + } + + if (cct->_conf->rgw_user_default_quota_max_objects >= 0) { + user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects; + user_quota.enabled = true; + } + + if (cct->_conf->rgw_user_default_quota_max_size >= 0) { + user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size; + user_quota.enabled = true; + } + + if (bucket_quota.enabled) { + op_state.set_bucket_quota(bucket_quota); + } + + if (user_quota.enabled) { + 
op_state.set_user_quota(user_quota); + } + } + + RGWNullFlusher flusher; + return RGWUserAdminOp_User::create(store, op_state, flusher); +} + +template<> +int RGWGetUserInfoCR::Request::_send_request() +{ + return rgw_get_user_info_by_uid(store, params.user, *result); +} + +template<> +int RGWGetBucketInfoCR::Request::_send_request() +{ + RGWSysObjectCtx obj_ctx(store->svc.sysobj->init_obj_ctx()); + return store->get_bucket_info(obj_ctx, params.tenant, params.bucket_name, + result->bucket_info, &result->mtime, &result->attrs); +} + +template<> +int RGWBucketCreateLocalCR::Request::_send_request() +{ + CephContext *cct = store->ctx(); + auto& zone_svc = store->svc.zone; + auto& sysobj_svc = store->svc.sysobj; + + const auto& user_info = params.user_info.get(); + const auto& user = user_info->user_id; + const auto& bucket_name = params.bucket_name; + auto& placement_rule = params.placement_rule; + + if (!placement_rule.empty() && + !zone_svc->get_zone_params().valid_placement(placement_rule)) { + ldout(cct, 0) << "placement target (" << placement_rule << ")" + << " doesn't exist in the placement targets of zonegroup" + << " (" << zone_svc->get_zonegroup().api_name << ")" << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + + /* we need to make sure we read bucket info, it's not read before for this + * specific request */ + RGWSysObjectCtx sysobj_ctx(sysobj_svc->init_obj_ctx()); + RGWBucketInfo bucket_info; + map bucket_attrs; + + int ret = store->get_bucket_info(sysobj_ctx, user.tenant, bucket_name, + bucket_info, nullptr, &bucket_attrs); + if (ret < 0 && ret != -ENOENT) + return ret; + bool bucket_exists = (ret != -ENOENT); + + RGWAccessControlPolicy old_policy(cct); + ACLOwner bucket_owner; + bucket_owner.set_id(user); + bucket_owner.set_name(user_info->display_name); + if (bucket_exists) { + ret = rgw_op_get_bucket_policy_from_attr(cct, store, bucket_info, + bucket_attrs, &old_policy); + if (ret >= 0) { + if (old_policy.get_owner().get_id().compare(user) != 
0) { + return -EEXIST; + } + } + } + + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket = nullptr; + uint32_t *pmaster_num_shards = nullptr; + real_time creation_time; + + string zonegroup_id = zone_svc->get_zonegroup().get_id(); + + if (bucket_exists) { + rgw_placement_rule selected_placement_rule; + rgw_bucket bucket; + bucket.tenant = user.tenant; + bucket.name = bucket_name; + ret = zone_svc->select_bucket_placement(*user_info, zonegroup_id, + placement_rule, + &selected_placement_rule, nullptr); + if (selected_placement_rule != bucket_info.placement_rule) { + ldout(cct, 0) << "bucket already exists on a different placement rule: " + << " selected_rule= " << selected_placement_rule + << " existing_rule= " << bucket_info.placement_rule << dendl; + return -EEXIST; + } + } + + /* Encode special metadata first as we're using std::map::emplace under + * the hood. This method will add the new items only if the map doesn't + * contain such keys yet. */ + RGWAccessControlPolicy_S3 policy(cct); + policy.create_canned(bucket_owner, bucket_owner, string()); /* default private policy */ + bufferlist aclbl; + policy.encode(aclbl); + map attrs; + attrs.emplace(std::move(RGW_ATTR_ACL), std::move(aclbl)); + + RGWQuotaInfo quota_info; + const RGWQuotaInfo * pquota_info = nullptr; + + rgw_bucket bucket; + bucket.tenant = user.tenant; + bucket.name = bucket_name; + + RGWBucketInfo info; + obj_version ep_objv; + + ret = store->create_bucket(*user_info, bucket, zonegroup_id, + placement_rule, bucket_info.swift_ver_location, + pquota_info, attrs, + info, nullptr, &ep_objv, creation_time, + pmaster_bucket, pmaster_num_shards, true); + + + if (ret && ret != -EEXIST) + return ret; + + bool existed = (ret == -EEXIST); + + if (existed) { + if (info.owner != user) { + ldout(cct, 20) << "NOTICE: bucket already exists under a different user (bucket=" << bucket << " user=" << user << " bucket_owner=" << info.owner << dendl; + return -EEXIST; + } + bucket = info.bucket; + } + + ret = 
rgw_link_bucket(store, user, bucket, + info.creation_time, false); + if (ret && !existed && ret != -EEXIST) { + /* if it exists (or previously existed), don't remove it! */ + int r = rgw_unlink_bucket(store, user, bucket.tenant, bucket.name); + if (r < 0) { + ldout(cct, 0) << "WARNING: failed to unlink bucket: ret=" << r << dendl; + } + } else if (ret == -EEXIST || (ret == 0 && existed)) { + ret = -ERR_BUCKET_EXISTS; + } + + if (ret < 0) { + ldout(cct, 0) << "ERROR: bucket creation (bucket=" << bucket << ") return ret=" << ret << dendl; + } + + return ret; +} + +template<> +int RGWObjectSimplePutCR::Request::_send_request() +{ + RGWDataAccess::ObjectRef obj; + + CephContext *cct = store->ctx(); + + int ret = params.bucket->get_object(params.key, &obj); + if (ret < 0) { + lderr(cct) << "ERROR: failed to get object: " << cpp_strerror(-ret) << dendl; + return -ret; + } + + if (params.user_data) { + obj->set_user_data(*params.user_data); + } + + ret = obj->put(params.data, params.attrs); + if (ret < 0) { + lderr(cct) << "ERROR: put object returned error: " << cpp_strerror(-ret) << dendl; + } + + return 0; +} + +template<> +int RGWBucketLifecycleConfigCR::Request::_send_request() +{ + CephContext *cct = store->ctx(); + + RGWLC *lc = store->get_lc(); + if (!lc) { + lderr(cct) << "ERROR: lifecycle object is not initialized!" 
<< dendl; + return -EIO; + } + + int ret = lc->set_bucket_config(params.bucket_info, + params.bucket_attrs, + ¶ms.config); + if (ret < 0) { + lderr(cct) << "ERROR: failed to set lifecycle on bucke: " << cpp_strerror(-ret) << dendl; + return -ret; + } + + return 0; +} diff --git a/src/rgw/rgw_cr_tools.h b/src/rgw/rgw_cr_tools.h new file mode 100644 index 00000000..24e9d7a8 --- /dev/null +++ b/src/rgw/rgw_cr_tools.h @@ -0,0 +1,75 @@ +#ifndef CEPH_RGW_CR_TOOLS_H +#define CEPH_RGW_CR_TOOLS_H + +#include "rgw_cr_rados.h" +#include "rgw_tools.h" +#include "rgw_lc.h" + + +struct rgw_user_create_params { + rgw_user user; + std::string display_name; + std::string email; + std::string access_key; + std::string secret_key; + std::string key_type; /* "swift" or "s3" */ + std::string caps; + + bool generate_key{true}; + bool suspended{false}; + std::optional max_buckets; + bool system{false}; + bool exclusive{false}; + bool apply_quota{true}; +}; + +using RGWUserCreateCR = RGWSimpleWriteOnlyAsyncCR; + +struct rgw_get_user_info_params { + rgw_user user; +}; + +using RGWGetUserInfoCR = RGWSimpleAsyncCR; + +struct rgw_get_bucket_info_params { + string tenant; + string bucket_name; +}; + +struct rgw_get_bucket_info_result { + ceph::real_time mtime; + RGWBucketInfo bucket_info; + map attrs; +}; + +using RGWGetBucketInfoCR = RGWSimpleAsyncCR; + +struct rgw_bucket_create_local_params { + shared_ptr user_info; + std::string bucket_name; + rgw_placement_rule placement_rule; +}; + +using RGWBucketCreateLocalCR = RGWSimpleWriteOnlyAsyncCR; + +struct rgw_object_simple_put_params { + RGWDataAccess::BucketRef bucket; + rgw_obj_key key; + bufferlist data; + map attrs; + std::optional user_data; +}; + +using RGWObjectSimplePutCR = RGWSimpleWriteOnlyAsyncCR; + + +struct rgw_bucket_lifecycle_config_params { + RGWBucketInfo bucket_info; + map bucket_attrs; + RGWLifecycleConfiguration config; +}; + +using RGWBucketLifecycleConfigCR = RGWSimpleWriteOnlyAsyncCR; + + +#endif diff --git 
a/src/rgw/rgw_crypt.cc b/src/rgw/rgw_crypt.cc new file mode 100644 index 00000000..08f28552 --- /dev/null +++ b/src/rgw/rgw_crypt.cc @@ -0,0 +1,1317 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/** + * Crypto filters for Put/Post/Get operations. + */ + +#include +#include +#include +#include +#include +#include "include/ceph_assert.h" +#include +#include +#include "include/str_map.h" +#include "crypto/crypto_accel.h" +#include "crypto/crypto_plugin.h" +#ifdef USE_NSS +# include +# include +# include +#endif + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace rgw; + +/** + * Encryption in CTR mode. offset is used as IV for each block. + */ +class AES_256_CTR : public BlockCrypt { +public: + static const size_t AES_256_KEYSIZE = 256 / 8; + static const size_t AES_256_IVSIZE = 128 / 8; +private: + static const uint8_t IV[AES_256_IVSIZE]; + CephContext* cct; + uint8_t key[AES_256_KEYSIZE]; +public: + explicit AES_256_CTR(CephContext* cct): cct(cct) { + } + ~AES_256_CTR() { + ::ceph::crypto::zeroize_for_security(key, AES_256_KEYSIZE); + } + bool set_key(const uint8_t* _key, size_t key_size) { + if (key_size != AES_256_KEYSIZE) { + return false; + } + memcpy(key, _key, AES_256_KEYSIZE); + return true; + } + size_t get_block_size() { + return AES_256_IVSIZE; + } + +#ifdef USE_NSS + + bool encrypt(bufferlist& input, off_t in_ofs, size_t size, bufferlist& output, off_t stream_offset) + { + bool result = false; + PK11SlotInfo *slot; + SECItem keyItem; + PK11SymKey *symkey; + CK_AES_CTR_PARAMS ctr_params = {0}; + SECItem ivItem; + SECItem *param; + SECStatus ret; + PK11Context *ectx; + int written; + unsigned int written2; + + slot = PK11_GetBestSlot(CKM_AES_CTR, NULL); + if (slot) { + keyItem.type = siBuffer; + keyItem.data = key; + keyItem.len = AES_256_KEYSIZE; + + symkey = PK11_ImportSymKey(slot, CKM_AES_CTR, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); + if (symkey) { 
+ static_assert(sizeof(ctr_params.cb) >= AES_256_IVSIZE, "Must fit counter"); + ctr_params.ulCounterBits = 128; + prepare_iv(reinterpret_cast(&ctr_params.cb), stream_offset); + + ivItem.type = siBuffer; + ivItem.data = (unsigned char*)&ctr_params; + ivItem.len = sizeof(ctr_params); + + param = PK11_ParamFromIV(CKM_AES_CTR, &ivItem); + if (param) { + ectx = PK11_CreateContextBySymKey(CKM_AES_CTR, CKA_ENCRYPT, symkey, param); + if (ectx) { + buffer::ptr buf((size + AES_256_KEYSIZE - 1) / AES_256_KEYSIZE * AES_256_KEYSIZE); + ret = PK11_CipherOp(ectx, + (unsigned char*)buf.c_str(), &written, buf.length(), + (unsigned char*)input.c_str() + in_ofs, size); + if (ret == SECSuccess) { + ret = PK11_DigestFinal(ectx, + (unsigned char*)buf.c_str() + written, &written2, + buf.length() - written); + if (ret == SECSuccess) { + buf.set_length(written + written2); + output.append(buf); + result = true; + } + } + PK11_DestroyContext(ectx, PR_TRUE); + } + SECITEM_FreeItem(param, PR_TRUE); + } + PK11_FreeSymKey(symkey); + } + PK11_FreeSlot(slot); + } + if (result == false) { + ldout(cct, 5) << "Failed to perform AES-CTR encryption: " << PR_GetError() << dendl; + } + return result; + } + +#else +# error "No supported crypto implementation found." 
+#endif + /* in CTR encrypt is the same as decrypt */ + bool decrypt(bufferlist& input, off_t in_ofs, size_t size, bufferlist& output, off_t stream_offset) { + return encrypt(input, in_ofs, size, output, stream_offset); + } + + void prepare_iv(unsigned char iv[AES_256_IVSIZE], off_t offset) { + off_t index = offset / AES_256_IVSIZE; + off_t i = AES_256_IVSIZE - 1; + unsigned int val; + unsigned int carry = 0; + while (i>=0) { + val = (index & 0xff) + IV[i] + carry; + iv[i] = val; + carry = val >> 8; + index = index >> 8; + i--; + } + } +}; + +const uint8_t AES_256_CTR::IV[AES_256_CTR::AES_256_IVSIZE] = + { 'a', 'e', 's', '2', '5', '6', 'i', 'v', '_', 'c', 't', 'r', '1', '3', '3', '7' }; + + +CryptoAccelRef get_crypto_accel(CephContext *cct) +{ + CryptoAccelRef ca_impl = nullptr; + stringstream ss; + PluginRegistry *reg = cct->get_plugin_registry(); + string crypto_accel_type = cct->_conf->plugin_crypto_accelerator; + + CryptoPlugin *factory = dynamic_cast(reg->get_with_load("crypto", crypto_accel_type)); + if (factory == nullptr) { + lderr(cct) << __func__ << " cannot load crypto accelerator of type " << crypto_accel_type << dendl; + return nullptr; + } + int err = factory->factory(&ca_impl, &ss); + if (err) { + lderr(cct) << __func__ << " factory return error " << err << + " with description: " << ss.str() << dendl; + } + return ca_impl; +} + + +/** + * Encryption in CBC mode. Chunked to 4K blocks. Offset is used as IV for each 4K block. + * + * + * + * A. Encryption + * 1. Input is split to 4K chunks + remainder in one, smaller chunk + * 2. Each full chunk is encrypted separately with CBC chained mode, with initial IV derived from offset + * 3. Last chunk is 16*m + n. + * 4. 16*m bytes are encrypted with CBC chained mode, with initial IV derived from offset + * 5. Last n bytes are xor-ed with pattern obtained by CBC encryption of + * last encrypted 16 byte block <16m-16, 16m-15) with IV = {0}. + * 6. 
(Special case) If m == 0 then last n bytes are xor-ed with pattern + * obtained by CBC encryption of {0} with IV derived from offset + * + * B. Decryption + * 1. Input is split to 4K chunks + remainder in one, smaller chunk + * 2. Each full chunk is decrypted separately with CBC chained mode, with initial IV derived from offset + * 3. Last chunk is 16*m + n. + * 4. 16*m bytes are decrypted with CBC chained mode, with initial IV derived from offset + * 5. Last n bytes are xor-ed with pattern obtained by CBC ENCRYPTION of + * last (still encrypted) 16 byte block <16m-16,16m-15) with IV = {0} + * 6. (Special case) If m == 0 then last n bytes are xor-ed with pattern + * obtained by CBC ENCRYPTION of {0} with IV derived from offset + */ +class AES_256_CBC : public BlockCrypt { +public: + static const size_t AES_256_KEYSIZE = 256 / 8; + static const size_t AES_256_IVSIZE = 128 / 8; + static const size_t CHUNK_SIZE = 4096; +private: + static const uint8_t IV[AES_256_IVSIZE]; + CephContext* cct; + uint8_t key[AES_256_KEYSIZE]; +public: + explicit AES_256_CBC(CephContext* cct): cct(cct) { + } + ~AES_256_CBC() { + ::ceph::crypto::zeroize_for_security(key, AES_256_KEYSIZE); + } + bool set_key(const uint8_t* _key, size_t key_size) { + if (key_size != AES_256_KEYSIZE) { + return false; + } + memcpy(key, _key, AES_256_KEYSIZE); + return true; + } + size_t get_block_size() { + return CHUNK_SIZE; + } + +#ifdef USE_NSS + + bool cbc_transform(unsigned char* out, + const unsigned char* in, + size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE], + bool encrypt) + { + bool result = false; + PK11SlotInfo *slot; + SECItem keyItem; + PK11SymKey *symkey; + CK_AES_CBC_ENCRYPT_DATA_PARAMS ctr_params = {0}; + SECItem ivItem; + SECItem *param; + SECStatus ret; + PK11Context *ectx; + int written; + + slot = PK11_GetBestSlot(CKM_AES_CBC, NULL); + if (slot) { + keyItem.type = siBuffer; + keyItem.data = const_cast(&key[0]); + keyItem.len = 
AES_256_KEYSIZE; + symkey = PK11_ImportSymKey(slot, CKM_AES_CBC, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); + if (symkey) { + memcpy(ctr_params.iv, iv, AES_256_IVSIZE); + ivItem.type = siBuffer; + ivItem.data = (unsigned char*)&ctr_params; + ivItem.len = sizeof(ctr_params); + + param = PK11_ParamFromIV(CKM_AES_CBC, &ivItem); + if (param) { + ectx = PK11_CreateContextBySymKey(CKM_AES_CBC, encrypt?CKA_ENCRYPT:CKA_DECRYPT, symkey, param); + if (ectx) { + ret = PK11_CipherOp(ectx, + out, &written, size, + in, size); + if ((ret == SECSuccess) && (written == (int)size)) { + result = true; + } + PK11_DestroyContext(ectx, PR_TRUE); + } + SECITEM_FreeItem(param, PR_TRUE); + } + PK11_FreeSymKey(symkey); + } + PK11_FreeSlot(slot); + } + if (result == false) { + ldout(cct, 5) << "Failed to perform AES-CBC encryption: " << PR_GetError() << dendl; + } + return result; + } + +#else +# error "No supported crypto implementation found." +#endif + + bool cbc_transform(unsigned char* out, + const unsigned char* in, + size_t size, + off_t stream_offset, + const unsigned char (&key)[AES_256_KEYSIZE], + bool encrypt) + { + static std::atomic failed_to_get_crypto(false); + CryptoAccelRef crypto_accel; + if (! failed_to_get_crypto.load()) + { + crypto_accel = get_crypto_accel(cct); + if (!crypto_accel) + failed_to_get_crypto = true; + } + bool result = true; + unsigned char iv[AES_256_IVSIZE]; + for (size_t offset = 0; result && (offset < size); offset += CHUNK_SIZE) { + size_t process_size = offset + CHUNK_SIZE <= size ? 
CHUNK_SIZE : size - offset; + prepare_iv(iv, stream_offset + offset); + if (crypto_accel != nullptr) { + if (encrypt) { + result = crypto_accel->cbc_encrypt(out + offset, in + offset, + process_size, iv, key); + } else { + result = crypto_accel->cbc_decrypt(out + offset, in + offset, + process_size, iv, key); + } + } else { + result = cbc_transform( + out + offset, in + offset, process_size, + iv, key, encrypt); + } + } + return result; + } + + + bool encrypt(bufferlist& input, + off_t in_ofs, + size_t size, + bufferlist& output, + off_t stream_offset) + { + bool result = false; + size_t aligned_size = size / AES_256_IVSIZE * AES_256_IVSIZE; + size_t unaligned_rest_size = size - aligned_size; + output.clear(); + buffer::ptr buf(aligned_size + AES_256_IVSIZE); + unsigned char* buf_raw = reinterpret_cast(buf.c_str()); + const unsigned char* input_raw = reinterpret_cast(input.c_str()); + + /* encrypt main bulk of data */ + result = cbc_transform(buf_raw, + input_raw + in_ofs, + aligned_size, + stream_offset, key, true); + if (result && (unaligned_rest_size > 0)) { + /* remainder to encrypt */ + if (aligned_size % CHUNK_SIZE > 0) { + /* use last chunk for unaligned part */ + unsigned char iv[AES_256_IVSIZE] = {0}; + result = cbc_transform(buf_raw + aligned_size, + buf_raw + aligned_size - AES_256_IVSIZE, + AES_256_IVSIZE, + iv, key, true); + } else { + /* 0 full blocks in current chunk, use IV as base for unaligned part */ + unsigned char iv[AES_256_IVSIZE] = {0}; + unsigned char data[AES_256_IVSIZE]; + prepare_iv(data, stream_offset + aligned_size); + result = cbc_transform(buf_raw + aligned_size, + data, + AES_256_IVSIZE, + iv, key, true); + } + if (result) { + for(size_t i = aligned_size; i < size; i++) { + *(buf_raw + i) ^= *(input_raw + in_ofs + i); + } + } + } + if (result) { + ldout(cct, 25) << "Encrypted " << size << " bytes"<< dendl; + buf.set_length(size); + output.append(buf); + } else { + ldout(cct, 5) << "Failed to encrypt" << dendl; + } + return result; + 
}
+
+
+  bool decrypt(bufferlist& input,
+               off_t in_ofs,
+               size_t size,
+               bufferlist& output,
+               off_t stream_offset)
+  {
+    bool result = false;
+    size_t aligned_size = size / AES_256_IVSIZE * AES_256_IVSIZE;
+    size_t unaligned_rest_size = size - aligned_size;
+    output.clear();
+    buffer::ptr buf(aligned_size + AES_256_IVSIZE);
+    unsigned char* buf_raw = reinterpret_cast(buf.c_str());
+    unsigned char* input_raw = reinterpret_cast(input.c_str());
+
+    /* decrypt main bulk of data */
+    result = cbc_transform(buf_raw,
+                           input_raw + in_ofs,
+                           aligned_size,
+                           stream_offset, key, false);
+    if (result && unaligned_rest_size > 0) {
+      /* remainder to decrypt */
+      if (aligned_size % CHUNK_SIZE > 0) {
+        /*use last chunk for unaligned part*/
+        unsigned char iv[AES_256_IVSIZE] = {0};
+        result = cbc_transform(buf_raw + aligned_size,
+                               input_raw + in_ofs + aligned_size - AES_256_IVSIZE,
+                               AES_256_IVSIZE,
+                               iv, key, true);
+      } else {
+        /* 0 full blocks in current chunk, use IV as base for unaligned part */
+        unsigned char iv[AES_256_IVSIZE] = {0};
+        unsigned char data[AES_256_IVSIZE];
+        prepare_iv(data, stream_offset + aligned_size);
+        result = cbc_transform(buf_raw + aligned_size,
+                               data,
+                               AES_256_IVSIZE,
+                               iv, key, true);
+      }
+      if (result) {
+        for(size_t i = aligned_size; i < size; i++) {
+          *(buf_raw + i) ^= *(input_raw + in_ofs + i);
+        }
+      }
+    }
+    if (result) {
+      ldout(cct, 25) << "Decrypted " << size << " bytes"<< dendl;
+      buf.set_length(size);
+      output.append(buf);
+    } else {
+      ldout(cct, 5) << "Failed to decrypt" << dendl;
+    }
+    return result;
+  }
+
+
+  void prepare_iv(unsigned char (&iv)[AES_256_IVSIZE], off_t offset) {
+    off_t index = offset / AES_256_IVSIZE;
+    off_t i = AES_256_IVSIZE - 1;
+    unsigned int val;
+    unsigned int carry = 0;
+    while (i>=0) {
+      val = (index & 0xff) + IV[i] + carry;
+      iv[i] = val;
+      carry = val >> 8;
+      index = index >> 8;
+      i--;
+    }
+  }
+};
+
+/* Builds an AES_256_CBC cipher keyed with the len bytes at key. set_key() accepts only AES_256_KEYSIZE bytes; for any other len a null pointer is returned instead of a cipher whose key was never set. */
+std::unique_ptr AES_256_CBC_create(CephContext* cct, const uint8_t* key, size_t len)
+{
+  auto cbc = std::unique_ptr(new AES_256_CBC(cct));
+  if (!cbc->set_key(key, len)) cbc.reset(); /* honour len: never copy AES_256_KEYSIZE bytes out of a buffer of another size */
+  return std::move(cbc);
+}
+
+
+const uint8_t AES_256_CBC::IV[AES_256_CBC::AES_256_IVSIZE] =
+    { 'a', 'e', 's', '2', '5', '6', 'i', 'v', '_', 'c', 't', 'r', '1', '3', '3', '7' };
+
+
+#ifdef USE_NSS
+
+bool AES_256_ECB_encrypt(CephContext* cct,
+                         const uint8_t* key,
+                         size_t key_size,
+                         const uint8_t* data_in,
+                         uint8_t* data_out,
+                         size_t data_size) {
+  bool result = false;
+  PK11SlotInfo *slot;
+  SECItem keyItem;
+  PK11SymKey *symkey;
+  SECItem *param;
+  SECStatus ret;
+  PK11Context *ectx;
+  int written;
+  unsigned int written2;
+  if (key_size == AES_256_KEYSIZE) {
+    slot = PK11_GetBestSlot(CKM_AES_ECB, NULL);
+    if (slot) {
+      keyItem.type = siBuffer;
+      keyItem.data = const_cast(key);
+      keyItem.len = AES_256_KEYSIZE;
+
+      param = PK11_ParamFromIV(CKM_AES_ECB, NULL);
+      if (param) {
+        symkey = PK11_ImportSymKey(slot, CKM_AES_ECB, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL);
+        if (symkey) {
+          ectx = PK11_CreateContextBySymKey(CKM_AES_ECB, CKA_ENCRYPT, symkey, param);
+          if (ectx) {
+            ret = PK11_CipherOp(ectx,
+                                data_out, &written, data_size,
+                                data_in, data_size);
+            if (ret == SECSuccess) {
+              ret = PK11_DigestFinal(ectx,
+                                     data_out + written, &written2,
+                                     data_size - written);
+              if (ret == SECSuccess) {
+                result = true;
+              }
+            }
+            PK11_DestroyContext(ectx, PR_TRUE);
+          }
+          PK11_FreeSymKey(symkey);
+        }
+        SECITEM_FreeItem(param, PR_TRUE);
+      }
+      PK11_FreeSlot(slot);
+    }
+    if (result == false) {
+      ldout(cct, 5) << "Failed to perform AES-ECB encryption: " << PR_GetError() << dendl;
+    }
+  } else {
+    ldout(cct, 5) << "Key size must be 256 bits long" << dendl;
+  }
+  return result;
+}
+
+#else
+# error "No supported crypto implementation found."
+#endif + + +RGWGetObj_BlockDecrypt::RGWGetObj_BlockDecrypt(CephContext* cct, + RGWGetObj_Filter* next, + std::unique_ptr crypt): + RGWGetObj_Filter(next), + cct(cct), + crypt(std::move(crypt)), + enc_begin_skip(0), + ofs(0), + end(0), + cache() +{ + block_size = this->crypt->get_block_size(); +} + +RGWGetObj_BlockDecrypt::~RGWGetObj_BlockDecrypt() { +} + +int RGWGetObj_BlockDecrypt::read_manifest(bufferlist& manifest_bl) { + parts_len.clear(); + RGWObjManifest manifest; + if (manifest_bl.length()) { + auto miter = manifest_bl.cbegin(); + try { + decode(manifest, miter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl; + return -EIO; + } + RGWObjManifest::obj_iterator mi; + for (mi = manifest.obj_begin(); mi != manifest.obj_end(); ++mi) { + if (mi.get_cur_stripe() == 0) { + parts_len.push_back(0); + } + parts_len.back() += mi.get_stripe_size(); + } + if (cct->_conf->subsys.should_gather()) { + for (size_t i = 0; i [" << bl_ofs << "," << bl_end << "]" << dendl; + return 0; +} + +int RGWGetObj_BlockDecrypt::process(bufferlist& in, size_t part_ofs, size_t size) +{ + bufferlist data; + if (!crypt->decrypt(in, 0, size, data, part_ofs)) { + return -ERR_INTERNAL_ERROR; + } + off_t send_size = size - enc_begin_skip; + if (ofs + enc_begin_skip + send_size > end + 1) { + send_size = end + 1 - ofs - enc_begin_skip; + } + int res = next->handle_data(data, enc_begin_skip, send_size); + enc_begin_skip = 0; + ofs += size; + in.splice(0, size); + return res; +} + +int RGWGetObj_BlockDecrypt::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + ldout(cct, 25) << "Decrypt " << bl_len << " bytes" << dendl; + bl.copy(bl_ofs, bl_len, cache); + + int res = 0; + size_t part_ofs = ofs; + for (size_t part : parts_len) { + if (part_ofs >= part) { + part_ofs -= part; + } else if (part_ofs + cache.length() >= part) { + // flush data up to part boundaries, aligned or not + res = process(cache, part_ofs, part - part_ofs); + if (res < 0) { + 
return res; + } + part_ofs = 0; + } else { + break; + } + } + // write up to block boundaries, aligned only + off_t aligned_size = cache.length() & ~(block_size - 1); + if (aligned_size > 0) { + res = process(cache, part_ofs, aligned_size); + } + return res; +} + +/** + * flush remainder of data to output + */ +int RGWGetObj_BlockDecrypt::flush() { + ldout(cct, 25) << "Decrypt flushing " << cache.length() << " bytes" << dendl; + int res = 0; + size_t part_ofs = ofs; + for (size_t part : parts_len) { + if (part_ofs >= part) { + part_ofs -= part; + } else if (part_ofs + cache.length() >= part) { + // flush data up to part boundaries, aligned or not + res = process(cache, part_ofs, part - part_ofs); + if (res < 0) { + return res; + } + part_ofs = 0; + } else { + break; + } + } + // flush up to block boundaries, aligned or not + if (cache.length() > 0) { + res = process(cache, part_ofs, cache.length()); + } + return res; +} + +RGWPutObj_BlockEncrypt::RGWPutObj_BlockEncrypt(CephContext* cct, + rgw::putobj::DataProcessor *next, + std::unique_ptr crypt) + : Pipe(next), + cct(cct), + crypt(std::move(crypt)), + block_size(this->crypt->get_block_size()) +{ +} + +int RGWPutObj_BlockEncrypt::process(bufferlist&& data, uint64_t logical_offset) +{ + ldout(cct, 25) << "Encrypt " << data.length() << " bytes" << dendl; + + // adjust logical offset to beginning of cached data + ceph_assert(logical_offset >= cache.length()); + logical_offset -= cache.length(); + + const bool flush = (data.length() == 0); + cache.claim_append(data); + + uint64_t proc_size = cache.length() & ~(block_size - 1); + if (flush) { + proc_size = cache.length(); + } + if (proc_size > 0) { + bufferlist in, out; + cache.splice(0, proc_size, &in); + if (!crypt->encrypt(in, 0, proc_size, out, logical_offset)) { + return -ERR_INTERNAL_ERROR; + } + int r = Pipe::process(std::move(out), logical_offset); + logical_offset += proc_size; + if (r < 0) + return r; + } + + if (flush) { + /*replicate 0-sized handle_data*/ + 
return Pipe::process({}, logical_offset);
+  }
+  return 0;
+}
+
+
+std::string create_random_key_selector(CephContext * const cct) {
+  char random[AES_256_KEYSIZE];
+  cct->random()->get_bytes(&random[0], sizeof(random));
+  return std::string(random, sizeof(random));
+}
+
+static int get_barbican_url(CephContext * const cct,
+                     std::string& url)
+{
+  url = cct->_conf->rgw_barbican_url;
+  if (url.empty()) {
+    ldout(cct, 0) << "ERROR: conf rgw_barbican_url is not set" << dendl;
+    return -EINVAL;
+  }
+
+  if (url.back() != '/') {
+    url.append("/");
+  }
+
+  return 0;
+}
+
+static int request_key_from_barbican(CephContext *cct,
+                              boost::string_view key_id,
+                              boost::string_view key_selector,
+                              const std::string& barbican_token,
+                              std::string& actual_key) {
+  std::string secret_url;
+  int res;
+  res = get_barbican_url(cct, secret_url);
+  if (res < 0) {
+     return res;
+  }
+  secret_url += "v1/secrets/" + std::string(key_id);
+
+  bufferlist secret_bl;
+  RGWHTTPTransceiver secret_req(cct, "GET", secret_url, &secret_bl);
+  secret_req.append_header("Accept", "application/octet-stream");
+  secret_req.append_header("X-Auth-Token", barbican_token);
+
+  res = secret_req.process();
+  if (res < 0) {
+    return res;
+  }
+  if (secret_req.get_http_status() ==
+      RGWHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) {
+    return -EACCES;
+  }
+
+  if (secret_req.get_http_status() >=200 &&
+      secret_req.get_http_status() < 300 &&
+      secret_bl.length() == AES_256_KEYSIZE) {
+    actual_key.assign(secret_bl.c_str(), secret_bl.length());
+    ::ceph::crypto::zeroize_for_security(secret_bl.c_str(), secret_bl.length());
+    } else {
+      res = -EACCES;
+    }
+  return res;
+}
+
+static map get_str_map(const string &str) {
+  map m;
+  get_str_map(str, &m, ";, \t");
+  return m;
+}
+/* Resolves the data key for key_id into actual_key. If key_id is listed in rgw_crypt_s3_kms_encryption_keys, the key is the AES-256-ECB encryption of key_selector under that base64-decoded master key; otherwise the key is fetched from barbican. Returns 0 on success, negative on failure. */
+static int get_actual_key_from_kms(CephContext *cct,
+                                   boost::string_view key_id,
+                                   boost::string_view key_selector,
+                                   std::string& actual_key)
+{
+  int res = 0;
+  ldout(cct, 20) << "Getting KMS encryption key for key=" << key_id << dendl;
+  static map str_map = get_str_map(
+      cct->_conf->rgw_crypt_s3_kms_encryption_keys);
+
+  map::iterator it = str_map.find(std::string(key_id));
+  if (it != str_map.end() ) {
+    std::string master_key;
+    try {
+      master_key = from_base64((*it).second);
+    } catch (...) {
+      ldout(cct, 5) << "ERROR: get_actual_key_from_kms invalid encryption key id "
+                    << "which contains character that is not base64 encoded."
+                    << dendl;
+      return -EINVAL;
+    }
+
+    if (master_key.length() == AES_256_KEYSIZE && key_selector.size() == AES_256_KEYSIZE) { /* the ECB call reads AES_256_KEYSIZE bytes from both buffers */
+      uint8_t _actual_key[AES_256_KEYSIZE];
+      if (AES_256_ECB_encrypt(cct,
+          reinterpret_cast(master_key.c_str()), AES_256_KEYSIZE,
+          reinterpret_cast(key_selector.data()),
+          _actual_key, AES_256_KEYSIZE)) {
+        actual_key = std::string((char*)&_actual_key[0], AES_256_KEYSIZE);
+      } else {
+        res = -EIO;
+      }
+      ::ceph::crypto::zeroize_for_security(_actual_key, sizeof(_actual_key));
+    } else {
+      ldout(cct, 20) << "Wrong size for key=" << key_id << dendl; /* also reached when the selector is missing or short */
+      res = -EIO;
+    }
+  } else {
+    std::string token;
+    if (rgw::keystone::Service::get_keystone_barbican_token(cct, token) < 0) {
+      ldout(cct, 5) << "Failed to retrieve token for barbican" << dendl;
+      res = -EINVAL;
+      return res;
+    }
+
+    res = request_key_from_barbican(cct, key_id, key_selector, token, actual_key);
+    if (res != 0) {
+      ldout(cct, 5) << "Failed to retrieve secret from barbican:" << key_id << dendl;
+    }
+  }
+  return res;
+}
+
+static inline void set_attr(map& attrs,
+                            const char* key,
+                            boost::string_view value)
+{
+  bufferlist bl;
+  bl.append(value.data(), value.size());
+  attrs[key] = std::move(bl);
+}
+
+static inline std::string get_str_attribute(map& attrs,
+                                            const char *name)
+{
+  auto iter = attrs.find(name);
+  if (iter == attrs.end()) {
+    return {};
+  }
+  return iter->second.to_str();
+}
+
+typedef enum {
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM=0,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
+  X_AMZ_SERVER_SIDE_ENCRYPTION,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID,
+
X_AMZ_SERVER_SIDE_ENCRYPTION_LAST +} crypt_option_e; + +typedef struct { + const char* http_header_name; + const std::string post_part_name; +} crypt_option_names; + +static const crypt_option_names crypt_options[] = { + {"HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM", "x-amz-server-side-encryption-customer-algorithm"}, + {"HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", "x-amz-server-side-encryption-customer-key"}, + {"HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5", "x-amz-server-side-encryption-customer-key-md5"}, + {"HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION", "x-amz-server-side-encryption"}, + {"HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID", "x-amz-server-side-encryption-aws-kms-key-id"}, +}; + +static boost::string_view get_crypt_attribute( + const RGWEnv* env, + std::map* parts, + crypt_option_e option) +{ + static_assert( + X_AMZ_SERVER_SIDE_ENCRYPTION_LAST == sizeof(crypt_options)/sizeof(*crypt_options), + "Missing items in crypt_options"); + if (parts != nullptr) { + auto iter + = parts->find(crypt_options[option].post_part_name); + if (iter == parts->end()) + return boost::string_view(); + bufferlist& data = iter->second.data; + boost::string_view str = boost::string_view(data.c_str(), data.length()); + return rgw_trim_whitespace(str); + } else { + const char* hdr = env->get(crypt_options[option].http_header_name, nullptr); + if (hdr != nullptr) { + return boost::string_view(hdr); + } else { + return boost::string_view(); + } + } +} + + +int rgw_s3_prepare_encrypt(struct req_state* s, + std::map& attrs, + std::map* parts, + std::unique_ptr* block_crypt, + std::map& crypt_http_responses) +{ + int res = 0; + crypt_http_responses.clear(); + { + boost::string_view req_sse_ca = + get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM); + if (! 
req_sse_ca.empty()) { + if (req_sse_ca != "AES256") { + ldout(s->cct, 5) << "ERROR: Invalid value for header " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "The requested encryption algorithm is not valid, must be AES256."; + return -ERR_INVALID_ENCRYPTION_ALGORITHM; + } + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + + std::string key_bin; + try { + key_bin = from_base64( + get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY) ); + } catch (...) { + ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption " + << "key which contains character that is not base64 encoded." + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) { + ldout(s->cct, 5) << "ERROR: invalid encryption key size" << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key."; + return -EINVAL; + } + + boost::string_view keymd5 = + get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5); + + std::string keymd5_bin; + try { + keymd5_bin = from_base64(keymd5); + } catch (...) { + ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption key " + << "md5 which contains character that is not base64 encoded." 
+ << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key md5."; + return -EINVAL; + } + + if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) { + ldout(s->cct, 5) << "ERROR: Invalid key md5 size" << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide an appropriate secret key md5."; + return -EINVAL; + } + + MD5 key_hash; + unsigned char key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE]; + key_hash.Update(reinterpret_cast(key_bin.c_str()), key_bin.size()); + key_hash.Final(key_hash_res); + + if (memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) { + ldout(s->cct, 5) << "ERROR: Invalid key md5 hash" << dendl; + s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided."; + return -EINVAL; + } + + set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-C-AES256"); + set_attr(attrs, RGW_ATTR_CRYPT_KEYMD5, keymd5_bin); + + if (block_crypt) { + auto aes = std::unique_ptr(new AES_256_CBC(s->cct)); + aes->set_key(reinterpret_cast(key_bin.c_str()), AES_256_KEYSIZE); + *block_crypt = std::move(aes); + } + + crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256"; + crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = keymd5.to_string(); + return 0; + } else { + boost::string_view customer_key = + get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY); + if (!customer_key.empty()) { + ldout(s->cct, 5) << "ERROR: SSE-C encryption request is missing the header " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide a valid encryption algorithm."; + return -EINVAL; + } + + boost::string_view customer_key_md5 = + get_crypt_attribute(s->info.env, parts, 
X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5); + if (!customer_key_md5.empty()) { + ldout(s->cct, 5) << "ERROR: SSE-C encryption request is missing the header " + << "x-amz-server-side-encryption-customer-algorithm" + << dendl; + s->err.message = "Requests specifying Server Side Encryption with Customer " + "provided keys must provide a valid encryption algorithm."; + return -EINVAL; + } + } + + /* AMAZON server side encryption with KMS (key management service) */ + boost::string_view req_sse = + get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION); + if (! req_sse.empty()) { + + if (s->cct->_conf->rgw_crypt_require_ssl && + !rgw_transport_is_secure(s->cct, *s->info.env)) { + ldout(s->cct, 5) << "ERROR: insecure request, rgw_crypt_require_ssl is set" << dendl; + return -ERR_INVALID_REQUEST; + } + + if (req_sse == "aws:kms") { + boost::string_view key_id = + get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID); + if (key_id.empty()) { + ldout(s->cct, 5) << "ERROR: not provide a valid key id" << dendl; + s->err.message = "Server Side Encryption with KMS managed key requires " + "HTTP header x-amz-server-side-encryption-aws-kms-key-id"; + return -ERR_INVALID_ACCESS_KEY; + } + /* try to retrieve actual key */ + std::string key_selector = create_random_key_selector(s->cct); + std::string actual_key; + res = get_actual_key_from_kms(s->cct, key_id, key_selector, actual_key); + if (res != 0) { + ldout(s->cct, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl; + s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id.to_string(); + return res; + } + if (actual_key.size() != AES_256_KEYSIZE) { + ldout(s->cct, 5) << "ERROR: key obtained from key_id:" << + key_id << " is not 256 bit size" << dendl; + s->err.message = "KMS provided an invalid key for the given kms-keyid."; + return -ERR_INVALID_ACCESS_KEY; + } + set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-KMS"); + set_attr(attrs, 
RGW_ATTR_CRYPT_KEYID, key_id); + set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector); + + if (block_crypt) { + auto aes = std::unique_ptr(new AES_256_CBC(s->cct)); + aes->set_key(reinterpret_cast(actual_key.c_str()), AES_256_KEYSIZE); + *block_crypt = std::move(aes); + } + actual_key.replace(0, actual_key.length(), actual_key.length(), '\000'); + + crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms"; + crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = key_id.to_string(); + return 0; + } else if (req_sse == "AES256") { + /* if a default encryption key was provided, we will use it for SSE-S3 */ + } else { + ldout(s->cct, 5) << "ERROR: Invalid value for header x-amz-server-side-encryption" + << dendl; + s->err.message = "Server Side Encryption with KMS managed key requires " + "HTTP header x-amz-server-side-encryption : aws:kms or AES256"; + return -EINVAL; + } + } else { + /* x-amz-server-side-encryption not present or empty */ + boost::string_view key_id = + get_crypt_attribute(s->info.env, parts, + X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID); + if (!key_id.empty()) { + ldout(s->cct, 5) << "ERROR: SSE-KMS encryption request is missing the header " + << "x-amz-server-side-encryption" + << dendl; + s->err.message = "Server Side Encryption with KMS managed key requires " + "HTTP header x-amz-server-side-encryption : aws:kms"; + return -EINVAL; + } + } + + /* no other encryption mode, check if default encryption is selected */ + if (s->cct->_conf->rgw_crypt_default_encryption_key != "") { + std::string master_encryption_key; + try { + master_encryption_key = from_base64(s->cct->_conf->rgw_crypt_default_encryption_key); + } catch (...) { + ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid default encryption key " + << "which contains character that is not base64 encoded." 
/**
 * Set up decryption for an object according to the crypt attributes stored
 * with it.
 *
 * The stored RGW_ATTR_CRYPT_MODE attribute selects one of three paths:
 *  - "SSE-C-AES256": the key comes from the request headers and is checked
 *    against both the MD5 sent by the client and the MD5 stored in attrs.
 *  - "SSE-KMS": the key is obtained through get_actual_key_from_kms() using
 *    the stored key id and key selector.
 *  - "RGW-AUTO": the key is computed by AES_256_ECB_encrypt() from the
 *    configured default encryption key and the stored key selector.
 * Any other mode is treated as "not encrypted" and returns 0 without
 * touching *block_crypt.
 *
 * @param s                    request state; supplies the request environment,
 *                             configuration, and err.message for error text
 * @param attrs                object attributes read via get_str_attribute()
 * @param block_crypt          optional output; when non-null it receives the
 *                             configured AES_256_CBC instance
 * @param crypt_http_responses receives the SSE response headers to send back
 * @return 0 on success (including the no-decryption case), a negative error
 *         code otherwise
 *
 * NOTE(review): this copy of the file appears to have lost its template
 * arguments (e.g. on map, std::unique_ptr and reinterpret_cast below) during
 * extraction - restore them from the upstream source before compiling.
 */
int rgw_s3_prepare_decrypt(struct req_state* s,
                           map& attrs,
                           std::unique_ptr* block_crypt,
                           std::map& crypt_http_responses)
{
  int res = 0;
  std::string stored_mode = get_str_attribute(attrs, RGW_ATTR_CRYPT_MODE);
  ldout(s->cct, 15) << "Encryption mode: " << stored_mode << dendl;

  // A GET or HEAD request that carries the x-amz-server-side-encryption
  // header is rejected outright, whatever the stored mode is.
  const char *req_sse = s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION", NULL);
  if (nullptr != req_sse && (s->op == OP_GET || s->op == OP_HEAD)) {
    return -ERR_INVALID_REQUEST;
  }

  if (stored_mode == "SSE-C-AES256") {
    // The customer key travels in the request, so optionally insist on a
    // secure transport.
    if (s->cct->_conf->rgw_crypt_require_ssl &&
        !rgw_transport_is_secure(s->cct, *s->info.env)) {
      ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
      return -ERR_INVALID_REQUEST;
    }
    const char *req_cust_alg =
        s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM", NULL);

    // The algorithm header is mandatory and must be exactly "AES256".
    if (nullptr == req_cust_alg) {
      ldout(s->cct, 5) << "ERROR: Request for SSE-C encrypted object missing "
                       << "x-amz-server-side-encryption-customer-algorithm"
                       << dendl;
      s->err.message = "Requests specifying Server Side Encryption with Customer "
                       "provided keys must provide a valid encryption algorithm.";
      return -EINVAL;
    } else if (strcmp(req_cust_alg, "AES256") != 0) {
      ldout(s->cct, 5) << "ERROR: The requested encryption algorithm is not valid, must be AES256." << dendl;
      s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
      return -ERR_INVALID_ENCRYPTION_ALGORITHM;
    }

    // Decode the base64 customer key; any exception from from_base64() is
    // reported to the client as an invalid key.
    std::string key_bin;
    try {
      key_bin = from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", ""));
    } catch (...) {
      ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key "
                       << "which contains character that is not base64 encoded."
                       << dendl;
      s->err.message = "Requests specifying Server Side Encryption with Customer "
                       "provided keys must provide an appropriate secret key.";
      return -EINVAL;
    }

    if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
      ldout(s->cct, 5) << "ERROR: Invalid encryption key size" << dendl;
      s->err.message = "Requests specifying Server Side Encryption with Customer "
                       "provided keys must provide an appropriate secret key.";
      return -EINVAL;
    }

    // Decode the client-supplied MD5 of the key. The base64 form (keymd5) is
    // kept because it is echoed back in the response headers below.
    std::string keymd5 =
        s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5", "");
    std::string keymd5_bin;
    try {
      keymd5_bin = from_base64(keymd5);
    } catch (...) {
      ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key md5 "
                       << "which contains character that is not base64 encoded."
                       << dendl;
      s->err.message = "Requests specifying Server Side Encryption with Customer "
                       "provided keys must provide an appropriate secret key md5.";
      return -EINVAL;
    }


    if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
      ldout(s->cct, 5) << "ERROR: Invalid key md5 size " << dendl;
      s->err.message = "Requests specifying Server Side Encryption with Customer "
                       "provided keys must provide an appropriate secret key md5.";
      return -EINVAL;
    }

    // Hash the supplied key ourselves.
    MD5 key_hash;
    uint8_t key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
    key_hash.Update(reinterpret_cast(key_bin.c_str()), key_bin.size());
    key_hash.Final(key_hash_res);

    // Two checks: the supplied MD5 must match the supplied key, and it must
    // match the MD5 stored in RGW_ATTR_CRYPT_KEYMD5 on the object.
    if ((memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) ||
        (get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYMD5) != keymd5_bin)) {
      s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
      return -EINVAL;
    }
    auto aes = std::unique_ptr(new AES_256_CBC(s->cct));
    aes->set_key(reinterpret_cast(key_bin.c_str()), AES_256_CBC::AES_256_KEYSIZE);
    // block_crypt may be null; in that case the cipher is simply discarded.
    if (block_crypt) *block_crypt = std::move(aes);

    crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256";
    crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = keymd5;
    return 0;
  }

  if (stored_mode == "SSE-KMS") {
    if (s->cct->_conf->rgw_crypt_require_ssl &&
        !rgw_transport_is_secure(s->cct, *s->info.env)) {
      ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
      return -ERR_INVALID_REQUEST;
    }
    /* try to retrieve actual key */
    std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
    std::string key_selector = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYSEL);
    std::string actual_key;
    res = get_actual_key_from_kms(s->cct, key_id, key_selector, actual_key);
    if (res != 0) {
      ldout(s->cct, 10) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
      s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id;
      return res;
    }
    if (actual_key.size() != AES_256_KEYSIZE) {
      ldout(s->cct, 0) << "ERROR: key obtained from key_id:" <<
          key_id << " is not 256 bit size" << dendl;
      s->err.message = "KMS provided an invalid key for the given kms-keyid.";
      return -ERR_INVALID_ACCESS_KEY;
    }

    auto aes = std::unique_ptr(new AES_256_CBC(s->cct));
    aes->set_key(reinterpret_cast(actual_key.c_str()), AES_256_KEYSIZE);
    // Overwrite the key material with NUL bytes once it has been handed to
    // the cipher.
    actual_key.replace(0, actual_key.length(), actual_key.length(), '\000');
    if (block_crypt) *block_crypt = std::move(aes);

    crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms";
    crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = key_id;
    return 0;
  }

  if (stored_mode == "RGW-AUTO") {
    // The master key is the base64 value of the rgw_crypt_default_encryption_key
    // configuration option.
    std::string master_encryption_key;
    try {
      master_encryption_key = from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key));
    } catch (...) {
      ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid default encryption key "
                       << "which contains character that is not base64 encoded."
                       << dendl;
      s->err.message = "The default encryption key is not valid base64.";
      return -EINVAL;
    }

    // Here a wrongly sized master key is a hard error (-EIO): the object is
    // stored encrypted, so it cannot be served without a usable key.
    if (master_encryption_key.size() != 256 / 8) {
      ldout(s->cct, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
      return -EIO;
    }
    std::string attr_key_selector = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYSEL);
    if (attr_key_selector.size() != AES_256_CBC::AES_256_KEYSIZE) {
      ldout(s->cct, 0) << "ERROR: missing or invalid " RGW_ATTR_CRYPT_KEYSEL << dendl;
      return -EIO;
    }
    // Compute the per-object key from the master key and the stored selector.
    // The stack buffer is wiped on every exit path after this point.
    uint8_t actual_key[AES_256_KEYSIZE];
    if (AES_256_ECB_encrypt(s->cct,
                            reinterpret_cast(master_encryption_key.c_str()),
                            AES_256_KEYSIZE,
                            reinterpret_cast(attr_key_selector.c_str()),
                            actual_key, AES_256_KEYSIZE) != true) {
      ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key));
      return -EIO;
    }
    auto aes = std::unique_ptr(new AES_256_CBC(s->cct));
    aes->set_key(actual_key, AES_256_KEYSIZE);
    ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key));
    if (block_crypt) *block_crypt = std::move(aes);
    return 0;
  }
  /*no decryption*/
  return 0;
}
+ * Request with unaligned length is only acceptable for last part of stream. + */ +class BlockCrypt { +public: + BlockCrypt(){}; + virtual ~BlockCrypt(){}; + + /** + * Determines size of encryption block. + * This is usually multiply of key size. + * It determines size of chunks that should be passed to \ref encrypt and \ref decrypt. + */ + virtual size_t get_block_size() = 0; + + /** + * Encrypts data. + * Argument \ref stream_offset shows where in generalized stream chunk is located. + * Input for encryption is \ref input buffer, with relevant data in range crypt; /**< already configured stateless BlockCrypt + for operations when enough data is accumulated */ + off_t enc_begin_skip; /**< amount of data to skip from beginning of received data */ + off_t ofs; /**< stream offset of data we expect to show up next through \ref handle_data */ + off_t end; /**< stream offset of last byte that is requested */ + bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */ + size_t block_size; /**< snapshot of \ref BlockCrypt.get_block_size() */ + + int process(bufferlist& cipher, size_t part_ofs, size_t size); + +protected: + std::vector parts_len; /**< size of parts of multipart object, parsed from manifest */ +public: + RGWGetObj_BlockDecrypt(CephContext* cct, + RGWGetObj_Filter* next, + std::unique_ptr crypt); + virtual ~RGWGetObj_BlockDecrypt(); + + virtual int fixup_range(off_t& bl_ofs, + off_t& bl_end) override; + virtual int handle_data(bufferlist& bl, + off_t bl_ofs, + off_t bl_len) override; + virtual int flush() override; + + int read_manifest(bufferlist& manifest_bl); +}; /* RGWGetObj_BlockDecrypt */ + + +class RGWPutObj_BlockEncrypt : public rgw::putobj::Pipe +{ + CephContext* cct; + std::unique_ptr crypt; /**< already configured stateless BlockCrypt + for operations when enough data is accumulated */ + bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */ + const size_t block_size; /**< 
snapshot of \ref BlockCrypt.get_block_size() */ +public: + RGWPutObj_BlockEncrypt(CephContext* cct, + rgw::putobj::DataProcessor *next, + std::unique_ptr crypt); + + int process(bufferlist&& data, uint64_t logical_offset) override; +}; /* RGWPutObj_BlockEncrypt */ + + +int rgw_s3_prepare_encrypt(struct req_state* s, + std::map& attrs, + std::map* parts, + std::unique_ptr* block_crypt, + std::map& crypt_http_responses); + +int rgw_s3_prepare_decrypt(struct req_state* s, + std::map& attrs, + std::unique_ptr* block_crypt, + std::map& crypt_http_responses); + +#endif diff --git a/src/rgw/rgw_crypt_sanitize.cc b/src/rgw/rgw_crypt_sanitize.cc new file mode 100644 index 00000000..776f1376 --- /dev/null +++ b/src/rgw/rgw_crypt_sanitize.cc @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * rgw_crypt_sanitize.cc + * + * Created on: Mar 3, 2017 + * Author: adam + */ + +#include "rgw_common.h" +#include "rgw_crypt_sanitize.h" +#include "boost/algorithm/string/predicate.hpp" + +namespace rgw { +namespace crypt_sanitize { +const char* HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY = "HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY"; +const char* x_amz_server_side_encryption_customer_key = "x-amz-server-side-encryption-customer-key"; +const char* dollar_x_amz_server_side_encryption_customer_key = "$x-amz-server-side-encryption-customer-key"; +const char* suppression_message = "=suppressed due to key presence="; + +std::ostream& operator<<(std::ostream& out, const env& e) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs) { + if (boost::algorithm::iequals( + e.name, + HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY)) + { + out << suppression_message; + return out; + } + if (boost::algorithm::iequals(e.name, "QUERY_STRING") && + boost::algorithm::ifind_first( + e.value, + x_amz_server_side_encryption_customer_key)) + { + out << suppression_message; + return out; + } + } + out << e.value; + return out; +} 
+ +std::ostream& operator<<(std::ostream& out, const x_meta_map& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + boost::algorithm::iequals(x.name, x_amz_server_side_encryption_customer_key)) + { + out << suppression_message; + return out; + } + out << x.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const s3_policy& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + boost::algorithm::iequals(x.name, dollar_x_amz_server_side_encryption_customer_key)) + { + out << suppression_message; + return out; + } + out << x.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const auth& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + x.s->info.env->get(HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, nullptr) != nullptr) + { + out << suppression_message; + return out; + } + out << x.value; + return out; +} + +std::ostream& operator<<(std::ostream& out, const log_content& x) { + if (g_ceph_context->_conf->rgw_crypt_suppress_logs && + boost::algorithm::ifind_first(x.buf, x_amz_server_side_encryption_customer_key)) { + out << suppression_message; + return out; + } + out << x.buf; + return out; +} + +} +} diff --git a/src/rgw/rgw_crypt_sanitize.h b/src/rgw/rgw_crypt_sanitize.h new file mode 100644 index 00000000..548c1240 --- /dev/null +++ b/src/rgw/rgw_crypt_sanitize.h @@ -0,0 +1,71 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_RGW_CRYPT_SANITIZE_H_ +#define RGW_RGW_CRYPT_SANITIZE_H_ + +#include + +#include "rgw_common.h" + +namespace rgw { +namespace crypt_sanitize { + +/* + * Temporary container for suppressing printing if variable contains secret key. + */ +struct env { + boost::string_ref name; + boost::string_ref value; + + env(boost::string_ref name, boost::string_ref value) + : name(name), value(value) {} +}; + +/* + * Temporary container for suppressing printing if aws meta attributes contains secret key. 
+ */ +struct x_meta_map { + boost::string_ref name; + boost::string_ref value; + x_meta_map(boost::string_ref name, boost::string_ref value) + : name(name), value(value) {} +}; + +/* + * Temporary container for suppressing printing if s3_policy calculation variable contains secret key. + */ +struct s3_policy { + boost::string_ref name; + boost::string_ref value; + s3_policy(boost::string_ref name, boost::string_ref value) + : name(name), value(value) {} +}; + +/* + * Temporary container for suppressing printing if auth string contains secret key. + */ +struct auth { + const req_state* const s; + boost::string_ref value; + auth(const req_state* const s, boost::string_ref value) + : s(s), value(value) {} +}; + +/* + * Temporary container for suppressing printing if log made from civetweb may contain secret key. + */ +struct log_content { + const boost::string_view buf; + explicit log_content(const boost::string_view buf) + : buf(buf) {} +}; + +std::ostream& operator<<(std::ostream& out, const env& e); +std::ostream& operator<<(std::ostream& out, const x_meta_map& x); +std::ostream& operator<<(std::ostream& out, const s3_policy& x); +std::ostream& operator<<(std::ostream& out, const auth& x); +std::ostream& operator<<(std::ostream& out, const log_content& x); +} +} +#endif /* RGW_RGW_CRYPT_SANITIZE_H_ */ diff --git a/src/rgw/rgw_data_sync.cc b/src/rgw/rgw_data_sync.cc new file mode 100644 index 00000000..3f70ff84 --- /dev/null +++ b/src/rgw/rgw_data_sync.cc @@ -0,0 +1,3709 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "common/ceph_json.h" +#include "common/RWLock.h" +#include "common/RefCountedObj.h" +#include "common/WorkQueue.h" +#include "common/Throttle.h" +#include "common/errno.h" + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_sync.h" +#include "rgw_data_sync.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" 
+#include "rgw_http_client.h" +#include "rgw_bucket.h" +#include "rgw_metadata.h" +#include "rgw_sync_counters.h" +#include "rgw_sync_module.h" +#include "rgw_sync_log_trim.h" + +#include "cls/lock/cls_lock_client.h" + +#include "services/svc_zone.h" +#include "services/svc_sync_modules.h" + +#include "include/random.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "data sync: ") + +static string datalog_sync_status_oid_prefix = "datalog.sync-status"; +static string datalog_sync_status_shard_prefix = "datalog.sync-status.shard"; +static string datalog_sync_full_sync_index_prefix = "data.full-sync.index"; +static string bucket_status_oid_prefix = "bucket.sync-status"; +static string object_status_oid_prefix = "bucket.sync-status"; + + +void rgw_datalog_info::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("num_objects", num_shards, obj); +} + +void rgw_datalog_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key", key, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); +} + +void rgw_datalog_shard_data::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("entries", entries, obj); +}; + +class RGWReadDataSyncStatusMarkersCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + RGWDataSyncEnv *env; + const int num_shards; + int shard_id{0};; + + map& markers; + + public: + RGWReadDataSyncStatusMarkersCR(RGWDataSyncEnv *env, int num_shards, + map& markers) + : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS), + env(env), num_shards(num_shards), markers(markers) + {} + bool spawn_next() override; +}; + +bool RGWReadDataSyncStatusMarkersCR::spawn_next() +{ + if (shard_id >= num_shards) { + return false; + } + using CR = RGWSimpleRadosReadCR; + spawn(new CR(env->async_rados, env->store->svc.sysobj, + 
rgw_raw_obj(env->store->svc.zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(env->source_zone, shard_id)), + &markers[shard_id]), + false); + shard_id++; + return true; +} + +class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + RGWDataSyncEnv *env; + + uint64_t max_entries; + int num_shards; + int shard_id{0}; + + string marker; + std::vector& omapkeys; + + public: + RGWReadDataSyncRecoveringShardsCR(RGWDataSyncEnv *env, uint64_t _max_entries, int _num_shards, + std::vector& omapkeys) + : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS), env(env), + max_entries(_max_entries), num_shards(_num_shards), omapkeys(omapkeys) + {} + bool spawn_next() override; +}; + +bool RGWReadDataSyncRecoveringShardsCR::spawn_next() +{ + if (shard_id >= num_shards) + return false; + + string error_oid = RGWDataSyncStatusManager::shard_obj_name(env->source_zone, shard_id) + ".retry"; + auto& shard_keys = omapkeys[shard_id]; + shard_keys = std::make_shared(); + spawn(new RGWRadosGetOmapKeysCR(env->store, rgw_raw_obj(env->store->svc.zone->get_zone_params().log_pool, error_oid), + marker, max_entries, shard_keys), false); + + ++shard_id; + return true; +} + +class RGWReadDataSyncStatusCoroutine : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + rgw_data_sync_status *sync_status; + +public: + RGWReadDataSyncStatusCoroutine(RGWDataSyncEnv *_sync_env, + rgw_data_sync_status *_status) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status) + {} + int operate() override; +}; + +int RGWReadDataSyncStatusCoroutine::operate() +{ + reenter(this) { + // read sync info + using ReadInfoCR = RGWSimpleRadosReadCR; + yield { + bool empty_on_enoent = false; // fail on ENOENT + call(new ReadInfoCR(sync_env->async_rados, sync_env->store->svc.sysobj, + rgw_raw_obj(sync_env->store->svc.zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sync_env->source_zone)), + 
/**
 * Coroutine that reads the info of one remote data log shard.
 *
 * It issues a read on "/admin/log/" with the parameters type=data,
 * id=<shard_id> and info, and decodes the reply into *shard_info through
 * http_op->wait().
 */
class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
  RGWDataSyncEnv *sync_env;

  // Pending HTTP read; created in operate() and released in the destructor.
  RGWRESTReadResource *http_op;

  int shard_id;
  RGWDataChangesLogInfo *shard_info;  // output, filled on success

public:
  /**
   * @param _sync_env    sync environment supplying cct, conn and http_manager
   * @param _shard_id    shard whose info is requested
   * @param _shard_info  destination for the decoded reply
   */
  RGWReadRemoteDataLogShardInfoCR(RGWDataSyncEnv *_sync_env,
                                  int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sync_env->cct),
                                                      sync_env(_sync_env),
                                                      http_op(NULL),
                                                      shard_id(_shard_id),
                                                      shard_info(_shard_info) {
  }

  // Drops our reference on the HTTP operation, if one was ever created.
  ~RGWReadRemoteDataLogShardInfoCR() override {
    if (http_op) {
      http_op->put();
    }
  }

  /**
   * Two steps: send the request and block on I/O, then collect the reply.
   *
   * @return the result of set_cr_error()/set_cr_done()/io_block(), or 0 when
   *         control falls out of the reenter block
   */
  int operate() override {
    reenter(this) {
      // Step 1: build and send the request, then block until I/O completes.
      yield {
        char buf[16];
        snprintf(buf, sizeof(buf), "%d", shard_id);
        rgw_http_param_pair pairs[] = { { "type" , "data" },
                                        { "id", buf },
                                        { "info" , NULL },
                                        { NULL, NULL } };

        string p = "/admin/log/";

        http_op = new RGWRESTReadResource(sync_env->conn, p, pairs, NULL, sync_env->http_manager);

        init_new_io(http_op);

        int ret = http_op->aio_read();
        if (ret < 0) {
          ldout(sync_env->cct, 0) << "ERROR: failed to read from " << p << dendl;
          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
          // http_op is left set here; the destructor releases it.
          return set_cr_error(ret);
        }

        return io_block(0);
      }
      // Step 2: decode the reply into *shard_info and finish.
      yield {
        int ret = http_op->wait(shard_info);
        if (ret < 0) {
          return set_cr_error(ret);
        }
        return set_cr_done();
      }
    }
    return 0;
  }
};
+ string marker; + bool truncated; + list entries; + + read_remote_data_log_response() : truncated(false) {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("entries", entries, obj); + }; +}; + +class RGWReadRemoteDataLogShardCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + + RGWRESTReadResource *http_op = nullptr; + + int shard_id; + const std::string& marker; + string *pnext_marker; + list *entries; + bool *truncated; + + read_remote_data_log_response response; + std::optional timer; + +public: + RGWReadRemoteDataLogShardCR(RGWDataSyncEnv *_sync_env, int _shard_id, + const std::string& marker, string *pnext_marker, + list *_entries, + bool *_truncated) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker), + entries(_entries), truncated(_truncated) { + } + ~RGWReadRemoteDataLogShardCR() override { + if (http_op) { + http_op->put(); + } + } + + int operate() override { + reenter(this) { + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "data" }, + { "id", buf }, + { "marker", marker.c_str() }, + { "extra-info", "true" }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(sync_env->conn, p, pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + if (sync_env->counters) { + timer.emplace(sync_env->counters, sync_counters::l_poll); + } + int ret = http_op->aio_read(); + if (ret < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + if (sync_env->counters) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + timer.reset(); + int ret = 
http_op->wait(&response); + if (ret < 0) { + if (sync_env->counters && ret != -ENOENT) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + return set_cr_error(ret); + } + entries->clear(); + entries->swap(response.entries); + *pnext_marker = response.marker; + *truncated = response.truncated; + return set_cr_done(); + } + } + return 0; + } +}; + +class RGWReadRemoteDataLogInfoCR : public RGWShardCollectCR { + RGWDataSyncEnv *sync_env; + + int num_shards; + map *datalog_info; + + int shard_id; +#define READ_DATALOG_MAX_CONCURRENT 10 + +public: + RGWReadRemoteDataLogInfoCR(RGWDataSyncEnv *_sync_env, + int _num_shards, + map *_datalog_info) : RGWShardCollectCR(_sync_env->cct, READ_DATALOG_MAX_CONCURRENT), + sync_env(_sync_env), num_shards(_num_shards), + datalog_info(_datalog_info), shard_id(0) {} + bool spawn_next() override; +}; + +bool RGWReadRemoteDataLogInfoCR::spawn_next() { + if (shard_id >= num_shards) { + return false; + } + spawn(new RGWReadRemoteDataLogShardInfoCR(sync_env, shard_id, &(*datalog_info)[shard_id]), false); + shard_id++; + return true; +} + +class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTReadResource *http_op; + + int shard_id; + string marker; + uint32_t max_entries; + rgw_datalog_shard_data *result; + +public: + RGWListRemoteDataLogShardCR(RGWDataSyncEnv *env, int _shard_id, + const string& _marker, uint32_t _max_entries, + rgw_datalog_shard_data *_result) + : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL), + shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {} + + int send_request() override { + RGWRESTConn *conn = sync_env->conn; + RGWRados *store = sync_env->store; + + char buf[32]; + snprintf(buf, sizeof(buf), "%d", shard_id); + + char max_entries_buf[32]; + snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries); + + const char *marker_key = (marker.empty() ? 
"" : "marker"); + + rgw_http_param_pair pairs[] = { { "type", "data" }, + { "id", buf }, + { "max-entries", max_entries_buf }, + { marker_key, marker.c_str() }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager); + init_new_io(http_op); + + int ret = http_op->aio_read(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return ret; + } + + return 0; + } + + int request_complete() override { + int ret = http_op->wait(result); + http_op->put(); + if (ret < 0 && ret != -ENOENT) { + ldout(sync_env->store->ctx(), 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl; + return ret; + } + return 0; + } +}; + +class RGWListRemoteDataLogCR : public RGWShardCollectCR { + RGWDataSyncEnv *sync_env; + + map shards; + int max_entries_per_shard; + map *result; + + map::iterator iter; +#define READ_DATALOG_MAX_CONCURRENT 10 + +public: + RGWListRemoteDataLogCR(RGWDataSyncEnv *_sync_env, + map& _shards, + int _max_entries_per_shard, + map *_result) : RGWShardCollectCR(_sync_env->cct, READ_DATALOG_MAX_CONCURRENT), + sync_env(_sync_env), max_entries_per_shard(_max_entries_per_shard), + result(_result) { + shards.swap(_shards); + iter = shards.begin(); + } + bool spawn_next() override; +}; + +bool RGWListRemoteDataLogCR::spawn_next() { + if (iter == shards.end()) { + return false; + } + + spawn(new RGWListRemoteDataLogShardCR(sync_env, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false); + ++iter; + return true; +} + +class RGWInitDataSyncStatusCoroutine : public RGWCoroutine { + static constexpr uint32_t lock_duration = 30; + RGWDataSyncEnv *sync_env; + RGWRados *store; + const rgw_pool& pool; + const uint32_t num_shards; + + string sync_status_oid; + + string lock_name; + string cookie; + 
rgw_data_sync_status *status; + map shards_info; + + RGWSyncTraceNodeRef tn; +public: + RGWInitDataSyncStatusCoroutine(RGWDataSyncEnv *_sync_env, uint32_t num_shards, + uint64_t instance_id, + RGWSyncTraceNodeRef& _tn_parent, + rgw_data_sync_status *status) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), store(sync_env->store), + pool(store->svc.zone->get_zone_params().log_pool), + num_shards(num_shards), status(status), + tn(sync_env->sync_tracer->add_node(_tn_parent, "init_data_sync_status")) { + lock_name = "sync_lock"; + + status->sync_info.instance_id = instance_id; + +#define COOKIE_LEN 16 + char buf[COOKIE_LEN + 1]; + + gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1); + cookie = buf; + + sync_status_oid = RGWDataSyncStatusManager::sync_status_oid(sync_env->source_zone); + + } + + int operate() override { + int ret; + reenter(this) { + using LockCR = RGWSimpleRadosLockCR; + yield call(new LockCR(sync_env->async_rados, store, + rgw_raw_obj{pool, sync_status_oid}, + lock_name, cookie, lock_duration)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to take a lock on " << sync_status_oid)); + return set_cr_error(retcode); + } + using WriteInfoCR = RGWSimpleRadosWriteCR; + yield call(new WriteInfoCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj{pool, sync_status_oid}, + status->sync_info)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode)); + return set_cr_error(retcode); + } + + /* take lock again, we just recreated the object */ + yield call(new LockCR(sync_env->async_rados, store, + rgw_raw_obj{pool, sync_status_oid}, + lock_name, cookie, lock_duration)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to take a lock on " << sync_status_oid)); + return set_cr_error(retcode); + } + + tn->log(10, "took lease"); + + /* fetch current position in logs */ + yield { + RGWRESTConn *conn = store->svc.zone->get_zone_conn_by_id(sync_env->source_zone); + if (!conn) { + tn->log(0, SSTR("ERROR: 
connection to zone " << sync_env->source_zone << " does not exist!")); + return set_cr_error(-EIO); + } + for (uint32_t i = 0; i < num_shards; i++) { + spawn(new RGWReadRemoteDataLogShardInfoCR(sync_env, i, &shards_info[i]), true); + } + } + while (collect(&ret, NULL)) { + if (ret < 0) { + tn->log(0, SSTR("ERROR: failed to read remote data log shards")); + return set_state(RGWCoroutine_Error); + } + yield; + } + yield { + for (uint32_t i = 0; i < num_shards; i++) { + RGWDataChangesLogInfo& info = shards_info[i]; + auto& marker = status->sync_markers[i]; + marker.next_step_marker = info.marker; + marker.timestamp = info.last_update; + const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, i); + using WriteMarkerCR = RGWSimpleRadosWriteCR; + spawn(new WriteMarkerCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj{pool, oid}, marker), true); + } + } + while (collect(&ret, NULL)) { + if (ret < 0) { + tn->log(0, SSTR("ERROR: failed to write data sync status markers")); + return set_state(RGWCoroutine_Error); + } + yield; + } + + status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps; + yield call(new WriteInfoCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj{pool, sync_status_oid}, + status->sync_info)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode)); + return set_cr_error(retcode); + } + yield call(new RGWSimpleRadosUnlockCR(sync_env->async_rados, store, + rgw_raw_obj{pool, sync_status_oid}, + lock_name, cookie)); + return set_cr_done(); + } + return 0; + } +}; + +int RGWRemoteDataLog::read_log_info(rgw_datalog_info *log_info) +{ + rgw_http_param_pair pairs[] = { { "type", "data" }, + { NULL, NULL } }; + + int ret = sync_env.conn->get_json_resource("/admin/log", pairs, *log_info); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch datalog info" << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << "remote datalog, num_shards=" << 
log_info->num_shards << dendl; + + return 0; +} + +int RGWRemoteDataLog::read_source_log_shards_info(map *shards_info) +{ + rgw_datalog_info log_info; + int ret = read_log_info(&log_info); + if (ret < 0) { + return ret; + } + + return run(new RGWReadRemoteDataLogInfoCR(&sync_env, log_info.num_shards, shards_info)); +} + +int RGWRemoteDataLog::read_source_log_shards_next(map shard_markers, map *result) +{ + return run(new RGWListRemoteDataLogCR(&sync_env, shard_markers, 1, result)); +} + +int RGWRemoteDataLog::init(const string& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger, + RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& _sync_module, + PerfCounters* counters) +{ + sync_env.init(dpp, store->ctx(), store, _conn, async_rados, &http_manager, _error_logger, + _sync_tracer, _source_zone, _sync_module, counters); + + if (initialized) { + return 0; + } + + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + + tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "data"); + + initialized = true; + + return 0; +} + +void RGWRemoteDataLog::finish() +{ + stop(); +} + +int RGWRemoteDataLog::read_sync_status(rgw_data_sync_status *sync_status) +{ + // cannot run concurrently with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + ret = crs.run(new RGWReadDataSyncStatusCoroutine(&sync_env_local, sync_status)); + http_manager.stop(); + return ret; +} + +int RGWRemoteDataLog::read_recovering_shards(const int num_shards, set& recovering_shards) +{ + // cannot run concurrently 
with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + std::vector omapkeys; + omapkeys.resize(num_shards); + uint64_t max_entries{1}; + ret = crs.run(new RGWReadDataSyncRecoveringShardsCR(&sync_env_local, max_entries, num_shards, omapkeys)); + http_manager.stop(); + + /* on success, a shard is reported as recovering when its omapkeys result holds at least one entry */ + if (ret == 0) { + for (int i = 0; i < num_shards; i++) { + if (omapkeys[i]->entries.size() != 0) { + recovering_shards.insert(i); + } + } + } + + return ret; +} + +/* Runs RGWInitDataSyncStatusCoroutine for num_shards shards on a local coroutine manager and a local HTTP manager, using a randomly generated instance id. Returns the http_manager.start() error if that fails, otherwise the result of crs.run(). The local sync_status object is not returned to the caller. */ +int RGWRemoteDataLog::init_sync_status(int num_shards) +{ + rgw_data_sync_status sync_status; + sync_status.sync_info.num_shards = num_shards; + + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + /* copy the shared env and point the copy at the local HTTP manager */ + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + auto instance_id = ceph::util::generate_random_number(); + ret = crs.run(new RGWInitDataSyncStatusCoroutine(&sync_env_local, num_shards, instance_id, tn, &sync_status)); + http_manager.stop(); + return ret; +} + +/* Formats an object name of the form prefix.source_zone.shard_id, where prefix is datalog_sync_full_sync_index_prefix. */ +static string full_data_sync_index_shard_oid(const string& source_zone, int shard_id) +{ + /* NOTE(review): the array bound is computed at run time, which relies on a compiler extension rather than standard C++ - confirm this is acceptable for all supported compilers */ + char buf[datalog_sync_full_sync_index_prefix.size() + 1 + source_zone.size() + 1 + 16]; + snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_full_sync_index_prefix.c_str(), source_zone.c_str(), shard_id); + return string(buf); +} + +/* Holder for the marker, truncated, keys and count fields that decode_json() reads from a JSON object. */ +struct read_metadata_list { + string marker; + bool truncated; + list keys; + int count; + + read_metadata_list() : truncated(false), 
count(0) {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("keys", keys, obj); + JSONDecoder::decode_json("count", count, obj); + } +}; + +struct bucket_instance_meta_info { + string key; + obj_version ver; + utime_t mtime; + RGWBucketInstanceMetadataObject data; + + bucket_instance_meta_info() {} + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("ver", ver, obj); + JSONDecoder::decode_json("mtime", mtime, obj); + JSONDecoder::decode_json("data", data, obj); + } +}; + +class RGWListBucketIndexesCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + + RGWRados *store; + + rgw_data_sync_status *sync_status; + int num_shards; + + int req_ret; + int ret; + + list::iterator iter; + + RGWShardedOmapCRManager *entries_index; + + string oid_prefix; + + string path; + bucket_instance_meta_info meta_info; + string key; + string s; + int i; + + bool failed; + bool truncated; + read_metadata_list result; + +public: + RGWListBucketIndexesCR(RGWDataSyncEnv *_sync_env, + rgw_data_sync_status *_sync_status) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + store(sync_env->store), sync_status(_sync_status), + req_ret(0), ret(0), entries_index(NULL), i(0), failed(false), truncated(false) { + oid_prefix = datalog_sync_full_sync_index_prefix + "." 
+ sync_env->source_zone; + path = "/admin/metadata/bucket.instance"; + num_shards = sync_status->sync_info.num_shards; + } + ~RGWListBucketIndexesCR() override { + delete entries_index; + } + + /* Pages through the remote bucket.instance metadata listing (1000 keys per request, continuing from result.marker while result.truncated is set), fetches the metadata of every listed key, and appends index entries to entries_index: one entry named key:i per shard when the bucket num_shards is positive, otherwise a single entry named key. */ + int operate() override { + reenter(this) { + entries_index = new RGWShardedOmapCRManager(sync_env->async_rados, store, this, num_shards, + store->svc.zone->get_zone_params().log_pool, + oid_prefix); + yield; // yield so OmapAppendCRs can start + + do { + yield { + string entrypoint = string("/admin/metadata/bucket.instance"); + + rgw_http_param_pair pairs[] = {{"max-entries", "1000"}, + {"marker", result.marker.c_str()}, + {NULL, NULL}}; + + call(new RGWReadRESTResourceCR(store->ctx(), sync_env->conn, sync_env->http_manager, + entrypoint, pairs, &result)); + } + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to fetch metadata for section bucket.instance" << dendl; + return set_cr_error(retcode); + } + + for (iter = result.keys.begin(); iter != result.keys.end(); ++iter) { + ldout(sync_env->cct, 20) << "list metadata: section=bucket.instance key=" << *iter << dendl; + key = *iter; + + yield { + rgw_http_param_pair pairs[] = {{"key", key.c_str()}, + {NULL, NULL}}; + + call(new RGWReadRESTResourceCR(store->ctx(), sync_env->conn, sync_env->http_manager, path, pairs, &meta_info)); + } + /* NOTE(review): retcode of this per-key fetch is not checked before meta_info is used below - confirm that a failed fetch cannot leave meta_info holding data from an earlier key */ + + /* the num_shards member is reused here for the shard count of the bucket just fetched */ + num_shards = meta_info.data.get_bucket_info().num_shards; + if (num_shards > 0) { + for (i = 0; i < num_shards; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), ":%d", i); + s = key + buf; + yield entries_index->append(s, store->data_log->get_log_shard_id(meta_info.data.get_bucket_info().bucket, i)); + } + } else { + yield entries_index->append(key, store->data_log->get_log_shard_id(meta_info.data.get_bucket_info().bucket, -1)); + } + } + truncated = result.truncated; + } while (truncated); + + yield { + if (!entries_index->finish()) { + failed = true; + } + } + if (!failed) { + for (map::iterator iter = sync_status->sync_markers.begin(); iter != 
sync_status->sync_markers.end(); ++iter) { + int shard_id = (int)iter->first; + rgw_data_sync_marker& marker = iter->second; + marker.total_entries = entries_index->get_total_entries(shard_id); + spawn(new RGWSimpleRadosWriteCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id)), + marker), + true); + } + } else { + yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data.init", "", + EIO, string("failed to build bucket instances map"))); + } + while (collect(&ret, NULL)) { + if (ret < 0) { + yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data.init", "", + -ret, string("failed to store sync status: ") + cpp_strerror(-ret))); + req_ret = ret; + } + yield; + } + + drain_all(); + if (req_ret < 0) { + yield return set_cr_error(req_ret); + } + yield return set_cr_done(); + } + return 0; + } +}; + +#define DATA_SYNC_UPDATE_MARKER_WINDOW 1 + +class RGWDataSyncShardMarkerTrack : public RGWSyncShardMarkerTrack { + RGWDataSyncEnv *sync_env; + + string marker_oid; + rgw_data_sync_marker sync_marker; + + map key_to_marker; + map marker_to_key; + + void handle_finish(const string& marker) override { + map::iterator iter = marker_to_key.find(marker); + if (iter == marker_to_key.end()) { + return; + } + key_to_marker.erase(iter->second); + reset_need_retry(iter->second); + marker_to_key.erase(iter); + } + + RGWSyncTraceNodeRef tn; + +public: + RGWDataSyncShardMarkerTrack(RGWDataSyncEnv *_sync_env, + const string& _marker_oid, + const rgw_data_sync_marker& _marker, + RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(DATA_SYNC_UPDATE_MARKER_WINDOW), + sync_env(_sync_env), + marker_oid(_marker_oid), + sync_marker(_marker), + tn(_tn) {} + + RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override { + sync_marker.marker = new_marker; + 
sync_marker.pos = index_pos; + sync_marker.timestamp = timestamp; + + tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker)); + RGWRados *store = sync_env->store; + + return new RGWSimpleRadosWriteCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, marker_oid), + sync_marker); + } + + /* + * create index from key -> marker, and from marker -> key + * this is useful so that we can ensure that we only have one + * entry for any key that is used. This is needed when doing + * incremental sync of data, and we don't want to run multiple + * concurrent sync operations for the same bucket shard + * + * returns false and flags the key as needing a retry when the key + * is already indexed; otherwise records both mappings and returns true + */ + bool index_key_to_marker(const string& key, const string& marker) { + if (key_to_marker.find(key) != key_to_marker.end()) { + set_need_retry(key); + return false; + } + key_to_marker[key] = marker; + marker_to_key[marker] = key; + return true; + } + + /* returns a newly allocated RGWLastCallerWinsCR built from sync_env->cct */ + RGWOrderCallCR *allocate_order_control_cr() override { + return new RGWLastCallerWinsCR(sync_env->cct); + } +}; + +// ostream wrappers to print buckets without copying strings +struct bucket_str { + const rgw_bucket& b; + explicit bucket_str(const rgw_bucket& b) : b(b) {} +}; +/* prints the bucket name, preceded by tenant and a slash when tenant is non-empty, and followed by a colon and bucket_id when bucket_id is non-empty */ +std::ostream& operator<<(std::ostream& out, const bucket_str& rhs) { + auto& b = rhs.b; + if (!b.tenant.empty()) { + out << b.tenant << '/'; + } + out << b.name; + if (!b.bucket_id.empty()) { + out << ':' << b.bucket_id; + } + return out; +} + +struct bucket_str_noinstance { + const rgw_bucket& b; + explicit bucket_str_noinstance(const rgw_bucket& b) : b(b) {} +}; +/* prints the bucket name, preceded by tenant and a slash when tenant is non-empty; bucket_id is never printed */ +std::ostream& operator<<(std::ostream& out, const bucket_str_noinstance& rhs) { + auto& b = rhs.b; + if (!b.tenant.empty()) { + out << b.tenant << '/'; + } + out << b.name; + return out; +} + +struct bucket_shard_str { + const rgw_bucket_shard& bs; + explicit bucket_shard_str(const rgw_bucket_shard& bs) : bs(bs) {} +}; +/* prints the bucket as bucket_str does, followed by a colon and shard_id when shard_id is not negative */ +std::ostream& operator<<(std::ostream& out, const bucket_shard_str& rhs) { + auto& bs = 
rhs.bs; + out << bucket_str{bs.bucket}; + if (bs.shard_id >= 0) { + out << ':' << bs.shard_id; + } + return out; +} + +class RGWRunBucketSyncCoroutine : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + rgw_bucket_shard bs; + RGWBucketInfo bucket_info; + rgw_bucket_shard_sync_info sync_status; + RGWMetaSyncEnv meta_sync_env; + + const std::string status_oid; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + + RGWSyncTraceNodeRef tn; + +public: + RGWRunBucketSyncCoroutine(RGWDataSyncEnv *_sync_env, const rgw_bucket_shard& bs, const RGWSyncTraceNodeRef& _tn_parent) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), bs(bs), + status_oid(RGWBucketSyncStatusManager::status_oid(sync_env->source_zone, bs)), + tn(sync_env->sync_tracer->add_node(_tn_parent, "bucket", + SSTR(bucket_shard_str{bs}))) { + } + ~RGWRunBucketSyncCoroutine() override { + if (lease_cr) { + lease_cr->abort(); + } + } + + int operate() override; +}; + +class RGWDataSyncSingleEntryCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + + string raw_key; + string entry_marker; + + rgw_bucket_shard bs; + + int sync_status; + + bufferlist md_bl; + + RGWDataSyncShardMarkerTrack *marker_tracker; + + boost::intrusive_ptr error_repo; + bool remove_from_repo; + + set keys; + + RGWSyncTraceNodeRef tn; +public: + RGWDataSyncSingleEntryCR(RGWDataSyncEnv *_sync_env, + const string& _raw_key, const string& _entry_marker, RGWDataSyncShardMarkerTrack *_marker_tracker, + RGWOmapAppend *_error_repo, bool _remove_from_repo, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + raw_key(_raw_key), entry_marker(_entry_marker), + sync_status(0), + marker_tracker(_marker_tracker), + error_repo(_error_repo), remove_from_repo(_remove_from_repo) { + set_description() << "data sync single entry (source_zone=" << sync_env->source_zone << ") key=" <<_raw_key << " entry=" << entry_marker; + tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key); + } 
+ + /* Parses raw_key into bs and runs RGWRunBucketSyncCoroutine for it, repeating while the marker tracker flags raw_key for retry. A failure other than -ENOENT is written to the error log (unless it is -EBUSY or -EAGAIN) and appended to error_repo; on success the key is removed from error_repo when remove_from_repo is set. Finally entry_marker, if any, is finished in the marker tracker. */ + int operate() override { + reenter(this) { + do { + yield { + int ret = rgw_bucket_parse_bucket_key(sync_env->cct, raw_key, + &bs.bucket, &bs.shard_id); + if (ret < 0) { + /* any parse failure is reported as -EIO */ + return set_cr_error(-EIO); + } + if (marker_tracker) { + marker_tracker->reset_need_retry(raw_key); + } + tn->log(0, SSTR("triggering sync of bucket/shard " << bucket_shard_str{bs})); + call(new RGWRunBucketSyncCoroutine(sync_env, bs, tn)); + } + } while (marker_tracker && marker_tracker->need_retry(raw_key)); + + sync_status = retcode; + + if (sync_status == -ENOENT) { + // this was added when 'tenant/' was added to datalog entries, because + // preexisting tenant buckets could never sync and would stay in the + // error_repo forever + tn->log(0, SSTR("WARNING: skipping data log entry for missing bucket " << raw_key)); + sync_status = 0; + } + + if (sync_status < 0) { + // write actual sync failures for 'radosgw-admin sync error list' + if (sync_status != -EBUSY && sync_status != -EAGAIN) { + yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data", raw_key, + -sync_status, string("failed to sync bucket instance: ") + cpp_strerror(-sync_status))); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to log sync failure: retcode=" << retcode)); + } + } + if (error_repo && !error_repo->append(raw_key)) { + /* NOTE(review): append() is used as a boolean here, so the retcode logged below comes from an earlier call, not from append() - confirm intended */ + tn->log(0, SSTR("ERROR: failed to log sync failure in error repo: retcode=" << retcode)); + } + } else if (error_repo && remove_from_repo) { + keys = {raw_key}; + yield call(new RGWRadosRemoveOmapKeysCR(sync_env->store, error_repo->get_obj(), keys)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to remove omap key from error repo (" + << error_repo->get_obj() << " retcode=" << retcode)); + } + } + /* FIXME: what to do in case of error */ + if (marker_tracker && !entry_marker.empty()) { + /* update marker */ + yield call(marker_tracker->finish(entry_marker)); + } + /* NOTE(review): when finish() was not called above, retcode still holds the result of an earlier call (possibly the -ENOENT that was cleared from sync_status) - confirm intended */ + if (sync_status == 0) { + sync_status = retcode; + } + if (sync_status < 0) { + return 
set_cr_error(sync_status); + } + return set_cr_done(); + } + return 0; + } +}; + +/* initial value of RGWDataSyncShardCR::spawn_window */ +#define BUCKET_SHARD_SYNC_SPAWN_WINDOW 20 +/* initial value of RGWDataSyncShardCR::max_error_entries */ +#define DATA_SYNC_MAX_ERR_ENTRIES 10 + +/* Coroutine that syncs a single data log shard; its operate() dispatches to full_sync() or incremental_sync() according to sync_marker.state. */ +class RGWDataSyncShardCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + + rgw_pool pool; + + uint32_t shard_id; + rgw_data_sync_marker sync_marker; + + RGWRadosGetOmapKeysCR::ResultPtr omapkeys; + std::set entries; + std::set::iterator iter; + + string oid; + + RGWDataSyncShardMarkerTrack *marker_tracker; + + std::string next_marker; + list log_entries; + list::iterator log_iter; + bool truncated; + + /* inc_lock guards modified_shards */ + Mutex inc_lock; + Cond inc_cond; + + /* separate resume points for incremental_sync() and full_sync() */ + boost::asio::coroutine incremental_cr; + boost::asio::coroutine full_cr; + + + set modified_shards; + set current_modified; + + set::iterator modified_iter; + + int total_entries; + + int spawn_window; + + bool *reset_backoff; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + string status_oid; + + + string error_oid; + RGWOmapAppend *error_repo; + std::set error_entries; + string error_marker; + int max_error_entries; + + ceph::coarse_real_time error_retry_time; + +#define RETRY_BACKOFF_SECS_MIN 60 +#define RETRY_BACKOFF_SECS_DEFAULT 60 +#define RETRY_BACKOFF_SECS_MAX 600 + uint32_t retry_backoff_secs; + + RGWSyncTraceNodeRef tn; +public: + /* NOTE(review): the _reset_backoff argument is not stored; the reset_backoff member is initialised to NULL - confirm intended */ + RGWDataSyncShardCR(RGWDataSyncEnv *_sync_env, + rgw_pool& _pool, + uint32_t _shard_id, const rgw_data_sync_marker& _marker, + RGWSyncTraceNodeRef& _tn, + bool *_reset_backoff) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + pool(_pool), + shard_id(_shard_id), + sync_marker(_marker), + marker_tracker(NULL), truncated(false), inc_lock("RGWDataSyncShardCR::inc_lock"), + total_entries(0), spawn_window(BUCKET_SHARD_SYNC_SPAWN_WINDOW), reset_backoff(NULL), + lease_cr(nullptr), lease_stack(nullptr), error_repo(nullptr), max_error_entries(DATA_SYNC_MAX_ERR_ENTRIES), + retry_backoff_secs(RETRY_BACKOFF_SECS_DEFAULT), tn(_tn) { + set_description() << "data sync shard source_zone=" << sync_env->source_zone 
<< " shard_id=" << shard_id; + status_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id); + error_oid = status_oid + ".retry"; + } + + ~RGWDataSyncShardCR() override { + delete marker_tracker; + if (lease_cr) { + lease_cr->abort(); + } + if (error_repo) { + error_repo->put(); + } + } + + void append_modified_shards(set& keys) { + Mutex::Locker l(inc_lock); + modified_shards.insert(keys.begin(), keys.end()); + } + + void set_marker_tracker(RGWDataSyncShardMarkerTrack *mt) { + delete marker_tracker; + marker_tracker = mt; + } + + int operate() override { + int r; + while (true) { + switch (sync_marker.state) { + case rgw_data_sync_marker::FullSync: + r = full_sync(); + if (r < 0) { + if (r != -EBUSY) { + tn->log(10, SSTR("full sync failed (r=" << r << ")")); + } + return set_cr_error(r); + } + return 0; + case rgw_data_sync_marker::IncrementalSync: + r = incremental_sync(); + if (r < 0) { + if (r != -EBUSY) { + tn->log(10, SSTR("incremental sync failed (r=" << r << ")")); + } + return set_cr_error(r); + } + return 0; + default: + return set_cr_error(-EIO); + } + } + return 0; + } + + void init_lease_cr() { + set_status("acquiring sync lock"); + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + if (lease_cr) { + lease_cr->abort(); + } + RGWRados *store = sync_env->store; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, status_oid), + lock_name, lock_duration, this)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } + + int full_sync() { +#define OMAP_GET_MAX_ENTRIES 100 + int max_entries = OMAP_GET_MAX_ENTRIES; + reenter(&full_cr) { + tn->log(10, "start full sync"); + yield init_lease_cr(); + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + tn->log(5, "failed to take lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(lease_cr->get_ret_status()); + } + 
set_sleeping(true); + yield; + } + tn->log(10, "took lease"); + oid = full_data_sync_index_shard_oid(sync_env->source_zone, shard_id); + set_marker_tracker(new RGWDataSyncShardMarkerTrack(sync_env, status_oid, sync_marker, tn)); + total_entries = sync_marker.pos; + do { + if (!lease_cr->is_locked()) { + stop_spawned_services(); + drain_all(); + return set_cr_error(-ECANCELED); + } + omapkeys = std::make_shared(); + yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid), + sync_marker.marker, max_entries, omapkeys)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: RGWRadosGetOmapKeysCR() returned ret=" << retcode)); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + entries = std::move(omapkeys->entries); + if (entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync")); + iter = entries.begin(); + for (; iter != entries.end(); ++iter) { + tn->log(20, SSTR("full sync: " << *iter)); + total_entries++; + if (!marker_tracker->start(*iter, total_entries, real_time())) { + tn->log(0, SSTR("ERROR: cannot start syncing " << *iter << ". 
Duplicate entry?")); + } else { + // fetch remote and write locally + yield spawn(new RGWDataSyncSingleEntryCR(sync_env, *iter, *iter, marker_tracker, error_repo, false, tn), false); + } + sync_marker.marker = *iter; + + while ((int)num_spawned() > spawn_window) { + set_status() << "num_spawned() > spawn_window"; + yield wait_for_child(); + int ret; + while (collect(&ret, lease_stack.get())) { + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + } + } + } + } + } while (omapkeys->more); + omapkeys.reset(); + + drain_all_but_stack(lease_stack.get()); + + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + + yield { + /* update marker to reflect we're done with full sync */ + sync_marker.state = rgw_data_sync_marker::IncrementalSync; + sync_marker.marker = sync_marker.next_step_marker; + sync_marker.next_step_marker.clear(); + RGWRados *store = sync_env->store; + call(new RGWSimpleRadosWriteCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, status_oid), + sync_marker)); + } + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to set sync marker: retcode=" << retcode)); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + // keep lease and transition to incremental_sync() + } + return 0; + } + + int incremental_sync() { + reenter(&incremental_cr) { + tn->log(10, "start incremental sync"); + if (lease_cr) { + tn->log(10, "lease already held from full sync"); + } else { + yield init_lease_cr(); + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + tn->log(5, "failed to take lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + set_status("lease acquired"); + tn->log(10, "took lease"); + } + error_repo = new RGWOmapAppend(sync_env->async_rados, sync_env->store, + rgw_raw_obj(pool, error_oid), + 1 /* no buffer */); + error_repo->get(); + spawn(error_repo, false); + 
set_marker_tracker(new RGWDataSyncShardMarkerTrack(sync_env, status_oid, sync_marker, tn)); + do { + if (!lease_cr->is_locked()) { + stop_spawned_services(); + drain_all(); + return set_cr_error(-ECANCELED); + } + current_modified.clear(); + inc_lock.Lock(); + current_modified.swap(modified_shards); + inc_lock.Unlock(); + + if (current_modified.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + /* process out of band updates */ + for (modified_iter = current_modified.begin(); modified_iter != current_modified.end(); ++modified_iter) { + yield { + tn->log(20, SSTR("received async update notification: " << *modified_iter)); + spawn(new RGWDataSyncSingleEntryCR(sync_env, *modified_iter, string(), marker_tracker, error_repo, false, tn), false); + } + } + + if (error_retry_time <= ceph::coarse_real_clock::now()) { + /* process bucket shards that previously failed */ + omapkeys = std::make_shared(); + yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, error_oid), + error_marker, max_error_entries, omapkeys)); + error_entries = std::move(omapkeys->entries); + tn->log(20, SSTR("read error repo, got " << error_entries.size() << " entries")); + iter = error_entries.begin(); + for (; iter != error_entries.end(); ++iter) { + error_marker = *iter; + tn->log(20, SSTR("handle error entry: " << error_marker)); + spawn(new RGWDataSyncSingleEntryCR(sync_env, error_marker, error_marker, nullptr /* no marker tracker */, error_repo, true, tn), false); + } + if (!omapkeys->more) { + if (error_marker.empty() && error_entries.empty()) { + /* the retry repo is empty, we back off a bit before calling it again */ + retry_backoff_secs *= 2; + if (retry_backoff_secs > RETRY_BACKOFF_SECS_MAX) { + retry_backoff_secs = RETRY_BACKOFF_SECS_MAX; + } + } else { + retry_backoff_secs = RETRY_BACKOFF_SECS_DEFAULT; + } + error_retry_time = ceph::coarse_real_clock::now() + make_timespan(retry_backoff_secs); + error_marker.clear(); + } + } + 
omapkeys.reset(); + + tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker=" << sync_marker.marker)); + yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, sync_marker.marker, + &next_marker, &log_entries, &truncated)); + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to read remote data log info: ret=" << retcode)); + stop_spawned_services(); + drain_all(); + return set_cr_error(retcode); + } + + if (log_entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + + for (log_iter = log_entries.begin(); log_iter != log_entries.end(); ++log_iter) { + tn->log(20, SSTR("shard_id=" << shard_id << " log_entry: " << log_iter->log_id << ":" << log_iter->log_timestamp << ":" << log_iter->entry.key)); + if (!marker_tracker->index_key_to_marker(log_iter->entry.key, log_iter->log_id)) { + tn->log(20, SSTR("skipping sync of entry: " << log_iter->log_id << ":" << log_iter->entry.key << " sync already in progress for bucket shard")); + marker_tracker->try_update_high_marker(log_iter->log_id, 0, log_iter->log_timestamp); + continue; + } + if (!marker_tracker->start(log_iter->log_id, 0, log_iter->log_timestamp)) { + tn->log(0, SSTR("ERROR: cannot start syncing " << log_iter->log_id << ". 
Duplicate entry?")); + } else { + spawn(new RGWDataSyncSingleEntryCR(sync_env, log_iter->entry.key, log_iter->log_id, marker_tracker, error_repo, false, tn), false); + } + while ((int)num_spawned() > spawn_window) { + set_status() << "num_spawned() > spawn_window"; + yield wait_for_child(); + int ret; + while (collect(&ret, lease_stack.get())) { + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + /* we have reported this error */ + } + /* not waiting for child here */ + } + } + } + + tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker=" << sync_marker.marker + << " next_marker=" << next_marker << " truncated=" << truncated)); + if (!next_marker.empty()) { + sync_marker.marker = next_marker; + } else if (!log_entries.empty()) { + sync_marker.marker = log_entries.back().log_id; + } + if (!truncated) { + // we reached the end, wait a while before checking for more + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + yield wait(get_idle_interval()); + } + } while (true); + } + return 0; + } + + utime_t get_idle_interval() const { +#define INCREMENTAL_INTERVAL 20 + ceph::timespan interval = std::chrono::seconds(INCREMENTAL_INTERVAL); + if (!ceph::coarse_real_clock::is_zero(error_retry_time)) { + auto now = ceph::coarse_real_clock::now(); + if (error_retry_time > now) { + auto d = error_retry_time - now; + if (interval > d) { + interval = d; + } + } + } + // convert timespan -> time_point -> utime_t + return utime_t(ceph::coarse_real_clock::zero() + interval); + } + + void stop_spawned_services() { + lease_cr->go_down(); + if (error_repo) { + error_repo->finish(); + error_repo->put(); + error_repo = NULL; + } + } +}; + +class RGWDataSyncShardControlCR : public RGWBackoffControlCR { + RGWDataSyncEnv *sync_env; + + rgw_pool pool; + + uint32_t shard_id; + rgw_data_sync_marker sync_marker; + + RGWSyncTraceNodeRef tn; +public: + RGWDataSyncShardControlCR(RGWDataSyncEnv *_sync_env, const rgw_pool& _pool, + uint32_t _shard_id, rgw_data_sync_marker& _marker, + 
RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sync_env->cct, false), + sync_env(_sync_env), + pool(_pool), + shard_id(_shard_id), + sync_marker(_marker) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", std::to_string(shard_id)); + } + + RGWCoroutine *alloc_cr() override { + return new RGWDataSyncShardCR(sync_env, pool, shard_id, sync_marker, tn, backoff_ptr()); + } + + RGWCoroutine *alloc_finisher_cr() override { + RGWRados *store = sync_env->store; + return new RGWSimpleRadosReadCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id)), + &sync_marker); + } + + void append_modified_shards(set& keys) { + Mutex::Locker l(cr_lock()); + + RGWDataSyncShardCR *cr = static_cast(get_cr()); + if (!cr) { + return; + } + + cr->append_modified_shards(keys); + } +}; + +class RGWDataSyncCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + uint32_t num_shards; + + rgw_data_sync_status sync_status; + + RGWDataSyncShardMarkerTrack *marker_tracker; + + Mutex shard_crs_lock; + map shard_crs; + + bool *reset_backoff; + + RGWSyncTraceNodeRef tn; + + RGWDataSyncModule *data_sync_module{nullptr}; +public: + RGWDataSyncCR(RGWDataSyncEnv *_sync_env, uint32_t _num_shards, RGWSyncTraceNodeRef& _tn, bool *_reset_backoff) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + num_shards(_num_shards), + marker_tracker(NULL), + shard_crs_lock("RGWDataSyncCR::shard_crs_lock"), + reset_backoff(_reset_backoff), tn(_tn) { + + } + + ~RGWDataSyncCR() override { + for (auto iter : shard_crs) { + iter.second->put(); + } + } + + int operate() override { + reenter(this) { + + /* read sync status */ + yield call(new RGWReadDataSyncStatusCoroutine(sync_env, &sync_status)); + + data_sync_module = sync_env->sync_module->get_data_handler(); + + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode)); 
+ return set_cr_error(retcode); + } + + /* state: init status */ + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) { + tn->log(20, SSTR("init")); + sync_status.sync_info.num_shards = num_shards; + uint64_t instance_id; + instance_id = ceph::util::generate_random_number(); + yield call(new RGWInitDataSyncStatusCoroutine(sync_env, num_shards, instance_id, tn, &sync_status)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to init sync, retcode=" << retcode)); + return set_cr_error(retcode); + } + // sets state = StateBuildingFullSyncMaps + + *reset_backoff = true; + } + + data_sync_module->init(sync_env, sync_status.sync_info.instance_id); + + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateBuildingFullSyncMaps) { + tn->log(10, SSTR("building full sync maps")); + /* call sync module init here */ + sync_status.sync_info.num_shards = num_shards; + yield call(data_sync_module->init_sync(sync_env)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: sync module init_sync() failed, retcode=" << retcode)); + return set_cr_error(retcode); + } + /* state: building full sync maps */ + yield call(new RGWListBucketIndexesCR(sync_env, &sync_status)); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to build full sync maps, retcode=" << retcode)); + return set_cr_error(retcode); + } + sync_status.sync_info.state = rgw_data_sync_info::StateSync; + + /* update new state */ + yield call(set_sync_info_cr()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to write sync status, retcode=" << retcode)); + return set_cr_error(retcode); + } + + *reset_backoff = true; + } + + yield call(data_sync_module->start_sync(sync_env)); + + yield { + if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) { + tn->log(10, SSTR("spawning " << num_shards << " shards sync")); + for (map::iterator iter = sync_status.sync_markers.begin(); + iter != 
sync_status.sync_markers.end(); ++iter) { + RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sync_env, sync_env->store->svc.zone->get_zone_params().log_pool, + iter->first, iter->second, tn); + cr->get(); + shard_crs_lock.Lock(); + shard_crs[iter->first] = cr; + shard_crs_lock.Unlock(); + spawn(cr, true); + } + } + } + + return set_cr_done(); + } + return 0; + } + + RGWCoroutine *set_sync_info_cr() { + RGWRados *store = sync_env->store; + return new RGWSimpleRadosWriteCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sync_env->source_zone)), + sync_status.sync_info); + } + + void wakeup(int shard_id, set& keys) { + Mutex::Locker l(shard_crs_lock); + map::iterator iter = shard_crs.find(shard_id); + if (iter == shard_crs.end()) { + return; + } + iter->second->append_modified_shards(keys); + iter->second->wakeup(); + } +}; + +class RGWDefaultDataSyncModule : public RGWDataSyncModule { +public: + RGWDefaultDataSyncModule() {} + + RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) override; + RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; + RGWCoroutine *create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; +}; + +class RGWDefaultSyncModuleInstance : public RGWSyncModuleInstance { + RGWDefaultDataSyncModule data_handler; +public: + RGWDefaultSyncModuleInstance() {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } + bool supports_user_writes() override { + return true; + } +}; + +int 
RGWDefaultSyncModule::create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) +{ + /* installs a freshly allocated RGWDefaultSyncModuleInstance into the instance ref; cct and config are not used here */ + instance->reset(new RGWDefaultSyncModuleInstance()); + return 0; +} + +/* Returns a new RGWFetchRemoteObjCR for key from sync_env->source_zone, passing versioned_epoch, zones_trace and sync_env->counters through. */ +RGWCoroutine *RGWDefaultDataSyncModule::sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) +{ + return new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->store, sync_env->source_zone, bucket_info, + std::nullopt, + key, std::nullopt, versioned_epoch, + true, zones_trace, sync_env->counters); +} + +/* Returns a new RGWRemoveObjCR for key. It differs from create_delete_marker() only in passing NULL, NULL, false where that function passes the owner id, the owner display name and true. */ +RGWCoroutine *RGWDefaultDataSyncModule::remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, + real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) +{ + return new RGWRemoveObjCR(sync_env->async_rados, sync_env->store, sync_env->source_zone, + bucket_info, key, versioned, versioned_epoch, + NULL, NULL, false, &mtime, zones_trace); +} + +/* Returns a new RGWRemoveObjCR for key, passing the owner id, the owner display name and true in the positions where remove_object() passes NULL, NULL, false. */ +RGWCoroutine *RGWDefaultDataSyncModule::create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) +{ + return new RGWRemoveObjCR(sync_env->async_rados, sync_env->store, sync_env->source_zone, + bucket_info, key, versioned, versioned_epoch, + &owner.id, &owner.display_name, true, &mtime, zones_trace); +} + +/* Data sync module derived from RGWDefaultDataSyncModule that overrides sync_object, remove_object and create_delete_marker. */ +class RGWArchiveDataSyncModule : public RGWDefaultDataSyncModule { +public: + RGWArchiveDataSyncModule() {} + + RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) override; + RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override; + RGWCoroutine *create_delete_marker(RGWDataSyncEnv 
*sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+};
+ // Sync module instance for archive zones: archive data handler plus archive-specific bucket metadata handlers.
+class RGWArchiveSyncModuleInstance : public RGWDefaultSyncModuleInstance {
+  RGWArchiveDataSyncModule data_handler;
+public:
+  RGWArchiveSyncModuleInstance() {}
+  RGWDataSyncModule *get_data_handler() override {
+    return &data_handler;
+  }
+  RGWMetadataHandler *alloc_bucket_meta_handler() override {
+    return RGWArchiveBucketMetaHandlerAllocator::alloc();
+  }
+  RGWMetadataHandler *alloc_bucket_instance_meta_handler() override {
+    return RGWArchiveBucketInstanceMetaHandlerAllocator::alloc();
+  }
+};
+
+int RGWArchiveSyncModule::create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+  instance->reset(new RGWArchiveSyncModuleInstance());
+  return 0;
+}
+ // Archive sync: enables versioning on the bucket if needed, then fetches the object. Returns NULL if storing the bucket info fails.
+RGWCoroutine *RGWArchiveDataSyncModule::sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sync_env->cct, 5) << "SYNC_ARCHIVE: sync_object: b=" << bucket_info.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+  if (!bucket_info.versioned() ||
+      (bucket_info.flags & BUCKET_VERSIONS_SUSPENDED)) {
+    ldout(sync_env->cct, 0) << "SYNC_ARCHIVE: sync_object: enabling object versioning for archive bucket" << dendl;
+    bucket_info.flags = (bucket_info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED; // clear "suspended", set "versioned"
+    int op_ret = sync_env->store->put_bucket_instance_info(bucket_info, false, real_time(), NULL);
+    if (op_ret < 0) {
+      ldout(sync_env->cct, 0) << "SYNC_ARCHIVE: sync_object: error versioning archive bucket" << dendl;
+      return NULL;
+    }
+  }
+
+  std::optional dest_key;
+
+  if (versioned_epoch.value_or(0) == 0) { /* force version if not set */
+    versioned_epoch = 0;
+    dest_key = key;
+    if (key.instance.empty()) {
+
sync_env->store->gen_rand_obj_instance_name(&(*dest_key)); // source key has no instance: generate one for the destination
+    }
+  }
+
+  return new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->store, sync_env->source_zone,
+                                 bucket_info, std::nullopt,
+                                 key, dest_key, versioned_epoch,
+                                 true, zones_trace, nullptr);
+}
+ // Archive sync does not remove objects: logs the request and returns NULL (no coroutine).
+RGWCoroutine *RGWArchiveDataSyncModule::remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key,
+                                                      real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sync_env->cct, 0) << "SYNC_ARCHIVE: remove_object: b=" << bucket_info.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch << dendl;
+  return NULL;
+}
+ // Logs the request, then builds the same RGWRemoveObjCR (delete-marker flag true) as the default handler.
+RGWCoroutine *RGWArchiveDataSyncModule::create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime,
+                                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sync_env->cct, 0) << "SYNC_ARCHIVE: create_delete_marker: b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime
+                          << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+  return new RGWRemoveObjCR(sync_env->async_rados, sync_env->store, sync_env->source_zone,
+                            bucket_info, key, versioned, versioned_epoch,
+                            &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+ // Backoff-controlled wrapper that allocates RGWDataSyncCR instances and relays wakeups to the current one.
+class RGWDataSyncControlCR : public RGWBackoffControlCR
+{
+  RGWDataSyncEnv *sync_env;
+  uint32_t num_shards;
+
+  RGWSyncTraceNodeRef tn;
+
+  static constexpr bool exit_on_error = false; // retry on all errors
+public:
+  RGWDataSyncControlCR(RGWDataSyncEnv *_sync_env, uint32_t _num_shards,
+                       RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sync_env->cct, exit_on_error),
+                                                      sync_env(_sync_env), num_shards(_num_shards) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "sync");
+  }
+
+  RGWCoroutine *alloc_cr() override {
+    return new RGWDataSyncCR(sync_env, num_shards, tn, backoff_ptr());
+  }
+
+  void wakeup(int shard_id, set& keys) {
+    Mutex& m =
cr_lock(); + + m.Lock(); + RGWDataSyncCR *cr = static_cast(get_cr()); + if (!cr) { + m.Unlock(); + return; + } + + cr->get(); + m.Unlock(); + + if (cr) { + tn->log(20, SSTR("notify shard=" << shard_id << " keys=" << keys)); + cr->wakeup(shard_id, keys); + } + + cr->put(); + } +}; + +void RGWRemoteDataLog::wakeup(int shard_id, set& keys) { + RWLock::RLocker rl(lock); + if (!data_sync_cr) { + return; + } + data_sync_cr->wakeup(shard_id, keys); +} + +int RGWRemoteDataLog::run_sync(int num_shards) +{ + lock.get_write(); + data_sync_cr = new RGWDataSyncControlCR(&sync_env, num_shards, tn); + data_sync_cr->get(); // run() will drop a ref, so take another + lock.unlock(); + + int r = run(data_sync_cr); + + lock.get_write(); + data_sync_cr->put(); + data_sync_cr = NULL; + lock.unlock(); + + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to run sync" << dendl; + return r; + } + return 0; +} + +int RGWDataSyncStatusManager::init() +{ + RGWZone *zone_def; + + if (!store->svc.zone->find_zone_by_id(source_zone, &zone_def)) { + ldpp_dout(this, 0) << "ERROR: failed to find zone config info for zone=" << source_zone << dendl; + return -EIO; + } + + if (!store->svc.sync_modules->get_manager()->supports_data_export(zone_def->tier_type)) { + return -ENOTSUP; + } + + const RGWZoneParams& zone_params = store->svc.zone->get_zone_params(); + + if (sync_module == nullptr) { + sync_module = store->get_sync_module(); + } + + conn = store->svc.zone->get_zone_conn_by_id(source_zone); + if (!conn) { + ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl; + return -EINVAL; + } + + error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS); + + int r = source_log.init(source_zone, conn, error_logger, store->get_sync_tracer(), + sync_module, counters); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to init remote log, r=" << r << dendl; + finalize(); + return r; + } + + rgw_datalog_info datalog_info; + r 
= source_log.read_log_info(&datalog_info); + if (r < 0) { + ldpp_dout(this, 5) << "ERROR: master.read_log_info() returned r=" << r << dendl; + finalize(); + return r; + } + + num_shards = datalog_info.num_shards; + + for (int i = 0; i < num_shards; i++) { + shard_objs[i] = rgw_raw_obj(zone_params.log_pool, shard_obj_name(source_zone, i)); + } + + return 0; +} + +void RGWDataSyncStatusManager::finalize() +{ + delete error_logger; + error_logger = nullptr; +} + +unsigned RGWDataSyncStatusManager::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWDataSyncStatusManager::gen_prefix(std::ostream& out) const +{ + auto zone = std::string_view{source_zone}; + return out << "data sync zone:" << zone.substr(0, 8) << ' '; +} + +string RGWDataSyncStatusManager::sync_status_oid(const string& source_zone) +{ + char buf[datalog_sync_status_oid_prefix.size() + source_zone.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%s", datalog_sync_status_oid_prefix.c_str(), source_zone.c_str()); + + return string(buf); +} + +string RGWDataSyncStatusManager::shard_obj_name(const string& source_zone, int shard_id) +{ + char buf[datalog_sync_status_shard_prefix.size() + source_zone.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_status_shard_prefix.c_str(), source_zone.c_str(), shard_id); + + return string(buf); +} + +int RGWRemoteBucketLog::init(const string& _source_zone, RGWRESTConn *_conn, + const rgw_bucket& bucket, int shard_id, + RGWSyncErrorLogger *_error_logger, + RGWSyncTraceManager *_sync_tracer, + RGWSyncModuleInstanceRef& _sync_module) +{ + conn = _conn; + source_zone = _source_zone; + bs.bucket = bucket; + bs.shard_id = shard_id; + + sync_env.init(dpp, store->ctx(), store, conn, async_rados, http_manager, + _error_logger, _sync_tracer, source_zone, _sync_module, nullptr); + + return 0; +} + +class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + const string instance_key; + + rgw_bucket_index_marker_info 
*info;
+
+public:
+  RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncEnv *_sync_env,
+                                  const rgw_bucket_shard& bs,
+                                  rgw_bucket_index_marker_info *_info)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+      instance_key(bs.get_key()), info(_info) {}
+  // Issues GET /admin/log/ with type=bucket-index, bucket-instance=<key> and the "info" flag; the response is decoded into *info.
+  int operate() override {
+    reenter(this) {
+      yield {
+        rgw_http_param_pair pairs[] = { { "type" , "bucket-index" },
+	                                { "bucket-instance", instance_key.c_str() },
+					{ "info" , NULL },
+	                                { NULL, NULL } };
+
+        string p = "/admin/log/";
+        call(new RGWReadRESTResourceCR(sync_env->cct, sync_env->conn, sync_env->http_manager, p, pairs, info));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+ // Initializes the sync status object of one bucket shard from the remote bucket index log position.
+class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+
+  rgw_bucket_shard bs;
+  const string sync_status_oid;
+
+  rgw_bucket_shard_sync_info& status;
+
+  rgw_bucket_index_marker_info info;
+public:
+  RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncEnv *_sync_env,
+                                        const rgw_bucket_shard& bs,
+                                        rgw_bucket_shard_sync_info& _status)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), bs(bs),
+      sync_status_oid(RGWBucketSyncStatusManager::status_oid(sync_env->source_zone, bs)),
+      status(_status)
+  {}
+
+  int operate() override {
+    reenter(this) {
+      /* fetch current position in logs */
+      yield call(new RGWReadRemoteBucketIndexLogInfoCR(sync_env, bs, &info));
+      if (retcode < 0 && retcode != -ENOENT) { // -ENOENT is tolerated; info then keeps its previous contents
+        ldout(cct, 0) << "ERROR: failed to fetch bucket index status" << dendl;
+        return set_cr_error(retcode);
+      }
+      yield {
+        auto store = sync_env->store;
+        rgw_raw_obj obj(store->svc.zone->get_zone_params().log_pool, sync_status_oid);
+
+        if (info.syncstopped) {
+          call(new RGWRadosRemoveCR(store, obj)); // sync stopped on the source: drop the status object
+        } else {
+          status.state = rgw_bucket_shard_sync_info::StateFullSync;
+          status.inc_marker.position = info.max_marker;
+          map attrs;
+          status.encode_all_attrs(attrs);
+          call(new RGWSimpleRadosWriteAttrsCR(sync_env->async_rados,
store->svc.sysobj, obj, attrs));
+        }
+      }
+      if (info.syncstopped) {
+        retcode = -ENOENT; // report "no status" regardless of the removal result
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+RGWCoroutine *RGWRemoteBucketLog::init_sync_status_cr()
+{
+  return new RGWInitBucketShardSyncStatusCoroutine(&sync_env, bs, init_status);
+}
+
+#define BUCKET_SYNC_ATTR_PREFIX RGW_ATTR_PREFIX "bucket-sync."
+ // Decodes attrs[attr_name] into *val. Returns false if the attr is absent (*val is reset to T()) or fails to decode.
+template 
+static bool decode_attr(CephContext *cct, map& attrs, const string& attr_name, T *val)
+{
+  map::iterator iter = attrs.find(attr_name);
+  if (iter == attrs.end()) {
+    *val = T();
+    return false;
+  }
+
+  auto biter = iter->second.cbegin();
+  try {
+    decode(*val, biter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "ERROR: failed to decode attribute: " << attr_name << dendl;
+    return false;
+  }
+  return true;
+}
+ // Reads state/full_marker/inc_marker, trying the prefixed attr name first and the bare name as a fallback.
+void rgw_bucket_shard_sync_info::decode_from_attrs(CephContext *cct, map& attrs)
+{
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "state", &state)) {
+    decode_attr(cct, attrs, "state", &state);
+  }
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "full_marker", &full_marker)) {
+    decode_attr(cct, attrs, "full_marker", &full_marker);
+  }
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "inc_marker", &inc_marker)) {
+    decode_attr(cct, attrs, "inc_marker", &inc_marker);
+  }
+}
+ // Encodes state and both markers under their prefixed attr names.
+void rgw_bucket_shard_sync_info::encode_all_attrs(map& attrs)
+{
+  encode_state_attr(attrs);
+  full_marker.encode_attr(attrs);
+  inc_marker.encode_attr(attrs);
+}
+
+void rgw_bucket_shard_sync_info::encode_state_attr(map& attrs)
+{
+  using ceph::encode;
+  encode(state, attrs[BUCKET_SYNC_ATTR_PREFIX "state"]);
+}
+
+void rgw_bucket_shard_full_sync_marker::encode_attr(map& attrs)
+{
+  using ceph::encode;
+  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "full_marker"]);
+}
+
+void rgw_bucket_shard_inc_sync_marker::encode_attr(map& attrs)
+{
+  using ceph::encode;
+  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "inc_marker"]);
+}
+
+class
RGWReadBucketSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  string oid;
+  rgw_bucket_shard_sync_info *status;
+
+  map attrs;
+public:
+  RGWReadBucketSyncStatusCoroutine(RGWDataSyncEnv *_sync_env,
+                                   const rgw_bucket_shard& bs,
+                                   rgw_bucket_shard_sync_info *_status)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+      oid(RGWBucketSyncStatusManager::status_oid(sync_env->source_zone, bs)),
+      status(_status) {}
+  int operate() override;
+};
+ // Reads the status object's attrs from the log pool and decodes them into *status; a missing object yields a default status.
+int RGWReadBucketSyncStatusCoroutine::operate()
+{
+  reenter(this) {
+    yield call(new RGWSimpleRadosReadAttrsCR(sync_env->async_rados, sync_env->store->svc.sysobj,
+                                             rgw_raw_obj(sync_env->store->svc.zone->get_zone_params().log_pool, oid),
+                                             &attrs, true));
+    if (retcode == -ENOENT) {
+      *status = rgw_bucket_shard_sync_info();
+      return set_cr_done();
+    }
+    if (retcode < 0) {
+      ldout(sync_env->cct, 0) << "ERROR: failed to call fetch bucket shard info oid=" << oid << " ret=" << retcode << dendl;
+      return set_cr_error(retcode);
+    }
+    status->decode_from_attrs(sync_env->cct, attrs);
+    return set_cr_done();
+  }
+  return 0;
+}
+
+#define OMAP_READ_MAX_ENTRIES 10
+class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  RGWRados *store;
+  
+  const int shard_id;
+  int max_entries;
+
+  set& recovering_buckets;
+  string marker;
+  string error_oid;
+
+  RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+  set error_entries;
+  int max_omap_entries;
+  int count;
+
+public:
+  RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncEnv *_sync_env, const int _shard_id,
+                                      set& _recovering_buckets, const int _max_entries) 
+  : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+  store(sync_env->store), shard_id(_shard_id), max_entries(_max_entries),
+  recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
+  {
+    error_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id) + ".retry"; // omap keys are read from "<shard status oid>.retry"
+  }
+
+  int operate() override;
+};
+
+int
RGWReadRecoveringBucketShardsCoroutine::operate()
+{
+  reenter(this){
+    //read recovering bucket shards
+    count = 0;
+    do {
+      omapkeys = std::make_shared();
+      yield call(new RGWRadosGetOmapKeysCR(store, rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, error_oid), 
+            marker, max_omap_entries, omapkeys));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldout(sync_env->cct, 0) << "failed to read recovering bucket shards with " 
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      error_entries = std::move(omapkeys->entries);
+      if (error_entries.empty()) {
+        break;
+      }
+
+      count += error_entries.size();
+      marker = *error_entries.rbegin(); // next page starts after the last key of this one
+      recovering_buckets.insert(std::make_move_iterator(error_entries.begin()),
+                                std::make_move_iterator(error_entries.end()));
+    } while (omapkeys->more && count < max_entries);
+  
+    return set_cr_done();
+  }
+
+  return 0;
+}
+ // Reads the shard's sync marker, then collects bucket shard keys from the remote data log into pending_buckets.
+class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  RGWRados *store;
+  
+  const int shard_id;
+  int max_entries;
+
+  set& pending_buckets;
+  string marker;
+  string status_oid;
+
+  rgw_data_sync_marker* sync_marker;
+  int count;
+
+  std::string next_marker;
+  list log_entries;
+  bool truncated;
+
+public:
+  RGWReadPendingBucketShardsCoroutine(RGWDataSyncEnv *_sync_env, const int _shard_id,
+                                      set& _pending_buckets,
+                                      rgw_data_sync_marker* _sync_marker, const int _max_entries) 
+  : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+  store(sync_env->store), shard_id(_shard_id), max_entries(_max_entries),
+  pending_buckets(_pending_buckets), sync_marker(_sync_marker)
+  {
+    status_oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, shard_id);
+  }
+
+  int operate() override;
+};
+
+int RGWReadPendingBucketShardsCoroutine::operate()
+{
+  reenter(this){
+    //read sync status marker
+    using CR = RGWSimpleRadosReadCR;
+    yield call(new CR(sync_env->async_rados, store->svc.sysobj,
+
rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, status_oid), + sync_marker)); + if (retcode < 0) { + ldout(sync_env->cct,0) << "failed to read sync status marker with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + //read pending bucket shards + marker = sync_marker->marker; + count = 0; + do{ + yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, marker, + &next_marker, &log_entries, &truncated)); + + if (retcode == -ENOENT) { + break; + } + + if (retcode < 0) { + ldout(sync_env->cct,0) << "failed to read remote data log info with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (log_entries.empty()) { + break; + } + + count += log_entries.size(); + for (const auto& entry : log_entries) { + pending_buckets.insert(entry.entry.key); + } + }while(truncated && count < max_entries); + + return set_cr_done(); + } + + return 0; +} + +int RGWRemoteDataLog::read_shard_status(int shard_id, set& pending_buckets, set& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) +{ + // cannot run concurrently with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWDataSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + list stacks; + RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(store->ctx(), &crs); + recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sync_env_local, shard_id, recovering_buckets, max_entries)); + stacks.push_back(recovering_stack); + RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(store->ctx(), &crs); + pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sync_env_local, shard_id, pending_buckets, 
sync_marker, max_entries));
+  stacks.push_back(pending_stack);
+  ret = crs.run(stacks);
+  http_manager.stop();
+  return ret;
+}
+
+RGWCoroutine *RGWRemoteBucketLog::read_sync_status_cr(rgw_bucket_shard_sync_info *sync_status)
+{
+  return new RGWReadBucketSyncStatusCoroutine(&sync_env, bs, sync_status);
+}
+ // Deletes every per-shard source log held in source_logs, then the error logger.
+RGWBucketSyncStatusManager::~RGWBucketSyncStatusManager() {
+  for (map::iterator iter = source_logs.begin(); iter != source_logs.end(); ++iter) {
+    delete iter->second;
+  }
+  delete error_logger;
+}
+
+
+void rgw_bucket_entry_owner::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("ID", id, obj);
+  JSONDecoder::decode_json("DisplayName", display_name, obj);
+}
+ // One entry of a remote bucket listing, decoded from JSON.
+struct bucket_list_entry {
+  bool delete_marker;
+  rgw_obj_key key;
+  bool is_latest;
+  real_time mtime;
+  string etag;
+  uint64_t size;
+  string storage_class;
+  rgw_bucket_entry_owner owner;
+  uint64_t versioned_epoch;
+  string rgw_tag;
+
+  bucket_list_entry() : delete_marker(false), is_latest(false), size(0), versioned_epoch(0) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("IsDeleteMarker", delete_marker, obj);
+    JSONDecoder::decode_json("Key", key.name, obj);
+    JSONDecoder::decode_json("VersionId", key.instance, obj);
+    JSONDecoder::decode_json("IsLatest", is_latest, obj);
+    string mtime_str;
+    JSONDecoder::decode_json("RgwxMtime", mtime_str, obj);
+
+    struct tm t;
+    uint32_t nsec;
+    if (parse_iso8601(mtime_str.c_str(), &t, &nsec)) { // mtime is left untouched when the timestamp does not parse
+      ceph_timespec ts;
+      ts.tv_sec = (uint64_t)internal_timegm(&t);
+      ts.tv_nsec = nsec;
+      mtime = real_clock::from_ceph_timespec(ts);
+    }
+    JSONDecoder::decode_json("ETag", etag, obj);
+    JSONDecoder::decode_json("Size", size, obj);
+    JSONDecoder::decode_json("StorageClass", storage_class, obj);
+    JSONDecoder::decode_json("Owner", owner, obj);
+    JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj);
+    JSONDecoder::decode_json("RgwxTag", rgw_tag, obj);
+    if (key.instance == "null" && !versioned_epoch) { // "null" version id with no epoch: treat as having no instance
+      key.instance.clear();
+    }
+  }
+ // Maps the entry to a modify op: delete marker -> LINK_OLH_DM, real version id -> LINK_OLH, otherwise ADD.
+  RGWModifyOp get_modify_op() const {
+    if (delete_marker) {
+      return CLS_RGW_OP_LINK_OLH_DM;
+    } else if (!key.instance.empty() && key.instance != "null") {
+      return CLS_RGW_OP_LINK_OLH;
+    } else {
+      return CLS_RGW_OP_ADD;
+    }
+  }
+};
+ // Decoded response of a remote bucket listing request.
+struct bucket_list_result {
+  string name;
+  string prefix;
+  string key_marker;
+  string version_id_marker;
+  int max_keys;
+  bool is_truncated;
+  list entries;
+
+  bucket_list_result() : max_keys(0), is_truncated(false) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("Name", name, obj);
+    JSONDecoder::decode_json("Prefix", prefix, obj);
+    JSONDecoder::decode_json("KeyMarker", key_marker, obj);
+    JSONDecoder::decode_json("VersionIdMarker", version_id_marker, obj);
+    JSONDecoder::decode_json("MaxKeys", max_keys, obj);
+    JSONDecoder::decode_json("IsTruncated", is_truncated, obj);
+    JSONDecoder::decode_json("Entries", entries, obj);
+  }
+};
+ // Lists object versions of one remote bucket shard, starting after marker_position, into *result.
+class RGWListBucketShardCR: public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  const rgw_bucket_shard& bs;
+  const string instance_key;
+  rgw_obj_key marker_position;
+
+  bucket_list_result *result;
+
+public:
+  RGWListBucketShardCR(RGWDataSyncEnv *_sync_env, const rgw_bucket_shard& bs,
+                       rgw_obj_key& _marker_position, bucket_list_result *_result)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), bs(bs),
+      instance_key(bs.get_key()), marker_position(_marker_position),
+      result(_result) {}
+
+  int operate() override {
+    reenter(this) {
+      yield {
+        rgw_http_param_pair pairs[] = { { "rgwx-bucket-instance", instance_key.c_str() },
+					{ "versions" , NULL },
+					{ "format" , "json" },
+					{ "objs-container" , "true" },
+					{ "key-marker" , marker_position.name.c_str() },
+					{ "version-id-marker" , marker_position.instance.c_str() },
+	                                { NULL, NULL } };
+        // don't include tenant in the url, it's already part of instance_key
+        string p = string("/") + bs.bucket.name;
+        call(new RGWReadRESTResourceCR(sync_env->cct, sync_env->conn, sync_env->http_manager, p, pairs, result));
+      }
+      if
(retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+ // Reads remote bucket index log entries after `marker` via GET /admin/log; updates the poll counters when counters are set.
+class RGWListBucketIndexLogCR: public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  const string instance_key;
+  string marker;
+
+  list *result;
+  std::optional timer;
+
+public:
+  RGWListBucketIndexLogCR(RGWDataSyncEnv *_sync_env, const rgw_bucket_shard& bs,
+                          string& _marker, list *_result)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+      instance_key(bs.get_key()), marker(_marker), result(_result) {}
+
+  int operate() override {
+    reenter(this) {
+      if (sync_env->counters) {
+        timer.emplace(sync_env->counters, sync_counters::l_poll);
+      }
+      yield {
+        rgw_http_param_pair pairs[] = { { "bucket-instance", instance_key.c_str() },
+					{ "format" , "json" },
+					{ "marker" , marker.c_str() },
+					{ "type", "bucket-index" },
+	                                { NULL, NULL } };
+
+        call(new RGWReadRESTResourceCR >(sync_env->cct, sync_env->conn, sync_env->http_manager, "/admin/log", pairs, result));
+      }
+      timer.reset(); // destroys the timer object once the request has returned
+      if (retcode < 0) {
+        if (sync_env->counters) {
+          sync_env->counters->inc(sync_counters::l_poll_err);
+        }
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define BUCKET_SYNC_UPDATE_MARKER_WINDOW 10
+ // Marker tracker for full sync: persists the full-sync marker (position and count) as an attr on marker_oid.
+class RGWBucketFullSyncShardMarkerTrack : public RGWSyncShardMarkerTrack {
+  RGWDataSyncEnv *sync_env;
+
+  string marker_oid;
+  rgw_bucket_shard_full_sync_marker sync_marker;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWBucketFullSyncShardMarkerTrack(RGWDataSyncEnv *_sync_env,
+                         const string& _marker_oid,
+                         const rgw_bucket_shard_full_sync_marker& _marker) : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
+                                                                sync_env(_sync_env),
+                                                                marker_oid(_marker_oid),
+                                                                sync_marker(_marker) {}
+
+  void set_tn(RGWSyncTraceNodeRef& _tn) {
+    tn = _tn;
+  }
+
+  RGWCoroutine *store_marker(const rgw_obj_key& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_marker.position = new_marker;
+    sync_marker.count = index_pos;
+
+    map attrs;
+    sync_marker.encode_attr(attrs);
+
+    RGWRados *store = sync_env->store;
+
+    tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker));
+    return new RGWSimpleRadosWriteAttrsCR(sync_env->async_rados, store->svc.sysobj,
+                                          rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, marker_oid),
+                                          attrs);
+  }
+
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+ // Marker tracker for incremental sync; additionally tracks in-flight keys and pending olh operations.
+class RGWBucketIncSyncShardMarkerTrack : public RGWSyncShardMarkerTrack {
+  RGWDataSyncEnv *sync_env;
+
+  string marker_oid;
+  rgw_bucket_shard_inc_sync_marker sync_marker;
+
+  map key_to_marker;
+
+  struct operation {
+    rgw_obj_key key;
+    bool is_olh;
+  };
+  map marker_to_op;
+  std::set pending_olh; // object names with pending olh operations
+
+  RGWSyncTraceNodeRef tn;
+  // Drops the bookkeeping that index_key_to_marker() recorded for this marker.
+  void handle_finish(const string& marker) override {
+    auto iter = marker_to_op.find(marker);
+    if (iter == marker_to_op.end()) {
+      return;
+    }
+    auto& op = iter->second;
+    key_to_marker.erase(op.key);
+    reset_need_retry(op.key);
+    if (op.is_olh) {
+      pending_olh.erase(op.key.name);
+    }
+    marker_to_op.erase(iter);
+  }
+
+public:
+  RGWBucketIncSyncShardMarkerTrack(RGWDataSyncEnv *_sync_env,
+                         const string& _marker_oid,
+                         const rgw_bucket_shard_inc_sync_marker& _marker) : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
+                                                                sync_env(_sync_env),
+                                                                marker_oid(_marker_oid),
+                                                                sync_marker(_marker) {}
+
+  void set_tn(RGWSyncTraceNodeRef& _tn) {
+    tn = _tn;
+  }
+  // Records new_marker as the incremental position and returns a coroutine writing it as an attr on marker_oid.
+  RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_marker.position = new_marker;
+
+    map attrs;
+    sync_marker.encode_attr(attrs);
+
+    RGWRados *store = sync_env->store;
+
+    tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker));
+    return new RGWSimpleRadosWriteAttrsCR(sync_env->async_rados,
+                                          store->svc.sysobj,
+                                          rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, marker_oid),
+
attrs);
+  }
+
+  /*
+   * create index from key -> marker, and from marker -> key
+   * this is useful so that we can ensure that we only have one
+   * entry for any key that is used. This is needed when doing
+   * incremental sync of data, and we don't want to run multiple
+   * concurrent sync operations for the same bucket shard 
+   * Also, we should make sure that we don't run concurrent operations on the same key with
+   * different ops.
+   *
+   * Returns false (and flags the key for retry) when the key is already indexed.
+   */
+  bool index_key_to_marker(const rgw_obj_key& key, const string& marker, bool is_olh) {
+    auto result = key_to_marker.emplace(key, marker);
+    if (!result.second) { // exists
+      set_need_retry(key);
+      return false;
+    }
+    marker_to_op[marker] = operation{key, is_olh};
+    if (is_olh) {
+      // prevent other olh ops from starting on this object name
+      pending_olh.insert(key.name);
+    }
+    return true;
+  }
+  // True when no operation is in flight for `key` and, for olh ops, none is pending on the same object name.
+  bool can_do_op(const rgw_obj_key& key, bool is_olh) {
+    // serialize olh ops on the same object name
+    if (is_olh && pending_olh.count(key.name)) {
+      tn->log(20, SSTR("sync of " << key << " waiting for pending olh op"));
+      return false;
+    }
+    return (key_to_marker.find(key) == key_to_marker.end());
+  }
+
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+ // Syncs a single bucket entry (object add/link, removal, or delete marker) and then finishes its marker.
+template 
+class RGWBucketSyncSingleEntryCR : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+
+  RGWBucketInfo *bucket_info;
+  const rgw_bucket_shard& bs;
+
+  rgw_obj_key key;
+  bool versioned;
+  std::optional versioned_epoch;
+  rgw_bucket_entry_owner owner;
+  real_time timestamp;
+  RGWModifyOp op;
+  RGWPendingState op_state;
+
+  T entry_marker;
+  RGWSyncShardMarkerTrack *marker_tracker;
+
+  int sync_status;
+
+  stringstream error_ss;
+
+  bool error_injection;
+
+  RGWDataSyncModule *data_sync_module;
+  
+  rgw_zone_set zones_trace;
+
+  RGWSyncTraceNodeRef tn;
+public:
+  RGWBucketSyncSingleEntryCR(RGWDataSyncEnv *_sync_env,
+                             RGWBucketInfo *_bucket_info,
+                             const rgw_bucket_shard& bs,
+                             const rgw_obj_key& _key, bool _versioned,
+
std::optional _versioned_epoch,
+                             real_time& _timestamp,
+                             const rgw_bucket_entry_owner& _owner,
+                             RGWModifyOp _op, RGWPendingState _op_state,
+		      const T& _entry_marker, RGWSyncShardMarkerTrack *_marker_tracker, rgw_zone_set& _zones_trace,
+                      RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct),
+						      sync_env(_sync_env),
+                                                      bucket_info(_bucket_info), bs(bs),
+                                                      key(_key), versioned(_versioned), versioned_epoch(_versioned_epoch),
+                                                      owner(_owner),
+                                                      timestamp(_timestamp), op(_op),
+                                                      op_state(_op_state),
+                                                      entry_marker(_entry_marker),
+                                                      marker_tracker(_marker_tracker),
+                                                      sync_status(0){
+    stringstream ss;
+    ss << bucket_shard_str{bs} << "/" << key << "[" << versioned_epoch.value_or(0) << "]";
+    set_description() << "bucket sync single entry (source_zone=" << sync_env->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state;
+    set_status("init");
+
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", SSTR(key));
+
+    tn->log(20, SSTR("bucket sync single entry (source_zone=" << sync_env->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state));
+    error_injection = (sync_env->cct->_conf->rgw_sync_data_inject_err_probability > 0);
+
+    data_sync_module = sync_env->sync_module->get_data_handler();
+    
+    zones_trace = _zones_trace;
+    zones_trace.insert(sync_env->store->svc.zone->get_zone().id); // add the local zone id to the trace
+  }
+
+  int operate() override {
+    reenter(this) {
+      /* skip entries that are not complete */
+      if (op_state != CLS_RGW_STATE_COMPLETE) {
+        goto done;
+      }
+      tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+      do {
+        yield {
+          marker_tracker->reset_need_retry(key);
+          if (key.name.empty()) {
+            /* shouldn't happen */
+            set_status("skipping empty entry");
+            tn->log(0, "entry with empty obj name, skipping");
+            goto done;
+          }
+          if (error_injection &&
+              rand() % 10000 < cct->_conf->rgw_sync_data_inject_err_probability * 10000.0) {
+            tn->log(0, SSTR(": injecting data
sync error on key=" << key.name)); + retcode = -EIO; + } else if (op == CLS_RGW_OP_ADD || + op == CLS_RGW_OP_LINK_OLH) { + set_status("syncing obj"); + tn->log(5, SSTR("bucket sync: sync obj: " << sync_env->source_zone << "/" << bucket_info->bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]")); + call(data_sync_module->sync_object(sync_env, *bucket_info, key, versioned_epoch, &zones_trace)); + } else if (op == CLS_RGW_OP_DEL || op == CLS_RGW_OP_UNLINK_INSTANCE) { + set_status("removing obj"); + if (op == CLS_RGW_OP_UNLINK_INSTANCE) { + versioned = true; + } + tn->log(10, SSTR("removing obj: " << sync_env->source_zone << "/" << bucket_info->bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]")); + call(data_sync_module->remove_object(sync_env, *bucket_info, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace)); + // our copy of the object is more recent, continue as if it succeeded + if (retcode == -ERR_PRECONDITION_FAILED) { + retcode = 0; + } + } else if (op == CLS_RGW_OP_LINK_OLH_DM) { + set_status("creating delete marker"); + tn->log(10, SSTR("creating delete marker: obj: " << sync_env->source_zone << "/" << bucket_info->bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]")); + call(data_sync_module->create_delete_marker(sync_env, *bucket_info, key, timestamp, owner, versioned, versioned_epoch.value_or(0), &zones_trace)); + } + tn->set_resource_name(SSTR(bucket_str_noinstance(bucket_info->bucket) << "/" << key)); + } + } while (marker_tracker->need_retry(key)); + { + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + if (retcode >= 0) { + tn->log(10, "success"); + } else { + tn->log(10, SSTR("failed, retcode=" << retcode << " (" << cpp_strerror(-retcode) << ")")); + } + } + + if (retcode < 0 && retcode != -ENOENT) { + set_status() << "failed to sync obj; retcode=" << retcode; + tn->log(0, SSTR("ERROR: failed to sync object: " + << bucket_shard_str{bs} << "/" << key.name)); + error_ss << bucket_shard_str{bs} << "/" << 
key.name;
+        sync_status = retcode;
+      }
+      if (!error_ss.str().empty()) { // error_ss is only filled in by the failure branch above
+        yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status)));
+      }
+done:
+      if (sync_status == 0) {
+        /* update marker */
+        set_status() << "calling marker_tracker->finish(" << entry_marker << ")";
+        yield call(marker_tracker->finish(entry_marker));
+        sync_status = retcode;
+      }
+      if (sync_status < 0) {
+        return set_cr_error(sync_status);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define BUCKET_SYNC_SPAWN_WINDOW 20
+ // Full sync of one bucket shard: lists the remote bucket and spawns a single-entry sync per listed object.
+class RGWBucketShardFullSyncCR : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  const rgw_bucket_shard& bs;
+  RGWBucketInfo *bucket_info;
+  boost::intrusive_ptr lease_cr;
+  bucket_list_result list_result;
+  list::iterator entries_iter;
+  rgw_bucket_shard_sync_info& sync_info;
+  RGWBucketFullSyncShardMarkerTrack marker_tracker;
+  rgw_obj_key list_marker;
+  bucket_list_entry *entry{nullptr};
+
+  int total_entries{0};
+
+  int sync_status{0};
+
+  const string& status_oid;
+
+  rgw_zone_set zones_trace;
+
+  RGWSyncTraceNodeRef tn;
+public:
+  RGWBucketShardFullSyncCR(RGWDataSyncEnv *_sync_env, const rgw_bucket_shard& bs,
+                           RGWBucketInfo *_bucket_info,
+                           const std::string& status_oid,
+                           RGWContinuousLeaseCR *lease_cr,
+                           rgw_bucket_shard_sync_info& sync_info,
+                           RGWSyncTraceNodeRef tn_parent)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), bs(bs),
+      bucket_info(_bucket_info), lease_cr(lease_cr), sync_info(sync_info),
+      marker_tracker(sync_env, status_oid, sync_info.full_marker),
+      status_oid(status_oid),
+      tn(sync_env->sync_tracer->add_node(tn_parent, "full_sync",
+                                         SSTR(bucket_shard_str{bs}))) {
+    zones_trace.insert(sync_env->source_zone);
+    marker_tracker.set_tn(tn);
+  }
+
+  int operate() override;
+};
+
+int RGWBucketShardFullSyncCR::operate()
+{
+  int ret;
+  reenter(this) {
+    list_marker = sync_info.full_marker.position; // resume from the stored full-sync marker
+
+    total_entries =
sync_info.full_marker.count; + do { + if (!lease_cr->is_locked()) { + drain_all(); + return set_cr_error(-ECANCELED); + } + set_status("listing remote bucket"); + tn->log(20, "listing bucket for full sync"); + yield call(new RGWListBucketShardCR(sync_env, bs, list_marker, + &list_result)); + if (retcode < 0 && retcode != -ENOENT) { + set_status("failed bucket listing, going down"); + drain_all(); + return set_cr_error(retcode); + } + if (list_result.entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + entries_iter = list_result.entries.begin(); + for (; entries_iter != list_result.entries.end(); ++entries_iter) { + if (!lease_cr->is_locked()) { + drain_all(); + return set_cr_error(-ECANCELED); + } + tn->log(20, SSTR("[full sync] syncing object: " + << bucket_shard_str{bs} << "/" << entries_iter->key)); + entry = &(*entries_iter); + total_entries++; + list_marker = entries_iter->key; + if (!marker_tracker.start(entry->key, total_entries, real_time())) { + tn->log(0, SSTR("ERROR: cannot start syncing " << entry->key << ". 
Duplicate entry?")); + } else { + using SyncCR = RGWBucketSyncSingleEntryCR; + yield spawn(new SyncCR(sync_env, bucket_info, bs, entry->key, + false, /* versioned, only matters for object removal */ + entry->versioned_epoch, entry->mtime, + entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE, + entry->key, &marker_tracker, zones_trace, tn), + false); + } + while (num_spawned() > BUCKET_SYNC_SPAWN_WINDOW) { + yield wait_for_child(); + bool again = true; + while (again) { + again = collect(&ret, nullptr); + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_status = ret; + /* we have reported this error */ + } + } + } + } + } while (list_result.is_truncated && sync_status == 0); + set_status("done iterating over all objects"); + /* wait for all operations to complete */ + while (num_spawned()) { + yield wait_for_child(); + bool again = true; + while (again) { + again = collect(&ret, nullptr); + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_status = ret; + /* we have reported this error */ + } + } + } + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + if (!lease_cr->is_locked()) { + return set_cr_error(-ECANCELED); + } + /* update sync state to incremental */ + if (sync_status == 0) { + yield { + sync_info.state = rgw_bucket_shard_sync_info::StateIncrementalSync; + map attrs; + sync_info.encode_state_attr(attrs); + RGWRados *store = sync_env->store; + call(new RGWSimpleRadosWriteAttrsCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, status_oid), + attrs)); + } + } else { + tn->log(10, SSTR("backing out with sync_status=" << sync_status)); + } + if (retcode < 0 && sync_status == 0) { /* actually tried to set incremental state and failed */ + tn->log(0, SSTR("ERROR: failed to set sync state on bucket " + << bucket_shard_str{bs} << " retcode=" << retcode)); + return set_cr_error(retcode); + } + if (sync_status < 0) { + return set_cr_error(sync_status); + } + return 
set_cr_done(); + } + return 0; +} + +static bool has_olh_epoch(RGWModifyOp op) { + return op == CLS_RGW_OP_LINK_OLH || op == CLS_RGW_OP_UNLINK_INSTANCE; +} + +class RGWBucketShardIncrementalSyncCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + const rgw_bucket_shard& bs; + RGWBucketInfo *bucket_info; + boost::intrusive_ptr lease_cr; + list list_result; + list::iterator entries_iter, entries_end; + map, pair > squash_map; + rgw_bucket_shard_sync_info& sync_info; + rgw_obj_key key; + rgw_bi_log_entry *entry{nullptr}; + RGWBucketIncSyncShardMarkerTrack marker_tracker; + bool updated_status{false}; + const string& status_oid; + const string& zone_id; + + string cur_id; + + int sync_status{0}; + bool syncstopped{false}; + + RGWSyncTraceNodeRef tn; +public: + RGWBucketShardIncrementalSyncCR(RGWDataSyncEnv *_sync_env, + const rgw_bucket_shard& bs, + RGWBucketInfo *_bucket_info, + const std::string& status_oid, + RGWContinuousLeaseCR *lease_cr, + rgw_bucket_shard_sync_info& sync_info, + RGWSyncTraceNodeRef& _tn_parent) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), bs(bs), + bucket_info(_bucket_info), lease_cr(lease_cr), sync_info(sync_info), + marker_tracker(sync_env, status_oid, sync_info.inc_marker), + status_oid(status_oid), zone_id(_sync_env->store->svc.zone->get_zone().id), + tn(sync_env->sync_tracer->add_node(_tn_parent, "inc_sync", + SSTR(bucket_shard_str{bs}))) + { + set_description() << "bucket shard incremental sync bucket=" + << bucket_shard_str{bs}; + set_status("init"); + marker_tracker.set_tn(tn); + } + + int operate() override; +}; + +int RGWBucketShardIncrementalSyncCR::operate() +{ + int ret; + reenter(this) { + do { + if (!lease_cr->is_locked()) { + drain_all(); + tn->log(0, "ERROR: lease is not taken, abort"); + return set_cr_error(-ECANCELED); + } + tn->log(20, SSTR("listing bilog for incremental sync" << sync_info.inc_marker.position)); + set_status() << "listing bilog; position=" << sync_info.inc_marker.position; + yield call(new 
RGWListBucketIndexLogCR(sync_env, bs, sync_info.inc_marker.position, + &list_result)); + if (retcode < 0 && retcode != -ENOENT) { + /* wait for all operations to complete */ + drain_all(); + return set_cr_error(retcode); + } + squash_map.clear(); + entries_iter = list_result.begin(); + entries_end = list_result.end(); + for (; entries_iter != entries_end; ++entries_iter) { + auto e = *entries_iter; + if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP) { + ldout(sync_env->cct, 20) << "syncstop on " << e.timestamp << dendl; + syncstopped = true; + entries_end = entries_iter; // dont sync past here + break; + } + if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC) { + continue; + } + if (e.op == CLS_RGW_OP_CANCEL) { + continue; + } + if (e.state != CLS_RGW_STATE_COMPLETE) { + continue; + } + if (e.zones_trace.find(zone_id) != e.zones_trace.end()) { + continue; + } + auto& squash_entry = squash_map[make_pair(e.object, e.instance)]; + // don't squash over olh entries - we need to apply their olh_epoch + if (has_olh_epoch(squash_entry.second) && !has_olh_epoch(e.op)) { + continue; + } + if (squash_entry.first <= e.timestamp) { + squash_entry = make_pair<>(e.timestamp, e.op); + } + } + + entries_iter = list_result.begin(); + for (; entries_iter != entries_end; ++entries_iter) { + if (!lease_cr->is_locked()) { + drain_all(); + return set_cr_error(-ECANCELED); + } + entry = &(*entries_iter); + { + ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */ + if (p < 0) { + cur_id = entry->id; + } else { + cur_id = entry->id.substr(p + 1); + } + } + sync_info.inc_marker.position = cur_id; + + if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) { + ldout(sync_env->cct, 20) << "detected syncstop or resync on " << entries_iter->timestamp << ", skipping entry" << dendl; + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + if 
(!key.set(rgw_obj_index_key{entry->object, entry->instance})) { + set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry"; + tn->log(20, SSTR("parse_raw_oid() on " << entry->object << " returned false, skipping entry")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + tn->log(20, SSTR("parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << key.name << " instance=" << key.instance << " ns=" << key.ns)); + + if (!key.ns.empty()) { + set_status() << "skipping entry in namespace: " << entry->object; + tn->log(20, SSTR("skipping entry in namespace: " << entry->object)); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + + set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op; + if (entry->op == CLS_RGW_OP_CANCEL) { + set_status() << "canceled operation, skipping"; + tn->log(20, SSTR("skipping object: " + << bucket_shard_str{bs} << "/" << key << ": canceled operation")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + if (entry->state != CLS_RGW_STATE_COMPLETE) { + set_status() << "non-complete operation, skipping"; + tn->log(20, SSTR("skipping object: " + << bucket_shard_str{bs} << "/" << key << ": non-complete operation")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + if (entry->zones_trace.find(zone_id) != entry->zones_trace.end()) { + set_status() << "redundant operation, skipping"; + tn->log(20, SSTR("skipping object: " + <timestamp); + continue; + } + if (make_pair<>(entry->timestamp, entry->op) != squash_map[make_pair(entry->object, entry->instance)]) { + set_status() << "squashed operation, skipping"; + tn->log(20, SSTR("skipping object: " + << bucket_shard_str{bs} << "/" << key << ": squashed operation")); + marker_tracker.try_update_high_marker(cur_id, 0, 
entry->timestamp); + continue; + } + tn->set_flag(RGW_SNS_FLAG_ACTIVE); + tn->log(20, SSTR("syncing object: " + << bucket_shard_str{bs} << "/" << key)); + updated_status = false; + while (!marker_tracker.can_do_op(key, has_olh_epoch(entry->op))) { + if (!updated_status) { + set_status() << "can't do op, conflicting inflight operation"; + updated_status = true; + } + tn->log(5, SSTR("can't do op on key=" << key << " need to wait for conflicting operation to complete")); + yield wait_for_child(); + bool again = true; + while (again) { + again = collect(&ret, nullptr); + if (ret < 0) { + tn->log(0, SSTR("ERROR: a child operation returned error (ret=" << ret << ")")); + sync_status = ret; + /* we have reported this error */ + } + } + if (sync_status != 0) + break; + } + if (sync_status != 0) { + /* get error, stop */ + break; + } + if (!marker_tracker.index_key_to_marker(key, cur_id, has_olh_epoch(entry->op))) { + set_status() << "can't do op, sync already in progress for object"; + tn->log(20, SSTR("skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object")); + marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp); + continue; + } + // yield { + set_status() << "start object sync"; + if (!marker_tracker.start(cur_id, 0, entry->timestamp)) { + tn->log(0, SSTR("ERROR: cannot start syncing " << cur_id << ". 
Duplicate entry?")); + } else { + std::optional versioned_epoch; + rgw_bucket_entry_owner owner(entry->owner, entry->owner_display_name); + if (entry->ver.pool < 0) { + versioned_epoch = entry->ver.epoch; + } + tn->log(20, SSTR("entry->timestamp=" << entry->timestamp)); + using SyncCR = RGWBucketSyncSingleEntryCR; + spawn(new SyncCR(sync_env, bucket_info, bs, key, + entry->is_versioned(), versioned_epoch, + entry->timestamp, owner, entry->op, entry->state, + cur_id, &marker_tracker, entry->zones_trace, tn), + false); + } + // } + while (num_spawned() > BUCKET_SYNC_SPAWN_WINDOW) { + set_status() << "num_spawned() > spawn_window"; + yield wait_for_child(); + bool again = true; + while (again) { + again = collect(&ret, nullptr); + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_status = ret; + /* we have reported this error */ + } + /* not waiting for child here */ + } + } + } + } while (!list_result.empty() && sync_status == 0 && !syncstopped); + + while (num_spawned()) { + yield wait_for_child(); + bool again = true; + while (again) { + again = collect(&ret, nullptr); + if (ret < 0) { + tn->log(10, "a sync operation returned error"); + sync_status = ret; + /* we have reported this error */ + } + /* not waiting for child here */ + } + } + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + + if (syncstopped) { + // transition back to StateInit in RGWRunBucketSyncCoroutine. if sync is + // still disabled, we'll delete the sync status object. 
otherwise we'll + // restart full sync to catch any changes that happened while sync was + // disabled + sync_info.state = rgw_bucket_shard_sync_info::StateInit; + return set_cr_done(); + } + + yield call(marker_tracker.flush()); + if (retcode < 0) { + tn->log(0, SSTR("ERROR: marker_tracker.flush() returned retcode=" << retcode)); + return set_cr_error(retcode); + } + if (sync_status < 0) { + tn->log(10, SSTR("backing out with sync_status=" << sync_status)); + return set_cr_error(sync_status); + } + return set_cr_done(); + } + return 0; +} + +int RGWRunBucketSyncCoroutine::operate() +{ + reenter(this) { + yield { + set_status("acquiring sync lock"); + auto store = sync_env->store; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, status_oid), + "sync_lock", + cct->_conf->rgw_sync_lease_period, + this)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + tn->log(5, "failed to take lease"); + set_status("lease lock failed, early abort"); + drain_all(); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + + tn->log(10, "took lease"); + yield call(new RGWReadBucketSyncStatusCoroutine(sync_env, bs, &sync_status)); + if (retcode < 0 && retcode != -ENOENT) { + tn->log(0, "ERROR: failed to read sync status for bucket"); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + + tn->log(20, SSTR("sync status for bucket: " << sync_status.state)); + + yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->store, bs.bucket, &bucket_info)); + if (retcode == -ENOENT) { + /* bucket instance info has not been synced in yet, fetch it now */ + yield { + tn->log(10, SSTR("no local info for bucket:" << ": fetching metadata")); + string raw_key = string("bucket.instance:") + bs.bucket.get_key(); + + meta_sync_env.init(sync_env->dpp, cct, sync_env->store, 
sync_env->store->svc.zone->get_master_conn(), sync_env->async_rados, + sync_env->http_manager, sync_env->error_logger, sync_env->sync_tracer); + + call(new RGWMetaSyncSingleEntryCR(&meta_sync_env, raw_key, + string() /* no marker */, + MDLOG_STATUS_COMPLETE, + NULL /* no marker tracker */, + tn)); + } + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to fetch bucket instance info for " << bucket_str{bs.bucket})); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + + yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->store, bs.bucket, &bucket_info)); + } + if (retcode < 0) { + tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{bs.bucket})); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + + do { + if (sync_status.state == rgw_bucket_shard_sync_info::StateInit) { + yield call(new RGWInitBucketShardSyncStatusCoroutine(sync_env, bs, sync_status)); + if (retcode == -ENOENT) { + tn->log(0, "bucket sync disabled"); + lease_cr->abort(); // deleted lease object, abort/wakeup instead of unlock + lease_cr->wakeup(); + lease_cr.reset(); + drain_all(); + return set_cr_done(); + } + if (retcode < 0) { + tn->log(0, SSTR("ERROR: init sync on bucket failed, retcode=" << retcode)); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + } + + if (sync_status.state == rgw_bucket_shard_sync_info::StateFullSync) { + yield call(new RGWBucketShardFullSyncCR(sync_env, bs, &bucket_info, + status_oid, lease_cr.get(), + sync_status, tn)); + if (retcode < 0) { + tn->log(5, SSTR("full sync on bucket failed, retcode=" << retcode)); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + } + + if (sync_status.state == rgw_bucket_shard_sync_info::StateIncrementalSync) { + yield call(new RGWBucketShardIncrementalSyncCR(sync_env, bs, &bucket_info, + status_oid, lease_cr.get(), + sync_status, tn)); + if (retcode < 0) { + tn->log(5, SSTR("incremental 
sync on bucket failed, retcode=" << retcode)); + lease_cr->go_down(); + drain_all(); + return set_cr_error(retcode); + } + } + // loop back to previous states unless incremental sync returns normally + } while (sync_status.state != rgw_bucket_shard_sync_info::StateIncrementalSync); + + lease_cr->go_down(); + drain_all(); + return set_cr_done(); + } + + return 0; +} + +RGWCoroutine *RGWRemoteBucketLog::run_sync_cr() +{ + return new RGWRunBucketSyncCoroutine(&sync_env, bs, sync_env.sync_tracer->root_node); +} + +int RGWBucketSyncStatusManager::init() +{ + conn = store->svc.zone->get_zone_conn_by_id(source_zone); + if (!conn) { + ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl; + return -EINVAL; + } + + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(this, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + + + const string key = bucket.get_key(); + + rgw_http_param_pair pairs[] = { { "key", key.c_str() }, + { NULL, NULL } }; + + string path = string("/admin/metadata/bucket.instance"); + + bucket_instance_meta_info result; + ret = cr_mgr.run(new RGWReadRESTResourceCR(store->ctx(), conn, &http_manager, path, pairs, &result)); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to fetch bucket metadata info from zone=" << source_zone << " path=" << path << " key=" << key << " ret=" << ret << dendl; + return ret; + } + + RGWBucketInfo& bi = result.data.get_bucket_info(); + num_shards = bi.num_shards; + + error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS); + + sync_module.reset(new RGWDefaultSyncModuleInstance()); + + int effective_num_shards = (num_shards ? num_shards : 1); + + auto async_rados = store->get_async_rados(); + + for (int i = 0; i < effective_num_shards; i++) { + RGWRemoteBucketLog *l = new RGWRemoteBucketLog(this, store, this, async_rados, &http_manager); + ret = l->init(source_zone, conn, bucket, (num_shards ? 
i : -1), error_logger, store->get_sync_tracer(), sync_module); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to initialize RGWRemoteBucketLog object" << dendl; + return ret; + } + source_logs[i] = l; + } + + return 0; +} + +int RGWBucketSyncStatusManager::init_sync_status() +{ + list stacks; + + for (map::iterator iter = source_logs.begin(); iter != source_logs.end(); ++iter) { + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), &cr_mgr); + RGWRemoteBucketLog *l = iter->second; + stack->call(l->init_sync_status_cr()); + + stacks.push_back(stack); + } + + return cr_mgr.run(stacks); +} + +int RGWBucketSyncStatusManager::read_sync_status() +{ + list stacks; + + for (map::iterator iter = source_logs.begin(); iter != source_logs.end(); ++iter) { + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), &cr_mgr); + RGWRemoteBucketLog *l = iter->second; + stack->call(l->read_sync_status_cr(&sync_status[iter->first])); + + stacks.push_back(stack); + } + + int ret = cr_mgr.run(stacks); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to read sync status for " + << bucket_str{bucket} << dendl; + return ret; + } + + return 0; +} + +int RGWBucketSyncStatusManager::run() +{ + list stacks; + + for (map::iterator iter = source_logs.begin(); iter != source_logs.end(); ++iter) { + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), &cr_mgr); + RGWRemoteBucketLog *l = iter->second; + stack->call(l->run_sync_cr()); + + stacks.push_back(stack); + } + + int ret = cr_mgr.run(stacks); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to read sync status for " + << bucket_str{bucket} << dendl; + return ret; + } + + return 0; +} + +unsigned RGWBucketSyncStatusManager::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWBucketSyncStatusManager::gen_prefix(std::ostream& out) const +{ + auto zone = std::string_view{source_zone}; + return out << "bucket sync zone:" << zone.substr(0, 8) + << " bucket:" << bucket.name << 
' '; +} + +string RGWBucketSyncStatusManager::status_oid(const string& source_zone, + const rgw_bucket_shard& bs) +{ + return bucket_status_oid_prefix + "." + source_zone + ":" + bs.get_key(); +} + +string RGWBucketSyncStatusManager::obj_status_oid(const string& source_zone, + const rgw_obj& obj) +{ + return object_status_oid_prefix + "." + source_zone + ":" + obj.bucket.get_key() + ":" + + obj.key.name + ":" + obj.key.instance; +} + +class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR { + static constexpr int max_concurrent_shards = 16; + RGWRados *const store; + RGWDataSyncEnv *const env; + const int num_shards; + rgw_bucket_shard bs; + + using Vector = std::vector; + Vector::iterator i, end; + + public: + RGWCollectBucketSyncStatusCR(RGWRados *store, RGWDataSyncEnv *env, + int num_shards, const rgw_bucket& bucket, + Vector *status) + : RGWShardCollectCR(store->ctx(), max_concurrent_shards), + store(store), env(env), num_shards(num_shards), + bs(bucket, num_shards > 0 ? 0 : -1), // start at shard 0 or -1 + i(status->begin()), end(status->end()) + {} + + bool spawn_next() override { + if (i == end) { + return false; + } + spawn(new RGWReadBucketSyncStatusCoroutine(env, bs, &*i), false); + ++i; + ++bs.shard_id; + return true; + } +}; + +int rgw_bucket_sync_status(const DoutPrefixProvider *dpp, RGWRados *store, const std::string& source_zone, + const RGWBucketInfo& bucket_info, + std::vector *status) +{ + const auto num_shards = bucket_info.num_shards; + status->clear(); + status->resize(std::max(1, num_shards)); + + RGWDataSyncEnv env; + RGWSyncModuleInstanceRef module; // null sync module + env.init(dpp, store->ctx(), store, nullptr, store->get_async_rados(), + nullptr, nullptr, nullptr, source_zone, module, nullptr); + + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + return crs.run(new RGWCollectBucketSyncStatusCR(store, &env, num_shards, + bucket_info.bucket, status)); +} + + +// TODO: move into rgw_data_sync_trim.cc +#undef 
dout_prefix +#define dout_prefix (*_dout << "data trim: ") + +namespace { + +/// return the marker that it's safe to trim up to +const std::string& get_stable_marker(const rgw_data_sync_marker& m) +{ + return m.state == m.FullSync ? m.next_step_marker : m.marker; +} + +/// populate the container starting with 'dest' with the minimum stable marker +/// of each shard for all of the peers in [first, last) +template +void take_min_markers(IterIn first, IterIn last, IterOut dest) +{ + if (first == last) { + return; + } + for (auto p = first; p != last; ++p) { + auto m = dest; + for (auto &shard : p->sync_markers) { + const auto& stable = get_stable_marker(shard.second); + if (*m > stable) { + *m = stable; + } + ++m; + } + } +} + +} // anonymous namespace + +class DataLogTrimCR : public RGWCoroutine { + using TrimCR = RGWSyncLogTrimCR; + RGWRados *store; + RGWHTTPManager *http; + const int num_shards; + const std::string& zone_id; //< my zone id + std::vector peer_status; //< sync status for each peer + std::vector min_shard_markers; //< min marker per shard + std::vector& last_trim; //< last trimmed marker per shard + int ret{0}; + + public: + DataLogTrimCR(RGWRados *store, RGWHTTPManager *http, + int num_shards, std::vector& last_trim) + : RGWCoroutine(store->ctx()), store(store), http(http), + num_shards(num_shards), + zone_id(store->svc.zone->get_zone().id), + peer_status(store->svc.zone->get_zone_data_notify_to_map().size()), + min_shard_markers(num_shards, TrimCR::max_marker), + last_trim(last_trim) + {} + + int operate() override; +}; + +int DataLogTrimCR::operate() +{ + reenter(this) { + ldout(cct, 10) << "fetching sync status for zone " << zone_id << dendl; + set_status("fetching sync status"); + yield { + // query data sync status from each sync peer + rgw_http_param_pair params[] = { + { "type", "data" }, + { "status", nullptr }, + { "source-zone", zone_id.c_str() }, + { nullptr, nullptr } + }; + + auto p = peer_status.begin(); + for (auto& c : 
store->svc.zone->get_zone_data_notify_to_map()) { + ldout(cct, 20) << "query sync status from " << c.first << dendl; + using StatusCR = RGWReadRESTResourceCR; + spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p), + false); + ++p; + } + } + + // must get a successful reply from all peers to consider trimming + ret = 0; + while (ret == 0 && num_spawned() > 0) { + yield wait_for_child(); + collect_next(&ret); + } + drain_all(); + + if (ret < 0) { + ldout(cct, 4) << "failed to fetch sync status from all peers" << dendl; + return set_cr_error(ret); + } + + ldout(cct, 10) << "trimming log shards" << dendl; + set_status("trimming log shards"); + yield { + // determine the minimum marker for each shard + take_min_markers(peer_status.begin(), peer_status.end(), + min_shard_markers.begin()); + + for (int i = 0; i < num_shards; i++) { + const auto& m = min_shard_markers[i]; + if (m <= last_trim[i]) { + continue; + } + ldout(cct, 10) << "trimming log shard " << i + << " at marker=" << m + << " last_trim=" << last_trim[i] << dendl; + spawn(new TrimCR(store, store->data_log->get_oid(i), + m, &last_trim[i]), + true); + } + } + return set_cr_done(); + } + return 0; +} + +RGWCoroutine* create_admin_data_log_trim_cr(RGWRados *store, + RGWHTTPManager *http, + int num_shards, + std::vector& markers) +{ + return new DataLogTrimCR(store, http, num_shards, markers); +} + +class DataLogTrimPollCR : public RGWCoroutine { + RGWRados *store; + RGWHTTPManager *http; + const int num_shards; + const utime_t interval; //< polling interval + const std::string lock_oid; //< use first data log shard for lock + const std::string lock_cookie; + std::vector last_trim; //< last trimmed marker per shard + + public: + DataLogTrimPollCR(RGWRados *store, RGWHTTPManager *http, + int num_shards, utime_t interval) + : RGWCoroutine(store->ctx()), store(store), http(http), + num_shards(num_shards), interval(interval), + lock_oid(store->data_log->get_oid(0)), + 
lock_cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)), + last_trim(num_shards) + {} + + int operate() override; +}; + +int DataLogTrimPollCR::operate() +{ + reenter(this) { + for (;;) { + set_status("sleeping"); + wait(interval); + + // request a 'data_trim' lock that covers the entire wait interval to + // prevent other gateways from attempting to trim for the duration + set_status("acquiring trim lock"); + yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, lock_oid), + "data_trim", lock_cookie, + interval.sec())); + if (retcode < 0) { + // if the lock is already held, go back to sleep and try again later + ldout(cct, 4) << "failed to lock " << lock_oid << ", trying again in " + << interval.sec() << "s" << dendl; + continue; + } + + set_status("trimming"); + yield call(new DataLogTrimCR(store, http, num_shards, last_trim)); + + // note that the lock is not released. this is intentional, as it avoids + // duplicating this work in other gateways + } + } + return 0; +} + +RGWCoroutine* create_data_log_trim_cr(RGWRados *store, + RGWHTTPManager *http, + int num_shards, utime_t interval) +{ + return new DataLogTrimPollCR(store, http, num_shards, interval); +} diff --git a/src/rgw/rgw_data_sync.h b/src/rgw/rgw_data_sync.h new file mode 100644 index 00000000..55a71d72 --- /dev/null +++ b/src/rgw/rgw_data_sync.h @@ -0,0 +1,625 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_DATA_SYNC_H +#define CEPH_RGW_DATA_SYNC_H + +#include "include/encoding.h" + +#include "common/RWLock.h" +#include "common/ceph_json.h" + + +#include "rgw_coroutine.h" +#include "rgw_http_client.h" +#include "rgw_bucket.h" + +#include "rgw_sync_module.h" +#include "rgw_sync_trace.h" + +struct rgw_datalog_info { + uint32_t num_shards; + + rgw_datalog_info() : num_shards(0) {} + + void decode_json(JSONObj *obj); +}; + +struct rgw_data_sync_info { + 
enum SyncState { + StateInit = 0, + StateBuildingFullSyncMaps = 1, + StateSync = 2, + }; + + uint16_t state; + uint32_t num_shards; + + uint64_t instance_id{0}; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(num_shards, bl); + encode(instance_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(state, bl); + decode(num_shards, bl); + if (struct_v >= 2) { + decode(instance_id, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + string s; + switch ((SyncState)state) { + case StateInit: + s = "init"; + break; + case StateBuildingFullSyncMaps: + s = "building-full-sync-maps"; + break; + case StateSync: + s = "sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("num_shards", num_shards, f); + encode_json("instance_id", instance_id, f); + } + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "building-full-sync-maps") { + state = StateBuildingFullSyncMaps; + } else if (s == "sync") { + state = StateSync; + } else { + state = StateInit; + } + JSONDecoder::decode_json("num_shards", num_shards, obj); + JSONDecoder::decode_json("instance_id", instance_id, obj); + } + static void generate_test_instances(std::list& o); + + rgw_data_sync_info() : state((int)StateInit), num_shards(0) {} +}; +WRITE_CLASS_ENCODER(rgw_data_sync_info) + +struct rgw_data_sync_marker { + enum SyncState { + FullSync = 0, + IncrementalSync = 1, + }; + uint16_t state; + string marker; + string next_step_marker; + uint64_t total_entries; + uint64_t pos; + real_time timestamp; + + rgw_data_sync_marker() : state(FullSync), total_entries(0), pos(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(state, bl); + encode(marker, bl); + encode(next_step_marker, bl); + encode(total_entries, bl); + encode(pos, bl); + encode(timestamp, bl); + 
ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(state, bl); + decode(marker, bl); + decode(next_step_marker, bl); + decode(total_entries, bl); + decode(pos, bl); + decode(timestamp, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + const char *s{nullptr}; + switch ((SyncState)state) { + case FullSync: + s = "full-sync"; + break; + case IncrementalSync: + s = "incremental-sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("marker", marker, f); + encode_json("next_step_marker", next_step_marker, f); + encode_json("total_entries", total_entries, f); + encode_json("pos", pos, f); + encode_json("timestamp", utime_t(timestamp), f); + } + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "full-sync") { + state = FullSync; + } else if (s == "incremental-sync") { + state = IncrementalSync; + } + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("next_step_marker", next_step_marker, obj); + JSONDecoder::decode_json("total_entries", total_entries, obj); + JSONDecoder::decode_json("pos", pos, obj); + utime_t t; + JSONDecoder::decode_json("timestamp", t, obj); + timestamp = t.to_real_time(); + } + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(rgw_data_sync_marker) + +struct rgw_data_sync_status { + rgw_data_sync_info sync_info; + map sync_markers; + + rgw_data_sync_status() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(sync_info, bl); + /* sync markers are encoded separately */ + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(sync_info, bl); + /* sync markers are decoded separately */ + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + encode_json("info", sync_info, f); + encode_json("markers", sync_markers, f); + } + void decode_json(JSONObj 
*obj) { + JSONDecoder::decode_json("info", sync_info, obj); + JSONDecoder::decode_json("markers", sync_markers, obj); + } + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(rgw_data_sync_status) + +struct rgw_datalog_entry { + string key; + ceph::real_time timestamp; + + void decode_json(JSONObj *obj); +}; + +struct rgw_datalog_shard_data { + string marker; + bool truncated; + vector entries; + + void decode_json(JSONObj *obj); +}; + +class RGWAsyncRadosProcessor; +class RGWDataSyncControlCR; + +struct rgw_bucket_entry_owner { + string id; + string display_name; + + rgw_bucket_entry_owner() {} + rgw_bucket_entry_owner(const string& _id, const string& _display_name) : id(_id), display_name(_display_name) {} + + void decode_json(JSONObj *obj); +}; + +class RGWSyncErrorLogger; +class RGWRESTConn; + +struct RGWDataSyncEnv { + const DoutPrefixProvider *dpp{nullptr}; + CephContext *cct{nullptr}; + RGWRados *store{nullptr}; + RGWRESTConn *conn{nullptr}; + RGWAsyncRadosProcessor *async_rados{nullptr}; + RGWHTTPManager *http_manager{nullptr}; + RGWSyncErrorLogger *error_logger{nullptr}; + RGWSyncTraceManager *sync_tracer{nullptr}; + string source_zone; + RGWSyncModuleInstanceRef sync_module{nullptr}; + PerfCounters* counters{nullptr}; + + RGWDataSyncEnv() {} + + void init(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWRados *_store, RGWRESTConn *_conn, + RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager, + RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer, + const string& _source_zone, RGWSyncModuleInstanceRef& _sync_module, + PerfCounters* _counters) { + dpp = _dpp; + cct = _cct; + store = _store; + conn = _conn; + async_rados = _async_rados; + http_manager = _http_manager; + error_logger = _error_logger; + sync_tracer = _sync_tracer; + source_zone = _source_zone; + sync_module = _sync_module; + counters = _counters; + } + + string shard_obj_name(int shard_id); + string status_oid(); +}; + +class 
RGWRemoteDataLog : public RGWCoroutinesManager { + const DoutPrefixProvider *dpp; + RGWRados *store; + RGWAsyncRadosProcessor *async_rados; + RGWHTTPManager http_manager; + + RGWDataSyncEnv sync_env; + + RWLock lock; + RGWDataSyncControlCR *data_sync_cr; + + RGWSyncTraceNodeRef tn; + + bool initialized; + +public: + RGWRemoteDataLog(const DoutPrefixProvider *dpp, RGWRados *_store, + RGWAsyncRadosProcessor *async_rados) + : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), + dpp(dpp), store(_store), async_rados(async_rados), + http_manager(store->ctx(), completion_mgr), + lock("RGWRemoteDataLog::lock"), data_sync_cr(NULL), + initialized(false) {} + int init(const string& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger, + RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& module, + PerfCounters* _counters); + void finish(); + + int read_log_info(rgw_datalog_info *log_info); + int read_source_log_shards_info(map *shards_info); + int read_source_log_shards_next(map shard_markers, map *result); + int read_sync_status(rgw_data_sync_status *sync_status); + int read_recovering_shards(const int num_shards, set& recovering_shards); + int read_shard_status(int shard_id, set& lagging_buckets,set& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries); + int init_sync_status(int num_shards); + int run_sync(int num_shards); + + void wakeup(int shard_id, set& keys); +}; + +class RGWDataSyncStatusManager : public DoutPrefixProvider { + RGWRados *store; + rgw_rados_ref ref; + + string source_zone; + RGWRESTConn *conn; + RGWSyncErrorLogger *error_logger; + RGWSyncModuleInstanceRef sync_module; + PerfCounters* counters; + + RGWRemoteDataLog source_log; + + string source_status_oid; + string source_shard_status_oid_prefix; + + map shard_objs; + + int num_shards; + +public: + RGWDataSyncStatusManager(RGWRados *_store, RGWAsyncRadosProcessor *async_rados, + const string& _source_zone, PerfCounters* counters) + : 
store(_store), source_zone(_source_zone), conn(NULL), error_logger(NULL), + sync_module(nullptr), counters(counters), + source_log(this, store, async_rados), num_shards(0) {} + RGWDataSyncStatusManager(RGWRados *_store, RGWAsyncRadosProcessor *async_rados, + const string& _source_zone, PerfCounters* counters, + const RGWSyncModuleInstanceRef& _sync_module) + : store(_store), source_zone(_source_zone), conn(NULL), error_logger(NULL), + sync_module(_sync_module), counters(counters), + source_log(this, store, async_rados), num_shards(0) {} + ~RGWDataSyncStatusManager() { + finalize(); + } + int init(); + void finalize(); + + static string shard_obj_name(const string& source_zone, int shard_id); + static string sync_status_oid(const string& source_zone); + + int read_sync_status(rgw_data_sync_status *sync_status) { + return source_log.read_sync_status(sync_status); + } + + int read_recovering_shards(const int num_shards, set& recovering_shards) { + return source_log.read_recovering_shards(num_shards, recovering_shards); + } + + int read_shard_status(int shard_id, set& lagging_buckets, set& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) { + return source_log.read_shard_status(shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries); + } + int init_sync_status() { return source_log.init_sync_status(num_shards); } + + int read_log_info(rgw_datalog_info *log_info) { + return source_log.read_log_info(log_info); + } + int read_source_log_shards_info(map *shards_info) { + return source_log.read_source_log_shards_info(shards_info); + } + int read_source_log_shards_next(map shard_markers, map *result) { + return source_log.read_source_log_shards_next(shard_markers, result); + } + + int run() { return source_log.run_sync(num_shards); } + + void wakeup(int shard_id, set& keys) { return source_log.wakeup(shard_id, keys); } + void stop() { + source_log.finish(); + } + + // implements DoutPrefixProvider + CephContext *get_cct() const 
override { return store->ctx(); } + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; +}; + +class RGWBucketSyncStatusManager; +class RGWBucketSyncCR; + +struct rgw_bucket_shard_full_sync_marker { + rgw_obj_key position; + uint64_t count; + + rgw_bucket_shard_full_sync_marker() : count(0) {} + + void encode_attr(map& attrs); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(position, bl); + encode(count, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(position, bl); + decode(count, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_shard_full_sync_marker) + +struct rgw_bucket_shard_inc_sync_marker { + string position; + + rgw_bucket_shard_inc_sync_marker() {} + + void encode_attr(map& attrs); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(position, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(position, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + bool operator<(const rgw_bucket_shard_inc_sync_marker& m) const { + return (position < m.position); + } +}; +WRITE_CLASS_ENCODER(rgw_bucket_shard_inc_sync_marker) + +struct rgw_bucket_shard_sync_info { + enum SyncState { + StateInit = 0, + StateFullSync = 1, + StateIncrementalSync = 2, + }; + + uint16_t state; + rgw_bucket_shard_full_sync_marker full_marker; + rgw_bucket_shard_inc_sync_marker inc_marker; + + void decode_from_attrs(CephContext *cct, map& attrs); + void encode_all_attrs(map& attrs); + void encode_state_attr(map& attrs); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(state, bl); + encode(full_marker, bl); + encode(inc_marker, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + 
DECODE_START(1, bl); + decode(state, bl); + decode(full_marker, bl); + decode(inc_marker, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + rgw_bucket_shard_sync_info() : state((int)StateInit) {} + +}; +WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info) + +struct rgw_bucket_index_marker_info { + string bucket_ver; + string master_ver; + string max_marker; + bool syncstopped{false}; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket_ver", bucket_ver, obj); + JSONDecoder::decode_json("master_ver", master_ver, obj); + JSONDecoder::decode_json("max_marker", max_marker, obj); + JSONDecoder::decode_json("syncstopped", syncstopped, obj); + } +}; + + +class RGWRemoteBucketLog : public RGWCoroutinesManager { + const DoutPrefixProvider *dpp; + RGWRados *store; + RGWRESTConn *conn{nullptr}; + string source_zone; + rgw_bucket_shard bs; + + RGWBucketSyncStatusManager *status_manager; + RGWAsyncRadosProcessor *async_rados; + RGWHTTPManager *http_manager; + + RGWDataSyncEnv sync_env; + rgw_bucket_shard_sync_info init_status; + + RGWBucketSyncCR *sync_cr{nullptr}; + +public: + RGWRemoteBucketLog(const DoutPrefixProvider *_dpp, RGWRados *_store, + RGWBucketSyncStatusManager *_sm, + RGWAsyncRadosProcessor *_async_rados, + RGWHTTPManager *_http_manager) + : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), + dpp(_dpp), store(_store), status_manager(_sm), + async_rados(_async_rados), http_manager(_http_manager) + {} + + int init(const string& _source_zone, RGWRESTConn *_conn, + const rgw_bucket& bucket, int shard_id, + RGWSyncErrorLogger *_error_logger, + RGWSyncTraceManager *_sync_tracer, + RGWSyncModuleInstanceRef& _sync_module); + void finish(); + + RGWCoroutine *read_sync_status_cr(rgw_bucket_shard_sync_info *sync_status); + RGWCoroutine *init_sync_status_cr(); + RGWCoroutine *run_sync_cr(); + + void wakeup(); +}; + +class RGWBucketSyncStatusManager : public DoutPrefixProvider { + RGWRados *store; 
+ + RGWCoroutinesManager cr_mgr; + + RGWHTTPManager http_manager; + + string source_zone; + RGWRESTConn *conn; + RGWSyncErrorLogger *error_logger; + RGWSyncModuleInstanceRef sync_module; + + rgw_bucket bucket; + + map source_logs; + + string source_status_oid; + string source_shard_status_oid_prefix; + + map sync_status; + rgw_raw_obj status_obj; + + int num_shards; + +public: + RGWBucketSyncStatusManager(RGWRados *_store, const string& _source_zone, + const rgw_bucket& bucket) : store(_store), + cr_mgr(_store->ctx(), _store->get_cr_registry()), + http_manager(store->ctx(), cr_mgr.get_completion_mgr()), + source_zone(_source_zone), + conn(NULL), error_logger(NULL), + bucket(bucket), + num_shards(0) {} + ~RGWBucketSyncStatusManager(); + + int init(); + + map& get_sync_status() { return sync_status; } + int init_sync_status(); + + static string status_oid(const string& source_zone, const rgw_bucket_shard& bs); + static string obj_status_oid(const string& source_zone, const rgw_obj& obj); /* can be used by sync modules */ + + // implements DoutPrefixProvider + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + + int read_sync_status(); + int run(); +}; + +/// read the sync status of all bucket shards from the given source zone +int rgw_bucket_sync_status(const DoutPrefixProvider *dpp, RGWRados *store, const std::string& source_zone, + const RGWBucketInfo& bucket_info, + std::vector *status); + +class RGWDefaultSyncModule : public RGWSyncModule { +public: + RGWDefaultSyncModule() {} + bool supports_writes() override { return true; } + bool supports_data_export() override { return true; } + int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +class RGWArchiveSyncModule : public RGWDefaultSyncModule { +public: + RGWArchiveSyncModule() {} + bool supports_writes() override { return true; } + 
bool supports_data_export() override { return false; } + int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +// DataLogTrimCR factory function +extern RGWCoroutine* create_data_log_trim_cr(RGWRados *store, + RGWHTTPManager *http, + int num_shards, utime_t interval); + +// factory function for datalog trim via radosgw-admin +RGWCoroutine* create_admin_data_log_trim_cr(RGWRados *store, + RGWHTTPManager *http, + int num_shards, + std::vector& markers); + +#endif diff --git a/src/rgw/rgw_dencoder.cc b/src/rgw/rgw_dencoder.cc new file mode 100644 index 00000000..91078c15 --- /dev/null +++ b/src/rgw/rgw_dencoder.cc @@ -0,0 +1,564 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_log.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_cache.h" +#include "rgw_meta_sync_status.h" +#include "rgw_data_sync.h" + +#include "common/Formatter.h" + +static string shadow_ns = RGW_OBJ_NS_SHADOW; + +static void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id) +{ + b->tenant = t; + b->name = n; + b->marker = m; + b->bucket_id = id; + b->explicit_placement.data_pool = rgw_pool(dp); + b->explicit_placement.index_pool = rgw_pool(ip); +} + +void RGWObjManifestPart::generate_test_instances(std::list& o) +{ + o.push_back(new RGWObjManifestPart); + + RGWObjManifestPart *p = new RGWObjManifestPart; + rgw_bucket b; + init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12"); + + p->loc = rgw_obj(b, "object"); + p->loc_ofs = 512 * 1024; + p->size = 128 * 1024; + o.push_back(p); +} + +void RGWObjManifest::obj_iterator::seek(uint64_t o) +{ + ofs = o; + if (manifest->explicit_objs) { + explicit_iter = manifest->objs.upper_bound(ofs); + if (explicit_iter != manifest->objs.begin()) { + 
--explicit_iter; + } + if (ofs >= manifest->obj_size) { + ofs = manifest->obj_size; + return; + } + update_explicit_pos(); + update_location(); + return; + } + if (o < manifest->get_head_size()) { + rule_iter = manifest->rules.begin(); + stripe_ofs = 0; + stripe_size = manifest->get_head_size(); + if (rule_iter != manifest->rules.end()) { + cur_part_id = rule_iter->second.start_part_num; + cur_override_prefix = rule_iter->second.override_prefix; + } + update_location(); + return; + } + + rule_iter = manifest->rules.upper_bound(ofs); + next_rule_iter = rule_iter; + if (rule_iter != manifest->rules.begin()) { + --rule_iter; + } + + if (rule_iter == manifest->rules.end()) { + update_location(); + return; + } + + RGWObjManifestRule& rule = rule_iter->second; + + if (rule.part_size > 0) { + cur_part_id = rule.start_part_num + (ofs - rule.start_ofs) / rule.part_size; + } else { + cur_part_id = rule.start_part_num; + } + part_ofs = rule.start_ofs + (cur_part_id - rule.start_part_num) * rule.part_size; + + if (rule.stripe_max_size > 0) { + cur_stripe = (ofs - part_ofs) / rule.stripe_max_size; + + stripe_ofs = part_ofs + cur_stripe * rule.stripe_max_size; + if (!cur_part_id && manifest->get_head_size() > 0) { + cur_stripe++; + } + } else { + cur_stripe = 0; + stripe_ofs = part_ofs; + } + + if (!rule.part_size) { + stripe_size = rule.stripe_max_size; + stripe_size = std::min(manifest->get_obj_size() - stripe_ofs, stripe_size); + } else { + uint64_t next = std::min(stripe_ofs + rule.stripe_max_size, part_ofs + rule.part_size); + stripe_size = next - stripe_ofs; + } + + cur_override_prefix = rule.override_prefix; + + update_location(); +} + +void RGWObjManifest::obj_iterator::update_location() +{ + if (manifest->explicit_objs) { + location = explicit_iter->second.loc; + return; + } + + if (ofs < manifest->get_head_size()) { + location = manifest->get_obj(); + location.set_placement_rule(manifest->get_head_placement_rule()); + return; + } + + 
manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, &cur_override_prefix, &location); +} + +void RGWObjManifest::obj_iterator::update_explicit_pos() +{ + ofs = explicit_iter->first; + stripe_ofs = ofs; + + map::iterator next_iter = explicit_iter; + ++next_iter; + if (next_iter != manifest->objs.end()) { + stripe_size = next_iter->first - ofs; + } else { + stripe_size = manifest->obj_size - ofs; + } +} + +void RGWObjManifest::generate_test_instances(std::list& o) +{ + RGWObjManifest *m = new RGWObjManifest; + for (int i = 0; i<10; i++) { + RGWObjManifestPart p; + rgw_bucket b; + init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12"); + p.loc = rgw_obj(b, "object"); + p.loc_ofs = 0; + p.size = 512 * 1024; + m->objs[(uint64_t)i * 512 * 1024] = p; + } + m->obj_size = 5 * 1024 * 1024; + + o.push_back(m); + + o.push_back(new RGWObjManifest); +} + +void RGWObjManifest::get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs, string *override_prefix, rgw_obj_select *location) +{ + rgw_obj loc; + + string& oid = loc.key.name; + string& ns = loc.key.ns; + + if (!override_prefix || override_prefix->empty()) { + oid = prefix; + } else { + oid = *override_prefix; + } + + if (!cur_part_id) { + if (ofs < max_head_size) { + location->set_placement_rule(head_placement_rule); + *location = obj; + return; + } else { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", (int)cur_stripe); + oid += buf; + ns = shadow_ns; + } + } else { + char buf[32]; + if (cur_stripe == 0) { + snprintf(buf, sizeof(buf), ".%d", (int)cur_part_id); + oid += buf; + ns= RGW_OBJ_NS_MULTIPART; + } else { + snprintf(buf, sizeof(buf), ".%d_%d", (int)cur_part_id, (int)cur_stripe); + oid += buf; + ns = shadow_ns; + } + } + + if (!tail_placement.bucket.name.empty()) { + loc.bucket = tail_placement.bucket; + } else { + loc.bucket = obj.bucket; + } + + // Always overwrite instance with tail_instance + // to get the right shadow object location + 
loc.key.set_instance(tail_instance); + + location->set_placement_rule(tail_placement.placement_rule); + *location = loc; +} + + + +void rgw_log_entry::generate_test_instances(list& o) +{ + rgw_log_entry *e = new rgw_log_entry; + e->object_owner = "object_owner"; + e->bucket_owner = "bucket_owner"; + e->bucket = "bucket"; + e->remote_addr = "1.2.3.4"; + e->user = "user"; + e->obj = rgw_obj_key("obj"); + e->uri = "http://uri/bucket/obj"; + e->http_status = "200"; + e->error_code = "error_code"; + e->bytes_sent = 1024; + e->bytes_received = 512; + e->obj_size = 2048; + e->user_agent = "user_agent"; + e->referrer = "referrer"; + e->bucket_id = "10"; + o.push_back(e); + o.push_back(new rgw_log_entry); +} + +void ACLPermission::generate_test_instances(list& o) +{ + ACLPermission *p = new ACLPermission; + p->set_permissions(RGW_PERM_WRITE_ACP); + o.push_back(p); + o.push_back(new ACLPermission); +} + +void ACLGranteeType::generate_test_instances(list& o) +{ + ACLGranteeType *t = new ACLGranteeType; + t->set(ACL_TYPE_CANON_USER); + o.push_back(t); + o.push_back(new ACLGranteeType); +} + +/* the following is copied here from rgw_acl_s3.cc, to avoid having to have excessive linking + with everything it needs */ + +#define RGW_URI_ALL_USERS "http://acs.amazonaws.com/groups/global/AllUsers" +#define RGW_URI_AUTH_USERS "http://acs.amazonaws.com/groups/global/AuthenticatedUsers" + +static string rgw_uri_all_users = RGW_URI_ALL_USERS; +static string rgw_uri_auth_users = RGW_URI_AUTH_USERS; + +ACLGroupTypeEnum ACLGrant::uri_to_group(string& uri) +{ + // this is required for backward compatibility + return ACLGrant_S3::uri_to_group(uri); +} + +ACLGroupTypeEnum ACLGrant_S3::uri_to_group(string& uri) +{ + if (uri.compare(rgw_uri_all_users) == 0) + return ACL_GROUP_ALL_USERS; + else if (uri.compare(rgw_uri_auth_users) == 0) + return ACL_GROUP_AUTHENTICATED_USERS; + + return ACL_GROUP_NONE; +} + +void ACLGrant::generate_test_instances(list& o) +{ + rgw_user id("rgw"); + string name, 
email; + name = "Mr. RGW"; + email = "r@gw"; + + ACLGrant *g1 = new ACLGrant; + g1->set_canon(id, name, RGW_PERM_READ); + g1->email = email; + o.push_back(g1); + + ACLGrant *g2 = new ACLGrant; + g1->set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_WRITE); + o.push_back(g2); + + o.push_back(new ACLGrant); +} + +void RGWAccessControlList::generate_test_instances(list& o) +{ + RGWAccessControlList *acl = new RGWAccessControlList(NULL); + + list glist; + list::iterator iter; + + ACLGrant::generate_test_instances(glist); + for (iter = glist.begin(); iter != glist.end(); ++iter) { + ACLGrant *grant = *iter; + acl->add_grant(grant); + + delete grant; + } + o.push_back(acl); + o.push_back(new RGWAccessControlList(NULL)); +} + +void ACLOwner::generate_test_instances(list& o) +{ + ACLOwner *owner = new ACLOwner; + owner->id = "rgw"; + owner->display_name = "Mr. RGW"; + o.push_back(owner); + o.push_back(new ACLOwner); +} + +void RGWAccessControlPolicy::generate_test_instances(list& o) +{ + list acl_list; + list::iterator iter; + for (iter = acl_list.begin(); iter != acl_list.end(); ++iter) { + RGWAccessControlList::generate_test_instances(acl_list); + iter = acl_list.begin(); + + RGWAccessControlPolicy *p = new RGWAccessControlPolicy(NULL); + RGWAccessControlList *l = *iter; + p->acl = *l; + + string name = "radosgw"; + rgw_user id("rgw"); + p->owner.set_name(name); + p->owner.set_id(id); + + o.push_back(p); + + delete l; + } + + o.push_back(new RGWAccessControlPolicy(NULL)); +} + + +void ObjectMetaInfo::generate_test_instances(list& o) +{ + ObjectMetaInfo *m = new ObjectMetaInfo; + m->size = 1024 * 1024; + o.push_back(m); + o.push_back(new ObjectMetaInfo); +} + +void ObjectCacheInfo::generate_test_instances(list& o) +{ + using ceph::encode; + ObjectCacheInfo *i = new ObjectCacheInfo; + i->status = 0; + i->flags = CACHE_FLAG_MODIFY_XATTRS; + string s = "this is a string"; + string s2 = "this is a another string"; + bufferlist data, data2; + encode(s, data); + encode(s2, 
data2); + i->data = data; + i->xattrs["x1"] = data; + i->xattrs["x2"] = data2; + i->rm_xattrs["r2"] = data2; + i->rm_xattrs["r3"] = data; + i->meta.size = 512 * 1024; + o.push_back(i); + o.push_back(new ObjectCacheInfo); +} + +void RGWCacheNotifyInfo::generate_test_instances(list& o) +{ + o.push_back(new RGWCacheNotifyInfo); +} + +void RGWAccessKey::generate_test_instances(list& o) +{ + RGWAccessKey *k = new RGWAccessKey; + k->id = "id"; + k->key = "key"; + k->subuser = "subuser"; + o.push_back(k); + o.push_back(new RGWAccessKey); +} + +void RGWSubUser::generate_test_instances(list& o) +{ + RGWSubUser *u = new RGWSubUser; + u->name = "name"; + u->perm_mask = 0xf; + o.push_back(u); + o.push_back(new RGWSubUser); +} + +void RGWUserInfo::generate_test_instances(list& o) +{ + RGWUserInfo *i = new RGWUserInfo; + i->user_id = "user_id"; + i->display_name = "display_name"; + i->user_email = "user@email"; + RGWAccessKey k1, k2; + k1.id = "id1"; + k1.key = "key1"; + k2.id = "id2"; + k2.subuser = "subuser"; + RGWSubUser u; + u.name = "id2"; + u.perm_mask = 0x1; + i->access_keys[k1.id] = k1; + i->swift_keys[k2.id] = k2; + i->subusers[u.name] = u; + o.push_back(i); + + o.push_back(new RGWUserInfo); +} + +void rgw_bucket::generate_test_instances(list& o) +{ + rgw_bucket *b = new rgw_bucket; + init_bucket(b, "tenant", "name", "pool", ".index_pool", "marker", "123"); + o.push_back(b); + o.push_back(new rgw_bucket); +} + +void RGWBucketInfo::generate_test_instances(list& o) +{ + RGWBucketInfo *i = new RGWBucketInfo; + init_bucket(&i->bucket, "tenant", "bucket", "pool", ".index_pool", "marker", "10"); + i->owner = "owner"; + i->flags = BUCKET_SUSPENDED; + o.push_back(i); + o.push_back(new RGWBucketInfo); +} + +void RGWZoneGroup::generate_test_instances(list& o) +{ + RGWZoneGroup *r = new RGWZoneGroup; + o.push_back(r); + o.push_back(new RGWZoneGroup); +} + +void RGWZone::generate_test_instances(list &o) +{ + RGWZone *z = new RGWZone; + o.push_back(z); + o.push_back(new RGWZone); +} 
+ +void RGWRealm::generate_test_instances(list &o) +{ + RGWRealm *z = new RGWRealm; + o.push_back(z); + o.push_back(new RGWRealm); +} + +void RGWPeriod::generate_test_instances(list &o) +{ + RGWPeriod *z = new RGWPeriod; + o.push_back(z); + o.push_back(new RGWPeriod); +} + +void RGWZoneParams::generate_test_instances(list &o) +{ + o.push_back(new RGWZoneParams); + o.push_back(new RGWZoneParams); +} + +void RGWOLHInfo::generate_test_instances(list &o) +{ + RGWOLHInfo *olh = new RGWOLHInfo; + olh->removed = false; + o.push_back(olh); + o.push_back(new RGWOLHInfo); +} + +void RGWBucketEnt::generate_test_instances(list& o) +{ + RGWBucketEnt *e = new RGWBucketEnt; + init_bucket(&e->bucket, "tenant", "bucket", "pool", ".index_pool", "marker", "10"); + e->size = 1024; + e->size_rounded = 4096; + e->count = 1; + o.push_back(e); + o.push_back(new RGWBucketEnt); +} + +void RGWUploadPartInfo::generate_test_instances(list& o) +{ + RGWUploadPartInfo *i = new RGWUploadPartInfo; + i->num = 1; + i->size = 10 * 1024 * 1024; + i->etag = "etag"; + o.push_back(i); + o.push_back(new RGWUploadPartInfo); +} + +void rgw_obj::generate_test_instances(list& o) +{ + rgw_bucket b; + init_bucket(&b, "tenant", "bucket", "pool", ".index_pool", "marker", "10"); + rgw_obj *obj = new rgw_obj(b, "object"); + o.push_back(obj); + o.push_back(new rgw_obj); +} + +void rgw_meta_sync_info::generate_test_instances(list& o) +{ + auto info = new rgw_meta_sync_info; + info->state = rgw_meta_sync_info::StateBuildingFullSyncMaps; + info->period = "periodid"; + info->realm_epoch = 5; + o.push_back(info); + o.push_back(new rgw_meta_sync_info); +} + +void rgw_meta_sync_marker::generate_test_instances(list& o) +{ + auto marker = new rgw_meta_sync_marker; + marker->state = rgw_meta_sync_marker::IncrementalSync; + marker->marker = "01234"; + marker->realm_epoch = 5; + o.push_back(marker); + o.push_back(new rgw_meta_sync_marker); +} + +void rgw_meta_sync_status::generate_test_instances(list& o) +{ + o.push_back(new 
rgw_meta_sync_status); +} + +void rgw_data_sync_info::generate_test_instances(list& o) +{ + auto info = new rgw_data_sync_info; + info->state = rgw_data_sync_info::StateBuildingFullSyncMaps; + info->num_shards = 8; + o.push_back(info); + o.push_back(new rgw_data_sync_info); +} + +void rgw_data_sync_marker::generate_test_instances(list& o) +{ + auto marker = new rgw_data_sync_marker; + marker->state = rgw_data_sync_marker::IncrementalSync; + marker->marker = "01234"; + marker->pos = 5; + o.push_back(marker); + o.push_back(new rgw_data_sync_marker); +} + +void rgw_data_sync_status::generate_test_instances(list& o) +{ + o.push_back(new rgw_data_sync_status); +} diff --git a/src/rgw/rgw_dmclock.h b/src/rgw/rgw_dmclock.h new file mode 100644 index 00000000..79c0aeb7 --- /dev/null +++ b/src/rgw/rgw_dmclock.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * Copyright (C) 2019 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef RGW_DMCLOCK_H +#define RGW_DMCLOCK_H +#include "dmclock/src/dmclock_server.h" + +namespace rgw::dmclock { +// TODO: implement read vs write +enum class client_id { + admin, //< /admin apis + auth, //< swift auth, sts + data, //< PutObj, GetObj + metadata, //< bucket operations, object metadata + count +}; + +// TODO move these to dmclock/types or so in submodule +using crimson::dmclock::Cost; +using crimson::dmclock::ClientInfo; + +enum class scheduler_t { + none, + throttler, + dmclock +}; + +inline scheduler_t get_scheduler_t(CephContext* const cct) +{ + const auto scheduler_type = cct->_conf.get_val("rgw_scheduler_type"); + if (scheduler_type == "dmclock") + return scheduler_t::dmclock; + else if (scheduler_type == "throttler") + return scheduler_t::throttler; + else + return scheduler_t::none; +} + +} // namespace rgw::dmclock + +#endif /* RGW_DMCLOCK_H */ diff --git a/src/rgw/rgw_dmclock_async_scheduler.cc b/src/rgw/rgw_dmclock_async_scheduler.cc new file mode 100644 index 00000000..18ba5a5e --- /dev/null +++ b/src/rgw/rgw_dmclock_async_scheduler.cc @@ -0,0 +1,175 @@ + +#include "common/async/completion.h" +#include "rgw_dmclock_async_scheduler.h" +#include "rgw_dmclock_scheduler.h" + +namespace rgw::dmclock { + +AsyncScheduler::~AsyncScheduler() +{ + cancel(); + if (observer) { + cct->_conf.remove_observer(this); + } +} + +const char** AsyncScheduler::get_tracked_conf_keys() const +{ + if (observer) { + return observer->get_tracked_conf_keys(); + } + static const char* keys[] = { "rgw_max_concurrent_requests", nullptr }; + return keys; +} + +void AsyncScheduler::handle_conf_change(const ConfigProxy& conf, + const std::set& changed) +{ + if (observer) { + observer->handle_conf_change(conf, changed); + } + if (changed.count("rgw_max_concurrent_requests")) { + auto new_max = conf.get_val("rgw_max_concurrent_requests"); + max_requests = new_max > 0 ? 
new_max : std::numeric_limits::max(); + } + queue.update_client_infos(); + schedule(crimson::dmclock::TimeZero); +} + +int AsyncScheduler::schedule_request_impl(const client_id& client, + const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield yield_ctx) +{ + ceph_assert(yield_ctx); + + auto &yield = yield_ctx.get_yield_context(); + boost::system::error_code ec; + async_request(client, params, time, cost, yield[ec]); + + if (ec){ + if (ec == boost::system::errc::resource_unavailable_try_again) + return -EAGAIN; + else + return -ec.value(); + } + + return 0; +} + +void AsyncScheduler::request_complete() +{ + --outstanding_requests; + schedule(crimson::dmclock::TimeZero); +} + +void AsyncScheduler::cancel() +{ + ClientSums sums; + + queue.remove_by_req_filter([&] (RequestRef&& request) { + inc(sums, request->client, request->cost); + auto c = static_cast(request.release()); + Completion::dispatch(std::unique_ptr{c}, + boost::asio::error::operation_aborted, + PhaseType::priority); + return true; + }); + timer.cancel(); + + for (size_t i = 0; i < client_count; i++) { + if (auto c = counters(static_cast(i))) { + on_cancel(c, sums[i]); + } + } +} + +void AsyncScheduler::cancel(const client_id& client) +{ + ClientSum sum; + + queue.remove_by_client(client, false, [&] (RequestRef&& request) { + sum.count++; + sum.cost += request->cost; + auto c = static_cast(request.release()); + Completion::dispatch(std::unique_ptr{c}, + boost::asio::error::operation_aborted, + PhaseType::priority); + }); + if (auto c = counters(client)) { + on_cancel(c, sum); + } + schedule(crimson::dmclock::TimeZero); +} + +void AsyncScheduler::schedule(const Time& time) +{ + timer.expires_at(Clock::from_double(time)); + timer.async_wait([this] (boost::system::error_code ec) { + // process requests unless the wait was canceled. 
note that a canceled + // wait may execute after this AsyncScheduler destructs + if (ec != boost::asio::error::operation_aborted) { + process(get_time()); + } + }); +} + +void AsyncScheduler::process(const Time& now) +{ + // must run in the executor. we should only invoke completion handlers if the + // executor is running + assert(get_executor().running_in_this_thread()); + + ClientSums rsums, psums; + + while (outstanding_requests < max_requests) { + auto pull = queue.pull_request(now); + + if (pull.is_none()) { + // no pending requests, cancel the timer + timer.cancel(); + break; + } + if (pull.is_future()) { + // update the timer based on the future time + schedule(pull.getTime()); + break; + } + ++outstanding_requests; + + // complete the request + auto& r = pull.get_retn(); + auto client = r.client; + auto phase = r.phase; + auto started = r.request->started; + auto cost = r.request->cost; + auto c = static_cast(r.request.release()); + Completion::post(std::unique_ptr{c}, + boost::system::error_code{}, phase); + + if (auto c = counters(client)) { + auto lat = Clock::from_double(now) - Clock::from_double(started); + if (phase == PhaseType::reservation) { + inc(rsums, client, cost); + c->tinc(queue_counters::l_res_latency, lat); + } else { + inc(psums, client, cost); + c->tinc(queue_counters::l_prio_latency, lat); + } + } + } + + if (outstanding_requests >= max_requests) { + if(auto c = counters(client_id::count)){ + c->inc(throttle_counters::l_throttle); + } + } + + for (size_t i = 0; i < client_count; i++) { + if (auto c = counters(static_cast(i))) { + on_process(c, rsums[i], psums[i]); + } + } +} + +} // namespace rgw::dmclock diff --git a/src/rgw/rgw_dmclock_async_scheduler.h b/src/rgw/rgw_dmclock_async_scheduler.h new file mode 100644 index 00000000..1d454acd --- /dev/null +++ b/src/rgw/rgw_dmclock_async_scheduler.h @@ -0,0 +1,217 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable 
distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RGW_DMCLOCK_ASYNC_SCHEDULER_H +#define RGW_DMCLOCK_ASYNC_SCHEDULER_H + +#include "common/async/completion.h" + +#include +#include "rgw_dmclock_scheduler.h" +#include "rgw_dmclock_scheduler_ctx.h" + +namespace rgw::dmclock { + namespace async = ceph::async; + +/* + * A dmclock request scheduling service for use with boost::asio. + * + * An asynchronous dmclock priority queue, where scheduled requests complete + * on a boost::asio executor. + */ +class AsyncScheduler : public md_config_obs_t, public Scheduler { + public: + template // args forwarded to PullPriorityQueue ctor + AsyncScheduler(CephContext *cct, boost::asio::io_context& context, + GetClientCounters&& counters, md_config_obs_t *observer, + Args&& ...args); + ~AsyncScheduler(); + + using executor_type = boost::asio::io_context::executor_type; + + /// return the default executor for async_request() callbacks + executor_type get_executor() noexcept { + return timer.get_executor(); + } + + /// submit an async request for dmclock scheduling. the given completion + /// handler will be invoked with (error_code, PhaseType) when the request + /// is ready or canceled. 
on success, this grants a throttle unit that must + /// be returned with a call to request_complete() + template + auto async_request(const client_id& client, const ReqParams& params, + const Time& time, Cost cost, CompletionToken&& token); + + /// returns a throttle unit granted by async_request() + void request_complete() override; + + /// cancel all queued requests, invoking their completion handlers with an + /// operation_aborted error and default-constructed result + void cancel(); + + /// cancel all queued requests for a given client, invoking their completion + /// handler with an operation_aborted error and default-constructed result + void cancel(const client_id& client); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set& changed) override; + + private: + int schedule_request_impl(const client_id& client, const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield yield_ctx) override; + + static constexpr bool IsDelayed = false; + using Queue = crimson::dmclock::PullPriorityQueue; + using RequestRef = typename Queue::RequestRef; + Queue queue; //< dmclock priority queue + + using Signature = void(boost::system::error_code, PhaseType); + using Completion = async::Completion>; + + using Clock = ceph::coarse_real_clock; +#if BOOST_VERSION < 107000 + using Timer = boost::asio::basic_waitable_timer; +#else + using Timer = boost::asio::basic_waitable_timer, executor_type>; +#endif + Timer timer; //< timer for the next scheduled request + + CephContext *const cct; + md_config_obs_t *const observer; //< observer to update ClientInfoFunc + GetClientCounters counters; //< provides per-client perf counters + + /// max request throttle + std::atomic max_requests; + std::atomic outstanding_requests = 0; + + /// set a timer to process the next request + void schedule(const Time& time); + + /// process ready requests, then schedule the next pending request + void process(const 
Time& now); +}; + + +template +AsyncScheduler::AsyncScheduler(CephContext *cct, boost::asio::io_context& context, + GetClientCounters&& counters, + md_config_obs_t *observer, Args&& ...args) + : queue(std::forward(args)...), + timer(context), cct(cct), observer(observer), + counters(std::move(counters)), + max_requests(cct->_conf.get_val("rgw_max_concurrent_requests")) +{ + if (max_requests <= 0) { + max_requests = std::numeric_limits::max(); + } + if (observer) { + cct->_conf.add_observer(this); + } +} + +template +auto AsyncScheduler::async_request(const client_id& client, + const ReqParams& params, + const Time& time, Cost cost, + CompletionToken&& token) +{ + boost::asio::async_completion init(token); + + auto ex1 = get_executor(); + auto& handler = init.completion_handler; + + // allocate the Request and add it to the queue + auto completion = Completion::create(ex1, std::move(handler), + Request{client, time, cost}); + // cast to unique_ptr + auto req = RequestRef{std::move(completion)}; + int r = queue.add_request(std::move(req), client, params, time, cost); + if (r == 0) { + // schedule an immediate call to process() on the executor + schedule(crimson::dmclock::TimeZero); + if (auto c = counters(client)) { + c->inc(queue_counters::l_qlen); + c->inc(queue_counters::l_cost, cost); + } + } else { + // post the error code + boost::system::error_code ec(r, boost::system::system_category()); + // cast back to Completion + auto completion = static_cast(req.release()); + async::post(std::unique_ptr{completion}, + ec, PhaseType::priority); + if (auto c = counters(client)) { + c->inc(queue_counters::l_limit); + c->inc(queue_counters::l_limit_cost, cost); + } + } + + return init.result.get(); +} + +class SimpleThrottler : public md_config_obs_t, public dmclock::Scheduler { +public: + SimpleThrottler(CephContext *cct) : + max_requests(cct->_conf.get_val("rgw_max_concurrent_requests")), + counters(cct, "simple-throttler") + { + if (max_requests <= 0) { + max_requests = 
std::numeric_limits::max(); + } + cct->_conf.add_observer(this); + } + + const char** get_tracked_conf_keys() const override { + static const char* keys[] = { "rgw_max_concurrent_requests", nullptr }; + return keys; + } + + void handle_conf_change(const ConfigProxy& conf, + const std::set& changed) override + { + if (changed.count("rgw_max_concurrent_requests")) { + auto new_max = conf.get_val("rgw_max_concurrent_requests"); + max_requests = new_max > 0 ? new_max : std::numeric_limits::max(); + } + } + + void request_complete() override { + --outstanding_requests; + } + +private: + int schedule_request_impl(const client_id&, const ReqParams&, + const Time&, const Cost&, + optional_yield) override { + if (outstanding_requests++ >= max_requests) { + if (auto c = counters(); + c != nullptr) { + c->inc(throttle_counters::l_throttle); + } + return -EAGAIN; + } + + return 0 ; + } + + std::atomic max_requests; + std::atomic outstanding_requests = 0; + ThrottleCounters counters; +}; + +} // namespace rgw::dmclock +#endif /* RGW_DMCLOCK_ASYNC_SCHEDULER_H */ diff --git a/src/rgw/rgw_dmclock_scheduler.h b/src/rgw/rgw_dmclock_scheduler.h new file mode 100644 index 00000000..aeeb695e --- /dev/null +++ b/src/rgw/rgw_dmclock_scheduler.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * (C) 2019 SUSE LLC + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef RGW_DMCLOCK_SCHEDULER_H +#define RGW_DMCLOCK_SCHEDULER_H + +#include "common/ceph_time.h" +#include "common/ceph_context.h" +#include "common/config.h" +#include "common/async/yield_context.h" +#include "rgw_dmclock.h" + +namespace rgw::dmclock { + +using crimson::dmclock::ReqParams; +using crimson::dmclock::PhaseType; +using crimson::dmclock::AtLimit; +using crimson::dmclock::Time; +using crimson::dmclock::get_time; + +/// function to provide client counters +using GetClientCounters = std::function; + +struct Request { + client_id client; + Time started; + Cost cost; +}; + +enum class ReqState { + Wait, + Ready, + Cancelled +}; + +template +class Completer { +public: + Completer(F &&f): f(std::move(f)) {} + // Default constructor is needed as we need to create an empty completer + // that'll be move assigned later in process request + Completer() = default; + ~Completer() { + if (f) { + f(); + } + } + Completer(const Completer&) = delete; + Completer& operator=(const Completer&) = delete; + Completer(Completer&& other) = default; + Completer& operator=(Completer&& other) = default; +private: + F f; +}; + +using SchedulerCompleter = Completer>; + +class Scheduler { +public: + auto schedule_request(const client_id& client, const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield yield) + { + int r = schedule_request_impl(client,params,time,cost,yield); + return std::make_pair(r,SchedulerCompleter(std::bind(&Scheduler::request_complete,this))); + } + virtual void request_complete() {}; + + virtual ~Scheduler() {}; +private: + virtual int schedule_request_impl(const client_id&, const ReqParams&, + const Time&, const Cost&, + optional_yield) = 0; +}; + +} // namespace rgw::dmclock + +#endif // RGW_DMCLOCK_SCHEDULER_H diff --git a/src/rgw/rgw_dmclock_scheduler_ctx.cc b/src/rgw/rgw_dmclock_scheduler_ctx.cc new file mode 100644 index 00000000..3ecc977f --- /dev/null +++ b/src/rgw/rgw_dmclock_scheduler_ctx.cc @@ -0,0 +1,177 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat, Inc. + * (C) 2019 SUSE Linux LLC + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "rgw_dmclock_scheduler_ctx.h" + +namespace rgw::dmclock { + +ClientConfig::ClientConfig(CephContext *cct) +{ + update(cct->_conf); +} + +ClientInfo* ClientConfig::operator()(client_id client) +{ + return &clients[static_cast(client)]; +} + +const char** ClientConfig::get_tracked_conf_keys() const +{ + static const char* keys[] = { + "rgw_dmclock_admin_res", + "rgw_dmclock_admin_wgt", + "rgw_dmclock_admin_lim", + "rgw_dmclock_auth_res", + "rgw_dmclock_auth_wgt", + "rgw_dmclock_auth_lim", + "rgw_dmclock_data_res", + "rgw_dmclock_data_wgt", + "rgw_dmclock_data_lim", + "rgw_dmclock_metadata_res", + "rgw_dmclock_metadata_wgt", + "rgw_dmclock_metadata_lim", + "rgw_max_concurrent_requests", + nullptr + }; + return keys; +} + +void ClientConfig::update(const ConfigProxy& conf) +{ + clients.clear(); + static_assert(0 == static_cast(client_id::admin)); + clients.emplace_back(conf.get_val("rgw_dmclock_admin_res"), + conf.get_val("rgw_dmclock_admin_wgt"), + conf.get_val("rgw_dmclock_admin_lim")); + static_assert(1 == static_cast(client_id::auth)); + clients.emplace_back(conf.get_val("rgw_dmclock_auth_res"), + conf.get_val("rgw_dmclock_auth_wgt"), + conf.get_val("rgw_dmclock_auth_lim")); + static_assert(2 == static_cast(client_id::data)); + clients.emplace_back(conf.get_val("rgw_dmclock_data_res"), + conf.get_val("rgw_dmclock_data_wgt"), + conf.get_val("rgw_dmclock_data_lim")); + static_assert(3 == static_cast(client_id::metadata)); + clients.emplace_back(conf.get_val("rgw_dmclock_metadata_res"), + conf.get_val("rgw_dmclock_metadata_wgt"), 
+ conf.get_val("rgw_dmclock_metadata_lim")); +} + +void ClientConfig::handle_conf_change(const ConfigProxy& conf, + const std::set& changed) +{ + update(conf); +} + +ClientCounters::ClientCounters(CephContext *cct) +{ + clients[static_cast(client_id::admin)] = + queue_counters::build(cct, "dmclock-admin"); + clients[static_cast(client_id::auth)] = + queue_counters::build(cct, "dmclock-auth"); + clients[static_cast(client_id::data)] = + queue_counters::build(cct, "dmclock-data"); + clients[static_cast(client_id::metadata)] = + queue_counters::build(cct, "dmclock-metadata"); + clients[static_cast(client_id::count)] = + throttle_counters::build(cct, "dmclock-scheduler"); +} + +void inc(ClientSums& sums, client_id client, Cost cost) +{ + auto& sum = sums[static_cast(client)]; + sum.count++; + sum.cost += cost; +} + +void on_cancel(PerfCounters *c, const ClientSum& sum) +{ + if (sum.count) { + c->dec(queue_counters::l_qlen, sum.count); + c->inc(queue_counters::l_cancel, sum.count); + } + if (sum.cost) { + c->dec(queue_counters::l_cost, sum.cost); + c->inc(queue_counters::l_cancel_cost, sum.cost); + } +} + +void on_process(PerfCounters* c, const ClientSum& rsum, const ClientSum& psum) +{ + if (rsum.count) { + c->inc(queue_counters::l_res, rsum.count); + } + if (rsum.cost) { + c->inc(queue_counters::l_res_cost, rsum.cost); + } + if (psum.count) { + c->inc(queue_counters::l_prio, psum.count); + } + if (psum.cost) { + c->inc(queue_counters::l_prio_cost, psum.cost); + } + if (rsum.count + psum.count) { + c->dec(queue_counters::l_qlen, rsum.count + psum.count); + } + if (rsum.cost + psum.cost) { + c->dec(queue_counters::l_cost, rsum.cost + psum.cost); + } +} +} // namespace rgw::dmclock + +namespace queue_counters { + +PerfCountersRef build(CephContext *cct, const std::string& name) +{ + if (!cct->_conf->throttler_perf_counter) { + return {}; + } + + PerfCountersBuilder b(cct, name, l_first, l_last); + b.add_u64(l_qlen, "qlen", "Queue size"); + b.add_u64(l_cost, "cost", "Cost 
of queued requests"); + b.add_u64_counter(l_res, "res", "Requests satisfied by reservation"); + b.add_u64_counter(l_res_cost, "res_cost", "Cost satisfied by reservation"); + b.add_u64_counter(l_prio, "prio", "Requests satisfied by priority"); + b.add_u64_counter(l_prio_cost, "prio_cost", "Cost satisfied by priority"); + b.add_u64_counter(l_limit, "limit", "Requests rejected by limit"); + b.add_u64_counter(l_limit_cost, "limit_cost", "Cost rejected by limit"); + b.add_u64_counter(l_cancel, "cancel", "Cancels"); + b.add_u64_counter(l_cancel_cost, "cancel_cost", "Canceled cost"); + b.add_time_avg(l_res_latency, "res latency", "Reservation latency"); + b.add_time_avg(l_prio_latency, "prio latency", "Priority latency"); + + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; + cct->get_perfcounters_collection()->add(logger.get()); + return logger; +} + +} // namespace queue_counters + +namespace throttle_counters { + +PerfCountersRef build(CephContext *cct, const std::string& name) +{ + if (!cct->_conf->throttler_perf_counter) { + return {}; + } + + PerfCountersBuilder b(cct, name, l_first, l_last); + b.add_u64(l_throttle, "throttle", "Requests throttled"); + + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; + cct->get_perfcounters_collection()->add(logger.get()); + return logger; +} + +} // namespace throttle_counters diff --git a/src/rgw/rgw_dmclock_scheduler_ctx.h b/src/rgw/rgw_dmclock_scheduler_ctx.h new file mode 100644 index 00000000..fe34e180 --- /dev/null +++ b/src/rgw/rgw_dmclock_scheduler_ctx.h @@ -0,0 +1,118 @@ +#ifndef RGW_DMCLOCK_SCHEDULER_CTX_H +#define RGW_DMCLOCK_SCHEDULER_CTX_H + +#include "common/perf_counters.h" +#include "common/ceph_context.h" +#include "common/config.h" +#include "rgw_dmclock.h" + +namespace queue_counters { + + enum { + l_first = 427150, + l_qlen, + l_cost, + l_res, + l_res_cost, + l_prio, + l_prio_cost, + l_limit, + l_limit_cost, + l_cancel, + l_cancel_cost, + l_res_latency, + l_prio_latency, + l_last, 
+ }; + + PerfCountersRef build(CephContext *cct, const std::string& name); + +} // namespace queue_counters + +namespace throttle_counters { + enum { + l_first = 437219, + l_throttle, + l_last + }; + + PerfCountersRef build(CephContext *cct, const std::string& name); +} // namespace throttle + +namespace rgw::dmclock { + +// the last client counter would be for global scheduler stats +static constexpr auto counter_size = static_cast(client_id::count) + 1; +/// array of per-client counters to serve as GetClientCounters +class ClientCounters { + std::array clients; + public: + ClientCounters(CephContext *cct); + + PerfCounters* operator()(client_id client) const { + return clients[static_cast(client)].get(); + } +}; + +class ThrottleCounters { + PerfCountersRef counters; +public: + ThrottleCounters(CephContext* const cct,const std::string& name): + counters(throttle_counters::build(cct, name)) {} + + PerfCounters* operator()() const { + return counters.get(); + } +}; + + +struct ClientSum { + uint64_t count{0}; + Cost cost{0}; +}; + +constexpr auto client_count = static_cast(client_id::count); +using ClientSums = std::array; + +void inc(ClientSums& sums, client_id client, Cost cost); +void on_cancel(PerfCounters *c, const ClientSum& sum); +void on_process(PerfCounters* c, const ClientSum& rsum, const ClientSum& psum); + + +class ClientConfig : public md_config_obs_t { + std::vector clients; + + void update(const ConfigProxy &conf); + +public: + ClientConfig(CephContext *cct); + + ClientInfo* operator()(client_id client); + + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set& changed) override; +}; + +class SchedulerCtx { +public: + SchedulerCtx(CephContext* const cct) : sched_t(get_scheduler_t(cct)) + { + if(sched_t == scheduler_t::dmclock) { + dmc_client_config = std::make_shared(cct); + // we don't have a move only cref std::function yet + dmc_client_counters = std::make_optional(cct); + } + } + 
// We need to construct a std::function from a NonCopyable object + ClientCounters& get_dmc_client_counters() { return dmc_client_counters.value(); } + ClientConfig* const get_dmc_client_config() const { return dmc_client_config.get(); } +private: + scheduler_t sched_t; + std::shared_ptr dmc_client_config {nullptr}; + std::optional dmc_client_counters {std::nullopt}; +}; + +} // namespace rgw::dmclock + +#endif /* RGW_DMCLOCK_SCHEDULER_CTX_H */ diff --git a/src/rgw/rgw_dmclock_sync_scheduler.cc b/src/rgw/rgw_dmclock_sync_scheduler.cc new file mode 100644 index 00000000..650a995d --- /dev/null +++ b/src/rgw/rgw_dmclock_sync_scheduler.cc @@ -0,0 +1,114 @@ +#include "rgw_dmclock_scheduler.h" +#include "rgw_dmclock_sync_scheduler.h" +#include "rgw_dmclock_scheduler_ctx.h" + +namespace rgw::dmclock { + +SyncScheduler::~SyncScheduler() +{ + cancel(); +} + +int SyncScheduler::add_request(const client_id& client, const ReqParams& params, + const Time& time, Cost cost) +{ + std::mutex req_mtx; + std::condition_variable req_cv; + ReqState rstate {ReqState::Wait}; + auto req = SyncRequest{client, time, cost, req_mtx, req_cv, rstate, counters}; + int r = queue.add_request_time(req, client, params, time, cost); + if (r == 0) { + if (auto c = counters(client)) { + c->inc(queue_counters::l_qlen); + c->inc(queue_counters::l_cost, cost); + } + queue.request_completed(); + // Perform a blocking wait until the request callback is called + { + std::unique_lock lock{req_mtx}; + req_cv.wait(lock, [&rstate] {return rstate != ReqState::Wait;}); + } + if (rstate == ReqState::Cancelled) { + //FIXME: decide on error code for cancelled request + r = -ECONNABORTED; + } + } else { + // post the error code + if (auto c = counters(client)) { + c->inc(queue_counters::l_limit); + c->inc(queue_counters::l_limit_cost, cost); + } + } + return r; +} + +void SyncScheduler::handle_request_cb(const client_id &c, + std::unique_ptr req, + PhaseType phase, Cost cost) +{ + { std::lock_guard lg(req->req_mtx); 
+ req->req_state = ReqState::Ready; + req->req_cv.notify_one(); + } + + if (auto ctr = req->counters(c)) { + auto lat = Clock::from_double(get_time()) - Clock::from_double(req->started); + if (phase == PhaseType::reservation){ + ctr->tinc(queue_counters::l_res_latency, lat); + ctr->inc(queue_counters::l_res); + if (cost) ctr->inc(queue_counters::l_res_cost); + } else if (phase == PhaseType::priority){ + ctr->tinc(queue_counters::l_prio_latency, lat); + ctr->inc(queue_counters::l_prio); + if (cost) ctr->inc(queue_counters::l_prio_cost); + } + ctr->dec(queue_counters::l_qlen); + if (cost) ctr->dec(queue_counters::l_cost); + } +} + + +void SyncScheduler::cancel(const client_id& client) +{ + ClientSum sum; + + queue.remove_by_client(client, false, [&](RequestRef&& request) + { + sum.count++; + sum.cost += request->cost; + { + std::lock_guard lg(request->req_mtx); + request->req_state = ReqState::Cancelled; + request->req_cv.notify_one(); + } + }); + if (auto c = counters(client)) { + on_cancel(c, sum); + } + + queue.request_completed(); +} + +void SyncScheduler::cancel() +{ + ClientSums sums; + + queue.remove_by_req_filter([&](RequestRef&& request) -> bool + { + inc(sums, request->client, request->cost); + { + std::lock_guard lg(request->req_mtx); + request->req_state = ReqState::Cancelled; + request->req_cv.notify_one(); + } + return true; + }); + + for (size_t i = 0; i < client_count; i++) { + if (auto c = counters(static_cast(i))) { + on_cancel(c, sums[i]); + } + } +} + +} // namespace rgw::dmclock diff --git a/src/rgw/rgw_dmclock_sync_scheduler.h b/src/rgw/rgw_dmclock_sync_scheduler.h new file mode 100644 index 00000000..ca7223f2 --- /dev/null +++ b/src/rgw/rgw_dmclock_sync_scheduler.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 SUSE Linux Gmbh + * + * This is free software; you can redistribute it and/or + * modify it 
under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RGW_DMCLOCK_SYNC_SCHEDULER_H +#define RGW_DMCLOCK_SYNC_SCHEDULER_H + +#include "rgw_dmclock_scheduler.h" +#include "rgw_dmclock_scheduler_ctx.h" + +namespace rgw::dmclock { +// For a blocking SyncRequest we hold a reference to a cv and the caller must +// ensure the lifetime +struct SyncRequest : public Request { + std::mutex& req_mtx; + std::condition_variable& req_cv; + ReqState& req_state; + GetClientCounters& counters; + explicit SyncRequest(client_id _id, Time started, Cost cost, + std::mutex& mtx, std::condition_variable& _cv, + ReqState& _state, GetClientCounters& counters): + Request{_id, started, cost}, req_mtx(mtx), req_cv(_cv), req_state(_state), counters(counters) {}; +}; + +class SyncScheduler: public Scheduler { +public: + template + SyncScheduler(CephContext *cct, GetClientCounters&& counters, + Args&& ...args); + ~SyncScheduler(); + + // submit a blocking request for dmclock scheduling, this function waits until + // the request is ready. 
+ int add_request(const client_id& client, const ReqParams& params, + const Time& time, Cost cost); + + + void cancel(); + + void cancel(const client_id& client); + + static void handle_request_cb(const client_id& c, std::unique_ptr req, + PhaseType phase, Cost cost); +private: + int schedule_request_impl(const client_id& client, const ReqParams& params, + const Time& time, const Cost& cost, + optional_yield _y [[maybe_unused]]) override + { + return add_request(client, params, time, cost); + } + + static constexpr bool IsDelayed = false; + using Queue = crimson::dmclock::PushPriorityQueue; + using RequestRef = typename Queue::RequestRef; + using Clock = ceph::coarse_real_clock; + + Queue queue; + CephContext const *cct; + GetClientCounters counters; //< provides per-client perf counters +}; + +template +SyncScheduler::SyncScheduler(CephContext *cct, GetClientCounters&& counters, + Args&& ...args): + queue(std::forward(args)...), cct(cct), counters(std::move(counters)) +{} + +} // namespace rgw::dmclock +#endif /* RGW_DMCLOCK_SYNC_SCHEDULER_H */ diff --git a/src/rgw/rgw_env.cc b/src/rgw/rgw_env.cc new file mode 100644 index 00000000..95b6eeca --- /dev/null +++ b/src/rgw/rgw_env.cc @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_log.h" + +#include +#include +#include "include/ceph_assert.h" +#include "rgw_crypt_sanitize.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +void RGWEnv::init(CephContext *cct) +{ + conf.init(cct); +} + +void RGWEnv::set(std::string name, std::string val) +{ + env_map[std::move(name)] = std::move(val); +} + +void RGWEnv::init(CephContext *cct, char **envp) +{ + const char *p; + + env_map.clear(); + + for (int i=0; (p = envp[i]); ++i) { + string s(p); + int pos = s.find('='); + if (pos <= 0) // should never be 0 + continue; + string name = s.substr(0, pos); + string val = s.substr(pos + 1); + 
env_map[name] = val; + } + + init(cct); +} + +const char *rgw_conf_get(const map& conf_map, const char *name, const char *def_val) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return def_val; + + return iter->second.c_str(); +} + +const char *RGWEnv::get(const char *name, const char *def_val) const +{ + return rgw_conf_get(env_map, name, def_val); +} + +int rgw_conf_get_int(const map& conf_map, const char *name, int def_val) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return def_val; + + const char *s = iter->second.c_str(); + return atoi(s); +} + +int RGWEnv::get_int(const char *name, int def_val) const +{ + return rgw_conf_get_int(env_map, name, def_val); +} + +bool rgw_conf_get_bool(const map& conf_map, const char *name, bool def_val) +{ + auto iter = conf_map.find(name); + if (iter == conf_map.end()) + return def_val; + + const char *s = iter->second.c_str(); + return rgw_str_to_bool(s, def_val); +} + +bool RGWEnv::get_bool(const char *name, bool def_val) +{ + return rgw_conf_get_bool(env_map, name, def_val); +} + +size_t RGWEnv::get_size(const char *name, size_t def_val) const +{ + const auto iter = env_map.find(name); + if (iter == env_map.end()) + return def_val; + + size_t sz; + try{ + sz = stoull(iter->second); + } catch(...){ + /* it is very unlikely that we'll ever encounter out_of_range, but let's + return the default eitherway */ + sz = def_val; + } + + return sz; +} + +bool RGWEnv::exists(const char *name) const +{ + return env_map.find(name)!= env_map.end(); +} + +bool RGWEnv::exists_prefix(const char *prefix) const +{ + if (env_map.empty() || prefix == NULL) + return false; + + const auto iter = env_map.lower_bound(prefix); + if (iter == env_map.end()) + return false; + + return (strncmp(iter->first.c_str(), prefix, strlen(prefix)) == 0); +} + +void RGWEnv::remove(const char *name) +{ + map::iterator iter = env_map.find(name); + if (iter != env_map.end()) + env_map.erase(iter); +} + +void 
RGWConf::init(CephContext *cct) +{ + enable_ops_log = cct->_conf->rgw_enable_ops_log; + enable_usage_log = cct->_conf->rgw_enable_usage_log; + + defer_to_bucket_acls = 0; // default + if (cct->_conf->rgw_defer_to_bucket_acls == "recurse") { + defer_to_bucket_acls = RGW_DEFER_TO_BUCKET_ACLS_RECURSE; + } else if (cct->_conf->rgw_defer_to_bucket_acls == "full_control") { + defer_to_bucket_acls = RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL; + } +} diff --git a/src/rgw/rgw_es_main.cc b/src/rgw/rgw_es_main.cc new file mode 100644 index 00000000..5983dd91 --- /dev/null +++ b/src/rgw/rgw_es_main.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include "global/global_init.h" +#include "global/global_context.h" + +#include "common/ceph_argparse.h" +#include "common/ceph_json.h" +#include "rgw_es_query.h" + + +int main(int argc, char *argv[]) +{ + vector args; + argv_to_vec(argc, (const char **)argv, args); + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + + common_init_finish(g_ceph_context); + + string expr; + + if (argc > 1) { + expr = argv[1]; + } else { + expr = "age >= 30"; + } + + ESQueryCompiler es_query(expr, nullptr, "x-amz-meta-"); + + map aliases = { { "key", "name" }, + { "etag", "meta.etag" }, + { "size", "meta.size" }, + { "mtime", "meta.mtime" }, + { "lastmodified", "meta.mtime" }, + { "contenttype", "meta.contenttype" }, + }; + es_query.set_field_aliases(&aliases); + + map generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR}, + {"name", ESEntityTypeMap::ES_ENTITY_STR}, + {"instance", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.contenttype", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE}, + {"meta.size", ESEntityTypeMap::ES_ENTITY_INT} }; + ESEntityTypeMap gm(generic_map); + es_query.set_generic_type_map(&gm); + + map 
custom_map = { {"str", ESEntityTypeMap::ES_ENTITY_STR}, + {"int", ESEntityTypeMap::ES_ENTITY_INT}, + {"date", ESEntityTypeMap::ES_ENTITY_DATE} }; + ESEntityTypeMap em(custom_map); + es_query.set_custom_type_map(&em); + + string err; + + bool valid = es_query.compile(&err); + if (!valid) { + cout << "failed to compile query: " << err << std::endl; + return EINVAL; + } + + JSONFormatter f; + encode_json("root", es_query, &f); + + f.flush(cout); + + return 0; +} + diff --git a/src/rgw/rgw_es_query.cc b/src/rgw/rgw_es_query.cc new file mode 100644 index 00000000..a2c460e9 --- /dev/null +++ b/src/rgw/rgw_es_query.cc @@ -0,0 +1,694 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include + +#include "common/ceph_json.h" +#include "rgw_common.h" +#include "rgw_es_query.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +bool pop_front(list& l, string *s) +{ + if (l.empty()) { + return false; + } + *s = l.front(); + l.pop_front(); + return true; +} + +map operator_map = { + { "or", 1 }, + { "and", 2 }, + { "<", 3 }, + { "<=", 3 }, + { "==", 3 }, + { "!=", 3 }, + { ">=", 3 }, + { ">", 3 }, +}; + +bool is_operator(const string& s) +{ + return (operator_map.find(s) != operator_map.end()); +} + +int operand_value(const string& op) +{ + auto i = operator_map.find(op); + if (i == operator_map.end()) { + return 0; + } + + return i->second; +} + +int check_precedence(const string& op1, const string& op2) +{ + return operand_value(op1) - operand_value(op2); +} + +static bool infix_to_prefix(list& source, list *out) +{ + list operator_stack; + list operand_stack; + + operator_stack.push_front("("); + source.push_back(")"); + + for (string& entity : source) { + if (entity == "(") { + operator_stack.push_front(entity); + } else if (entity == ")") { + string popped_operator; + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + + while 
(popped_operator != "(") { + operand_stack.push_front(popped_operator); + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + } + + } else if (is_operator(entity)) { + string popped_operator; + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + + int precedence = check_precedence(popped_operator, entity); + + while (precedence >= 0) { + operand_stack.push_front(popped_operator); + if (!pop_front(operator_stack, &popped_operator)) { + return false; + } + precedence = check_precedence(popped_operator, entity); + } + + operator_stack.push_front(popped_operator); + operator_stack.push_front(entity); + } else { + operand_stack.push_front(entity); + } + + } + + if (!operator_stack.empty()) { + return false; + } + + out->swap(operand_stack); + return true; +} + +class ESQueryNode { +protected: + ESQueryCompiler *compiler; +public: + ESQueryNode(ESQueryCompiler *_compiler) : compiler(_compiler) {} + virtual ~ESQueryNode() {} + + virtual bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) = 0; + + virtual void dump(Formatter *f) const = 0; +}; + +static bool alloc_node(ESQueryCompiler *compiler, ESQueryStack *s, ESQueryNode **pnode, string *perr); + +class ESQueryNode_Bool : public ESQueryNode { + string op; + ESQueryNode *first{nullptr}; + ESQueryNode *second{nullptr}; +public: + explicit ESQueryNode_Bool(ESQueryCompiler *compiler) : ESQueryNode(compiler) {} + ESQueryNode_Bool(ESQueryCompiler *compiler, const string& _op, ESQueryNode *_first, ESQueryNode *_second) :ESQueryNode(compiler), op(_op), first(_first), second(_second) {} + bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + bool valid = s->pop(&op); + if (!valid) { + *perr = "incorrect expression"; + return false; + } + valid = alloc_node(compiler, s, &first, perr) && + alloc_node(compiler, s, &second, perr); + if (!valid) { + return false; + } + *pnode = this; + return true; + } + virtual ~ESQueryNode_Bool() { + delete first; + delete 
second; + } + + void dump(Formatter *f) const override { + f->open_object_section("bool"); + const char *section = (op == "and" ? "must" : "should"); + f->open_array_section(section); + encode_json("entry", *first, f); + encode_json("entry", *second, f); + f->close_section(); + f->close_section(); + } + +}; + +class ESQueryNodeLeafVal { +public: + ESQueryNodeLeafVal() = default; + virtual ~ESQueryNodeLeafVal() {} + + virtual bool init(const string& str_val, string *perr) = 0; + virtual void encode_json(const string& field, Formatter *f) const = 0; +}; + +class ESQueryNodeLeafVal_Str : public ESQueryNodeLeafVal { + string val; +public: + ESQueryNodeLeafVal_Str() {} + bool init(const string& str_val, string *perr) override { + val = str_val; + return true; + } + void encode_json(const string& field, Formatter *f) const override { + ::encode_json(field.c_str(), val.c_str(), f); + } +}; + +class ESQueryNodeLeafVal_Int : public ESQueryNodeLeafVal { + int64_t val{0}; +public: + ESQueryNodeLeafVal_Int() {} + bool init(const string& str_val, string *perr) override { + string err; + val = strict_strtoll(str_val.c_str(), 10, &err); + if (!err.empty()) { + *perr = string("failed to parse integer: ") + err; + return false; + } + return true; + } + void encode_json(const string& field, Formatter *f) const override { + ::encode_json(field.c_str(), val, f); + } +}; + +class ESQueryNodeLeafVal_Date : public ESQueryNodeLeafVal { + ceph::real_time val; +public: + ESQueryNodeLeafVal_Date() {} + bool init(const string& str_val, string *perr) override { + if (parse_time(str_val.c_str(), &val) < 0) { + *perr = string("failed to parse date: ") + str_val; + return false; + } + return true; + } + void encode_json(const string& field, Formatter *f) const override { + string s; + rgw_to_iso8601(val, &s); + ::encode_json(field.c_str(), s, f); + } +}; + +class ESQueryNode_Op : public ESQueryNode { +protected: + string op; + string field; + string str_val; + ESQueryNodeLeafVal *val{nullptr}; + 
ESEntityTypeMap::EntityType entity_type{ESEntityTypeMap::ES_ENTITY_NONE}; + bool allow_restricted{false}; + + bool val_from_str(string *perr) { + switch (entity_type) { + case ESEntityTypeMap::ES_ENTITY_DATE: + val = new ESQueryNodeLeafVal_Date; + break; + case ESEntityTypeMap::ES_ENTITY_INT: + val = new ESQueryNodeLeafVal_Int; + break; + default: + val = new ESQueryNodeLeafVal_Str; + } + return val->init(str_val, perr); + } + bool do_init(ESQueryNode **pnode, string *perr) { + field = compiler->unalias_field(field); + ESQueryNode *effective_node; + if (!handle_nested(&effective_node, perr)) { + return false; + } + if (!val_from_str(perr)) { + return false; + } + *pnode = effective_node; + return true; + } + +public: + ESQueryNode_Op(ESQueryCompiler *compiler) : ESQueryNode(compiler) {} + ~ESQueryNode_Op() { + delete val; + } + virtual bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + bool valid = s->pop(&op) && + s->pop(&str_val) && + s->pop(&field); + if (!valid) { + *perr = "invalid expression"; + return false; + } + return do_init(pnode, perr); + } + bool handle_nested(ESQueryNode **pnode, string *perr); + + void set_allow_restricted(bool allow) { + allow_restricted = allow; + } + + virtual void dump(Formatter *f) const override = 0; +}; + +class ESQueryNode_Op_Equal : public ESQueryNode_Op { +public: + explicit ESQueryNode_Op_Equal(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + ESQueryNode_Op_Equal(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) { + op = "=="; + field = f; + str_val = v; + } + + bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + if (op.empty()) { + return ESQueryNode_Op::init(s, pnode, perr); + } + return do_init(pnode, perr); + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("term"); + val->encode_json(field, f); + f->close_section(); + } +}; + +class ESQueryNode_Op_NotEqual : public ESQueryNode_Op { +public: + 
explicit ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) { + op = "!="; + field = f; + str_val = v; + } + + bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + if (op.empty()) { + return ESQueryNode_Op::init(s, pnode, perr); + } + return do_init(pnode, perr); + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("bool"); + f->open_object_section("must_not"); + f->open_object_section("term"); + val->encode_json(field, f); + f->close_section(); + f->close_section(); + f->close_section(); + } +}; + +class ESQueryNode_Op_Range : public ESQueryNode_Op { + string range_str; +public: + ESQueryNode_Op_Range(ESQueryCompiler *compiler, const string& rs) : ESQueryNode_Op(compiler), range_str(rs) {} + + virtual void dump(Formatter *f) const override { + f->open_object_section("range"); + f->open_object_section(field.c_str()); + val->encode_json(range_str, f); + f->close_section(); + f->close_section(); + } +}; + +class ESQueryNode_Op_Nested_Parent : public ESQueryNode_Op { +public: + ESQueryNode_Op_Nested_Parent(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + + virtual string get_custom_leaf_field_name() = 0; +}; + +template +class ESQueryNode_Op_Nested : public ESQueryNode_Op_Nested_Parent { + string name; + ESQueryNode *next; +public: + ESQueryNode_Op_Nested(ESQueryCompiler *compiler, const string& _name, ESQueryNode *_next) : ESQueryNode_Op_Nested_Parent(compiler), + name(_name), next(_next) {} + ~ESQueryNode_Op_Nested() { + delete next; + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("nested"); + string s = string("meta.custom-") + type_str(); + encode_json("path", s.c_str(), f); + f->open_object_section("query"); + f->open_object_section("bool"); + f->open_array_section("must"); + f->open_object_section("entry"); + 
f->open_object_section("match"); + string n = s + ".name"; + encode_json(n.c_str(), name.c_str(), f); + f->close_section(); + f->close_section(); + encode_json("entry", *next, f); + f->close_section(); + f->close_section(); + f->close_section(); + f->close_section(); + } + + string type_str() const; + string get_custom_leaf_field_name() override { + return string("meta.custom-") + type_str() + ".value"; + } +}; + +template<> +string ESQueryNode_Op_Nested::type_str() const { + return "string"; +} + +template<> +string ESQueryNode_Op_Nested::type_str() const { + return "int"; +} + +template<> +string ESQueryNode_Op_Nested::type_str() const { + return "date"; +} + +bool ESQueryNode_Op::handle_nested(ESQueryNode **pnode, string *perr) +{ + string field_name = field; + const string& custom_prefix = compiler->get_custom_prefix(); + if (!boost::algorithm::starts_with(field_name, custom_prefix)) { + *pnode = this; + auto m = compiler->get_generic_type_map(); + if (m) { + bool found = m->find(field_name, &entity_type) && + (allow_restricted || !compiler->is_restricted(field_name)); + if (!found) { + *perr = string("unexpected generic field '") + field_name + "'"; + } + return found; + } + *perr = "query parser does not support generic types"; + return false; + } + + field_name = field_name.substr(custom_prefix.size()); + auto m = compiler->get_custom_type_map(); + if (m) { + m->find(field_name, &entity_type); + /* ignoring returned bool, for now just treat it as string */ + } + + ESQueryNode_Op_Nested_Parent *new_node; + switch (entity_type) { + case ESEntityTypeMap::ES_ENTITY_INT: + new_node = new ESQueryNode_Op_Nested(compiler, field_name, this); + break; + case ESEntityTypeMap::ES_ENTITY_DATE: + new_node = new ESQueryNode_Op_Nested(compiler, field_name, this); + break; + default: + new_node = new ESQueryNode_Op_Nested(compiler, field_name, this); + } + + field = new_node->get_custom_leaf_field_name(); + *pnode = new_node; + + return true; +} + +static bool 
is_bool_op(const string& str) +{ + return (str == "or" || str == "and"); +} + +static bool alloc_node(ESQueryCompiler *compiler, ESQueryStack *s, ESQueryNode **pnode, string *perr) +{ + string op; + bool valid = s->peek(&op); + if (!valid) { + *perr = "incorrect expression"; + return false; + } + + ESQueryNode *node; + + if (is_bool_op(op)) { + node = new ESQueryNode_Bool(compiler); + } else if (op == "==") { + node = new ESQueryNode_Op_Equal(compiler); + } else if (op == "!=") { + node = new ESQueryNode_Op_NotEqual(compiler); + } else { + static map range_op_map = { + { "<", "lt"}, + { "<=", "lte"}, + { ">=", "gte"}, + { ">", "gt"}, + }; + + auto iter = range_op_map.find(op); + if (iter == range_op_map.end()) { + *perr = string("invalid operator: ") + op; + return false; + } + + node = new ESQueryNode_Op_Range(compiler, iter->second); + } + + if (!node->init(s, pnode, perr)) { + delete node; + return false; + } + return true; +} + + +bool is_key_char(char c) +{ + switch (c) { + case '(': + case ')': + case '<': + case '>': + case '!': + case '@': + case ',': + case ';': + case ':': + case '\\': + case '"': + case '/': + case '[': + case ']': + case '?': + case '=': + case '{': + case '}': + case ' ': + case '\t': + return false; + }; + return (isascii(c) > 0); +} + +static bool is_op_char(char c) +{ + switch (c) { + case '!': + case '<': + case '=': + case '>': + return true; + }; + return false; +} + +static bool is_val_char(char c) +{ + if (isspace(c)) { + return false; + } + return (c != ')'); +} + +void ESInfixQueryParser::skip_whitespace(const char *str, int size, int& pos) { + while (pos < size && isspace(str[pos])) { + ++pos; + } +} + +bool ESInfixQueryParser::get_next_token(bool (*filter)(char)) { + skip_whitespace(str, size, pos); + int token_start = pos; + while (pos < size && filter(str[pos])) { + ++pos; + } + if (pos == token_start) { + return false; + } + string token = string(str + token_start, pos - token_start); + args.push_back(token); + return 
true; +} + +bool ESInfixQueryParser::parse_condition() { + /* + * condition: + * + * whereas key: needs to conform to http header field restrictions + * operator: one of the following: < <= == != >= > + * val: ascii, terminated by either space or ')' (or end of string) + */ + + /* parse key */ + bool valid = get_next_token(is_key_char) && + get_next_token(is_op_char) && + get_next_token(is_val_char); + + if (!valid) { + return false; + } + + return true; +} + +bool ESInfixQueryParser::parse_and_or() { + skip_whitespace(str, size, pos); + if (pos + 3 <= size && strncmp(str + pos, "and", 3) == 0) { + pos += 3; + args.push_back("and"); + return true; + } + + if (pos + 2 <= size && strncmp(str + pos, "or", 2) == 0) { + pos += 2; + args.push_back("or"); + return true; + } + + return false; +} + +bool ESInfixQueryParser::parse_specific_char(const char *pchar) { + skip_whitespace(str, size, pos); + if (pos >= size) { + return false; + } + if (str[pos] != *pchar) { + return false; + } + + args.push_back(pchar); + ++pos; + return true; +} + +bool ESInfixQueryParser::parse_open_bracket() { + return parse_specific_char("("); +} + +bool ESInfixQueryParser::parse_close_bracket() { + return parse_specific_char(")"); +} + +bool ESInfixQueryParser::parse(list *result) { + /* + * expression: [(][[and/or]][)][and/or]... 
+ */ + + while (pos < size) { + parse_open_bracket(); + if (!parse_condition()) { + return false; + } + parse_close_bracket(); + parse_and_or(); + } + + result->swap(args); + + return true; +} + +bool ESQueryCompiler::convert(list& infix, string *perr) { + list prefix; + if (!infix_to_prefix(infix, &prefix)) { + *perr = "invalid query"; + return false; + } + stack.assign(prefix); + if (!alloc_node(this, &stack, &query_root, perr)) { + return false; + } + if (!stack.done()) { + *perr = "invalid query"; + return false; + } + return true; +} + +ESQueryCompiler::~ESQueryCompiler() { + delete query_root; +} + +bool ESQueryCompiler::compile(string *perr) { + list infix; + if (!parser.parse(&infix)) { + *perr = "failed to parse query"; + return false; + } + + if (!convert(infix, perr)) { + return false; + } + + for (auto& c : eq_conds) { + ESQueryNode_Op_Equal *eq_node = new ESQueryNode_Op_Equal(this, c.first, c.second); + eq_node->set_allow_restricted(true); /* can access restricted fields */ + ESQueryNode *effective_node; + if (!eq_node->init(nullptr, &effective_node, perr)) { + delete eq_node; + return false; + } + query_root = new ESQueryNode_Bool(this, "and", effective_node, query_root); + } + + return true; +} + +void ESQueryCompiler::dump(Formatter *f) const { + encode_json("query", *query_root, f); +} + diff --git a/src/rgw/rgw_es_query.h b/src/rgw/rgw_es_query.h new file mode 100644 index 00000000..b8421f4d --- /dev/null +++ b/src/rgw/rgw_es_query.h @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_ES_QUERY_H +#define CEPH_RGW_ES_QUERY_H + +#include "rgw_string.h" + +class ESQueryStack { + list l; + list::iterator iter; + +public: + explicit ESQueryStack(list& src) { + assign(src); + } + + ESQueryStack() {} + + void assign(list& src) { + l.swap(src); + iter = l.begin(); + } + + bool peek(string *dest) { + if (done()) { + return false; + } + *dest = *iter; + return true; + } + 
+ bool pop(string *dest) { + bool valid = peek(dest); + if (!valid) { + return false; + } + ++iter; + return true; + } + + bool done() { + return (iter == l.end()); + } +}; + +class ESInfixQueryParser { + string query; + int size; + const char *str; + int pos{0}; + list args; + + void skip_whitespace(const char *str, int size, int& pos); + bool get_next_token(bool (*filter)(char)); + + bool parse_condition(); + bool parse_and_or(); + bool parse_specific_char(const char *pchar); + bool parse_open_bracket(); + bool parse_close_bracket(); + +public: + explicit ESInfixQueryParser(const string& _query) : query(_query), size(query.size()), str(query.c_str()) {} + bool parse(list *result); +}; + +class ESQueryNode; + +struct ESEntityTypeMap { + enum EntityType { + ES_ENTITY_NONE = 0, + ES_ENTITY_STR = 1, + ES_ENTITY_INT = 2, + ES_ENTITY_DATE = 3, + }; + + map m; + + explicit ESEntityTypeMap(map& _m) : m(_m) {} + + bool find(const string& entity, EntityType *ptype) { + auto i = m.find(entity); + if (i != m.end()) { + *ptype = i->second; + return true; + } + + *ptype = ES_ENTITY_NONE; + return false; + } +}; + +class ESQueryCompiler { + ESInfixQueryParser parser; + ESQueryStack stack; + ESQueryNode *query_root{nullptr}; + + string custom_prefix; + + bool convert(list& infix, string *perr); + + list > eq_conds; + + ESEntityTypeMap *generic_type_map{nullptr}; + ESEntityTypeMap *custom_type_map{nullptr}; + + map *field_aliases = nullptr; + set *restricted_fields = nullptr; + +public: + ESQueryCompiler(const string& query, list > *prepend_eq_conds, const string& _custom_prefix) : parser(query), custom_prefix(_custom_prefix) { + if (prepend_eq_conds) { + eq_conds = std::move(*prepend_eq_conds); + } + } + ~ESQueryCompiler(); + + bool compile(string *perr); + void dump(Formatter *f) const; + + void set_generic_type_map(ESEntityTypeMap *entity_map) { + generic_type_map = entity_map; + } + + ESEntityTypeMap *get_generic_type_map() { + return generic_type_map; + } + const string& 
get_custom_prefix() { return custom_prefix; } + + void set_custom_type_map(ESEntityTypeMap *entity_map) { + custom_type_map = entity_map; + } + + ESEntityTypeMap *get_custom_type_map() { + return custom_type_map; + } + + void set_field_aliases(map *fa) { + field_aliases = fa; + } + + string unalias_field(const string& field) { + if (!field_aliases) { + return field; + } + auto i = field_aliases->find(field); + if (i == field_aliases->end()) { + return field; + } + + return i->second; + } + + void set_restricted_fields(set *rf) { + restricted_fields = rf; + } + + bool is_restricted(const string& f) { + return (restricted_fields && restricted_fields->find(f) != restricted_fields->end()); + } +}; + + +#endif diff --git a/src/rgw/rgw_etag_verifier.cc b/src/rgw/rgw_etag_verifier.cc new file mode 100644 index 00000000..6a9d5cc3 --- /dev/null +++ b/src/rgw/rgw_etag_verifier.cc @@ -0,0 +1,185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_etag_verifier.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::putobj { + +int create_etag_verifier(CephContext* cct, DataProcessor* filter, + const bufferlist& manifest_bl, + const std::optional& compression, + etag_verifier_ptr& verifier) +{ + RGWObjManifest manifest; + + try { + auto miter = manifest_bl.cbegin(); + decode(manifest, miter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl; + return -EIO; + } + + RGWObjManifestRule rule; + bool found = manifest.get_rule(0, &rule); + if (!found) { + lderr(cct) << "ERROR: manifest->get_rule() could not find rule" << dendl; + return -EIO; + } + + if (rule.start_part_num == 0) { + /* Atomic object */ + verifier.emplace(cct, filter); + return 0; + } + + uint64_t cur_part_ofs = UINT64_MAX; + std::vector part_ofs; + + /* + * We must store the offset of each part to calculate the ETAGs for each + * MPU part. 
These part ETags then become the input for the MPU object + * Etag. + */ + for (auto mi = manifest.obj_begin(); mi != manifest.obj_end(); ++mi) { + if (cur_part_ofs == mi.get_part_ofs()) + continue; + cur_part_ofs = mi.get_part_ofs(); + ldout(cct, 20) << "MPU Part offset:" << cur_part_ofs << dendl; + part_ofs.push_back(cur_part_ofs); + } + + if (compression) { + // if the source object was compressed, the manifest is storing + // compressed part offsets. transform the compressed offsets back to + // their original offsets by finding the first block of each part + const auto& blocks = compression->blocks; + auto block = blocks.begin(); + for (auto& ofs : part_ofs) { + // find the compression_block with new_ofs == ofs + constexpr auto less = [] (const compression_block& block, uint64_t ofs) { + return block.new_ofs < ofs; + }; + block = std::lower_bound(block, blocks.end(), ofs, less); + if (block == blocks.end() || block->new_ofs != ofs) { + ldout(cct, 4) << "no match for compressed offset " << ofs + << ", disabling etag verification" << dendl; + return -EIO; + } + ofs = block->old_ofs; + ldout(cct, 20) << "MPU Part uncompressed offset:" << ofs << dendl; + } + } + + verifier.emplace(cct, std::move(part_ofs), filter); + return 0; +} + +int ETagVerifier_Atomic::process(bufferlist&& in, uint64_t logical_offset) +{ + bufferlist out; + if (in.length() > 0) + hash.Update((const unsigned char *)in.c_str(), in.length()); + + return Pipe::process(std::move(in), logical_offset); +} + +void ETagVerifier_Atomic::calculate_etag() +{ + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + + /* Return early if ETag has already been calculated */ + if (!calculated_etag.empty()) + return; + + hash.Final(m); + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + calculated_etag = calc_md5; + ldout(cct, 20) << "Single part object: " << " etag:" << calculated_etag + << dendl; +} + +void ETagVerifier_MPU::process_end_of_MPU_part() +{ + 
unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char calc_md5_part[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + std::string calculated_etag_part; + + hash.Final(m); + mpu_etag_hash.Update((const unsigned char *)m, sizeof(m)); + hash.Restart(); + + if (cct->_conf->subsys.should_gather(dout_subsys, 20)) { + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5_part); + calculated_etag_part = calc_md5_part; + ldout(cct, 20) << "Part etag: " << calculated_etag_part << dendl; + } + + cur_part_index++; + next_part_index++; +} + +int ETagVerifier_MPU::process(bufferlist&& in, uint64_t logical_offset) +{ + uint64_t bl_end = in.length() + logical_offset; + + /* Handle the last MPU part */ + if (next_part_index == part_ofs.size()) { + hash.Update((const unsigned char *)in.c_str(), in.length()); + goto done; + } + + /* Incoming bufferlist spans two MPU parts. Calculate separate ETags */ + if (bl_end > part_ofs[next_part_index]) { + + uint64_t part_one_len = part_ofs[next_part_index] - logical_offset; + hash.Update((const unsigned char *)in.c_str(), part_one_len); + process_end_of_MPU_part(); + + hash.Update((const unsigned char *)in.c_str() + part_one_len, + bl_end - part_ofs[cur_part_index]); + /* + * If we've moved to the last part of the MPU, avoid usage of + * parts_ofs[next_part_index] as it will lead to our-of-range access. 
+ */ + if (next_part_index == part_ofs.size()) + goto done; + } else { + hash.Update((const unsigned char *)in.c_str(), in.length()); + } + + /* Update the MPU Etag if the current part has ended */ + if (logical_offset + in.length() + 1 == part_ofs[next_part_index]) + process_end_of_MPU_part(); + +done: + return Pipe::process(std::move(in), logical_offset); +} + +void ETagVerifier_MPU::calculate_etag() +{ + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE], mpu_m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + + /* Return early if ETag has already been calculated */ + if (!calculated_etag.empty()) + return; + + hash.Final(m); + mpu_etag_hash.Update((const unsigned char *)m, sizeof(m)); + + /* Refer RGWCompleteMultipart::execute() for ETag calculation for MPU object */ + mpu_etag_hash.Final(mpu_m); + buf_to_hex(mpu_m, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], + sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)(part_ofs.size())); + + calculated_etag = final_etag_str; + ldout(cct, 20) << "MPU calculated ETag:" << calculated_etag << dendl; +} + +} // namespace rgw::putobj diff --git a/src/rgw/rgw_etag_verifier.h b/src/rgw/rgw_etag_verifier.h new file mode 100644 index 00000000..7e2579b9 --- /dev/null +++ b/src/rgw/rgw_etag_verifier.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * RGW Etag Verifier is an RGW filter which enables the objects copied using + * multisite sync to be verified using their ETag from source i.e. the MD5 + * checksum of the object is computed at the destination and is verified to be + * identical to the ETag stored in the object HEAD at source cluster. + * + * For MPU objects, a different filter named RGWMultipartEtagFilter is applied + * which re-computes ETag using RGWObjManifest. 
This computes the ETag using the + * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag + * on the MPU parts. + */ +#ifndef CEPH_RGW_ETAG_VERIFIER_H +#define CEPH_RGW_ETAG_VERIFIER_H + +#include "rgw_putobj.h" +#include "rgw_op.h" +#include "common/static_ptr.h" + +namespace rgw::putobj { + +class ETagVerifier : public rgw::putobj::Pipe +{ +protected: + CephContext* cct; + MD5 hash; + string calculated_etag; + +public: + ETagVerifier(CephContext* cct_, rgw::putobj::DataProcessor *next) + : Pipe(next), cct(cct_) {} + + virtual void calculate_etag() = 0; + string get_calculated_etag() { return calculated_etag;} + +}; /* ETagVerifier */ + +class ETagVerifier_Atomic : public ETagVerifier +{ +public: + ETagVerifier_Atomic(CephContext* cct_, rgw::putobj::DataProcessor *next) + : ETagVerifier(cct_, next) {} + + int process(bufferlist&& data, uint64_t logical_offset) override; + void calculate_etag() override; + +}; /* ETagVerifier_Atomic */ + +class ETagVerifier_MPU : public ETagVerifier +{ + std::vector part_ofs; + int cur_part_index{0}, next_part_index{1}; + MD5 mpu_etag_hash; + + void process_end_of_MPU_part(); + +public: + ETagVerifier_MPU(CephContext* cct, + std::vector part_ofs, + rgw::putobj::DataProcessor *next) + : ETagVerifier(cct, next), + part_ofs(std::move(part_ofs)) + {} + + int process(bufferlist&& data, uint64_t logical_offset) override; + void calculate_etag() override; + +}; /* ETagVerifier_MPU */ + +constexpr auto max_etag_verifier_size = std::max( + sizeof(ETagVerifier_Atomic), + sizeof(ETagVerifier_MPU) + ); +using etag_verifier_ptr = ceph::static_ptr; + +int create_etag_verifier(CephContext* cct, DataProcessor* next, + const bufferlist& manifest_bl, + const std::optional& compression, + etag_verifier_ptr& verifier); + +} // namespace rgw::putobj + +#endif /* CEPH_RGW_ETAG_VERIFIER_H */ diff --git a/src/rgw/rgw_fcgi.cc b/src/rgw/rgw_fcgi.cc new file mode 100644 index 00000000..a52ea509 --- /dev/null +++ b/src/rgw/rgw_fcgi.cc 
@@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_fcgi.h" +#include "acconfig.h" + +size_t RGWFCGX::write_data(const char* const buf, const size_t len) +{ + /* According to the documentation of FCGX_PutStr if there is no error + * (signalised by negative return value), then always ret == len. */ + const auto ret = FCGX_PutStr(buf, len, fcgx->out); + if (ret < 0) { + throw rgw::io::Exception(-ret, std::system_category()); + } + return ret; +} + +size_t RGWFCGX::read_data(char* const buf, const size_t len) +{ + const auto ret = FCGX_GetStr(buf, len, fcgx->in); + if (ret < 0) { + throw rgw::io::Exception(-ret, std::system_category()); + } + return ret; +} + +void RGWFCGX::flush() +{ + txbuf.pubsync(); + FCGX_FFlush(fcgx->out); +} + +int RGWFCGX::init_env(CephContext* const cct) +{ + env.init(cct, (char **)fcgx->envp); + return 0; +} + +size_t RGWFCGX::send_status(const int status, const char* const status_name) +{ + static constexpr size_t STATUS_BUF_SIZE = 128; + + char statusbuf[STATUS_BUF_SIZE]; + const auto statuslen = snprintf(statusbuf, sizeof(statusbuf), + "Status: %d %s\r\n", status, status_name); + + return txbuf.sputn(statusbuf, statuslen); +} + +size_t RGWFCGX::send_100_continue() +{ + const auto sent = send_status(100, "Continue"); + flush(); + return sent; +} + +size_t RGWFCGX::send_header(const boost::string_ref& name, + const boost::string_ref& value) +{ + static constexpr char HEADER_SEP[] = ": "; + static constexpr char HEADER_END[] = "\r\n"; + + size_t sent = 0; + + sent += txbuf.sputn(name.data(), name.length()); + sent += txbuf.sputn(HEADER_SEP, sizeof(HEADER_SEP) - 1); + sent += txbuf.sputn(value.data(), value.length()); + sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + return sent; +} + +size_t RGWFCGX::send_content_length(const uint64_t len) +{ + static constexpr size_t CONLEN_BUF_SIZE = 128; + + char sizebuf[CONLEN_BUF_SIZE]; + const auto sizelen = 
snprintf(sizebuf, sizeof(sizebuf), + "Content-Length: %" PRIu64 "\r\n", len); + + return txbuf.sputn(sizebuf, sizelen); +} + +size_t RGWFCGX::complete_header() +{ + static constexpr char HEADER_END[] = "\r\n"; + const size_t sent = txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1); + + flush(); + return sent; +} diff --git a/src/rgw/rgw_fcgi.h b/src/rgw/rgw_fcgi.h new file mode 100644 index 00000000..7f8e61a3 --- /dev/null +++ b/src/rgw/rgw_fcgi.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_FCGI_H +#define CEPH_RGW_FCGI_H + +#include "acconfig.h" +#include + +#include "rgw_client_io.h" + +struct FCGX_Request; + +class RGWFCGX : public rgw::io::RestfulClient, + public rgw::io::BuffererSink { + FCGX_Request *fcgx; + RGWEnv env; + + rgw::io::StaticOutputBufferer<> txbuf; + + size_t read_data(char* buf, size_t len); + size_t write_data(const char* buf, size_t len) override; + +public: + explicit RGWFCGX(FCGX_Request* const fcgx) + : fcgx(fcgx), + txbuf(*this) { + } + + int init_env(CephContext* cct) override; + size_t send_status(int status, const char* status_name) override; + size_t send_100_continue() override; + size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) override; + size_t send_content_length(uint64_t len) override; + size_t complete_header() override; + + size_t recv_body(char* buf, size_t max) override { + return read_data(buf, max); + } + + size_t send_body(const char* buf, size_t len) override { + return write_data(buf, len); + } + + void flush() override; + + RGWEnv& get_env() noexcept override { + return env; + } + + size_t complete_request() override { + return 0; + } +}; + +#endif diff --git a/src/rgw/rgw_fcgi_process.cc b/src/rgw/rgw_fcgi_process.cc new file mode 100644 index 00000000..757fd3ea --- /dev/null +++ b/src/rgw/rgw_fcgi_process.cc @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" + +#include "rgw_rados.h" +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_loadgen.h" +#include "rgw_client_io.h" +#include "rgw_client_io_filters.h" + +#define dout_subsys ceph_subsys_rgw + +void RGWFCGXProcess::run() +{ + string socket_path; + string socket_port; + string socket_host; + int socket_backlog; + + conf->get_val("socket_path", "", &socket_path); + conf->get_val("socket_port", g_conf()->rgw_port, &socket_port); + conf->get_val("socket_host", g_conf()->rgw_host, &socket_host); + socket_backlog = g_conf()->rgw_fcgi_socket_backlog; + + if (socket_path.empty() && socket_port.empty() && socket_host.empty()) { + socket_path = g_conf()->rgw_socket_path; + if (socket_path.empty()) { + dout(0) << "ERROR: no socket server point defined, cannot " + "start fcgi frontend" << dendl; + return; + } + } + + if (!socket_path.empty()) { + string path_str = socket_path; + + /* this is necessary, as FCGX_OpenSocket might not return an + * error, but rather ungracefully exit */ + int fd = open(path_str.c_str(), O_CREAT, 0644); + if (fd < 0) { + int err = errno; + /* ENXIO is actually expected, we'll get that if we try to open + * a unix domain socket */ + if (err != ENXIO) { + dout(0) << "ERROR: cannot create socket: path=" << path_str + << " error=" << cpp_strerror(err) << dendl; + return; + } + } else { + close(fd); + } + + const char *path = path_str.c_str(); + sock_fd = FCGX_OpenSocket(path, socket_backlog); + if (sock_fd < 0) { + dout(0) << "ERROR: FCGX_OpenSocket (" << path << ") returned " + << sock_fd << dendl; + return; + } + if (chmod(path, 0777) < 0) { + dout(0) << "WARNING: couldn't set permissions on unix domain socket" + << dendl; + } + } else if (!socket_port.empty()) { + string bind = socket_host + ":" + socket_port; + sock_fd = FCGX_OpenSocket(bind.c_str(), 
socket_backlog); + if (sock_fd < 0) { + dout(0) << "ERROR: FCGX_OpenSocket (" << bind.c_str() << ") returned " + << sock_fd << dendl; + return; + } + } + + m_tp.start(); + + FCGX_Request fcgx_reqs[max_connections]; + + QueueRing qr(max_connections); + for (int i = 0; i < max_connections; i++) { + FCGX_Request* fcgx = &fcgx_reqs[i]; + FCGX_InitRequest(fcgx, sock_fd, 0); + qr.enqueue(fcgx); + } + + for (;;) { + RGWFCGXRequest* req = new RGWFCGXRequest(store->get_new_req_id(), &qr); + dout(10) << "allocated request req=" << hex << req << dec << dendl; + req_throttle.get(1); + int ret = FCGX_Accept_r(req->fcgx); + if (ret < 0) { + delete req; + dout(0) << "ERROR: FCGX_Accept_r returned " << ret << dendl; + req_throttle.put(1); + break; + } + req_wq.queue(req); + } + + m_tp.drain(&req_wq); + m_tp.stop(); + + dout(20) << "cleaning up fcgx connections" << dendl; + + for (int i = 0; i < max_connections; i++) { + FCGX_Finish_r(&fcgx_reqs[i]); + } +} /* RGWFCGXProcess::run */ + +void RGWFCGXProcess::handle_request(RGWRequest* r) +{ + RGWFCGXRequest* const req = static_cast(r); + + RGWFCGX fcgxfe(req->fcgx); + auto real_client_io = rgw::io::add_reordering( + rgw::io::add_buffering(cct, + rgw::io::add_chunking( + &fcgxfe))); + RGWRestfulIO client_io(cct, &real_client_io); + + + int ret = process_request(store, rest, req, uri_prefix, + *auth_registry, &client_io, olog, + null_yield, nullptr); + if (ret < 0) { + /* we don't really care about return code */ + dout(20) << "process_request() returned " << ret << dendl; + } + + FCGX_Finish_r(req->fcgx); + + delete req; +} /* RGWFCGXProcess::handle_request */ diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc new file mode 100644 index 00000000..5ccc01c3 --- /dev/null +++ b/src/rgw/rgw_file.cc @@ -0,0 +1,2436 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "include/rados/rgw_file.h" + +#include +#include + +#include "rgw_lib.h" 
+#include "rgw_rados.h" +#include "rgw_resolve.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_rest_user.h" +#include "rgw_rest_s3.h" +#include "rgw_os_lib.h" +#include "rgw_auth_s3.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_zone.h" +#include "rgw_file.h" +#include "rgw_lib_frontend.h" +#include "rgw_perf_counters.h" +#include "common/errno.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace rgw; + +namespace rgw { + + extern RGWLib rgwlib; + + const string RGWFileHandle::root_name = "/"; + + std::atomic RGWLibFS::fs_inst_counter; + + uint32_t RGWLibFS::write_completion_interval_s = 10; + + ceph::timer RGWLibFS::write_timer{ + ceph::construct_suspended}; + + inline int valid_fs_bucket_name(const string& name) { + int rc = valid_s3_bucket_name(name, false /* relaxed */); + if (rc != 0) { + if (name.size() > 255) + return -ENAMETOOLONG; + return -EINVAL; + } + return 0; + } + + inline int valid_fs_object_name(const string& name) { + int rc = valid_s3_object_name(name); + if (rc != 0) { + if (name.size() > 1024) + return -ENAMETOOLONG; + return -EINVAL; + } + return 0; + } + + LookupFHResult RGWLibFS::stat_bucket(RGWFileHandle* parent, const char *path, + RGWLibFS::BucketStats& bs, + uint32_t flags) + { + LookupFHResult fhr{nullptr, 0}; + std::string bucket_name{path}; + RGWStatBucketRequest req(cct, get_user(), bucket_name, bs); + + int rc = rgwlib.get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0) && + (req.matched())) { + fhr = lookup_fh(parent, path, + (flags & RGWFileHandle::FLAG_LOCKED)| + RGWFileHandle::FLAG_CREATE| + RGWFileHandle::FLAG_BUCKET); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + if (! 
(flags & RGWFileHandle::FLAG_LOCKED)) { + rgw_fh->mtx.lock(); + } + rgw_fh->set_times(req.get_ctime()); + /* restore attributes */ + auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1); + auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1); + if (ux_key && ux_attrs) { + DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs); + if (get<0>(dar) || get<1>(dar)) { + update_fh(rgw_fh); + } + } + if (! (flags & RGWFileHandle::FLAG_LOCKED)) { + rgw_fh->mtx.unlock(); + } + } + } + return fhr; + } + + LookupFHResult RGWLibFS::fake_leaf(RGWFileHandle* parent, + const char *path, + enum rgw_fh_type type, + struct stat *st, uint32_t st_mask, + uint32_t flags) + { + /* synthesize a minimal handle from parent, path, type, and st */ + using std::get; + + flags |= RGWFileHandle::FLAG_CREATE; + + switch (type) { + case RGW_FS_TYPE_DIRECTORY: + flags |= RGWFileHandle::FLAG_DIRECTORY; + break; + default: + /* file */ + break; + }; + + LookupFHResult fhr = lookup_fh(parent, path, flags); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + if (st) { + lock_guard guard(rgw_fh->mtx); + if (st_mask & RGW_SETATTR_SIZE) { + rgw_fh->set_size(st->st_size); + } + if (st_mask & RGW_SETATTR_MTIME) { + rgw_fh->set_times(st->st_mtim); + } + } /* st */ + } /* rgw_fh */ + return fhr; + } /* RGWLibFS::fake_leaf */ + + LookupFHResult RGWLibFS::stat_leaf(RGWFileHandle* parent, + const char *path, + enum rgw_fh_type type, + uint32_t flags) + { + /* find either-of , , only one of + * which should exist; atomicity? 
*/ + using std::get; + + LookupFHResult fhr{nullptr, 0}; + + /* XXX the need for two round-trip operations to identify file or + * directory leaf objects is unnecessary--the current proposed + * mechanism to avoid this is to store leaf object names with an + * object locator w/o trailing slash */ + + std::string obj_path = parent->format_child_name(path, false); + + for (auto ix : { 0, 1, 2 }) { /* probes run in order; cases 0 and 1 leave via goto done on a hit */ + switch (ix) { + case 0: + { + /* type hint */ + if (type == RGW_FS_TYPE_DIRECTORY) + continue; + + RGWStatObjRequest req(cct, get_user(), + parent->bucket_name(), obj_path, + RGWStatObjRequest::FLAG_NONE); + int rc = rgwlib.get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + fhr = lookup_fh(parent, path, RGWFileHandle::FLAG_CREATE); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_size(req.get_size()); + rgw_fh->set_times(req.get_mtime()); + /* restore attributes */ + auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1); + auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1); + rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG))); + rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL))); + if (ux_key && ux_attrs) { + DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs); + if (get<0>(dar) || get<1>(dar)) { + update_fh(rgw_fh); + } + } + } + goto done; + } + } + break; + case 1: + { + /* try dir form */ + /* type hint */ + if (type == RGW_FS_TYPE_FILE) + continue; + + obj_path += "/"; + RGWStatObjRequest req(cct, get_user(), + parent->bucket_name(), obj_path, + RGWStatObjRequest::FLAG_NONE); + int rc = rgwlib.get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + fhr = lookup_fh(parent, path, RGWFileHandle::FLAG_DIRECTORY); + if (get<0>(fhr)) { + RGWFileHandle* rgw_fh = get<0>(fhr); + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_size(req.get_size()); + rgw_fh->set_times(req.get_mtime()); + /* restore attributes */ + auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1); + auto ux_attrs = 
req.get_attr(RGW_ATTR_UNIX1); + rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG))); + rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL))); + if (ux_key && ux_attrs) { + DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs); + if (get<0>(dar) || get<1>(dar)) { + update_fh(rgw_fh); + } + } + } + goto done; + } + } + break; + case 2: + { + std::string object_name{path}; + RGWStatLeafRequest req(cct, get_user(), parent, object_name); + int rc = rgwlib.get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + if (req.matched) { + /* we need rgw object's key name equal to file name, if + * not return NULL */ + if ((flags & RGWFileHandle::FLAG_EXACT_MATCH) && + !req.exact_matched) { + lsubdout(get_context(), rgw, 15) + << __func__ + << ": stat leaf not exact match file name = " + << path << dendl; + goto done; + } + fhr = lookup_fh(parent, path, + RGWFileHandle::FLAG_CREATE| + ((req.is_dir) ? + RGWFileHandle::FLAG_DIRECTORY : + RGWFileHandle::FLAG_NONE)); + /* XXX we don't have an object--in general, there need not + * be one (just a path segment in some other object). In + * actual leaf an object exists, but we'd need another round + * trip to get attrs */ + if (get<0>(fhr)) { + /* for now use the parent object's mtime */ + RGWFileHandle* rgw_fh = get<0>(fhr); + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_mtime(parent->get_mtime()); + } + } + } + } + break; + default: + /* not reached */ + break; + } + } + done: + return fhr; + } /* RGWLibFS::stat_leaf */ + + int RGWLibFS::read(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags) + { + if (! 
rgw_fh->is_file()) + return -EINVAL; + + if (rgw_fh->deleted()) + return -ESTALE; + + RGWReadRequest req(get_context(), get_user(), rgw_fh, offset, length, + buffer); + + int rc = rgwlib.get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + lock_guard guard(rgw_fh->mtx); + rgw_fh->set_atime(real_clock::to_timespec(real_clock::now())); + *bytes_read = req.nread; + } + + return rc; + } + + int RGWLibFS::readlink(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags) + { /* read a symlink object's body into buffer; mirrors RGWLibFS::read but requires is_link() */ + if (! rgw_fh->is_link()) + return -EINVAL; + + if (rgw_fh->deleted()) + return -ESTALE; + + RGWReadRequest req(get_context(), get_user(), rgw_fh, offset, length, + buffer); + + int rc = rgwlib.get_fe()->execute_req(&req); + if ((rc == 0) && + (req.get_ret() == 0)) { + lock_guard guard(rgw_fh->mtx); /* named guard: an unnamed temporary would release mtx again before the next statement */ + rgw_fh->set_atime(real_clock::to_timespec(real_clock::now())); + *bytes_read = req.nread; + } + + return rc; + } + + int RGWLibFS::unlink(RGWFileHandle* rgw_fh, const char* name, uint32_t flags) + { + int rc = 0; + BucketStats bs; + RGWFileHandle* parent = nullptr; + RGWFileHandle* bkt_fh = nullptr; + + if (unlikely(flags & RGWFileHandle::FLAG_UNLINK_THIS)) { + /* LOCKED */ + parent = rgw_fh->get_parent(); + } else { + /* atomicity */ + parent = rgw_fh; + LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_LOCK); + rgw_fh = get<0>(fhr); + /* LOCKED */ + } + + if (parent->is_root()) { + /* a bucket may have an object storing Unix attributes, check + * for and delete it */ + LookupFHResult fhr; + fhr = stat_bucket(parent, name, bs, (rgw_fh) ? + RGWFileHandle::FLAG_LOCKED : + RGWFileHandle::FLAG_NONE); + bkt_fh = get<0>(fhr); + if (unlikely(! bkt_fh)) { + /* implies !rgw_fh, so also !LOCKED */ + return -ENOENT; + } + + if (bs.num_entries > 1) { + unref(bkt_fh); /* return stat_bucket ref */ + if (likely(!! 
rgw_fh)) { /* return lock and ref from + * lookup_fh (or caller in the + * special case of + * RGWFileHandle::FLAG_UNLINK_THIS) */ + rgw_fh->mtx.unlock(); + unref(rgw_fh); + } + return -ENOTEMPTY; + } else { + /* delete object w/key "/" (uxattrs), if any */ + string oname{"/"}; + RGWDeleteObjRequest req(cct, get_user(), bkt_fh->bucket_name(), oname); + rc = rgwlib.get_fe()->execute_req(&req); + /* don't care if ENOENT */ + if (rgw_fh) { unref(bkt_fh); } else { rgw_fh = bkt_fh; rgw_fh->mtx.lock(); } /* no cached handle was found above: adopt stat_bucket's ref, LOCKED, so the common tail below never dereferences a null rgw_fh */ + } + + string bname{name}; + RGWDeleteBucketRequest req(cct, get_user(), bname); + rc = rgwlib.get_fe()->execute_req(&req); + if (! rc) { + rc = req.get_ret(); + } + } else { + /* + * leaf object + */ + if (! rgw_fh) { + /* XXX for now, perform a hard lookup to deduce the type of + * object to be deleted ("foo" vs. "foo/")--also, ensures + * atomicity at this endpoint */ + struct rgw_file_handle *fh; + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &fh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (!! rc) + return rc; + + /* rgw_fh ref+ */ + rgw_fh = get_rgwfh(fh); + rgw_fh->mtx.lock(); /* LOCKED */ + } + + std::string oname = rgw_fh->relative_object_name(); + if (rgw_fh->is_dir()) { + /* for the duration of our cache timer, trust positive + * child cache */ + if (rgw_fh->has_children()) { + rgw_fh->mtx.unlock(); + unref(rgw_fh); + return(-ENOTEMPTY); + } + oname += "/"; + } + RGWDeleteObjRequest req(cct, get_user(), parent->bucket_name(), + oname); + rc = rgwlib.get_fe()->execute_req(&req); + if (! rc) { + rc = req.get_ret(); + } + } + + /* ENOENT when raced with other s3 gateway */ + if (! rc || rc == -ENOENT) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + } + + if (! 
rc) { + real_time t = real_clock::now(); + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + } + + rgw_fh->mtx.unlock(); + unref(rgw_fh); + + return rc; + } /* RGWLibFS::unlink */ + + int RGWLibFS::rename(RGWFileHandle* src_fh, RGWFileHandle* dst_fh, + const char *_src_name, const char *_dst_name) + + { + /* XXX initial implementation: try-copy, and delete if copy + * succeeds */ + int rc = -EINVAL; /* returned as-is only when the source handle cannot be found */ + + real_time t; + + std::string src_name{_src_name}; + std::string dst_name{_dst_name}; + + /* atomicity */ + LookupFHResult fhr = lookup_fh(src_fh, _src_name, RGWFileHandle::FLAG_LOCK); + RGWFileHandle* rgw_fh = get<0>(fhr); + + /* should not happen */ + if (! rgw_fh) { + ldout(get_context(), 0) << __func__ + << " BUG no such src renaming path=" + << src_name + << dendl; + goto out; + } + + /* forbid renaming of directories (unreasonable at scale) */ + if (rgw_fh->is_dir()) { + ldout(get_context(), 12) << __func__ + << " rejecting attempt to rename directory path=" + << rgw_fh->full_object_name() + << dendl; + rc = -EPERM; + goto unlock; + } + + /* forbid renaming open files (violates intent, for now) */ + if (rgw_fh->is_open()) { + ldout(get_context(), 12) << __func__ + << " rejecting attempt to rename open file path=" + << rgw_fh->full_object_name() + << dendl; + rc = -EPERM; + goto unlock; + } + + t = real_clock::now(); + + for (int ix : {0, 1}) { + switch (ix) { + case 0: + { + RGWCopyObjRequest req(cct, get_user(), src_fh, dst_fh, src_name, + dst_name); + rc = rgwlib.get_fe()->execute_req(&req); /* assign the function-level rc (no shadowing) so a failed copy is reported instead of the initial -EINVAL */ + if ((rc != 0) || + ((rc = req.get_ret()) != 0)) { + ldout(get_context(), 1) + << __func__ + << " rename step 0 failed src=" + << src_fh->full_object_name() << " " << src_name + << " dst=" << dst_fh->full_object_name() + << " " << dst_name + << "rc " << rc + << dendl; + goto unlock; + } + ldout(get_context(), 12) + << __func__ + << " rename step 0 success src=" + << src_fh->full_object_name() << " " << src_name + << " 
dst=" << dst_fh->full_object_name() + << " " << dst_name + << " rc " << rc + << dendl; + /* update dst change id */ + dst_fh->set_times(t); + } + break; + case 1: + { + rc = this->unlink(rgw_fh /* LOCKED */, _src_name, + RGWFileHandle::FLAG_UNLINK_THIS); + /* !LOCKED, -ref */ + if (! rc) { + ldout(get_context(), 12) + << __func__ + << " rename step 1 success src=" + << src_fh->full_object_name() << " " << src_name + << " dst=" << dst_fh->full_object_name() + << " " << dst_name + << " rc " << rc + << dendl; + /* update src change id */ + src_fh->set_times(t); + } else { + ldout(get_context(), 1) + << __func__ + << " rename step 1 failed src=" + << src_fh->full_object_name() << " " << src_name + << " dst=" << dst_fh->full_object_name() + << " " << dst_name + << " rc " << rc + << dendl; + } + } + goto out; + default: + ceph_abort(); + } /* switch */ + } /* ix */ + unlock: + rgw_fh->mtx.unlock(); /* !LOCKED */ + unref(rgw_fh); /* -ref */ + + out: + return rc; + } /* RGWLibFS::rename */ + + MkObjResult RGWLibFS::mkdir(RGWFileHandle* parent, const char *name, + struct stat *st, uint32_t mask, uint32_t flags) + { + int rc, rc2; + rgw_file_handle *lfh; + + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (! rc) { + /* conflict! 
*/ + rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE); + return MkObjResult{nullptr, -EEXIST}; + } + + MkObjResult mkr{nullptr, -EINVAL}; + LookupFHResult fhr; + RGWFileHandle* rgw_fh = nullptr; + buffer::list ux_key, ux_attrs; + + fhr = lookup_fh(parent, name, + RGWFileHandle::FLAG_CREATE| + RGWFileHandle::FLAG_DIRECTORY| + RGWFileHandle::FLAG_LOCK); + rgw_fh = get<0>(fhr); + if (rgw_fh) { + rgw_fh->create_stat(st, mask); + rgw_fh->set_times(real_clock::now()); + /* save attrs */ + rgw_fh->encode_attrs(ux_key, ux_attrs); + if (st) + rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED); + get<0>(mkr) = rgw_fh; + } else { + get<1>(mkr) = -EIO; + return mkr; + } + + if (parent->is_root()) { + /* bucket */ + string bname{name}; + /* enforce S3 name restrictions */ + rc = valid_fs_bucket_name(bname); + if (rc != 0) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + rgw_fh->mtx.unlock(); + unref(rgw_fh); + get<0>(mkr) = nullptr; + get<1>(mkr) = rc; + return mkr; + } + + RGWCreateBucketRequest req(get_context(), get_user(), bname); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + } else { + /* create an object representing the directory */ + buffer::list bl; + string dir_name = parent->format_child_name(name, true); + + /* need valid S3 name (characters, length <= 1024, etc) */ + rc = valid_fs_object_name(dir_name); + if (rc != 0) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + rgw_fh->mtx.unlock(); + unref(rgw_fh); + get<0>(mkr) = nullptr; + get<1>(mkr) = rc; + return mkr; + } + + RGWPutObjRequest req(get_context(), get_user(), parent->bucket_name(), + dir_name, bl); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, 
std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + } + + if (! ((rc == 0) && + (rc2 == 0))) { + /* op failed */ + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + rgw_fh->mtx.unlock(); /* !LOCKED */ + unref(rgw_fh); + get<0>(mkr) = nullptr; + /* fixup rc */ + if (!rc) + rc = rc2; + } else { + real_time t = real_clock::now(); + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + rgw_fh->mtx.unlock(); /* !LOCKED */ + } + + get<1>(mkr) = rc; + + return mkr; + } /* RGWLibFS::mkdir */ + + MkObjResult RGWLibFS::create(RGWFileHandle* parent, const char *name, + struct stat *st, uint32_t mask, uint32_t flags) + { + int rc, rc2; + + using std::get; + + rgw_file_handle *lfh; + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (! rc) { + /* conflict! */ + rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE); + return MkObjResult{nullptr, -EEXIST}; + } + + /* expand and check name */ + std::string obj_name = parent->format_child_name(name, false); + rc = valid_fs_object_name(obj_name); + if (rc != 0) { + return MkObjResult{nullptr, rc}; + } + + /* create it */ + buffer::list bl; + RGWPutObjRequest req(cct, get_user(), parent->bucket_name(), obj_name, bl); + MkObjResult mkr{nullptr, -EINVAL}; + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + if ((rc == 0) && + (rc2 == 0)) { + /* XXX atomicity */ + LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_CREATE | + RGWFileHandle::FLAG_LOCK); + RGWFileHandle* rgw_fh = get<0>(fhr); + if (rgw_fh) { + if (get<1>(fhr) & RGWFileHandle::FLAG_CREATE) { + /* fill in stat data */ + real_time t = real_clock::now(); + rgw_fh->create_stat(st, mask); + rgw_fh->set_times(t); + + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + } + if (st) + (void) 
rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED); + + rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG))); + rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL))); + + get<0>(mkr) = rgw_fh; + rgw_fh->mtx.unlock(); + } else + rc = -EIO; + } + + get<1>(mkr) = rc; + + /* case like : quota exceed will be considered as fail too*/ + if(rc2 < 0) + get<1>(mkr) = rc2; + + return mkr; + } /* RGWLibFS::create */ + + MkObjResult RGWLibFS::symlink(RGWFileHandle* parent, const char *name, + const char* link_path, struct stat *st, uint32_t mask, uint32_t flags) + { /* create a symlink handle under parent and store link_path as the object body */ + int rc, rc2; + + using std::get; + + rgw_file_handle *lfh; + rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh, + nullptr /* st */, 0 /* mask */, + RGW_LOOKUP_FLAG_NONE); + if (! rc) { + /* conflict! */ + rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE); + return MkObjResult{nullptr, -EEXIST}; + } + + MkObjResult mkr{nullptr, -EINVAL}; + LookupFHResult fhr; + RGWFileHandle* rgw_fh = nullptr; + buffer::list ux_key, ux_attrs; + + fhr = lookup_fh(parent, name, + RGWFileHandle::FLAG_CREATE| + RGWFileHandle::FLAG_SYMBOLIC_LINK| + RGWFileHandle::FLAG_LOCK); + rgw_fh = get<0>(fhr); + if (rgw_fh) { + rgw_fh->create_stat(st, mask); + rgw_fh->set_times(real_clock::now()); + /* save attrs */ + rgw_fh->encode_attrs(ux_key, ux_attrs); + if (st) + rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED); /* mtx is held via FLAG_LOCK above; mkdir and create pass FLAG_LOCKED in the same situation */ + get<0>(mkr) = rgw_fh; + } else { + get<1>(mkr) = -EIO; + return mkr; + } + + /* need valid S3 name (characters, length <= 1024, etc) */ + rc = valid_fs_object_name(name); + if (rc != 0) { + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh, + RGWFileHandle::FHCache::FLAG_LOCK); + rgw_fh->mtx.unlock(); + unref(rgw_fh); + get<0>(mkr) = nullptr; + get<1>(mkr) = rc; + return mkr; + } + + string obj_name = std::string(name); + /* create an object whose body is the link target path */ + buffer::list bl; + + /* XXXX */ +#if 0 + bl.push_back( + buffer::create_static(len, static_cast(buffer))); +#else + + bl.push_back( + 
buffer::copy(link_path, strlen(link_path))); +#endif + + RGWPutObjRequest req(get_context(), get_user(), parent->bucket_name(), + obj_name, bl); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + if (! ((rc == 0) && + (rc2 == 0))) { + /* op failed */ + rgw_fh->flags |= RGWFileHandle::FLAG_DELETED; + rgw_fh->mtx.unlock(); /* !LOCKED */ + unref(rgw_fh); + get<0>(mkr) = nullptr; + /* fixup rc */ + if (!rc) + rc = rc2; + } else { + real_time t = real_clock::now(); + parent->set_mtime(real_clock::to_timespec(t)); + parent->set_ctime(real_clock::to_timespec(t)); + rgw_fh->mtx.unlock(); /* !LOCKED */ + } + + get<1>(mkr) = rc; + + return mkr; + } /* RGWLibFS::symlink */ + + int RGWLibFS::getattr(RGWFileHandle* rgw_fh, struct stat* st) + { + switch(rgw_fh->fh.fh_type) { + case RGW_FS_TYPE_FILE: + { + if (rgw_fh->deleted()) + return -ESTALE; + } + break; + default: + break; + }; + /* if rgw_fh is a directory, mtime will be advanced */ + return rgw_fh->stat(st); + } /* RGWLibFS::getattr */ + + int RGWLibFS::setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask, + uint32_t flags) + { + int rc, rc2; + buffer::list ux_key, ux_attrs; + buffer::list etag = rgw_fh->get_etag(); + buffer::list acls = rgw_fh->get_acls(); + + lock_guard guard(rgw_fh->mtx); + + switch(rgw_fh->fh.fh_type) { + case RGW_FS_TYPE_FILE: + { + if (rgw_fh->deleted()) + return -ESTALE; + } + break; + default: + break; + }; + + string obj_name{rgw_fh->relative_object_name()}; + + if (rgw_fh->is_dir() && + (likely(! 
rgw_fh->is_bucket()))) { + obj_name += "/"; + } + + RGWSetAttrsRequest req(cct, get_user(), rgw_fh->bucket_name(), obj_name); + + rgw_fh->create_stat(st, mask); + rgw_fh->encode_attrs(ux_key, ux_attrs); + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + req.emplace_attr(RGW_ATTR_ETAG, std::move(etag)); + req.emplace_attr(RGW_ATTR_ACL, std::move(acls)); + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + if (rc == -ENOENT) { + /* special case: materialize placeholder dir */ + buffer::list bl; + RGWPutObjRequest req(get_context(), get_user(), rgw_fh->bucket_name(), + obj_name, bl); + + rgw_fh->encode_attrs(ux_key, ux_attrs); /* because std::moved */ + + /* save attrs */ + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + } + + if ((rc != 0) || (rc2 != 0)) { + return -EIO; + } + + rgw_fh->set_ctime(real_clock::to_timespec(real_clock::now())); + + return 0; + } /* RGWLibFS::setattr */ + + /* called under rgw_fh->mtx held */ + void RGWLibFS::update_fh(RGWFileHandle *rgw_fh) + { + int rc, rc2; + string obj_name{rgw_fh->relative_object_name()}; + buffer::list ux_key, ux_attrs; + + if (rgw_fh->is_dir() && + (likely(! 
rgw_fh->is_bucket()))) { + obj_name += "/"; + } + + lsubdout(get_context(), rgw, 17) + << __func__ + << " update old versioned fh : " << obj_name + << dendl; + + RGWSetAttrsRequest req(cct, get_user(), rgw_fh->bucket_name(), obj_name); + + rgw_fh->encode_attrs(ux_key, ux_attrs); + + req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + rc = rgwlib.get_fe()->execute_req(&req); + rc2 = req.get_ret(); + + if ((rc != 0) || (rc2 != 0)) { + lsubdout(get_context(), rgw, 17) + << __func__ + << " update fh failed : " << obj_name + << dendl; + } + } /* RGWLibFS::update_fh */ + + void RGWLibFS::close() + { + state.flags |= FLAG_CLOSED; + + class ObjUnref + { + RGWLibFS* fs; + public: + explicit ObjUnref(RGWLibFS* _fs) : fs(_fs) {} + void operator()(RGWFileHandle* fh) const { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << fh->name + << " before ObjUnref refs=" << fh->get_refcnt() + << dendl; + fs->unref(fh); + } + }; + + /* force cache drain, forces objects to evict */ + fh_cache.drain(ObjUnref(this), + RGWFileHandle::FHCache::FLAG_LOCK); + rgwlib.get_fe()->get_process()->unregister_fs(this); + rele(); + } /* RGWLibFS::close */ + + inline std::ostream& operator<<(std::ostream &os, fh_key const &fhk) { + os << ""; + return os; + } + + inline std::ostream& operator<<(std::ostream &os, struct timespec const &ts) { + os << ""; + return os; + } + + std::ostream& operator<<(std::ostream &os, RGWLibFS::event const &ev) { + os << ""; + return os; + } + + void RGWLibFS::gc() + { + using std::get; + using directory = RGWFileHandle::directory; + + /* dirent invalidate timeout--basically, the upper-bound on + * inconsistency with the S3 namespace */ + auto expire_s + = get_context()->_conf->rgw_nfs_namespace_expire_secs; + + /* max events to gc in one cycle */ + uint32_t max_ev = get_context()->_conf->rgw_nfs_max_gc; + + struct timespec now, expire_ts; + event_vector ve; + bool stop = false; + std::deque &events = 
state.events; + + do { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + lsubdout(get_context(), rgw, 15) + << "GC: top of expire loop" + << " now=" << now + << " expire_s=" << expire_s + << dendl; + { + lock_guard guard(state.mtx); /* LOCKED */ + lsubdout(get_context(), rgw, 15) + << "GC: processing" + << " count=" << events.size() + << " events" + << dendl; + /* just return if no events */ + if (events.empty()) { + return; + } + uint32_t _max_ev = + (events.size() < 500) ? max_ev : (events.size() / 4); + for (uint32_t ix = 0; (ix < _max_ev) && (events.size() > 0); ++ix) { + event& ev = events.front(); + expire_ts = ev.ts; + expire_ts.tv_sec += expire_s; + if (expire_ts > now) { + stop = true; + break; + } + ve.push_back(ev); + events.pop_front(); + } + } /* anon */ + /* !LOCKED */ + for (auto& ev : ve) { + lsubdout(get_context(), rgw, 15) + << "try-expire ev: " << ev << dendl; + if (likely(ev.t == event::type::READDIR)) { + RGWFileHandle* rgw_fh = lookup_handle(ev.fhk.fh_hk); + lsubdout(get_context(), rgw, 15) + << "ev rgw_fh: " << rgw_fh << dendl; + if (rgw_fh) { + RGWFileHandle::directory* d; + if (unlikely(! 
rgw_fh->is_dir())) { + lsubdout(get_context(), rgw, 0) + << __func__ + << " BUG non-directory found with READDIR event " + << "(" << rgw_fh->bucket_name() << "," + << rgw_fh->object_name() << ")" + << dendl; + goto rele; + } + /* maybe clear state */ + d = get(&rgw_fh->variant_type); + if (d) { + struct timespec ev_ts = ev.ts; + lock_guard guard(rgw_fh->mtx); + struct timespec d_last_readdir = d->last_readdir; + if (unlikely(ev_ts < d_last_readdir)) { + /* readdir cycle in progress, don't invalidate */ + lsubdout(get_context(), rgw, 15) + << "GC: delay expiration for " + << rgw_fh->object_name() + << " ev.ts=" << ev_ts + << " last_readdir=" << d_last_readdir + << dendl; + continue; + } else { + lsubdout(get_context(), rgw, 15) + << "GC: expiring " + << rgw_fh->object_name() + << dendl; + rgw_fh->clear_state(); + rgw_fh->invalidate(); + } + } + rele: + unref(rgw_fh); + } /* rgw_fh */ + } /* event::type::READDIR */ + } /* ev */ + ve.clear(); + } while (! (stop || shutdown)); + } /* RGWLibFS::gc */ + + std::ostream& operator<<(std::ostream &os, + RGWFileHandle const &rgw_fh) + { + const auto& fhk = rgw_fh.get_key(); + const auto& fh = const_cast(rgw_fh).get_fh(); + os << "fh_type) { + case RGW_FS_TYPE_DIRECTORY: + os << "type=DIRECTORY;"; + break; + case RGW_FS_TYPE_FILE: + os << "type=FILE;"; + break; + default: + os << "type=UNKNOWN;"; + break; + }; + os << "fid=" << fhk.fh_hk.bucket << ":" << fhk.fh_hk.object << ";"; + os << "name=" << rgw_fh.object_name() << ";"; + os << "refcnt=" << rgw_fh.get_refcnt() << ";"; + os << ">"; + return os; + } + + RGWFileHandle::~RGWFileHandle() { + /* !recycle case, handle may STILL be in handle table, BUT + * the partition lock is not held in this path */ + if (fh_hook.is_linked()) { + fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK); + } + /* cond-unref parent */ + if (parent && (! 
parent->is_mount())) { + /* safe because if parent->unref causes its deletion, + * there are a) by refcnt, no other objects/paths pointing + * to it and b) by the semantics of valid iteration of + * fh_lru (observed, e.g., by cohort_lru::drain()) + * no unsafe iterators reaching it either--n.b., this constraint + * is binding oncode which may in future attempt to e.g., + * cause the eviction of objects in LRU order */ + (void) get_fs()->unref(parent); + } + } + + fh_key RGWFileHandle::make_fhk(const std::string& name) + { + std::string tenant = get_fs()->get_user()->user_id.to_str(); + if (depth == 0) { + /* S3 bucket -- assert mount-at-bucket case reaches here */ + return fh_key(name, name, tenant); + } else { + std::string key_name = make_key_name(name.c_str()); + return fh_key(fhk.fh_hk.bucket, key_name.c_str(), tenant); + } + } + + void RGWFileHandle::encode_attrs(ceph::buffer::list& ux_key1, + ceph::buffer::list& ux_attrs1) + { + using ceph::encode; + fh_key fhk(this->fh.fh_hk); + encode(fhk, ux_key1); + encode(*this, ux_attrs1); + } /* RGWFileHandle::encode_attrs */ + + DecodeAttrsResult RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1, + const ceph::buffer::list* ux_attrs1) + { + using ceph::decode; + DecodeAttrsResult dar { false, false }; + fh_key fhk; + auto bl_iter_key1 = ux_key1->cbegin(); + decode(fhk, bl_iter_key1); + get<0>(dar) = true; + + auto bl_iter_unix1 = ux_attrs1->cbegin(); + decode(*this, bl_iter_unix1); + if (this->state.version < 2) { + get<1>(dar) = true; + } + + return dar; + } /* RGWFileHandle::decode_attrs */ + + bool RGWFileHandle::reclaim(const cohort::lru::ObjectFactory* newobj_fac) { + lsubdout(fs->get_context(), rgw, 17) + << __func__ << " " << *this + << dendl; + auto factory = dynamic_cast(newobj_fac); + if (factory == nullptr) { + return false; + } + /* make sure the reclaiming object is the same partiton with newobject factory, + * then we can recycle the object, and replace with newobject */ + if 
(!fs->fh_cache.is_same_partition(factory->fhk.fh_hk.object, fh.fh_hk.object)) { + return false; + } + /* in the non-delete case, handle may still be in handle table */ + if (fh_hook.is_linked()) { + /* in this case, we are being called from a context which holds + * the partition lock */ + fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_NONE); + } + return true; + } /* RGWFileHandle::reclaim */ + + bool RGWFileHandle::has_children() const + { + if (unlikely(! is_dir())) + return false; + + RGWRMdirCheck req(fs->get_context(), fs->get_user(), this); + int rc = rgwlib.get_fe()->execute_req(&req); + if (! rc) { + return req.valid && req.has_children; + } + + return false; + } + + std::ostream& operator<<(std::ostream &os, + RGWFileHandle::readdir_offset const &offset) + { + using boost::get; + if (unlikely(!! get(&offset))) { + uint64_t* ioff = get(offset); + os << *ioff; + } + else + os << get(offset); + return os; + } + + int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg, + readdir_offset offset, + bool *eof, uint32_t flags) + { + using event = RGWLibFS::event; + using boost::get; + int rc = 0; + struct timespec now; + CephContext* cct = fs->get_context(); + + lsubdout(cct, rgw, 10) + << __func__ << " readdir called on " + << object_name() + << dendl; + + directory* d = get(&variant_type); + if (d) { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ + lock_guard guard(mtx); + d->last_readdir = now; + } + + bool initial_off; + char* mk{nullptr}; + + if (likely(!! get(&offset))) { + mk = const_cast(get(offset)); + initial_off = !mk; + } else { + initial_off = (*get(offset) == 0); + } + + if (is_root()) { + RGWListBucketsRequest req(cct, fs->get_user(), this, rcb, cb_arg, + offset); + rc = rgwlib.get_fe()->execute_req(&req); + if (! 
rc) { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ + lock_guard guard(mtx); + state.atime = now; + if (initial_off) + set_nlink(2); + inc_nlink(req.d_count); + *eof = req.eof(); + } + } else { + RGWReaddirRequest req(cct, fs->get_user(), this, rcb, cb_arg, offset); + rc = rgwlib.get_fe()->execute_req(&req); + if (! rc) { + (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ + lock_guard guard(mtx); + state.atime = now; + if (initial_off) + set_nlink(2); + inc_nlink(req.d_count); + *eof = req.eof(); + } + } + + event ev(event::type::READDIR, get_key(), state.atime); + lock_guard sguard(fs->state.mtx); + fs->state.push_event(ev); + + lsubdout(fs->get_context(), rgw, 15) + << __func__ + << " final link count=" << state.nlink + << dendl; + + return rc; + } /* RGWFileHandle::readdir */ + + int RGWFileHandle::write(uint64_t off, size_t len, size_t *bytes_written, + void *buffer) + { + using std::get; + using WriteCompletion = RGWLibFS::WriteCompletion; + + lock_guard guard(mtx); + + int rc = 0; + + file* f = get(&variant_type); + if (! f) + return -EISDIR; + + if (deleted()) { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << " write attempted on deleted object " + << this->object_name() + << dendl; + /* zap write transaction, if any */ + if (f->write_req) { + delete f->write_req; + f->write_req = nullptr; + } + return -ESTALE; + } + + if (! 
f->write_req) { + /* guard--we do not support (e.g., COW-backed) partial writes */ + if (off != 0) { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << " " << object_name() + << " non-0 initial write position " << off + << " (mounting with -o sync required)" + << dendl; + return -EIO; + } + + /* start */ + std::string object_name = relative_object_name(); + f->write_req = + new RGWWriteRequest(fs->get_context(), fs->get_user(), this, + bucket_name(), object_name); + rc = rgwlib.get_fe()->start_req(f->write_req); + if (rc < 0) { + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << this->object_name() + << " write start failed " << off + << " (" << rc << ")" + << dendl; + /* zap failed write transaction */ + delete f->write_req; + f->write_req = nullptr; + return -EIO; + } else { + if (stateless_open()) { + /* start write timer */ + f->write_req->timer_id = + RGWLibFS::write_timer.add_event( + std::chrono::seconds(RGWLibFS::write_completion_interval_s), + WriteCompletion(*this)); + } + } + } + + int overlap = 0; + if ((static_cast(off) < f->write_req->real_ofs) && + ((f->write_req->real_ofs - off) <= len)) { + overlap = f->write_req->real_ofs - off; + off = f->write_req->real_ofs; + buffer = static_cast(buffer) + overlap; + len -= overlap; + } + + buffer::list bl; + /* XXXX */ +#if 0 + bl.push_back( + buffer::create_static(len, static_cast(buffer))); +#else + bl.push_back( + buffer::copy(static_cast(buffer), len)); +#endif + + f->write_req->put_data(off, bl); + rc = f->write_req->exec_continue(); + + if (rc == 0) { + size_t min_size = off + len; + if (min_size > get_size()) + set_size(min_size); + if (stateless_open()) { + /* bump write timer */ + RGWLibFS::write_timer.adjust_event( + f->write_req->timer_id, std::chrono::seconds(10)); + } + } else { + /* continuation failed (e.g., non-contiguous write position) */ + lsubdout(fs->get_context(), rgw, 5) + << __func__ + << object_name() + << " failed write at position " << off + << " (fails write transaction) 
" + << dendl; + /* zap failed write transaction */ + delete f->write_req; + f->write_req = nullptr; + rc = -EIO; + } + + *bytes_written = (rc == 0) ? (len + overlap) : 0; + return rc; + } /* RGWFileHandle::write */ + + int RGWFileHandle::write_finish(uint32_t flags) + { + unique_lock guard{mtx, std::defer_lock}; + int rc = 0; + + if (! (flags & FLAG_LOCKED)) { + guard.lock(); + } + + file* f = get(&variant_type); + if (f && (f->write_req)) { + lsubdout(fs->get_context(), rgw, 10) + << __func__ + << " finishing write trans on " << object_name() + << dendl; + rc = rgwlib.get_fe()->finish_req(f->write_req); + if (! rc) { + rc = f->write_req->get_ret(); + } + delete f->write_req; + f->write_req = nullptr; + } + + return rc; + } /* RGWFileHandle::write_finish */ + + int RGWFileHandle::close() + { + lock_guard guard(mtx); + + int rc = write_finish(FLAG_LOCKED); + + flags &= ~FLAG_OPEN; + flags &= ~FLAG_STATELESS_OPEN; + + return rc; + } /* RGWFileHandle::close */ + + RGWFileHandle::file::~file() + { + delete write_req; + } + + void RGWFileHandle::clear_state() + { + directory* d = get(&variant_type); + if (d) { + state.nlink = 2; + d->last_marker = rgw_obj_key{}; + } + } + + void RGWFileHandle::advance_mtime(uint32_t flags) { + /* intended for use on directories, fast-forward mtime so as to + * ensure a new, higher value for the change attribute */ + unique_lock uniq(mtx, std::defer_lock); + if (likely(! 
(flags & RGWFileHandle::FLAG_LOCKED))) { + uniq.lock(); + } + + /* advance mtime only if stored mtime is older than the + * configured namespace expiration */ + auto now = real_clock::now(); + auto cmptime = state.mtime; + cmptime.tv_sec += + fs->get_context()->_conf->rgw_nfs_namespace_expire_secs; + if (cmptime < real_clock::to_timespec(now)) { + /* sets ctime as well as mtime, to avoid masking updates should + * ctime inexplicably hold a higher value */ + set_times(now); + } + } + + void RGWFileHandle::invalidate() { + RGWLibFS *fs = get_fs(); + if (fs->invalidate_cb) { + fs->invalidate_cb(fs->invalidate_arg, get_key().fh_hk); + } + } + + int RGWWriteRequest::exec_start() { + struct req_state* s = get_state(); + + auto compression_type = + get_store()->svc.zone->get_zone_params().get_compression_type( + s->bucket_info.placement_rule); + + /* not obviously supportable */ + ceph_assert(! dlo_manifest); + ceph_assert(! slo_info); + + perfcounter->inc(l_rgw_put); + op_ret = -EINVAL; + rgw_obj obj{s->bucket, s->object}; + + if (s->object.empty()) { + ldout(s->cct, 0) << __func__ << " called on empty object" << dendl; + goto done; + } + + op_ret = get_params(); + if (op_ret < 0) + goto done; + + op_ret = get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + goto done; + } + + /* user-supplied MD5 check skipped (not supplied) */ + /* early quota check skipped--we don't have size yet */ + /* skipping user-supplied etag--we might have one in future, but + * like data it and other attrs would arrive after open */ + + aio.emplace(s->cct->_conf->rgw_put_obj_min_window_size); + + if (s->bucket_info.versioning_enabled()) { + if (!version_id.empty()) { + obj.key.set_instance(version_id); + } else { + get_store()->gen_rand_obj_instance_name(&obj); + version_id = obj.key.instance; + } + } + processor.emplace(&*aio, get_store(), s->bucket_info, + &s->dest_placement, + s->bucket_owner.get_id(), + *static_cast(s->obj_ctx), + obj, olh_epoch, s->req_id); + + 
op_ret = processor->prepare(); + if (op_ret < 0) { + ldout(s->cct, 20) << "processor->prepare() returned ret=" << op_ret + << dendl; + goto done; + } + filter = &*processor; + if (compression_type != "none") { + plugin = Compressor::create(s->cct, compression_type); + if (! plugin) { + ldout(s->cct, 1) << "Cannot load plugin for rgw_compression_type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + } + } + + done: + return op_ret; + } /* exec_start */ + + int RGWWriteRequest::exec_continue() + { + struct req_state* s = get_state(); + op_ret = 0; + + /* check guards (e.g., contig write) */ + if (eio) { + ldout(s->cct, 5) + << " chunks arrived in wrong order" + << " (mounting with -o sync required)" + << dendl; + return -EIO; + } + + op_ret = get_store()->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, real_ofs, true); + /* max_size exceed */ + if (op_ret < 0) + return -EIO; + + size_t len = data.length(); + if (! 
len) + return 0; + + hash.Update((const unsigned char *)data.c_str(), data.length()); + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + goto done; + } + bytes_written += len; + + done: + return op_ret; + } /* exec_continue */ + + int RGWWriteRequest::exec_finish() + { + buffer::list bl, aclbl, ux_key, ux_attrs; + map::iterator iter; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + struct req_state* s = get_state(); + + size_t osize = rgw_fh->get_size(); + struct timespec octime = rgw_fh->get_ctime(); + struct timespec omtime = rgw_fh->get_mtime(); + real_time appx_t = real_clock::now(); + + s->obj_size = bytes_written; + perfcounter->inc(l_rgw_put_b, s->obj_size); + + // flush data in filters + op_ret = filter->process({}, s->obj_size); + if (op_ret < 0) { + goto done; + } + + op_ret = get_store()->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, s->obj_size, true); + /* max_size exceed */ + if (op_ret < 0) { + goto done; + } + + op_ret = get_store()->check_bucket_shards(s->bucket_info, s->bucket, + bucket_quota); + if (op_ret < 0) { + goto done; + } + + hash.Final(m); + + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = s->obj_size; + cs_info.blocks = std::move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + ldout(s->cct, 20) << "storing " << RGW_ATTR_COMPRESSION + << " with type=" << cs_info.compression_type + << ", orig_size=" << cs_info.orig_size + << ", blocks=" << cs_info.blocks.size() << dendl; + } + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + etag = calc_md5; + + bl.append(etag.c_str(), etag.size() + 1); + emplace_attr(RGW_ATTR_ETAG, std::move(bl)); + + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + /* unix attrs */ + 
rgw_fh->set_mtime(real_clock::to_timespec(appx_t)); + rgw_fh->set_ctime(real_clock::to_timespec(appx_t)); + rgw_fh->set_size(bytes_written); + rgw_fh->encode_attrs(ux_key, ux_attrs); + + emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs)); + + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); + ++iter) { + buffer::list& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs); + if (op_ret < 0) { + goto done; + } + encode_delete_at_attr(delete_at, attrs); + + /* Add a custom metadata to expose the information whether an object + * is an SLO or not. Appending the attribute must be performed AFTER + * processing any input from user in order to prohibit overwriting. */ + if (unlikely(!! slo_info)) { + buffer::list slo_userindicator_bl; + using ceph::encode; + encode("True", slo_userindicator_bl); + emplace_attr(RGW_ATTR_SLO_UINDICATOR, std::move(slo_userindicator_bl)); + } + + op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs, + (delete_at ? 
*delete_at : real_time()), + if_match, if_nomatch, nullptr, nullptr, nullptr); + if (op_ret != 0) { + /* revert attr updates */ + rgw_fh->set_mtime(omtime); + rgw_fh->set_ctime(octime); + rgw_fh->set_size(osize); + } + + done: + perfcounter->tinc(l_rgw_put_lat, s->time_elapsed()); + return op_ret; + } /* exec_finish */ + +} /* namespace rgw */ + +/* librgw */ +extern "C" { + +void rgwfile_version(int *major, int *minor, int *extra) +{ + if (major) + *major = LIBRGW_FILE_VER_MAJOR; + if (minor) + *minor = LIBRGW_FILE_VER_MINOR; + if (extra) + *extra = LIBRGW_FILE_VER_EXTRA; +} + +/* + attach rgw namespace +*/ + int rgw_mount(librgw_t rgw, const char *uid, const char *acc_key, + const char *sec_key, struct rgw_fs **rgw_fs, + uint32_t flags) +{ + int rc = 0; + + /* stash access data for "mount" */ + RGWLibFS* new_fs = new RGWLibFS(static_cast(rgw), uid, acc_key, + sec_key, "/"); + ceph_assert(new_fs); + + rc = new_fs->authorize(rgwlib.get_store()); + if (rc != 0) { + delete new_fs; + return -EINVAL; + } + + /* register fs for shared gc */ + rgwlib.get_fe()->get_process()->register_fs(new_fs); + + struct rgw_fs *fs = new_fs->get_fs(); + fs->rgw = rgw; + + /* XXX we no longer assume "/" is unique, but we aren't tracking the + * roots atm */ + + *rgw_fs = fs; + + return 0; +} + +int rgw_mount2(librgw_t rgw, const char *uid, const char *acc_key, + const char *sec_key, const char *root, struct rgw_fs **rgw_fs, + uint32_t flags) +{ + int rc = 0; + + /* stash access data for "mount" */ + RGWLibFS* new_fs = new RGWLibFS(static_cast(rgw), uid, acc_key, + sec_key, root); + ceph_assert(new_fs); + + rc = new_fs->authorize(rgwlib.get_store()); + if (rc != 0) { + delete new_fs; + return -EINVAL; + } + + /* register fs for shared gc */ + rgwlib.get_fe()->get_process()->register_fs(new_fs); + + struct rgw_fs *fs = new_fs->get_fs(); + fs->rgw = rgw; + + /* XXX we no longer assume "/" is unique, but we aren't tracking the + * roots atm */ + + *rgw_fs = fs; + + return 0; +} + +/* + 
register invalidate callbacks +*/ +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags) + +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + return fs->register_invalidate(cb, arg, flags); +} + +/* + detach rgw namespace +*/ +int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + fs->close(); + return 0; +} + +/* + get filesystem attributes +*/ +int rgw_statfs(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + struct rgw_statvfs *vfs_st, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + struct rados_cluster_stat_t stats; + + RGWGetClusterStatReq req(fs->get_context(), fs->get_user(), stats); + int rc = rgwlib.get_fe()->execute_req(&req); + if (rc < 0) { + lderr(fs->get_context()) << "ERROR: getting total cluster usage" + << cpp_strerror(-rc) << dendl; + return rc; + } + + //Set block size to 1M. + constexpr uint32_t CEPH_BLOCK_SHIFT = 20; + vfs_st->f_bsize = 1 << CEPH_BLOCK_SHIFT; + vfs_st->f_frsize = 1 << CEPH_BLOCK_SHIFT; + vfs_st->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10); + vfs_st->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + vfs_st->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + vfs_st->f_files = stats.num_objects; + vfs_st->f_ffree = -1; + vfs_st->f_fsid[0] = fs->get_fsid(); + vfs_st->f_fsid[1] = fs->get_fsid(); + vfs_st->f_flag = 0; + vfs_st->f_namemax = 4096; + return 0; +} + +/* + generic create -- create an empty regular file +*/ +int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags) +{ + using std::get; + + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + if ((! 
parent) || + (parent->is_root()) || + (parent->is_file())) { + /* bad parent */ + return -EINVAL; + } + + MkObjResult fhr = fs->create(parent, name, st, mask, flags); + RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success + + if (nfh) + *fh = nfh->get_fh(); + + return get<1>(fhr); +} /* rgw_create */ + +/* + create a symbolic link + */ +int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, const char *link_path, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags) +{ + using std::get; + + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + if ((! parent) || + (parent->is_root()) || + (parent->is_file())) { + /* bad parent */ + return -EINVAL; + } + + MkObjResult fhr = fs->symlink(parent, name, link_path, st, mask, flags); + RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success + + if (nfh) + *fh = nfh->get_fh(); + + return get<1>(fhr); +} /* rgw_symlink */ + +/* + create a new directory +*/ +int rgw_mkdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t flags) +{ + using std::get; + + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + if (! 
parent) { + /* bad parent */ + return -EINVAL; + } + + MkObjResult fhr = fs->mkdir(parent, name, st, mask, flags); + RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success + + if (nfh) + *fh = nfh->get_fh(); + + return get<1>(fhr); +} /* rgw_mkdir */ + +/* + rename object +*/ +int rgw_rename(struct rgw_fs *rgw_fs, + struct rgw_file_handle *src, const char* src_name, + struct rgw_file_handle *dst, const char* dst_name, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + + RGWFileHandle* src_fh = get_rgwfh(src); + RGWFileHandle* dst_fh = get_rgwfh(dst); + + return fs->rename(src_fh, dst_fh, src_name, dst_name); +} + +/* + remove file or directory +*/ +int rgw_unlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* parent = get_rgwfh(parent_fh); + + return fs->unlink(parent, name); +} + +/* + lookup object by name (POSIX style) +*/ +int rgw_lookup(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char* path, + struct rgw_file_handle **fh, + struct stat *st, uint32_t mask, uint32_t flags) +{ + //CephContext* cct = static_cast(rgw_fs->rgw); + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + + RGWFileHandle* parent = get_rgwfh(parent_fh); + if ((! parent) || + (! parent->is_dir())) { + /* bad parent */ + return -EINVAL; + } + + RGWFileHandle* rgw_fh; + LookupFHResult fhr; + + if (parent->is_root()) { + /* special: parent lookup--note lack of ref()! */ + if (unlikely((strcmp(path, "..") == 0) || + (strcmp(path, "/") == 0))) { + rgw_fh = parent; + } else { + RGWLibFS::BucketStats bstat; + fhr = fs->stat_bucket(parent, path, bstat, RGWFileHandle::FLAG_NONE); + rgw_fh = get<0>(fhr); + if (! rgw_fh) + return -ENOENT; + } + } else { + /* special: after readdir--note extra ref()! 
*/ + if (unlikely((strcmp(path, "..") == 0))) { + rgw_fh = parent; + lsubdout(fs->get_context(), rgw, 17) + << __func__ << " BANG"<< *rgw_fh + << dendl; + fs->ref(rgw_fh); + } else { + enum rgw_fh_type fh_type = fh_type_of(flags); + + uint32_t sl_flags = (flags & RGW_LOOKUP_FLAG_RCB) + ? RGWFileHandle::FLAG_NONE + : RGWFileHandle::FLAG_EXACT_MATCH; + + bool fast_attrs= fs->get_context()->_conf->rgw_nfs_s3_fast_attrs; + + if ((flags & RGW_LOOKUP_FLAG_RCB) && fast_attrs) { + /* FAKE STAT--this should mean, interpolate special + * owner, group, and perms masks */ + fhr = fs->fake_leaf(parent, path, fh_type, st, mask, sl_flags); + } else { + if ((fh_type == RGW_FS_TYPE_DIRECTORY) && fast_attrs) { + /* trust cached dir, if present */ + fhr = fs->lookup_fh(parent, path, RGWFileHandle::FLAG_DIRECTORY); + if (get<0>(fhr)) { + rgw_fh = get<0>(fhr); + goto done; + } + } + fhr = fs->stat_leaf(parent, path, fh_type, sl_flags); + } + if (! get<0>(fhr)) { + if (! (flags & RGW_LOOKUP_FLAG_CREATE)) + return -ENOENT; + else + fhr = fs->lookup_fh(parent, path, RGWFileHandle::FLAG_CREATE); + } + rgw_fh = get<0>(fhr); + } + } /* !root */ + +done: + struct rgw_file_handle *rfh = rgw_fh->get_fh(); + *fh = rfh; + + return 0; +} /* rgw_lookup */ + +/* + lookup object by handle (NFS style) +*/ +int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk, + struct rgw_file_handle **fh, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + + RGWFileHandle* rgw_fh = fs->lookup_handle(*fh_hk); + if (! 
rgw_fh) { + /* not found */ + return -ENOENT; + } + + struct rgw_file_handle *rfh = rgw_fh->get_fh(); + *fh = rfh; + + return 0; +} + +/* + * release file handle + */ +int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + lsubdout(fs->get_context(), rgw, 17) + << __func__ << " " << *rgw_fh + << dendl; + + fs->unref(rgw_fh); + return 0; +} + +/* + get unix attributes for object +*/ +int rgw_getattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->getattr(rgw_fh, st); +} + +/* + set unix attributes for object +*/ +int rgw_setattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t mask, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->setattr(rgw_fh, st, mask, flags); +} + +/* + truncate file +*/ +int rgw_truncate(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t size, uint32_t flags) +{ + return 0; +} + +/* + open file +*/ +int rgw_open(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint32_t posix_flags, uint32_t flags) +{ + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + /* XXX + * need to track specific opens--at least read opens and + * a write open; we need to know when a write open is returned, + * that closes a write transaction + * + * for now, we will support single-open only, it's preferable to + * anything we can otherwise do without access to the NFS state + */ + if (! 
rgw_fh->is_file()) + return -EISDIR; + + return rgw_fh->open(flags); +} + +/* + close file +*/ +int rgw_close(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + int rc = rgw_fh->close(/* XXX */); + + if (flags & RGW_CLOSE_FLAG_RELE) + fs->unref(rgw_fh); + + return rc; +} + +int rgw_readdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, uint64_t *offset, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags) +{ + RGWFileHandle* parent = get_rgwfh(parent_fh); + if (! parent) { + /* bad parent */ + return -EINVAL; + } + + lsubdout(parent->get_fs()->get_context(), rgw, 15) + << __func__ + << " offset=" << *offset + << dendl; + + if ((*offset == 0) && + (flags & RGW_READDIR_FLAG_DOTDOT)) { + /* send '.' and '..' with their NFS-defined offsets */ + rcb(".", cb_arg, 1, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + rcb("..", cb_arg, 2, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + } + + int rc = parent->readdir(rcb, cb_arg, offset, eof, flags); + return rc; +} /* rgw_readdir */ + +/* enumeration continuing from name */ +int rgw_readdir2(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *name, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags) +{ + RGWFileHandle* parent = get_rgwfh(parent_fh); + if (! parent) { + /* bad parent */ + return -EINVAL; + } + + lsubdout(parent->get_fs()->get_context(), rgw, 15) + << __func__ + << " offset=" << ((name) ? name : "(nil)") + << dendl; + + if ((! name) && + (flags & RGW_READDIR_FLAG_DOTDOT)) { + /* send '.' and '..' 
with their NFS-defined offsets */ + rcb(".", cb_arg, 1, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + rcb("..", cb_arg, 2, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + } + + int rc = parent->readdir(rcb, cb_arg, name, eof, flags); + return rc; +} /* rgw_readdir2 */ + +/* project offset of dirent name */ +int rgw_dirent_offset(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, int64_t *offset, + uint32_t flags) +{ + RGWFileHandle* parent = get_rgwfh(parent_fh); + if ((! parent)) { + /* bad parent */ + return -EINVAL; + } + std::string sname{name}; + int rc = parent->offset_of(sname, offset, flags); + return rc; +} + +/* + read data from file +*/ +int rgw_read(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->read(rgw_fh, offset, length, bytes_read, buffer, flags); +} + +/* + read symbolic link +*/ +int rgw_readlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags) +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return fs->readlink(rgw_fh, offset, length, bytes_read, buffer, flags); +} + +/* + write data to file +*/ +int rgw_write(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_written, void *buffer, + uint32_t flags) +{ + RGWFileHandle* rgw_fh = get_rgwfh(fh); + int rc; + + *bytes_written = 0; + + if (! rgw_fh->is_file()) + return -EISDIR; + + if (! rgw_fh->is_open()) { + if (flags & RGW_OPEN_FLAG_V3) { + rc = rgw_fh->open(flags); + if (!! 
rc) + return rc; + } else + return -EPERM; + } + + rc = rgw_fh->write(offset, length, bytes_written, buffer); + + return rc; +} + +/* + read data from file (vector) +*/ +class RGWReadV +{ + buffer::list bl; + struct rgw_vio* vio; + +public: + RGWReadV(buffer::list& _bl, rgw_vio* _vio) : vio(_vio) { + bl.claim(_bl); + } + + struct rgw_vio* get_vio() { return vio; } + + const auto& buffers() { return bl.buffers(); } + + unsigned /* XXX */ length() { return bl.length(); } + +}; + +void rgw_readv_rele(struct rgw_uio *uio, uint32_t flags) +{ + RGWReadV* rdv = static_cast(uio->uio_p1); + rdv->~RGWReadV(); + ::operator delete(rdv); +} + +int rgw_readv(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags) +{ +#if 0 /* XXX */ + CephContext* cct = static_cast(rgw_fs->rgw); + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + if (! rgw_fh->is_file()) + return -EINVAL; + + int rc = 0; + + buffer::list bl; + RGWGetObjRequest req(cct, fs->get_user(), rgw_fh->bucket_name(), + rgw_fh->object_name(), uio->uio_offset, uio->uio_resid, + bl); + req.do_hexdump = false; + + rc = rgwlib.get_fe()->execute_req(&req); + + if (! 
rc) { + RGWReadV* rdv = static_cast( + ::operator new(sizeof(RGWReadV) + + (bl.buffers().size() * sizeof(struct rgw_vio)))); + + (void) new (rdv) + RGWReadV(bl, reinterpret_cast(rdv+sizeof(RGWReadV))); + + uio->uio_p1 = rdv; + uio->uio_cnt = rdv->buffers().size(); + uio->uio_resid = rdv->length(); + uio->uio_vio = rdv->get_vio(); + uio->uio_rele = rgw_readv_rele; + + int ix = 0; + auto& buffers = rdv->buffers(); + for (auto& bp : buffers) { + rgw_vio *vio = &(uio->uio_vio[ix]); + vio->vio_base = const_cast(bp.c_str()); + vio->vio_len = bp.length(); + vio->vio_u1 = nullptr; + vio->vio_p1 = nullptr; + ++ix; + } + } + + return rc; +#else + return 0; +#endif +} + +/* + write data to file (vector) +*/ +int rgw_writev(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_uio *uio, uint32_t flags) +{ + + return -ENOTSUP; + + CephContext* cct = static_cast(rgw_fs->rgw); + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + if (! rgw_fh->is_file()) + return -EINVAL; + + buffer::list bl; + for (unsigned int ix = 0; ix < uio->uio_cnt; ++ix) { + rgw_vio *vio = &(uio->uio_vio[ix]); + bl.push_back( + buffer::create_static(vio->vio_len, + static_cast(vio->vio_base))); + } + + std::string oname = rgw_fh->relative_object_name(); + RGWPutObjRequest req(cct, fs->get_user(), rgw_fh->bucket_name(), + oname, bl); + + int rc = rgwlib.get_fe()->execute_req(&req); + + /* XXX update size (in request) */ + + return rc; +} + +/* + sync written data +*/ +int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *handle, + uint32_t flags) +{ + return 0; +} + +int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint64_t offset, uint64_t length, uint32_t flags) +{ + RGWFileHandle* rgw_fh = get_rgwfh(fh); + + return rgw_fh->commit(offset, length, RGWFileHandle::FLAG_NONE); +} + +} /* extern "C" */ diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h new file mode 100644 index 00000000..13680eee --- /dev/null +++ b/src/rgw/rgw_file.h @@ 
-0,0 +1,2806 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_FILE_H +#define RGW_FILE_H + +#include "include/rados/rgw_file.h" + +/* internal header */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xxhash.h" +#include "include/buffer.h" +#include "common/cohort_lru.h" +#include "common/ceph_timer.h" +#include "rgw_common.h" +#include "rgw_user.h" +#include "rgw_lib.h" +#include "rgw_ldap.h" +#include "rgw_token.h" +#include "rgw_putobj_processor.h" +#include "rgw_aio_throttle.h" +#include "rgw_compression.h" + + +/* XXX + * ASSERT_H somehow not defined after all the above (which bring + * in common/debug.h [e.g., dout]) + */ +#include "include/ceph_assert.h" + + +#define RGW_RWXMODE (S_IRWXU | S_IRWXG | S_IRWXO) + +#define RGW_RWMODE (RGW_RWXMODE & \ + ~(S_IXUSR | S_IXGRP | S_IXOTH)) + + +namespace rgw { + + template + static inline void ignore(T &&) {} + + + namespace bi = boost::intrusive; + + class RGWLibFS; + class RGWFileHandle; + class RGWWriteRequest; + + static inline bool operator <(const struct timespec& lhs, + const struct timespec& rhs) { + if (lhs.tv_sec == rhs.tv_sec) + return lhs.tv_nsec < rhs.tv_nsec; + else + return lhs.tv_sec < rhs.tv_sec; + } + + static inline bool operator ==(const struct timespec& lhs, + const struct timespec& rhs) { + return ((lhs.tv_sec == rhs.tv_sec) && + (lhs.tv_nsec == rhs.tv_nsec)); + } + + /* + * XXX + * The current 64-bit, non-cryptographic hash used here is intended + * for prototyping only. + * + * However, the invariant being prototyped is that objects be + * identifiable by their hash components alone. We believe this can + * be legitimately implemented using 128-hash values for bucket and + * object components, together with a cluster-resident cryptographic + * key. 
Since an MD5 or SHA-1 key is 128 bits and the (fast), + * non-cryptographic CityHash128 hash algorithm takes a 128-bit seed, + * speculatively we could use that for the final hash computations. + */ + struct fh_key + { + rgw_fh_hk fh_hk {}; + uint32_t version; + + static constexpr uint64_t seed = 8675309; + + fh_key() : version(0) {} + + fh_key(const rgw_fh_hk& _hk) + : fh_hk(_hk), version(0) { + // nothing + } + + fh_key(const uint64_t bk, const uint64_t ok) + : version(0) { + fh_hk.bucket = bk; + fh_hk.object = ok; + } + + fh_key(const uint64_t bk, const char *_o, const std::string& _t) + : version(0) { + fh_hk.bucket = bk; + std::string to = _t + ":" + _o; + fh_hk.object = XXH64(to.c_str(), to.length(), seed); + } + + fh_key(const std::string& _b, const std::string& _o, + const std::string& _t /* tenant */) + : version(0) { + std::string tb = _t + ":" + _b; + std::string to = _t + ":" + _o; + fh_hk.bucket = XXH64(tb.c_str(), tb.length(), seed); + fh_hk.object = XXH64(to.c_str(), to.length(), seed); + } + + void encode(buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(fh_hk.bucket, bl); + encode(fh_hk.object, bl); + encode((uint32_t)2, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(fh_hk.bucket, bl); + decode(fh_hk.object, bl); + if (struct_v >= 2) { + decode(version, bl); + } + DECODE_FINISH(bl); + } + + friend std::ostream& operator<<(std::ostream &os, fh_key const &fhk); + + }; /* fh_key */ + + WRITE_CLASS_ENCODER(fh_key); + + inline bool operator<(const fh_key& lhs, const fh_key& rhs) + { + return ((lhs.fh_hk.bucket < rhs.fh_hk.bucket) || + ((lhs.fh_hk.bucket == rhs.fh_hk.bucket) && + (lhs.fh_hk.object < rhs.fh_hk.object))); + } + + inline bool operator>(const fh_key& lhs, const fh_key& rhs) + { + return (rhs < lhs); + } + + inline bool operator==(const fh_key& lhs, const fh_key& rhs) + { + return ((lhs.fh_hk.bucket == rhs.fh_hk.bucket) && + (lhs.fh_hk.object == rhs.fh_hk.object)); + 
} + + inline bool operator!=(const fh_key& lhs, const fh_key& rhs) + { + return !(lhs == rhs); + } + + inline bool operator<=(const fh_key& lhs, const fh_key& rhs) + { + return (lhs < rhs) || (lhs == rhs); + } + + using boost::variant; + using boost::container::flat_map; + + typedef std::tuple DecodeAttrsResult; + + class RGWFileHandle : public cohort::lru::Object + { + struct rgw_file_handle fh; + std::mutex mtx; + + RGWLibFS* fs; + RGWFileHandle* bucket; + RGWFileHandle* parent; + /* const */ std::string name; /* XXX file or bucket name */ + /* const */ fh_key fhk; + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + /* TODO: keeping just the last marker is sufficient for + * nfs-ganesha 2.4.5; in the near future, nfs-ganesha will + * be able to hint the name of the next dirent required, + * from which we can directly synthesize a RADOS marker. + * using marker_cache_t = flat_map; + */ + + struct State { + uint64_t dev; + uint64_t size; + uint64_t nlink; + uint32_t owner_uid; /* XXX need Unix attr */ + uint32_t owner_gid; /* XXX need Unix attr */ + mode_t unix_mode; + struct timespec ctime; + struct timespec mtime; + struct timespec atime; + uint32_t version; + State() : dev(0), size(0), nlink(1), owner_uid(0), owner_gid(0), + ctime{0,0}, mtime{0,0}, atime{0,0}, version(0) {} + } state; + + struct file { + RGWWriteRequest* write_req; + file() : write_req(nullptr) {} + ~file(); + }; + + struct directory { + + static constexpr uint32_t FLAG_NONE = 0x0000; + + uint32_t flags; + rgw_obj_key last_marker; + struct timespec last_readdir; + + directory() : flags(FLAG_NONE), last_readdir{0,0} {} + }; + + void clear_state(); + void advance_mtime(uint32_t flags = FLAG_NONE); + + boost::variant variant_type; + + uint16_t depth; + uint32_t flags; + + ceph::buffer::list etag; + ceph::buffer::list acls; + + public: + const static std::string root_name; + + static constexpr uint16_t MAX_DEPTH = 256; + + static constexpr uint32_t FLAG_NONE = 0x0000; 
+ static constexpr uint32_t FLAG_OPEN = 0x0001; + static constexpr uint32_t FLAG_ROOT = 0x0002; + static constexpr uint32_t FLAG_CREATE = 0x0004; + static constexpr uint32_t FLAG_CREATING = 0x0008; + static constexpr uint32_t FLAG_SYMBOLIC_LINK = 0x0009; + static constexpr uint32_t FLAG_DIRECTORY = 0x0010; + static constexpr uint32_t FLAG_BUCKET = 0x0020; + static constexpr uint32_t FLAG_LOCK = 0x0040; + static constexpr uint32_t FLAG_DELETED = 0x0080; + static constexpr uint32_t FLAG_UNLINK_THIS = 0x0100; + static constexpr uint32_t FLAG_LOCKED = 0x0200; + static constexpr uint32_t FLAG_STATELESS_OPEN = 0x0400; + static constexpr uint32_t FLAG_EXACT_MATCH = 0x0800; + static constexpr uint32_t FLAG_MOUNT = 0x1000; + +#define CREATE_FLAGS(x) \ + ((x) & ~(RGWFileHandle::FLAG_CREATE|RGWFileHandle::FLAG_LOCK)) + + static constexpr uint32_t RCB_MASK = \ + RGW_SETATTR_MTIME|RGW_SETATTR_CTIME|RGW_SETATTR_ATIME|RGW_SETATTR_SIZE; + + friend class RGWLibFS; + + private: + explicit RGWFileHandle(RGWLibFS* _fs) + : fs(_fs), bucket(nullptr), parent(nullptr), variant_type{directory()}, + depth(0), flags(FLAG_NONE) + { + fh.fh_hk.bucket = 0; + fh.fh_hk.object = 0; + /* root */ + fh.fh_type = RGW_FS_TYPE_DIRECTORY; + variant_type = directory(); + /* stat */ + state.unix_mode = RGW_RWXMODE|S_IFDIR; + /* pointer to self */ + fh.fh_private = this; + } + + uint64_t init_fsid(std::string& uid) { + return XXH64(uid.c_str(), uid.length(), fh_key::seed); + } + + void init_rootfs(std::string& fsid, const std::string& object_name, + bool is_bucket) { + /* fh_key */ + fh.fh_hk.bucket = XXH64(fsid.c_str(), fsid.length(), fh_key::seed); + fh.fh_hk.object = XXH64(object_name.c_str(), object_name.length(), + fh_key::seed); + fhk = fh.fh_hk; + name = object_name; + + state.dev = init_fsid(fsid); + + if (is_bucket) { + flags |= RGWFileHandle::FLAG_BUCKET | RGWFileHandle::FLAG_MOUNT; + bucket = this; + depth = 1; + } else { + flags |= RGWFileHandle::FLAG_ROOT | RGWFileHandle::FLAG_MOUNT; + } + } + 
+ public: + RGWFileHandle(RGWLibFS* _fs, RGWFileHandle* _parent, + const fh_key& _fhk, std::string& _name, uint32_t _flags) + : fs(_fs), bucket(nullptr), parent(_parent), name(std::move(_name)), + fhk(_fhk), flags(_flags) { + + if (parent->is_root()) { + fh.fh_type = RGW_FS_TYPE_DIRECTORY; + variant_type = directory(); + flags |= FLAG_BUCKET; + } else { + bucket = parent->is_bucket() ? parent + : parent->bucket; + if (flags & FLAG_DIRECTORY) { + fh.fh_type = RGW_FS_TYPE_DIRECTORY; + variant_type = directory(); + } else if(flags & FLAG_SYMBOLIC_LINK) { + fh.fh_type = RGW_FS_TYPE_SYMBOLIC_LINK; + variant_type = file(); + } else { + fh.fh_type = RGW_FS_TYPE_FILE; + variant_type = file(); + } + } + + depth = parent->depth + 1; + + /* save constant fhk */ + fh.fh_hk = fhk.fh_hk; /* XXX redundant in fh_hk */ + + /* inherits parent's fsid */ + state.dev = parent->state.dev; + + switch (fh.fh_type) { + case RGW_FS_TYPE_DIRECTORY: + state.unix_mode = RGW_RWXMODE|S_IFDIR; + /* virtual directories are always invalid */ + advance_mtime(); + break; + case RGW_FS_TYPE_FILE: + state.unix_mode = RGW_RWMODE|S_IFREG; + break; + case RGW_FS_TYPE_SYMBOLIC_LINK: + state.unix_mode = RGW_RWMODE|S_IFLNK; + break; + default: + break; + } + + /* pointer to self */ + fh.fh_private = this; + } + + const fh_key& get_key() const { + return fhk; + } + + directory* get_directory() { + return get(&variant_type); + } + + size_t get_size() const { return state.size; } + + const char* stype() { + return is_dir() ? 
"DIR" : "FILE"; + } + + uint16_t get_depth() const { return depth; } + + struct rgw_file_handle* get_fh() { return &fh; } + + RGWLibFS* get_fs() { return fs; } + + RGWFileHandle* get_parent() { return parent; } + + uint32_t get_owner_uid() const { return state.owner_uid; } + uint32_t get_owner_gid() const { return state.owner_gid; } + + struct timespec get_ctime() const { return state.ctime; } + struct timespec get_mtime() const { return state.mtime; } + + const ceph::buffer::list& get_etag() const { return etag; } + const ceph::buffer::list& get_acls() const { return acls; } + + void create_stat(struct stat* st, uint32_t mask) { + if (mask & RGW_SETATTR_UID) + state.owner_uid = st->st_uid; + + if (mask & RGW_SETATTR_GID) + state.owner_gid = st->st_gid; + + if (mask & RGW_SETATTR_MODE) { + switch (fh.fh_type) { + case RGW_FS_TYPE_DIRECTORY: + state.unix_mode = st->st_mode|S_IFDIR; + break; + case RGW_FS_TYPE_FILE: + state.unix_mode = st->st_mode|S_IFREG; + break; + case RGW_FS_TYPE_SYMBOLIC_LINK: + state.unix_mode = st->st_mode|S_IFLNK; + break; + default: + break; + } + } + + if (mask & RGW_SETATTR_ATIME) + state.atime = st->st_atim; + + if (mask & RGW_SETATTR_MTIME) { + if (fh.fh_type != RGW_FS_TYPE_DIRECTORY) + state.mtime = st->st_mtim; + } + + if (mask & RGW_SETATTR_CTIME) + state.ctime = st->st_ctim; + } + + int stat(struct stat* st, uint32_t flags = FLAG_NONE) { + /* partial Unix attrs */ + /* FIPS zeroization audit 20191115: this memset is not security + * related. 
*/ + memset(st, 0, sizeof(struct stat)); + st->st_dev = state.dev; + st->st_ino = fh.fh_hk.object; // XXX + + st->st_uid = state.owner_uid; + st->st_gid = state.owner_gid; + + st->st_mode = state.unix_mode; + + switch (fh.fh_type) { + case RGW_FS_TYPE_DIRECTORY: + /* virtual directories are always invalid */ + advance_mtime(flags); + st->st_nlink = state.nlink; + break; + case RGW_FS_TYPE_FILE: + st->st_nlink = 1; + st->st_blksize = 4096; + st->st_size = state.size; + st->st_blocks = (state.size) / 512; + break; + case RGW_FS_TYPE_SYMBOLIC_LINK: + st->st_nlink = 1; + st->st_blksize = 4096; + st->st_size = state.size; + st->st_blocks = (state.size) / 512; + break; + default: + break; + } + +#ifdef HAVE_STAT_ST_MTIMESPEC_TV_NSEC + st->st_atimespec = state.atime; + st->st_mtimespec = state.mtime; + st->st_ctimespec = state.ctime; +#else + st->st_atim = state.atime; + st->st_mtim = state.mtime; + st->st_ctim = state.ctime; +#endif + + return 0; + } + + const std::string& bucket_name() const { + if (is_root()) + return root_name; + if (is_bucket()) + return name; + return bucket->object_name(); + } + + const std::string& object_name() const { return name; } + + std::string full_object_name(bool omit_bucket = false) const { + std::string path; + std::vector segments; + int reserve = 0; + const RGWFileHandle* tfh = this; + while (tfh && !tfh->is_root() && !(tfh->is_bucket() && omit_bucket)) { + segments.push_back(&tfh->object_name()); + reserve += (1 + tfh->object_name().length()); + tfh = tfh->parent; + } + int pos = 1; + path.reserve(reserve); + for (auto& s : boost::adaptors::reverse(segments)) { + if (pos > 1) { + path += "/"; + } else { + if (!omit_bucket && + ((path.length() == 0) || (path.front() != '/'))) + path += "/"; + } + path += *s; + ++pos; + } + return path; + } + + inline std::string relative_object_name() const { + return full_object_name(true /* omit_bucket */); + } + + inline std::string format_child_name(const std::string& cbasename, + bool is_dir) 
const { + std::string child_name{relative_object_name()}; + if ((child_name.size() > 0) && + (child_name.back() != '/')) + child_name += "/"; + child_name += cbasename; + if (is_dir) + child_name += "/"; + return child_name; + } + + inline std::string make_key_name(const char *name) const { + std::string key_name{full_object_name()}; + if (key_name.length() > 0) + key_name += "/"; + key_name += name; + return key_name; + } + + fh_key make_fhk(const std::string& name); + + void add_marker(uint64_t off, const rgw_obj_key& marker, + uint8_t obj_type) { + using std::get; + directory* d = get(&variant_type); + if (d) { + unique_lock guard(mtx); + d->last_marker = marker; + } + } + + const rgw_obj_key* find_marker(uint64_t off) const { + using std::get; + if (off > 0) { + const directory* d = get(&variant_type); + if (d ) { + return &d->last_marker; + } + } + return nullptr; + } + + int offset_of(const std::string& name, int64_t *offset, uint32_t flags) { + if (unlikely(! is_dir())) { + return -EINVAL; + } + *offset = XXH64(name.c_str(), name.length(), fh_key::seed); + return 0; + } + + bool is_open() const { return flags & FLAG_OPEN; } + bool is_root() const { return flags & FLAG_ROOT; } + bool is_mount() const { return flags & FLAG_MOUNT; } + bool is_bucket() const { return flags & FLAG_BUCKET; } + bool is_object() const { return !is_bucket(); } + bool is_file() const { return (fh.fh_type == RGW_FS_TYPE_FILE); } + bool is_dir() const { return (fh.fh_type == RGW_FS_TYPE_DIRECTORY); } + bool is_link() const { return (fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK); } + bool creating() const { return flags & FLAG_CREATING; } + bool deleted() const { return flags & FLAG_DELETED; } + bool stateless_open() const { return flags & FLAG_STATELESS_OPEN; } + bool has_children() const; + + int open(uint32_t gsh_flags) { + lock_guard guard(mtx); + if (! 
is_open()) { + if (gsh_flags & RGW_OPEN_FLAG_V3) { + flags |= FLAG_STATELESS_OPEN; + } + flags |= FLAG_OPEN; + return 0; + } + return -EPERM; + } + + typedef boost::variant readdir_offset; + + int readdir(rgw_readdir_cb rcb, void *cb_arg, readdir_offset offset, + bool *eof, uint32_t flags); + + int write(uint64_t off, size_t len, size_t *nbytes, void *buffer); + + int commit(uint64_t offset, uint64_t length, uint32_t flags) { + /* NFS3 and NFSv4 COMMIT implementation + * the current atomic update strategy doesn't actually permit + * clients to read-stable until either CLOSE (NFSv4+) or the + * expiration of the active write timer (NFS3). In the + * interim, the client may send an arbitrary number of COMMIT + * operations which must return a success result */ + return 0; + } + + int write_finish(uint32_t flags = FLAG_NONE); + int close(); + + void open_for_create() { + lock_guard guard(mtx); + flags |= FLAG_CREATING; + } + + void clear_creating() { + lock_guard guard(mtx); + flags &= ~FLAG_CREATING; + } + + void inc_nlink(const uint64_t n) { + state.nlink += n; + } + + void set_nlink(const uint64_t n) { + state.nlink = n; + } + + void set_size(const size_t size) { + state.size = size; + } + + void set_times(const struct timespec &ts) { + state.ctime = ts; + state.mtime = state.ctime; + state.atime = state.ctime; + } + + void set_times(real_time t) { + set_times(real_clock::to_timespec(t)); + } + + void set_ctime(const struct timespec &ts) { + state.ctime = ts; + } + + void set_mtime(const struct timespec &ts) { + state.mtime = ts; + } + + void set_atime(const struct timespec &ts) { + state.atime = ts; + } + + void set_etag(const ceph::buffer::list& _etag ) { + etag = _etag; + } + + void set_acls(const ceph::buffer::list& _acls ) { + acls = _acls; + } + + void encode(buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(uint32_t(fh.fh_type), bl); + encode(state.dev, bl); + encode(state.size, bl); + encode(state.nlink, bl); + encode(state.owner_uid, bl); + 
encode(state.owner_gid, bl); + encode(state.unix_mode, bl); + for (const auto& t : { state.ctime, state.mtime, state.atime }) { + encode(real_clock::from_timespec(t), bl); + } + encode((uint32_t)2, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + uint32_t fh_type; + decode(fh_type, bl); + if ((fh.fh_type != fh_type) && + (fh_type == RGW_FS_TYPE_SYMBOLIC_LINK)) + fh.fh_type = RGW_FS_TYPE_SYMBOLIC_LINK; + ceph_assert(fh.fh_type == fh_type); + decode(state.dev, bl); + decode(state.size, bl); + decode(state.nlink, bl); + decode(state.owner_uid, bl); + decode(state.owner_gid, bl); + decode(state.unix_mode, bl); + ceph::real_time enc_time; + for (auto t : { &(state.ctime), &(state.mtime), &(state.atime) }) { + decode(enc_time, bl); + *t = real_clock::to_timespec(enc_time); + } + if (struct_v >= 2) { + decode(state.version, bl); + } + DECODE_FINISH(bl); + } + + void encode_attrs(ceph::buffer::list& ux_key1, + ceph::buffer::list& ux_attrs1); + + DecodeAttrsResult decode_attrs(const ceph::buffer::list* ux_key1, + const ceph::buffer::list* ux_attrs1); + + void invalidate(); + + bool reclaim(const cohort::lru::ObjectFactory* newobj_fac) override; + + typedef cohort::lru::LRU FhLRU; + + struct FhLT + { + // for internal ordering + bool operator()(const RGWFileHandle& lhs, const RGWFileHandle& rhs) const + { return (lhs.get_key() < rhs.get_key()); } + + // for external search by fh_key + bool operator()(const fh_key& k, const RGWFileHandle& fh) const + { return k < fh.get_key(); } + + bool operator()(const RGWFileHandle& fh, const fh_key& k) const + { return fh.get_key() < k; } + }; + + struct FhEQ + { + bool operator()(const RGWFileHandle& lhs, const RGWFileHandle& rhs) const + { return (lhs.get_key() == rhs.get_key()); } + + bool operator()(const fh_key& k, const RGWFileHandle& fh) const + { return k == fh.get_key(); } + + bool operator()(const RGWFileHandle& fh, const fh_key& k) const + { return fh.get_key() == k; } + 
}; + + typedef bi::link_mode link_mode; /* XXX normal */ +#if defined(FHCACHE_AVL) + typedef bi::avl_set_member_hook tree_hook_type; +#else + /* RBT */ + typedef bi::set_member_hook tree_hook_type; +#endif + tree_hook_type fh_hook; + + typedef bi::member_hook< + RGWFileHandle, tree_hook_type, &RGWFileHandle::fh_hook> FhHook; + +#if defined(FHCACHE_AVL) + typedef bi::avltree, FhHook> FHTree; +#else + typedef bi::rbtree, FhHook> FhTree; +#endif + typedef cohort::lru::TreeX FHCache; + + ~RGWFileHandle() override; + + friend std::ostream& operator<<(std::ostream &os, + RGWFileHandle const &rgw_fh); + + class Factory : public cohort::lru::ObjectFactory + { + public: + RGWLibFS* fs; + RGWFileHandle* parent; + const fh_key& fhk; + std::string& name; + uint32_t flags; + + Factory() = delete; + + Factory(RGWLibFS* _fs, RGWFileHandle* _parent, + const fh_key& _fhk, std::string& _name, uint32_t _flags) + : fs(_fs), parent(_parent), fhk(_fhk), name(_name), + flags(_flags) {} + + void recycle (cohort::lru::Object* o) override { + /* re-use an existing object */ + o->~Object(); // call lru::Object virtual dtor + // placement new! 
+ new (o) RGWFileHandle(fs, parent, fhk, name, flags); + } + + cohort::lru::Object* alloc() override { + return new RGWFileHandle(fs, parent, fhk, name, flags); + } + }; /* Factory */ + + }; /* RGWFileHandle */ + + WRITE_CLASS_ENCODER(RGWFileHandle); + + static inline RGWFileHandle* get_rgwfh(struct rgw_file_handle* fh) { + return static_cast(fh->fh_private); + } + + static inline enum rgw_fh_type fh_type_of(uint32_t flags) { + enum rgw_fh_type fh_type; + switch(flags & RGW_LOOKUP_TYPE_FLAGS) + { + case RGW_LOOKUP_FLAG_DIR: + fh_type = RGW_FS_TYPE_DIRECTORY; + break; + case RGW_LOOKUP_FLAG_FILE: + fh_type = RGW_FS_TYPE_FILE; + break; + default: + fh_type = RGW_FS_TYPE_NIL; + }; + return fh_type; + } + + typedef std::tuple LookupFHResult; + typedef std::tuple MkObjResult; + + class RGWLibFS + { + CephContext* cct; + struct rgw_fs fs{}; + RGWFileHandle root_fh; + rgw_fh_callback_t invalidate_cb; + void *invalidate_arg; + bool shutdown; + + mutable std::atomic refcnt; + + RGWFileHandle::FHCache fh_cache; + RGWFileHandle::FhLRU fh_lru; + + std::string uid; // should match user.user_id, iiuc + + RGWUserInfo user; + RGWAccessKey key; // XXXX acc_key + + static std::atomic fs_inst_counter; + + static uint32_t write_completion_interval_s; + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + struct event + { + enum class type : uint8_t { READDIR } ; + type t; + const fh_key fhk; + struct timespec ts; + event(type t, const fh_key& k, const struct timespec& ts) + : t(t), fhk(k), ts(ts) {} + }; + + friend std::ostream& operator<<(std::ostream &os, + RGWLibFS::event const &ev); + + using event_vector = /* boost::small_vector */ + std::vector; + + struct WriteCompletion + { + RGWFileHandle& rgw_fh; + + explicit WriteCompletion(RGWFileHandle& _fh) : rgw_fh(_fh) { + rgw_fh.get_fs()->ref(&rgw_fh); + } + + void operator()() { + rgw_fh.close(); /* will finish in-progress write */ + rgw_fh.get_fs()->unref(&rgw_fh); + } + }; + + static ceph::timer 
write_timer;
+
+    /* Mutable per-mount state: a flags word plus a queue of events. */
+    struct State {
+      std::mutex mtx;
+      /* NOTE(review): the element type was lost in this copy of the
+       * patch; uint32_t is assumed from the uint32_t FLAG_* constants
+       * compared against it -- confirm against the real header */
+      std::atomic<uint32_t> flags;
+      std::deque<event> events;
+
+      State() : flags(0) {}
+
+      void push_event(const event& ev) {
+        events.push_back(ev);
+      }
+    } state;
+
+    uint32_t new_inst() {
+      return ++fs_inst_counter;
+    }
+
+    friend class RGWFileHandle;
+    friend class RGWLibProcess;
+
+  public:
+
+    static constexpr uint32_t FLAG_NONE =      0x0000;
+    static constexpr uint32_t FLAG_CLOSED =    0x0001;
+
+    struct BucketStats {
+      size_t size;
+      size_t size_rounded;
+      real_time creation_time;
+      uint64_t num_entries;
+    };
+
+    /*
+     * Construct a mount.  The reference count starts at 1.  When root
+     * is null or "/", the root handle is initialised as the namespace
+     * root (RGWFileHandle::root_name); otherwise it is initialised as
+     * a bucket mount named by root.
+     */
+    RGWLibFS(CephContext* _cct, const char *_uid, const char *_user_id,
+             const char* _key, const char *root)
+      : cct(_cct), root_fh(this), invalidate_cb(nullptr),
+        invalidate_arg(nullptr), shutdown(false), refcnt(1),
+        fh_cache(cct->_conf->rgw_nfs_fhcache_partitions,
+                 cct->_conf->rgw_nfs_fhcache_size),
+        fh_lru(cct->_conf->rgw_nfs_lru_lanes,
+               cct->_conf->rgw_nfs_lru_lane_hiwat),
+        uid(_uid), key(_user_id, _key) {
+
+      if (!root || !strcmp(root, "/")) {
+        root_fh.init_rootfs(uid, RGWFileHandle::root_name, false);
+      } else {
+        root_fh.init_rootfs(uid, root, true);
+      }
+
+      /* pointer to self */
+      fs.fs_private = this;
+
+      /* expose public root fh */
+      fs.root_fh = root_fh.get_fh();
+
+      new_inst();
+    }
+
+    friend void intrusive_ptr_add_ref(const RGWLibFS* fs) {
+      fs->refcnt.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    /*
+     * Drop one reference; destroy the object when the last one goes.
+     *
+     * fetch_sub() returns the value held *before* the decrement, so
+     * the last reference is the one that observes 1.  Comparing with
+     * 0 meant a correctly balanced final release never freed the
+     * object (the delete only fired on an extra, unbalanced release).
+     * NOTE(review): callers outside this header that relied on the
+     * object outliving its last release must be audited.
+     */
+    friend void intrusive_ptr_release(const RGWLibFS* fs) {
+      if (fs->refcnt.fetch_sub(1, std::memory_order_release) == 1) {
+        std::atomic_thread_fence(std::memory_order_acquire);
+        delete fs;
+      }
+    }
+
+    RGWLibFS* ref() {
+      intrusive_ptr_add_ref(this);
+      return this;
+    }
+
+    inline void rele() {
+      intrusive_ptr_release(this);
+    }
+
+    void stop() { shutdown = true; }
+
+    void release_evict(RGWFileHandle* fh) {
+      /* remove from cache, releases sentinel ref */
+      fh_cache.remove(fh->fh.fh_hk.object, fh,
+                      RGWFileHandle::FHCache::FLAG_LOCK);
+      /* release call-path ref */
+      (void) fh_lru.unref(fh, 
cohort::lru::FLAG_NONE); + } + + int authorize(RGWRados* store) { + int ret = rgw_get_user_info_by_access_key(store, key.id, user); + if (ret == 0) { + RGWAccessKey* k = user.get_key(key.id); + if (!k || (k->key != key.key)) + return -EINVAL; + if (user.suspended) + return -ERR_USER_SUSPENDED; + } else { + /* try external authenticators (ldap for now) */ + rgw::LDAPHelper* ldh = rgwlib.get_ldh(); /* !nullptr */ + RGWToken token; + /* boost filters and/or string_ref may throw on invalid input */ + try { + token = rgw::from_base64(key.id); + } catch(...) { + token = std::string(""); + } + if (token.valid() && (ldh->auth(token.id, token.key) == 0)) { + /* try to store user if it doesn't already exist */ + if (rgw_get_user_info_by_uid(store, token.id, user) < 0) { + int ret = rgw_store_user_info(store, user, NULL, NULL, real_time(), + true); + if (ret < 0) { + lsubdout(get_context(), rgw, 10) + << "NOTICE: failed to store new user's info: ret=" << ret + << dendl; + } + } + } /* auth success */ + } + return ret; + } /* authorize */ + + int register_invalidate(rgw_fh_callback_t cb, void *arg, uint32_t flags) { + invalidate_cb = cb; + invalidate_arg = arg; + return 0; + } + + /* find RGWFileHandle by id */ + LookupFHResult lookup_fh(const fh_key& fhk, + const uint32_t flags = RGWFileHandle::FLAG_NONE) { + using std::get; + + // cast int32_t(RGWFileHandle::FLAG_NONE) due to strictness of Clang + // the cast transfers a lvalue into a rvalue in the ctor + // check the commit message for the full details + LookupFHResult fhr { nullptr, uint32_t(RGWFileHandle::FLAG_NONE) }; + + RGWFileHandle::FHCache::Latch lat; + bool fh_locked = flags & RGWFileHandle::FLAG_LOCKED; + + retry: + RGWFileHandle* fh = + fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/, + fhk /* key */, lat /* serializer */, + RGWFileHandle::FHCache::FLAG_LOCK); + /* LATCHED */ + if (fh) { + if (likely(! 
fh_locked)) + fh->mtx.lock(); // XXX !RAII because may-return-LOCKED + /* need initial ref from LRU (fast path) */ + if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) { + lat.lock->unlock(); + if (likely(! fh_locked)) + fh->mtx.unlock(); + goto retry; /* !LATCHED */ + } + /* LATCHED, LOCKED */ + if (! (flags & RGWFileHandle::FLAG_LOCK)) + fh->mtx.unlock(); /* ! LOCKED */ + } + lat.lock->unlock(); /* !LATCHED */ + get<0>(fhr) = fh; + if (fh) { + lsubdout(get_context(), rgw, 17) + << __func__ << " 1 " << *fh + << dendl; + } + return fhr; + } /* lookup_fh(const fh_key&) */ + + /* find or create an RGWFileHandle */ + LookupFHResult lookup_fh(RGWFileHandle* parent, const char *name, + const uint32_t flags = RGWFileHandle::FLAG_NONE) { + using std::get; + + // cast int32_t(RGWFileHandle::FLAG_NONE) due to strictness of Clang + // the cast transfers a lvalue into a rvalue in the ctor + // check the commit message for the full details + LookupFHResult fhr { nullptr, uint32_t(RGWFileHandle::FLAG_NONE) }; + + /* mount is stale? */ + if (state.flags & FLAG_CLOSED) + return fhr; + + RGWFileHandle::FHCache::Latch lat; + bool fh_locked = flags & RGWFileHandle::FLAG_LOCKED; + + std::string obj_name{name}; + std::string key_name{parent->make_key_name(name)}; + fh_key fhk = parent->make_fhk(obj_name); + + lsubdout(get_context(), rgw, 10) + << __func__ << " called on " + << parent->object_name() << " for " << key_name + << " (" << obj_name << ")" + << " -> " << fhk + << dendl; + + retry: + RGWFileHandle* fh = + fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/, + fhk /* key */, lat /* serializer */, + RGWFileHandle::FHCache::FLAG_LOCK); + /* LATCHED */ + if (fh) { + if (likely(! fh_locked)) + fh->mtx.lock(); // XXX !RAII because may-return-LOCKED + if (fh->flags & RGWFileHandle::FLAG_DELETED) { + /* for now, delay briefly and retry */ + lat.lock->unlock(); + if (likely(! 
fh_locked)) + fh->mtx.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + goto retry; /* !LATCHED */ + } + /* need initial ref from LRU (fast path) */ + if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) { + lat.lock->unlock(); + if (likely(! fh_locked)) + fh->mtx.unlock(); + goto retry; /* !LATCHED */ + } + /* LATCHED, LOCKED */ + if (! (flags & RGWFileHandle::FLAG_LOCK)) + if (likely(! fh_locked)) + fh->mtx.unlock(); /* ! LOCKED */ + } else { + /* make or re-use handle */ + RGWFileHandle::Factory prototype(this, parent, fhk, + obj_name, CREATE_FLAGS(flags)); + uint32_t iflags{cohort::lru::FLAG_INITIAL}; + fh = static_cast( + fh_lru.insert(&prototype, + cohort::lru::Edge::MRU, + iflags)); + if (fh) { + /* lock fh (LATCHED) */ + if (flags & RGWFileHandle::FLAG_LOCK) + fh->mtx.lock(); + if (likely(! (iflags & cohort::lru::FLAG_RECYCLE))) { + /* inserts at cached insert iterator, releasing latch */ + fh_cache.insert_latched( + fh, lat, RGWFileHandle::FHCache::FLAG_UNLOCK); + } else { + /* recycle step invalidates Latch */ + fh_cache.insert( + fhk.fh_hk.object, fh, RGWFileHandle::FHCache::FLAG_NONE); + lat.lock->unlock(); /* !LATCHED */ + } + get<1>(fhr) |= RGWFileHandle::FLAG_CREATE; + /* ref parent (non-initial ref cannot fail on valid object) */ + if (! parent->is_mount()) { + (void) fh_lru.ref(parent, cohort::lru::FLAG_NONE); + } + goto out; /* !LATCHED */ + } else { + lat.lock->unlock(); + goto retry; /* !LATCHED */ + } + } + lat.lock->unlock(); /* !LATCHED */ + out: + get<0>(fhr) = fh; + if (fh) { + lsubdout(get_context(), rgw, 17) + << __func__ << " 2 " << *fh + << dendl; + } + return fhr; + } /* lookup_fh(RGWFileHandle*, const char *, const uint32_t) */ + + inline void unref(RGWFileHandle* fh) { + if (likely(! fh->is_mount())) { + (void) fh_lru.unref(fh, cohort::lru::FLAG_NONE); + } + } + + inline RGWFileHandle* ref(RGWFileHandle* fh) { + if (likely(! 
fh->is_mount())) { + fh_lru.ref(fh, cohort::lru::FLAG_NONE); + } + return fh; + } + + int getattr(RGWFileHandle* rgw_fh, struct stat* st); + + int setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask, + uint32_t flags); + + void update_fh(RGWFileHandle *rgw_fh); + + LookupFHResult stat_bucket(RGWFileHandle* parent, const char *path, + RGWLibFS::BucketStats& bs, + uint32_t flags); + + LookupFHResult fake_leaf(RGWFileHandle* parent, const char *path, + enum rgw_fh_type type = RGW_FS_TYPE_NIL, + struct stat *st = nullptr, uint32_t mask = 0, + uint32_t flags = RGWFileHandle::FLAG_NONE); + + LookupFHResult stat_leaf(RGWFileHandle* parent, const char *path, + enum rgw_fh_type type = RGW_FS_TYPE_NIL, + uint32_t flags = RGWFileHandle::FLAG_NONE); + + int read(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags); + + int readlink(RGWFileHandle* rgw_fh, uint64_t offset, size_t length, + size_t* bytes_read, void* buffer, uint32_t flags); + + int rename(RGWFileHandle* old_fh, RGWFileHandle* new_fh, + const char *old_name, const char *new_name); + + MkObjResult create(RGWFileHandle* parent, const char *name, struct stat *st, + uint32_t mask, uint32_t flags); + + MkObjResult symlink(RGWFileHandle* parent, const char *name, + const char *link_path, struct stat *st, uint32_t mask, uint32_t flags); + + MkObjResult mkdir(RGWFileHandle* parent, const char *name, struct stat *st, + uint32_t mask, uint32_t flags); + + int unlink(RGWFileHandle* rgw_fh, const char *name, + uint32_t flags = FLAG_NONE); + + /* find existing RGWFileHandle */ + RGWFileHandle* lookup_handle(struct rgw_fh_hk fh_hk) { + + if (state.flags & FLAG_CLOSED) + return nullptr; + + RGWFileHandle::FHCache::Latch lat; + fh_key fhk(fh_hk); + + retry: + RGWFileHandle* fh = + fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/, + fhk /* key */, lat /* serializer */, + RGWFileHandle::FHCache::FLAG_LOCK); + /* LATCHED */ + if (! 
fh) { + if (unlikely(fhk == root_fh.fh.fh_hk)) { + /* lookup for root of this fs */ + fh = &root_fh; + goto out; + } + lsubdout(get_context(), rgw, 0) + << __func__ << " handle lookup failed " << fhk + << dendl; + goto out; + } + fh->mtx.lock(); + if (fh->flags & RGWFileHandle::FLAG_DELETED) { + /* for now, delay briefly and retry */ + lat.lock->unlock(); + fh->mtx.unlock(); /* !LOCKED */ + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + goto retry; /* !LATCHED */ + } + if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) { + lat.lock->unlock(); + fh->mtx.unlock(); + goto retry; /* !LATCHED */ + } + /* LATCHED */ + fh->mtx.unlock(); /* !LOCKED */ + out: + lat.lock->unlock(); /* !LATCHED */ + + /* special case: lookup root_fh */ + if (! fh) { + if (unlikely(fh_hk == root_fh.fh.fh_hk)) { + fh = &root_fh; + } + } + + return fh; + } + + CephContext* get_context() { + return cct; + } + + struct rgw_fs* get_fs() { return &fs; } + + uint64_t get_fsid() { return root_fh.state.dev; } + + RGWUserInfo* get_user() { return &user; } + + void update_user() { + RGWUserInfo _user = user; + int ret = rgw_get_user_info_by_access_key(rgwlib.get_store(), key.id, user); + if (ret != 0) + user = _user; + } + + void close(); + void gc(); + }; /* RGWLibFS */ + +static inline std::string make_uri(const std::string& bucket_name, + const std::string& object_name) { + std::string uri("/"); + uri.reserve(bucket_name.length() + object_name.length() + 2); + uri += bucket_name; + uri += "/"; + uri += object_name; + return uri; +} + +/* + read directory content (buckets) +*/ + +class RGWListBucketsRequest : public RGWLibRequest, + public RGWListBuckets /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + RGWFileHandle::readdir_offset offset; + void* cb_arg; + rgw_readdir_cb rcb; + uint64_t* ioff; + size_t ix; + uint32_t d_count; + bool rcb_eof; // caller forced early stop in readdir cycle + + RGWListBucketsRequest(CephContext* _cct, RGWUserInfo *_user, + RGWFileHandle* _rgw_fh, 
rgw_readdir_cb _rcb, + void* _cb_arg, RGWFileHandle::readdir_offset& _offset) + : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), offset(_offset), + cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0), + rcb_eof(false) { + + using boost::get; + + if (unlikely(!! get(&offset))) { + ioff = get(offset); + const auto& mk = rgw_fh->find_marker(*ioff); + if (mk) { + marker = mk->name; + } + } else { + const char* mk = get(offset); + if (mk) { + marker = mk; + } + } + op = this; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + /* XXX derp derp derp */ + s->relative_uri = "/"; + s->info.request_uri = "/"; // XXX + s->info.effective_uri = "/"; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + limit = -1; /* no limit */ + return 0; + } + + void send_response_begin(bool has_buckets) override { + sent_data = true; + } + + void send_response_data(RGWUserBuckets& buckets) override { + if (!sent_data) + return; + map& m = buckets.get_buckets(); + for (const auto& iter : m) { + boost::string_ref marker{iter.first}; + const RGWBucketEnt& ent = iter.second; + if (! 
this->operator()(ent.bucket.name, marker)) { + /* caller cannot accept more */ + lsubdout(cct, rgw, 5) << "ListBuckets rcb failed" + << " dirent=" << ent.bucket.name + << " call count=" << ix + << dendl; + rcb_eof = true; + return; + } + ++ix; + } + } /* send_response_data */ + + void send_response_end() override { + // do nothing + } + + int operator()(const boost::string_ref& name, + const boost::string_ref& marker) { + uint64_t off = XXH64(name.data(), name.length(), fh_key::seed); + if (!! ioff) { + *ioff = off; + } + /* update traversal cache */ + rgw_fh->add_marker(off, rgw_obj_key{marker.data(), ""}, + RGW_FS_TYPE_DIRECTORY); + ++d_count; + return rcb(name.data(), cb_arg, off, nullptr, 0, RGW_LOOKUP_FLAG_DIR); + } + + bool eof() { + if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) { + bool is_offset = + unlikely(! get(&offset)) || + !! get(offset); + lsubdout(cct, rgw, 15) << "READDIR offset: " << + ((is_offset) ? offset : "(nil)") + << " is_truncated: " << is_truncated + << dendl; + } + return !is_truncated && !rcb_eof; + } + +}; /* RGWListBucketsRequest */ + +/* + read directory content (bucket objects) +*/ + +class RGWReaddirRequest : public RGWLibRequest, + public RGWListBucket /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + RGWFileHandle::readdir_offset offset; + void* cb_arg; + rgw_readdir_cb rcb; + uint64_t* ioff; + size_t ix; + uint32_t d_count; + bool rcb_eof; // caller forced early stop in readdir cycle + + RGWReaddirRequest(CephContext* _cct, RGWUserInfo *_user, + RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb, + void* _cb_arg, RGWFileHandle::readdir_offset& _offset) + : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), offset(_offset), + cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0), + rcb_eof(false) { + + using boost::get; + + if (unlikely(!! 
get(&offset))) { + ioff = get(offset); + const auto& mk = rgw_fh->find_marker(*ioff); + if (mk) { + marker = *mk; + } + } else { + const char* mk = get(offset); + if (mk) { + std::string tmark{rgw_fh->relative_object_name()}; + if (tmark.length() > 0) + tmark += "/"; + tmark += mk; + marker = rgw_obj_key{std::move(tmark), "", ""}; + } + } + + default_max = 1000; // XXX was being omitted + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + /* XXX derp derp derp */ + std::string uri = "/" + rgw_fh->bucket_name() + "/"; + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + prefix = rgw_fh->relative_object_name(); + if (prefix.length() > 0) + prefix += "/"; + delimiter = '/'; + + return 0; + } + + int operator()(const boost::string_ref name, const rgw_obj_key& marker, + const ceph::real_time& t, const uint64_t fsz, uint8_t type) { + + assert(name.length() > 0); // all cases handled in callers + + /* hash offset of name in parent (short name) for NFS readdir cookie */ + uint64_t off = XXH64(name.data(), name.length(), fh_key::seed); + if (unlikely(!! 
ioff)) { + *ioff = off; + } + + /* update traversal cache */ + rgw_fh->add_marker(off, marker, type); + ++d_count; + + /* set c/mtime and size from bucket index entry */ + struct stat st = {}; +#ifdef HAVE_STAT_ST_MTIMESPEC_TV_NSEC + st.st_atimespec = ceph::real_clock::to_timespec(t); + st.st_mtimespec = st.st_atimespec; + st.st_ctimespec = st.st_atimespec; +#else + st.st_atim = ceph::real_clock::to_timespec(t); + st.st_mtim = st.st_atim; + st.st_ctim = st.st_atim; +#endif + st.st_size = fsz; + + return rcb(name.data(), cb_arg, off, &st, RGWFileHandle::RCB_MASK, + (type == RGW_FS_TYPE_DIRECTORY) ? + RGW_LOOKUP_FLAG_DIR : + RGW_LOOKUP_FLAG_FILE); + } + + int get_params() override { + max = default_max; + return 0; + } + + void send_response() override { + struct req_state* s = get_state(); + auto cnow = real_clock::now(); + + /* enumerate objs and common_prefixes in parallel, + * avoiding increment on and end iterator, which is + * undefined */ + + class DirIterator + { + vector& objs; + vector::iterator obj_iter; + + map& common_prefixes; + map::iterator cp_iter; + + boost::optional obj_sref; + boost::optional cp_sref; + bool _skip_cp; + + public: + + DirIterator(vector& objs, + map& common_prefixes) + : objs(objs), common_prefixes(common_prefixes), _skip_cp(false) + { + obj_iter = objs.begin(); + parse_obj(); + cp_iter = common_prefixes.begin(); + parse_cp(); + } + + bool is_obj() { + return (obj_iter != objs.end()); + } + + bool is_cp(){ + return (cp_iter != common_prefixes.end()); + } + + bool eof() { + return ((!is_obj()) && (!is_cp())); + } + + void parse_obj() { + if (is_obj()) { + boost::string_ref sref{obj_iter->key.name}; + size_t last_del = sref.find_last_of('/'); + if (last_del != string::npos) + sref.remove_prefix(last_del+1); + obj_sref = sref; + } + } /* parse_obj */ + + void next_obj() { + ++obj_iter; + parse_obj(); + } + + void parse_cp() { + if (is_cp()) { + /* leading-/ skip case */ + if (cp_iter->first == "/") { + _skip_cp = true; + return; + } 
else + _skip_cp = false; + + /* it's safest to modify the element in place--a suffix-modifying + * string_ref operation is problematic since ULP rgw_file callers + * will ultimately need a c-string */ + if (cp_iter->first.back() == '/') + const_cast(cp_iter->first).pop_back(); + + boost::string_ref sref{cp_iter->first}; + size_t last_del = sref.find_last_of('/'); + if (last_del != string::npos) + sref.remove_prefix(last_del+1); + cp_sref = sref; + } /* is_cp */ + } /* parse_cp */ + + void next_cp() { + ++cp_iter; + parse_cp(); + } + + bool skip_cp() { + return _skip_cp; + } + + bool entry_is_obj() { + return (is_obj() && + ((! is_cp()) || + (obj_sref.get() < cp_sref.get()))); + } + + boost::string_ref get_obj_sref() { + return obj_sref.get(); + } + + boost::string_ref get_cp_sref() { + return cp_sref.get(); + } + + vector::iterator& get_obj_iter() { + return obj_iter; + } + + map::iterator& get_cp_iter() { + return cp_iter; + } + + }; /* DirIterator */ + + DirIterator di{objs, common_prefixes}; + + for (;;) { + + if (di.eof()) { + break; // done + } + + /* assert: one of is_obj() || is_cp() holds */ + if (di.entry_is_obj()) { + auto sref = di.get_obj_sref(); + if (sref.empty()) { + /* recursive list of a leaf dir (iirc), do nothing */ + } else { + /* send a file entry */ + auto obj_entry = *(di.get_obj_iter()); + + lsubdout(cct, rgw, 15) << "RGWReaddirRequest " + << __func__ << " " + << "list uri=" << s->relative_uri << " " + << " prefix=" << prefix << " " + << " obj path=" << obj_entry.key.name + << " (" << sref << ")" << "" + << " mtime=" + << real_clock::to_time_t(obj_entry.meta.mtime) + << " size=" << obj_entry.meta.accounted_size + << dendl; + + if (! 
this->operator()(sref, next_marker, obj_entry.meta.mtime, + obj_entry.meta.accounted_size, + RGW_FS_TYPE_FILE)) { + /* caller cannot accept more */ + lsubdout(cct, rgw, 5) << "readdir rcb caller signalled stop" + << " dirent=" << sref.data() + << " call count=" << ix + << dendl; + rcb_eof = true; + return; + } + } + di.next_obj(); // and advance object + } else { + /* send a dir entry */ + if (! di.skip_cp()) { + auto sref = di.get_cp_sref(); + + lsubdout(cct, rgw, 15) << "RGWReaddirRequest " + << __func__ << " " + << "list uri=" << s->relative_uri << " " + << " prefix=" << prefix << " " + << " cpref=" << sref + << dendl; + + if (sref.empty()) { + /* null path segment--could be created in S3 but has no NFS + * interpretation */ + } else { + if (! this->operator()(sref, next_marker, cnow, 0, + RGW_FS_TYPE_DIRECTORY)) { + /* caller cannot accept more */ + lsubdout(cct, rgw, 5) << "readdir rcb caller signalled stop" + << " dirent=" << sref.data() + << " call count=" << ix + << dendl; + rcb_eof = true; + return; + } + } + } + di.next_cp(); // and advance common_prefixes + } /* ! di.entry_is_obj() */ + } /* for (;;) */ + } + + virtual void send_versioned_response() { + send_response(); + } + + bool eof() { + if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) { + bool is_offset = + unlikely(! get(&offset)) || + !! get(offset); + lsubdout(cct, rgw, 15) << "READDIR offset: " << + ((is_offset) ? 
offset : "(nil)") + << " next marker: " << next_marker + << " is_truncated: " << is_truncated + << dendl; + } + return !is_truncated && !rcb_eof; + } + +}; /* RGWReaddirRequest */ + +/* + dir has-children predicate (bucket objects) +*/ + +class RGWRMdirCheck : public RGWLibRequest, + public RGWListBucket /* RGWOp */ +{ +public: + const RGWFileHandle* rgw_fh; + bool valid; + bool has_children; + + RGWRMdirCheck (CephContext* _cct, RGWUserInfo *_user, + const RGWFileHandle* _rgw_fh) + : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), valid(false), + has_children(false) { + default_max = 2; + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + std::string uri = "/" + rgw_fh->bucket_name() + "/"; + s->relative_uri = uri; + s->info.request_uri = uri; + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + prefix = rgw_fh->relative_object_name(); + if (prefix.length() > 0) + prefix += "/"; + delimiter = '/'; + + return 0; + } + + int get_params() override { + max = default_max; + return 0; + } + + void send_response() override { + valid = true; + if ((objs.size() > 1) || + (! 
objs.empty() && + (objs.front().key.name != prefix))) { + has_children = true; + return; + } + for (auto& iter : common_prefixes) { + /* readdir never produces a name for this case */ + if (iter.first == "/") + continue; + has_children = true; + break; + } + } + + virtual void send_versioned_response() { + send_response(); + } + +}; /* RGWRMdirCheck */ + +/* + create bucket +*/ + +class RGWCreateBucketRequest : public RGWLibRequest, + public RGWCreateBucket /* RGWOp */ +{ +public: + const std::string& bucket_name; + + RGWCreateBucketRequest(CephContext* _cct, RGWUserInfo *_user, + std::string& _bname) + : RGWLibRequest(_cct, _user), bucket_name(_bname) { + op = this; + } + + bool only_bucket() override { return false; } + + int read_permissions(RGWOp* op_obj) override { + /* we ARE a 'create bucket' request (cf. rgw_rest.cc, ll. 1305-6) */ + return 0; + } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "PUT"; + s->op = OP_PUT; + + string uri = "/" + bucket_name; + /* XXX derp derp derp */ + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + struct req_state* s = get_state(); + RGWAccessControlPolicy_S3 s3policy(s->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); + policy = s3policy; + return ret; + } + + void send_response() override { + /* TODO: something (maybe) */ + } +}; /* RGWCreateBucketRequest */ + +/* + delete bucket +*/ + +class RGWDeleteBucketRequest : public RGWLibRequest, + public RGWDeleteBucket /* RGWOp */ +{ +public: + const std::string& bucket_name; + + RGWDeleteBucketRequest(CephContext* _cct, RGWUserInfo *_user, + std::string& _bname) + : RGWLibRequest(_cct, _user), bucket_name(_bname) { + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "DELETE"; + s->op = OP_DELETE; + + string uri = "/" + bucket_name; + /* XXX derp derp derp */ + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + void send_response() override {} + +}; /* RGWDeleteBucketRequest */ + +/* + put object +*/ +class RGWPutObjRequest : public RGWLibRequest, + public RGWPutObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + buffer::list& bl; /* XXX */ + size_t bytes_written; + + RGWPutObjRequest(CephContext* _cct, RGWUserInfo *_user, + const std::string& _bname, const std::string& _oname, + buffer::list& _bl) + : RGWLibRequest(_cct, _user), bucket_name(_bname), obj_name(_oname), + bl(_bl), bytes_written(0) { + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + + int rc = valid_s3_object_name(obj_name); + if (rc != 0) + return rc; + + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "PUT"; + s->op = OP_PUT; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + /* XXX required in RGWOp::execute() */ + s->content_length = bl.length(); + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + struct req_state* s = get_state(); + RGWAccessControlPolicy_S3 s3policy(s->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); + policy = s3policy; + return ret; + } + + int get_data(buffer::list& _bl) override { + /* XXX for now, use sharing semantics */ + _bl.claim(bl); + uint32_t len = _bl.length(); + bytes_written += len; + return len; + } + + void send_response() override {} + + int verify_params() override { + if (bl.length() > cct->_conf->rgw_max_put_size) + return -ERR_TOO_LARGE; + return 0; + } + + buffer::list* get_attr(const std::string& k) { + auto iter = attrs.find(k); + return (iter != attrs.end()) ? &(iter->second) : nullptr; + } + +}; /* RGWPutObjRequest */ + +/* + get object +*/ + +class RGWReadRequest : public RGWLibRequest, + public RGWGetObj /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + void *ulp_buffer; + size_t nread; + size_t read_resid; /* initialize to len, <= sizeof(ulp_buffer) */ + bool do_hexdump = false; + + RGWReadRequest(CephContext* _cct, RGWUserInfo *_user, + RGWFileHandle* _rgw_fh, uint64_t off, uint64_t len, + void *_ulp_buffer) + : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), ulp_buffer(_ulp_buffer), + nread(0), read_resid(len) { + op = this; + + /* fixup RGWGetObj (already know range parameters) */ + RGWGetObj::range_parsed = true; + RGWGetObj::get_data = true; // XXX + RGWGetObj::partial_content = true; + RGWGetObj::ofs = off; + RGWGetObj::end = off + len; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + 
RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + /* XXX derp derp derp */ + s->relative_uri = make_uri(rgw_fh->bucket_name(), + rgw_fh->relative_object_name()); + s->info.request_uri = s->relative_uri; // XXX + s->info.effective_uri = s->relative_uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + return 0; + } + + int send_response_data(ceph::buffer::list& bl, off_t bl_off, + off_t bl_len) override { + size_t bytes; + for (auto& bp : bl.buffers()) { + /* if for some reason bl_off indicates the start-of-data is not at + * the current buffer::ptr, skip it and account */ + if (bl_off > bp.length()) { + bl_off -= bp.length(); + continue; + } + /* read no more than read_resid */ + bytes = std::min(read_resid, size_t(bp.length()-bl_off)); + memcpy(static_cast(ulp_buffer)+nread, bp.c_str()+bl_off, bytes); + read_resid -= bytes; /* reduce read_resid by bytes read */ + nread += bytes; + bl_off = 0; + /* stop if we have no residual ulp_buffer */ + if (! 
read_resid) + break; + } + return 0; + } + + int send_response_data_error() override { + /* S3 implementation just sends nothing--there is no side effect + * to simulate here */ + return 0; + } + +}; /* RGWReadRequest */ + +/* + delete object +*/ + +class RGWDeleteObjRequest : public RGWLibRequest, + public RGWDeleteObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + + RGWDeleteObjRequest(CephContext* _cct, RGWUserInfo *_user, + const std::string& _bname, const std::string& _oname) + : RGWLibRequest(_cct, _user), bucket_name(_bname), obj_name(_oname) { + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "DELETE"; + s->op = OP_DELETE; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + void send_response() override {} + +}; /* RGWDeleteObjRequest */ + +class RGWStatObjRequest : public RGWLibRequest, + public RGWGetObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + uint64_t _size; + uint32_t flags; + + static constexpr uint32_t FLAG_NONE = 0x000; + + RGWStatObjRequest(CephContext* _cct, RGWUserInfo *_user, + const std::string& _bname, const std::string& _oname, + uint32_t _flags) + : RGWLibRequest(_cct, _user), bucket_name(_bname), obj_name(_oname), + _size(0), flags(_flags) { + op = this; + + /* fixup RGWGetObj (already know range parameters) */ + RGWGetObj::range_parsed = true; + RGWGetObj::get_data = false; // XXX + RGWGetObj::partial_content = true; + RGWGetObj::ofs = 0; + RGWGetObj::end = UINT64_MAX; + } + + const char* name() const override { return "stat_obj"; } + RGWOpType get_type() override { return RGW_OP_STAT_OBJ; } + + real_time get_mtime() const { + return lastmod; + } + + /* attributes */ + uint64_t get_size() { return _size; } + real_time ctime() { return mod_time; } // XXX + real_time mtime() { return mod_time; } + std::map& get_attrs() { return attrs; } + + buffer::list* get_attr(const std::string& k) { + auto iter = attrs.find(k); + return (iter != attrs.end()) ? 
&(iter->second) : nullptr; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + /* XXX derp derp derp */ + s->relative_uri = make_uri(bucket_name, obj_name); + s->info.request_uri = s->relative_uri; // XXX + s->info.effective_uri = s->relative_uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + return 0; + } + + int send_response_data(ceph::buffer::list& _bl, off_t s_off, + off_t e_off) override { + /* NOP */ + /* XXX save attrs? */ + return 0; + } + + int send_response_data_error() override { + /* NOP */ + return 0; + } + + void execute() override { + RGWGetObj::execute(); + _size = get_state()->obj_size; + } + +}; /* RGWStatObjRequest */ + +class RGWStatBucketRequest : public RGWLibRequest, + public RGWStatBucket /* RGWOp */ +{ +public: + std::string uri; + std::map attrs; + RGWLibFS::BucketStats& bs; + + RGWStatBucketRequest(CephContext* _cct, RGWUserInfo *_user, + const std::string& _path, + RGWLibFS::BucketStats& _stats) + : RGWLibRequest(_cct, _user), bs(_stats) { + uri = "/" + _path; + op = this; + } + + buffer::list* get_attr(const std::string& k) { + auto iter = attrs.find(k); + return (iter != attrs.end()) ? 
&(iter->second) : nullptr; + } + + real_time get_ctime() const { + return bucket.creation_time; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + /* XXX derp derp derp */ + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + virtual int get_params() { + return 0; + } + + void send_response() override { + bucket.creation_time = get_state()->bucket_info.creation_time; + bs.size = bucket.size; + bs.size_rounded = bucket.size_rounded; + bs.creation_time = bucket.creation_time; + bs.num_entries = bucket.count; + std::swap(attrs, get_state()->bucket_attrs); + } + + bool matched() { + return (bucket.bucket.name.length() > 0); + } + +}; /* RGWStatBucketRequest */ + +class RGWStatLeafRequest : public RGWLibRequest, + public RGWListBucket /* RGWOp */ +{ +public: + RGWFileHandle* rgw_fh; + std::string path; + bool matched; + bool is_dir; + bool exact_matched; + + RGWStatLeafRequest(CephContext* _cct, RGWUserInfo *_user, + RGWFileHandle* _rgw_fh, const std::string& _path) + : RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), path(_path), + matched(false), is_dir(false), exact_matched(false) { + default_max = 1000; // logical max {"foo", "foo/"} + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = 
static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + + /* XXX derp derp derp */ + std::string uri = "/" + rgw_fh->bucket_name() + "/"; + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + prefix = rgw_fh->relative_object_name(); + if (prefix.length() > 0) + prefix += "/"; + prefix += path; + delimiter = '/'; + + return 0; + } + + int get_params() override { + max = default_max; + return 0; + } + + void send_response() override { + struct req_state* s = get_state(); + // try objects + for (const auto& iter : objs) { + auto& name = iter.key.name; + lsubdout(cct, rgw, 15) << "RGWStatLeafRequest " + << __func__ << " " + << "list uri=" << s->relative_uri << " " + << " prefix=" << prefix << " " + << " obj path=" << name << "" + << " target = " << path << "" + << dendl; + /* XXX is there a missing match-dir case (trailing '/')? 
*/ + matched = true; + if (name == path) + exact_matched = true; + return; + } + // try prefixes + for (auto& iter : common_prefixes) { + auto& name = iter.first; + lsubdout(cct, rgw, 15) << "RGWStatLeafRequest " + << __func__ << " " + << "list uri=" << s->relative_uri << " " + << " prefix=" << prefix << " " + << " pref path=" << name << " (not chomped)" + << " target = " << path << "" + << dendl; + matched = true; + /* match-dir case (trailing '/') */ + if (name == prefix + "/") + exact_matched = true; + is_dir = true; + break; + } + } + + virtual void send_versioned_response() { + send_response(); + } +}; /* RGWStatLeafRequest */ + +/* + put object +*/ + +class RGWWriteRequest : public RGWLibContinuedReq, + public RGWPutObj /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + RGWFileHandle* rgw_fh; + std::optional aio; + std::optional processor; + rgw::putobj::DataProcessor* filter; + boost::optional compressor; + CompressorRef plugin; + buffer::list data; + uint64_t timer_id; + MD5 hash; + off_t real_ofs; + size_t bytes_written; + bool eio; + + RGWWriteRequest(CephContext* _cct, RGWUserInfo *_user, RGWFileHandle* _fh, + const std::string& _bname, const std::string& _oname) + : RGWLibContinuedReq(_cct, _user), + bucket_name(_bname), obj_name(_oname), + rgw_fh(_fh), filter(nullptr), real_ofs(0), + bytes_written(0), eio(false) { + + int ret = header_init(); + if (ret == 0) { + ret = init_from_header(get_state()); + } + op = this; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "PUT"; + s->op = OP_PUT; 
+ + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? */ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + struct req_state* s = get_state(); + RGWAccessControlPolicy_S3 s3policy(s->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); + policy = s3policy; + return ret; + } + + int get_data(buffer::list& _bl) override { + /* XXX for now, use sharing semantics */ + uint32_t len = data.length(); + _bl.claim(data); + bytes_written += len; + return len; + } + + void put_data(off_t off, buffer::list& _bl) { + if (off != real_ofs) { + eio = true; + } + data.claim(_bl); + real_ofs += data.length(); + ofs = off; /* consumed in exec_continue() */ + } + + int exec_start() override; + int exec_continue() override; + int exec_finish() override; + + void send_response() override {} + + int verify_params() override { + return 0; + } +}; /* RGWWriteRequest */ + +/* + copy object +*/ +class RGWCopyObjRequest : public RGWLibRequest, + public RGWCopyObj /* RGWOp */ +{ +public: + RGWFileHandle* src_parent; + RGWFileHandle* dst_parent; + const std::string& src_name; + const std::string& dst_name; + + RGWCopyObjRequest(CephContext* _cct, RGWUserInfo *_user, + RGWFileHandle* _src_parent, RGWFileHandle* _dst_parent, + const std::string& _src_name, const std::string& _dst_name) + : RGWLibRequest(_cct, _user), src_parent(_src_parent), + dst_parent(_dst_parent), src_name(_src_name), dst_name(_dst_name) { + /* all requests have this */ + op = this; + + /* allow this request to replace selected attrs */ + attrs_mod = RGWRados::ATTRSMOD_MERGE; + } + + bool only_bucket() override { return true; } + + int op_init() override { + // assign store, s, and 
dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "PUT"; // XXX check + s->op = OP_PUT; + + src_bucket_name = src_parent->bucket_name(); + // need s->src_bucket_name? + src_object.name = src_parent->format_child_name(src_name, false); + // need s->src_object? + + dest_bucket_name = dst_parent->bucket_name(); + // need s->bucket.name? + dest_object = dst_parent->format_child_name(dst_name, false); + // need s->object_name? + + int rc = valid_s3_object_name(dest_object); + if (rc != 0) + return rc; + + /* XXX and fixup key attr (could optimize w/string ref and + * dest_object) */ + buffer::list ux_key; + fh_key fhk = dst_parent->make_fhk(dst_name); + rgw::encode(fhk, ux_key); + emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key)); + +#if 0 /* XXX needed? */ + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ +#endif + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + struct req_state* s = get_state(); + RGWAccessControlPolicy_S3 s3policy(s->cct); + /* we don't have (any) headers, so just create canned ACLs */ + int ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); + dest_policy = s3policy; + return ret; + } + + void send_response() override {} + void send_partial_response(off_t ofs) override {} + +}; /* RGWCopyObjRequest */ + +class RGWSetAttrsRequest : public RGWLibRequest, + public RGWSetAttrs /* RGWOp */ +{ +public: + const std::string& bucket_name; + const std::string& obj_name; + + RGWSetAttrsRequest(CephContext* _cct, RGWUserInfo *_user, + const std::string& _bname, const std::string& _oname) + : RGWLibRequest(_cct, _user), bucket_name(_bname), obj_name(_oname) { + op = this; + } + + bool only_bucket() override { return false; } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + + struct req_state* s = get_state(); + s->info.method = "PUT"; + s->op = OP_PUT; + + /* XXX derp derp derp */ + std::string uri = make_uri(bucket_name, obj_name); + s->relative_uri = uri; + s->info.request_uri = uri; // XXX + s->info.effective_uri = uri; + s->info.request_params = ""; + s->info.domain = ""; /* XXX ? 
*/ + + // woo + s->user = user; + s->bucket_tenant = user->user_id.tenant; + + return 0; + } + + int get_params() override { + return 0; + } + + void send_response() override {} + +}; /* RGWSetAttrsRequest */ + +/* + * Send request to get the rados cluster stats + */ +class RGWGetClusterStatReq : public RGWLibRequest, + public RGWGetClusterStat { +public: + struct rados_cluster_stat_t& stats_req; + RGWGetClusterStatReq(CephContext* _cct,RGWUserInfo *_user, + rados_cluster_stat_t& _stats): + RGWLibRequest(_cct, _user), stats_req(_stats){ + op = this; + } + + int op_init() override { + // assign store, s, and dialect_handler + RGWObjectCtx* rados_ctx + = static_cast(get_state()->obj_ctx); + // framework promises to call op_init after parent init + ceph_assert(rados_ctx); + RGWOp::init(rados_ctx->get_store(), get_state(), this); + op = this; // assign self as op: REQUIRED + return 0; + } + + int header_init() override { + struct req_state* s = get_state(); + s->info.method = "GET"; + s->op = OP_GET; + s->user = user; + return 0; + } + + int get_params() override { return 0; } + bool only_bucket() override { return false; } + void send_response() override { + stats_req.kb = stats_op.kb; + stats_req.kb_avail = stats_op.kb_avail; + stats_req.kb_used = stats_op.kb_used; + stats_req.num_objects = stats_op.num_objects; + } +}; /* RGWGetClusterStatReq */ + + +} /* namespace rgw */ + +#endif /* RGW_FILE_H */ diff --git a/src/rgw/rgw_formats.cc b/src/rgw/rgw_formats.cc new file mode 100644 index 00000000..f8abf72f --- /dev/null +++ b/src/rgw/rgw_formats.cc @@ -0,0 +1,374 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#include + +#include "common/escape.h" +#include "common/Formatter.h" +#include "rgw/rgw_common.h" +#include "rgw/rgw_formats.h" +#include "rgw/rgw_rest.h" + +#define LARGE_SIZE 8192 + +#define dout_subsys ceph_subsys_rgw + +RGWFormatter_Plain::RGWFormatter_Plain(const bool ukv) + : use_kv(ukv) +{ +} + +RGWFormatter_Plain::~RGWFormatter_Plain() +{ + free(buf); +} + +void RGWFormatter_Plain::flush(ostream& os) +{ + if (!buf) + return; + + if (len) { + os << buf; + os.flush(); + } + + reset_buf(); +} + +void RGWFormatter_Plain::reset_buf() +{ + free(buf); + buf = NULL; + len = 0; + max_len = 0; +} + +void RGWFormatter_Plain::reset() +{ + reset_buf(); + stack.clear(); + min_stack_level = 0; +} + +void RGWFormatter_Plain::open_array_section(const char *name) +{ + struct plain_stack_entry new_entry; + new_entry.is_array = true; + new_entry.size = 0; + + if (use_kv && min_stack_level > 0 && !stack.empty()) { + struct plain_stack_entry& entry = stack.back(); + + if (!entry.is_array) + dump_format(name, ""); + } + + stack.push_back(new_entry); +} + +void RGWFormatter_Plain::open_array_section_in_ns(const char *name, const char *ns) +{ + ostringstream oss; + oss << name << " " << ns; + open_array_section(oss.str().c_str()); +} + +void RGWFormatter_Plain::open_object_section(const char *name) +{ + struct plain_stack_entry new_entry; + new_entry.is_array = false; + new_entry.size = 0; + + if (use_kv && min_stack_level > 0) + dump_format(name, ""); + + stack.push_back(new_entry); +} + +void RGWFormatter_Plain::open_object_section_in_ns(const char *name, + const char *ns) +{ + ostringstream oss; + oss << name << " " << ns; + open_object_section(oss.str().c_str()); +} + +void RGWFormatter_Plain::close_section() +{ + stack.pop_back(); +} + +void RGWFormatter_Plain::dump_unsigned(const char *name, uint64_t u) +{ + dump_value_int(name, "%" PRIu64, u); +} + +void RGWFormatter_Plain::dump_int(const char *name, int64_t u) +{ + dump_value_int(name, "%" 
PRId64, u);
+}
+
+/* Emit a floating-point value using fixed "%f" notation. */
+void RGWFormatter_Plain::dump_float(const char *name, double d)
+{
+  dump_value_int(name, "%f", d);
+}
+
+/* Emit a string value.
+ *
+ * A std::string_view is not guaranteed to be NUL-terminated, so passing
+ * s.data() to a bare "%s" may read past the end of the view.  The "%.*s"
+ * precision bounds the read to s.size() bytes.
+ */
+void RGWFormatter_Plain::dump_string(const char *name, std::string_view s)
+{
+  dump_format(name, "%.*s", static_cast<int>(s.size()), s.data());
+}
+
+std::ostream& RGWFormatter_Plain::dump_stream(const char *name)
+{
+  // TODO: implement this!
+  ceph_abort();
+}
+
+/* Format one value into a LARGE_SIZE stack buffer and append it to the
+ * output buffer via write_data().
+ *
+ * Unless use_kv is set, only the first value dumped into a section at the
+ * shallowest nesting level seen so far (min_stack_level) is printed; every
+ * other call just bumps the enclosing section's element count.
+ *
+ * NOTE(review): stack.back() is taken unconditionally, so this appears to
+ * require that a section has been opened first -- confirm against callers.
+ */
+void RGWFormatter_Plain::dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char buf[LARGE_SIZE];
+
+  struct plain_stack_entry& entry = stack.back();
+
+  if (!min_stack_level)
+    min_stack_level = stack.size();
+
+  bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv);
+
+  entry.size++;
+
+  if (!should_print)
+    return;
+
+  vsnprintf(buf, LARGE_SIZE, fmt, ap);
+
+  /* separator written before this value: nothing for the very first output,
+   * ", " between array elements in key/value mode, a newline otherwise */
+  const char *eol;
+  if (wrote_something) {
+    if (use_kv && entry.is_array && entry.size > 1)
+      eol = ", ";
+    else
+      eol = "\n";
+  } else
+    eol = "";
+  wrote_something = true;
+
+  if (use_kv && !entry.is_array)
+    write_data("%s%s: %s", eol, name, buf);
+  else
+    write_data("%s%s", eol, buf);
+}
+
+/* Number of payload bytes currently buffered. */
+int RGWFormatter_Plain::get_len() const
+{
+  // don't include null termination in length
+  return (len ? len - 1 : 0);
+}
+
+/* Append a C string verbatim (it is passed as a "%s" argument, so '%'
+ * characters inside data are not interpreted). */
+void RGWFormatter_Plain::write_raw_data(const char *data)
+{
+  write_data("%s", data);
+}
+
+/* printf-style append to the internal buffer.  The text is first rendered
+ * into a small stack buffer; if it does not fit, a heap buffer of the
+ * required size is used instead. */
+void RGWFormatter_Plain::write_data(const char *fmt, ...)
+{
+#define LARGE_ENOUGH_LEN 128
+  int n, size = LARGE_ENOUGH_LEN;
+  char s[size + 8];
+  char *p, *np;
+  bool p_on_stack;
+  va_list ap;
+  int pos;
+
+  p = s;
+  p_on_stack = true;
+
+  while (1) {
+    va_start(ap, fmt);
+    n = vsnprintf(p, size, fmt, ap);
+    va_end(ap);
+
+    if (n > -1 && n < size)
+      goto done;
+    /* Else try again with more space. 
*/
+    if (n > -1)    /* glibc 2.1 */
+      size = n+1; /* precisely what is needed */
+    else           /* glibc 2.0 */
+      size *= 2;  /* twice the old size */
+    if (p_on_stack)
+      np = (char *)malloc(size + 8);
+    else
+      np = (char *)realloc(p, size + 8);
+    if (!np)
+      goto done_free;
+    p = np;
+    p_on_stack = false;
+  }
+done:
+#define LARGE_ENOUGH_BUF 4096
+  /* max_len must always describe the real allocation behind buf, so it is
+   * only updated once the (re)allocation is known to have succeeded.  If it
+   * were raised first and realloc() then failed, a later call would skip the
+   * grow step and strcpy() past the end of the old, smaller buffer. */
+  if (!buf) {
+    const int initial_len = std::max(LARGE_ENOUGH_BUF, size);
+    buf = (char *)malloc(initial_len);
+    if (!buf) {
+      cerr << "ERROR: RGWFormatter_Plain::write_data: failed allocating " << initial_len << " bytes" << std::endl;
+      goto done_free;
+    }
+    max_len = initial_len;
+  }
+
+  if (len + size > max_len) {
+    const int grown_len = len + size + LARGE_ENOUGH_BUF;
+    void *_realloc = realloc(buf, grown_len);
+    if (_realloc == NULL) {
+      /* realloc() leaves the old block intact on failure: keep buf/max_len */
+      cerr << "ERROR: RGWFormatter_Plain::write_data: failed allocating " << grown_len << " bytes" << std::endl;
+      goto done_free;
+    }
+    buf = (char *)_realloc;
+    max_len = grown_len;
+  }
+
+  pos = len;
+  if (len)
+    pos--; // squash null termination
+  strcpy(buf + pos, p);
+  len = pos + strlen(p) + 1;
+done_free:
+  if (!p_on_stack)
+    free(p);
+}
+
+/* printf-style counterpart of dump_format_va() used by the numeric dumpers.
+ * Applies the same "print only the first top-level value unless use_kv"
+ * rule, but always separates values with a newline.
+ *
+ * NOTE(review): stack.back() is taken unconditionally, so an open section
+ * appears to be required -- confirm against callers.
+ */
+void RGWFormatter_Plain::dump_value_int(const char *name, const char *fmt, ...)
+{
+  char buf[LARGE_SIZE];
+  va_list ap;
+
+  if (!min_stack_level)
+    min_stack_level = stack.size();
+
+  struct plain_stack_entry& entry = stack.back();
+  bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv);
+
+  entry.size++;
+
+  if (!should_print)
+    return;
+
+  va_start(ap, fmt);
+  vsnprintf(buf, LARGE_SIZE, fmt, ap);
+  va_end(ap);
+
+  const char *eol;
+  if (wrote_something) {
+    eol = "\n";
+  } else
+    eol = "";
+  wrote_something = true;
+
+  if (use_kv && !entry.is_array)
+    write_data("%s%s: %s", eol, name, buf);
+  else
+    write_data("%s%s", eol, buf);
+
+}
+
+
+/* An utility class that serves as a mean to access the protected static
+ * methods of XMLFormatter. 
*/ +class HTMLHelper : public XMLFormatter { +public: + static std::string escape(const std::string& unescaped_str) { + int len = escape_xml_attr_len(unescaped_str.c_str()); + std::string escaped(len, 0); + escape_xml_attr(unescaped_str.c_str(), escaped.data()); + return escaped; + } +}; + +void RGWSwiftWebsiteListingFormatter::generate_header( + const std::string& dir_path, + const std::string& css_path) +{ + ss << R"()"; + + ss << "Listing of " << xml_stream_escaper(dir_path) + << ""; + + if (! css_path.empty()) { + ss << boost::format(R"()") + % url_encode(css_path); + } else { + ss << R"()"; + } + + ss << ""; + + ss << R"(

Listing of )" << xml_stream_escaper(dir_path) << "

" + << R"()" + << R"()" + << R"()" + << R"()" + << R"()" + << R"()"; + + if (! prefix.empty()) { + ss << R"()" + << R"()" + << R"()" + << R"()" + << R"()"; + } +} + +void RGWSwiftWebsiteListingFormatter::generate_footer() +{ + ss << R"(
NameSizeDate
../  
)"; +} + +std::string RGWSwiftWebsiteListingFormatter::format_name( + const std::string& item_name) const +{ + return item_name.substr(prefix.length()); +} + +void RGWSwiftWebsiteListingFormatter::dump_object(const rgw_bucket_dir_entry& objent) +{ + const auto name = format_name(objent.key.name); + ss << boost::format(R"()") + % "default" + << boost::format(R"(%s)") + % url_encode(name) + % HTMLHelper::escape(name) + << boost::format(R"(%lld)") % objent.meta.size + << boost::format(R"(%s)") + % dump_time_to_str(objent.meta.mtime) + << R"()"; +} + +void RGWSwiftWebsiteListingFormatter::dump_subdir(const std::string& name) +{ + const auto fname = format_name(name); + ss << R"()" + << boost::format(R"(%s)") + % url_encode(fname) + % HTMLHelper::escape(fname) + << R"( )" + << R"( )" + << R"()"; +} diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h new file mode 100644 index 00000000..10cc0deb --- /dev/null +++ b/src/rgw/rgw_formats.h @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_FORMATS_H +#define CEPH_RGW_FORMATS_H + +#include "common/Formatter.h" + +#include +#include +#include +#include + +struct plain_stack_entry { + int size; + bool is_array; +}; + +/* FIXME: this class is mis-named. + * FIXME: This was a hack to send certain swift messages. + * There is a much better way to do this. 
+ */ +class RGWFormatter_Plain : public Formatter { + void reset_buf(); +public: + explicit RGWFormatter_Plain(bool use_kv = false); + ~RGWFormatter_Plain() override; + + void set_status(int status, const char* status_name) override {}; + void output_header() override {}; + void output_footer() override {}; + void enable_line_break() override {}; + void flush(ostream& os) override; + void reset() override; + + void open_array_section(const char *name) override; + void open_array_section_in_ns(const char *name, const char *ns) override; + void open_object_section(const char *name) override; + void open_object_section_in_ns(const char *name, const char *ns) override; + void close_section() override; + void dump_unsigned(const char *name, uint64_t u) override; + void dump_int(const char *name, int64_t u) override; + void dump_float(const char *name, double d) override; + void dump_string(const char *name, std::string_view s) override; + std::ostream& dump_stream(const char *name) override; + void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override; + int get_len() const override; + void write_raw_data(const char *data) override; + +private: + void write_data(const char *fmt, ...); + void dump_value_int(const char *name, const char *fmt, ...); + + char *buf = nullptr; + int len = 0; + int max_len = 0; + + std::list stack; + size_t min_stack_level = 0; + bool use_kv; + bool wrote_something = 0; +}; + + +/* This is a presentation layer. No logic inside, please. */ +class RGWSwiftWebsiteListingFormatter { + std::ostream& ss; + const std::string prefix; +protected: + std::string format_name(const std::string& item_name) const; +public: + RGWSwiftWebsiteListingFormatter(std::ostream& ss, + std::string prefix) + : ss(ss), + prefix(std::move(prefix)) { + } + + /* The supplied css_path can be empty. In such situation a default, + * embedded style sheet will be generated. 
*/ + void generate_header(const std::string& dir_path, + const std::string& css_path); + void generate_footer(); + void dump_object(const rgw_bucket_dir_entry& objent); + void dump_subdir(const std::string& name); +}; + + +class RGWFormatterFlusher { +protected: + Formatter *formatter; + bool flushed; + bool started; + virtual void do_flush() = 0; + virtual void do_start(int ret) {} + void set_formatter(Formatter *f) { + formatter = f; + } +public: + explicit RGWFormatterFlusher(Formatter *f) : formatter(f), flushed(false), started(false) {} + virtual ~RGWFormatterFlusher() {} + + void flush() { + do_flush(); + flushed = true; + } + + virtual void start(int client_ret) { + if (!started) + do_start(client_ret); + started = true; + } + + Formatter *get_formatter() { return formatter; } + bool did_flush() { return flushed; } + bool did_start() { return started; } +}; + +class RGWStreamFlusher : public RGWFormatterFlusher { + ostream& os; +protected: + void do_flush() override { + formatter->flush(os); + } +public: + RGWStreamFlusher(Formatter *f, ostream& _os) : RGWFormatterFlusher(f), os(_os) {} +}; + +class RGWNullFlusher : public RGWFormatterFlusher { +protected: + void do_flush() override { + } +public: + RGWNullFlusher() : RGWFormatterFlusher(nullptr) {} +}; + +#endif diff --git a/src/rgw/rgw_frontend.cc b/src/rgw/rgw_frontend.cc new file mode 100644 index 00000000..f22ec124 --- /dev/null +++ b/src/rgw/rgw_frontend.cc @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "rgw_frontend.h" +#include "include/str_list.h" + +#include "include/ceph_assert.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +int RGWFrontendConfig::parse_config(const string& config, + std::multimap& config_map) +{ + for (auto& entry : get_str_vec(config, " ")) { + string key; + string val; + + if (framework.empty()) { + framework = entry; + dout(0) << "framework: " << 
framework << dendl;
+      continue;
+    }
+
+    ssize_t pos = entry.find('=');
+    if (pos < 0) {
+      dout(0) << "framework conf key: " << entry << dendl;
+      config_map.emplace(std::move(entry), "");
+      continue;
+    }
+
+    int ret = parse_key_value(entry, key, val);
+    if (ret < 0) {
+      cerr << "ERROR: can't parse " << entry << std::endl;
+      return ret;
+    }
+
+    dout(0) << "framework conf key: " << key << ", val: " << val << dendl;
+    config_map.emplace(std::move(key), std::move(val));
+  }
+
+  return 0;
+}
+
+/* Look up a string option.
+ *
+ * @param key      option name
+ * @param def_val  value stored in *out when the key is absent
+ * @param out      receives the configured value or def_val
+ * @return true if the key was present in the parsed config, false otherwise
+ */
+bool RGWFrontendConfig::get_val(const string& key, const string& def_val,
+                                string *out)
+{
+  auto iter = config_map.find(key);
+  if (iter == config_map.end()) {
+    *out = def_val;
+    return false;
+  }
+
+  *out = iter->second;
+  return true;
+}
+
+/* Look up an integer option.
+ *
+ * Mirrors the string overload: returns true only when the key is present
+ * and its value parses as a base-10 integer.  When the key is missing or
+ * the value is malformed, *out is set to def_val and false is returned.
+ *
+ * The return type is bool, so the former `return -EINVAL` was converted to
+ * true and the former `return 0` on success to false -- the exact inverse
+ * of the string overload's contract.
+ */
+bool RGWFrontendConfig::get_val(const string& key, int def_val, int *out)
+{
+  string str;
+  bool found = get_val(key, "", &str);
+  if (!found) {
+    *out = def_val;
+    return false;
+  }
+  string err;
+  const int parsed = strict_strtol(str.c_str(), 10, &err);
+  if (!err.empty()) {
+    cerr << "error parsing int: " << str << ": " << err << std::endl;
+    *out = def_val;
+    return false;
+  }
+  *out = parsed;
+  return true;
+}
+
+/* Close the process' listening fd, then signal the control thread with
+ * SIGUSR1. */
+void RGWProcessFrontend::stop()
+{
+  pprocess->close_fd();
+  thread->kill(SIGUSR1);
+}
diff --git a/src/rgw/rgw_frontend.h b/src/rgw/rgw_frontend.h
new file mode 100644
index 00000000..c797e4d5
--- /dev/null
+++ b/src/rgw/rgw_frontend.h
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RGW_FRONTEND_H
+#define RGW_FRONTEND_H
+
+#include
+#include
+
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_realm_reloader.h"
+
+#include "rgw_civetweb.h"
+#include "rgw_civetweb_log.h"
+#include "civetweb/civetweb.h"
+#include "rgw_auth_registry.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::dmclock {
+  class SyncScheduler;
+  class ClientConfig;
+  class SchedulerCtx;
+}
+
+class RGWFrontendConfig {
+  std::string config;
+  std::multimap 
config_map; + std::string framework; + + int parse_config(const std::string& config, + std::multimap& config_map); + +public: + explicit RGWFrontendConfig(const std::string& config) + : config(config) { + } + + int init() { + const int ret = parse_config(config, config_map); + return ret < 0 ? ret : 0; + } + + bool get_val(const std::string& key, + const std::string& def_val, + std::string* out); + bool get_val(const std::string& key, int def_val, int *out); + + std::string get_val(const std::string& key, + const std::string& def_val) { + std::string out; + get_val(key, def_val, &out); + return out; + } + + const std::string& get_config() { + return config; + } + + std::multimap& get_config_map() { + return config_map; + } + + std::string get_framework() const { + return framework; + } +}; + +class RGWFrontend { +public: + virtual ~RGWFrontend() {} + + virtual int init() = 0; + + virtual int run() = 0; + virtual void stop() = 0; + virtual void join() = 0; + + virtual void pause_for_new_config() = 0; + virtual void unpause_with_new_config(RGWRados* store, + rgw_auth_registry_ptr_t auth_registry) = 0; +}; + + +struct RGWMongooseEnv : public RGWProcessEnv { + // every request holds a read lock, so we need to prioritize write locks to + // avoid starving pause_for_new_config() + static constexpr bool prioritize_write = true; + RWLock mutex; + + explicit RGWMongooseEnv(const RGWProcessEnv &env) + : RGWProcessEnv(env), + mutex("RGWCivetWebFrontend", false, true, prioritize_write) { + } +}; + + +class RGWCivetWebFrontend : public RGWFrontend { + RGWFrontendConfig* conf; + struct mg_context* ctx; + RGWMongooseEnv env; + + std::unique_ptr scheduler; + std::unique_ptr client_config; + + void set_conf_default(std::multimap& m, + const std::string& key, + const std::string& def_val) { + if (m.find(key) == std::end(m)) { + m.emplace(key, def_val); + } + } + + CephContext* cct() const { return env.store->ctx(); } +public: + RGWCivetWebFrontend(RGWProcessEnv& env, + 
RGWFrontendConfig *conf, + rgw::dmclock::SchedulerCtx& sched_ctx); + + int init() override { + return 0; + } + + int run() override; + + int process(struct mg_connection* conn); + + void stop() override { + if (ctx) { + mg_stop(ctx); + } + } + + void join() override { + return; + } + + void pause_for_new_config() override { + // block callbacks until unpause + env.mutex.get_write(); + } + + void unpause_with_new_config(RGWRados* const store, + rgw_auth_registry_ptr_t auth_registry) override { + env.store = store; + env.auth_registry = std::move(auth_registry); + // unpause callbacks + env.mutex.put_write(); + } +}; /* RGWCivetWebFrontend */ + +class RGWProcessFrontend : public RGWFrontend { +protected: + RGWFrontendConfig* conf; + RGWProcess* pprocess; + RGWProcessEnv env; + RGWProcessControlThread* thread; + +public: + RGWProcessFrontend(RGWProcessEnv& pe, RGWFrontendConfig* _conf) + : conf(_conf), pprocess(nullptr), env(pe), thread(nullptr) { + } + + ~RGWProcessFrontend() override { + delete thread; + delete pprocess; + } + + int run() override { + ceph_assert(pprocess); /* should have initialized by init() */ + thread = new RGWProcessControlThread(pprocess); + thread->create("rgw_frontend"); + return 0; + } + + void stop() override; + + void join() override { + thread->join(); + } + + void pause_for_new_config() override { + pprocess->pause(); + } + + void unpause_with_new_config(RGWRados* const store, + rgw_auth_registry_ptr_t auth_registry) override { + env.store = store; + env.auth_registry = auth_registry; + pprocess->unpause_with_new_config(store, std::move(auth_registry)); + } +}; /* RGWProcessFrontend */ + +class RGWFCGXFrontend : public RGWProcessFrontend { +public: + RGWFCGXFrontend(RGWProcessEnv& pe, RGWFrontendConfig* _conf) + : RGWProcessFrontend(pe, _conf) {} + + int init() override { + pprocess = new RGWFCGXProcess(g_ceph_context, &env, + g_conf()->rgw_thread_pool_size, conf); + return 0; + } +}; /* RGWFCGXFrontend */ + +class RGWLoadGenFrontend : 
public RGWProcessFrontend { +public: + RGWLoadGenFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf) + : RGWProcessFrontend(pe, _conf) {} + + int init() override { + int num_threads; + conf->get_val("num_threads", g_conf()->rgw_thread_pool_size, &num_threads); + RGWLoadGenProcess *pp = new RGWLoadGenProcess(g_ceph_context, &env, + num_threads, conf); + + pprocess = pp; + + string uid_str; + conf->get_val("uid", "", &uid_str); + if (uid_str.empty()) { + derr << "ERROR: uid param must be specified for loadgen frontend" + << dendl; + return -EINVAL; + } + + rgw_user uid(uid_str); + + RGWUserInfo user_info; + int ret = rgw_get_user_info_by_uid(env.store, uid, user_info, NULL); + if (ret < 0) { + derr << "ERROR: failed reading user info: uid=" << uid << " ret=" + << ret << dendl; + return ret; + } + + map::iterator aiter = user_info.access_keys.begin(); + if (aiter == user_info.access_keys.end()) { + derr << "ERROR: user has no S3 access keys set" << dendl; + return -EINVAL; + } + + pp->set_access_key(aiter->second); + + return 0; + } +}; /* RGWLoadGenFrontend */ + +// FrontendPauser implementation for RGWRealmReloader +class RGWFrontendPauser : public RGWRealmReloader::Pauser { + std::list &frontends; + RGWRealmReloader::Pauser* pauser; + rgw::auth::ImplicitTenants& implicit_tenants; + + public: + RGWFrontendPauser(std::list &frontends, + rgw::auth::ImplicitTenants& implicit_tenants, + RGWRealmReloader::Pauser* pauser = nullptr) + : frontends(frontends), + pauser(pauser), + implicit_tenants(implicit_tenants) { + } + + void pause() override { + for (auto frontend : frontends) + frontend->pause_for_new_config(); + if (pauser) + pauser->pause(); + } + void resume(RGWRados *store) override { + /* Initialize the registry of auth strategies which will coordinate + * the dynamic reconfiguration. 
*/ + auto auth_registry = \ + rgw::auth::StrategyRegistry::create(g_ceph_context, implicit_tenants, store); + + for (auto frontend : frontends) + frontend->unpause_with_new_config(store, auth_registry); + if (pauser) + pauser->resume(store); + } +}; + +#endif /* RGW_FRONTEND_H */ diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc new file mode 100644 index 00000000..0b99e087 --- /dev/null +++ b/src/rgw/rgw_gc.cc @@ -0,0 +1,528 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_gc.h" + +#include "include/scope_guard.h" +#include "rgw_tools.h" +#include "include/rados/librados.hpp" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/refcount/cls_refcount_client.h" +#include "rgw_perf_counters.h" +#include "cls/lock/cls_lock_client.h" +#include "include/random.h" + +#include // XXX +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace librados; + +static string gc_oid_prefix = "gc"; +static string gc_index_lock_name = "gc_process"; + + +void RGWGC::initialize(CephContext *_cct, RGWRados *_store) { + cct = _cct; + store = _store; + + max_objs = min(static_cast(cct->_conf->rgw_gc_max_objs), rgw_shards_max()); + + obj_names = new string[max_objs]; + + for (int i = 0; i < max_objs; i++) { + obj_names[i] = gc_oid_prefix; + char buf[32]; + snprintf(buf, 32, ".%d", i); + obj_names[i].append(buf); + } +} + +void RGWGC::finalize() +{ + delete[] obj_names; +} + +int RGWGC::tag_index(const string& tag) +{ + return rgw_shard_id(tag, max_objs); +} + +void RGWGC::add_chain(ObjectWriteOperation& op, cls_rgw_obj_chain& chain, const string& tag) +{ + cls_rgw_gc_obj_info info; + info.chain = chain; + info.tag = tag; + + cls_rgw_gc_set_entry(op, cct->_conf->rgw_gc_obj_min_wait, info); +} + +int RGWGC::send_chain(cls_rgw_obj_chain& chain, const string& tag, bool sync) +{ + ObjectWriteOperation op; + add_chain(op, chain, tag); + + int i = tag_index(tag); + + if (sync) + 
return store->gc_operate(obj_names[i], &op); + + return store->gc_aio_operate(obj_names[i], &op); +} + +int RGWGC::defer_chain(const string& tag, bool sync) +{ + ObjectWriteOperation op; + cls_rgw_gc_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, tag); + + int i = tag_index(tag); + + if (sync) + return store->gc_operate(obj_names[i], &op); + + return store->gc_aio_operate(obj_names[i], &op); +} + +int RGWGC::remove(int index, const std::vector& tags, AioCompletion **pc) +{ + ObjectWriteOperation op; + cls_rgw_gc_remove(op, tags); + return store->gc_aio_operate(obj_names[index], &op, pc); +} + +int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated) +{ + result.clear(); + string next_marker; + + for (; *index < max_objs && result.size() < max; (*index)++, marker.clear()) { + std::list entries; + int ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated, next_marker); + if (ret == -ENOENT) + continue; + if (ret < 0) + return ret; + + std::list::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + result.push_back(*iter); + } + + marker = next_marker; + + if (*index == max_objs - 1) { + /* we cut short here, truncated will hold the correct value */ + return 0; + } + + if (result.size() == max) { + /* close approximation, it might be that the next of the objects don't hold + * anything, in this case truncated should have been false, but we can find + * that out on the next iteration + */ + *truncated = true; + return 0; + } + + } + *truncated = false; + + return 0; +} + +class RGWGCIOManager { + const DoutPrefixProvider* dpp; + CephContext *cct; + RGWGC *gc; + + struct IO { + enum Type { + UnknownIO = 0, + TailIO = 1, + IndexIO = 2, + } type{UnknownIO}; + librados::AioCompletion *c{nullptr}; + string oid; + int index{-1}; + string tag; + }; + + deque ios; + vector > remove_tags; + /* tracks the number of remaining 
shadow objects for a given tag in order to
+   * only remove the tag once all shadow objects have themselves been removed
+   */
+  /* NOTE(review): the template arguments of this member were lost in
+   * extraction; map<string, size_t> is inferred from the find()/emplace()/
+   * ->second use in schedule_tag_removal() and add_tag_io_size() -- confirm. */
+  vector<map<string, size_t> > tag_io_size;
+
+#define MAX_AIO_DEFAULT 10
+  size_t max_aio{MAX_AIO_DEFAULT};
+
+public:
+  /* Sizes the per-shard bookkeeping vectors from rgw_gc_max_objs and takes
+   * the in-flight limit from rgw_gc_max_concurrent_io. */
+  RGWGCIOManager(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp),
+                                                                                  cct(_cct),
+                                                                                  gc(_gc),
+                                                                                  remove_tags(cct->_conf->rgw_gc_max_objs),
+                                                                                  tag_io_size(cct->_conf->rgw_gc_max_objs) {
+    max_aio = cct->_conf->rgw_gc_max_concurrent_io;
+  }
+
+  /* Drop the completion handles of any IO still queued.  Iterate by
+   * reference: each IO carries two strings that need not be copied. */
+  ~RGWGCIOManager() {
+    for (auto& io : ios) {
+      io.c->release();
+    }
+  }
+
+  /* Submit an asynchronous tail-object operation and queue it for later
+   * completion handling.
+   *
+   * Blocks (processing completions) while more than max_aio IOs are
+   * pending; gives up and returns 0 if GC is shutting down meanwhile.
+   *
+   * @return 0 on success or shutdown, negative error from aio_operate()
+   */
+  int schedule_io(IoCtx *ioctx, const string& oid, ObjectWriteOperation *op,
+                  int index, const string& tag) {
+    while (ios.size() > max_aio) {
+      if (gc->going_down()) {
+        return 0;
+      }
+      handle_next_completion();
+    }
+
+    AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
+    int ret = ioctx->aio_operate(oid, c, op);
+    if (ret < 0) {
+      /* the completion was never queued, so nobody else will release it */
+      c->release();
+      return ret;
+    }
+    ios.push_back(IO{IO::TailIO, c, oid, index, tag});
+
+    return 0;
+  }
+
+  /* Wait for the oldest pending IO, release its completion and act on the
+   * result: -ENOENT counts as success; index IOs only log failures; a
+   * successful tail IO feeds schedule_tag_removal(). */
+  void handle_next_completion() {
+    ceph_assert(!ios.empty());
+    IO& io = ios.front();
+    io.c->wait_for_safe();
+    int ret = io.c->get_return_value();
+    io.c->release();
+
+    if (ret == -ENOENT) {
+      ret = 0;
+    }
+
+    if (io.type == IO::IndexIO) {
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "WARNING: gc cleanup of tags on gc shard index=" <<
+          io.index << " returned error, ret=" << ret << dendl;
+      }
+      goto done;
+    }
+
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: gc could not remove oid=" << io.oid <<
+        ", ret=" << ret << dendl;
+      goto done;
+    }
+
+    schedule_tag_removal(io.index, io.tag);
+
+  done:
+    ios.pop_front();
+  }
+
+  /* This is a request to schedule a tag removal. It will be called once when
+   * there are no shadow objects. But it will also be called for every shadow
+   * object when there are any. 
Since we do not want the tag to be removed + * until all shadow objects have been successfully removed, the scheduling + * will not happen until the shadow object count goes down to zero + */ + void schedule_tag_removal(int index, string tag) { + auto& ts = tag_io_size[index]; + auto ts_it = ts.find(tag); + if (ts_it != ts.end()) { + auto& size = ts_it->second; + --size; + // wait all shadow obj delete return + if (size != 0) + return; + + ts.erase(ts_it); + } + + auto& rt = remove_tags[index]; + + rt.push_back(tag); + if (rt.size() >= (size_t)cct->_conf->rgw_gc_max_trim_chunk) { + flush_remove_tags(index, rt); + } + } + + void add_tag_io_size(int index, string tag, size_t size) { + auto& ts = tag_io_size[index]; + ts.emplace(tag, size); + } + + void drain_ios() { + while (!ios.empty()) { + if (gc->going_down()) { + return; + } + handle_next_completion(); + } + } + + void drain() { + drain_ios(); + flush_remove_tags(); + /* the tags draining might have generated more ios, drain those too */ + drain_ios(); + } + + void flush_remove_tags(int index, vector& rt) { + IO index_io; + index_io.type = IO::IndexIO; + index_io.index = index; + + ldpp_dout(dpp, 20) << __func__ << + " removing entries from gc log shard index=" << index << ", size=" << + rt.size() << ", entries=" << rt << dendl; + + auto rt_guard = make_scope_guard( + [&] + { + rt.clear(); + } + ); + + int ret = gc->remove(index, rt, &index_io.c); + if (ret < 0) { + /* we already cleared list of tags, this prevents us from + * ballooning in case of a persistent problem + */ + ldpp_dout(dpp, 0) << "WARNING: failed to remove tags on gc shard index=" << + index << " ret=" << ret << dendl; + return; + } + if (perfcounter) { + /* log the count of tags retired for rate estimation */ + perfcounter->inc(l_rgw_gc_retire, rt.size()); + } + ios.push_back(index_io); + } + + void flush_remove_tags() { + int index = 0; + for (auto& rt : remove_tags) { + flush_remove_tags(index, rt); + ++index; + } + } +}; // class 
RGWGCIOManger + +int RGWGC::process(int index, int max_secs, bool expired_only, + RGWGCIOManager& io_manager) +{ + ldpp_dout(this, 20) << "RGWGC::process entered with GC index_shard=" << + index << ", max_secs=" << max_secs << ", expired_only=" << + expired_only << dendl; + + rados::cls::lock::Lock l(gc_index_lock_name); + utime_t end = ceph_clock_now(); + + /* max_secs should be greater than zero. We don't want a zero max_secs + * to be translated as no timeout, since we'd then need to break the + * lock and that would require a manual intervention. In this case + * we can just wait it out. */ + if (max_secs <= 0) + return -EAGAIN; + + end += max_secs; + utime_t time(max_secs, 0); + l.set_duration(time); + + int ret = l.lock_exclusive(&store->gc_pool_ctx, obj_names[index]); + if (ret == -EBUSY) { /* already locked by another gc processor */ + ldpp_dout(this, 10) << "RGWGC::process failed to acquire lock on " << + obj_names[index] << dendl; + return 0; + } + if (ret < 0) + return ret; + + string marker; + string next_marker; + bool truncated; + IoCtx *ctx = new IoCtx; + do { + int max = 100; + std::list entries; + + ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, max, + expired_only, entries, &truncated, next_marker); + ldpp_dout(this, 20) << + "RGWGC::process cls_rgw_gc_list returned with returned:" << ret << + ", entries.size=" << entries.size() << ", truncated=" << truncated << + ", next_marker='" << next_marker << "'" << dendl; + + if (ret == -ENOENT) { + ret = 0; + goto done; + } + if (ret < 0) + goto done; + + marker = next_marker; + + string last_pool; + std::list::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + cls_rgw_gc_obj_info& info = *iter; + + ldpp_dout(this, 20) << "RGWGC::process iterating over entry tag='" << + info.tag << "', time=" << info.time << ", chain.objs.size()=" << + info.chain.objs.size() << dendl; + + std::list::iterator liter; + cls_rgw_obj_chain& chain = info.chain; + + utime_t now 
= ceph_clock_now(); + if (now >= end) { + goto done; + } + + if (chain.objs.empty()) { + io_manager.schedule_tag_removal(index, info.tag); + } else { + io_manager.add_tag_io_size(index, info.tag, chain.objs.size()); + for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) { + cls_rgw_obj& obj = *liter; + + if (obj.pool != last_pool) { + delete ctx; + ctx = new IoCtx; + ret = rgw_init_ioctx(store->get_rados_handle(), obj.pool, *ctx); + if (ret < 0) { + last_pool = ""; + ldpp_dout(this, 0) << "ERROR: failed to create ioctx pool=" << + obj.pool << dendl; + continue; + } + last_pool = obj.pool; + } + + ctx->locator_set_key(obj.loc); + + const string& oid = obj.key.name; /* just stored raw oid there */ + + ldpp_dout(this, 5) << "RGWGC::process removing " << obj.pool << + ":" << obj.key.name << dendl; + ObjectWriteOperation op; + cls_refcount_put(op, info.tag, true); + + ret = io_manager.schedule_io(ctx, oid, &op, index, info.tag); + if (ret < 0) { + ldpp_dout(this, 0) << + "WARNING: failed to schedule deletion for oid=" << oid << dendl; + } + if (going_down()) { + // leave early, even if tag isn't removed, it's ok since it + // will be picked up next time around + goto done; + } + } // chains loop + } // else -- chains not empty + } // entries loop + } while (truncated); + +done: + /* we don't drain here, because if we're going down we don't want to + * hold the system if backend is unresponsive + */ + l.unlock(&store->gc_pool_ctx, obj_names[index]); + delete ctx; + + return 0; +} + +int RGWGC::process(bool expired_only) +{ + int max_secs = cct->_conf->rgw_gc_processor_max_time; + + const int start = ceph::util::generate_random_number(0, max_objs - 1); + + RGWGCIOManager io_manager(this, store->ctx(), this); + + for (int i = 0; i < max_objs; i++) { + int index = (i + start) % max_objs; + int ret = process(index, max_secs, expired_only, io_manager); + if (ret < 0) + return ret; + } + if (!going_down()) { + io_manager.drain(); + } + + return 0; +} + +bool 
RGWGC::going_down() +{ + return down_flag; +} + +void RGWGC::start_processor() +{ + worker = new GCWorker(this, cct, this); + worker->create("rgw_gc"); +} + +void RGWGC::stop_processor() +{ + down_flag = true; + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = NULL; +} + +unsigned RGWGC::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWGC::gen_prefix(std::ostream& out) const +{ + return out << "garbage collection: "; +} + +void *RGWGC::GCWorker::entry() { + do { + utime_t start = ceph_clock_now(); + ldpp_dout(dpp, 2) << "garbage collection: start" << dendl; + int r = gc->process(true); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: garbage collection process() returned error r=" << r << dendl; + } + ldpp_dout(dpp, 2) << "garbage collection: stop" << dendl; + + if (gc->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + int secs = cct->_conf->rgw_gc_processor_period; + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + lock.Lock(); + cond.WaitInterval(lock, utime_t(secs, 0)); + lock.Unlock(); + } while (!gc->going_down()); + + return NULL; +} + +void RGWGC::GCWorker::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} diff --git a/src/rgw/rgw_gc.h b/src/rgw/rgw_gc.h new file mode 100644 index 00000000..f8f24e97 --- /dev/null +++ b/src/rgw/rgw_gc.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_GC_H +#define CEPH_RGW_GC_H + + +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" +#include "rgw_common.h" +#include "rgw_rados.h" +#include "cls/rgw/cls_rgw_types.h" + +#include + +class RGWGCIOManager; + +class RGWGC : public DoutPrefixProvider { + CephContext *cct; + RGWRados *store; + int max_objs; + string *obj_names; + std::atomic down_flag = { false }; + + int tag_index(const 
string& tag); + + class GCWorker : public Thread { + const DoutPrefixProvider *dpp; + CephContext *cct; + RGWGC *gc; + Mutex lock; + Cond cond; + + public: + GCWorker(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp), cct(_cct), gc(_gc), lock("GCWorker") {} + void *entry() override; + void stop(); + }; + + GCWorker *worker; +public: + RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {} + ~RGWGC() { + stop_processor(); + finalize(); + } + + void add_chain(librados::ObjectWriteOperation& op, cls_rgw_obj_chain& chain, const string& tag); + int send_chain(cls_rgw_obj_chain& chain, const string& tag, bool sync); + int defer_chain(const string& tag, bool sync); + int remove(int index, const std::vector& tags, librados::AioCompletion **pc); + + void initialize(CephContext *_cct, RGWRados *_store); + void finalize(); + + int list(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated); + void list_init(int *index) { *index = 0; } + int process(int index, int process_max_secs, bool expired_only, + RGWGCIOManager& io_manager); + int process(bool expired_only); + + bool going_down(); + void start_processor(); + void stop_processor(); + + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const; + + std::ostream& gen_prefix(std::ostream& out) const; + +}; + + +#endif diff --git a/src/rgw/rgw_http_client.cc b/src/rgw/rgw_http_client.cc new file mode 100644 index 00000000..18f7a4ad --- /dev/null +++ b/src/rgw/rgw_http_client.cc @@ -0,0 +1,1255 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "common/errno.h" + +#include + +#include +#include +#include + +#include "rgw_common.h" +#include "rgw_http_client.h" +#include "rgw_http_errors.h" +#include "common/async/completion.h" +#include "common/RefCountedObj.h" + +#include "rgw_coroutine.h" + +#include + 
+#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +RGWHTTPManager *rgw_http_manager; + +struct RGWCurlHandle; + +static void do_curl_easy_cleanup(RGWCurlHandle *curl_handle); + +struct rgw_http_req_data : public RefCountedObject { + RGWCurlHandle *curl_handle{nullptr}; + curl_slist *h{nullptr}; + uint64_t id; + int ret{0}; + std::atomic done = { false }; + RGWHTTPClient *client{nullptr}; + rgw_io_id control_io_id; + void *user_info{nullptr}; + bool registered{false}; + RGWHTTPManager *mgr{nullptr}; + char error_buf[CURL_ERROR_SIZE]; + bool write_paused{false}; + bool read_paused{false}; + + Mutex lock; + Cond cond; + + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + std::unique_ptr completion; + + rgw_http_req_data() : id(-1), lock("rgw_http_req_data::lock") { + // FIPS zeroization audit 20191115: this memset is not security related. + memset(error_buf, 0, sizeof(error_buf)); + } + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + int wait(optional_yield y) { + if (done) { + return ret; + } +#ifdef HAVE_BOOST_CONTEXT + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + boost::system::error_code ec; + async_wait(context, yield[ec]); + return -ec.value(); + } +#endif + Mutex::Locker l(lock); + cond.Wait(lock); + return ret; + } + + void set_state(int bitmask); + + void finish(int r, long http_status = -1) { + Mutex::Locker l(lock); + if (http_status != -1) { + if (client) { + client->set_http_status(http_status); + } + } + ret = r; + if (curl_handle) + do_curl_easy_cleanup(curl_handle); + + if (h) + curl_slist_free_all(h); + + curl_handle = NULL; + h = NULL; + done = true; + if (completion) { + 
boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.Signal(); + } + } + + bool is_done() { + return done; + } + + int get_retcode() { + Mutex::Locker l(lock); + return ret; + } + + RGWHTTPManager *get_manager() { + Mutex::Locker l(lock); + return mgr; + } + + CURL *get_easy_handle() const; +}; + +struct RGWCurlHandle { + int uses; + mono_time lastuse; + CURL* h; + + explicit RGWCurlHandle(CURL* h) : uses(0), h(h) {}; + CURL* operator*() { + return this->h; + } +}; + +void rgw_http_req_data::set_state(int bitmask) { + /* no need to lock here, moreover curl_easy_pause() might trigger + * the data receive callback :/ + */ + CURLcode rc = curl_easy_pause(**curl_handle, bitmask); + if (rc != CURLE_OK) { + dout(0) << "ERROR: curl_easy_pause() returned rc=" << rc << dendl; + } +} + +#define MAXIDLE 5 +class RGWCurlHandles : public Thread { +public: + Mutex cleaner_lock; + std::vectorsaved_curl; + int cleaner_shutdown; + Cond cleaner_cond; + + RGWCurlHandles() : + cleaner_lock{"RGWCurlHandles::cleaner_lock"}, + cleaner_shutdown{0} { + } + + RGWCurlHandle* get_curl_handle(); + void release_curl_handle_now(RGWCurlHandle* curl); + void release_curl_handle(RGWCurlHandle* curl); + void flush_curl_handles(); + void* entry(); + void stop(); +}; + +RGWCurlHandle* RGWCurlHandles::get_curl_handle() { + RGWCurlHandle* curl = 0; + CURL* h; + { + Mutex::Locker lock(cleaner_lock); + if (!saved_curl.empty()) { + curl = *saved_curl.begin(); + saved_curl.erase(saved_curl.begin()); + } + } + if (curl) { + } else if ((h = curl_easy_init())) { + curl = new RGWCurlHandle{h}; + } else { + // curl = 0; + } + return curl; +} + +void RGWCurlHandles::release_curl_handle_now(RGWCurlHandle* curl) +{ + curl_easy_cleanup(**curl); + delete curl; +} + +void RGWCurlHandles::release_curl_handle(RGWCurlHandle* curl) +{ + if (cleaner_shutdown) { + release_curl_handle_now(curl); + } else { + curl_easy_reset(**curl); + 
Mutex::Locker lock(cleaner_lock); + curl->lastuse = mono_clock::now(); + saved_curl.insert(saved_curl.begin(), 1, curl); + } +} + +void* RGWCurlHandles::entry() +{ + RGWCurlHandle* curl; + Mutex::Locker lock(cleaner_lock); + + for (;;) { + if (cleaner_shutdown) { + if (saved_curl.empty()) + break; + } else { + utime_t release = ceph_clock_now() + utime_t(MAXIDLE,0); + cleaner_cond.WaitUntil(cleaner_lock, release); + } + mono_time now = mono_clock::now(); + while (!saved_curl.empty()) { + auto cend = saved_curl.end(); + --cend; + curl = *cend; + if (!cleaner_shutdown && now - curl->lastuse < std::chrono::seconds(MAXIDLE)) + break; + saved_curl.erase(cend); + release_curl_handle_now(curl); + } + } + return nullptr; +} + +void RGWCurlHandles::stop() +{ + Mutex::Locker lock(cleaner_lock); + cleaner_shutdown = 1; + cleaner_cond.Signal(); +} + +void RGWCurlHandles::flush_curl_handles() +{ + stop(); + join(); + if (!saved_curl.empty()) { + dout(0) << "ERROR: " << __func__ << " failed final cleanup" << dendl; + } + saved_curl.shrink_to_fit(); +} + +CURL *rgw_http_req_data::get_easy_handle() const +{ + return **curl_handle; +} + +static RGWCurlHandles *handles; + +static RGWCurlHandle *do_curl_easy_init() +{ + return handles->get_curl_handle(); +} + +static void do_curl_easy_cleanup(RGWCurlHandle *curl_handle) +{ + handles->release_curl_handle(curl_handle); +} + +// XXX make this part of the token cache? (but that's swift-only; +// and this especially needs to integrates with s3...) + +void rgw_setup_saved_curl_handles() +{ + handles = new RGWCurlHandles(); + handles->create("rgw_curl"); +} + +void rgw_release_all_curl_handles() +{ + handles->flush_curl_handles(); + delete handles; +} + +void RGWIOProvider::assign_io(RGWIOIDProvider& io_id_provider, int io_type) +{ + if (id == 0) { + id = io_id_provider.get_next(); + } +} + +/* + * the following set of callbacks will be called either on RGWHTTPManager::process(), + * or via the RGWHTTPManager async processing. 
+ */ +size_t RGWHTTPClient::receive_http_header(void * const ptr, + const size_t size, + const size_t nmemb, + void * const _info) +{ + rgw_http_req_data *req_data = static_cast(_info); + size_t len = size * nmemb; + + Mutex::Locker l(req_data->lock); + + if (!req_data->registered) { + return len; + } + + int ret = req_data->client->receive_header(ptr, size * nmemb); + if (ret < 0) { + dout(0) << "WARNING: client->receive_header() returned ret=" << ret << dendl; + } + + return len; +} + +size_t RGWHTTPClient::receive_http_data(void * const ptr, + const size_t size, + const size_t nmemb, + void * const _info) +{ + rgw_http_req_data *req_data = static_cast(_info); + size_t len = size * nmemb; + + bool pause = false; + + RGWHTTPClient *client; + + { + Mutex::Locker l(req_data->lock); + if (!req_data->registered) { + return len; + } + + client = req_data->client; + } + + size_t& skip_bytes = client->receive_pause_skip; + + if (skip_bytes >= len) { + skip_bytes -= len; + return len; + } + + int ret = client->receive_data((char *)ptr + skip_bytes, len - skip_bytes, &pause); + if (ret < 0) { + dout(0) << "WARNING: client->receive_data() returned ret=" << ret << dendl; + } + + if (pause) { + dout(20) << "RGWHTTPClient::receive_http_data(): pause" << dendl; + skip_bytes = len; + Mutex::Locker l(req_data->lock); + req_data->read_paused = true; + return CURL_WRITEFUNC_PAUSE; + } + + skip_bytes = 0; + + return len; +} + +size_t RGWHTTPClient::send_http_data(void * const ptr, + const size_t size, + const size_t nmemb, + void * const _info) +{ + rgw_http_req_data *req_data = static_cast(_info); + + RGWHTTPClient *client; + + { + Mutex::Locker l(req_data->lock); + + if (!req_data->registered) { + return 0; + } + + client = req_data->client; + } + + bool pause = false; + + int ret = client->send_data(ptr, size * nmemb, &pause); + if (ret < 0) { + dout(0) << "WARNING: client->receive_data() returned ret=" << ret << dendl; + } + + if (ret == 0 && + pause) { + Mutex::Locker 
l(req_data->lock); + req_data->write_paused = true; + return CURL_READFUNC_PAUSE; + } + + return ret; +} + +Mutex& RGWHTTPClient::get_req_lock() +{ + return req_data->lock; +} + +void RGWHTTPClient::_set_write_paused(bool pause) +{ + ceph_assert(req_data->lock.is_locked()); + + RGWHTTPManager *mgr = req_data->mgr; + if (pause == req_data->write_paused) { + return; + } + if (pause) { + mgr->set_request_state(this, SET_WRITE_PAUSED); + } else { + mgr->set_request_state(this, SET_WRITE_RESUME); + } +} + +void RGWHTTPClient::_set_read_paused(bool pause) +{ + ceph_assert(req_data->lock.is_locked()); + + RGWHTTPManager *mgr = req_data->mgr; + if (pause == req_data->read_paused) { + return; + } + if (pause) { + mgr->set_request_state(this, SET_READ_PAUSED); + } else { + mgr->set_request_state(this, SET_READ_RESUME); + } +} + +static curl_slist *headers_to_slist(param_vec_t& headers) +{ + curl_slist *h = NULL; + + param_vec_t::iterator iter; + for (iter = headers.begin(); iter != headers.end(); ++iter) { + pair& p = *iter; + string val = p.first; + + if (strncmp(val.c_str(), "HTTP_", 5) == 0) { + val = val.substr(5); + } + + /* we need to convert all underscores into dashes as some web servers forbid them + * in the http header field names + */ + for (size_t i = 0; i < val.size(); i++) { + if (val[i] == '_') { + val[i] = '-'; + } + } + + val = camelcase_dash_http_attr(val); + + // curl won't send headers with empty values unless it ends with a ; instead + if (p.second.empty()) { + val.append(1, ';'); + } else { + val.append(": "); + val.append(p.second); + } + h = curl_slist_append(h, val.c_str()); + } + + return h; +} + +static bool is_upload_request(const string& method) +{ + return method == "POST" || method == "PUT"; +} + +/* + * process a single simple one off request + */ +int RGWHTTPClient::process(optional_yield y) +{ + return RGWHTTP::process(this, y); +} + +string RGWHTTPClient::to_str() +{ + string method_str = (method.empty() ? 
"" : method); + string url_str = (url.empty() ? "" : url); + return method_str + " " + url_str; +} + +int RGWHTTPClient::get_req_retcode() +{ + if (!req_data) { + return -EINVAL; + } + + return req_data->get_retcode(); +} + +/* + * init request, will be used later with RGWHTTPManager + */ +int RGWHTTPClient::init_request(rgw_http_req_data *_req_data) +{ + ceph_assert(!req_data); + _req_data->get(); + req_data = _req_data; + + req_data->curl_handle = do_curl_easy_init(); + + CURL *easy_handle = req_data->get_easy_handle(); + + dout(20) << "sending request to " << url << dendl; + + curl_slist *h = headers_to_slist(headers); + + req_data->h = h; + + curl_easy_setopt(easy_handle, CURLOPT_CUSTOMREQUEST, method.c_str()); + curl_easy_setopt(easy_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(easy_handle, CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(easy_handle, CURLOPT_NOSIGNAL, 1L); + curl_easy_setopt(easy_handle, CURLOPT_HEADERFUNCTION, receive_http_header); + curl_easy_setopt(easy_handle, CURLOPT_WRITEHEADER, (void *)req_data); + curl_easy_setopt(easy_handle, CURLOPT_WRITEFUNCTION, receive_http_data); + curl_easy_setopt(easy_handle, CURLOPT_WRITEDATA, (void *)req_data); + curl_easy_setopt(easy_handle, CURLOPT_ERRORBUFFER, (void *)req_data->error_buf); + curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_TIME, cct->_conf->rgw_curl_low_speed_time); + curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_LIMIT, cct->_conf->rgw_curl_low_speed_limit); + if (h) { + curl_easy_setopt(easy_handle, CURLOPT_HTTPHEADER, (void *)h); + } + curl_easy_setopt(easy_handle, CURLOPT_READFUNCTION, send_http_data); + curl_easy_setopt(easy_handle, CURLOPT_READDATA, (void *)req_data); + if (send_data_hint || is_upload_request(method)) { + curl_easy_setopt(easy_handle, CURLOPT_UPLOAD, 1L); + } + if (has_send_len) { + curl_easy_setopt(easy_handle, CURLOPT_INFILESIZE, (void *)send_len); + } + if (!verify_ssl) { + curl_easy_setopt(easy_handle, CURLOPT_SSL_VERIFYPEER, 0L); + 
curl_easy_setopt(easy_handle, CURLOPT_SSL_VERIFYHOST, 0L); + dout(20) << "ssl verification is set to off" << dendl; + } + curl_easy_setopt(easy_handle, CURLOPT_PRIVATE, (void *)req_data); + + return 0; +} + +bool RGWHTTPClient::is_done() +{ + return req_data->is_done(); +} + +/* + * wait for async request to complete + */ +int RGWHTTPClient::wait(optional_yield y) +{ + return req_data->wait(y); +} + +void RGWHTTPClient::cancel() +{ + if (req_data) { + RGWHTTPManager *http_manager = req_data->mgr; + if (http_manager) { + http_manager->remove_request(this); + } + } +} + +RGWHTTPClient::~RGWHTTPClient() +{ + cancel(); + if (req_data) { + req_data->put(); + } +} + + +int RGWHTTPHeadersCollector::receive_header(void * const ptr, const size_t len) +{ + const boost::string_ref header_line(static_cast(ptr), len); + + /* We're tokening the line that way due to backward compatibility. */ + const size_t sep_loc = header_line.find_first_of(" \t:"); + + if (boost::string_ref::npos == sep_loc) { + /* Wrongly formatted header? Just skip it. */ + return 0; + } + + header_name_t name(header_line.substr(0, sep_loc)); + if (0 == relevant_headers.count(name)) { + /* Not interested in this particular header. */ + return 0; + } + + const auto value_part = header_line.substr(sep_loc + 1); + + /* Skip spaces and tabs after the separator. */ + const size_t val_loc_s = value_part.find_first_not_of(' '); + const size_t val_loc_e = value_part.find_first_of("\r\n"); + + if (boost::string_ref::npos == val_loc_s || + boost::string_ref::npos == val_loc_e) { + /* Empty value case. 
/*
 * Drain pending wakeup bytes from the (non-blocking) signal fd.
 *
 * Returns 0 when data was read or when there was nothing to read, otherwise
 * -errno from read().
 */
static int clear_signal(int fd)
{
  // since we're in non-blocking mode, we can try to read a lot more than
  // one signal from signal_thread() to avoid later wakeups. non-blocking reads
  // are also required to support the curl_multi_wait bug workaround
  // NOTE(review): buffer element type/size reconstructed -- confirm upstream
  std::array<char, 256> buf;
  ssize_t ret;
  do {
    ret = ::read(fd, static_cast<void *>(buf.data()), buf.size());
  } while (ret < 0 && errno == EINTR); // an interrupted read is not a failure
  if (ret < 0) {
    const int err = errno;
    if (err == EAGAIN || err == EWOULDBLOCK) {
      return 0; // nothing pending
    }
    return -err;
  }
  return 0;
}
if it + // doesn't, the bug is present and we can't rely on revents + if (wait_fd.revents == 0) { + curl_multi_wait_bug_present = true; + ldout(cct, 0) << "WARNING: detected a version of libcurl which contains a " + "bug in curl_multi_wait(). enabling a workaround that may degrade " + "performance slightly." << dendl; + } + + return clear_signal(read_fd); +} + +static bool is_signaled(const curl_waitfd& wait_fd) +{ + if (wait_fd.fd < 0) { + // no fd to signal + return false; + } + + if (curl_multi_wait_bug_present) { + // we can't rely on revents, so we always return true if a wait_fd is given. + // this means we'll be trying a non-blocking read on this fd every time that + // curl_multi_wait() wakes up + return true; + } + + return wait_fd.revents > 0; +} + +static int do_curl_wait(CephContext *cct, CURLM *handle, int signal_fd) +{ + int num_fds; + struct curl_waitfd wait_fd; + + wait_fd.fd = signal_fd; + wait_fd.events = CURL_WAIT_POLLIN; + wait_fd.revents = 0; + + int ret = curl_multi_wait(handle, &wait_fd, 1, cct->_conf->rgw_curl_wait_timeout_ms, &num_fds); + if (ret) { + ldout(cct, 0) << "ERROR: curl_multi_wait() returned " << ret << dendl; + return -EIO; + } + + if (is_signaled(wait_fd)) { + ret = clear_signal(signal_fd); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): read() returned " << ret << dendl; + return ret; + } + } + return 0; +} + +#else + +static int do_curl_wait(CephContext *cct, CURLM *handle, int signal_fd) +{ + fd_set fdread; + fd_set fdwrite; + fd_set fdexcep; + int maxfd = -1; + + FD_ZERO(&fdread); + FD_ZERO(&fdwrite); + FD_ZERO(&fdexcep); + + /* get file descriptors from the transfers */ + int ret = curl_multi_fdset(handle, &fdread, &fdwrite, &fdexcep, &maxfd); + if (ret) { + ldout(cct, 0) << "ERROR: curl_multi_fdset returned " << ret << dendl; + return -EIO; + } + + if (signal_fd > 0) { + FD_SET(signal_fd, &fdread); + if (signal_fd >= maxfd) { + maxfd = signal_fd + 1; + } + } + + /* forcing a strict timeout, as the 
returned fdsets might not reference all fds we wait on */ + uint64_t to = cct->_conf->rgw_curl_wait_timeout_ms; +#define RGW_CURL_TIMEOUT 1000 + if (!to) + to = RGW_CURL_TIMEOUT; + struct timeval timeout; + timeout.tv_sec = to / 1000; + timeout.tv_usec = to % 1000; + + ret = select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout); + if (ret < 0) { + ret = -errno; + ldout(cct, 0) << "ERROR: select returned " << ret << dendl; + return ret; + } + + if (signal_fd > 0 && FD_ISSET(signal_fd, &fdread)) { + ret = clear_signal(signal_fd); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): read() returned " << ret << dendl; + return ret; + } + } + + return 0; +} + +#endif + +void *RGWHTTPManager::ReqsThread::entry() +{ + manager->reqs_thread_entry(); + return NULL; +} + +/* + * RGWHTTPManager has two modes of operation: threaded and non-threaded. + */ +RGWHTTPManager::RGWHTTPManager(CephContext *_cct, RGWCompletionManager *_cm) : cct(_cct), + completion_mgr(_cm), is_started(false), + reqs_lock("RGWHTTPManager::reqs_lock"), num_reqs(0), max_threaded_req(0), + reqs_thread(NULL) +{ + multi_handle = (void *)curl_multi_init(); + thread_pipe[0] = -1; + thread_pipe[1] = -1; +} + +RGWHTTPManager::~RGWHTTPManager() { + stop(); + if (multi_handle) + curl_multi_cleanup((CURLM *)multi_handle); +} + +void RGWHTTPManager::register_request(rgw_http_req_data *req_data) +{ + RWLock::WLocker rl(reqs_lock); + req_data->id = num_reqs; + req_data->registered = true; + reqs[num_reqs] = req_data; + num_reqs++; + ldout(cct, 20) << __func__ << " mgr=" << this << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl; +} + +bool RGWHTTPManager::unregister_request(rgw_http_req_data *req_data) +{ + RWLock::WLocker rl(reqs_lock); + if (!req_data->registered) { + return false; + } + req_data->get(); + req_data->registered = false; + unregistered_reqs.push_back(req_data); + ldout(cct, 20) << __func__ << " mgr=" << this << " req_data->id=" << req_data->id << ", 
curl_handle=" << req_data->curl_handle << dendl; + return true; +} + +void RGWHTTPManager::complete_request(rgw_http_req_data *req_data) +{ + RWLock::WLocker rl(reqs_lock); + _complete_request(req_data); +} + +void RGWHTTPManager::_complete_request(rgw_http_req_data *req_data) +{ + map::iterator iter = reqs.find(req_data->id); + if (iter != reqs.end()) { + reqs.erase(iter); + } + { + Mutex::Locker l(req_data->lock); + req_data->mgr = nullptr; + } + if (completion_mgr) { + completion_mgr->complete(NULL, req_data->control_io_id, req_data->user_info); + } + + req_data->put(); +} + +void RGWHTTPManager::finish_request(rgw_http_req_data *req_data, int ret, long http_status) +{ + req_data->finish(ret, http_status); + complete_request(req_data); +} + +void RGWHTTPManager::_finish_request(rgw_http_req_data *req_data, int ret) +{ + req_data->finish(ret); + _complete_request(req_data); +} + +void RGWHTTPManager::_set_req_state(set_state& ss) +{ + ss.req->set_state(ss.bitmask); +} +/* + * hook request to the curl multi handle + */ +int RGWHTTPManager::link_request(rgw_http_req_data *req_data) +{ + ldout(cct, 20) << __func__ << " req_data=" << req_data << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl; + CURLMcode mstatus = curl_multi_add_handle((CURLM *)multi_handle, req_data->get_easy_handle()); + if (mstatus) { + dout(0) << "ERROR: failed on curl_multi_add_handle, status=" << mstatus << dendl; + return -EIO; + } + return 0; +} + +/* + * unhook request from the curl multi handle, and finish request if it wasn't finished yet as + * there will be no more processing on this request + */ +void RGWHTTPManager::_unlink_request(rgw_http_req_data *req_data) +{ + if (req_data->curl_handle) { + curl_multi_remove_handle((CURLM *)multi_handle, req_data->get_easy_handle()); + } + if (!req_data->is_done()) { + _finish_request(req_data, -ECANCELED); + } +} + +void RGWHTTPManager::unlink_request(rgw_http_req_data *req_data) +{ + RWLock::WLocker 
wl(reqs_lock); + _unlink_request(req_data); +} + +void RGWHTTPManager::manage_pending_requests() +{ + reqs_lock.get_read(); + if (max_threaded_req == num_reqs && + unregistered_reqs.empty() && + reqs_change_state.empty()) { + reqs_lock.unlock(); + return; + } + reqs_lock.unlock(); + + RWLock::WLocker wl(reqs_lock); + + if (!reqs_change_state.empty()) { + for (auto siter : reqs_change_state) { + _set_req_state(siter); + } + reqs_change_state.clear(); + } + + if (!unregistered_reqs.empty()) { + for (auto& r : unregistered_reqs) { + _unlink_request(r); + r->put(); + } + + unregistered_reqs.clear(); + } + + map::iterator iter = reqs.find(max_threaded_req); + + list > remove_reqs; + + for (; iter != reqs.end(); ++iter) { + rgw_http_req_data *req_data = iter->second; + int r = link_request(req_data); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to link http request" << dendl; + remove_reqs.push_back(std::make_pair(iter->second, r)); + } else { + max_threaded_req = iter->first + 1; + } + } + + for (auto piter : remove_reqs) { + rgw_http_req_data *req_data = piter.first; + int r = piter.second; + + _finish_request(req_data, r); + } +} + +int RGWHTTPManager::add_request(RGWHTTPClient *client) +{ + rgw_http_req_data *req_data = new rgw_http_req_data; + + int ret = client->init_request(req_data); + if (ret < 0) { + req_data->put(); + req_data = NULL; + return ret; + } + + req_data->mgr = this; + req_data->client = client; + req_data->control_io_id = client->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_CONTROL); + req_data->user_info = client->get_io_user_info(); + + register_request(req_data); + + if (!is_started) { + ret = link_request(req_data); + if (ret < 0) { + req_data->put(); + req_data = NULL; + } + return ret; + } + ret = signal_thread(); + if (ret < 0) { + finish_request(req_data, ret); + } + + return ret; +} + +int RGWHTTPManager::remove_request(RGWHTTPClient *client) +{ + rgw_http_req_data *req_data = client->get_req_data(); + + if (!is_started) { + 
unlink_request(req_data); + return 0; + } + if (!unregister_request(req_data)) { + return 0; + } + int ret = signal_thread(); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWHTTPManager::set_request_state(RGWHTTPClient *client, RGWHTTPRequestSetState state) +{ + rgw_http_req_data *req_data = client->get_req_data(); + + ceph_assert(req_data->lock.is_locked()); + + /* can only do that if threaded */ + if (!is_started) { + return -EINVAL; + } + + bool suggested_wr_paused = req_data->write_paused; + bool suggested_rd_paused = req_data->read_paused; + + switch (state) { + case SET_WRITE_PAUSED: + suggested_wr_paused = true; + break; + case SET_WRITE_RESUME: + suggested_wr_paused = false; + break; + case SET_READ_PAUSED: + suggested_rd_paused = true; + break; + case SET_READ_RESUME: + suggested_rd_paused = false; + break; + default: + /* shouldn't really be here */ + return -EIO; + } + if (suggested_wr_paused == req_data->write_paused && + suggested_rd_paused == req_data->read_paused) { + return 0; + } + + req_data->write_paused = suggested_wr_paused; + req_data->read_paused = suggested_rd_paused; + + int bitmask = CURLPAUSE_CONT; + + if (req_data->write_paused) { + bitmask |= CURLPAUSE_SEND; + } + + if (req_data->read_paused) { + bitmask |= CURLPAUSE_RECV; + } + + reqs_change_state.push_back(set_state(req_data, bitmask)); + int ret = signal_thread(); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWHTTPManager::start() +{ + if (pipe_cloexec(thread_pipe) < 0) { + int e = errno; + ldout(cct, 0) << "ERROR: pipe(): " << cpp_strerror(e) << dendl; + return -e; + } + + // enable non-blocking reads + if (::fcntl(thread_pipe[0], F_SETFL, O_NONBLOCK) < 0) { + int e = errno; + ldout(cct, 0) << "ERROR: fcntl(): " << cpp_strerror(e) << dendl; + TEMP_FAILURE_RETRY(::close(thread_pipe[0])); + TEMP_FAILURE_RETRY(::close(thread_pipe[1])); + return -e; + } + +#ifdef HAVE_CURL_MULTI_WAIT + // on first initialization, use this pipe to detect whether we're using a 
+ // buggy version of libcurl + std::call_once(detect_flag, detect_curl_multi_wait_bug, cct, + static_cast(multi_handle), + thread_pipe[1], thread_pipe[0]); +#endif + + is_started = true; + reqs_thread = new ReqsThread(this); + reqs_thread->create("http_manager"); + return 0; +} + +void RGWHTTPManager::stop() +{ + if (is_stopped) { + return; + } + + is_stopped = true; + + if (is_started) { + going_down = true; + signal_thread(); + reqs_thread->join(); + delete reqs_thread; + TEMP_FAILURE_RETRY(::close(thread_pipe[1])); + TEMP_FAILURE_RETRY(::close(thread_pipe[0])); + } +} + +int RGWHTTPManager::signal_thread() +{ + uint32_t buf = 0; + int ret = write(thread_pipe[1], (void *)&buf, sizeof(buf)); + if (ret < 0) { + ret = -errno; + ldout(cct, 0) << "ERROR: " << __func__ << ": write() returned ret=" << ret << dendl; + return ret; + } + return 0; +} + +void *RGWHTTPManager::reqs_thread_entry() +{ + int still_running; + int mstatus; + + ldout(cct, 20) << __func__ << ": start" << dendl; + + while (!going_down) { + int ret = do_curl_wait(cct, (CURLM *)multi_handle, thread_pipe[0]); + if (ret < 0) { + dout(0) << "ERROR: do_curl_wait() returned: " << ret << dendl; + return NULL; + } + + manage_pending_requests(); + + mstatus = curl_multi_perform((CURLM *)multi_handle, &still_running); + switch (mstatus) { + case CURLM_OK: + case CURLM_CALL_MULTI_PERFORM: + break; + default: + dout(10) << "curl_multi_perform returned: " << mstatus << dendl; + break; + } + int msgs_left; + CURLMsg *msg; + while ((msg = curl_multi_info_read((CURLM *)multi_handle, &msgs_left))) { + if (msg->msg == CURLMSG_DONE) { + int result = msg->data.result; + CURL *e = msg->easy_handle; + rgw_http_req_data *req_data; + curl_easy_getinfo(e, CURLINFO_PRIVATE, (void **)&req_data); + curl_multi_remove_handle((CURLM *)multi_handle, e); + + long http_status; + curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, (void **)&http_status); + + int status = rgw_http_error_to_errno(http_status); + if (result != CURLE_OK && 
status == 0) { + dout(0) << "ERROR: curl error: " << curl_easy_strerror((CURLcode)result) << ", maybe network unstable" << dendl; + status = -EAGAIN; + } + int id = req_data->id; + finish_request(req_data, status, http_status); + switch (result) { + case CURLE_OK: + break; + case CURLE_OPERATION_TIMEDOUT: + dout(0) << "WARNING: curl operation timed out, network average transfer speed less than " + << cct->_conf->rgw_curl_low_speed_limit << " Bytes per second during " << cct->_conf->rgw_curl_low_speed_time << " seconds." << dendl; + default: + dout(20) << "ERROR: msg->data.result=" << result << " req_data->id=" << id << " http_status=" << http_status << dendl; + dout(20) << "ERROR: curl error: " << curl_easy_strerror((CURLcode)result) << dendl; + break; + } + } + } + } + + + RWLock::WLocker rl(reqs_lock); + for (auto r : unregistered_reqs) { + _unlink_request(r); + } + + unregistered_reqs.clear(); + + auto all_reqs = std::move(reqs); + for (auto iter : all_reqs) { + _unlink_request(iter.second); + } + + reqs.clear(); + + if (completion_mgr) { + completion_mgr->go_down(); + } + + return 0; +} + +void rgw_http_client_init(CephContext *cct) +{ + curl_global_init(CURL_GLOBAL_ALL); + rgw_http_manager = new RGWHTTPManager(cct); + rgw_http_manager->start(); +} + +void rgw_http_client_cleanup() +{ + rgw_http_manager->stop(); + delete rgw_http_manager; + curl_global_cleanup(); +} + + +int RGWHTTP::send(RGWHTTPClient *req) { + if (!req) { + return 0; + } + int r = rgw_http_manager->add_request(req); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWHTTP::process(RGWHTTPClient *req, optional_yield y) { + if (!req) { + return 0; + } + int r = send(req); + if (r < 0) { + return r; + } + + return req->wait(y); +} + diff --git a/src/rgw/rgw_http_client.h b/src/rgw/rgw_http_client.h new file mode 100644 index 00000000..eabe8a85 --- /dev/null +++ b/src/rgw/rgw_http_client.h @@ -0,0 +1,370 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: 
/*
 * Identifies an I/O source: `id` names the provider, `channels` is a bitmask
 * of the channels of interest on that provider.
 */
struct rgw_io_id {
  int64_t id{0};
  int channels{0};

  rgw_io_id() {}
  rgw_io_id(int64_t _id, int _channels) : id(_id), channels(_channels) {}

  /*
   * True when both ids name the same provider and at least one of the two
   * channel masks is non-zero. Does not modify either operand, hence const.
   *
   * NOTE(review): the masks are combined with '|', not '&', so two ids with
   * disjoint channel sets still "intersect". That looks unintended for a
   * function with this name, but callers may depend on it -- confirm before
   * changing.
   */
  bool intersects(const rgw_io_id& rhs) const {
    return (id == rhs.id && ((channels | rhs.channels) != 0));
  }

  /* Lexicographic order on (id, channels). */
  bool operator<(const rgw_io_id& rhs) const {
    if (id < rhs.id) {
      return true;
    }
    return (id == rhs.id &&
            channels < rhs.channels);
  }
};
int init_request(rgw_http_req_data *req_data); + + virtual int receive_header(void *ptr, size_t len) { + return 0; + } + virtual int receive_data(void *ptr, size_t len, bool *pause) { + return 0; + } + + virtual int send_data(void *ptr, size_t len, bool *pause=nullptr) { + return 0; + } + + /* Callbacks for libcurl. */ + static size_t receive_http_header(void *ptr, + size_t size, + size_t nmemb, + void *_info); + + static size_t receive_http_data(void *ptr, + size_t size, + size_t nmemb, + void *_info); + + static size_t send_http_data(void *ptr, + size_t size, + size_t nmemb, + void *_info); + + Mutex& get_req_lock(); + + /* needs to be called under req_lock() */ + void _set_write_paused(bool pause); + void _set_read_paused(bool pause); +public: + static const long HTTP_STATUS_NOSTATUS = 0; + static const long HTTP_STATUS_UNAUTHORIZED = 401; + static const long HTTP_STATUS_NOTFOUND = 404; + + static constexpr int HTTPCLIENT_IO_READ = 0x1; + static constexpr int HTTPCLIENT_IO_WRITE = 0x2; + static constexpr int HTTPCLIENT_IO_CONTROL = 0x4; + + virtual ~RGWHTTPClient(); + explicit RGWHTTPClient(CephContext *cct, + const string& _method, + const string& _url) + : has_send_len(false), + http_status(HTTP_STATUS_NOSTATUS), + req_data(nullptr), + verify_ssl(cct->_conf->rgw_verify_ssl), + cct(cct), + method(_method), + url(_url) { + } + + void append_header(const string& name, const string& val) { + headers.push_back(pair(name, val)); + } + + void set_send_length(size_t len) { + send_len = len; + has_send_len = true; + } + + void set_send_data_hint(bool hint) { + send_data_hint = hint; + } + + long get_http_status() const { + return http_status; + } + + void set_http_status(long _http_status) { + http_status = _http_status; + } + + void set_verify_ssl(bool flag) { + verify_ssl = flag; + } + + int process(optional_yield y=null_yield); + + int wait(optional_yield y=null_yield); + void cancel(); + bool is_done(); + + rgw_http_req_data *get_req_data() { return req_data; } + + 
/*
 * HTTP client that records response headers.
 *
 * The set of header names to look for is fixed at construction time
 * (relevant_headers); what was found is kept in found_headers and exposed
 * through get_headers() / get_header_value(). The header callback itself,
 * receive_header(), is only declared here and is defined elsewhere.
 *
 * NOTE(review): the template argument lists of std::set / std::map are
 * missing from this copy of the file; restore them from the upstream header
 * before building.
 */
class RGWHTTPHeadersCollector : public RGWHTTPClient {
public:
  typedef std::string header_name_t;
  typedef std::string header_value_t;
  typedef std::set header_spec_t;

  /*
   * cct, method and url are forwarded to RGWHTTPClient; relevant_headers is
   * copied into this object.
   */
  RGWHTTPHeadersCollector(CephContext * const cct,
                          const string& method,
                          const string& url,
                          const header_spec_t &relevant_headers)
    : RGWHTTPClient(cct, method, url),
      relevant_headers(relevant_headers) {
  }

  /* Returns a copy of everything collected so far. */
  std::map get_headers() const {
    return found_headers;
  }

  /* Throws std::out_of_range */
  /* (raised by map::at() when `name` was not collected) */
  const header_value_t& get_header_value(const header_name_t& name) const {
    return found_headers.at(name);
  }

protected:
  /* Overrides the RGWHTTPClient header hook. */
  int receive_header(void *ptr, size_t len) override;

private:
  /* header names of interest; immutable after construction */
  const std::set relevant_headers;
  /* collected headers, served by the two accessors above */
  std::map found_headers;
};
/*
 * Owns a libcurl multi handle and, once start()ed, a worker thread that
 * drives all requests registered with it.
 *
 * Other threads talk to the worker by queueing work in the containers below
 * and then writing to thread_pipe via signal_thread(); the worker waits on
 * the read end of that pipe together with the curl handle.
 *
 * NOTE(review): the element types of the std::atomic / map / list members
 * are missing from this copy of the file (template arguments stripped);
 * restore them from the upstream header before building.
 */
class RGWHTTPManager {
  /* A queued pause/resume request: which request, and the CURLPAUSE_*
   * bitmask to apply to it (built in set_request_state()). */
  struct set_state {
    rgw_http_req_data *req;
    int bitmask;

    set_state(rgw_http_req_data *_req, int _bitmask) : req(_req), bitmask(_bitmask) {}
  };
  CephContext *cct;
  /* optional; when set, the worker calls go_down() on it as it exits */
  RGWCompletionManager *completion_mgr;
  /* the CURLM handle, stored type-erased and cast back at each use */
  void *multi_handle;
  /* set by start(); set_request_state() refuses to work without it */
  bool is_started;
  /* set by stop(); ends the worker's main loop */
  std::atomic going_down { 0 };
  /* makes stop() a no-op on repeated calls */
  std::atomic is_stopped { 0 };

  /* taken for write by the worker while it unlinks everything on exit */
  RWLock reqs_lock;
  map reqs;
  list unregistered_reqs;
  /* pending set_state entries, queued by set_request_state() */
  list reqs_change_state;
  map complete_reqs;
  int64_t num_reqs;
  int64_t max_threaded_req;
  /* [0] = read end (non-blocking, polled by the worker), [1] = write end */
  int thread_pipe[2];

  void register_request(rgw_http_req_data *req_data);
  void complete_request(rgw_http_req_data *req_data);
  void _complete_request(rgw_http_req_data *req_data);
  bool unregister_request(rgw_http_req_data *req_data);
  void _unlink_request(rgw_http_req_data *req_data);
  void unlink_request(rgw_http_req_data *req_data);
  void finish_request(rgw_http_req_data *req_data, int r, long http_status = -1);
  void _finish_request(rgw_http_req_data *req_data, int r);
  void _set_req_state(set_state& ss);
  int link_request(rgw_http_req_data *req_data);

  /* called by the worker once per loop iteration, before curl_multi_perform() */
  void manage_pending_requests();

  /* Thin Thread wrapper whose entry() is implemented outside this class. */
  class ReqsThread : public Thread {
    RGWHTTPManager *manager;

  public:
    explicit ReqsThread(RGWHTTPManager *_m) : manager(_m) {}
    void *entry() override;
  };

  /* allocated in start(), joined and deleted in stop() */
  ReqsThread *reqs_thread;

  /* the worker's main loop */
  void *reqs_thread_entry();

  /* wakes the worker by writing to thread_pipe[1]; returns 0 or -errno */
  int signal_thread();

public:
  RGWHTTPManager(CephContext *_cct, RGWCompletionManager *completion_mgr = NULL);
  ~RGWHTTPManager();

  /* Creates the wake-up pipe and launches the "http_manager" thread.
   * Returns 0, or -errno if the pipe could not be created or configured. */
  int start();
  /* Signals the worker, joins it and closes the pipe. Safe to call twice. */
  void stop();

  int add_request(RGWHTTPClient *client);
  int remove_request(RGWHTTPClient *client);
  /* Pauses/resumes one direction of a request; the caller must hold the
   * request's lock. Returns -EINVAL when not started, -EIO on an unknown
   * state. */
  int set_request_state(RGWHTTPClient *client, RGWHTTPRequestSetState state);
};
performance may be affected" << dendl; +#endif +} + +#if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L +void init_ssl() { + ::openssl::init_ssl(); +} + +bool fe_inits_ssl(boost::optional m, long& curl_global_flags){ + if (m) { + for (const auto& kv: *m){ + if (kv.first == "civetweb" || kv.first == "beast"){ + std::string cert; + kv.second->get_val("ssl_certificate","", &cert); + if (!cert.empty()){ + /* TODO this flag is no op for curl > 7.57 */ + curl_global_flags &= ~CURL_GLOBAL_SSL; + return true; + } + } + } + } + return false; +} +#endif // WITH_CURL_OPENSSL + +std::once_flag curl_init_flag; + +void setup_curl(boost::optional m) { + check_curl(); + + long curl_global_flags = CURL_GLOBAL_ALL; + + #if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L + if (!fe_inits_ssl(m, curl_global_flags)) + init_ssl(); + #endif + + std::call_once(curl_init_flag, curl_global_init, curl_global_flags); + rgw_setup_saved_curl_handles(); +} + +void cleanup_curl() { + rgw_release_all_curl_handles(); + curl_global_cleanup(); +} + +} /* namespace curl */ +} /* namespace rgw */ diff --git a/src/rgw/rgw_http_client_curl.h b/src/rgw/rgw_http_client_curl.h new file mode 100644 index 00000000..d9080f44 --- /dev/null +++ b/src/rgw/rgw_http_client_curl.h @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 SUSE Linux GmBH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef RGW_HTTP_CLIENT_CURL_H +#define RGW_HTTP_CLIENT_CURL_H + +#include +#include +#include "rgw_frontend.h" + +namespace rgw { +namespace curl { +using fe_map_t = std::multimap ; + +void setup_curl(boost::optional m); +void cleanup_curl(); +} +} + +#endif diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h new file mode 100644 index 00000000..22113448 --- /dev/null +++ b/src/rgw/rgw_http_errors.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_HTTP_ERRORS_H_ +#define RGW_HTTP_ERRORS_H_ + +#include "rgw_common.h" + +typedef const std::map> rgw_http_errors; + +extern rgw_http_errors rgw_http_s3_errors; + +extern rgw_http_errors rgw_http_swift_errors; + +extern rgw_http_errors rgw_http_sts_errors; + +extern rgw_http_errors rgw_http_iam_errors; + +static inline int rgw_http_error_to_errno(int http_err) +{ + if (http_err >= 200 && http_err <= 299) + return 0; + switch (http_err) { + case 304: + return -ERR_NOT_MODIFIED; + case 400: + return -EINVAL; + case 401: + return -EPERM; + case 403: + return -EACCES; + case 404: + return -ENOENT; + case 409: + return -ENOTEMPTY; + case 503: + return -EBUSY; + default: + return -EIO; + } + + return 0; /* unreachable */ +} + + +#endif diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc new file mode 100644 index 00000000..53573993 --- /dev/null +++ b/src/rgw/rgw_iam_policy.cc @@ -0,0 +1,1432 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#include +#include +#include +#include +#include +#include + +#include + +#include "rapidjson/reader.h" + +#include "rgw_auth.h" +#include +#include "rgw_iam_policy.h" + +namespace { +constexpr int dout_subsys = ceph_subsys_rgw; +} + +using std::bitset; +using std::find; +using std::int64_t; +using std::move; +using std::pair; +using std::size_t; +using std::string; +using std::stringstream; +using 
// One row of the action lookup table: the textual action name as it appears
// in a policy document, and the bit position that represents it in a
// statement's action / notaction sets.
struct actpair {
  char const* name;   // e.g. "s3:GetObject"
  uint64_t const bit; // index of the matching action bit
};
s3GetBucketObjectLockConfiguration }, + { "s3:GetObjectAcl", s3GetObjectAcl }, + { "s3:GetObject", s3GetObject }, + { "s3:GetObjectTorrent", s3GetObjectTorrent }, + { "s3:GetObjectVersionAcl", s3GetObjectVersionAcl }, + { "s3:GetObjectVersion", s3GetObjectVersion }, + { "s3:GetObjectVersionTorrent", s3GetObjectVersionTorrent }, + { "s3:GetObjectTagging", s3GetObjectTagging }, + { "s3:GetObjectVersionTagging", s3GetObjectVersionTagging}, + { "s3:GetObjectRetention", s3GetObjectRetention}, + { "s3:GetObjectLegalHold", s3GetObjectLegalHold}, + { "s3:GetReplicationConfiguration", s3GetReplicationConfiguration }, + { "s3:ListAllMyBuckets", s3ListAllMyBuckets }, + { "s3:ListBucketMultipartUploads", s3ListBucketMultipartUploads }, + { "s3:ListBucket", s3ListBucket }, + { "s3:ListBucketVersions", s3ListBucketVersions }, + { "s3:ListMultipartUploadParts", s3ListMultipartUploadParts }, + { "s3:PutAccelerateConfiguration", s3PutAccelerateConfiguration }, + { "s3:PutBucketAcl", s3PutBucketAcl }, + { "s3:PutBucketCORS", s3PutBucketCORS }, + { "s3:PutBucketLogging", s3PutBucketLogging }, + { "s3:PutBucketNotification", s3PutBucketNotification }, + { "s3:PutBucketPolicy", s3PutBucketPolicy }, + { "s3:PutBucketRequestPayment", s3PutBucketRequestPayment }, + { "s3:PutBucketTagging", s3PutBucketTagging }, + { "s3:PutBucketVersioning", s3PutBucketVersioning }, + { "s3:PutBucketWebsite", s3PutBucketWebsite }, + { "s3:PutLifecycleConfiguration", s3PutLifecycleConfiguration }, + { "s3:PutBucketObjectLockConfiguration", s3PutBucketObjectLockConfiguration }, + { "s3:PutObjectAcl", s3PutObjectAcl }, + { "s3:PutObject", s3PutObject }, + { "s3:PutObjectVersionAcl", s3PutObjectVersionAcl }, + { "s3:PutObjectTagging", s3PutObjectTagging }, + { "s3:PutObjectVersionTagging", s3PutObjectVersionTagging }, + { "s3:PutObjectRetention", s3PutObjectRetention }, + { "s3:PutObjectLegalHold", s3PutObjectLegalHold }, + { "s3:BypassGovernanceRetention", s3BypassGovernanceRetention }, + { 
"s3:PutReplicationConfiguration", s3PutReplicationConfiguration }, + { "s3:RestoreObject", s3RestoreObject }, + { "iam:PutUserPolicy", iamPutUserPolicy }, + { "iam:GetUserPolicy", iamGetUserPolicy }, + { "iam:DeleteUserPolicy", iamDeleteUserPolicy }, + { "iam:ListUserPolicies", iamListUserPolicies }, + { "iam:CreateRole", iamCreateRole}, + { "iam:DeleteRole", iamDeleteRole}, + { "iam:GetRole", iamGetRole}, + { "iam:ModifyRole", iamModifyRole}, + { "iam:ListRoles", iamListRoles}, + { "iam:PutRolePolicy", iamPutRolePolicy}, + { "iam:GetRolePolicy", iamGetRolePolicy}, + { "iam:ListRolePolicies", iamListRolePolicies}, + { "iam:DeleteRolePolicy", iamDeleteRolePolicy}, + { "sts:AssumeRole", stsAssumeRole}, + { "sts:AssumeRoleWithWebIdentity", stsAssumeRoleWithWebIdentity}, + { "sts:GetSessionToken", stsGetSessionToken}, +}; + +struct PolicyParser; + +const Keyword top[1]{"", TokenKind::pseudo, TokenID::Top, 0, false, + false}; +const Keyword cond_key[1]{"", TokenKind::cond_key, + TokenID::CondKey, 0, true, false}; + +struct ParseState { + PolicyParser* pp; + const Keyword* w; + + bool arraying = false; + bool objecting = false; + bool cond_ifexists = false; + + void reset(); + + ParseState(PolicyParser* pp, const Keyword* w) + : pp(pp), w(w) {} + + bool obj_start(); + + bool obj_end(); + + bool array_start() { + if (w->arrayable && !arraying) { + arraying = true; + return true; + } + return false; + } + + bool array_end(); + + bool key(const char* s, size_t l); + bool do_string(CephContext* cct, const char* s, size_t l); + bool number(const char* str, size_t l); +}; + +// If this confuses you, look up the Curiously Recurring Template Pattern +struct PolicyParser : public BaseReaderHandler, PolicyParser> { + keyword_hash tokens; + std::vector s; + CephContext* cct; + const string& tenant; + Policy& policy; + uint32_t v = 0; + + uint32_t seen = 0; + + uint32_t dex(TokenID in) const { + switch (in) { + case TokenID::Version: + return 0x1; + case TokenID::Id: + return 0x2; + 
case TokenID::Statement: + return 0x4; + case TokenID::Sid: + return 0x8; + case TokenID::Effect: + return 0x10; + case TokenID::Principal: + return 0x20; + case TokenID::NotPrincipal: + return 0x40; + case TokenID::Action: + return 0x80; + case TokenID::NotAction: + return 0x100; + case TokenID::Resource: + return 0x200; + case TokenID::NotResource: + return 0x400; + case TokenID::Condition: + return 0x800; + case TokenID::AWS: + return 0x1000; + case TokenID::Federated: + return 0x2000; + case TokenID::Service: + return 0x4000; + case TokenID::CanonicalUser: + return 0x8000; + default: + ceph_abort(); + } + } + bool test(TokenID in) { + return seen & dex(in); + } + void set(TokenID in) { + seen |= dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v |= dex(in); + } + } + void set(std::initializer_list l) { + for (auto in : l) { + seen |= dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v |= dex(in); + } + } + } + void reset(TokenID in) { + seen &= ~dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v &= ~dex(in); + } + } + void 
reset(std::initializer_list l) { + for (auto in : l) { + seen &= ~dex(in); + if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) | + dex(TokenID::Principal) | dex(TokenID::NotPrincipal) | + dex(TokenID::Action) | dex(TokenID::NotAction) | + dex(TokenID::Resource) | dex(TokenID::NotResource) | + dex(TokenID::Condition) | dex(TokenID::AWS) | + dex(TokenID::Federated) | dex(TokenID::Service) | + dex(TokenID::CanonicalUser))) { + v &= ~dex(in); + } + } + } + void reset(uint32_t& v) { + seen &= ~v; + v = 0; + } + + PolicyParser(CephContext* cct, const string& tenant, Policy& policy) + : cct(cct), tenant(tenant), policy(policy) {} + PolicyParser(const PolicyParser& policy) = delete; + + bool StartObject() { + if (s.empty()) { + s.push_back({this, top}); + s.back().objecting = true; + return true; + } + + return s.back().obj_start(); + } + bool EndObject(SizeType memberCount) { + if (s.empty()) { + return false; + } + return s.back().obj_end(); + } + bool Key(const char* str, SizeType length, bool copy) { + if (s.empty()) { + return false; + } + return s.back().key(str, length); + } + + bool String(const char* str, SizeType length, bool copy) { + if (s.empty()) { + return false; + } + return s.back().do_string(cct, str, length); + } + bool RawNumber(const char* str, SizeType length, bool copy) { + if (s.empty()) { + return false; + } + + return s.back().number(str, length); + } + bool StartArray() { + if (s.empty()) { + return false; + } + + return s.back().array_start(); + } + bool EndArray(SizeType) { + if (s.empty()) { + return false; + } + + return s.back().array_end(); + } + + bool Default() { + return false; + } +}; + + +// I really despise this misfeature of C++. 
+// +bool ParseState::obj_end() { + if (objecting) { + objecting = false; + if (!arraying) { + pp->s.pop_back(); + } else { + reset(); + } + return true; + } + return false; +} + +bool ParseState::key(const char* s, size_t l) { + auto token_len = l; + bool ifexists = false; + if (w->id == TokenID::Condition && w->kind == TokenKind::statement) { + static constexpr char IfExists[] = "IfExists"; + if (boost::algorithm::ends_with(boost::string_view{s, l}, IfExists)) { + ifexists = true; + token_len -= sizeof(IfExists)-1; + } + } + auto k = pp->tokens.lookup(s, token_len); + + if (!k) { + if (w->kind == TokenKind::cond_op) { + auto id = w->id; + auto& t = pp->policy.statements.back(); + auto c_ife = cond_ifexists; + pp->s.emplace_back(pp, cond_key); + t.conditions.emplace_back(id, s, l, c_ife); + return true; + } else { + return false; + } + } + + // If the token we're going with belongs within the condition at the + // top of the stack and we haven't already encountered it, push it + // on the stack + // Top + if ((((w->id == TokenID::Top) && (k->kind == TokenKind::top)) || + // Statement + ((w->id == TokenID::Statement) && (k->kind == TokenKind::statement)) || + + /// Principal + ((w->id == TokenID::Principal || w->id == TokenID::NotPrincipal) && + (k->kind == TokenKind::princ_type))) && + + // Check that it hasn't been encountered. Note that this + // conjoins with the run of disjunctions above. + !pp->test(k->id)) { + pp->set(k->id); + pp->s.emplace_back(pp, k); + return true; + } else if ((w->id == TokenID::Condition) && + (k->kind == TokenKind::cond_op)) { + pp->s.emplace_back(pp, k); + pp->s.back().cond_ifexists = ifexists; + return true; + } + return false; +} + +// I should just rewrite a few helper functions to use iterators, +// which will make all of this ever so much nicer. +static boost::optional parse_principal(CephContext* cct, TokenID t, + string&& s) { + // Wildcard! 
// Handles a string value delivered to this state and stores it in the policy
// according to the keyword (`w`) this state stands for.
//
// cct  - logging context, used when a resource is discarded
// s, l - the string and its length
//
// Returns false when the keyword does not accept a string here, when a
// principal-type value arrives with no enclosing state on the stack, or when
// an Action/NotAction string matched no known action. Unless this state is
// collecting an array, it pops itself off the parse stack before returning
// from the common tail -- including in the "unknown action" case.
//
// NOTE(review): `t` is null when the policy has no statement yet, and the
// statement-level branches dereference it unchecked; presumably the parser
// can only reach them inside a Statement object -- confirm.
// NOTE(review): the target types of the two static_cast expressions are
// missing from this copy of the file (template arguments stripped).
bool ParseState::do_string(CephContext* cct, const char* s, size_t l) {
  auto k = pp->tokens.lookup(s, l);
  Policy& p = pp->policy;
  bool is_action = false;
  bool is_validaction = false;
  Statement* t = p.statements.empty() ? nullptr : &(p.statements.back());

  // Top level!
  if ((w->id == TokenID::Version) && k &&
      k->kind == TokenKind::version_key) {
    p.version = static_cast(k->specific);
  } else if (w->id == TokenID::Id) {
    p.id = string(s, l);

    // Statement

  } else if (w->id == TokenID::Sid) {
    t->sid.emplace(s, l);
  } else if ((w->id == TokenID::Effect) && k &&
             k->kind == TokenKind::effect_key) {
    t->effect = static_cast(k->specific);
  } else if (w->id == TokenID::Principal && s && *s == '*') {
    // Only the first character is inspected.
    t->princ.emplace(Principal::wildcard());
  } else if (w->id == TokenID::NotPrincipal && s && *s == '*') {
    t->noprinc.emplace(Principal::wildcard());
  } else if ((w->id == TokenID::Action) ||
             (w->id == TokenID::NotAction)) {
    is_action = true;
    // Any string whose first character is '*' selects every action.
    if (*s == '*') {
      is_validaction = true;
      (w->id == TokenID::Action ?
        t->action = allValue : t->notaction = allValue);
    } else {
      // Set the bit of every table entry the string matches. Note that `p`
      // here is the table row and shadows the Policy reference above.
      for (auto& p : actpairs) {
        if (match_policy({s, l}, p.name, MATCH_POLICY_ACTION)) {
          is_validaction = true;
          (w->id == TokenID::Action ? t->action[p.bit] = 1 : t->notaction[p.bit] = 1);
        }
        // Once all bits of a service group are set, also set the group's
        // aggregate bit. These checks are repeated on every iteration.
        if ((t->action & s3AllValue) == s3AllValue) {
          t->action[s3All] = 1;
        }
        if ((t->notaction & s3AllValue) == s3AllValue) {
          t->notaction[s3All] = 1;
        }
        if ((t->action & iamAllValue) == iamAllValue) {
          t->action[iamAll] = 1;
        }
        if ((t->notaction & iamAllValue) == iamAllValue) {
          t->notaction[iamAll] = 1;
        }
        if ((t->action & stsAllValue) == stsAllValue) {
          t->action[stsAll] = 1;
        }
        if ((t->notaction & stsAllValue) == stsAllValue) {
          t->notaction[stsAll] = 1;
        }
      }
    }
  } else if (w->id == TokenID::Resource || w->id == TokenID::NotResource) {
    auto a = ARN::parse({s, l}, true);
    // You can't specify resources for someone ELSE'S account.
    if (a && (a->account.empty() || a->account == pp->tenant ||
              a->account == "*")) {
      // An empty or wildcard account is rewritten to the parsing tenant.
      if (a->account.empty() || a->account == "*")
        a->account = pp->tenant;
      (w->id == TokenID::Resource ? t->resource : t->notresource)
        .emplace(std::move(*a));
    }
    else
      // Unparsable or foreign-account resources are logged and dropped;
      // this is not treated as a parse failure.
      ldout(cct, 0) << "Supplied resource is discarded: " << string(s, l)
                    << dendl;
  } else if (w->kind == TokenKind::cond_key) {
    // A value for the condition most recently added by key().
    auto& t = pp->policy.statements.back();
    t.conditions.back().vals.emplace_back(s, l);

    // Principals

  } else if (w->kind == TokenKind::princ_type) {
    if (pp->s.size() <= 1) {
      return false;
    }
    // The state just below this one tells whether we are inside
    // "Principal" or "NotPrincipal".
    auto& pri = pp->s[pp->s.size() - 2].w->id == TokenID::Principal ?
      t->princ : t->noprinc;


    // parse_principal() logs and yields nothing for values it rejects;
    // such values are skipped without failing the parse.
    if (auto o = parse_principal(pp->cct, w->id, string(s, l))) {
      pri.emplace(std::move(*o));
    }

    // Failure

  } else {
    return false;
  }

  // A scalar value completes this state. Popping destroys *this, so only
  // locals are used from here on.
  if (!arraying) {
    pp->s.pop_back();
  }

  if (is_action && !is_validaction){
    return false;
  }

  return true;
}
+ for (int i = 3; i >= 0; --i) { + uint8_t b = 0; + for (int j = 7; j >= 0; --j) { + b |= (ip.addr[(i * 8) + j] << j); + } + m << (unsigned int) b; + if (i != 0) { + m << "."; + } + } + } + m << "/" << dec << ip.prefix; + // It would explain a lot + return m; +} + +bool Condition::eval(const Environment& env) const { + auto i = env.find(key); + if (op == TokenID::Null) { + return i == env.end() ? true : false; + } + + if (i == env.end()) { + return ifexists; + } + const auto& s = i->second; + + switch (op) { + // String! + case TokenID::StringEquals: + return orrible(std::equal_to(), s, vals); + + case TokenID::StringNotEquals: + return orrible(std::not_fn(std::equal_to()), + s, vals); + + case TokenID::StringEqualsIgnoreCase: + return orrible(ci_equal_to(), s, vals); + + case TokenID::StringNotEqualsIgnoreCase: + return orrible(std::not_fn(ci_equal_to()), s, vals); + + case TokenID::StringLike: + return orrible(string_like(), s, vals); + + case TokenID::StringNotLike: + return orrible(std::not_fn(string_like()), s, vals); + + // Numeric + case TokenID::NumericEquals: + return shortible(std::equal_to(), as_number, s, vals); + + case TokenID::NumericNotEquals: + return shortible(std::not_fn(std::equal_to()), + as_number, s, vals); + + + case TokenID::NumericLessThan: + return shortible(std::less(), as_number, s, vals); + + + case TokenID::NumericLessThanEquals: + return shortible(std::less_equal(), as_number, s, vals); + + case TokenID::NumericGreaterThan: + return shortible(std::greater(), as_number, s, vals); + + case TokenID::NumericGreaterThanEquals: + return shortible(std::greater_equal(), as_number, s, vals); + + // Date! 
+ case TokenID::DateEquals: + return shortible(std::equal_to(), as_date, s, vals); + + case TokenID::DateNotEquals: + return shortible(std::not_fn(std::equal_to()), + as_date, s, vals); + + case TokenID::DateLessThan: + return shortible(std::less(), as_date, s, vals); + + + case TokenID::DateLessThanEquals: + return shortible(std::less_equal(), as_date, s, vals); + + case TokenID::DateGreaterThan: + return shortible(std::greater(), as_date, s, vals); + + case TokenID::DateGreaterThanEquals: + return shortible(std::greater_equal(), as_date, s, + vals); + + // Bool! + case TokenID::Bool: + return shortible(std::equal_to(), as_bool, s, vals); + + // Binary! + case TokenID::BinaryEquals: + return shortible(std::equal_to(), as_binary, s, + vals); + + // IP Address! + case TokenID::IpAddress: + return shortible(std::equal_to(), as_network, s, vals); + + case TokenID::NotIpAddress: + { + auto xc = as_network(s); + if (!xc) { + return false; + } + + for (const string& d : vals) { + auto xd = as_network(d); + if (!xd) { + continue; + } + + if (xc == xd) { + return false; + } + } + return true; + } + +#if 0 + // Amazon Resource Names! (Does S3 need this?) + TokenID::ArnEquals, TokenID::ArnNotEquals, TokenID::ArnLike, + TokenID::ArnNotLike, +#endif + + default: + return false; + } +} + +boost::optional Condition::as_network(const string& s) { + MaskedIP m; + if (s.empty()) { + return boost::none; + } + + m.v6 = (s.find(':') == string::npos) ? false : true; + + auto slash = s.find('/'); + if (slash == string::npos) { + m.prefix = m.v6 ? 
128 : 32; + } else { + char* end = 0; + m.prefix = strtoul(s.data() + slash + 1, &end, 10); + if (*end != 0 || (m.v6 && m.prefix > 128) || + (!m.v6 && m.prefix > 32)) { + return boost::none; + } + } + + string t; + auto p = &s; + + if (slash != string::npos) { + t.assign(s, 0, slash); + p = &t; + } + + if (m.v6) { + struct in6_addr a; + if (inet_pton(AF_INET6, p->c_str(), static_cast(&a)) != 1) { + return boost::none; + } + + m.addr |= Address(a.s6_addr[15]) << 0; + m.addr |= Address(a.s6_addr[14]) << 8; + m.addr |= Address(a.s6_addr[13]) << 16; + m.addr |= Address(a.s6_addr[12]) << 24; + m.addr |= Address(a.s6_addr[11]) << 32; + m.addr |= Address(a.s6_addr[10]) << 40; + m.addr |= Address(a.s6_addr[9]) << 48; + m.addr |= Address(a.s6_addr[8]) << 56; + m.addr |= Address(a.s6_addr[7]) << 64; + m.addr |= Address(a.s6_addr[6]) << 72; + m.addr |= Address(a.s6_addr[5]) << 80; + m.addr |= Address(a.s6_addr[4]) << 88; + m.addr |= Address(a.s6_addr[3]) << 96; + m.addr |= Address(a.s6_addr[2]) << 104; + m.addr |= Address(a.s6_addr[1]) << 112; + m.addr |= Address(a.s6_addr[0]) << 120; + } else { + struct in_addr a; + if (inet_pton(AF_INET, p->c_str(), static_cast(&a)) != 1) { + return boost::none; + } + + m.addr = ntohl(a.s_addr); + } + + return m; +} + +namespace { +const char* condop_string(const TokenID t) { + switch (t) { + case TokenID::StringEquals: + return "StringEquals"; + + case TokenID::StringNotEquals: + return "StringNotEquals"; + + case TokenID::StringEqualsIgnoreCase: + return "StringEqualsIgnoreCase"; + + case TokenID::StringNotEqualsIgnoreCase: + return "StringNotEqualsIgnoreCase"; + + case TokenID::StringLike: + return "StringLike"; + + case TokenID::StringNotLike: + return "StringNotLike"; + + // Numeric! 
+ case TokenID::NumericEquals: + return "NumericEquals"; + + case TokenID::NumericNotEquals: + return "NumericNotEquals"; + + case TokenID::NumericLessThan: + return "NumericLessThan"; + + case TokenID::NumericLessThanEquals: + return "NumericLessThanEquals"; + + case TokenID::NumericGreaterThan: + return "NumericGreaterThan"; + + case TokenID::NumericGreaterThanEquals: + return "NumericGreaterThanEquals"; + + case TokenID::DateEquals: + return "DateEquals"; + + case TokenID::DateNotEquals: + return "DateNotEquals"; + + case TokenID::DateLessThan: + return "DateLessThan"; + + case TokenID::DateLessThanEquals: + return "DateLessThanEquals"; + + case TokenID::DateGreaterThan: + return "DateGreaterThan"; + + case TokenID::DateGreaterThanEquals: + return "DateGreaterThanEquals"; + + case TokenID::Bool: + return "Bool"; + + case TokenID::BinaryEquals: + return "BinaryEquals"; + + case TokenID::IpAddress: + return "case TokenID::IpAddress"; + + case TokenID::NotIpAddress: + return "NotIpAddress"; + + case TokenID::ArnEquals: + return "ArnEquals"; + + case TokenID::ArnNotEquals: + return "ArnNotEquals"; + + case TokenID::ArnLike: + return "ArnLike"; + + case TokenID::ArnNotLike: + return "ArnNotLike"; + + case TokenID::Null: + return "Null"; + + default: + return "InvalidConditionOperator"; + } +} + +template +ostream& print_array(ostream& m, Iterator begin, Iterator end) { + if (begin == end) { + m << "[]"; + } else { + m << "[ "; + std::copy(begin, end, std::experimental::make_ostream_joiner(m, ", ")); + m << " ]"; + } + return m; +} + +template +ostream& print_dict(ostream& m, Iterator begin, Iterator end) { + m << "{ "; + std::copy(begin, end, std::experimental::make_ostream_joiner(m, ", ")); + m << " }"; + return m; +} + +} + +ostream& operator <<(ostream& m, const Condition& c) { + m << condop_string(c.op); + if (c.ifexists) { + m << "IfExists"; + } + m << ": { " << c.key; + print_array(m, c.vals.cbegin(), c.vals.cend()); + return m << " }"; +} + +Effect 
Statement::eval(const Environment& e, + boost::optional ida, + uint64_t act, const ARN& res) const { + + if (eval_principal(e, ida) == Effect::Deny) { + return Effect::Pass; + } + + if (!resource.empty()) { + if (!std::any_of(resource.begin(), resource.end(), + [&res](const ARN& pattern) { + return pattern.match(res); + })) { + return Effect::Pass; + } + } else if (!notresource.empty()) { + if (std::any_of(notresource.begin(), notresource.end(), + [&res](const ARN& pattern) { + return pattern.match(res); + })) { + return Effect::Pass; + } + } + + if (!(action[act] == 1) || (notaction[act] == 1)) { + return Effect::Pass; + } + + if (std::all_of(conditions.begin(), + conditions.end(), + [&e](const Condition& c) { return c.eval(e);})) { + return effect; + } + + return Effect::Pass; +} + +Effect Statement::eval_principal(const Environment& e, + boost::optional ida) const { + if (ida) { + if (princ.empty() && noprinc.empty()) { + return Effect::Deny; + } + if (!princ.empty() && !ida->is_identity(princ)) { + return Effect::Deny; + } else if (!noprinc.empty() && ida->is_identity(noprinc)) { + return Effect::Deny; + } + } + return Effect::Allow; +} + +Effect Statement::eval_conditions(const Environment& e) const { + if (std::all_of(conditions.begin(), + conditions.end(), + [&e](const Condition& c) { return c.eval(e);})) { + return Effect::Allow; + } + return Effect::Deny; +} + +namespace { +const char* action_bit_string(uint64_t action) { + switch (action) { + case s3GetObject: + return "s3:GetObject"; + + case s3GetObjectVersion: + return "s3:GetObjectVersion"; + + case s3PutObject: + return "s3:PutObject"; + + case s3GetObjectAcl: + return "s3:GetObjectAcl"; + + case s3GetObjectVersionAcl: + return "s3:GetObjectVersionAcl"; + + case s3PutObjectAcl: + return "s3:PutObjectAcl"; + + case s3PutObjectVersionAcl: + return "s3:PutObjectVersionAcl"; + + case s3DeleteObject: + return "s3:DeleteObject"; + + case s3DeleteObjectVersion: + return "s3:DeleteObjectVersion"; + + case 
s3ListMultipartUploadParts: + return "s3:ListMultipartUploadParts"; + + case s3AbortMultipartUpload: + return "s3:AbortMultipartUpload"; + + case s3GetObjectTorrent: + return "s3:GetObjectTorrent"; + + case s3GetObjectVersionTorrent: + return "s3:GetObjectVersionTorrent"; + + case s3RestoreObject: + return "s3:RestoreObject"; + + case s3CreateBucket: + return "s3:CreateBucket"; + + case s3DeleteBucket: + return "s3:DeleteBucket"; + + case s3ListBucket: + return "s3:ListBucket"; + + case s3ListBucketVersions: + return "s3:ListBucketVersions"; + case s3ListAllMyBuckets: + return "s3:ListAllMyBuckets"; + + case s3ListBucketMultipartUploads: + return "s3:ListBucketMultipartUploads"; + + case s3GetAccelerateConfiguration: + return "s3:GetAccelerateConfiguration"; + + case s3PutAccelerateConfiguration: + return "s3:PutAccelerateConfiguration"; + + case s3GetBucketAcl: + return "s3:GetBucketAcl"; + + case s3PutBucketAcl: + return "s3:PutBucketAcl"; + + case s3GetBucketCORS: + return "s3:GetBucketCORS"; + + case s3PutBucketCORS: + return "s3:PutBucketCORS"; + + case s3GetBucketVersioning: + return "s3:GetBucketVersioning"; + + case s3PutBucketVersioning: + return "s3:PutBucketVersioning"; + + case s3GetBucketRequestPayment: + return "s3:GetBucketRequestPayment"; + + case s3PutBucketRequestPayment: + return "s3:PutBucketRequestPayment"; + + case s3GetBucketLocation: + return "s3:GetBucketLocation"; + + case s3GetBucketPolicy: + return "s3:GetBucketPolicy"; + + case s3DeleteBucketPolicy: + return "s3:DeleteBucketPolicy"; + + case s3PutBucketPolicy: + return "s3:PutBucketPolicy"; + + case s3GetBucketNotification: + return "s3:GetBucketNotification"; + + case s3PutBucketNotification: + return "s3:PutBucketNotification"; + + case s3GetBucketLogging: + return "s3:GetBucketLogging"; + + case s3PutBucketLogging: + return "s3:PutBucketLogging"; + + case s3GetBucketTagging: + return "s3:GetBucketTagging"; + + case s3PutBucketTagging: + return "s3:PutBucketTagging"; + + case 
s3GetBucketWebsite: + return "s3:GetBucketWebsite"; + + case s3PutBucketWebsite: + return "s3:PutBucketWebsite"; + + case s3DeleteBucketWebsite: + return "s3:DeleteBucketWebsite"; + + case s3GetLifecycleConfiguration: + return "s3:GetLifecycleConfiguration"; + + case s3PutLifecycleConfiguration: + return "s3:PutLifecycleConfiguration"; + + case s3PutReplicationConfiguration: + return "s3:PutReplicationConfiguration"; + + case s3GetReplicationConfiguration: + return "s3:GetReplicationConfiguration"; + + case s3DeleteReplicationConfiguration: + return "s3:DeleteReplicationConfiguration"; + + case s3PutObjectTagging: + return "s3:PutObjectTagging"; + + case s3PutObjectVersionTagging: + return "s3:PutObjectVersionTagging"; + + case s3GetObjectTagging: + return "s3:GetObjectTagging"; + + case s3GetObjectVersionTagging: + return "s3:GetObjectVersionTagging"; + + case s3DeleteObjectTagging: + return "s3:DeleteObjectTagging"; + + case s3DeleteObjectVersionTagging: + return "s3:DeleteObjectVersionTagging"; + + case s3PutBucketObjectLockConfiguration: + return "s3:PutBucketObjectLockConfiguration"; + + case s3GetBucketObjectLockConfiguration: + return "s3:GetBucketObjectLockConfiguration"; + + case s3PutObjectRetention: + return "s3:PutObjectRetention"; + + case s3GetObjectRetention: + return "s3:GetObjectRetention"; + + case s3PutObjectLegalHold: + return "s3:PutObjectLegalHold"; + + case s3GetObjectLegalHold: + return "s3:GetObjectLegalHold"; + + case s3BypassGovernanceRetention: + return "s3:BypassGovernanceRetention"; + + case iamPutUserPolicy: + return "iam:PutUserPolicy"; + + case iamGetUserPolicy: + return "iam:GetUserPolicy"; + + case iamListUserPolicies: + return "iam:ListUserPolicies"; + + case iamDeleteUserPolicy: + return "iam:DeleteUserPolicy"; + + case iamCreateRole: + return "iam:CreateRole"; + + case iamDeleteRole: + return "iam:DeleteRole"; + + case iamGetRole: + return "iam:GetRole"; + + case iamModifyRole: + return "iam:ModifyRole"; + + case iamListRoles: 
+ return "iam:ListRoles"; + + case iamPutRolePolicy: + return "iam:PutRolePolicy"; + + case iamGetRolePolicy: + return "iam:GetRolePolicy"; + + case iamListRolePolicies: + return "iam:ListRolePolicies"; + + case iamDeleteRolePolicy: + return "iam:DeleteRolePolicy"; + + case stsAssumeRole: + return "sts:AssumeRole"; + + case stsAssumeRoleWithWebIdentity: + return "sts:AssumeRoleWithWebIdentity"; + + case stsGetSessionToken: + return "sts:GetSessionToken"; + } + return "s3Invalid"; +} + +ostream& print_actions(ostream& m, const Action_t a) { + bool begun = false; + m << "[ "; + for (auto i = 0U; i < allCount; ++i) { + if (a[i] == 1) { + if (begun) { + m << ", "; + } else { + begun = true; + } + m << action_bit_string(i); + } + } + if (begun) { + m << " ]"; + } else { + m << "]"; + } + return m; +} +} + +ostream& operator <<(ostream& m, const Statement& s) { + m << "{ "; + if (s.sid) { + m << "Sid: " << *s.sid << ", "; + } + if (!s.princ.empty()) { + m << "Principal: "; + print_dict(m, s.princ.cbegin(), s.princ.cend()); + m << ", "; + } + if (!s.noprinc.empty()) { + m << "NotPrincipal: "; + print_dict(m, s.noprinc.cbegin(), s.noprinc.cend()); + m << ", "; + } + + m << "Effect: " << + (s.effect == Effect::Allow ? 
+ (const char*) "Allow" : + (const char*) "Deny"); + + if (s.action.any() || s.notaction.any() || !s.resource.empty() || + !s.notresource.empty() || !s.conditions.empty()) { + m << ", "; + } + + if (s.action.any()) { + m << "Action: "; + print_actions(m, s.action); + + if (s.notaction.any() || !s.resource.empty() || + !s.notresource.empty() || !s.conditions.empty()) { + m << ", "; + } + } + + if (s.notaction.any()) { + m << "NotAction: "; + print_actions(m, s.notaction); + + if (!s.resource.empty() || !s.notresource.empty() || + !s.conditions.empty()) { + m << ", "; + } + } + + if (!s.resource.empty()) { + m << "Resource: "; + print_array(m, s.resource.cbegin(), s.resource.cend()); + + if (!s.notresource.empty() || !s.conditions.empty()) { + m << ", "; + } + } + + if (!s.notresource.empty()) { + m << "NotResource: "; + print_array(m, s.notresource.cbegin(), s.notresource.cend()); + + if (!s.conditions.empty()) { + m << ", "; + } + } + + if (!s.conditions.empty()) { + m << "Condition: "; + print_dict(m, s.conditions.cbegin(), s.conditions.cend()); + } + + return m << " }"; +} + +Policy::Policy(CephContext* cct, const string& tenant, + const bufferlist& _text) + : text(_text.to_str()) { + StringStream ss(text.data()); + PolicyParser pp(cct, tenant, *this); + auto pr = Reader{}.Parse(ss, pp); + if (!pr) { + throw PolicyParseException(std::move(pr)); + } +} + +Effect Policy::eval(const Environment& e, + boost::optional ida, + std::uint64_t action, const ARN& resource) const { + auto allowed = false; + for (auto& s : statements) { + auto g = s.eval(e, ida, action, resource); + if (g == Effect::Deny) { + return g; + } else if (g == Effect::Allow) { + allowed = true; + } + } + return allowed ? 
Effect::Allow : Effect::Pass; +} + +Effect Policy::eval_principal(const Environment& e, + boost::optional ida) const { + auto allowed = false; + for (auto& s : statements) { + auto g = s.eval_principal(e, ida); + if (g == Effect::Deny) { + return g; + } else if (g == Effect::Allow) { + allowed = true; + } + } + return allowed ? Effect::Allow : Effect::Deny; +} + +Effect Policy::eval_conditions(const Environment& e) const { + auto allowed = false; + for (auto& s : statements) { + auto g = s.eval_conditions(e); + if (g == Effect::Deny) { + return g; + } else if (g == Effect::Allow) { + allowed = true; + } + } + return allowed ? Effect::Allow : Effect::Deny; +} + +ostream& operator <<(ostream& m, const Policy& p) { + m << "{ Version: " + << (p.version == Version::v2008_10_17 ? "2008-10-17" : "2012-10-17"); + + if (p.id || !p.statements.empty()) { + m << ", "; + } + + if (p.id) { + m << "Id: " << *p.id; + if (!p.statements.empty()) { + m << ", "; + } + } + + if (!p.statements.empty()) { + m << "Statements: "; + print_array(m, p.statements.cbegin(), p.statements.cend()); + m << ", "; + } + return m << " }"; +} + +} +} diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h new file mode 100644 index 00000000..8f7875ca --- /dev/null +++ b/src/rgw/rgw_iam_policy.h @@ -0,0 +1,480 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_IAM_POLICY_H +#define CEPH_RGW_IAM_POLICY_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common/ceph_time.h" +#include "common/iso_8601.h" + +#include "rapidjson/error/error.h" +#include "rapidjson/error/en.h" + +#include "rgw_acl.h" +#include "rgw_basic_types.h" +#include "rgw_iam_policy_keywords.h" +#include "rgw_string.h" +#include "rgw_arn.h" + +class RGWRados; +namespace rgw { +namespace auth { +class Identity; +} +} +struct rgw_obj; +struct rgw_bucket; + +namespace rgw { 
+namespace IAM { + +static constexpr std::uint64_t s3GetObject = 0; +static constexpr std::uint64_t s3GetObjectVersion = 1; +static constexpr std::uint64_t s3PutObject = 2; +static constexpr std::uint64_t s3GetObjectAcl = 3; +static constexpr std::uint64_t s3GetObjectVersionAcl = 4; +static constexpr std::uint64_t s3PutObjectAcl = 5; +static constexpr std::uint64_t s3PutObjectVersionAcl = 6; +static constexpr std::uint64_t s3DeleteObject = 7; +static constexpr std::uint64_t s3DeleteObjectVersion = 8; +static constexpr std::uint64_t s3ListMultipartUploadParts = 9; +static constexpr std::uint64_t s3AbortMultipartUpload = 10; +static constexpr std::uint64_t s3GetObjectTorrent = 11; +static constexpr std::uint64_t s3GetObjectVersionTorrent = 12; +static constexpr std::uint64_t s3RestoreObject = 13; +static constexpr std::uint64_t s3CreateBucket = 14; +static constexpr std::uint64_t s3DeleteBucket = 15; +static constexpr std::uint64_t s3ListBucket = 16; +static constexpr std::uint64_t s3ListBucketVersions = 17; +static constexpr std::uint64_t s3ListAllMyBuckets = 18; +static constexpr std::uint64_t s3ListBucketMultipartUploads = 19; +static constexpr std::uint64_t s3GetAccelerateConfiguration = 20; +static constexpr std::uint64_t s3PutAccelerateConfiguration = 21; +static constexpr std::uint64_t s3GetBucketAcl = 22; +static constexpr std::uint64_t s3PutBucketAcl = 23; +static constexpr std::uint64_t s3GetBucketCORS = 24; +static constexpr std::uint64_t s3PutBucketCORS = 25; +static constexpr std::uint64_t s3GetBucketVersioning = 26; +static constexpr std::uint64_t s3PutBucketVersioning = 27; +static constexpr std::uint64_t s3GetBucketRequestPayment = 28; +static constexpr std::uint64_t s3PutBucketRequestPayment = 29; +static constexpr std::uint64_t s3GetBucketLocation = 30; +static constexpr std::uint64_t s3GetBucketPolicy = 31; +static constexpr std::uint64_t s3DeleteBucketPolicy = 32; +static constexpr std::uint64_t s3PutBucketPolicy = 33; +static constexpr 
std::uint64_t s3GetBucketNotification = 34; +static constexpr std::uint64_t s3PutBucketNotification = 35; +static constexpr std::uint64_t s3GetBucketLogging = 36; +static constexpr std::uint64_t s3PutBucketLogging = 37; +static constexpr std::uint64_t s3GetBucketTagging = 38; +static constexpr std::uint64_t s3PutBucketTagging = 39; +static constexpr std::uint64_t s3GetBucketWebsite = 40; +static constexpr std::uint64_t s3PutBucketWebsite = 41; +static constexpr std::uint64_t s3DeleteBucketWebsite = 42; +static constexpr std::uint64_t s3GetLifecycleConfiguration = 43; +static constexpr std::uint64_t s3PutLifecycleConfiguration = 44; +static constexpr std::uint64_t s3PutReplicationConfiguration = 45; +static constexpr std::uint64_t s3GetReplicationConfiguration = 46; +static constexpr std::uint64_t s3DeleteReplicationConfiguration = 47; +static constexpr std::uint64_t s3GetObjectTagging = 48; +static constexpr std::uint64_t s3PutObjectTagging = 49; +static constexpr std::uint64_t s3DeleteObjectTagging = 50; +static constexpr std::uint64_t s3GetObjectVersionTagging = 51; +static constexpr std::uint64_t s3PutObjectVersionTagging = 52; +static constexpr std::uint64_t s3DeleteObjectVersionTagging = 53; +static constexpr std::uint64_t s3PutBucketObjectLockConfiguration = 54; +static constexpr std::uint64_t s3GetBucketObjectLockConfiguration = 55; +static constexpr std::uint64_t s3PutObjectRetention = 56; +static constexpr std::uint64_t s3GetObjectRetention = 57; +static constexpr std::uint64_t s3PutObjectLegalHold = 58; +static constexpr std::uint64_t s3GetObjectLegalHold = 59; +static constexpr std::uint64_t s3BypassGovernanceRetention = 60; +static constexpr std::uint64_t s3All = 61; + +static constexpr std::uint64_t iamPutUserPolicy = 62; +static constexpr std::uint64_t iamGetUserPolicy = 63; +static constexpr std::uint64_t iamDeleteUserPolicy = 64; +static constexpr std::uint64_t iamListUserPolicies = 65; +static constexpr std::uint64_t iamCreateRole = 66; +static 
constexpr std::uint64_t iamDeleteRole = 67; +static constexpr std::uint64_t iamModifyRole = 68; +static constexpr std::uint64_t iamGetRole = 69; +static constexpr std::uint64_t iamListRoles = 70; +static constexpr std::uint64_t iamPutRolePolicy = 71; +static constexpr std::uint64_t iamGetRolePolicy = 72; +static constexpr std::uint64_t iamListRolePolicies = 73; +static constexpr std::uint64_t iamDeleteRolePolicy = 74; +static constexpr std::uint64_t iamAll = 75; +static constexpr std::uint64_t stsAssumeRole = 76; +static constexpr std::uint64_t stsAssumeRoleWithWebIdentity = 77; +static constexpr std::uint64_t stsGetSessionToken = 78; +static constexpr std::uint64_t stsAll = 79; + +static constexpr std::uint64_t s3Count = s3BypassGovernanceRetention + 1; +static constexpr std::uint64_t allCount = stsAll + 1; + +using Action_t = std::bitset; +using NotAction_t = Action_t; + +static const Action_t None(0); +static const Action_t s3AllValue("1111111111111111111111111111111111111111111111111111111111111"); +static const Action_t iamAllValue("111111111111100000000000000000000000000000000000000000000000000000000000000"); +static const Action_t stsAllValue("1110000000000000000000000000000000000000000000000000000000000000000000000000000"); +//Modify allValue if more Actions are added +static const Action_t allValue("11111111111111111111111111111111111111111111111111111111111111111111111111111111"); + +namespace { +// Please update the table in doc/radosgw/s3/authentication.rst if you +// modify this function. 
+inline int op_to_perm(std::uint64_t op) { + switch (op) { + case s3GetObject: + case s3GetObjectTorrent: + case s3GetObjectVersion: + case s3GetObjectVersionTorrent: + case s3GetObjectTagging: + case s3GetObjectVersionTagging: + case s3GetObjectRetention: + case s3GetObjectLegalHold: + case s3ListAllMyBuckets: + case s3ListBucket: + case s3ListBucketMultipartUploads: + case s3ListBucketVersions: + case s3ListMultipartUploadParts: + return RGW_PERM_READ; + + case s3AbortMultipartUpload: + case s3CreateBucket: + case s3DeleteBucket: + case s3DeleteObject: + case s3DeleteObjectVersion: + case s3PutObject: + case s3PutObjectTagging: + case s3PutObjectVersionTagging: + case s3DeleteObjectTagging: + case s3DeleteObjectVersionTagging: + case s3RestoreObject: + case s3PutObjectRetention: + case s3PutObjectLegalHold: + case s3BypassGovernanceRetention: + return RGW_PERM_WRITE; + + case s3GetAccelerateConfiguration: + case s3GetBucketAcl: + case s3GetBucketCORS: + case s3GetBucketLocation: + case s3GetBucketLogging: + case s3GetBucketNotification: + case s3GetBucketPolicy: + case s3GetBucketRequestPayment: + case s3GetBucketTagging: + case s3GetBucketVersioning: + case s3GetBucketWebsite: + case s3GetLifecycleConfiguration: + case s3GetObjectAcl: + case s3GetObjectVersionAcl: + case s3GetReplicationConfiguration: + case s3GetBucketObjectLockConfiguration: + return RGW_PERM_READ_ACP; + + case s3DeleteBucketPolicy: + case s3DeleteBucketWebsite: + case s3DeleteReplicationConfiguration: + case s3PutAccelerateConfiguration: + case s3PutBucketAcl: + case s3PutBucketCORS: + case s3PutBucketLogging: + case s3PutBucketNotification: + case s3PutBucketPolicy: + case s3PutBucketRequestPayment: + case s3PutBucketTagging: + case s3PutBucketVersioning: + case s3PutBucketWebsite: + case s3PutLifecycleConfiguration: + case s3PutObjectAcl: + case s3PutObjectVersionAcl: + case s3PutReplicationConfiguration: + case s3PutBucketObjectLockConfiguration: + return RGW_PERM_WRITE_ACP; + + case 
s3All: + return RGW_PERM_FULL_CONTROL; + } + return RGW_PERM_INVALID; +} +} + +using Environment = boost::container::flat_map; + +using Address = std::bitset<128>; +struct MaskedIP { + bool v6; + Address addr; + // Since we're mapping IPv6 to IPv4 addresses, we may want to + // consider making the prefix always be in terms of a v6 address + // and just use the v6 bit to rewrite it as a v4 prefix for + // output. + unsigned int prefix; +}; + +std::ostream& operator <<(std::ostream& m, const MaskedIP& ip); + +inline bool operator ==(const MaskedIP& l, const MaskedIP& r) { + auto shift = std::max((l.v6 ? 128 : 32) - ((int) l.prefix), + (r.v6 ? 128 : 32) - ((int) r.prefix)); + ceph_assert(shift >= 0); + return (l.addr >> shift) == (r.addr >> shift); +} + +struct Condition { + TokenID op; + // Originally I was going to use a perfect hash table, but Marcus + // says keys are to be added at run-time not compile time. + + // In future development, use symbol internment. + std::string key; + bool ifexists = false; + // Much to my annoyance there is no actual way to do this in a + // typed way that is compatible with AWS. I know this because I've + // seen examples where the same value is used as a string in one + // context and a date in another. 
+ std::vector vals; + + Condition() = default; + Condition(TokenID op, const char* s, std::size_t len, bool ifexists) + : op(op), key(s, len), ifexists(ifexists) {} + + bool eval(const Environment& e) const; + + static boost::optional as_number(const std::string& s) { + std::size_t p = 0; + + try { + double d = std::stod(s, &p); + if (p < s.length()) { + return boost::none; + } + + return d; + } catch (const std::logic_error& e) { + return boost::none; + } + } + + static boost::optional as_date(const std::string& s) { + std::size_t p = 0; + + try { + double d = std::stod(s, &p); + if (p == s.length()) { + return ceph::real_time( + std::chrono::seconds(static_cast(d)) + + std::chrono::nanoseconds( + static_cast((d - static_cast(d)) + * 1000000000))); + } + + return from_iso_8601(boost::string_ref(s), false); + } catch (const std::logic_error& e) { + return boost::none; + } + } + + static boost::optional as_bool(const std::string& s) { + std::size_t p = 0; + + if (s.empty() || boost::iequals(s, "false")) { + return false; + } + + try { + double d = std::stod(s, &p); + if (p == s.length()) { + return !((d == +0.0) || (d == -0.0) || std::isnan(d)); + } + } catch (const std::logic_error& e) { + // Fallthrough + } + + return true; + } + + static boost::optional as_binary(const std::string& s) { + // In a just world + ceph::bufferlist base64; + // I could populate a bufferlist + base64.push_back(buffer::create_static( + s.length(), + const_cast(s.data()))); // Yuck + // From a base64 encoded std::string. 
+ ceph::bufferlist bin; + + try { + bin.decode_base64(base64); + } catch (const ceph::buffer::malformed_input& e) { + return boost::none; + } + return bin; + } + + static boost::optional as_network(const std::string& s); + + + struct ci_equal_to { + bool operator ()(const std::string& s1, + const std::string& s2) const { + return boost::iequals(s1, s2); + } + }; + + struct string_like { + bool operator ()(const std::string& input, + const std::string& pattern) const { + return match_wildcards(pattern, input, 0); + } + }; + + struct ci_starts_with { + bool operator()(const std::string& s1, + const std::string& s2) const { + return boost::istarts_with(s1, s2); + } + }; + + template + static bool orrible(F&& f, const std::string& c, + const std::vector& v) { + for (const auto& d : v) { + if (std::forward(f)(c, d)) { + return true; + } + } + return false; + } + + template + static bool shortible(F&& f, X& x, const std::string& c, + const std::vector& v) { + auto xc = std::forward(x)(c); + if (!xc) { + return false; + } + + for (const auto& d : v) { + auto xd = std::forward(x)(d); + if (!xd) { + continue; + } + + if (std::forward(f)(*xc, *xd)) { + return true; + } + } + return false; + } + + template + bool has_key_p(const std::string& _key, F p) const { + return p(key, _key); + } +}; + +std::ostream& operator <<(std::ostream& m, const Condition& c); + +struct Statement { + boost::optional sid = boost::none; + + boost::container::flat_set princ; + boost::container::flat_set noprinc; + + // Every statement MUST provide an effect. I just initialize it to + // deny as defensive programming. 
+ Effect effect = Effect::Deny; + + Action_t action = 0; + NotAction_t notaction = 0; + + boost::container::flat_set resource; + boost::container::flat_set notresource; + + std::vector conditions; + + Effect eval(const Environment& e, + boost::optional ida, + std::uint64_t action, const ARN& resource) const; + + Effect eval_principal(const Environment& e, + boost::optional ida) const; + + Effect eval_conditions(const Environment& e) const; +}; + +std::ostream& operator <<(ostream& m, const Statement& s); + +struct PolicyParseException : public std::exception { + rapidjson::ParseResult pr; + + explicit PolicyParseException(rapidjson::ParseResult&& pr) + : pr(pr) { } + const char* what() const noexcept override { + return rapidjson::GetParseError_En(pr.Code()); + } +}; + +struct Policy { + std::string text; + Version version = Version::v2008_10_17; + boost::optional id = boost::none; + + std::vector statements; + + Policy(CephContext* cct, const std::string& tenant, + const bufferlist& text); + + Effect eval(const Environment& e, + boost::optional ida, + std::uint64_t action, const ARN& resource) const; + + Effect eval_principal(const Environment& e, + boost::optional ida) const; + + Effect eval_conditions(const Environment& e) const; + + template + bool has_conditional(const string& conditional, F p) const { + for (const auto&s: statements){ + if (std::any_of(s.conditions.begin(), s.conditions.end(), + [&](const Condition& c) { return c.has_key_p(conditional, p);})) + return true; + } + return false; + } + + bool has_conditional(const string& c) const { + return has_conditional(c, Condition::ci_equal_to()); + } + + bool has_partial_conditional(const string& c) const { + return has_conditional(c, Condition::ci_starts_with()); + } +}; + +std::ostream& operator <<(ostream& m, const Policy& p); +} +} + +#endif diff --git a/src/rgw/rgw_iam_policy_keywords.gperf b/src/rgw/rgw_iam_policy_keywords.gperf new file mode 100644 index 00000000..4f6f22a9 --- /dev/null +++ 
b/src/rgw/rgw_iam_policy_keywords.gperf @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +%language=C++ +%compare-strncmp +%define class-name keyword_hash +%define lookup-function-name lookup +%struct-type +struct Keyword { + const char* name; + TokenKind kind; + TokenID id; + uint64_t specific; + bool arrayable; + bool objectable; +}; +%% +# Top-level +# +Version, TokenKind::top, TokenID::Version, 0, false, false +Id, TokenKind::top, TokenID::Id, 0, false, false +Statement, TokenKind::top, TokenID::Statement, 0, true, true +# +# Statement level +# +Sid, TokenKind::statement, TokenID::Sid, 0, false, false +Effect, TokenKind::statement, TokenID::Effect, 0, false, false +Principal, TokenKind::statement, TokenID::Principal, 0, false, true +NotPrincipal, TokenKind::statement, TokenID::NotPrincipal, 0, true, true +Action, TokenKind::statement, TokenID::Action, 0, true, false +NotAction, TokenKind::statement, TokenID::NotAction, 0, true, false +Resource, TokenKind::statement, TokenID::Resource, 0, true, false +NotResource, TokenKind::statement, TokenID::NotResource, 0, true, false +Condition, TokenKind::statement, TokenID::Condition, 0, true, true +# +# Condition operators +# +# String +StringEquals, TokenKind::cond_op, TokenID::StringEquals, (uint64_t) Type::string, true, true +StringNotEquals, TokenKind::cond_op, TokenID::StringNotEquals, (uint64_t) Type::string, true, true +StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringEqualsIgnoreCase, (uint64_t) Type::string, true, true +StringNotEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringNotEqualsIgnoreCase, (uint64_t) Type::string, true, true +StringLike, TokenKind::cond_op, TokenID::StringLike, (uint64_t) Type::string, true, true, +StringNotLike, TokenKind::cond_op, TokenID::StringNotLike, (uint64_t) Type::string, true, true +# Numeric +NumericEquals, TokenKind::cond_op, TokenID::NumericEquals, (uint64_t) Type::number, true, true 
+NumericNotEquals, TokenKind::cond_op, TokenID::NumericNotEquals, (uint64_t) Type::number, true, true +NumericLessThan, TokenKind::cond_op, TokenID::NumericLessThan, (uint64_t) Type::number, true, true +NumericLessThanEquals, TokenKind::cond_op, TokenID::NumericLessThanEquals, (uint64_t) Type::number, true, true +NumericGreaterThan, TokenKind::cond_op, TokenID::NumericGreaterThan, (uint64_t) Type::number, true, true +NumericGreaterThanEquals, TokenKind::cond_op, TokenID::NumericGreaterThanEquals, (uint64_t) Type::number, true, true +# Date +DateEquals, TokenKind::cond_op, TokenID::DateEquals, (uint64_t) Type::date, true, true +DateNotEquals, TokenKind::cond_op, TokenID::DateNotEquals, (uint64_t) Type::date, true, true +DateLessThan, TokenKind::cond_op, TokenID::DateLessThan, (uint64_t) Type::date, true, true +DateLessThanEquals, TokenKind::cond_op, TokenID::DateLessThanEquals, (uint64_t) Type::date, true, true +DateGreaterThan, TokenKind::cond_op, TokenID::DateGreaterThan, (uint64_t) Type::date, true, true +DateGreaterThanEquals, TokenKind::cond_op, TokenID::DateGreaterThanEquals, (uint64_t) Type::date, true, true +# Bool +Bool, TokenKind::cond_op, TokenID::Bool, (uint64_t) Type::boolean, true, true +# Binary +BinaryEquals, TokenKind::cond_op, TokenID::BinaryEquals, (uint64_t) Type::binary, true, true +# IP Address +IpAddress, TokenKind::cond_op, TokenID::IpAddress, (uint64_t) Type::ipaddr, true, true +NotIpAddress, TokenKind::cond_op, TokenID::NotIpAddress, (uint64_t) Type::ipaddr, true, true +# Amazon Resource Names +ArnEquals, TokenKind::cond_op, TokenID::ArnEquals, (uint64_t) Type::arn, true, true +ArnNotEquals, TokenKind::cond_op, TokenID::ArnNotEquals, (uint64_t) Type::arn, true, true +ArnLike, TokenKind::cond_op, TokenID::ArnLike, (uint64_t) Type::arn, true, true +ArnNotLike, TokenKind::cond_op, TokenID::ArnNotLike, (uint64_t) Type::arn, true, true +# Null +Null, TokenKind::cond_op, TokenID::Null, (uint64_t) Type::null, true, true +# +# Condition keys +# +# 
AWS +#aws:CurrentTime, TokenKind::cond_key, TokenID::awsCurrentTime, (uint64_t) Type::date, true, false +#aws:EpochTime, TokenKind::cond_key, TokenID::awsEpochTime, (uint64_t) Type::date, true, false +#aws:TokenIssueTime, TokenKind::cond_key, TokenID::awsTokenIssueTime, (uint64_t) Type::date, true, false +#aws:MultiFactorAuthPresent, TokenKind::cond_key, TokenID::awsMultiFactorAuthPresent, (uint64_t) Type::boolean, true, false +#aws:MultiFactorAuthAge, TokenKind::cond_key, TokenID::awsMultiFactorAuthAge, (uint64_t) Type::number, true, false +#aws:PrincipalType, TokenKind::cond_key, TokenID::awsPrincipalType, (uint64_t) Type::string, true, false +#aws:Referer, TokenKind::cond_key, TokenID::awsReferer, (uint64_t) Type::string, true, false +#aws:SecureTransport, TokenKind::cond_key, TokenID::awsSecureTransport, (uint64_t) Type::boolean, true, false +#aws:SourceArn, TokenKind::cond_key, TokenID::awsSourceArn, (uint64_t) Type::arn, true, false +#aws:SourceIp, TokenKind::cond_key, TokenID::awsSourceIp, (uint64_t) Type::ipaddr, true, false +#aws:SourceVpc, TokenKind::cond_key, TokenID::awsSourceVpc, (uint64_t) Type::string, true, false +#aws:SourceVpce, TokenKind::cond_key, TokenID::awsSourceVpce, (uint64_t) Type::string, true, false +#aws:UserAgent, TokenKind::cond_key, TokenID::awsUserAgent, (uint64_t) Type::string, true, false +#aws:userid, TokenKind::cond_key, TokenID::awsuserid, (uint64_t) Type::string, true, false +#aws:username, TokenKind::cond_key, TokenID::awsusername, (uint64_t) Type::string, true, false +# S3 +#s3:x-amz-acl, TokenKind::cond_key, TokenID::s3x_amz_acl, (uint64_t) Type::string, true, false +#s3:x-amz-grant-read, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-write, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-read-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false 
+#s3:x-amz-grant-write-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-grant-full-control, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false +#s3:x-amz-copy-source, TokenKind::cond_key, TokenID::s3x_amz_copy_source, (uint64_t) Type::string, true, false +#s3:x-amz-server-side-encryption, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption, (uint64_t) Type::boolean, true, false +#s3:x-amz-server-side-encryption-aws-kms-key-id, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption_aws_kms_key_id, (uint64_t) Type::arn, true, false +#s3:x-amz-metadata-directive, TokenKind::cond_key, TokenID::s3x_amz_metadata_directive, (uint64_t) Type::string, true, false +#s3:x-amz-storage-class, TokenKind::cond_key, TokenID::s3x_amz_storage_class, (uint64_t) Type::string, true, false +#s3:VersionId, TokenKind::cond_key, TokenID::s3VersionId, (uint64_t) Type::string, true, false +#s3:LocationConstraint, TokenKind::cond_key, TokenID::s3LocationConstraint, (uint64_t) Type::string, true, false +#s3:prefix, TokenKind::cond_key, TokenID::s3prefix, (uint64_t) Type::string, true, false +#s3:delimiter, TokenKind::cond_key, TokenID::s3delimiter, (uint64_t) Type::string, true, false +#s3:max-keys, TokenKind::cond_key, TokenID::s3max_keys, (uint64_t) Type::number, true, false +#s3:signatureversion, TokenKind::cond_key, TokenID::s3signatureversion, (uint64_t) Type::string, true, false +#s3:authType, TokenKind::cond_key, TokenID::s3authType, (uint64_t) Type::string, true, false +#s3:signatureAge, TokenKind::cond_key, TokenID::s3signatureAge, (uint64_t) Type::number, true, false +#s3:x-amz-content-sha256, TokenKind::cond_key, TokenID::s3x_amz_content_sha256, (uint64_t) Type::string, true, false +# STS +#sts:authentication, TokenKind::cond_key, TokenID::stsauthentication, (uint64_t) Type::boolean, true, false +# +# Version Keywords +# +2008-10-17, TokenKind::version_key, 
TokenID::v2008_10_17, (uint64_t) Version::v2008_10_17, false, false +2012-10-17, TokenKind::version_key, TokenID::v2012_10_17, (uint64_t) Version::v2012_10_17, false, false +# +# Effect Keywords +# +Allow, TokenKind::effect_key, TokenID::Allow, (uint64_t) Effect::Allow, false, false +Deny, TokenKind::effect_key, TokenID::Deny, (uint64_t) Effect::Deny, false, false +# +# Principal types +# +AWS, TokenKind::princ_type, TokenID::AWS, 0, true, false +Federated, TokenKind::princ_type, TokenID::Federated, 0, true, false +Service, TokenKind::princ_type, TokenID::Service, 0, true, false +CanonicalUser, TokenKind::princ_type, TokenID::CanonicalUser, 0, true, false diff --git a/src/rgw/rgw_iam_policy_keywords.h b/src/rgw/rgw_iam_policy_keywords.h new file mode 100644 index 00000000..a0cd34b6 --- /dev/null +++ b/src/rgw/rgw_iam_policy_keywords.h @@ -0,0 +1,139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_POLICY_S3V2_KEYWORDS_H +#define CEPH_RGW_POLICY_S3V2_KEYWORDS_H + +namespace rgw { +namespace IAM { + +enum class TokenKind { + pseudo, top, statement, cond_op, cond_key, version_key, effect_key, + princ_type +}; + +enum class TokenID { + /// Pseudo-token + Top, + + /// Top-level tokens + Version, Id, Statement, + + /// Statement level tokens + Sid, Effect, Principal, NotPrincipal, Action, NotAction, + Resource, NotResource, Condition, + + /// Condition Operators! + /// Any of these, except Null, can have an IfExists variant. + + // String! + StringEquals, StringNotEquals, StringEqualsIgnoreCase, + StringNotEqualsIgnoreCase, StringLike, StringNotLike, + + // Numeric! + NumericEquals, NumericNotEquals, NumericLessThan, NumericLessThanEquals, + NumericGreaterThan, NumericGreaterThanEquals, + + // Date! + DateEquals, DateNotEquals, DateLessThan, DateLessThanEquals, + DateGreaterThan, DateGreaterThanEquals, + + // Bool! + Bool, + + // Binary! + BinaryEquals, + + // IP Address! 
+ IpAddress, NotIpAddress, + + // Amazon Resource Names! (Does S3 need this?) + ArnEquals, ArnNotEquals, ArnLike, ArnNotLike, + + // Null! + Null, + +#if 0 // Keys are done at runtime now + + /// Condition Keys! + awsCurrentTime, + awsEpochTime, + awsTokenIssueTime, + awsMultiFactorAuthPresent, + awsMultiFactorAuthAge, + awsPrincipalType, + awsReferer, + awsSecureTransport, + awsSourceArn, + awsSourceIp, + awsSourceVpc, + awsSourceVpce, + awsUserAgent, + awsuserid, + awsusername, + s3x_amz_acl, + s3x_amz_grant_permission, + s3x_amz_copy_source, + s3x_amz_server_side_encryption, + s3x_amz_server_side_encryption_aws_kms_key_id, + s3x_amz_metadata_directive, + s3x_amz_storage_class, + s3VersionId, + s3LocationConstraint, + s3prefix, + s3delimiter, + s3max_keys, + s3signatureversion, + s3authType, + s3signatureAge, + s3x_amz_content_sha256, +#else + CondKey, +#endif + + /// + /// Versions! + /// + v2008_10_17, + v2012_10_17, + + /// + /// Effects! + /// + Allow, + Deny, + + /// Principal Types! 
+ AWS, + Federated, + Service, + CanonicalUser +}; + + +enum class Version { + v2008_10_17, + v2012_10_17 +}; + + +enum class Effect { + Allow, + Deny, + Pass +}; + +enum class Type { + string, + number, + date, + boolean, + binary, + ipaddr, + arn, + null +}; +} +} + +#endif // CEPH_RGW_POLICY_S3V2_KEYWORDS_H diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc new file mode 100644 index 00000000..5804d7c7 --- /dev/null +++ b/src/rgw/rgw_json_enc.cc @@ -0,0 +1,1777 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_log.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_cache.h" +#include "rgw_bucket.h" +#include "rgw_keystone.h" +#include "rgw_basic_types.h" +#include "rgw_op.h" +#include "rgw_data_sync.h" +#include "rgw_sync.h" +#include "rgw_orphan.h" + +#include "common/ceph_json.h" +#include "common/Formatter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +void encode_json(const char *name, const obj_version& v, Formatter *f) +{ + f->open_object_section(name); + f->dump_string("tag", v.tag); + f->dump_unsigned("ver", v.ver); + f->close_section(); +} + +void decode_json_obj(obj_version& v, JSONObj *obj) +{ + JSONDecoder::decode_json("tag", v.tag, obj); + JSONDecoder::decode_json("ver", v.ver, obj); +} + +void encode_json(const char *name, const RGWUserCaps& val, Formatter *f) +{ + val.dump(f, name); +} + + +void encode_json(const char *name, const rgw_pool& pool, Formatter *f) +{ + f->dump_string(name, pool.to_str()); +} + +void decode_json_obj(rgw_pool& pool, JSONObj *obj) +{ + string s; + decode_json_obj(s, obj); + pool = rgw_pool(s); +} + +void encode_json(const char *name, const rgw_placement_rule& r, Formatter *f) +{ + encode_json(name, r.to_str(), f); +} + +void decode_json_obj(rgw_placement_rule& v, JSONObj *obj) +{ + string s; + decode_json_obj(s, obj); + 
v.from_str(s); +} + +void RGWOLHInfo::dump(Formatter *f) const +{ + encode_json("target", target, f); +} + +void RGWOLHPendingInfo::dump(Formatter *f) const +{ + utime_t ut(time); + encode_json("time", ut, f); +} + +void RGWObjManifestPart::dump(Formatter *f) const +{ + f->open_object_section("loc"); + loc.dump(f); + f->close_section(); + f->dump_unsigned("loc_ofs", loc_ofs); + f->dump_unsigned("size", size); +} + +void RGWObjManifestRule::dump(Formatter *f) const +{ + encode_json("start_part_num", start_part_num, f); + encode_json("start_ofs", start_ofs, f); + encode_json("part_size", part_size, f); + encode_json("stripe_max_size", stripe_max_size, f); + encode_json("override_prefix", override_prefix, f); +} + +void rgw_bucket_placement::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("placement_rule", placement_rule, f); +} + +void rgw_obj_select::dump(Formatter *f) const +{ + f->dump_string("placement_rule", placement_rule.to_str()); + f->dump_object("obj", obj); + f->dump_object("raw_obj", raw_obj); + f->dump_bool("is_raw", is_raw); +} + +void RGWObjManifest::obj_iterator::dump(Formatter *f) const +{ + f->dump_unsigned("part_ofs", part_ofs); + f->dump_unsigned("stripe_ofs", stripe_ofs); + f->dump_unsigned("ofs", ofs); + f->dump_unsigned("stripe_size", stripe_size); + f->dump_int("cur_part_id", cur_part_id); + f->dump_int("cur_stripe", cur_stripe); + f->dump_string("cur_override_prefix", cur_override_prefix); + f->dump_object("location", location); +} + +void RGWObjManifest::dump(Formatter *f) const +{ + map::const_iterator iter = objs.begin(); + f->open_array_section("objs"); + for (; iter != objs.end(); ++iter) { + f->dump_unsigned("ofs", iter->first); + f->open_object_section("part"); + iter->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_unsigned("obj_size", obj_size); + ::encode_json("explicit_objs", explicit_objs, f); + ::encode_json("head_size", head_size, f); + ::encode_json("max_head_size", 
max_head_size, f); + ::encode_json("prefix", prefix, f); + ::encode_json("rules", rules, f); + ::encode_json("tail_instance", tail_instance, f); + ::encode_json("tail_placement", tail_placement, f); + + f->dump_object("begin_iter", begin_iter); + f->dump_object("end_iter", end_iter); +} + +void rgw_log_entry::dump(Formatter *f) const +{ + f->dump_string("object_owner", object_owner.to_str()); + f->dump_string("bucket_owner", bucket_owner.to_str()); + f->dump_string("bucket", bucket); + f->dump_stream("time") << time; + f->dump_string("remote_addr", remote_addr); + f->dump_string("user", user); + stringstream s; + s << obj; + f->dump_string("obj", s.str()); + f->dump_string("op", op); + f->dump_string("uri", uri); + f->dump_string("http_status", http_status); + f->dump_string("error_code", error_code); + f->dump_unsigned("bytes_sent", bytes_sent); + f->dump_unsigned("bytes_received", bytes_received); + f->dump_unsigned("obj_size", obj_size); + f->dump_stream("total_time") << total_time; + f->dump_string("user_agent", user_agent); + f->dump_string("referrer", referrer); + f->dump_string("bucket_id", bucket_id); +} + +void ACLPermission::dump(Formatter *f) const +{ + f->dump_int("flags", flags); +} + +void ACLGranteeType::dump(Formatter *f) const +{ + f->dump_unsigned("type", type); +} + +void ACLGrant::dump(Formatter *f) const +{ + f->open_object_section("type"); + type.dump(f); + f->close_section(); + + f->dump_string("id", id.to_str()); + f->dump_string("email", email); + + f->open_object_section("permission"); + permission.dump(f); + f->close_section(); + + f->dump_string("name", name); + f->dump_int("group", (int)group); + f->dump_string("url_spec", url_spec); +} + +void RGWAccessControlList::dump(Formatter *f) const +{ + map::const_iterator acl_user_iter = acl_user_map.begin(); + f->open_array_section("acl_user_map"); + for (; acl_user_iter != acl_user_map.end(); ++acl_user_iter) { + f->open_object_section("entry"); + f->dump_string("user", 
acl_user_iter->first); + f->dump_int("acl", acl_user_iter->second); + f->close_section(); + } + f->close_section(); + + map::const_iterator acl_group_iter = acl_group_map.begin(); + f->open_array_section("acl_group_map"); + for (; acl_group_iter != acl_group_map.end(); ++acl_group_iter) { + f->open_object_section("entry"); + f->dump_unsigned("group", acl_group_iter->first); + f->dump_int("acl", acl_group_iter->second); + f->close_section(); + } + f->close_section(); + + multimap::const_iterator giter = grant_map.begin(); + f->open_array_section("grant_map"); + for (; giter != grant_map.end(); ++giter) { + f->open_object_section("entry"); + f->dump_string("id", giter->first); + f->open_object_section("grant"); + giter->second.dump(f); + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +void ACLOwner::dump(Formatter *f) const +{ + encode_json("id", id.to_str(), f); + encode_json("display_name", display_name, f); +} + +void ACLOwner::decode_json(JSONObj *obj) { + string id_str; + JSONDecoder::decode_json("id", id_str, obj); + id.from_str(id_str); + JSONDecoder::decode_json("display_name", display_name, obj); +} + +void RGWAccessControlPolicy::dump(Formatter *f) const +{ + encode_json("acl", acl, f); + encode_json("owner", owner, f); +} + +void ObjectMetaInfo::dump(Formatter *f) const +{ + encode_json("size", size, f); + encode_json("mtime", utime_t(mtime), f); +} + +void ObjectCacheInfo::dump(Formatter *f) const +{ + encode_json("status", status, f); + encode_json("flags", flags, f); + encode_json("data", data, f); + encode_json_map("xattrs", "name", "value", "length", xattrs, f); + encode_json_map("rm_xattrs", "name", "value", "length", rm_xattrs, f); + encode_json("meta", meta, f); + +} + +void RGWCacheNotifyInfo::dump(Formatter *f) const +{ + encode_json("op", op, f); + encode_json("obj", obj, f); + encode_json("obj_info", obj_info, f); + encode_json("ofs", ofs, f); + encode_json("ns", ns, f); +} + +void RGWAccessKey::dump(Formatter *f) 
const +{ + encode_json("access_key", id, f); + encode_json("secret_key", key, f); + encode_json("subuser", subuser, f); +} + +void RGWAccessKey::dump_plain(Formatter *f) const +{ + encode_json("access_key", id, f); + encode_json("secret_key", key, f); +} + +void encode_json_plain(const char *name, const RGWAccessKey& val, Formatter *f) +{ + f->open_object_section(name); + val.dump_plain(f); + f->close_section(); +} + +void RGWAccessKey::dump(Formatter *f, const string& user, bool swift) const +{ + string u = user; + if (!subuser.empty()) { + u.append(":"); + u.append(subuser); + } + encode_json("user", u, f); + if (!swift) { + encode_json("access_key", id, f); + } + encode_json("secret_key", key, f); +} + +void RGWAccessKey::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("access_key", id, obj, true); + JSONDecoder::decode_json("secret_key", key, obj, true); + if (!JSONDecoder::decode_json("subuser", subuser, obj)) { + string user; + JSONDecoder::decode_json("user", user, obj); + int pos = user.find(':'); + if (pos >= 0) { + subuser = user.substr(pos + 1); + } + } +} + +void RGWAccessKey::decode_json(JSONObj *obj, bool swift) { + if (!swift) { + decode_json(obj); + return; + } + + if (!JSONDecoder::decode_json("subuser", subuser, obj)) { + JSONDecoder::decode_json("user", id, obj, true); + int pos = id.find(':'); + if (pos >= 0) { + subuser = id.substr(pos + 1); + } + } + JSONDecoder::decode_json("secret_key", key, obj, true); +} + +struct rgw_flags_desc { + uint32_t mask; + const char *str; +}; + +static struct rgw_flags_desc rgw_perms[] = { + { RGW_PERM_FULL_CONTROL, "full-control" }, + { RGW_PERM_READ | RGW_PERM_WRITE, "read-write" }, + { RGW_PERM_READ, "read" }, + { RGW_PERM_WRITE, "write" }, + { RGW_PERM_READ_ACP, "read-acp" }, + { RGW_PERM_WRITE_ACP, "write-acp" }, + { 0, NULL } +}; + +static void mask_to_str(rgw_flags_desc *mask_list, uint32_t mask, char *buf, int len) +{ + const char *sep = ""; + int pos = 0; + if (!mask) { + snprintf(buf, len, ""); 
+ return; + } + while (mask) { + uint32_t orig_mask = mask; + for (int i = 0; mask_list[i].mask; i++) { + struct rgw_flags_desc *desc = &mask_list[i]; + if ((mask & desc->mask) == desc->mask) { + pos += snprintf(buf + pos, len - pos, "%s%s", sep, desc->str); + /* snprintf returns the length it wanted to write, so on truncation pos can overshoot len; stop here rather than call it again with a negative size and a pointer past buf */ + if (pos >= len) + return; + sep = ", "; + mask &= ~desc->mask; + if (!mask) + return; + } + } + if (mask == orig_mask) // no change + break; + } +} + +/* Render a permission mask into buf using the rgw_perms table. */ +static void perm_to_str(uint32_t mask, char *buf, int len) +{ + return mask_to_str(rgw_perms, mask, buf, len); +} + +static struct rgw_flags_desc op_type_flags[] = { + { RGW_OP_TYPE_READ, "read" }, + { RGW_OP_TYPE_WRITE, "write" }, + { RGW_OP_TYPE_DELETE, "delete" }, + { 0, NULL } +}; + +/* Render an op-type mask into buf using the op_type_flags table. */ +extern void op_type_to_str(uint32_t mask, char *buf, int len) +{ + return mask_to_str(op_type_flags, mask, buf, len); +} + +void RGWSubUser::dump(Formatter *f) const +{ + encode_json("id", name, f); + char buf[256]; + perm_to_str(perm_mask, buf, sizeof(buf)); + encode_json("permissions", (const char *)buf, f); +} + +void RGWSubUser::dump(Formatter *f, const string& user) const +{ + string s = user; + s.append(":"); + s.append(name); + encode_json("id", s, f); + char buf[256]; + perm_to_str(perm_mask, buf, sizeof(buf)); + encode_json("permissions", (const char *)buf, f); +} + +/* Parse a single permission name; any other string (including "read-acp" and "write-acp") yields 0. */ +static uint32_t str_to_perm(const string& s) +{ + if (s.compare("read") == 0) + return RGW_PERM_READ; + else if (s.compare("write") == 0) + return RGW_PERM_WRITE; + else if (s.compare("read-write") == 0) + return RGW_PERM_READ | RGW_PERM_WRITE; + else if (s.compare("full-control") == 0) + return RGW_PERM_FULL_CONTROL; + return 0; +} + +void RGWSubUser::decode_json(JSONObj *obj) +{ + string uid; + JSONDecoder::decode_json("id", uid, obj); + int pos = uid.find(':'); + if (pos >= 0) + name = uid.substr(pos + 1); + string perm_str; + JSONDecoder::decode_json("permissions", perm_str, obj); + perm_mask = str_to_perm(perm_str); +} + +static void user_info_dump_subuser(const char *name, const RGWSubUser& subuser,
Formatter *f, void *parent) +{ + RGWUserInfo *info = static_cast(parent); + subuser.dump(f, info->user_id.to_str()); +} + +static void user_info_dump_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent) +{ + RGWUserInfo *info = static_cast(parent); + key.dump(f, info->user_id.to_str(), false); +} + +static void user_info_dump_swift_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent) +{ + RGWUserInfo *info = static_cast(parent); + key.dump(f, info->user_id.to_str(), true); +} + +void RGWUserInfo::dump(Formatter *f) const +{ + + encode_json("user_id", user_id.to_str(), f); + encode_json("display_name", display_name, f); + encode_json("email", user_email, f); + encode_json("suspended", (int)suspended, f); + encode_json("max_buckets", (int)max_buckets, f); + + encode_json_map("subusers", NULL, "subuser", NULL, user_info_dump_subuser,(void *)this, subusers, f); + encode_json_map("keys", NULL, "key", NULL, user_info_dump_key,(void *)this, access_keys, f); + encode_json_map("swift_keys", NULL, "key", NULL, user_info_dump_swift_key,(void *)this, swift_keys, f); + + encode_json("caps", caps, f); + + char buf[256]; + op_type_to_str(op_mask, buf, sizeof(buf)); + encode_json("op_mask", (const char *)buf, f); + + if (system) { /* no need to show it for every user */ + encode_json("system", (bool)system, f); + } + if (admin) { + encode_json("admin", (bool)admin, f); + } + encode_json("default_placement", default_placement.name, f); + encode_json("default_storage_class", default_placement.storage_class, f); + encode_json("placement_tags", placement_tags, f); + encode_json("bucket_quota", bucket_quota, f); + encode_json("user_quota", user_quota, f); + encode_json("temp_url_keys", temp_url_keys, f); + + string user_source_type; + switch ((RGWIdentityType)type) { + case TYPE_RGW: + user_source_type = "rgw"; + break; + case TYPE_KEYSTONE: + user_source_type = "keystone"; + break; + case TYPE_LDAP: + user_source_type = "ldap"; + break; + 
case TYPE_NONE: + user_source_type = "none"; + break; + default: + user_source_type = "none"; + break; + } + encode_json("type", user_source_type, f); + encode_json("mfa_ids", mfa_ids, f); +} + + +static void decode_access_keys(map& m, JSONObj *o) +{ + RGWAccessKey k; + k.decode_json(o); + m[k.id] = k; +} + +static void decode_swift_keys(map& m, JSONObj *o) +{ + RGWAccessKey k; + k.decode_json(o, true); + m[k.id] = k; +} + +static void decode_subusers(map& m, JSONObj *o) +{ + RGWSubUser u; + u.decode_json(o); + m[u.name] = u; +} + +void RGWUserInfo::decode_json(JSONObj *obj) +{ + string uid; + + JSONDecoder::decode_json("user_id", uid, obj, true); + user_id.from_str(uid); + + JSONDecoder::decode_json("display_name", display_name, obj); + JSONDecoder::decode_json("email", user_email, obj); + bool susp = false; + JSONDecoder::decode_json("suspended", susp, obj); + suspended = (__u8)susp; + JSONDecoder::decode_json("max_buckets", max_buckets, obj); + + JSONDecoder::decode_json("keys", access_keys, decode_access_keys, obj); + JSONDecoder::decode_json("swift_keys", swift_keys, decode_swift_keys, obj); + JSONDecoder::decode_json("subusers", subusers, decode_subusers, obj); + + JSONDecoder::decode_json("caps", caps, obj); + + string mask_str; + JSONDecoder::decode_json("op_mask", mask_str, obj); + rgw_parse_op_type_list(mask_str, &op_mask); + + bool sys = false; + JSONDecoder::decode_json("system", sys, obj); + system = (__u8)sys; + bool ad = false; + JSONDecoder::decode_json("admin", ad, obj); + admin = (__u8)ad; + JSONDecoder::decode_json("default_placement", default_placement.name, obj); + JSONDecoder::decode_json("default_storage_class", default_placement.storage_class, obj); + JSONDecoder::decode_json("placement_tags", placement_tags, obj); + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); + JSONDecoder::decode_json("user_quota", user_quota, obj); + JSONDecoder::decode_json("temp_url_keys", temp_url_keys, obj); + + string user_source_type; + 
JSONDecoder::decode_json("type", user_source_type, obj); + if (user_source_type == "rgw") { + type = TYPE_RGW; + } else if (user_source_type == "keystone") { + type = TYPE_KEYSTONE; + } else if (user_source_type == "ldap") { + type = TYPE_LDAP; + } else if (user_source_type == "none") { + type = TYPE_NONE; + } + JSONDecoder::decode_json("mfa_ids", mfa_ids, obj); +} + +void RGWQuotaInfo::dump(Formatter *f) const +{ + f->dump_bool("enabled", enabled); + f->dump_bool("check_on_raw", check_on_raw); + + f->dump_int("max_size", max_size); + f->dump_int("max_size_kb", rgw_rounded_kb(max_size)); + f->dump_int("max_objects", max_objects); +} + +void RGWQuotaInfo::decode_json(JSONObj *obj) +{ + if (false == JSONDecoder::decode_json("max_size", max_size, obj)) { + /* We're parsing an older version of the struct. */ + int64_t max_size_kb = 0; + + JSONDecoder::decode_json("max_size_kb", max_size_kb, obj); + max_size = max_size_kb * 1024; + } + JSONDecoder::decode_json("max_objects", max_objects, obj); + + JSONDecoder::decode_json("check_on_raw", check_on_raw, obj); + JSONDecoder::decode_json("enabled", enabled, obj); +} + +void rgw_data_placement_target::dump(Formatter *f) const +{ + encode_json("data_pool", data_pool, f); + encode_json("data_extra_pool", data_extra_pool, f); + encode_json("index_pool", index_pool, f); +} + +void rgw_data_placement_target::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("data_pool", data_pool, obj); + JSONDecoder::decode_json("data_extra_pool", data_extra_pool, obj); + JSONDecoder::decode_json("index_pool", index_pool, obj); +} + +void rgw_bucket::dump(Formatter *f) const +{ + encode_json("name", name, f); + encode_json("marker", marker, f); + encode_json("bucket_id", bucket_id, f); + encode_json("tenant", tenant, f); + encode_json("explicit_placement", explicit_placement, f); +} + +void rgw_bucket::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("marker", marker, obj); + 
JSONDecoder::decode_json("bucket_id", bucket_id, obj); + JSONDecoder::decode_json("tenant", tenant, obj); + JSONDecoder::decode_json("explicit_placement", explicit_placement, obj); + if (explicit_placement.data_pool.empty()) { + /* decoding old format */ + JSONDecoder::decode_json("pool", explicit_placement.data_pool, obj); + JSONDecoder::decode_json("data_extra_pool", explicit_placement.data_extra_pool, obj); + JSONDecoder::decode_json("index_pool", explicit_placement.index_pool, obj); + } +} + +void RGWBucketEntryPoint::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("owner", owner, f); + utime_t ut(creation_time); + encode_json("creation_time", ut, f); + encode_json("linked", linked, f); + encode_json("has_bucket_info", has_bucket_info, f); + if (has_bucket_info) { + encode_json("old_bucket_info", old_bucket_info, f); + } +} + +void RGWBucketEntryPoint::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket", bucket, obj); + JSONDecoder::decode_json("owner", owner, obj); + utime_t ut; + JSONDecoder::decode_json("creation_time", ut, obj); + creation_time = ut.to_real_time(); + JSONDecoder::decode_json("linked", linked, obj); + JSONDecoder::decode_json("has_bucket_info", has_bucket_info, obj); + if (has_bucket_info) { + JSONDecoder::decode_json("old_bucket_info", old_bucket_info, obj); + } +} + +void RGWStorageStats::dump(Formatter *f) const +{ + encode_json("size", size, f); + encode_json("size_actual", size_rounded, f); + encode_json("size_utilized", size_utilized, f); + encode_json("size_kb", rgw_rounded_kb(size), f); + encode_json("size_kb_actual", rgw_rounded_kb(size_rounded), f); + encode_json("size_kb_utilized", rgw_rounded_kb(size_utilized), f); + encode_json("num_objects", num_objects, f); +} + +void RGWRedirectInfo::dump(Formatter *f) const +{ + encode_json("protocol", protocol, f); + encode_json("hostname", hostname, f); + encode_json("http_redirect_code", (int)http_redirect_code, f); +} + +void 
RGWRedirectInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("protocol", protocol, obj); + JSONDecoder::decode_json("hostname", hostname, obj); + int code; + JSONDecoder::decode_json("http_redirect_code", code, obj); + http_redirect_code = code; +} + +void RGWBWRedirectInfo::dump(Formatter *f) const +{ + encode_json("redirect", redirect, f); + encode_json("replace_key_prefix_with", replace_key_prefix_with, f); + encode_json("replace_key_with", replace_key_with, f); +} + +void RGWBWRedirectInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("redirect", redirect, obj); + JSONDecoder::decode_json("replace_key_prefix_with", replace_key_prefix_with, obj); + JSONDecoder::decode_json("replace_key_with", replace_key_with, obj); +} + +void RGWBWRoutingRuleCondition::dump(Formatter *f) const +{ + encode_json("key_prefix_equals", key_prefix_equals, f); + encode_json("http_error_code_returned_equals", (int)http_error_code_returned_equals, f); +} + +void RGWBWRoutingRuleCondition::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("key_prefix_equals", key_prefix_equals, obj); + int code; + JSONDecoder::decode_json("http_error_code_returned_equals", code, obj); + http_error_code_returned_equals = code; +} + +void RGWBWRoutingRule::dump(Formatter *f) const +{ + encode_json("condition", condition, f); + encode_json("redirect_info", redirect_info, f); +} + +void RGWBWRoutingRule::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("condition", condition, obj); + JSONDecoder::decode_json("redirect_info", redirect_info, obj); +} + +void RGWBWRoutingRules::dump(Formatter *f) const +{ + encode_json("rules", rules, f); +} + +void RGWBWRoutingRules::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("rules", rules, obj); +} + +void RGWBucketWebsiteConf::dump(Formatter *f) const +{ + if (!redirect_all.hostname.empty()) { + encode_json("redirect_all", redirect_all, f); + } else { + encode_json("index_doc_suffix", index_doc_suffix, f); + 
encode_json("error_doc", error_doc, f);
+    encode_json("routing_rules", routing_rules, f);
+  }
+}
+
+void RGWBucketWebsiteConf::decode_json(JSONObj *obj) {  // reads all four website keys from obj, with no branching on redirect_all
+  JSONDecoder::decode_json("redirect_all", redirect_all, obj);
+  JSONDecoder::decode_json("index_doc_suffix", index_doc_suffix, obj);
+  JSONDecoder::decode_json("error_doc", error_doc, obj);
+  JSONDecoder::decode_json("routing_rules", routing_rules, obj);
+}
+
+void RGWBucketInfo::dump(Formatter *f) const  // writes the bucket info fields to f, one JSON key per field
+{
+  encode_json("bucket", bucket, f);
+  utime_t ut(creation_time);  // creation_time is emitted through a utime_t
+  encode_json("creation_time", ut, f);
+  encode_json("owner", owner.to_str(), f);  // owner is emitted in its to_str() form
+  encode_json("flags", flags, f);
+  encode_json("zonegroup", zonegroup, f);
+  encode_json("placement_rule", placement_rule, f);
+  encode_json("has_instance_obj", has_instance_obj, f);
+  encode_json("quota", quota, f);
+  encode_json("num_shards", num_shards, f);
+  encode_json("bi_shard_hash_type", (uint32_t)bucket_index_shard_hash_type, f);  // emitted as uint32_t
+  encode_json("requester_pays", requester_pays, f);
+  encode_json("has_website", has_website, f);
+  if (has_website) {  // website_conf is only emitted when has_website is set
+    encode_json("website_conf", website_conf, f);
+  }
+  encode_json("swift_versioning", swift_versioning, f);
+  encode_json("swift_ver_location", swift_ver_location, f);
+  encode_json("index_type", (uint32_t)index_type, f);  // enum emitted as uint32_t
+  encode_json("mdsearch_config", mdsearch_config, f);
+  encode_json("reshard_status", (int)reshard_status, f);  // enum emitted as int
+  encode_json("new_bucket_instance_id", new_bucket_instance_id, f);
+}
+
+void RGWBucketInfo::decode_json(JSONObj *obj) {  // reads the bucket info fields back from obj
+  JSONDecoder::decode_json("bucket", bucket, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("creation_time", ut, obj);
+  creation_time = ut.to_real_time();
+  JSONDecoder::decode_json("owner", owner, obj);
+  JSONDecoder::decode_json("flags", flags, obj);
+  JSONDecoder::decode_json("zonegroup", zonegroup, obj);
+  /* backward compatibility with region */
+  if (zonegroup.empty()) {
+    JSONDecoder::decode_json("region", zonegroup, obj);
+  }
+  string pr;  // placement_rule is read as a plain string, then parsed by from_str()
+  JSONDecoder::decode_json("placement_rule", pr, obj);
+  placement_rule.from_str(pr);
+  JSONDecoder::decode_json("has_instance_obj", has_instance_obj, obj);
+  JSONDecoder::decode_json("quota", quota, obj);
+  JSONDecoder::decode_json("num_shards", num_shards, obj);
+  uint32_t hash_type;  // decoded as uint32_t, then narrowed to uint8_t below
+  JSONDecoder::decode_json("bi_shard_hash_type", hash_type, obj);
+  bucket_index_shard_hash_type = (uint8_t)hash_type;
+  JSONDecoder::decode_json("requester_pays", requester_pays, obj);
+  JSONDecoder::decode_json("has_website", has_website, obj);
+  if (has_website) {  // website_conf is only looked up when has_website decoded as set
+    JSONDecoder::decode_json("website_conf", website_conf, obj);
+  }
+  JSONDecoder::decode_json("swift_versioning", swift_versioning, obj);
+  JSONDecoder::decode_json("swift_ver_location", swift_ver_location, obj);
+  uint32_t it;  // decoded as uint32_t, then cast to RGWBucketIndexType
+  JSONDecoder::decode_json("index_type", it, obj);
+  index_type = (RGWBucketIndexType)it;
+  JSONDecoder::decode_json("mdsearch_config", mdsearch_config, obj);
+  int rs;  // decoded as int, then cast to cls_rgw_reshard_status
+  JSONDecoder::decode_json("reshard_status", rs, obj);
+  reshard_status = (cls_rgw_reshard_status)rs;  // NOTE(review): dump() also writes "new_bucket_instance_id", which is not read back here -- confirm this is intentional
+}
+
+void rgw_obj_key::dump(Formatter *f) const  // writes name, instance and ns
+{
+  encode_json("name", name, f);
+  encode_json("instance", instance, f);
+  encode_json("ns", ns, f);
+}
+
+void rgw_obj_key::decode_json(JSONObj *obj)  // reads the same three keys that dump() writes
+{
+  JSONDecoder::decode_json("name", name, obj);
+  JSONDecoder::decode_json("instance", instance, obj);
+  JSONDecoder::decode_json("ns", ns, obj);
+}
+
+void RGWBucketEnt::dump(Formatter *f) const  // writes the bucket entry; creation_time goes out under the key "mtime"
+{
+  encode_json("bucket", bucket, f);
+  encode_json("size", size, f);
+  encode_json("size_rounded", size_rounded, f);
+  utime_t ut(creation_time);
+  encode_json("mtime", ut, f); /* mtime / creation time discrepancy needed for backward compatibility */
+  encode_json("count", count, f);
+  encode_json("placement_rule", placement_rule.to_str(), f);
+}
+
+void RGWUploadPartInfo::dump(Formatter *f) const  // writes num, size, etag and the modified time
+{
+  encode_json("num", num, f);
+  encode_json("size", size, f);
+  encode_json("etag", etag, f);
+  utime_t ut(modified); +
encode_json("modified", ut, f); +} + +void rgw_raw_obj::dump(Formatter *f) const +{ + encode_json("pool", pool, f); + encode_json("oid", oid, f); + encode_json("loc", loc, f); +} + +void rgw_raw_obj::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("oid", oid, obj); + JSONDecoder::decode_json("loc", loc, obj); +} + +void rgw_obj::dump(Formatter *f) const +{ + encode_json("bucket", bucket, f); + encode_json("key", key, f); +} + +void RGWDefaultSystemMetaObjInfo::dump(Formatter *f) const { + encode_json("default_id", default_id, f); +} + +void RGWDefaultSystemMetaObjInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("default_id", default_id, obj); +} + +void RGWNameToId::dump(Formatter *f) const { + encode_json("obj_id", obj_id, f); +} + +void RGWNameToId::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("obj_id", obj_id, obj); +} + +void RGWSystemMetaObj::dump(Formatter *f) const +{ + encode_json("id", id , f); + encode_json("name", name , f); +} + +void RGWSystemMetaObj::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("name", name, obj); +} + +void RGWPeriodLatestEpochInfo::dump(Formatter *f) const { + encode_json("latest_epoch", epoch, f); +} + +void RGWPeriodLatestEpochInfo::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("latest_epoch", epoch, obj); +} + +void RGWPeriod::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("epoch", epoch , f); + encode_json("predecessor_uuid", predecessor_uuid, f); + encode_json("sync_status", sync_status, f); + encode_json("period_map", period_map, f); + encode_json("master_zonegroup", master_zonegroup, f); + encode_json("master_zone", master_zone, f); + encode_json("period_config", period_config, f); + encode_json("realm_id", realm_id, f); + encode_json("realm_name", realm_name, f); + encode_json("realm_epoch", realm_epoch, f); +} + +void RGWPeriod::decode_json(JSONObj *obj) +{ + 
JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("epoch", epoch, obj); + JSONDecoder::decode_json("predecessor_uuid", predecessor_uuid, obj); + JSONDecoder::decode_json("sync_status", sync_status, obj); + JSONDecoder::decode_json("period_map", period_map, obj); + JSONDecoder::decode_json("master_zonegroup", master_zonegroup, obj); + JSONDecoder::decode_json("master_zone", master_zone, obj); + JSONDecoder::decode_json("period_config", period_config, obj); + JSONDecoder::decode_json("realm_id", realm_id, obj); + JSONDecoder::decode_json("realm_name", realm_name, obj); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +void RGWZoneParams::dump(Formatter *f) const +{ + RGWSystemMetaObj::dump(f); + encode_json("domain_root", domain_root, f); + encode_json("control_pool", control_pool, f); + encode_json("gc_pool", gc_pool, f); + encode_json("lc_pool", lc_pool, f); + encode_json("log_pool", log_pool, f); + encode_json("intent_log_pool", intent_log_pool, f); + encode_json("usage_log_pool", usage_log_pool, f); + encode_json("reshard_pool", reshard_pool, f); + encode_json("user_keys_pool", user_keys_pool, f); + encode_json("user_email_pool", user_email_pool, f); + encode_json("user_swift_pool", user_swift_pool, f); + encode_json("user_uid_pool", user_uid_pool, f); + encode_json("otp_pool", otp_pool, f); + encode_json_plain("system_key", system_key, f); + encode_json("placement_pools", placement_pools, f); + encode_json("metadata_heap", metadata_heap, f); + encode_json("tier_config", tier_config, f); + encode_json("realm_id", realm_id, f); +} + +void RGWZoneStorageClass::dump(Formatter *f) const +{ + if (data_pool) { + encode_json("data_pool", data_pool.get(), f); + } + if (compression_type) { + encode_json("compression_type", compression_type.get(), f); + } +} + +void RGWZoneStorageClass::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("data_pool", data_pool, obj); + JSONDecoder::decode_json("compression_type", compression_type, 
obj); +} + +void RGWZoneStorageClasses::dump(Formatter *f) const +{ + for (auto& i : m) { + encode_json(i.first.c_str(), i.second, f); + } +} + +void RGWZoneStorageClasses::decode_json(JSONObj *obj) +{ + JSONFormattable f; + decode_json_obj(f, obj); + + for (auto& field : f.object()) { + JSONObj *field_obj = obj->find_obj(field.first); + assert(field_obj); + + decode_json_obj(m[field.first], field_obj); + } + standard_class = &m[RGW_STORAGE_CLASS_STANDARD]; +} + +void RGWZonePlacementInfo::dump(Formatter *f) const +{ + encode_json("index_pool", index_pool, f); + encode_json("storage_classes", storage_classes, f); + encode_json("data_extra_pool", data_extra_pool, f); + encode_json("index_type", (uint32_t)index_type, f); + + /* no real need for backward compatibility of compression_type and data_pool in here, + * rather not clutter the output */ +} + +void RGWZonePlacementInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("index_pool", index_pool, obj); + JSONDecoder::decode_json("storage_classes", storage_classes, obj); + JSONDecoder::decode_json("data_extra_pool", data_extra_pool, obj); + uint32_t it; + JSONDecoder::decode_json("index_type", it, obj); + index_type = (RGWBucketIndexType)it; + + /* backward compatibility, these are now defined in storage_classes */ + string standard_compression_type; + string *pcompression = nullptr; + if (JSONDecoder::decode_json("compression", standard_compression_type, obj)) { + pcompression = &standard_compression_type; + } + rgw_pool standard_data_pool; + rgw_pool *ppool = nullptr; + if (JSONDecoder::decode_json("data_pool", standard_data_pool, obj)) { + ppool = &standard_data_pool; + } + if (ppool || pcompression) { + storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, ppool, pcompression); + } +} + +void RGWZoneParams::decode_json(JSONObj *obj) +{ + RGWSystemMetaObj::decode_json(obj); + JSONDecoder::decode_json("domain_root", domain_root, obj); + JSONDecoder::decode_json("control_pool", control_pool, obj); 
+ JSONDecoder::decode_json("gc_pool", gc_pool, obj); + JSONDecoder::decode_json("lc_pool", lc_pool, obj); + JSONDecoder::decode_json("log_pool", log_pool, obj); + JSONDecoder::decode_json("intent_log_pool", intent_log_pool, obj); + JSONDecoder::decode_json("reshard_pool", reshard_pool, obj); + JSONDecoder::decode_json("usage_log_pool", usage_log_pool, obj); + JSONDecoder::decode_json("user_keys_pool", user_keys_pool, obj); + JSONDecoder::decode_json("user_email_pool", user_email_pool, obj); + JSONDecoder::decode_json("user_swift_pool", user_swift_pool, obj); + JSONDecoder::decode_json("user_uid_pool", user_uid_pool, obj); + JSONDecoder::decode_json("otp_pool", otp_pool, obj); + JSONDecoder::decode_json("system_key", system_key, obj); + JSONDecoder::decode_json("placement_pools", placement_pools, obj); + JSONDecoder::decode_json("metadata_heap", metadata_heap, obj); + JSONDecoder::decode_json("tier_config", tier_config, obj); + JSONDecoder::decode_json("realm_id", realm_id, obj); + +} + +void RGWZone::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("name", name, f); + encode_json("endpoints", endpoints, f); + encode_json("log_meta", log_meta, f); + encode_json("log_data", log_data, f); + encode_json("bucket_index_max_shards", bucket_index_max_shards, f); + encode_json("read_only", read_only, f); + encode_json("tier_type", tier_type, f); + encode_json("sync_from_all", sync_from_all, f); + encode_json("sync_from", sync_from, f); + encode_json("redirect_zone", redirect_zone, f); +} + +void RGWZone::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("name", name, obj); + if (id.empty()) { + id = name; + } + JSONDecoder::decode_json("endpoints", endpoints, obj); + JSONDecoder::decode_json("log_meta", log_meta, obj); + JSONDecoder::decode_json("log_data", log_data, obj); + JSONDecoder::decode_json("bucket_index_max_shards", bucket_index_max_shards, obj); + JSONDecoder::decode_json("read_only", 
read_only, obj); + JSONDecoder::decode_json("tier_type", tier_type, obj); + JSONDecoder::decode_json("sync_from_all", sync_from_all, true, obj); + JSONDecoder::decode_json("sync_from", sync_from, obj); + JSONDecoder::decode_json("redirect_zone", redirect_zone, obj); +} + +void RGWZoneGroupPlacementTarget::dump(Formatter *f) const +{ + encode_json("name", name, f); + encode_json("tags", tags, f); + encode_json("storage_classes", storage_classes, f); +} + +void RGWZoneGroupPlacementTarget::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("tags", tags, obj); + JSONDecoder::decode_json("storage_classes", storage_classes, obj); + if (storage_classes.empty()) { + storage_classes.insert(RGW_STORAGE_CLASS_STANDARD); + } +} + +void RGWZoneGroup::dump(Formatter *f) const +{ + RGWSystemMetaObj::dump(f); + encode_json("api_name", api_name, f); + encode_json("is_master", is_master, f); + encode_json("endpoints", endpoints, f); + encode_json("hostnames", hostnames, f); + encode_json("hostnames_s3website", hostnames_s3website, f); + encode_json("master_zone", master_zone, f); + encode_json_map("zones", zones, f); /* more friendly representation */ + encode_json_map("placement_targets", placement_targets, f); /* more friendly representation */ + encode_json("default_placement", default_placement, f); + encode_json("realm_id", realm_id, f); +} + +static void decode_zones(map& zones, JSONObj *o) +{ + RGWZone z; + z.decode_json(o); + zones[z.id] = z; +} + +static void decode_placement_targets(map& targets, JSONObj *o) +{ + RGWZoneGroupPlacementTarget t; + t.decode_json(o); + targets[t.name] = t; +} + + +void RGWZoneGroup::decode_json(JSONObj *obj) +{ + RGWSystemMetaObj::decode_json(obj); + if (id.empty()) { + derr << "old format " << dendl; + JSONDecoder::decode_json("name", name, obj); + id = name; + } + JSONDecoder::decode_json("api_name", api_name, obj); + JSONDecoder::decode_json("is_master", is_master, obj); + 
JSONDecoder::decode_json("endpoints", endpoints, obj);
+  JSONDecoder::decode_json("hostnames", hostnames, obj);
+  JSONDecoder::decode_json("hostnames_s3website", hostnames_s3website, obj);
+  JSONDecoder::decode_json("master_zone", master_zone, obj);
+  JSONDecoder::decode_json("zones", zones, decode_zones, obj);  // entries are decoded through the decode_zones callback
+  JSONDecoder::decode_json("placement_targets", placement_targets, decode_placement_targets, obj);  // entries are decoded through decode_placement_targets
+  JSONDecoder::decode_json("default_placement", default_placement.name, obj);  // "default_placement" fills only the name member
+  JSONDecoder::decode_json("default_storage_class", default_placement.storage_class, obj);  // storage class arrives under its own key
+  JSONDecoder::decode_json("realm_id", realm_id, obj);
+}
+
+
+void RGWPeriodMap::dump(Formatter *f) const  // writes id, zonegroups (via encode_json_map) and short_zone_ids
+{
+  encode_json("id", id, f);
+  encode_json_map("zonegroups", zonegroups, f);
+  encode_json("short_zone_ids", short_zone_ids, f);
+}
+
+static void decode_zonegroups(map& zonegroups, JSONObj *o)  // NOTE(review): the map's template arguments appear to have been lost in extraction
+{
+  RGWZoneGroup zg;
+  zg.decode_json(o);
+  zonegroups[zg.get_id()] = zg;  // decoded zonegroup is stored under its own id
+}
+
+void RGWPeriodMap::decode_json(JSONObj *obj)  // reads id, zonegroups and short_zone_ids, with fallbacks to the old region keys
+{
+  JSONDecoder::decode_json("id", id, obj);
+  JSONDecoder::decode_json("zonegroups", zonegroups, decode_zonegroups, obj);
+  /* backward compatibility with region */
+  if (zonegroups.empty()) {
+    JSONDecoder::decode_json("regions", zonegroups, obj);
+  }
+  /* backward compatibility with region */
+  if (master_zonegroup.empty()) {  // NOTE(review): no "master_zonegroup" key is decoded in this function before this check -- confirm
+    JSONDecoder::decode_json("master_region", master_zonegroup, obj);
+  }
+  JSONDecoder::decode_json("short_zone_ids", short_zone_ids, obj);
+}
+
+
+void RGWPeriodConfig::dump(Formatter *f) const  // writes the bucket and user quota settings
+{
+  encode_json("bucket_quota", bucket_quota, f);
+  encode_json("user_quota", user_quota, f);
+}
+
+void RGWPeriodConfig::decode_json(JSONObj *obj)  // reads the same two keys that dump() writes
+{
+  JSONDecoder::decode_json("bucket_quota", bucket_quota, obj);
+  JSONDecoder::decode_json("user_quota", user_quota, obj);
+}
+
+void RGWRegionMap::dump(Formatter *f) const  // writes regions, master_region and the quota settings
+{
+  encode_json("regions", regions, f);
+  encode_json("master_region", master_region, f);
+  encode_json("bucket_quota", bucket_quota, f);
+  encode_json("user_quota", user_quota, f);
+}
+
+void RGWRegionMap::decode_json(JSONObj *obj)  // reads regions, master_region and the two quota settings
+{
+  JSONDecoder::decode_json("regions", regions, obj);
+  JSONDecoder::decode_json("master_region", master_region, obj);
+  JSONDecoder::decode_json("bucket_quota", bucket_quota, obj);
+  JSONDecoder::decode_json("user_quota", user_quota, obj);
+}
+
+void RGWZoneGroupMap::dump(Formatter *f) const  // writes zonegroups, master_zonegroup and the two quota settings
+{
+  encode_json("zonegroups", zonegroups, f);
+  encode_json("master_zonegroup", master_zonegroup, f);
+  encode_json("bucket_quota", bucket_quota, f);
+  encode_json("user_quota", user_quota, f);
+}
+
+void RGWZoneGroupMap::decode_json(JSONObj *obj)  // reads the map; falls back to the old "regions"/"master_region" keys when the new ones decode empty
+{
+  JSONDecoder::decode_json("zonegroups", zonegroups, obj);
+  /* backward compatibility with region */
+  if (zonegroups.empty()) {
+    JSONDecoder::decode_json("regions", zonegroups, obj);
+  }
+  JSONDecoder::decode_json("master_zonegroup", master_zonegroup, obj);
+  /* backward compatibility with region */
+  if (master_zonegroup.empty()) {
+    JSONDecoder::decode_json("master_region", master_zonegroup, obj);
+  }
+
+  JSONDecoder::decode_json("bucket_quota", bucket_quota, obj);
+  JSONDecoder::decode_json("user_quota", user_quota, obj);
+}
+
+void RGWMetadataLogInfo::dump(Formatter *f) const  // writes marker and last_update
+{
+  encode_json("marker", marker, f);
+  utime_t ut(last_update);  // last_update is emitted through a utime_t
+  encode_json("last_update", ut, f);
+}
+
+void RGWMetadataLogInfo::decode_json(JSONObj *obj)  // reads marker and last_update
+{
+  JSONDecoder::decode_json("marker", marker, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("last_update", ut, obj);
+  last_update = ut.to_real_time();  // converted back from the decoded utime_t
+}
+
+void RGWDataChangesLogInfo::dump(Formatter *f) const  // same key layout as RGWMetadataLogInfo::dump
+{
+  encode_json("marker", marker, f);
+  utime_t ut(last_update);
+  encode_json("last_update", ut, f);
+}
+
+void RGWDataChangesLogInfo::decode_json(JSONObj *obj)  // same key layout as RGWMetadataLogInfo::decode_json
+{
+  JSONDecoder::decode_json("marker", marker, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("last_update", ut, obj);
+  last_update = ut.to_real_time();
+}
+
+
+void RGWRealm::dump(Formatter *f) const  // writes the base-class fields first, then current_period and epoch
+{ +
RGWSystemMetaObj::dump(f); + encode_json("current_period", current_period, f); + encode_json("epoch", epoch, f); +} + + +void RGWRealm::decode_json(JSONObj *obj) +{ + RGWSystemMetaObj::decode_json(obj); + JSONDecoder::decode_json("current_period", current_period, obj); + JSONDecoder::decode_json("epoch", epoch, obj); +} + +void rgw::keystone::TokenEnvelope::Token::decode_json(JSONObj *obj) +{ + string expires_iso8601; + struct tm t; + + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("tenant", tenant_v2, obj, true); + JSONDecoder::decode_json("expires", expires_iso8601, obj, true); + + if (parse_iso8601(expires_iso8601.c_str(), &t)) { + expires = internal_timegm(&t); + } else { + expires = 0; + throw JSONDecoder::err("Failed to parse ISO8601 expiration date from Keystone response."); + } +} + +void rgw::keystone::TokenEnvelope::Role::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("name", name, obj, true); +} + +void rgw::keystone::TokenEnvelope::Domain::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("name", name, obj, true); +} + +void rgw::keystone::TokenEnvelope::Project::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("name", name, obj, true); + JSONDecoder::decode_json("domain", domain, obj); +} + +void rgw::keystone::TokenEnvelope::User::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj, true); + JSONDecoder::decode_json("name", name, obj, true); + JSONDecoder::decode_json("domain", domain, obj); + JSONDecoder::decode_json("roles", roles_v2, obj); +} + +void rgw::keystone::TokenEnvelope::decode_v3(JSONObj* const root_obj) +{ + std::string expires_iso8601; + + JSONDecoder::decode_json("user", user, root_obj, true); + JSONDecoder::decode_json("expires_at", expires_iso8601, root_obj, true); + JSONDecoder::decode_json("roles", roles, root_obj, true); + 
JSONDecoder::decode_json("project", project, root_obj, true); + + struct tm t; + if (parse_iso8601(expires_iso8601.c_str(), &t)) { + token.expires = internal_timegm(&t); + } else { + token.expires = 0; + throw JSONDecoder::err("Failed to parse ISO8601 expiration date" + "from Keystone response."); + } +} + +void rgw::keystone::TokenEnvelope::decode_v2(JSONObj* const root_obj) +{ + JSONDecoder::decode_json("user", user, root_obj, true); + JSONDecoder::decode_json("token", token, root_obj, true); + + roles = user.roles_v2; + project = token.tenant_v2; +} + +void rgw_slo_entry::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("path", path, obj); + JSONDecoder::decode_json("etag", etag, obj); + JSONDecoder::decode_json("size_bytes", size_bytes, obj); +}; + +void rgw_meta_sync_info::decode_json(JSONObj *obj) +{ + string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "init") { + state = StateInit; + } else if (s == "building-full-sync-maps") { + state = StateBuildingFullSyncMaps; + } else if (s == "sync") { + state = StateSync; + } + JSONDecoder::decode_json("num_shards", num_shards, obj); + JSONDecoder::decode_json("period", period, obj); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +void rgw_meta_sync_info::dump(Formatter *f) const +{ + string s; + switch ((SyncState)state) { + case StateInit: + s = "init"; + break; + case StateBuildingFullSyncMaps: + s = "building-full-sync-maps"; + break; + case StateSync: + s = "sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("num_shards", num_shards, f); + encode_json("period", period, f); + encode_json("realm_epoch", realm_epoch, f); +} + +void rgw_meta_sync_marker::decode_json(JSONObj *obj) +{ + int s; + JSONDecoder::decode_json("state", s, obj); + state = s; + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("next_step_marker", next_step_marker, obj); + JSONDecoder::decode_json("total_entries", total_entries, 
obj); + JSONDecoder::decode_json("pos", pos, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +void rgw_meta_sync_marker::dump(Formatter *f) const +{ + encode_json("state", (int)state, f); + encode_json("marker", marker, f); + encode_json("next_step_marker", next_step_marker, f); + encode_json("total_entries", total_entries, f); + encode_json("pos", pos, f); + encode_json("timestamp", utime_t(timestamp), f); + encode_json("realm_epoch", realm_epoch, f); +} + +void rgw_meta_sync_status::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("info", sync_info, obj); + JSONDecoder::decode_json("markers", sync_markers, obj); +} + +void rgw_meta_sync_status::dump(Formatter *f) const { + encode_json("info", sync_info, f); + encode_json("markers", sync_markers, f); +} + +void rgw_sync_error_info::dump(Formatter *f) const { + encode_json("source_zone", source_zone, f); + encode_json("error_code", error_code, f); + encode_json("message", message, f); +} + +void rgw_bucket_shard_full_sync_marker::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("position", position, obj); + JSONDecoder::decode_json("count", count, obj); +} + +void rgw_bucket_shard_full_sync_marker::dump(Formatter *f) const +{ + encode_json("position", position, f); + encode_json("count", count, f); +} + +void rgw_bucket_shard_inc_sync_marker::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("position", position, obj); +} + +void rgw_bucket_shard_inc_sync_marker::dump(Formatter *f) const +{ + encode_json("position", position, f); +} + +void rgw_bucket_shard_sync_info::decode_json(JSONObj *obj) +{ + std::string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "full-sync") { + state = StateFullSync; + } else if (s == "incremental-sync") { + state = StateIncrementalSync; + } else { + state = StateInit; + } + JSONDecoder::decode_json("full_marker", full_marker, obj); + 
JSONDecoder::decode_json("inc_marker", inc_marker, obj); +} + +void rgw_bucket_shard_sync_info::dump(Formatter *f) const +{ + const char *s{nullptr}; + switch ((SyncState)state) { + case StateInit: + s = "init"; + break; + case StateFullSync: + s = "full-sync"; + break; + case StateIncrementalSync: + s = "incremental-sync"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); + encode_json("full_marker", full_marker, f); + encode_json("inc_marker", inc_marker, f); +} + +/* This utility function shouldn't conflict with the overload of std::to_string + * provided by string_ref since Boost 1.54 as it's defined outside of the std + * namespace. I hope we'll remove it soon - just after merging the Matt's PR + * for bundled Boost. It would allow us to forget that CentOS 7 has Boost 1.53. */ +static inline std::string to_string(const boost::string_ref& s) +{ + return std::string(s.data(), s.length()); +} + +void rgw::keystone::AdminTokenRequestVer2::dump(Formatter* const f) const +{ + f->open_object_section("token_request"); + f->open_object_section("auth"); + f->open_object_section("passwordCredentials"); + encode_json("username", ::to_string(conf.get_admin_user()), f); + encode_json("password", conf.get_admin_password(), f); + f->close_section(); + encode_json("tenantName", ::to_string(conf.get_admin_tenant()), f); + f->close_section(); + f->close_section(); +} + +void rgw::keystone::AdminTokenRequestVer3::dump(Formatter* const f) const +{ + f->open_object_section("token_request"); + f->open_object_section("auth"); + f->open_object_section("identity"); + f->open_array_section("methods"); + f->dump_string("", "password"); + f->close_section(); + f->open_object_section("password"); + f->open_object_section("user"); + f->open_object_section("domain"); + encode_json("name", ::to_string(conf.get_admin_domain()), f); + f->close_section(); + encode_json("name", ::to_string(conf.get_admin_user()), f); + encode_json("password", 
conf.get_admin_password(), f); + f->close_section(); + f->close_section(); + f->close_section(); + f->open_object_section("scope"); + f->open_object_section("project"); + if (! conf.get_admin_project().empty()) { + encode_json("name", ::to_string(conf.get_admin_project()), f); + } else { + encode_json("name", ::to_string(conf.get_admin_tenant()), f); + } + f->open_object_section("domain"); + encode_json("name", ::to_string(conf.get_admin_domain()), f); + f->close_section(); + f->close_section(); + f->close_section(); + f->close_section(); + f->close_section(); +} + + +void rgw::keystone::BarbicanTokenRequestVer2::dump(Formatter* const f) const +{ + f->open_object_section("token_request"); + f->open_object_section("auth"); + f->open_object_section("passwordCredentials"); + encode_json("username", cct->_conf->rgw_keystone_barbican_user, f); + encode_json("password", cct->_conf->rgw_keystone_barbican_password, f); + f->close_section(); + encode_json("tenantName", cct->_conf->rgw_keystone_barbican_tenant, f); + f->close_section(); + f->close_section(); +} + +void rgw::keystone::BarbicanTokenRequestVer3::dump(Formatter* const f) const +{ + f->open_object_section("token_request"); + f->open_object_section("auth"); + f->open_object_section("identity"); + f->open_array_section("methods"); + f->dump_string("", "password"); + f->close_section(); + f->open_object_section("password"); + f->open_object_section("user"); + f->open_object_section("domain"); + encode_json("name", cct->_conf->rgw_keystone_barbican_domain, f); + f->close_section(); + encode_json("name", cct->_conf->rgw_keystone_barbican_user, f); + encode_json("password", cct->_conf->rgw_keystone_barbican_password, f); + f->close_section(); + f->close_section(); + f->close_section(); + f->open_object_section("scope"); + f->open_object_section("project"); + if (!cct->_conf->rgw_keystone_barbican_project.empty()) { + encode_json("name", cct->_conf->rgw_keystone_barbican_project, f); + } else { + encode_json("name", 
cct->_conf->rgw_keystone_barbican_tenant, f); + } + f->open_object_section("domain"); + encode_json("name", cct->_conf->rgw_keystone_barbican_domain, f); + f->close_section(); + f->close_section(); + f->close_section(); + f->close_section(); + f->close_section(); +} + +void RGWOrphanSearchStage::dump(Formatter *f) const +{ + f->open_object_section("orphan_search_stage"); + string s; + switch(stage){ + case ORPHAN_SEARCH_STAGE_INIT: + s = "init"; + break; + case ORPHAN_SEARCH_STAGE_LSPOOL: + s = "lspool"; + break; + case ORPHAN_SEARCH_STAGE_LSBUCKETS: + s = "lsbuckets"; + break; + case ORPHAN_SEARCH_STAGE_ITERATE_BI: + s = "iterate_bucket_index"; + break; + case ORPHAN_SEARCH_STAGE_COMPARE: + s = "comparing"; + break; + default: + s = "unknown"; + } + f->dump_string("search_stage", s); + f->dump_int("shard",shard); + f->dump_string("marker",marker); + f->close_section(); +} + +void RGWOrphanSearchInfo::dump(Formatter *f) const +{ + f->open_object_section("orphan_search_info"); + f->dump_string("job_name", job_name); + encode_json("pool", pool, f); + f->dump_int("num_shards", num_shards); + encode_json("start_time", start_time, f); + f->close_section(); +} + +void RGWOrphanSearchState::dump(Formatter *f) const +{ + f->open_object_section("orphan_search_state"); + encode_json("info", info, f); + encode_json("stage", stage, f); + f->close_section(); +} + +void RGWObjTags::dump(Formatter *f) const +{ + for (auto& tag: tag_map){ + f->open_object_section("tag_map"); + f->dump_string("key", tag.first); + f->dump_string("value", tag.second); + f->close_section(); + } +} + +void lc_op::dump(Formatter *f) const +{ + f->dump_bool("status", status); + f->dump_bool("dm_expiration", dm_expiration); + + f->dump_int("expiration", expiration); + f->dump_int("noncur_expiration", noncur_expiration); + f->dump_int("mp_expiration", mp_expiration); + if (expiration_date) { + utime_t ut(*expiration_date); + f->dump_stream("expiration_date") << ut; + } + if (obj_tags) { + 
f->dump_object("obj_tags", *obj_tags); + } + f->open_object_section("transitions"); + for(auto& [storage_class, transition] : transitions) { + f->dump_object(storage_class.c_str(), transition); + } + f->close_section(); + + f->open_object_section("noncur_transitions"); + for (auto& [storage_class, transition] : noncur_transitions) { + f->dump_object(storage_class.c_str(), transition); + } + f->close_section(); +} + +void LCFilter::dump(Formatter *f) const +{ + f->dump_string("prefix", prefix); + f->dump_object("obj_tags", obj_tags); +} + +void LCExpiration::dump(Formatter *f) const +{ + f->dump_string("days", days); + f->dump_string("date", date); +} + +void LCRule::dump(Formatter *f) const +{ + f->dump_string("id", id); + f->dump_string("prefix", prefix); + f->dump_string("status", status); + f->dump_object("expiration", expiration); + f->dump_object("noncur_expiration", noncur_expiration); + f->dump_object("mp_expiration", mp_expiration); + f->dump_object("filter", filter); + f->open_object_section("transitions"); + for (auto& [storage_class, transition] : transitions) { + f->dump_object(storage_class.c_str(), transition); + } + f->close_section(); + + f->open_object_section("noncur_transitions"); + for (auto& [storage_class, transition] : noncur_transitions) { + f->dump_object(storage_class.c_str(), transition); + } + f->close_section(); + f->dump_bool("dm_expiration", dm_expiration); +} + +void RGWLifecycleConfiguration::dump(Formatter *f) const +{ + f->open_object_section("prefix_map"); + for (auto& prefix : prefix_map) { + f->dump_object(prefix.first.c_str(), prefix.second); + } + f->close_section(); + + f->open_array_section("rule_map"); + for (auto& rule : rule_map) { + f->open_object_section("entry"); + f->dump_string("id", rule.first); + f->open_object_section("rule"); + rule.second.dump(f); + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +void compression_block::dump(Formatter *f) const +{ + f->dump_unsigned("old_ofs", 
old_ofs); + f->dump_unsigned("new_ofs", new_ofs); + f->dump_unsigned("len", len); +} + +void RGWCompressionInfo::dump(Formatter *f) const +{ + f->dump_string("compression_type", compression_type); + f->dump_unsigned("orig_size", orig_size); + ::encode_json("blocks", blocks, f); +} diff --git a/src/rgw/rgw_jsonparser.cc b/src/rgw/rgw_jsonparser.cc new file mode 100644 index 00000000..df520140 --- /dev/null +++ b/src/rgw/rgw_jsonparser.cc @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include +#include + +#include "include/types.h" + +#include "common/Formatter.h" +#include "common/ceph_json.h" + +#include "rgw_common.h" + +#define dout_subsys ceph_subsys_rgw + + +void dump_array(JSONObj *obj) +{ + + JSONObjIter iter = obj->find_first(); + + for (; !iter.end(); ++iter) { + JSONObj *o = *iter; + cout << "data=" << o->get_data() << std::endl; + } + +} + +struct Key { + string user; + string access_key; + string secret_key; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("user", user, obj); + JSONDecoder::decode_json("access_key", access_key, obj); + JSONDecoder::decode_json("secret_key", secret_key, obj); + } +}; + +struct UserInfo { + string uid; + string display_name; + int max_buckets; + list keys; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("user_id", uid, obj); + JSONDecoder::decode_json("display_name", display_name, obj); + JSONDecoder::decode_json("max_buckets", max_buckets, obj); + JSONDecoder::decode_json("keys", keys, obj); + } +}; + + +int main(int argc, char **argv) { + JSONParser parser; + + char buf[1024]; + bufferlist bl; + + for (;;) { + int done; + int len; + + len = fread(buf, 1, sizeof(buf), stdin); + if (ferror(stdin)) { + cerr << "read error" << std::endl; + exit(-1); + } + done = feof(stdin); + + bool ret = parser.parse(buf, len); + if (!ret) + cerr << "parse error" << std::endl; + + if (done) { + bl.append(buf, 
len); + break; + } + } + + JSONObjIter iter = parser.find_first(); + + for (; !iter.end(); ++iter) { + JSONObj *obj = *iter; + cout << "is_object=" << obj->is_object() << std::endl; + cout << "is_array=" << obj->is_array() << std::endl; + cout << "name=" << obj->get_name() << std::endl; + cout << "data=" << obj->get_data() << std::endl; + } + + iter = parser.find_first("conditions"); + if (!iter.end()) { + JSONObj *obj = *iter; + + JSONObjIter iter2 = obj->find_first(); + for (; !iter2.end(); ++iter2) { + JSONObj *child = *iter2; + cout << "is_object=" << child->is_object() << std::endl; + cout << "is_array=" << child->is_array() << std::endl; + if (child->is_array()) { + dump_array(child); + } + cout << "name=" << child->get_name() < +#include "include/ceph_assert.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/dout.h" + +#define dout_subsys ceph_subsys_rgw + +// TODO investigation, not necessarily issues: +// (1) in case of single threaded writer context use spsc_queue +// (2) check performance of emptying queue to local list, and go over the list and publish +// (3) use std::shared_mutex (c++17) or equivalent for the connections lock + +// cmparisson operator between topic pointer and name +bool operator==(const rd_kafka_topic_t* rkt, const std::string& name) { + return name == std::string_view(rd_kafka_topic_name(rkt)); +} + +namespace rgw::kafka { + +// status codes for publishing +// TODO: use the actual error code (when conn exists) instead of STATUS_CONNECTION_CLOSED when replying to client +static const int STATUS_CONNECTION_CLOSED = -0x1002; +static const int STATUS_QUEUE_FULL = -0x1003; +static const int STATUS_MAX_INFLIGHT = -0x1004; +static const int STATUS_MANAGER_STOPPED = -0x1005; +// status code for connection opening +static const int STATUS_CONF_ALLOC_FAILED = -0x2001; + +static const int STATUS_OK = 0x0; + +// struct for holding the callback and its tag in the callback list +struct 
reply_callback_with_tag_t {
+  uint64_t tag;
+  reply_callback_t cb;
+
+  reply_callback_with_tag_t(uint64_t _tag, reply_callback_t _cb) : tag(_tag), cb(_cb) {}
+
+  bool operator==(uint64_t rhs) {
+    return tag == rhs;
+  }
+};
+
+typedef std::vector CallbackList;
+
+// struct for holding the connection state object as well as list of topics
+// it is used inside an intrusive ref counted pointer (boost::intrusive_ptr)
+// since references to deleted objects may still exist in the calling code
+struct connection_t {
+  rd_kafka_t* producer = nullptr;
+  rd_kafka_conf_t* temp_conf = nullptr;
+  std::vector topics;
+  bool marked_for_deletion = false;
+  uint64_t delivery_tag = 1;
+  int status;
+  mutable std::atomic ref_count = 0;
+  CephContext* const cct;
+  CallbackList callbacks;
+  const std::string broker;
+  const bool use_ssl;
+  const bool verify_ssl; // TODO currently ignored, not supported in librdkafka v0.11.6
+  const boost::optional ca_location;
+  const std::string user;
+  const std::string password;
+
+  // cleanup of all internal connection resource
+  // the object can still remain, and internal connection
+  // resources created again on successful reconnection
+  //
+  // "s" is stored as the connection status and is the value passed to
+  // every callback that is still pending.
+  //
+  // this function must be safe to call more than once on the same object:
+  // it is called on publish errors and for connections marked for deletion,
+  // and it is always called again from the destructor. every handle that is
+  // released here is therefore reset, so a repeated call is a no-op
+  void destroy(int s) {
+    status = s;
+    // destroy temporary conf (if connection was never established)
+    if (temp_conf) {
+      rd_kafka_conf_destroy(temp_conf);
+      // forget the pointer, otherwise the next destroy() frees it again
+      temp_conf = nullptr;
+      return;
+    }
+    if (!producer) {
+      // no producer was created (e.g. conf allocation failed),
+      // or it was already destroyed by a previous call
+      return;
+    }
+    // wait for all remaining acks/nacks
+    rd_kafka_flush(producer, 5*1000 /* wait for max 5 seconds */);
+    // destroy all topics
+    std::for_each(topics.begin(), topics.end(), [](auto topic) {rd_kafka_topic_destroy(topic);});
+    // the handles are dead now, drop them so they are neither reused
+    // after a reconnect nor destroyed a second time
+    topics.clear();
+    // destroy producer
+    rd_kafka_destroy(producer);
+    // a null producer makes is_ok() report the connection as unusable
+    producer = nullptr;
+    // fire all remaining callbacks (if not fired by rd_kafka_flush)
+    std::for_each(callbacks.begin(), callbacks.end(), [this](auto& cb_tag) {
+      cb_tag.cb(status);
+      ldout(cct, 20) << "Kafka destroy: invoking callback with tag=" << cb_tag.tag << dendl;
+    });
+    callbacks.clear();
+    delivery_tag = 1;
+  }
+
+  bool is_ok() const {
+    return (producer != nullptr && 
!marked_for_deletion); + } + + // ctor for setting immutable values + connection_t(CephContext* _cct, const std::string& _broker, bool _use_ssl, bool _verify_ssl, + const boost::optional& _ca_location, + const std::string& _user, const std::string& _password) : + cct(_cct), broker(_broker), use_ssl(_use_ssl), verify_ssl(_verify_ssl), ca_location(_ca_location), user(_user), password(_password) {} + + // dtor also destroys the internals + ~connection_t() { + destroy(STATUS_CONNECTION_CLOSED); + } + + friend void intrusive_ptr_add_ref(const connection_t* p); + friend void intrusive_ptr_release(const connection_t* p); +}; + +std::string to_string(const connection_ptr_t& conn) { + std::string str; + str += "\nBroker: " + conn->broker; + str += conn->use_ssl ? "\nUse SSL" : ""; + str += conn->ca_location ? "\nCA Location: " + *(conn->ca_location) : ""; + return str; +} +// these are required interfaces so that connection_t could be used inside boost::intrusive_ptr +void intrusive_ptr_add_ref(const connection_t* p) { + ++p->ref_count; +} +void intrusive_ptr_release(const connection_t* p) { + if (--p->ref_count == 0) { + delete p; + } +} + +// convert int status to string - including RGW specific values +std::string status_to_string(int s) { + switch (s) { + case STATUS_OK: + return "STATUS_OK"; + case STATUS_CONNECTION_CLOSED: + return "RGW_KAFKA_STATUS_CONNECTION_CLOSED"; + case STATUS_QUEUE_FULL: + return "RGW_KAFKA_STATUS_QUEUE_FULL"; + case STATUS_MAX_INFLIGHT: + return "RGW_KAFKA_STATUS_MAX_INFLIGHT"; + case STATUS_MANAGER_STOPPED: + return "RGW_KAFKA_STATUS_MANAGER_STOPPED"; + case STATUS_CONF_ALLOC_FAILED: + return "RGW_KAFKA_STATUS_CONF_ALLOC_FAILED"; + } + return std::string(rd_kafka_err2str((rd_kafka_resp_err_t)s)); +} + +void message_callback(rd_kafka_t* rk, const rd_kafka_message_t* rkmessage, void* opaque) { + ceph_assert(opaque); + + const auto conn = reinterpret_cast(opaque); + const auto result = rkmessage->err; + + if (!rkmessage->_private) { + 
ldout(conn->cct, 20) << "Kafka run: n/ack received, (no callback) with result=" << result << dendl; + return; + } + + const auto tag = reinterpret_cast(rkmessage->_private); + const auto& callbacks_end = conn->callbacks.end(); + const auto& callbacks_begin = conn->callbacks.begin(); + const auto tag_it = std::find(callbacks_begin, callbacks_end, *tag); + if (tag_it != callbacks_end) { + ldout(conn->cct, 20) << "Kafka run: n/ack received, invoking callback with tag=" << + *tag << " and result=" << rd_kafka_err2str(result) << dendl; + tag_it->cb(result); + conn->callbacks.erase(tag_it); + } else { + // TODO add counter for acks with no callback + ldout(conn->cct, 10) << "Kafka run: unsolicited n/ack received with tag=" << + *tag << dendl; + } + delete tag; + // rkmessage is destroyed automatically by librdkafka +} + +// utility function to create a connection, when the connection object already exists +connection_ptr_t& create_connection(connection_ptr_t& conn) { + // pointer must be valid and not marked for deletion + ceph_assert(conn && !conn->marked_for_deletion); + + // reset all status codes + conn->status = STATUS_OK; + char errstr[512] = {0}; + + conn->temp_conf = rd_kafka_conf_new(); + if (!conn->temp_conf) { + conn->status = STATUS_CONF_ALLOC_FAILED; + return conn; + } + + // get list of brokers based on the bootsrap broker + if (rd_kafka_conf_set(conn->temp_conf, "bootstrap.servers", conn->broker.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + + if (conn->use_ssl) { + if (!conn->user.empty()) { + // use SSL+SASL + if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SASL_SSL", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", "PLAIN", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.username", conn->user.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK || + rd_kafka_conf_set(conn->temp_conf, "sasl.password", 
conn->password.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SSL+SASL security" << dendl; + } else { + // use only SSL + if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SSL", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured SSL security" << dendl; + } + if (conn->ca_location) { + if (rd_kafka_conf_set(conn->temp_conf, "ssl.ca.location", conn->ca_location->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + ldout(conn->cct, 20) << "Kafka connect: successfully configured CA location" << dendl; + } else { + ldout(conn->cct, 20) << "Kafka connect: using default CA location" << dendl; + } + // Note: when librdkafka.1.0 is available the following line could be uncommented instead of the callback setting call + // if (rd_kafka_conf_set(conn->temp_conf, "enable.ssl.certificate.verification", "0", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error; + + ldout(conn->cct, 20) << "Kafka connect: successfully configured security" << dendl; + } + + // set the global callback for delivery success/fail + rd_kafka_conf_set_dr_msg_cb(conn->temp_conf, message_callback); + + // set the global opaque pointer to be the connection itself + rd_kafka_conf_set_opaque(conn->temp_conf, conn.get()); + + // create the producer + conn->producer = rd_kafka_new(RD_KAFKA_PRODUCER, conn->temp_conf, errstr, sizeof(errstr)); + if (!conn->producer) { + conn->status = rd_kafka_last_error(); + ldout(conn->cct, 1) << "Kafka connect: failed to create producer: " << errstr << dendl; + return conn; + } + ldout(conn->cct, 20) << "Kafka connect: successfully created new producer" << dendl; + + // conf ownership passed to producer + conn->temp_conf = nullptr; + return conn; + +conf_error: + conn->status = rd_kafka_last_error(); + ldout(conn->cct, 1) << "Kafka connect: configuration failed: " << errstr 
<< dendl; + return conn; +} + +// utility function to create a new connection +connection_ptr_t create_new_connection(const std::string& broker, CephContext* cct, + bool use_ssl, + bool verify_ssl, + boost::optional ca_location, + const std::string& user, + const std::string& password) { + // create connection state + connection_ptr_t conn(new connection_t(cct, broker, use_ssl, verify_ssl, ca_location, user, password)); + return create_connection(conn); +} + +/// struct used for holding messages in the message queue +struct message_wrapper_t { + connection_ptr_t conn; + std::string topic; + std::string message; + reply_callback_t cb; + + message_wrapper_t(connection_ptr_t& _conn, + const std::string& _topic, + const std::string& _message, + reply_callback_t _cb) : conn(_conn), topic(_topic), message(_message), cb(_cb) {} +}; + +typedef std::unordered_map ConnectionList; +typedef boost::lockfree::queue> MessageQueue; + +// macros used inside a loop where an iterator is either incremented or erased +#define INCREMENT_AND_CONTINUE(IT) \ + ++IT; \ + continue; + +#define ERASE_AND_CONTINUE(IT,CONTAINER) \ + IT=CONTAINER.erase(IT); \ + --connection_count; \ + continue; + +class Manager { +public: + const size_t max_connections; + const size_t max_inflight; + const size_t max_queue; +private: + std::atomic connection_count; + bool stopped; + int read_timeout_ms; + ConnectionList connections; + MessageQueue messages; + std::atomic queued; + std::atomic dequeued; + CephContext* const cct; + mutable std::mutex connections_lock; + std::thread runner; + + // TODO use rd_kafka_produce_batch for better performance + void publish_internal(message_wrapper_t* message) { + const std::unique_ptr msg_owner(message); + auto& conn = message->conn; + + if (!conn->is_ok()) { + // connection had an issue while message was in the queue + // TODO add error stats + ldout(conn->cct, 1) << "Kafka publish: connection had an issue while message was in the queue. 
error: " << status_to_string(conn->status) << dendl; + if (message->cb) { + message->cb(conn->status); + } + return; + } + + // create a new topic unless it was already created + auto topic_it = std::find(conn->topics.begin(), conn->topics.end(), message->topic); + rd_kafka_topic_t* topic = nullptr; + if (topic_it == conn->topics.end()) { + topic = rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr); + if (!topic) { + const auto err = rd_kafka_last_error(); + ldout(conn->cct, 1) << "Kafka publish: failed to create topic: " << message->topic << " error: " << status_to_string(err) << dendl; + if (message->cb) { + message->cb(err); + } + conn->destroy(err); + return; + } + // TODO use the topics list as an LRU cache + conn->topics.push_back(topic); + ldout(conn->cct, 20) << "Kafka publish: successfully created topic: " << message->topic << dendl; + } else { + topic = *topic_it; + ldout(conn->cct, 20) << "Kafka publish: reused existing topic: " << message->topic << dendl; + } + + const auto tag = (message->cb == nullptr ? 
nullptr : new uint64_t(conn->delivery_tag++)); + const auto rc = rd_kafka_produce( + topic, + // TODO: non builtin partitioning + RD_KAFKA_PARTITION_UA, + // make a copy of the payload + // so it is safe to pass the pointer from the string + RD_KAFKA_MSG_F_COPY, + message->message.data(), + message->message.length(), + // optional key and its length + nullptr, + 0, + // opaque data: tag, used in the global callback + // in order to invoke the real callback + // null if no callback exists + tag); + if (rc == -1) { + const auto err = rd_kafka_last_error(); + ldout(conn->cct, 10) << "Kafka publish: failed to produce: " << rd_kafka_err2str(err) << dendl; + // TODO: dont error on full queue, and don't destroy connection, retry instead + // immediatly invoke callback on error if needed + if (message->cb) { + message->cb(err); + } + conn->destroy(err); + delete tag; + } + + if (tag) { + auto const q_len = conn->callbacks.size(); + if (q_len < max_inflight) { + ldout(conn->cct, 20) << "Kafka publish (with callback, tag=" << *tag << "): OK. 
Queue has: " << q_len << " callbacks" << dendl; + conn->callbacks.emplace_back(*tag, message->cb); + } else { + // immediately invoke callback with error - this is not a connection error + ldout(conn->cct, 1) << "Kafka publish (with callback): failed with error: callback queue full" << dendl; + message->cb(STATUS_MAX_INFLIGHT); + // tag will be deleted when the global callback is invoked + } + } else { + ldout(conn->cct, 20) << "Kafka publish (no callback): OK" << dendl; + } + } + + // the managers thread: + // (1) empty the queue of messages to be published + // (2) loop over all connections and read acks + // (3) manages deleted connections + // (4) TODO reconnect on connection errors + // (5) TODO cleanup timedout callbacks + void run() { + while (!stopped) { + + // publish all messages in the queue + auto reply_count = 0U; + const auto send_count = messages.consume_all(std::bind(&Manager::publish_internal, this, std::placeholders::_1)); + dequeued += send_count; + ConnectionList::iterator conn_it; + ConnectionList::const_iterator end_it; + { + // thread safe access to the connection list + // once the iterators are fetched they are guaranteed to remain valid + std::lock_guard lock(connections_lock); + conn_it = connections.begin(); + end_it = connections.end(); + } + // loop over all connections to read acks + for (;conn_it != end_it;) { + + auto& conn = conn_it->second; + // delete the connection if marked for deletion + if (conn->marked_for_deletion) { + ldout(conn->cct, 10) << "Kafka run: connection is deleted" << dendl; + conn->destroy(STATUS_CONNECTION_CLOSED); + std::lock_guard lock(connections_lock); + // erase is safe - does not invalidate any other iterator + // lock so no insertion happens at the same time + ERASE_AND_CONTINUE(conn_it, connections); + } + + // try to reconnect the connection if it has an error + if (!conn->is_ok()) { + ldout(conn->cct, 10) << "Kafka run: connection status is: " << status_to_string(conn->status) << dendl; + const auto& 
broker = conn_it->first; + ldout(conn->cct, 20) << "Kafka run: retry connection" << dendl; + if (create_connection(conn)->is_ok() == false) { + ldout(conn->cct, 10) << "Kafka run: connection (" << broker << ") retry failed" << dendl; + // TODO: add error counter for failed retries + // TODO: add exponential backoff for retries + } else { + ldout(conn->cct, 10) << "Kafka run: connection (" << broker << ") retry successfull" << dendl; + } + INCREMENT_AND_CONTINUE(conn_it); + } + + reply_count += rd_kafka_poll(conn->producer, read_timeout_ms); + + // just increment the iterator + ++conn_it; + } + // if no messages were received or published + // across all connection, sleep for 100ms + if (send_count == 0 && reply_count == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + } + + // used in the dtor for message cleanup + static void delete_message(const message_wrapper_t* message) { + delete message; + } + +public: + Manager(size_t _max_connections, + size_t _max_inflight, + size_t _max_queue, + int _read_timeout_ms, + CephContext* _cct) : + max_connections(_max_connections), + max_inflight(_max_inflight), + max_queue(_max_queue), + connection_count(0), + stopped(false), + read_timeout_ms(_read_timeout_ms), + connections(_max_connections), + messages(max_queue), + queued(0), + dequeued(0), + cct(_cct), + runner(&Manager::run, this) { + // The hashmap has "max connections" as the initial number of buckets, + // and allows for 10 collisions per bucket before rehash. + // This is to prevent rehashing so that iterators are not invalidated + // when a new connection is added. 
+ connections.max_load_factor(10.0); + // give the runner thread a name for easier debugging + const auto rc = ceph_pthread_setname(runner.native_handle(), "kafka_manager"); + ceph_assert(rc==0); + } + + // non copyable + Manager(const Manager&) = delete; + const Manager& operator=(const Manager&) = delete; + + // stop the main thread + void stop() { + stopped = true; + } + + // disconnect from a broker + bool disconnect(connection_ptr_t& conn) { + if (!conn || stopped) { + return false; + } + conn->marked_for_deletion = true; + return true; + } + + // connect to a broker, or reuse an existing connection if already connected + connection_ptr_t connect(const std::string& url, + bool use_ssl, + bool verify_ssl, + boost::optional ca_location) { + if (stopped) { + // TODO: increment counter + ldout(cct, 1) << "Kafka connect: manager is stopped" << dendl; + return nullptr; + } + + std::string broker; + std::string user; + std::string password; + if (!parse_url_authority(url, broker, user, password)) { + // TODO: increment counter + ldout(cct, 1) << "Kafka connect: URL parsing failed" << dendl; + return nullptr; + } + + // this should be validated by the regex in parse_url() + ceph_assert(user.empty() == password.empty()); + + if (!user.empty() && !use_ssl) { + ldout(cct, 1) << "Kafka connect: user/password are only allowed over secure connection" << dendl; + return nullptr; + } + + std::lock_guard lock(connections_lock); + const auto it = connections.find(broker); + // note that ssl vs. 
non-ssl connection to the same host are two separate conenctions + if (it != connections.end()) { + if (it->second->marked_for_deletion) { + // TODO: increment counter + ldout(cct, 1) << "Kafka connect: endpoint marked for deletion" << dendl; + return nullptr; + } + // connection found - return even if non-ok + ldout(cct, 20) << "Kafka connect: connection found" << dendl; + return it->second; + } + + // connection not found, creating a new one + if (connection_count >= max_connections) { + // TODO: increment counter + ldout(cct, 1) << "Kafka connect: max connections exceeded" << dendl; + return nullptr; + } + const auto conn = create_new_connection(broker, cct, use_ssl, verify_ssl, ca_location, user, password); + // create_new_connection must always return a connection object + // even if error occurred during creation. + // in such a case the creation will be retried in the main thread + ceph_assert(conn); + ++connection_count; + ldout(cct, 10) << "Kafka connect: new connection is created. 
Total connections: " << connection_count << dendl; + return connections.emplace(broker, conn).first->second; + } + + // TODO publish with confirm is needed in "none" case as well, cb should be invoked publish is ok (no ack) + int publish(connection_ptr_t& conn, + const std::string& topic, + const std::string& message) { + if (stopped) { + return STATUS_MANAGER_STOPPED; + } + if (!conn || !conn->is_ok()) { + return STATUS_CONNECTION_CLOSED; + } + if (messages.push(new message_wrapper_t(conn, topic, message, nullptr))) { + ++queued; + return STATUS_OK; + } + return STATUS_QUEUE_FULL; + } + + int publish_with_confirm(connection_ptr_t& conn, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (stopped) { + return STATUS_MANAGER_STOPPED; + } + if (!conn || !conn->is_ok()) { + return STATUS_CONNECTION_CLOSED; + } + if (messages.push(new message_wrapper_t(conn, topic, message, cb))) { + ++queued; + return STATUS_OK; + } + return STATUS_QUEUE_FULL; + } + + // dtor wait for thread to stop + // then connection are cleaned-up + ~Manager() { + stopped = true; + runner.join(); + messages.consume_all(delete_message); + } + + // get the number of connections + size_t get_connection_count() const { + return connection_count; + } + + // get the number of in-flight messages + size_t get_inflight() const { + size_t sum = 0; + std::lock_guard lock(connections_lock); + std::for_each(connections.begin(), connections.end(), [&sum](auto& conn_pair) { + sum += conn_pair.second->callbacks.size(); + }); + return sum; + } + + // running counter of the queued messages + size_t get_queued() const { + return queued; + } + + // running counter of the dequeued messages + size_t get_dequeued() const { + return dequeued; + } +}; + +// singleton manager +// note that the manager itself is not a singleton, and multiple instances may co-exist +// TODO make the pointer atomic in allocation and deallocation to avoid race conditions +static Manager* s_manager = nullptr; 
+ +static const size_t MAX_CONNECTIONS_DEFAULT = 256; +static const size_t MAX_INFLIGHT_DEFAULT = 8192; +static const size_t MAX_QUEUE_DEFAULT = 8192; +static const int READ_TIMEOUT_MS_DEFAULT = 500; + +bool init(CephContext* cct) { + if (s_manager) { + return false; + } + // TODO: take conf from CephContext + s_manager = new Manager(MAX_CONNECTIONS_DEFAULT, MAX_INFLIGHT_DEFAULT, MAX_QUEUE_DEFAULT, READ_TIMEOUT_MS_DEFAULT, cct); + return true; +} + +void shutdown() { + delete s_manager; + s_manager = nullptr; +} + +connection_ptr_t connect(const std::string& url, bool use_ssl, bool verify_ssl, + boost::optional ca_location) { + if (!s_manager) return nullptr; + return s_manager->connect(url, use_ssl, verify_ssl, ca_location); +} + +int publish(connection_ptr_t& conn, + const std::string& topic, + const std::string& message) { + if (!s_manager) return STATUS_MANAGER_STOPPED; + return s_manager->publish(conn, topic, message); +} + +int publish_with_confirm(connection_ptr_t& conn, + const std::string& topic, + const std::string& message, + reply_callback_t cb) { + if (!s_manager) return STATUS_MANAGER_STOPPED; + return s_manager->publish_with_confirm(conn, topic, message, cb); +} + +size_t get_connection_count() { + if (!s_manager) return 0; + return s_manager->get_connection_count(); +} + +size_t get_inflight() { + if (!s_manager) return 0; + return s_manager->get_inflight(); +} + +size_t get_queued() { + if (!s_manager) return 0; + return s_manager->get_queued(); +} + +size_t get_dequeued() { + if (!s_manager) return 0; + return s_manager->get_dequeued(); +} + +size_t get_max_connections() { + if (!s_manager) return MAX_CONNECTIONS_DEFAULT; + return s_manager->max_connections; +} + +size_t get_max_inflight() { + if (!s_manager) return MAX_INFLIGHT_DEFAULT; + return s_manager->max_inflight; +} + +size_t get_max_queue() { + if (!s_manager) return MAX_QUEUE_DEFAULT; + return s_manager->max_queue; +} + +bool disconnect(connection_ptr_t& conn) { + if (!s_manager) return 
false; + return s_manager->disconnect(conn); +} + +} // namespace kafka + diff --git a/src/rgw/rgw_kafka.h b/src/rgw/rgw_kafka.h new file mode 100644 index 00000000..cccdd65b --- /dev/null +++ b/src/rgw/rgw_kafka.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +class CephContext; + +namespace rgw::kafka { +// forward declaration of connection object +struct connection_t; + +typedef boost::intrusive_ptr connection_ptr_t; + +// required interfaces needed so that connection_t could be used inside boost::intrusive_ptr +void intrusive_ptr_add_ref(const connection_t* p); +void intrusive_ptr_release(const connection_t* p); + +// the reply callback is expected to get an integer parameter +// indicating the result, and not to return anything +typedef std::function reply_callback_t; + +// initialize the kafka manager +bool init(CephContext* cct); + +// shutdown the kafka manager +void shutdown(); + +// connect to a kafka endpoint +connection_ptr_t connect(const std::string& url, bool use_ssl, bool verify_ssl, boost::optional ca_location); + +// publish a message over a connection that was already created +int publish(connection_ptr_t& conn, + const std::string& topic, + const std::string& message); + +// publish a message over a connection that was already created +// and pass a callback that will be invoked (async) when broker confirms +// receiving the message +int publish_with_confirm(connection_ptr_t& conn, + const std::string& topic, + const std::string& message, + reply_callback_t cb); + +// convert the integer status returned from the "publish" function to a string +std::string status_to_string(int s); + +// number of connections +size_t get_connection_count(); + +// return the number of messages that were sent +// to broker, but were not yet acked/nacked/timedout +size_t get_inflight(); + +// running counter of successfully queued 
messages +size_t get_queued(); + +// running counter of dequeued messages +size_t get_dequeued(); + +// number of maximum allowed connections +size_t get_max_connections(); + +// number of maximum allowed inflight messages +size_t get_max_inflight(); + +// maximum number of messages in the queue +size_t get_max_queue(); + +// disconnect from a kafka broker +bool disconnect(connection_ptr_t& conn); + +// display connection as string +std::string to_string(const connection_ptr_t& conn); + +} + diff --git a/src/rgw/rgw_keystone.cc b/src/rgw/rgw_keystone.cc new file mode 100644 index 00000000..956ac1bc --- /dev/null +++ b/src/rgw/rgw_keystone.cc @@ -0,0 +1,713 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include +#include +#include + +#include "common/errno.h" +#include "common/ceph_json.h" +#include "include/types.h" +#include "include/str_list.h" + +#include "rgw_common.h" +#include "rgw_keystone.h" +#include "common/ceph_crypto_cms.h" +#include "common/armor.h" +#include "common/Cond.h" +#include "rgw_perf_counters.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +int rgw_open_cms_envelope(CephContext * const cct, + const std::string& src, + std::string& dst) /* out */ +{ +#define BEGIN_CMS "-----BEGIN CMS-----" +#define END_CMS "-----END CMS-----" + + int start = src.find(BEGIN_CMS); + if (start < 0) { + ldout(cct, 0) << "failed to find " << BEGIN_CMS << " in response" << dendl; + return -EINVAL; + } + start += sizeof(BEGIN_CMS) - 1; + + int end = src.find(END_CMS); + if (end < 0) { + ldout(cct, 0) << "failed to find " << END_CMS << " in response" << dendl; + return -EINVAL; + } + + string s = src.substr(start, end - start); + + int pos = 0; + + do { + int next = s.find('\n', pos); + if (next < 0) { + dst.append(s.substr(pos)); + break; + } else { + dst.append(s.substr(pos, next - pos)); + } + pos = next + 1; + } while (pos < (int)s.size()); + + 
return 0; +} + +int rgw_decode_b64_cms(CephContext * const cct, + const string& signed_b64, + bufferlist& bl) +{ + bufferptr signed_ber(signed_b64.size() * 2); + char *dest = signed_ber.c_str(); + const char *src = signed_b64.c_str(); + size_t len = signed_b64.size(); + char buf[len + 1]; + buf[len] = '\0'; + + for (size_t i = 0; i < len; i++, src++) { + if (*src != '-') { + buf[i] = *src; + } else { + buf[i] = '/'; + } + } + + int ret = ceph_unarmor(dest, dest + signed_ber.length(), buf, + buf + signed_b64.size()); + if (ret < 0) { + ldout(cct, 0) << "ceph_unarmor() failed, ret=" << ret << dendl; + return ret; + } + + bufferlist signed_ber_bl; + signed_ber_bl.append(signed_ber); + + ret = ceph_decode_cms(cct, signed_ber_bl, bl); + if (ret < 0) { + ldout(cct, 0) << "ceph_decode_cms returned " << ret << dendl; + return ret; + } + + return 0; +} + +#define PKI_ANS1_PREFIX "MII" + +bool rgw_is_pki_token(const string& token) +{ + return token.compare(0, sizeof(PKI_ANS1_PREFIX) - 1, PKI_ANS1_PREFIX) == 0; +} + +void rgw_get_token_id(const string& token, string& token_id) +{ + if (!rgw_is_pki_token(token)) { + token_id = token; + return; + } + + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + + MD5 hash; + hash.Update((const unsigned char *)token.c_str(), token.size()); + hash.Final(m); + + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + token_id = calc_md5; +} + +bool rgw_decode_pki_token(CephContext * const cct, + const string& token, + bufferlist& bl) +{ + if (!rgw_is_pki_token(token)) { + return false; + } + + int ret = rgw_decode_b64_cms(cct, token, bl); + if (ret < 0) { + return false; + } + + ldout(cct, 20) << "successfully decoded pki token" << dendl; + + return true; +} + + +namespace rgw { +namespace keystone { + +ApiVersion CephCtxConfig::get_api_version() const noexcept +{ + switch (g_ceph_context->_conf->rgw_keystone_api_version) { + case 3: + return ApiVersion::VER_3; + case 2: + return 
ApiVersion::VER_2; + default: + dout(0) << "ERROR: wrong Keystone API version: " + << g_ceph_context->_conf->rgw_keystone_api_version + << "; falling back to v2" << dendl; + return ApiVersion::VER_2; + } +} + +std::string CephCtxConfig::get_endpoint_url() const noexcept +{ + static const std::string url = g_ceph_context->_conf->rgw_keystone_url; + + if (url.empty() || boost::algorithm::ends_with(url, "/")) { + return url; + } else { + static const std::string url_normalised = url + '/'; + return url_normalised; + } +} + +/* secrets */ +const std::string CephCtxConfig::empty{""}; + +static inline std::string read_secret(const std::string& file_path) +{ + using namespace std; + + constexpr int16_t size{1024}; + char buf[size]; + string s; + + s.reserve(size); + ifstream ifs(file_path, ios::in | ios::binary); + if (ifs) { + while (true) { + auto sbuf = ifs.rdbuf(); + auto len = sbuf->sgetn(buf, size); + if (!len) + break; + s.append(buf, len); + } + boost::algorithm::trim(s); + if (s.back() == '\n') + s.pop_back(); + } + return s; +} + +std::string CephCtxConfig::get_admin_token() const noexcept +{ + auto& atv = g_ceph_context->_conf->rgw_keystone_admin_token_path; + if (!atv.empty()) { + return read_secret(atv); + } else { + auto& atv = g_ceph_context->_conf->rgw_keystone_admin_token; + if (!atv.empty()) { + return atv; + } + } + return empty; +} + +std::string CephCtxConfig::get_admin_password() const noexcept { + auto& apv = g_ceph_context->_conf->rgw_keystone_admin_password_path; + if (!apv.empty()) { + return read_secret(apv); + } else { + auto& apv = g_ceph_context->_conf->rgw_keystone_admin_password; + if (!apv.empty()) { + return apv; + } + } + return empty; +} + +int Service::get_admin_token(CephContext* const cct, + TokenCache& token_cache, + const Config& config, + std::string& token) +{ + /* Let's check whether someone uses the deprecated "admin token" feauture + * based on a shared secret from keystone.conf file. 
*/ + const auto& admin_token = config.get_admin_token(); + if (! admin_token.empty()) { + token = std::string(admin_token.data(), admin_token.length()); + return 0; + } + + TokenEnvelope t; + + /* Try cache first before calling Keystone for a new admin token. */ + if (token_cache.find_admin(t)) { + ldout(cct, 20) << "found cached admin token" << dendl; + token = t.token.id; + return 0; + } + + /* Call Keystone now. */ + const auto ret = issue_admin_token_request(cct, config, t); + if (! ret) { + token_cache.add_admin(t); + token = t.token.id; + } + + return ret; +} + +int Service::issue_admin_token_request(CephContext* const cct, + const Config& config, + TokenEnvelope& t) +{ + std::string token_url = config.get_endpoint_url(); + if (token_url.empty()) { + return -EINVAL; + } + + bufferlist token_bl; + RGWGetKeystoneAdminToken token_req(cct, "POST", "", &token_bl); + token_req.append_header("Content-Type", "application/json"); + JSONFormatter jf; + + const auto keystone_version = config.get_api_version(); + if (keystone_version == ApiVersion::VER_2) { + AdminTokenRequestVer2 req_serializer(config); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v2.0/tokens"); + + } else if (keystone_version == ApiVersion::VER_3) { + AdminTokenRequestVer3 req_serializer(config); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v3/auth/tokens"); + } else { + return -ENOTSUP; + } + + token_req.set_url(token_url); + + const int ret = token_req.process(); + if (ret < 0) { + return ret; + } + + /* Detect rejection earlier than during the token parsing step. 
*/ + if (token_req.get_http_status() == + RGWGetKeystoneAdminToken::HTTP_STATUS_UNAUTHORIZED) { + return -EACCES; + } + + if (t.parse(cct, token_req.get_subject_token(), token_bl, + keystone_version) != 0) { + return -EINVAL; + } + + return 0; +} + +int Service::get_keystone_barbican_token(CephContext * const cct, + std::string& token) +{ + using keystone_config_t = rgw::keystone::CephCtxConfig; + using keystone_cache_t = rgw::keystone::TokenCache; + + auto& config = keystone_config_t::get_instance(); + auto& token_cache = keystone_cache_t::get_instance(); + + std::string token_url = config.get_endpoint_url(); + if (token_url.empty()) { + return -EINVAL; + } + + rgw::keystone::TokenEnvelope t; + + /* Try cache first. */ + if (token_cache.find_barbican(t)) { + ldout(cct, 20) << "found cached barbican token" << dendl; + token = t.token.id; + return 0; + } + + bufferlist token_bl; + RGWKeystoneHTTPTransceiver token_req(cct, "POST", "", &token_bl); + token_req.append_header("Content-Type", "application/json"); + JSONFormatter jf; + + const auto keystone_version = config.get_api_version(); + if (keystone_version == ApiVersion::VER_2) { + rgw::keystone::BarbicanTokenRequestVer2 req_serializer(cct); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v2.0/tokens"); + + } else if (keystone_version == ApiVersion::VER_3) { + BarbicanTokenRequestVer3 req_serializer(cct); + req_serializer.dump(&jf); + + std::stringstream ss; + jf.flush(ss); + token_req.set_post_data(ss.str()); + token_req.set_send_length(ss.str().length()); + token_url.append("v3/auth/tokens"); + } else { + return -ENOTSUP; + } + + token_req.set_url(token_url); + + ldout(cct, 20) << "Requesting secret from barbican url=" << token_url << dendl; + const int ret = token_req.process(); + if (ret < 0) { + ldout(cct, 20) << "Barbican process error:" << token_bl.c_str() << dendl; + return ret; + 
} + + /* Detect rejection earlier than during the token parsing step. */ + if (token_req.get_http_status() == + RGWKeystoneHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) { + return -EACCES; + } + + if (t.parse(cct, token_req.get_subject_token(), token_bl, + keystone_version) != 0) { + return -EINVAL; + } + + token_cache.add_barbican(t); + token = t.token.id; + return 0; +} + + +bool TokenEnvelope::has_role(const std::string& r) const +{ + list::const_iterator iter; + for (iter = roles.cbegin(); iter != roles.cend(); ++iter) { + if (fnmatch(r.c_str(), ((*iter).name.c_str()), 0) == 0) { + return true; + } + } + return false; +} + +int TokenEnvelope::parse(CephContext* const cct, + const std::string& token_str, + ceph::bufferlist& bl, + const ApiVersion version) +{ + JSONParser parser; + if (! parser.parse(bl.c_str(), bl.length())) { + ldout(cct, 0) << "Keystone token parse error: malformed json" << dendl; + return -EINVAL; + } + + JSONObjIter token_iter = parser.find_first("token"); + JSONObjIter access_iter = parser.find_first("access"); + + try { + if (version == rgw::keystone::ApiVersion::VER_2) { + if (! access_iter.end()) { + decode_v2(*access_iter); + } else if (! token_iter.end()) { + /* TokenEnvelope structure doesn't follow Identity API v2, so let's + * fallback to v3. Otherwise we can assume it's wrongly formatted. + * The whole mechanism is a workaround for s3_token middleware that + * speaks in v2 disregarding the promise to go with v3. */ + decode_v3(*token_iter); + + /* Identity v3 conveys the token inforamtion not as a part of JSON but + * in the X-Subject-Token HTTP header we're getting from caller. */ + token.id = token_str; + } else { + return -EINVAL; + } + } else if (version == rgw::keystone::ApiVersion::VER_3) { + if (! token_iter.end()) { + decode_v3(*token_iter); + /* v3 suceeded. We have to fill token.id from external input as it + * isn't a part of the JSON response anymore. It has been moved + * to X-Subject-Token HTTP header instead. 
*/ + token.id = token_str; + } else if (! access_iter.end()) { + /* If the token cannot be parsed according to V3, try V2. */ + decode_v2(*access_iter); + } else { + return -EINVAL; + } + } else { + return -ENOTSUP; + } + } catch (JSONDecoder::err& err) { + ldout(cct, 0) << "Keystone token parse error: " << err.message << dendl; + return -EINVAL; + } + + return 0; +} + +bool TokenCache::find(const std::string& token_id, + rgw::keystone::TokenEnvelope& token) +{ + Mutex::Locker l(lock); + return find_locked(token_id, token); +} + +bool TokenCache::find_locked(const std::string& token_id, + rgw::keystone::TokenEnvelope& token) +{ + ceph_assert(lock.is_locked_by_me()); + map::iterator iter = tokens.find(token_id); + if (iter == tokens.end()) { + if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_miss); + return false; + } + + token_entry& entry = iter->second; + tokens_lru.erase(entry.lru_iter); + + if (entry.token.expired()) { + tokens.erase(iter); + if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit); + return false; + } + token = entry.token; + + tokens_lru.push_front(token_id); + entry.lru_iter = tokens_lru.begin(); + + if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit); + + return true; +} + +bool TokenCache::find_admin(rgw::keystone::TokenEnvelope& token) +{ + Mutex::Locker l(lock); + + return find_locked(admin_token_id, token); +} + +bool TokenCache::find_barbican(rgw::keystone::TokenEnvelope& token) +{ + Mutex::Locker l(lock); + + return find_locked(barbican_token_id, token); +} + +void TokenCache::add(const std::string& token_id, + const rgw::keystone::TokenEnvelope& token) +{ + Mutex::Locker l(lock); + add_locked(token_id, token); +} + +void TokenCache::add_locked(const std::string& token_id, + const rgw::keystone::TokenEnvelope& token) +{ + ceph_assert(lock.is_locked_by_me()); + map::iterator iter = tokens.find(token_id); + if (iter != tokens.end()) { + token_entry& e = iter->second; + tokens_lru.erase(e.lru_iter); + } 
+ + tokens_lru.push_front(token_id); + token_entry& entry = tokens[token_id]; + entry.token = token; + entry.lru_iter = tokens_lru.begin(); + + while (tokens_lru.size() > max) { + list::reverse_iterator riter = tokens_lru.rbegin(); + iter = tokens.find(*riter); + ceph_assert(iter != tokens.end()); + tokens.erase(iter); + tokens_lru.pop_back(); + } +} + +void TokenCache::add_admin(const rgw::keystone::TokenEnvelope& token) +{ + Mutex::Locker l(lock); + + rgw_get_token_id(token.token.id, admin_token_id); + add_locked(admin_token_id, token); +} + +void TokenCache::add_barbican(const rgw::keystone::TokenEnvelope& token) +{ + Mutex::Locker l(lock); + + rgw_get_token_id(token.token.id, barbican_token_id); + add_locked(barbican_token_id, token); +} + +void TokenCache::invalidate(const std::string& token_id) +{ + Mutex::Locker l(lock); + map::iterator iter = tokens.find(token_id); + if (iter == tokens.end()) + return; + + ldout(cct, 20) << "invalidating revoked token id=" << token_id << dendl; + token_entry& e = iter->second; + tokens_lru.erase(e.lru_iter); + tokens.erase(iter); +} + +int TokenCache::RevokeThread::check_revoked() +{ + std::string url; + std::string token; + + bufferlist bl; + RGWGetRevokedTokens req(cct, "GET", "", &bl); + + if (rgw::keystone::Service::get_admin_token(cct, *cache, config, token) < 0) { + return -EINVAL; + } + + url = config.get_endpoint_url(); + if (url.empty()) { + return -EINVAL; + } + + req.append_header("X-Auth-Token", token); + + const auto keystone_version = config.get_api_version(); + if (keystone_version == rgw::keystone::ApiVersion::VER_2) { + url.append("v2.0/tokens/revoked"); + } else if (keystone_version == rgw::keystone::ApiVersion::VER_3) { + url.append("v3/auth/tokens/OS-PKI/revoked"); + } + + req.set_url(url); + + req.set_send_length(0); + int ret = req.process(); + if (ret < 0) { + return ret; + } + + bl.append((char)0); // NULL terminate for debug output + + ldout(cct, 10) << "request returned " << bl.c_str() << dendl; + 
+ JSONParser parser; + + if (!parser.parse(bl.c_str(), bl.length())) { + ldout(cct, 0) << "malformed json" << dendl; + return -EINVAL; + } + + JSONObjIter iter = parser.find_first("signed"); + if (iter.end()) { + ldout(cct, 0) << "revoked tokens response is missing signed section" << dendl; + return -EINVAL; + } + + JSONObj *signed_obj = *iter; + const std::string signed_str = signed_obj->get_data(); + + ldout(cct, 10) << "signed=" << signed_str << dendl; + + std::string signed_b64; + ret = rgw_open_cms_envelope(cct, signed_str, signed_b64); + if (ret < 0) { + return ret; + } + + ldout(cct, 10) << "content=" << signed_b64 << dendl; + + bufferlist json; + ret = rgw_decode_b64_cms(cct, signed_b64, json); + if (ret < 0) { + return ret; + } + + ldout(cct, 10) << "ceph_decode_cms: decoded: " << json.c_str() << dendl; + + JSONParser list_parser; + if (!list_parser.parse(json.c_str(), json.length())) { + ldout(cct, 0) << "malformed json" << dendl; + return -EINVAL; + } + + JSONObjIter revoked_iter = list_parser.find_first("revoked"); + if (revoked_iter.end()) { + ldout(cct, 0) << "no revoked section in json" << dendl; + return -EINVAL; + } + + JSONObj *revoked_obj = *revoked_iter; + + JSONObjIter tokens_iter = revoked_obj->find_first(); + for (; !tokens_iter.end(); ++tokens_iter) { + JSONObj *o = *tokens_iter; + + JSONObj *token = o->find_obj("id"); + if (!token) { + ldout(cct, 0) << "bad token in array, missing id" << dendl; + continue; + } + + const std::string token_id = token->get_data(); + cache->invalidate(token_id); + } + + return 0; +} + +bool TokenCache::going_down() const +{ + return down_flag; +} + +void* TokenCache::RevokeThread::entry() +{ + do { + ldout(cct, 2) << "keystone revoke thread: start" << dendl; + int r = check_revoked(); + if (r < 0) { + ldout(cct, 0) << "ERROR: keystone revocation processing returned error r=" + << r << dendl; + } + + if (cache->going_down()) { + break; + } + + lock.Lock(); + cond.WaitInterval(lock, + 
utime_t(cct->_conf->rgw_keystone_revocation_interval, 0)); + lock.Unlock(); + } while (!cache->going_down()); + + return nullptr; +} + +void TokenCache::RevokeThread::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} + +}; /* namespace keystone */ +}; /* namespace rgw */ diff --git a/src/rgw/rgw_keystone.h b/src/rgw/rgw_keystone.h new file mode 100644 index 00000000..55ad2f94 --- /dev/null +++ b/src/rgw/rgw_keystone.h @@ -0,0 +1,373 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_KEYSTONE_H +#define CEPH_RGW_KEYSTONE_H + +#include + +#include +#include + +#include "rgw_common.h" +#include "rgw_http_client.h" +#include "common/Cond.h" +#include "global/global_init.h" + +#include + +int rgw_open_cms_envelope(CephContext *cct, + const std::string& src, + std::string& dst); /* out */ +int rgw_decode_b64_cms(CephContext *cct, + const string& signed_b64, + bufferlist& bl); +bool rgw_is_pki_token(const string& token); +void rgw_get_token_id(const string& token, string& token_id); +static inline std::string rgw_get_token_id(const string& token) +{ + std::string token_id; + rgw_get_token_id(token, token_id); + + return token_id; +} +bool rgw_decode_pki_token(CephContext *cct, + const string& token, + bufferlist& bl); + +namespace rgw { +namespace keystone { + +enum class ApiVersion { + VER_2, + VER_3 +}; + + +class Config { +protected: + Config() = default; + virtual ~Config() = default; + +public: + virtual std::string get_endpoint_url() const noexcept = 0; + virtual ApiVersion get_api_version() const noexcept = 0; + + virtual std::string get_admin_token() const noexcept = 0; + virtual boost::string_ref get_admin_user() const noexcept = 0; + virtual std::string get_admin_password() const noexcept = 0; + virtual boost::string_ref get_admin_tenant() const noexcept = 0; + virtual boost::string_ref get_admin_project() const noexcept = 0; + virtual boost::string_ref get_admin_domain() const noexcept 
= 0; +}; + +class CephCtxConfig : public Config { +protected: + CephCtxConfig() = default; + virtual ~CephCtxConfig() = default; + + const static std::string empty; + +public: + static CephCtxConfig& get_instance() { + static CephCtxConfig instance; + return instance; + } + + std::string get_endpoint_url() const noexcept override; + ApiVersion get_api_version() const noexcept override; + + std::string get_admin_token() const noexcept override; + + boost::string_ref get_admin_user() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_user; + } + + std::string get_admin_password() const noexcept override; + + boost::string_ref get_admin_tenant() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_tenant; + } + + boost::string_ref get_admin_project() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_project; + } + + boost::string_ref get_admin_domain() const noexcept override { + return g_ceph_context->_conf->rgw_keystone_admin_domain; + } +}; + + +class TokenEnvelope; +class TokenCache; + +class Service { +public: + class RGWKeystoneHTTPTransceiver : public RGWHTTPTransceiver { + public: + RGWKeystoneHTTPTransceiver(CephContext * const cct, + const string& method, + const string& url, + bufferlist * const token_body_bl) + : RGWHTTPTransceiver(cct, method, url, token_body_bl, + cct->_conf->rgw_keystone_verify_ssl, + { "X-Subject-Token" }) { + } + + const header_value_t& get_subject_token() const { + try { + return get_header_value("X-Subject-Token"); + } catch (std::out_of_range&) { + static header_value_t empty_val; + return empty_val; + } + } + }; + + typedef RGWKeystoneHTTPTransceiver RGWValidateKeystoneToken; + typedef RGWKeystoneHTTPTransceiver RGWGetKeystoneAdminToken; + typedef RGWKeystoneHTTPTransceiver RGWGetRevokedTokens; + + static int get_admin_token(CephContext* const cct, + TokenCache& token_cache, + const Config& config, + std::string& token); + static int 
issue_admin_token_request(CephContext* const cct, + const Config& config, + TokenEnvelope& token); + static int get_keystone_barbican_token(CephContext * const cct, + std::string& token); +}; + + +class TokenEnvelope { +public: + class Domain { + public: + string id; + string name; + void decode_json(JSONObj *obj); + }; + class Project { + public: + Domain domain; + string id; + string name; + void decode_json(JSONObj *obj); + }; + + class Token { + public: + Token() : expires(0) { } + string id; + time_t expires; + Project tenant_v2; + void decode_json(JSONObj *obj); + }; + + class Role { + public: + string id; + string name; + void decode_json(JSONObj *obj); + }; + + class User { + public: + string id; + string name; + Domain domain; + list roles_v2; + void decode_json(JSONObj *obj); + }; + + Token token; + Project project; + User user; + list roles; + + void decode_v3(JSONObj* obj); + void decode_v2(JSONObj* obj); + +public: + /* We really need the default ctor because of the internals of TokenCache. 
*/ + TokenEnvelope() = default; + + time_t get_expires() const { return token.expires; } + const std::string& get_domain_id() const {return project.domain.id;}; + const std::string& get_domain_name() const {return project.domain.name;}; + const std::string& get_project_id() const {return project.id;}; + const std::string& get_project_name() const {return project.name;}; + const std::string& get_user_id() const {return user.id;}; + const std::string& get_user_name() const {return user.name;}; + bool has_role(const string& r) const; + bool expired() const { + const uint64_t now = ceph_clock_now().sec(); + return now >= static_cast(get_expires()); + } + int parse(CephContext* cct, + const std::string& token_str, + ceph::buffer::list& bl /* in */, + ApiVersion version); +}; + + +class TokenCache { + struct token_entry { + TokenEnvelope token; + list::iterator lru_iter; + }; + + std::atomic down_flag = { false }; + + class RevokeThread : public Thread { + friend class TokenCache; + typedef RGWPostHTTPData RGWGetRevokedTokens; + + CephContext* const cct; + TokenCache* const cache; + const rgw::keystone::Config& config; + + Mutex lock; + Cond cond; + + RevokeThread(CephContext* const cct, + TokenCache* const cache, + const rgw::keystone::Config& config) + : cct(cct), + cache(cache), + config(config), + lock("rgw::keystone::TokenCache::RevokeThread") { + } + + void *entry() override; + void stop(); + int check_revoked(); + } revocator; + + const boost::intrusive_ptr cct; + + std::string admin_token_id; + std::string barbican_token_id; + std::map tokens; + std::list tokens_lru; + + Mutex lock; + + const size_t max; + + explicit TokenCache(const rgw::keystone::Config& config) + : revocator(g_ceph_context, this, config), + cct(g_ceph_context), + lock("rgw::keystone::TokenCache"), + max(cct->_conf->rgw_keystone_token_cache_size) { + /* revocation logic needs to be smarter, but meanwhile, + * make it optional. 
/* Admin token request serializer used for Keystone API version 2
 * (Service::issue_admin_token_request() dumps it into the POST body sent to
 * "v2.0/tokens"). */
class AdminTokenRequestVer2 : public AdminTokenRequest {
  /* Held by reference: the Config passed to the constructor must outlive
   * this object. */
  const Config& conf;

public:
  explicit AdminTokenRequestVer2(const Config& conf)
    : conf(conf) {
  }
  /* Writes the request through the given Formatter; defined out of line. */
  void dump(Formatter *f) const override;
};
conf; + +public: + explicit AdminTokenRequestVer3(const Config& conf) + : conf(conf) { + } + void dump(Formatter *f) const override; +}; + +class BarbicanTokenRequestVer2 : public AdminTokenRequest { + CephContext *cct; + +public: + explicit BarbicanTokenRequestVer2(CephContext * const _cct) + : cct(_cct) { + } + void dump(Formatter *f) const override; +}; + +class BarbicanTokenRequestVer3 : public AdminTokenRequest { + CephContext *cct; + +public: + explicit BarbicanTokenRequestVer3(CephContext * const _cct) + : cct(_cct) { + } + void dump(Formatter *f) const override; +}; + + +}; /* namespace keystone */ +}; /* namespace rgw */ + +#endif diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc new file mode 100644 index 00000000..eeb14be1 --- /dev/null +++ b/src/rgw/rgw_lc.cc @@ -0,0 +1,1678 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include +#include +#include + +#include "common/Formatter.h" +#include +#include "include/random.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/lock/cls_lock_client.h" +#include "rgw_common.h" +#include "rgw_bucket.h" +#include "rgw_lc.h" +#include "rgw_zone.h" +#include "rgw_string.h" + +// this seems safe to use, at least for now--arguably, we should +// prefer header-only fmt, in general +#undef FMT_HEADER_ONLY +#define FMT_HEADER_ONLY 1 +#include "seastar/fmt/include/fmt/format.h" + +#include "services/svc_sys_obj.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +const char* LC_STATUS[] = { + "UNINITIAL", + "PROCESSING", + "FAILED", + "COMPLETE" +}; + +using namespace librados; + +bool LCRule::valid() const +{ + if (id.length() > MAX_ID_LEN) { + return false; + } + else if(expiration.empty() && noncur_expiration.empty() && mp_expiration.empty() && !dm_expiration && + transitions.empty() && noncur_transitions.empty()) { + return false; + } + else if (!expiration.valid() || !noncur_expiration.valid() || 
!mp_expiration.valid()) { + return false; + } + if (!transitions.empty()) { + bool using_days = expiration.has_days(); + bool using_date = expiration.has_date(); + for (const auto& elem : transitions) { + if (!elem.second.valid()) { + return false; + } + using_days = using_days || elem.second.has_days(); + using_date = using_date || elem.second.has_date(); + if (using_days && using_date) { + return false; + } + } + } + for (const auto& elem : noncur_transitions) { + if (!elem.second.valid()) { + return false; + } + } + + return true; +} + +void LCRule::init_simple_days_rule(std::string_view _id, std::string_view _prefix, int num_days) +{ + id = _id; + prefix = _prefix; + char buf[32]; + snprintf(buf, sizeof(buf), "%d", num_days); + expiration.set_days(buf); + set_enabled(true); +} + +void RGWLifecycleConfiguration::add_rule(const LCRule& rule) +{ + auto& id = rule.get_id(); // note that this will return false for groups, but that's ok, we won't search groups + rule_map.insert(pair(id, rule)); +} + +bool RGWLifecycleConfiguration::_add_rule(const LCRule& rule) +{ + lc_op op(rule.get_id()); + op.status = rule.is_enabled(); + if (rule.get_expiration().has_days()) { + op.expiration = rule.get_expiration().get_days(); + } + if (rule.get_expiration().has_date()) { + op.expiration_date = ceph::from_iso_8601(rule.get_expiration().get_date()); + } + if (rule.get_noncur_expiration().has_days()) { + op.noncur_expiration = rule.get_noncur_expiration().get_days(); + } + if (rule.get_mp_expiration().has_days()) { + op.mp_expiration = rule.get_mp_expiration().get_days(); + } + op.dm_expiration = rule.get_dm_expiration(); + for (const auto &elem : rule.get_transitions()) { + transition_action action; + if (elem.second.has_days()) { + action.days = elem.second.get_days(); + } else { + action.date = ceph::from_iso_8601(elem.second.get_date()); + } + action.storage_class = rgw_placement_rule::get_canonical_storage_class(elem.first); + op.transitions.emplace(elem.first, 
std::move(action)); + } + for (const auto &elem : rule.get_noncur_transitions()) { + transition_action action; + action.days = elem.second.get_days(); + action.date = ceph::from_iso_8601(elem.second.get_date()); + action.storage_class = elem.first; + op.noncur_transitions.emplace(elem.first, std::move(action)); + } + std::string prefix; + if (rule.get_filter().has_prefix()){ + prefix = rule.get_filter().get_prefix(); + } else { + prefix = rule.get_prefix(); + } + + if (rule.get_filter().has_tags()){ + op.obj_tags = rule.get_filter().get_tags(); + } + prefix_map.emplace(std::move(prefix), std::move(op)); + return true; +} + +int RGWLifecycleConfiguration::check_and_add_rule(const LCRule& rule) +{ + if (!rule.valid()) { + return -EINVAL; + } + auto& id = rule.get_id(); + if (rule_map.find(id) != rule_map.end()) { //id shouldn't be the same + return -EINVAL; + } + rule_map.insert(pair(id, rule)); + + if (!_add_rule(rule)) { + return -ERR_INVALID_REQUEST; + } + return 0; +} + +bool RGWLifecycleConfiguration::has_same_action(const lc_op& first, const lc_op& second) { + if ((first.expiration > 0 || first.expiration_date != boost::none) && + (second.expiration > 0 || second.expiration_date != boost::none)) { + return true; + } else if (first.noncur_expiration > 0 && second.noncur_expiration > 0) { + return true; + } else if (first.mp_expiration > 0 && second.mp_expiration > 0) { + return true; + } else if (!first.transitions.empty() && !second.transitions.empty()) { + for (auto &elem : first.transitions) { + if (second.transitions.find(elem.first) != second.transitions.end()) { + return true; + } + } + } else if (!first.noncur_transitions.empty() && !second.noncur_transitions.empty()) { + for (auto &elem : first.noncur_transitions) { + if (second.noncur_transitions.find(elem.first) != second.noncur_transitions.end()) { + return true; + } + } + } + return false; +} + +/* Formerly, this method checked for duplicate rules using an invalid + * method (prefix uniqueness). 
*/ +bool RGWLifecycleConfiguration::valid() +{ + return true; +} + +void *RGWLC::LCWorker::entry() { + do { + utime_t start = ceph_clock_now(); + if (should_work(start)) { + ldpp_dout(dpp, 2) << "life cycle: start" << dendl; + int r = lc->process(); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: do life cycle process() returned error r=" << r << dendl; + } + ldpp_dout(dpp, 2) << "life cycle: stop" << dendl; + } + if (lc->going_down()) + break; + + utime_t end = ceph_clock_now(); + int secs = schedule_next_start_time(start, end); + utime_t next; + next.set_from_double(end + secs); + + ldpp_dout(dpp, 5) << "schedule life cycle next start time: " << rgw_to_asctime(next) << dendl; + + lock.Lock(); + cond.WaitInterval(lock, utime_t(secs, 0)); + lock.Unlock(); + } while (!lc->going_down()); + + return NULL; +} + +void RGWLC::initialize(CephContext *_cct, RGWRados *_store) { + cct = _cct; + store = _store; + max_objs = cct->_conf->rgw_lc_max_objs; + if (max_objs > HASH_PRIME) + max_objs = HASH_PRIME; + + obj_names = new string[max_objs]; + + for (int i = 0; i < max_objs; i++) { + obj_names[i] = lc_oid_prefix; + char buf[32]; + snprintf(buf, 32, ".%d", i); + obj_names[i].append(buf); + } + +#define COOKIE_LEN 16 + char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(cct, cookie_buf, sizeof(cookie_buf) - 1); + cookie = cookie_buf; +} + +void RGWLC::finalize() +{ + delete[] obj_names; +} + +bool RGWLC::if_already_run_today(time_t& start_date) +{ + struct tm bdt; + time_t begin_of_day; + utime_t now = ceph_clock_now(); + localtime_r(&start_date, &bdt); + + if (cct->_conf->rgw_lc_debug_interval > 0) { + if (now - start_date < cct->_conf->rgw_lc_debug_interval) + return true; + else + return false; + } + + bdt.tm_hour = 0; + bdt.tm_min = 0; + bdt.tm_sec = 0; + begin_of_day = mktime(&bdt); + if (now - begin_of_day < 24*60*60) + return true; + else + return false; +} + +int RGWLC::bucket_lc_prepare(int index) +{ + map entries; + + string marker; + +#define MAX_LC_LIST_ENTRIES 
100 + do { + int ret = cls_rgw_lc_list(store->lc_pool_ctx, obj_names[index], marker, MAX_LC_LIST_ENTRIES, entries); + if (ret < 0) + return ret; + map::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + pair entry(iter->first, lc_uninitial); + ret = cls_rgw_lc_set_entry(store->lc_pool_ctx, obj_names[index], entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::bucket_lc_prepare() failed to set entry on " + << obj_names[index] << dendl; + return ret; + } + } + + if (!entries.empty()) { + marker = std::move(entries.rbegin()->first); + } + } while (!entries.empty()); + + return 0; +} + +static bool obj_has_expired(CephContext *cct, ceph::real_time mtime, int days, ceph::real_time *expire_time = nullptr) +{ + double timediff, cmp; + utime_t base_time; + if (cct->_conf->rgw_lc_debug_interval <= 0) { + /* Normal case, run properly */ + cmp = double(days)*24*60*60; + base_time = ceph_clock_now().round_to_day(); + } else { + /* We're in debug mode; Treat each rgw_lc_debug_interval seconds as a day */ + cmp = double(days)*cct->_conf->rgw_lc_debug_interval; + base_time = ceph_clock_now(); + } + timediff = base_time - ceph::real_clock::to_time_t(mtime); + + if (expire_time) { + *expire_time = mtime + make_timespan(cmp); + } + ldout(cct, 20) << __func__ << "(): mtime=" << mtime << " days=" << days << " base_time=" << base_time << " timediff=" << timediff << " cmp=" << cmp << dendl; + + return (timediff >= cmp); +} + +static bool pass_object_lock_check(RGWRados *store, RGWBucketInfo& bucket_info, rgw_obj& obj, RGWObjectCtx& ctx) +{ + if (!bucket_info.obj_lock_enabled()) { + return true; + } + RGWRados::Object op_target(store, bucket_info, ctx, obj); + RGWRados::Object::Read read_op(&op_target); + map attrs; + read_op.params.attrs = &attrs; + int ret = read_op.prepare(); + if (ret < 0) { + if (ret == -ENOENT) { + return true; + } else { + return false; + } + } else { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter != attrs.end()) 
{ + RGWObjectRetention retention; + try { + decode(retention, iter->second); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + return false; + } + if (ceph::real_clock::to_time_t(retention.get_retain_until_date()) > ceph_clock_now()) { + return false; + } + } + iter = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (iter != attrs.end()) { + RGWObjectLegalHold obj_legal_hold; + try { + decode(obj_legal_hold, iter->second); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl; + return false; + } + if (obj_legal_hold.is_enabled()) { + return false; + } + } + return true; + } +} + +int RGWLC::handle_multipart_expiration( + RGWRados::Bucket *target, const multimap& prefix_map) +{ + MultipartMetaFilter mp_filter; + vector objs; + RGWMPObj mp_obj; + bool is_truncated; + int ret; + RGWBucketInfo& bucket_info = target->get_bucket_info(); + RGWRados::Bucket::List list_op(target); + auto delay_ms = cct->_conf.get_val("rgw_lc_thread_delay"); + list_op.params.list_versions = false; + /* lifecycle processing does not depend on total order, so can + * take advantage of unorderd listing optimizations--such as + * operating on one shard at a time */ + list_op.params.allow_unordered = true; + list_op.params.ns = RGW_OBJ_NS_MULTIPART; + list_op.params.filter = &mp_filter; + for (auto prefix_iter = prefix_map.begin(); prefix_iter != prefix_map.end(); ++prefix_iter) { + if (!prefix_iter->second.status || prefix_iter->second.mp_expiration <= 0) { + continue; + } + list_op.params.prefix = prefix_iter->first; + do { + objs.clear(); + list_op.params.marker = list_op.get_next_marker(); + ret = list_op.list_objects(1000, &objs, NULL, &is_truncated); + if (ret < 0) { + if (ret == (-ENOENT)) + return 0; + ldpp_dout(this, 0) << "ERROR: store->list_objects():" <meta.mtime, prefix_iter->second.mp_expiration)) { + rgw_obj_key key(obj_iter->key); + if 
(!mp_obj.from_meta(key.name)) { + continue; + } + RGWObjectCtx rctx(store); + ret = abort_multipart_upload(store, cct, &rctx, bucket_info, mp_obj); + if (ret < 0 && ret != -ERR_NO_SUCH_UPLOAD) { + ldpp_dout(this, 0) << "ERROR: abort_multipart_upload failed, ret=" << ret << ", meta:" << obj_iter->key << dendl; + } else if (ret == -ERR_NO_SUCH_UPLOAD) { + ldpp_dout(this, 5) << "ERROR: abort_multipart_upload failed, ret=" << ret << ", meta:" << obj_iter->key << dendl; + } + if (going_down()) + return 0; + } + } /* for objs */ + std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms)); + } while(is_truncated); + } + return 0; +} + +static int read_obj_tags(RGWRados *store, RGWBucketInfo& bucket_info, rgw_obj& obj, RGWObjectCtx& ctx, bufferlist& tags_bl) +{ + RGWRados::Object op_target(store, bucket_info, ctx, obj); + RGWRados::Object::Read read_op(&op_target); + + return read_op.get_attr(RGW_ATTR_TAGS, tags_bl); +} + +static bool is_valid_op(const lc_op& op) +{ + return (op.status && + (op.expiration > 0 + || op.expiration_date != boost::none + || op.noncur_expiration > 0 + || op.dm_expiration + || !op.transitions.empty() + || !op.noncur_transitions.empty())); +} + +static inline bool has_all_tags(const lc_op& rule_action, + const RGWObjTags& object_tags) +{ + if(! 
rule_action.obj_tags) + return false; + if(object_tags.count() < rule_action.obj_tags->count()) + return false; + size_t tag_count = 0; + for (const auto& tag : object_tags.get_tags()) { + const auto& rule_tags = rule_action.obj_tags->get_tags(); + const auto& iter = rule_tags.find(tag.first); + if(iter == rule_tags.end()) + continue; + if(iter->second == tag.second) + { + tag_count++; + } + /* all tags in the rule appear in obj tags */ + } + return tag_count == rule_action.obj_tags->count(); +} + +class LCObjsLister { + RGWRados *store; + RGWBucketInfo& bucket_info; + RGWRados::Bucket target; + RGWRados::Bucket::List list_op; + bool is_truncated{false}; + rgw_obj_key next_marker; + string prefix; + vector objs; + vector::iterator obj_iter; + rgw_bucket_dir_entry pre_obj; + int64_t delay_ms; + +public: + LCObjsLister(RGWRados *_store, RGWBucketInfo& _bucket_info) : + store(_store), bucket_info(_bucket_info), + target(store, bucket_info), list_op(&target) { + list_op.params.list_versions = bucket_info.versioned(); + list_op.params.allow_unordered = true; + delay_ms = store->ctx()->_conf.get_val("rgw_lc_thread_delay"); + } + + void set_prefix(const string& p) { + prefix = p; + list_op.params.prefix = prefix; + } + + int init() { + return fetch(); + } + + int fetch() { + int ret = list_op.list_objects(1000, &objs, NULL, &is_truncated); + if (ret < 0) { + return ret; + } + + obj_iter = objs.begin(); + + return 0; + } + + void delay() { + std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms)); + } + + bool get_obj(rgw_bucket_dir_entry *obj) { + if (obj_iter == objs.end()) { + delay(); + return false; + } + if (is_truncated && (obj_iter + 1)==objs.end()) { + list_op.params.marker = obj_iter->key; + + int ret = fetch(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: list_op returned ret=" << ret << dendl; + return ret; + } else { + obj_iter = objs.begin(); + } + delay(); + } + *obj = *obj_iter; + return true; + } + + rgw_bucket_dir_entry get_prev_obj() { + 
return pre_obj; + } + + void next() { + pre_obj = *obj_iter; + ++obj_iter; + } + + bool next_has_same_name() + { + if ((obj_iter + 1) == objs.end()) { + /* this should have been called after get_obj() was called, so this should + * only happen if is_truncated is false */ + return false; + } + return (obj_iter->key.name.compare((obj_iter + 1)->key.name) == 0); + } +}; + + +struct op_env { + lc_op& op; + RGWRados *store; + RGWLC *lc; + RGWBucketInfo& bucket_info; + LCObjsLister& ol; + + op_env(lc_op& _op, RGWRados *_store, RGWLC *_lc, RGWBucketInfo& _bucket_info, + LCObjsLister& _ol) : op(_op), store(_store), lc(_lc), bucket_info(_bucket_info), ol(_ol) {} +}; + +class LCRuleOp; + +struct lc_op_ctx { + CephContext *cct; + op_env& env; + rgw_bucket_dir_entry& o; + + RGWRados *store; + RGWBucketInfo& bucket_info; + lc_op& op; + LCObjsLister& ol; + + rgw_obj obj; + RGWObjectCtx rctx; + + lc_op_ctx(op_env& _env, rgw_bucket_dir_entry& _o) : cct(_env.store->ctx()), env(_env), o(_o), + store(env.store), bucket_info(env.bucket_info), op(env.op), ol(env.ol), + obj(env.bucket_info.bucket, o.key), rctx(env.store) {} +}; + +static int remove_expired_obj(lc_op_ctx& oc, bool remove_indeed) +{ + auto& store = oc.store; + auto& bucket_info = oc.bucket_info; + auto& o = oc.o; + auto obj_key = o.key; + auto& meta = o.meta; + + if (!remove_indeed) { + obj_key.instance.clear(); + } else if (obj_key.instance.empty()) { + obj_key.instance = "null"; + } + + rgw_obj obj(bucket_info.bucket, obj_key); + ACLOwner obj_owner; + obj_owner.set_id(rgw_user {meta.owner}); + obj_owner.set_name(meta.owner_display_name); + + RGWRados::Object del_target(store, bucket_info, oc.rctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = bucket_info.owner; + del_op.params.versioning_status = bucket_info.versioning_status(); + del_op.params.obj_owner = obj_owner; + del_op.params.unmod_since = meta.mtime; + + return del_op.delete_obj(); +} + +class LCOpAction { +public: + 
virtual ~LCOpAction() {} + + virtual bool check(lc_op_ctx& oc, ceph::real_time *exp_time) { + return false; + }; + + /* called after check(). Check should tell us whether this action + * is applicable. If there are multiple actions, we'll end up executing + * the latest applicable action + * For example: + * one action after 10 days, another after 20, third after 40. + * After 10 days, the latest applicable action would be the first one, + * after 20 days it will be the second one. After 21 days it will still be the + * second one. So check() should return true for the second action at that point, + * but should_process() if the action has already been applied. In object removal + * it doesn't matter, but in object transition it does. + */ + virtual bool should_process() { + return true; + } + + virtual int process(lc_op_ctx& oc) { + return 0; + } +}; + +class LCOpFilter { +public: +virtual ~LCOpFilter() {} + virtual bool check(lc_op_ctx& oc) { + return false; + } +}; + +class LCOpRule { + friend class LCOpAction; + + op_env& env; + + std::vector > filters; + std::vector > actions; + +public: + LCOpRule(op_env& _env) : env(_env) {} + + void build(); + int process(rgw_bucket_dir_entry& o); +}; + +static int check_tags(lc_op_ctx& oc, bool *skip) +{ + auto& op = oc.op; + + if (op.obj_tags != boost::none) { + *skip = true; + + bufferlist tags_bl; + int ret = read_obj_tags(oc.store, oc.bucket_info, oc.obj, oc.rctx, tags_bl); + if (ret < 0) { + if (ret != -ENODATA) { + ldout(oc.cct, 5) << "ERROR: read_obj_tags returned r=" << ret << dendl; + } + return 0; + } + RGWObjTags dest_obj_tags; + try { + auto iter = tags_bl.cbegin(); + dest_obj_tags.decode(iter); + } catch (buffer::error& err) { + ldout(oc.cct,0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + return -EIO; + } + + if (! 
has_all_tags(op, dest_obj_tags)) { + ldout(oc.cct, 20) << __func__ << "() skipping obj " << oc.obj << " as tags do not match in rule: " << op.id << dendl; + return 0; + } + } + *skip = false; + return 0; +} + +class LCOpFilter_Tags : public LCOpFilter { +public: + bool check(lc_op_ctx& oc) override { + auto& o = oc.o; + + if (o.is_delete_marker()) { + return true; + } + + bool skip; + + int ret = check_tags(oc, &skip); + if (ret < 0) { + if (ret == -ENOENT) { + return false; + } + ldout(oc.cct, 0) << "ERROR: check_tags on obj=" << oc.obj << " returned ret=" << ret << dendl; + return false; + } + + return !skip; + }; +}; + +class LCOpAction_CurrentExpiration : public LCOpAction { +public: + bool check(lc_op_ctx& oc, ceph::real_time *exp_time) override { + auto& o = oc.o; + if (!o.is_current()) { + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": not current, skipping" << dendl; + return false; + } + if (o.is_delete_marker()) { + if (oc.ol.next_has_same_name()) { + return false; + } else { + *exp_time = real_clock::now(); + return true; + } + } + + auto& mtime = o.meta.mtime; + bool is_expired; + auto& op = oc.op; + if (op.expiration <= 0) { + if (op.expiration_date == boost::none) { + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": no expiration set in rule, skipping" << dendl; + return false; + } + is_expired = ceph_clock_now() >= ceph::real_clock::to_time_t(*op.expiration_date); + *exp_time = *op.expiration_date; + } else { + is_expired = obj_has_expired(oc.cct, mtime, op.expiration, exp_time); + } + + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": is_expired=" << (int)is_expired << dendl; + return is_expired; + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r; + if (o.is_delete_marker()) { + r = remove_expired_obj(oc, true); + } else { + r = remove_expired_obj(oc, !oc.bucket_info.versioned()); + } + if (r < 0) { + ldout(oc.cct, 0) << "ERROR: remove_expired_obj " << dendl; + return r; + } + ldout(oc.cct, 2) << 
"DELETED:" << oc.bucket_info.bucket << ":" << o.key << dendl; + return 0; + } +}; + +class LCOpAction_NonCurrentExpiration : public LCOpAction { +public: + bool check(lc_op_ctx& oc, ceph::real_time *exp_time) override { + auto& o = oc.o; + if (o.is_current()) { + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": current version, skipping" << dendl; + return false; + } + + auto mtime = oc.ol.get_prev_obj().meta.mtime; + int expiration = oc.op.noncur_expiration; + bool is_expired = obj_has_expired(oc.cct, mtime, expiration, exp_time); + + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": is_expired=" << is_expired << dendl; + return is_expired && pass_object_lock_check(oc.store, oc.bucket_info, oc.obj, oc.rctx); + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r = remove_expired_obj(oc, true); + if (r < 0) { + ldout(oc.cct, 0) << "ERROR: remove_expired_obj " << dendl; + return r; + } + ldout(oc.cct, 2) << "DELETED:" << oc.bucket_info.bucket << ":" << o.key << " (non-current expiration)" << dendl; + return 0; + } +}; + +class LCOpAction_DMExpiration : public LCOpAction { +public: + bool check(lc_op_ctx& oc, ceph::real_time *exp_time) override { + auto& o = oc.o; + if (!o.is_delete_marker()) { + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": not a delete marker, skipping" << dendl; + return false; + } + + if (oc.ol.next_has_same_name()) { + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": next is same object, skipping" << dendl; + return false; + } + + *exp_time = real_clock::now(); + + return true; + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + int r = remove_expired_obj(oc, true); + if (r < 0) { + ldout(oc.cct, 0) << "ERROR: remove_expired_obj " << dendl; + return r; + } + ldout(oc.cct, 2) << "DELETED:" << oc.bucket_info.bucket << ":" << o.key << " (delete marker expiration)" << dendl; + return 0; + } +}; + +class LCOpAction_Transition : public LCOpAction { + const transition_action& transition; + bool 
need_to_process{false}; + +protected: + virtual bool check_current_state(bool is_current) = 0; + virtual ceph::real_time get_effective_mtime(lc_op_ctx& oc) = 0; +public: + LCOpAction_Transition(const transition_action& _transition) : transition(_transition) {} + + bool check(lc_op_ctx& oc, ceph::real_time *exp_time) override { + auto& o = oc.o; + + if (o.is_delete_marker()) { + return false; + } + + if (!check_current_state(o.is_current())) { + return false; + } + + auto mtime = get_effective_mtime(oc); + bool is_expired; + if (transition.days < 0) { + if (transition.date == boost::none) { + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": no transition day/date set in rule, skipping" << dendl; + return false; + } + is_expired = ceph_clock_now() >= ceph::real_clock::to_time_t(*transition.date); + *exp_time = *transition.date; + } else { + is_expired = obj_has_expired(oc.cct, mtime, transition.days, exp_time); + } + + ldout(oc.cct, 20) << __func__ << "(): key=" << o.key << ": is_expired=" << is_expired << dendl; + + need_to_process = (rgw_placement_rule::get_canonical_storage_class(o.meta.storage_class) != transition.storage_class); + + return is_expired; + } + + bool should_process() override { + return need_to_process; + } + + int process(lc_op_ctx& oc) { + auto& o = oc.o; + + rgw_placement_rule target_placement; + target_placement.inherit_from(oc.bucket_info.placement_rule); + target_placement.storage_class = transition.storage_class; + + if (!oc.store->svc.zone->get_zone_params().valid_placement(target_placement)) { + ldout(oc.cct, 0) << "ERROR: non existent dest placement: " << target_placement + << " bucket="<< oc.bucket_info.bucket + << " rule_id=" << oc.op.id << dendl; + return -EINVAL; + } + + int r = oc.store->transition_obj(oc.rctx, oc.bucket_info, oc.obj, + target_placement, o.meta.mtime, o.versioned_epoch); + if (r < 0) { + ldout(oc.cct, 0) << "ERROR: failed to transition obj (r=" << r << ")" << dendl; + return r; + } + ldout(oc.cct, 2) << 
"TRANSITIONED:" << oc.bucket_info.bucket << ":" << o.key << " -> " << transition.storage_class << dendl; + return 0; + } +}; + +class LCOpAction_CurrentTransition : public LCOpAction_Transition { +protected: + bool check_current_state(bool is_current) override { + return is_current; + } + + ceph::real_time get_effective_mtime(lc_op_ctx& oc) override { + return oc.o.meta.mtime; + } +public: + LCOpAction_CurrentTransition(const transition_action& _transition) : LCOpAction_Transition(_transition) {} +}; + +class LCOpAction_NonCurrentTransition : public LCOpAction_Transition { +protected: + bool check_current_state(bool is_current) override { + return !is_current; + } + + ceph::real_time get_effective_mtime(lc_op_ctx& oc) override { + return oc.ol.get_prev_obj().meta.mtime; + } +public: + LCOpAction_NonCurrentTransition(const transition_action& _transition) : LCOpAction_Transition(_transition) {} +}; + +void LCOpRule::build() +{ + filters.emplace_back(new LCOpFilter_Tags); + + auto& op = env.op; + + if (op.expiration > 0 || + op.expiration_date != boost::none) { + actions.emplace_back(new LCOpAction_CurrentExpiration); + } + + if (op.dm_expiration) { + actions.emplace_back(new LCOpAction_DMExpiration); + } + + if (op.noncur_expiration > 0) { + actions.emplace_back(new LCOpAction_NonCurrentExpiration); + } + + for (auto& iter : op.transitions) { + actions.emplace_back(new LCOpAction_CurrentTransition(iter.second)); + } + + for (auto& iter : op.noncur_transitions) { + actions.emplace_back(new LCOpAction_NonCurrentTransition(iter.second)); + } +} + +int LCOpRule::process(rgw_bucket_dir_entry& o) +{ + lc_op_ctx ctx(env, o); + + unique_ptr *selected = nullptr; + real_time exp; + + for (auto& a : actions) { + real_time action_exp; + + if (a->check(ctx, &action_exp)) { + if (action_exp > exp) { + exp = action_exp; + selected = &a; + } + } + } + + if (selected && + (*selected)->should_process()) { + + /* + * Calling filter checks after action checks because + * all action 
checks (as they are implemented now) do + * not access the objects themselves, but return result + * from info from bucket index listing. The current tags filter + * check does access the objects, so we avoid unnecessary rados calls + * having filters check later in the process. + */ + + bool cont = false; + for (auto& f : filters) { + if (f->check(ctx)) { + cont = true; + break; + } + } + + if (!cont) { + ldout(env.store->ctx(), 20) << __func__ << "(): key=" << o.key << ": no rule match, skipping" << dendl; + return 0; + } + + int r = (*selected)->process(ctx); + if (r < 0) { + ldout(ctx.cct, 0) << "ERROR: remove_expired_obj " << dendl; + return r; + } + ldout(ctx.cct, 20) << "processed:" << env.bucket_info.bucket << ":" << o.key << dendl; + } + + return 0; + +} + +int RGWLC::bucket_lc_process(string& shard_id) +{ + RGWLifecycleConfiguration config(cct); + RGWBucketInfo bucket_info; + map bucket_attrs; + string no_ns, list_versions; + vector objs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + vector result; + boost::split(result, shard_id, boost::is_any_of(":")); + string bucket_tenant = result[0]; + string bucket_name = result[1]; + string bucket_marker = result[2]; + int ret = store->get_bucket_info(obj_ctx, bucket_tenant, bucket_name, bucket_info, NULL, &bucket_attrs); + if (ret < 0) { + ldpp_dout(this, 0) << "LC:get_bucket_info for " << bucket_name << " failed" << dendl; + return ret; + } + + if (bucket_info.bucket.marker != bucket_marker) { + ldpp_dout(this, 1) << "LC: deleting stale entry found for bucket=" << bucket_tenant + << ":" << bucket_name << " cur_marker=" << bucket_info.bucket.marker + << " orig_marker=" << bucket_marker << dendl; + return -ENOENT; + } + + RGWRados::Bucket target(store, bucket_info); + + map::iterator aiter = bucket_attrs.find(RGW_ATTR_LC); + if (aiter == bucket_attrs.end()) + return 0; + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(this, 
0) << __func__ << "() decode life cycle config failed" << dendl; + return -1; + } + + multimap& prefix_map = config.get_prefix_map(); + + ldpp_dout(this, 10) << __func__ << "() prefix_map size=" + << prefix_map.size() + << dendl; + + rgw_obj_key pre_marker; + rgw_obj_key next_marker; + for(auto prefix_iter = prefix_map.begin(); prefix_iter != prefix_map.end(); ++prefix_iter) { + auto& op = prefix_iter->second; + if (!is_valid_op(op)) { + continue; + } + ldpp_dout(this, 20) << __func__ << "(): prefix=" << prefix_iter->first << dendl; + if (prefix_iter != prefix_map.begin() && + (prefix_iter->first.compare(0, prev(prefix_iter)->first.length(), prev(prefix_iter)->first) == 0)) { + next_marker = pre_marker; + } else { + pre_marker = next_marker; + } + + LCObjsLister ol(store, bucket_info); + ol.set_prefix(prefix_iter->first); + + ret = ol.init(); + + if (ret < 0) { + if (ret == (-ENOENT)) + return 0; + ldpp_dout(this, 0) << "ERROR: store->list_objects():" <lc_pool_ctx, obj_names[index], entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::bucket_lc_post() failed to remove entry " + << obj_names[index] << dendl; + } + goto clean; + } else if (result < 0) { + entry.second = lc_failed; + } else { + entry.second = lc_complete; + } + + ret = cls_rgw_lc_set_entry(store->lc_pool_ctx, obj_names[index], entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on " + << obj_names[index] << dendl; + } +clean: + l.unlock(&store->lc_pool_ctx, obj_names[index]); + ldpp_dout(this, 20) << "RGWLC::bucket_lc_post() unlock " << obj_names[index] << dendl; + return 0; + } while (true); +} + +int RGWLC::list_lc_progress(const string& marker, uint32_t max_entries, map *progress_map) +{ + int index = 0; + progress_map->clear(); + for(; index entries; + int ret = cls_rgw_lc_list(store->lc_pool_ctx, obj_names[index], marker, max_entries, entries); + if (ret < 0) { + if (ret == -ENOENT) { + ldpp_dout(this, 10) << __func__ << "() ignoring unfound lc object=" + << 
obj_names[index] << dendl; + continue; + } else { + return ret; + } + } + map::iterator iter; + for (iter = entries.begin(); iter != entries.end(); ++iter) { + progress_map->insert(*iter); + } + } + return 0; +} + +int RGWLC::process() +{ + int max_secs = cct->_conf->rgw_lc_lock_max_time; + + const int start = ceph::util::generate_random_number(0, max_objs - 1); + + for (int i = 0; i < max_objs; i++) { + int index = (i + start) % max_objs; + int ret = process(index, max_secs); + if (ret < 0) + return ret; + } + + return 0; +} + +int RGWLC::process(int index, int max_lock_secs) +{ + rados::cls::lock::Lock l(lc_index_lock_name); + do { + utime_t now = ceph_clock_now(); + pair entry;//string = bucket_name:bucket_id ,int = LC_BUCKET_STATUS + if (max_lock_secs <= 0) + return -EAGAIN; + + utime_t time(max_lock_secs, 0); + l.set_duration(time); + + int ret = l.lock_exclusive(&store->lc_pool_ctx, obj_names[index]); + if (ret == -EBUSY || ret == -EEXIST) { /* already locked by another lc processor */ + ldpp_dout(this, 0) << "RGWLC::process() failed to acquire lock on " + << obj_names[index] << ", sleep 5, try again" << dendl; + sleep(5); + continue; + } + if (ret < 0) + return 0; + + cls_rgw_lc_obj_head head; + ret = cls_rgw_lc_get_head(store->lc_pool_ctx, obj_names[index], head); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to get obj head " + << obj_names[index] << ", ret=" << ret << dendl; + goto exit; + } + + if(!if_already_run_today(head.start_date)) { + head.start_date = now; + head.marker.clear(); + ret = bucket_lc_prepare(index); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to update lc object " + << obj_names[index] << ", ret=" << ret << dendl; + goto exit; + } + } + + ret = cls_rgw_lc_get_next_entry(store->lc_pool_ctx, obj_names[index], head.marker, entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to get obj entry " + << obj_names[index] << dendl; + goto exit; + } + + if (entry.first.empty()) + 
goto exit; + + entry.second = lc_processing; + ret = cls_rgw_lc_set_entry(store->lc_pool_ctx, obj_names[index], entry); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to set obj entry " << obj_names[index] + << " (" << entry.first << "," << entry.second << ")" << dendl; + goto exit; + } + + head.marker = entry.first; + ret = cls_rgw_lc_put_head(store->lc_pool_ctx, obj_names[index], head); + if (ret < 0) { + ldpp_dout(this, 0) << "RGWLC::process() failed to put head " << obj_names[index] << dendl; + goto exit; + } + l.unlock(&store->lc_pool_ctx, obj_names[index]); + ret = bucket_lc_process(entry.first); + bucket_lc_post(index, max_lock_secs, entry, ret); + }while(1); + +exit: + l.unlock(&store->lc_pool_ctx, obj_names[index]); + return 0; +} + +void RGWLC::start_processor() +{ + worker = new LCWorker(this, cct, this); + worker->create("lifecycle_thr"); +} + +void RGWLC::stop_processor() +{ + down_flag = true; + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = NULL; +} + + +unsigned RGWLC::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWLC::gen_prefix(std::ostream& out) const +{ + return out << "lifecycle: "; +} + +void RGWLC::LCWorker::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} + +bool RGWLC::going_down() +{ + return down_flag; +} + +bool RGWLC::LCWorker::should_work(utime_t& now) +{ + int start_hour; + int start_minute; + int end_hour; + int end_minute; + string worktime = cct->_conf->rgw_lifecycle_work_time; + sscanf(worktime.c_str(),"%d:%d-%d:%d",&start_hour, &start_minute, &end_hour, &end_minute); + struct tm bdt; + time_t tt = now.sec(); + localtime_r(&tt, &bdt); + + if (cct->_conf->rgw_lc_debug_interval > 0) { + /* We're debugging, so say we can run */ + return true; + } else if ((bdt.tm_hour*60 + bdt.tm_min >= start_hour*60 + start_minute) && + (bdt.tm_hour*60 + bdt.tm_min <= end_hour*60 + end_minute)) { + return true; + } else { + return false; + } + +} + +int 
RGWLC::LCWorker::schedule_next_start_time(utime_t &start, utime_t& now) +{ + int secs; + + if (cct->_conf->rgw_lc_debug_interval > 0) { + secs = start + cct->_conf->rgw_lc_debug_interval - now; + if (secs < 0) + secs = 0; + return (secs); + } + + int start_hour; + int start_minute; + int end_hour; + int end_minute; + string worktime = cct->_conf->rgw_lifecycle_work_time; + sscanf(worktime.c_str(),"%d:%d-%d:%d",&start_hour, &start_minute, &end_hour, &end_minute); + struct tm bdt; + time_t tt = now.sec(); + time_t nt; + localtime_r(&tt, &bdt); + bdt.tm_hour = start_hour; + bdt.tm_min = start_minute; + bdt.tm_sec = 0; + nt = mktime(&bdt); + secs = nt - tt; + + return secs>0 ? secs : secs+24*60*60; +} + +void RGWLifecycleConfiguration::generate_test_instances(list& o) +{ + o.push_back(new RGWLifecycleConfiguration); +} + +void get_lc_oid(CephContext *cct, const string& shard_id, string *oid) +{ + int max_objs = (cct->_conf->rgw_lc_max_objs > HASH_PRIME ? HASH_PRIME : cct->_conf->rgw_lc_max_objs); + int index = ceph_str_hash_linux(shard_id.c_str(), shard_id.size()) % HASH_PRIME % max_objs; + *oid = lc_oid_prefix; + char buf[32]; + snprintf(buf, 32, ".%d", index); + oid->append(buf); + return; +} + + + +static std::string get_lc_shard_name(const rgw_bucket& bucket){ + return string_join_reserve(':', bucket.tenant, bucket.name, bucket.marker); +} + +template +static int guard_lc_modify(RGWRados* store, const rgw_bucket& bucket, const string& cookie, const F& f) { + CephContext *cct = store->ctx(); + + string shard_id = get_lc_shard_name(bucket); + + string oid; + get_lc_oid(cct, shard_id, &oid); + + pair entry(shard_id, lc_uninitial); + int max_lock_secs = cct->_conf->rgw_lc_lock_max_time; + + rados::cls::lock::Lock l(lc_index_lock_name); + utime_t time(max_lock_secs, 0); + l.set_duration(time); + l.set_cookie(cookie); + + librados::IoCtx *ctx = store->get_lc_pool_ctx(); + int ret; + + do { + ret = l.lock_exclusive(ctx, oid); + if (ret == -EBUSY || ret == -EEXIST) { + 
ldout(cct, 0) << "RGWLC::RGWPutLC() failed to acquire lock on " + << oid << ", sleep 5, try again" << dendl; + sleep(5); // XXX: return retryable error + continue; + } + if (ret < 0) { + ldout(cct, 0) << "RGWLC::RGWPutLC() failed to acquire lock on " + << oid << ", ret=" << ret << dendl; + break; + } + ret = f(ctx, oid, entry); + if (ret < 0) { + ldout(cct, 0) << "RGWLC::RGWPutLC() failed to set entry on " + << oid << ", ret=" << ret << dendl; + } + break; + } while(true); + l.unlock(ctx, oid); + return ret; +} + +int RGWLC::set_bucket_config(RGWBucketInfo& bucket_info, + const map& bucket_attrs, + RGWLifecycleConfiguration *config) +{ + map attrs = bucket_attrs; + bufferlist lc_bl; + config->encode(lc_bl); + + attrs[RGW_ATTR_LC] = std::move(lc_bl); + + int ret = rgw_bucket_set_attrs(store, bucket_info, attrs, &bucket_info.objv_tracker); + if (ret < 0) + return ret; + + rgw_bucket& bucket = bucket_info.bucket; + + + ret = guard_lc_modify(store, bucket, cookie, [&](librados::IoCtx *ctx, const string& oid, + const pair& entry) { + return cls_rgw_lc_set_entry(*ctx, oid, entry); + }); + + return ret; +} + +int RGWLC::remove_bucket_config(RGWBucketInfo& bucket_info, + const map& bucket_attrs) +{ + map attrs = bucket_attrs; + attrs.erase(RGW_ATTR_LC); + int ret = rgw_bucket_set_attrs(store, bucket_info, attrs, + &bucket_info.objv_tracker); + + rgw_bucket& bucket = bucket_info.bucket; + + if (ret < 0) { + ldout(cct, 0) << "RGWLC::RGWDeleteLC() failed to set attrs on bucket=" + << bucket.name << " returned err=" << ret << dendl; + return ret; + } + + + ret = guard_lc_modify(store, bucket, cookie, [&](librados::IoCtx *ctx, const string& oid, + const pair& entry) { + return cls_rgw_lc_rm_entry(*ctx, oid, entry); + }); + + return ret; +} + +namespace rgw::lc { + +int fix_lc_shard_entry(RGWRados* store, const RGWBucketInfo& bucket_info, + const map& battrs) +{ + if (auto aiter = battrs.find(RGW_ATTR_LC); + aiter == battrs.end()) { + return 0; // No entry, nothing to fix + } + 
+ auto shard_name = get_lc_shard_name(bucket_info.bucket); + std::string lc_oid; + get_lc_oid(store->ctx(), shard_name, &lc_oid); + + rgw_lc_entry_t entry; + // There are multiple cases we need to encounter here + // 1. entry exists and is already set to marker, happens in plain buckets & newly resharded buckets + // 2. entry doesn't exist, which usually happens when reshard has happened prior to update and next LC process has already dropped the update + // 3. entry exists matching the current bucket id which was after a reshard (needs to be updated to the marker) + // We are not dropping the old marker here as that would be caught by the next LC process update + auto lc_pool_ctx = store->get_lc_pool_ctx(); + int ret = cls_rgw_lc_get_entry(*lc_pool_ctx, + lc_oid, shard_name, entry); + if (ret == 0) { + ldout(store->ctx(), 5) << "Entry already exists, nothing to do" << dendl; + return ret; // entry is already existing correctly set to marker + } + ldout(store->ctx(), 5) << "cls_rgw_lc_get_entry errored ret code=" << ret << dendl; + if (ret == -ENOENT) { + ldout(store->ctx(), 1) << "No entry for bucket=" << bucket_info.bucket.name + << " creating " << dendl; + // TODO: we have too many ppl making cookies like this! 
+ char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1); + std::string cookie = cookie_buf; + + ret = guard_lc_modify(store, bucket_info.bucket, cookie, + [&lc_pool_ctx, &lc_oid](librados::IoCtx *ctx, const string& oid, + const pair& entry) { + return cls_rgw_lc_set_entry(*lc_pool_ctx, + lc_oid, entry); + }); + + } + + return ret; +} + +std::string s3_expiration_header( + DoutPrefixProvider* dpp, + const rgw_obj_key& obj_key, + const RGWObjTags& obj_tagset, + const ceph::real_time& mtime, + const std::map& bucket_attrs) +{ + CephContext* cct = dpp->get_cct(); + RGWLifecycleConfiguration config(cct); + std::string hdr{""}; + + const auto& aiter = bucket_attrs.find(RGW_ATTR_LC); + if (aiter == bucket_attrs.end()) + return hdr; + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldpp_dout(dpp, 0) << __func__ + << "() decode life cycle config failed" + << dendl; + return hdr; + } /* catch */ + + /* dump tags at debug level 16 */ + RGWObjTags::tag_map_t obj_tag_map = obj_tagset.get_tags(); + if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 16)) { + for (const auto& elt : obj_tag_map) { + ldout(cct, 16) << __func__ + << "() key=" << elt.first << " val=" << elt.second + << dendl; + } + } + + boost::optional expiration_date; + boost::optional rule_id; + + const auto& rule_map = config.get_rule_map(); + for (const auto& ri : rule_map) { + const auto& rule = ri.second; + auto& id = rule.get_id(); + auto& prefix = rule.get_prefix(); + auto& filter = rule.get_filter(); + auto& expiration = rule.get_expiration(); + auto& noncur_expiration = rule.get_noncur_expiration(); + + ldpp_dout(dpp, 10) << "rule: " << ri.first + << " prefix: " << prefix + << " expiration: " + << " date: " << expiration.get_date() + << " days: " << expiration.get_days() + << " noncur_expiration: " + << " date: " << noncur_expiration.get_date() + << " days: " << 
noncur_expiration.get_days() + << dendl; + + /* skip if rule !enabled + * if rule has prefix, skip iff object !match prefix + * if rule has tags, skip iff object !match tags + * note if object is current or non-current, compare accordingly + * if rule has days, construct date expression and save iff older + * than last saved + * if rule has date, convert date expression and save iff older + * than last saved + * if the date accum has a value, format it into hdr + */ + + if (!rule.is_enabled()) + continue; + + if(!prefix.empty()) { + if (!boost::starts_with(obj_key.name, prefix)) + continue; + } + + if (filter.has_tags()) { + bool tag_match = false; + const RGWObjTags& rule_tagset = filter.get_tags(); + for (auto& tag : rule_tagset.get_tags()) { + /* remember, S3 tags are {key,value} tuples */ + tag_match = true; + auto obj_tag = obj_tag_map.find(tag.first); + if (obj_tag == obj_tag_map.end() || obj_tag->second != tag.second) { + ldpp_dout(dpp, 10) << "tag does not match obj_key=" << obj_key + << " rule_id=" << id + << " tag=" << tag + << dendl; + tag_match = false; + break; + } + } + if (! tag_match) + continue; + } + + // compute a uniform expiration date + boost::optional rule_expiration_date; + const LCExpiration& rule_expiration = + (obj_key.instance.empty()) ? expiration : noncur_expiration; + + if (rule_expiration.has_date()) { + rule_expiration_date = + boost::optional( + ceph::from_iso_8601(rule.get_expiration().get_date())); + rule_id = id; + } else { + if (rule_expiration.has_days()) { + rule_expiration_date = + boost::optional( + mtime + make_timespan(double(rule_expiration.get_days())*24*60*60)); + rule_id = id; + } + } + + // update earliest expiration + if (rule_expiration_date) { + if ((! 
expiration_date) || + (*expiration_date < *rule_expiration_date)) { + expiration_date = + boost::optional(rule_expiration_date); + } + } + } + + // cond format header + if (expiration_date && rule_id) { + // Fri, 23 Dec 2012 00:00:00 GMT + char exp_buf[100]; + time_t exp = ceph::real_clock::to_time_t(*expiration_date); + if (std::strftime(exp_buf, sizeof(exp_buf), + "%a, %d %b %Y %T %Z", std::gmtime(&exp))) { + hdr = fmt::format("expiry-date=\"{0}\", rule-id=\"{1}\"", exp_buf, + *rule_id); + } else { + ldpp_dout(dpp, 0) << __func__ << + "() strftime of life cycle expiration header failed" + << dendl; + } + } + + return hdr; + +} /* rgwlc_s3_expiration_header */ + +} /* namespace rgw::lc */ diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h new file mode 100644 index 00000000..6a373502 --- /dev/null +++ b/src/rgw/rgw_lc.h @@ -0,0 +1,539 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_LC_H +#define CEPH_RGW_LC_H + +#include +#include +#include + +#include "common/debug.h" + +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/iso_8601.h" +#include "common/Thread.h" +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_multi.h" +#include "cls/rgw/cls_rgw_types.h" +#include "rgw_tag.h" + +#include +#include + +#define HASH_PRIME 7877 +#define MAX_ID_LEN 255 +static string lc_oid_prefix = "lc"; +static string lc_index_lock_name = "lc_process"; + +extern const char* LC_STATUS[]; + +typedef enum { + lc_uninitial = 0, + lc_processing, + lc_failed, + lc_complete, +} LC_BUCKET_STATUS; + +class LCExpiration +{ +protected: + string days; + //At present only current object has expiration date + string date; +public: + LCExpiration() {} + LCExpiration(const string& _days, const string& _date) : days(_days), date(_date) {} + + void encode(bufferlist& bl) const { + ENCODE_START(3, 2, bl); + encode(days, bl); + 
encode(date, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(days, bl); + if (struct_v >= 3) { + decode(date, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +// static void generate_test_instances(list& o); + void set_days(const string& _days) { days = _days; } + string get_days_str() const { + return days; + } + int get_days() const {return atoi(days.c_str()); } + bool has_days() const { + return !days.empty(); + } + void set_date(const string& _date) { date = _date; } + string get_date() const { + return date; + } + bool has_date() const { + return !date.empty(); + } + bool empty() const { + return days.empty() && date.empty(); + } + bool valid() const { + if (!days.empty() && !date.empty()) { + return false; + } else if (!days.empty() && get_days() <= 0) { + return false; + } + //We've checked date in xml parsing + return true; + } +}; +WRITE_CLASS_ENCODER(LCExpiration) + +class LCTransition +{ +protected: + string days; + string date; + string storage_class; + +public: + int get_days() const { + return atoi(days.c_str()); + } + + string get_date() const { + return date; + } + + string get_storage_class() const { + return storage_class; + } + + bool has_days() const { + return !days.empty(); + } + + bool has_date() const { + return !date.empty(); + } + + bool empty() const { + return days.empty() && date.empty(); + } + + bool valid() const { + if (!days.empty() && !date.empty()) { + return false; + } else if (!days.empty() && get_days() < 0) { + return false; + } + //We've checked date in xml parsing + return true; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(days, bl); + encode(date, bl); + encode(storage_class, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(days, bl); + decode(date, bl); + decode(storage_class, bl); + DECODE_FINISH(bl); + } + void 
dump(Formatter *f) const { + f->dump_string("days", days); + f->dump_string("date", date); + f->dump_string("storage_class", storage_class); + } +}; +WRITE_CLASS_ENCODER(LCTransition) + +class LCFilter +{ + protected: + std::string prefix; + RGWObjTags obj_tags; + + public: + + const std::string& get_prefix() const { + return prefix; + } + + const RGWObjTags& get_tags() const { + return obj_tags; + } + + bool empty() const { + return !(has_prefix() || has_tags()); + } + + // Determine if we need AND tag when creating xml + bool has_multi_condition() const { + if (obj_tags.count() > 1) + return true; + else if (has_prefix() && has_tags()) + return true; + + return false; + } + + bool has_prefix() const { + return !prefix.empty(); + } + + bool has_tags() const { + return !obj_tags.empty(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(prefix, bl); + encode(obj_tags, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(prefix, bl); + if (struct_v >= 2) { + decode(obj_tags, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(LCFilter) + +class LCRule +{ +protected: + string id; + string prefix; + string status; + LCExpiration expiration; + LCExpiration noncur_expiration; + LCExpiration mp_expiration; + LCFilter filter; + map transitions; + map noncur_transitions; + bool dm_expiration = false; + +public: + + LCRule(){}; + ~LCRule(){}; + + const string& get_id() const { + return id; + } + + const string& get_status() const { + return status; + } + + bool is_enabled() const { + return status == "Enabled"; + } + + void set_enabled(bool flag) { + status = (flag ? 
"Enabled" : "Disabled"); + } + + const string& get_prefix() const { + return prefix; + } + + const LCFilter& get_filter() const { + return filter; + } + + const LCExpiration& get_expiration() const { + return expiration; + } + + const LCExpiration& get_noncur_expiration() const { + return noncur_expiration; + } + + const LCExpiration& get_mp_expiration() const { + return mp_expiration; + } + + bool get_dm_expiration() const { + return dm_expiration; + } + + const map& get_transitions() const { + return transitions; + } + + const map& get_noncur_transitions() const { + return noncur_transitions; + } + + void set_id(const string& _id) { + id = _id; + } + + void set_prefix(const string& _prefix) { + prefix = _prefix; + } + + void set_status(const string& _status) { + status = _status; + } + + void set_expiration(const LCExpiration& _expiration) { + expiration = _expiration; + } + + void set_noncur_expiration(const LCExpiration& _noncur_expiration) { + noncur_expiration = _noncur_expiration; + } + + void set_mp_expiration(const LCExpiration& _mp_expiration) { + mp_expiration = _mp_expiration; + } + + void set_dm_expiration(bool _dm_expiration) { + dm_expiration = _dm_expiration; + } + + bool add_transition(const LCTransition& _transition) { + auto ret = transitions.emplace(_transition.get_storage_class(), _transition); + return ret.second; + } + + bool add_noncur_transition(const LCTransition& _noncur_transition) { + auto ret = noncur_transitions.emplace(_noncur_transition.get_storage_class(), _noncur_transition); + return ret.second; + } + + bool valid() const; + + void encode(bufferlist& bl) const { + ENCODE_START(6, 1, bl); + encode(id, bl); + encode(prefix, bl); + encode(status, bl); + encode(expiration, bl); + encode(noncur_expiration, bl); + encode(mp_expiration, bl); + encode(dm_expiration, bl); + encode(filter, bl); + encode(transitions, bl); + encode(noncur_transitions, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + 
DECODE_START_LEGACY_COMPAT_LEN(6, 1, 1, bl); + decode(id, bl); + decode(prefix, bl); + decode(status, bl); + decode(expiration, bl); + if (struct_v >=2) { + decode(noncur_expiration, bl); + } + if (struct_v >= 3) { + decode(mp_expiration, bl); + } + if (struct_v >= 4) { + decode(dm_expiration, bl); + } + if (struct_v >= 5) { + decode(filter, bl); + } + if (struct_v >= 6) { + decode(transitions, bl); + decode(noncur_transitions, bl); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + + void init_simple_days_rule(std::string_view _id, std::string_view _prefix, int num_days); +}; +WRITE_CLASS_ENCODER(LCRule) + +struct transition_action +{ + int days; + boost::optional date; + string storage_class; + transition_action() : days(0) {} + void dump(Formatter *f) const { + if (!date) { + f->dump_int("days", days); + } else { + utime_t ut(*date); + f->dump_stream("date") << ut; + } + } +}; + +/* XXX why not LCRule? */ +struct lc_op +{ + string id; + bool status{false}; + bool dm_expiration{false}; + int expiration{0}; + int noncur_expiration{0}; + int mp_expiration{0}; + boost::optional expiration_date; + boost::optional obj_tags; + map transitions; + map noncur_transitions; + + /* ctors are nice */ + lc_op() = delete; + + lc_op(const std::string id) : id(id) + {} + + void dump(Formatter *f) const; +}; + +class RGWLifecycleConfiguration +{ +protected: + CephContext *cct; + multimap prefix_map; + multimap rule_map; + bool _add_rule(const LCRule& rule); + bool has_same_action(const lc_op& first, const lc_op& second); +public: + explicit RGWLifecycleConfiguration(CephContext *_cct) : cct(_cct) {} + RGWLifecycleConfiguration() : cct(NULL) {} + + void set_ctx(CephContext *ctx) { + cct = ctx; + } + + virtual ~RGWLifecycleConfiguration() {} + +// int get_perm(string& id, int perm_mask); +// int get_group_perm(ACLGroupTypeEnum group, int perm_mask); + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rule_map, bl); + ENCODE_FINISH(bl); + } + void 
decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + decode(rule_map, bl); + multimap::iterator iter; + for (iter = rule_map.begin(); iter != rule_map.end(); ++iter) { + LCRule& rule = iter->second; + _add_rule(rule); + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + void add_rule(const LCRule& rule); + + int check_and_add_rule(const LCRule& rule); + + bool valid(); + + multimap& get_rule_map() { return rule_map; } + multimap& get_prefix_map() { return prefix_map; } +/* + void create_default(string id, string name) { + ACLGrant grant; + grant.set_canon(id, name, RGW_PERM_FULL_CONTROL); + add_grant(&grant); + } +*/ +}; +WRITE_CLASS_ENCODER(RGWLifecycleConfiguration) + +class RGWLC : public DoutPrefixProvider { + CephContext *cct; + RGWRados *store; + int max_objs{0}; + string *obj_names{nullptr}; + std::atomic down_flag = { false }; + string cookie; + + class LCWorker : public Thread { + const DoutPrefixProvider *dpp; + CephContext *cct; + RGWLC *lc; + Mutex lock; + Cond cond; + + public: + LCWorker(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWLC *_lc) : dpp(_dpp), cct(_cct), lc(_lc), lock("LCWorker") {} + void *entry() override; + void stop(); + bool should_work(utime_t& now); + int schedule_next_start_time(utime_t& start, utime_t& now); + }; + + public: + LCWorker *worker; + RGWLC() : cct(NULL), store(NULL), worker(NULL) {} + ~RGWLC() { + stop_processor(); + finalize(); + } + + void initialize(CephContext *_cct, RGWRados *_store); + void finalize(); + + int process(); + int process(int index, int max_secs); + bool if_already_run_today(time_t& start_date); + int list_lc_progress(const string& marker, uint32_t max_entries, map *progress_map); + int bucket_lc_prepare(int index); + int bucket_lc_process(string& shard_id); + int bucket_lc_post(int index, int max_lock_sec, pair& entry, int& result); + bool going_down(); + void start_processor(); + void 
stop_processor(); + int set_bucket_config(RGWBucketInfo& bucket_info, + const map& bucket_attrs, + RGWLifecycleConfiguration *config); + int remove_bucket_config(RGWBucketInfo& bucket_info, + const map& bucket_attrs); + + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const; + std::ostream& gen_prefix(std::ostream& out) const; + + private: + + int handle_multipart_expiration(RGWRados::Bucket *target, + const multimap& prefix_map); +}; + +namespace rgw::lc { + +int fix_lc_shard_entry(RGWRados *store, const RGWBucketInfo& bucket_info, + const map& battrs); + +std::string s3_expiration_header( + DoutPrefixProvider* dpp, + const rgw_obj_key& obj_key, + const RGWObjTags& obj_tagset, + const ceph::real_time& mtime, + const std::map& bucket_attrs); + +} // namespace rgw::lc + +#endif diff --git a/src/rgw/rgw_lc_s3.cc b/src/rgw/rgw_lc_s3.cc new file mode 100644 index 00000000..09eb216f --- /dev/null +++ b/src/rgw/rgw_lc_s3.cc @@ -0,0 +1,344 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_user.h" +#include "rgw_lc_s3.h" + + +#define dout_subsys ceph_subsys_rgw + +static bool check_date(const string& _date) +{ + boost::optional date = ceph::from_iso_8601(_date); + if (boost::none == date) { + return false; + } + struct timespec time = ceph::real_clock::to_timespec(*date); + if (time.tv_sec % (24*60*60) || time.tv_nsec) { + return false; + } + return true; +} + +void LCExpiration_S3::dump_xml(Formatter *f) const { + if (dm_expiration) { + encode_xml("ExpiredObjectDeleteMarker", "true", f); + } else if (!days.empty()) { + encode_xml("Days", days, f); + } else { + encode_xml("Date", date, f); + } +} + +void LCExpiration_S3::decode_xml(XMLObj *obj) +{ + bool has_days = RGWXMLDecoder::decode_xml("Days", days, obj); + bool has_date = RGWXMLDecoder::decode_xml("Date", date, obj); + string dm; + bool has_dm = 
RGWXMLDecoder::decode_xml("ExpiredObjectDeleteMarker", dm, obj); + + int num = !!has_days + !!has_date + !!has_dm; + + if (num != 1) { + throw RGWXMLDecoder::err("bad Expiration section"); + } + + if (has_date && !check_date(date)) { + //We need return xml error according to S3 + throw RGWXMLDecoder::err("bad date in Date section"); + } + + if (has_dm) { + dm_expiration = (dm == "true"); + } +} + +void LCNoncurExpiration_S3::decode_xml(XMLObj *obj) +{ + RGWXMLDecoder::decode_xml("NoncurrentDays", days, obj, true); +} + +void LCNoncurExpiration_S3::dump_xml(Formatter *f) const +{ + encode_xml("NoncurrentDays", days, f); +} + +void LCMPExpiration_S3::decode_xml(XMLObj *obj) +{ + RGWXMLDecoder::decode_xml("DaysAfterInitiation", days, obj, true); +} + +void LCMPExpiration_S3::dump_xml(Formatter *f) const +{ + encode_xml("DaysAfterInitiation", days, f); +} + +void RGWLifecycleConfiguration_S3::decode_xml(XMLObj *obj) +{ + if (!cct) { + throw RGWXMLDecoder::err("ERROR: RGWLifecycleConfiguration_S3 can't be decoded without cct initialized"); + } + vector rules; + + RGWXMLDecoder::decode_xml("Rule", rules, obj, true); + + for (auto& rule : rules) { + if (rule.get_id().empty()) { + // S3 generates a 48 bit random ID, maybe we could generate shorter IDs + static constexpr auto LC_ID_LENGTH = 48; + string id = gen_rand_alphanumeric_lower(cct, LC_ID_LENGTH); + rule.set_id(id); + } + + add_rule(rule); + } + + if (cct->_conf->rgw_lc_max_rules < rule_map.size()) { + stringstream ss; + ss << "Warn: The lifecycle config has too many rules, rule number is:" + << rule_map.size() << ", max number is:" << cct->_conf->rgw_lc_max_rules; + throw RGWXMLDecoder::err(ss.str()); + } +} + +void LCFilter_S3::dump_xml(Formatter *f) const +{ + if (has_prefix()) { + encode_xml("Prefix", prefix, f); + } + bool multi = has_multi_condition(); + if (multi) { + f->open_array_section("And"); + } + if (has_tags()) { + const auto& tagset_s3 = static_cast(obj_tags); + tagset_s3.dump_xml(f); + } + if 
(multi) { + f->close_section(); + } +} + +void LCFilter_S3::decode_xml(XMLObj *obj) +{ + XMLObj *o = obj->find_first("And"); + bool single_cond = false; + int num_conditions = 0; + // If there is an AND condition, every tag is a child of and + // else we only support single conditions and return false if we see multiple + + if (o == nullptr){ + o = obj; + single_cond = true; + } + + RGWXMLDecoder::decode_xml("Prefix", prefix, o); + if (!prefix.empty()) + num_conditions++; + auto tags_iter = o->find("Tag"); + obj_tags.clear(); + while (auto tag_xml =tags_iter.get_next()){ + std::string _key,_val; + RGWXMLDecoder::decode_xml("Key", _key, tag_xml); + RGWXMLDecoder::decode_xml("Value", _val, tag_xml); + obj_tags.emplace_tag(std::move(_key), std::move(_val)); + num_conditions++; + } + + if (single_cond && num_conditions > 1) { + throw RGWXMLDecoder::err("Bad filter: badly formed multiple conditions"); + } +} + +void LCTransition_S3::decode_xml(XMLObj *obj) +{ + bool has_days = RGWXMLDecoder::decode_xml("Days", days, obj); + bool has_date = RGWXMLDecoder::decode_xml("Date", date, obj); + if ((has_days && has_date) || (!has_days && !has_date)) { + throw RGWXMLDecoder::err("bad Transition section"); + } + + if (has_date && !check_date(date)) { + //We need return xml error according to S3 + throw RGWXMLDecoder::err("bad Date in Transition section"); + } + + if (!RGWXMLDecoder::decode_xml("StorageClass", storage_class, obj)) { + throw RGWXMLDecoder::err("missing StorageClass in Transition section"); + } +} + +void LCTransition_S3::dump_xml(Formatter *f) const { + if (!days.empty()) { + encode_xml("Days", days, f); + } else { + encode_xml("Date", date, f); + } + encode_xml("StorageClass", storage_class, f); +} + +void LCNoncurTransition_S3::decode_xml(XMLObj *obj) +{ + if (!RGWXMLDecoder::decode_xml("NoncurrentDays", days, obj)) { + throw RGWXMLDecoder::err("missing NoncurrentDays in NoncurrentVersionTransition section"); + } + if (!RGWXMLDecoder::decode_xml("StorageClass", 
storage_class, obj)) { + throw RGWXMLDecoder::err("missing StorageClass in NoncurrentVersionTransition section"); + } +} + +void LCNoncurTransition_S3::dump_xml(Formatter *f) const +{ + encode_xml("NoncurrentDays", days, f); + encode_xml("StorageClass", storage_class, f); +} + +void LCRule_S3::decode_xml(XMLObj *obj) +{ + id.clear(); + prefix.clear(); + status.clear(); + dm_expiration = false; + + RGWXMLDecoder::decode_xml("ID", id, obj); + + LCFilter_S3 filter_s3; + if (!RGWXMLDecoder::decode_xml("Filter", filter_s3, obj)) { + // Ideally the following code should be deprecated and we should return + // False here, The new S3 LC configuration xml spec. makes Filter mandatory + // and Prefix optional. However older clients including boto2 still generate + // xml according to the older spec, where Prefix existed outside of Filter + // and S3 itself seems to be sloppy on enforcing the mandatory Filter + // argument. A day will come when S3 enforces their own xml-spec, but it is + // not this day + + if (!RGWXMLDecoder::decode_xml("Prefix", prefix, obj)) { + throw RGWXMLDecoder::err("missing Prefix in Filter"); + } + } + filter = (LCFilter)filter_s3; + + if (!RGWXMLDecoder::decode_xml("Status", status, obj)) { + throw RGWXMLDecoder::err("missing Status in Filter"); + } + if (status.compare("Enabled") != 0 && status.compare("Disabled") != 0) { + throw RGWXMLDecoder::err("bad Status in Filter"); + } + + LCExpiration_S3 s3_expiration; + LCNoncurExpiration_S3 s3_noncur_expiration; + LCMPExpiration_S3 s3_mp_expiration; + LCFilter_S3 s3_filter; + + bool has_expiration = RGWXMLDecoder::decode_xml("Expiration", s3_expiration, obj); + bool has_noncur_expiration = RGWXMLDecoder::decode_xml("NoncurrentVersionExpiration", s3_noncur_expiration, obj); + bool has_mp_expiration = RGWXMLDecoder::decode_xml("AbortIncompleteMultipartUpload", s3_mp_expiration, obj); + + vector transitions; + vector noncur_transitions; + + bool has_transition = RGWXMLDecoder::decode_xml("Transition", 
transitions, obj); + bool has_noncur_transition = RGWXMLDecoder::decode_xml("NoncurrentVersionTransition", noncur_transitions, obj); + + if (!has_expiration && + !has_noncur_expiration && + !has_mp_expiration && + !has_transition && + !has_noncur_transition) { + throw RGWXMLDecoder::err("bad Rule"); + } + + if (has_expiration) { + if (s3_expiration.has_days() || + s3_expiration.has_date()) { + expiration = s3_expiration; + } else { + dm_expiration = s3_expiration.get_dm_expiration(); + } + } + if (has_noncur_expiration) { + noncur_expiration = s3_noncur_expiration; + } + if (has_mp_expiration) { + mp_expiration = s3_mp_expiration; + } + for (auto& t : transitions) { + if (!add_transition(t)) { + throw RGWXMLDecoder::err("Failed to add transition"); + } + } + for (auto& t : noncur_transitions) { + if (!add_noncur_transition(t)) { + throw RGWXMLDecoder::err("Failed to add non-current version transition"); + } + } +} + +void LCRule_S3::dump_xml(Formatter *f) const { + encode_xml("ID", id, f); + // In case of an empty filter and an empty Prefix, we defer to Prefix. 
+ if (!filter.empty()) { + const LCFilter_S3& lc_filter = static_cast(filter); + encode_xml("Filter", lc_filter, f); + } else { + encode_xml("Prefix", prefix, f); + } + encode_xml("Status", status, f); + if (!expiration.empty() || dm_expiration) { + LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(), dm_expiration); + encode_xml("Expiration", expir, f); + } + if (!noncur_expiration.empty()) { + const LCNoncurExpiration_S3& noncur_expir = static_cast(noncur_expiration); + encode_xml("NoncurrentVersionExpiration", noncur_expir, f); + } + if (!mp_expiration.empty()) { + const LCMPExpiration_S3& mp_expir = static_cast(mp_expiration); + encode_xml("AbortIncompleteMultipartUpload", mp_expir, f); + } + if (!transitions.empty()) { + for (auto &elem : transitions) { + const LCTransition_S3& tran = static_cast(elem.second); + encode_xml("Transition", tran, f); + } + } + if (!noncur_transitions.empty()) { + for (auto &elem : noncur_transitions) { + const LCNoncurTransition_S3& noncur_tran = static_cast(elem.second); + encode_xml("NoncurrentVersionTransition", noncur_tran, f); + } + } +} + +int RGWLifecycleConfiguration_S3::rebuild(RGWRados *store, RGWLifecycleConfiguration& dest) +{ + int ret = 0; + multimap::iterator iter; + for (iter = rule_map.begin(); iter != rule_map.end(); ++iter) { + LCRule& src_rule = iter->second; + ret = dest.check_and_add_rule(src_rule); + if (ret < 0) + return ret; + } + if (!dest.valid()) { + ret = -ERR_INVALID_REQUEST; + } + return ret; +} + + +void RGWLifecycleConfiguration_S3::dump_xml(Formatter *f) const +{ + for (auto iter = rule_map.begin(); iter != rule_map.end(); ++iter) { + const LCRule_S3& rule = static_cast(iter->second); + encode_xml("Rule", rule, f); + } +} + diff --git a/src/rgw/rgw_lc_s3.h b/src/rgw/rgw_lc_s3.h new file mode 100644 index 00000000..214ca54c --- /dev/null +++ b/src/rgw/rgw_lc_s3.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab 
+ +#ifndef CEPH_RGW_LC_S3_H +#define CEPH_RGW_LC_S3_H + +#include +#include +#include +#include + +#include "include/str_list.h" +#include "rgw_lc.h" +#include "rgw_xml.h" +#include "rgw_tag_s3.h" + +class LCFilter_S3 : public LCFilter +{ +public: + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class LCExpiration_S3 : public LCExpiration +{ +private: + bool dm_expiration{false}; +public: + LCExpiration_S3() {} + LCExpiration_S3(string _days, string _date, bool _dm_expiration) : LCExpiration(_days, _date), dm_expiration(_dm_expiration) {} + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); + + void set_dm_expiration(bool _dm_expiration) { + dm_expiration = _dm_expiration; + } + + bool get_dm_expiration() { + return dm_expiration; + } +}; + +class LCNoncurExpiration_S3 : public LCExpiration +{ +public: + LCNoncurExpiration_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +class LCMPExpiration_S3 : public LCExpiration +{ +public: + LCMPExpiration_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +class LCTransition_S3 : public LCTransition +{ +public: + LCTransition_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +class LCNoncurTransition_S3 : public LCTransition +{ +public: + LCNoncurTransition_S3() {} + ~LCNoncurTransition_S3() {} + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + + +class LCRule_S3 : public LCRule +{ +public: + LCRule_S3() {} + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class RGWLifecycleConfiguration_S3 : public RGWLifecycleConfiguration +{ +public: + explicit RGWLifecycleConfiguration_S3(CephContext *_cct) : RGWLifecycleConfiguration(_cct) {} + RGWLifecycleConfiguration_S3() : RGWLifecycleConfiguration(nullptr) {} + + void decode_xml(XMLObj *obj); + int rebuild(RGWRados *store, RGWLifecycleConfiguration& dest); + void dump_xml(Formatter 
*f) const; +}; + +#endif diff --git a/src/rgw/rgw_ldap.cc b/src/rgw/rgw_ldap.cc new file mode 100644 index 00000000..f2009b06 --- /dev/null +++ b/src/rgw/rgw_ldap.cc @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_ldap.h" + +#include "common/ceph_crypto.h" +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/dout.h" +#include "common/safe_io.h" +#include + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +std::string parse_rgw_ldap_bindpw(CephContext* ctx) +{ + string ldap_bindpw; + string ldap_secret = ctx->_conf->rgw_ldap_secret; + + if (ldap_secret.empty()) { + ldout(ctx, 10) + << __func__ << " LDAP auth no rgw_ldap_secret file found in conf" + << dendl; + } else { + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. + char bindpw[1024]; + memset(bindpw, 0, 1024); + int pwlen = safe_read_file("" /* base */, ldap_secret.c_str(), + bindpw, 1023); + if (pwlen > 0) { + ldap_bindpw = bindpw; + boost::algorithm::trim(ldap_bindpw); + if (ldap_bindpw.back() == '\n') + ldap_bindpw.pop_back(); + } + ::ceph::crypto::zeroize_for_security(bindpw, sizeof(bindpw)); + } + + return ldap_bindpw; +} + +#if defined(HAVE_OPENLDAP) +namespace rgw { + + int LDAPHelper::auth(const std::string &uid, const std::string &pwd) { + int ret; + std::string filter; + if (msad) { + filter = "(&(objectClass=user)(sAMAccountName="; + filter += uid; + filter += "))"; + } else { + /* openldap */ + if (searchfilter.empty()) { + /* no search filter provided in config, we construct our own */ + filter = "("; + filter += dnattr; + filter += "="; + filter += uid; + filter += ")"; + } else { + if (searchfilter.find("@USERNAME@") != std::string::npos) { + /* we need to substitute the @USERNAME@ placeholder */ + filter = searchfilter; + filter.replace(searchfilter.find("@USERNAME@"), 
std::string("@USERNAME@").length(), uid); + } else { + /* no placeholder for username, so we need to append our own username filter to the custom searchfilter */ + filter = "(&("; + filter += searchfilter; + filter += ")("; + filter += dnattr; + filter += "="; + filter += uid; + filter += "))"; + } + } + } + ldout(g_ceph_context, 12) + << __func__ << " search filter: " << filter + << dendl; + char *attrs[] = { const_cast(dnattr.c_str()), nullptr }; + LDAPMessage *answer = nullptr, *entry = nullptr; + bool once = true; + + lock_guard guard(mtx); + + retry_bind: + ret = ldap_search_s(ldap, searchdn.c_str(), LDAP_SCOPE_SUBTREE, + filter.c_str(), attrs, 0, &answer); + if (ret == LDAP_SUCCESS) { + entry = ldap_first_entry(ldap, answer); + if (entry) { + char *dn = ldap_get_dn(ldap, entry); + ret = simple_bind(dn, pwd); + if (ret != LDAP_SUCCESS) { + ldout(g_ceph_context, 10) + << __func__ << " simple_bind failed uid=" << uid + << "ldap err=" << ret + << dendl; + } + ldap_memfree(dn); + } else { + ldout(g_ceph_context, 12) + << __func__ << " ldap_search_s no user matching uid=" << uid + << dendl; + ret = LDAP_NO_SUCH_ATTRIBUTE; // fixup result + } + ldap_msgfree(answer); + } else { + ldout(g_ceph_context, 5) + << __func__ << " ldap_search_s error uid=" << uid + << " ldap err=" << ret + << dendl; + /* search should never fail--try to rebind */ + if (once) { + rebind(); + once = false; + goto retry_bind; + } + } + return (ret == LDAP_SUCCESS) ? 
ret : -EACCES; + } /* LDAPHelper::auth */ +} + +#endif /* defined(HAVE_OPENLDAP) */ diff --git a/src/rgw/rgw_ldap.h b/src/rgw/rgw_ldap.h new file mode 100644 index 00000000..aeb5f613 --- /dev/null +++ b/src/rgw/rgw_ldap.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_LDAP_H +#define RGW_LDAP_H + +#include "acconfig.h" + +#if defined(HAVE_OPENLDAP) +#define LDAP_DEPRECATED 1 +#include "ldap.h" +#endif + +#include +#include +#include +#include +#include +#include + +namespace rgw { + +#if defined(HAVE_OPENLDAP) + + class LDAPHelper + { + std::string uri; + std::string binddn; + std::string bindpw; + std::string searchdn; + std::string searchfilter; + std::string dnattr; + LDAP *ldap; + bool msad = false; /* TODO: possible future specialization */ + std::mutex mtx; + + public: + using lock_guard = std::lock_guard; + + LDAPHelper(std::string _uri, std::string _binddn, std::string _bindpw, + const std::string &_searchdn, const std::string &_searchfilter, const std::string &_dnattr) + : uri(std::move(_uri)), binddn(std::move(_binddn)), + bindpw(std::move(_bindpw)), searchdn(_searchdn), searchfilter(_searchfilter), dnattr(_dnattr), + ldap(nullptr) { + // nothing + } + + int init() { + int ret; + ret = ldap_initialize(&ldap, uri.c_str()); + if (ret == LDAP_SUCCESS) { + unsigned long ldap_ver = LDAP_VERSION3; + ret = ldap_set_option(ldap, LDAP_OPT_PROTOCOL_VERSION, + (void*) &ldap_ver); + } + if (ret == LDAP_SUCCESS) { + ret = ldap_set_option(ldap, LDAP_OPT_REFERRALS, LDAP_OPT_OFF); + } + return (ret == LDAP_SUCCESS) ? ret : -EINVAL; + } + + int bind() { + int ret; + ret = ldap_simple_bind_s(ldap, binddn.c_str(), bindpw.c_str()); + return (ret == LDAP_SUCCESS) ? 
ret : -EINVAL; + } + + int rebind() { + if (ldap) { + (void) ldap_unbind(ldap); + (void) init(); + return bind(); + } + return -EINVAL; + } + + int simple_bind(const char *dn, const std::string& pwd) { + LDAP* tldap; + int ret = ldap_initialize(&tldap, uri.c_str()); + if (ret == LDAP_SUCCESS) { + unsigned long ldap_ver = LDAP_VERSION3; + ret = ldap_set_option(tldap, LDAP_OPT_PROTOCOL_VERSION, + (void*) &ldap_ver); + if (ret == LDAP_SUCCESS) { + ret = ldap_simple_bind_s(tldap, dn, pwd.c_str()); + if (ret == LDAP_SUCCESS) { + (void) ldap_unbind(tldap); + } + } + } + return ret; // OpenLDAP client error space + } + + int auth(const std::string &uid, const std::string &pwd); + + ~LDAPHelper() { + if (ldap) + (void) ldap_unbind(ldap); + } + + }; /* LDAPHelper */ + +#else + + class LDAPHelper + { + public: + LDAPHelper(const std::string &_uri, const std::string &_binddn, const std::string &_bindpw, + const std::string &_searchdn, const std::string &_searchfilter, const std::string &_dnattr) + {} + + int init() { + return -ENOTSUP; + } + + int bind() { + return -ENOTSUP; + } + + int auth(const std::string &uid, const std::string &pwd) { + return -EACCES; + } + + ~LDAPHelper() {} + + }; /* LDAPHelper */ + + +#endif /* HAVE_OPENLDAP */ + +} /* namespace rgw */ + +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/dout.h" +#include "common/safe_io.h" +#include + +#include "include/ceph_assert.h" + +std::string parse_rgw_ldap_bindpw(CephContext* ctx); + +#endif /* RGW_LDAP_H */ diff --git a/src/rgw/rgw_lib.h b/src/rgw/rgw_lib.h new file mode 100644 index 00000000..9afd8d13 --- /dev/null +++ b/src/rgw/rgw_lib.h @@ -0,0 +1,225 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_LIB_H +#define RGW_LIB_H + +#include +#include "include/unordered_map.h" +#include "global/global_init.h" +#include "rgw_common.h" +#include "rgw_client_io.h" +#include "rgw_rest.h" +#include 
"rgw_request.h" +#include "rgw_frontend.h" +#include "rgw_process.h" +#include "rgw_rest_s3.h" // RGW_Auth_S3 +#include "rgw_ldap.h" +#include "services/svc_zone_utils.h" +#include "include/ceph_assert.h" + +class OpsLogSocket; + +namespace rgw { + + class RGWLibFrontend; + + class RGWLib { + RGWFrontendConfig* fec; + RGWLibFrontend* fe; + OpsLogSocket* olog; + rgw::LDAPHelper* ldh{nullptr}; + RGWREST rest; // XXX needed for RGWProcessEnv + RGWRados* store; + boost::intrusive_ptr cct; + + public: + RGWLib() : fec(nullptr), fe(nullptr), olog(nullptr), store(nullptr) + {} + ~RGWLib() {} + + RGWRados* get_store() { return store; } + + RGWLibFrontend* get_fe() { return fe; } + + rgw::LDAPHelper* get_ldh() { return ldh; } + + int init(); + int init(vector& args); + int stop(); + }; + + extern RGWLib rgwlib; + +/* request interface */ + + class RGWLibIO : public rgw::io::BasicClient, + public rgw::io::Accounter + { + RGWUserInfo user_info; + RGWEnv env; + public: + RGWLibIO() { + get_env().set("HTTP_HOST", ""); + } + explicit RGWLibIO(const RGWUserInfo &_user_info) + : user_info(_user_info) {} + + int init_env(CephContext *cct) override { + env.init(cct); + return 0; + } + + const RGWUserInfo& get_user() { + return user_info; + } + + int set_uid(RGWRados* store, const rgw_user& uid); + + int write_data(const char *buf, int len); + int read_data(char *buf, int len); + int send_status(int status, const char *status_name); + int send_100_continue(); + int complete_header(); + int send_content_length(uint64_t len); + + RGWEnv& get_env() noexcept override { + return env; + } + + size_t complete_request() override { /* XXX */ + return 0; + }; + + void set_account(bool) override { + return; + } + + uint64_t get_bytes_sent() const override { + return 0; + } + + uint64_t get_bytes_received() const override { + return 0; + } + + }; /* RGWLibIO */ + +/* XXX */ + class RGWRESTMgr_Lib : public RGWRESTMgr { + public: + RGWRESTMgr_Lib() {} + ~RGWRESTMgr_Lib() override {} + }; /* 
RGWRESTMgr_Lib */ + +/* XXX */ + class RGWHandler_Lib : public RGWHandler { + friend class RGWRESTMgr_Lib; + public: + + int authorize(const DoutPrefixProvider *dpp) override; + + RGWHandler_Lib() {} + ~RGWHandler_Lib() override {} + static int init_from_header(struct req_state *s); + }; /* RGWHandler_Lib */ + + class RGWLibRequest : public RGWRequest, + public RGWHandler_Lib { + public: + CephContext* cct; + RGWUserInfo* user; + boost::optional sysobj_ctx; + + /* unambiguiously return req_state */ + inline struct req_state* get_state() { return this->RGWRequest::s; } + + RGWLibRequest(CephContext* _cct, RGWUserInfo* _user) + : RGWRequest(rgwlib.get_store()->get_new_req_id()), cct(_cct), + user(_user) + {} + + RGWUserInfo* get_user() { return user; } + + int postauth_init() override { return 0; } + + /* descendant equivalent of *REST*::init_from_header(...): + * prepare request for execute()--should mean, fixup URI-alikes + * and any other expected stat vars in local req_state, for + * now */ + virtual int header_init() = 0; + + /* descendant initializer responsible to call RGWOp::init()--which + * descendants are required to inherit */ + virtual int op_init() = 0; + + using RGWHandler::init; + + int init(const RGWEnv& rgw_env, RGWObjectCtx* rados_ctx, + RGWLibIO* io, struct req_state* _s) { + + RGWRequest::init_state(_s); + RGWHandler::init(rados_ctx->get_store(), _s, io); + + sysobj_ctx.emplace(store->svc.sysobj); + + get_state()->obj_ctx = rados_ctx; + get_state()->sysobj_ctx = &(sysobj_ctx.get()); + get_state()->req_id = store->svc.zone_utils->unique_id(id); + get_state()->trans_id = store->svc.zone_utils->unique_trans_id(id); + + ldpp_dout(_s, 2) << "initializing for trans_id = " + << get_state()->trans_id.c_str() << dendl; + + int ret = header_init(); + if (ret == 0) { + ret = init_from_header(_s); + } + return ret; + } + + virtual bool only_bucket() = 0; + + int read_permissions(RGWOp *op) override; + + }; /* RGWLibRequest */ + + class RGWLibContinuedReq : 
public RGWLibRequest { + RGWLibIO io_ctx; + struct req_state rstate; + RGWObjectCtx rados_ctx; + public: + + RGWLibContinuedReq(CephContext* _cct, RGWUserInfo* _user) + : RGWLibRequest(_cct, _user), io_ctx(), + rstate(_cct, &io_ctx.get_env(), _user, id), + rados_ctx(rgwlib.get_store(), &rstate) + { + io_ctx.init(_cct); + + RGWRequest::init_state(&rstate); + RGWHandler::init(rados_ctx.get_store(), &rstate, &io_ctx); + + sysobj_ctx.emplace(store->svc.sysobj); + + get_state()->obj_ctx = &rados_ctx; + get_state()->sysobj_ctx = &(sysobj_ctx.get()); + get_state()->req_id = store->svc.zone_utils->unique_id(id); + get_state()->trans_id = store->svc.zone_utils->unique_trans_id(id); + + ldpp_dout(get_state(), 2) << "initializing for trans_id = " + << get_state()->trans_id.c_str() << dendl; + } + + inline RGWRados* get_store() { return store; } + + virtual int execute() final { ceph_abort(); } + virtual int exec_start() = 0; + virtual int exec_continue() = 0; + virtual int exec_finish() = 0; + + }; /* RGWLibContinuedReq */ + +} /* namespace rgw */ + +#endif /* RGW_LIB_H */ diff --git a/src/rgw/rgw_lib_frontend.h b/src/rgw/rgw_lib_frontend.h new file mode 100644 index 00000000..ec4ede04 --- /dev/null +++ b/src/rgw/rgw_lib_frontend.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_LIB_FRONTEND_H +#define RGW_LIB_FRONTEND_H + +#include + +#include "rgw_lib.h" +#include "rgw_file.h" + +namespace rgw { + + class RGWLibProcess : public RGWProcess { + RGWAccessKey access_key; + std::mutex mtx; + std::condition_variable cv; + int gen; + bool shutdown; + + typedef flat_map FSMAP; + FSMAP mounted_fs; + + using lock_guard = std::lock_guard; + using unique_lock = std::unique_lock; + + public: + RGWLibProcess(CephContext* cct, RGWProcessEnv* pe, int num_threads, + RGWFrontendConfig* _conf) : + RGWProcess(cct, pe, num_threads, _conf), gen(0), shutdown(false) {} + + void run() override; + void checkpoint(); 
+ + void stop() { + shutdown = true; + for (const auto& fs: mounted_fs) { + fs.second->stop(); + } + cv.notify_all(); + } + + void register_fs(RGWLibFS* fs) { + lock_guard guard(mtx); + mounted_fs.insert(FSMAP::value_type(fs, fs)); + ++gen; + } + + void unregister_fs(RGWLibFS* fs) { + lock_guard guard(mtx); + FSMAP::iterator it = mounted_fs.find(fs); + if (it != mounted_fs.end()) { + mounted_fs.erase(it); + ++gen; + } + } + + void enqueue_req(RGWLibRequest* req) { + + lsubdout(g_ceph_context, rgw, 10) + << __func__ << " enqueue request req=" + << hex << req << dec << dendl; + + req_throttle.get(1); + req_wq.queue(req); + } /* enqueue_req */ + + /* "regular" requests */ + void handle_request(RGWRequest* req) override; // async handler, deletes req + int process_request(RGWLibRequest* req); + int process_request(RGWLibRequest* req, RGWLibIO* io); + void set_access_key(RGWAccessKey& key) { access_key = key; } + + /* requests w/continue semantics */ + int start_request(RGWLibContinuedReq* req); + int finish_request(RGWLibContinuedReq* req); + }; /* RGWLibProcess */ + + class RGWLibFrontend : public RGWProcessFrontend { + public: + RGWLibFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf) + : RGWProcessFrontend(pe, _conf) {} + + int init() override; + + void stop() override { + RGWProcessFrontend::stop(); + get_process()->stop(); + } + + RGWLibProcess* get_process() { + return static_cast(pprocess); + } + + inline void enqueue_req(RGWLibRequest* req) { + static_cast(pprocess)->enqueue_req(req); // async + } + + inline int execute_req(RGWLibRequest* req) { + return static_cast(pprocess)->process_request(req); // !async + } + + inline int start_req(RGWLibContinuedReq* req) { + return static_cast(pprocess)->start_request(req); + } + + inline int finish_req(RGWLibContinuedReq* req) { + return static_cast(pprocess)->finish_request(req); + } + + }; /* RGWLibFrontend */ + +} /* namespace rgw */ + +#endif /* RGW_LIB_FRONTEND_H */ diff --git a/src/rgw/rgw_loadgen.cc 
b/src/rgw/rgw_loadgen.cc new file mode 100644 index 00000000..e13520dd --- /dev/null +++ b/src/rgw/rgw_loadgen.cc @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include "rgw_loadgen.h" +#include "rgw_auth_s3.h" + + +#define dout_subsys ceph_subsys_rgw + +void RGWLoadGenRequestEnv::set_date(utime_t& tm) +{ + date_str = rgw_to_asctime(tm); +} + +int RGWLoadGenRequestEnv::sign(RGWAccessKey& access_key) +{ + meta_map_t meta_map; + map sub_resources; + + string canonical_header; + string digest; + + rgw_create_s3_canonical_header(request_method.c_str(), + nullptr, /* const char *content_md5 */ + content_type.c_str(), + date_str.c_str(), + meta_map, + meta_map_t{}, + uri.c_str(), + sub_resources, + canonical_header); + + headers["HTTP_DATE"] = date_str; + try { + /* FIXME(rzarzynski): kill the dependency on g_ceph_context. */ + const auto signature = static_cast( + rgw::auth::s3::get_v2_signature(g_ceph_context, canonical_header, + access_key.key)); + headers["HTTP_AUTHORIZATION"] = \ + std::string("AWS ") + access_key.id + ":" + signature; + } catch (int ret) { + return ret; + } + + return 0; +} + +size_t RGWLoadGenIO::write_data(const char* const buf, + const size_t len) +{ + return len; +} + +size_t RGWLoadGenIO::read_data(char* const buf, const size_t len) +{ + const size_t read_len = std::min(left_to_read, + static_cast(len)); + left_to_read -= read_len; + return read_len; +} + +void RGWLoadGenIO::flush() +{ +} + +size_t RGWLoadGenIO::complete_request() +{ + return 0; +} + +int RGWLoadGenIO::init_env(CephContext *cct) +{ + env.init(cct); + + left_to_read = req->content_length; + + char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)req->content_length); + env.set("CONTENT_LENGTH", buf); + + env.set("CONTENT_TYPE", req->content_type.c_str()); + env.set("HTTP_DATE", req->date_str.c_str()); + + for (map::iterator iter = req->headers.begin(); iter != 
req->headers.end(); ++iter) { + env.set(iter->first.c_str(), iter->second.c_str()); + } + + env.set("REQUEST_METHOD", req->request_method.c_str()); + env.set("REQUEST_URI", req->uri.c_str()); + env.set("QUERY_STRING", req->query_string.c_str()); + env.set("SCRIPT_URI", req->uri.c_str()); + + char port_buf[16]; + snprintf(port_buf, sizeof(port_buf), "%d", req->port); + env.set("SERVER_PORT", port_buf); + return 0; +} + +size_t RGWLoadGenIO::send_status(const int status, + const char* const status_name) +{ + return 0; +} + +size_t RGWLoadGenIO::send_100_continue() +{ + return 0; +} + +size_t RGWLoadGenIO::send_header(const boost::string_ref& name, + const boost::string_ref& value) +{ + return 0; +} + +size_t RGWLoadGenIO::complete_header() +{ + return 0; +} + +size_t RGWLoadGenIO::send_content_length(const uint64_t len) +{ + return 0; +} diff --git a/src/rgw/rgw_loadgen.h b/src/rgw/rgw_loadgen.h new file mode 100644 index 00000000..72aace76 --- /dev/null +++ b/src/rgw/rgw_loadgen.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_LOADGEN_H +#define CEPH_RGW_LOADGEN_H + +#include +#include + +#include "rgw_client_io.h" + + +struct RGWLoadGenRequestEnv { + int port; + uint64_t content_length; + std::string content_type; + std::string request_method; + std::string uri; + std::string query_string; + std::string date_str; + + std::map headers; + + RGWLoadGenRequestEnv() + : port(0), + content_length(0) { + } + + void set_date(utime_t& tm); + int sign(RGWAccessKey& access_key); +}; + +/* XXX does RGWLoadGenIO actually want to perform stream/HTTP I/O, + * or (e.g) are these NOOPs? 
*/ +class RGWLoadGenIO : public rgw::io::RestfulClient +{ + uint64_t left_to_read; + RGWLoadGenRequestEnv* req; + RGWEnv env; + + int init_env(CephContext *cct) override; + size_t read_data(char *buf, size_t len); + size_t write_data(const char *buf, size_t len); + +public: + explicit RGWLoadGenIO(RGWLoadGenRequestEnv* const req) + : left_to_read(0), + req(req) { + } + + size_t send_status(int status, const char *status_name) override; + size_t send_100_continue() override; + size_t send_header(const boost::string_ref& name, + const boost::string_ref& value) override; + size_t complete_header() override; + size_t send_content_length(uint64_t len) override; + + size_t recv_body(char* buf, size_t max) override { + return read_data(buf, max); + } + + size_t send_body(const char* buf, size_t len) override { + return write_data(buf, len); + } + + void flush() override; + + RGWEnv& get_env() noexcept override { + return env; + } + + size_t complete_request() override; +}; + +#endif diff --git a/src/rgw/rgw_loadgen_process.cc b/src/rgw/rgw_loadgen_process.cc new file mode 100644 index 00000000..677599f0 --- /dev/null +++ b/src/rgw/rgw_loadgen_process.cc @@ -0,0 +1,149 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" + +#include "rgw_rados.h" +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_loadgen.h" +#include "rgw_client_io.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +extern void signal_shutdown(); + +void RGWLoadGenProcess::checkpoint() +{ + m_tp.drain(&req_wq); +} + +void RGWLoadGenProcess::run() +{ + m_tp.start(); /* start thread pool */ + + int i; + + int num_objs; + + conf->get_val("num_objs", 1000, &num_objs); + + int num_buckets; + conf->get_val("num_buckets", 1, &num_buckets); + + vector buckets(num_buckets); + + std::atomic failed = { false 
}; + + for (i = 0; i < num_buckets; i++) { + buckets[i] = "/loadgen"; + string& bucket = buckets[i]; + append_rand_alpha(cct, bucket, bucket, 16); + + /* first create a bucket */ + gen_request("PUT", bucket, 0, &failed); + checkpoint(); + } + + string *objs = new string[num_objs]; + + if (failed) { + derr << "ERROR: bucket creation failed" << dendl; + goto done; + } + + for (i = 0; i < num_objs; i++) { + char buf[16 + 1]; + gen_rand_alphanumeric(cct, buf, sizeof(buf)); + buf[16] = '\0'; + objs[i] = buckets[i % num_buckets] + "/" + buf; + } + + for (i = 0; i < num_objs; i++) { + gen_request("PUT", objs[i], 4096, &failed); + } + + checkpoint(); + + if (failed) { + derr << "ERROR: bucket creation failed" << dendl; + goto done; + } + + for (i = 0; i < num_objs; i++) { + gen_request("GET", objs[i], 4096, NULL); + } + + checkpoint(); + + for (i = 0; i < num_objs; i++) { + gen_request("DELETE", objs[i], 0, NULL); + } + + checkpoint(); + + for (i = 0; i < num_buckets; i++) { + gen_request("DELETE", buckets[i], 0, NULL); + } + +done: + checkpoint(); + + m_tp.stop(); + + delete[] objs; + + signal_shutdown(); +} /* RGWLoadGenProcess::run() */ + +void RGWLoadGenProcess::gen_request(const string& method, + const string& resource, + int content_length, std::atomic* fail_flag) +{ + RGWLoadGenRequest* req = + new RGWLoadGenRequest(store->get_new_req_id(), method, resource, + content_length, fail_flag); + dout(10) << "allocated request req=" << hex << req << dec << dendl; + req_throttle.get(1); + req_wq.queue(req); +} /* RGWLoadGenProcess::gen_request */ + +void RGWLoadGenProcess::handle_request(RGWRequest* r) +{ + RGWLoadGenRequest* req = static_cast(r); + + RGWLoadGenRequestEnv env; + + utime_t tm = ceph_clock_now(); + + env.port = 80; + env.content_length = req->content_length; + env.content_type = "binary/octet-stream"; + env.request_method = req->method; + env.uri = req->resource; + env.set_date(tm); + env.sign(access_key); + + RGWLoadGenIO real_client_io(&env); + RGWRestfulIO 
/*
 * Expand a strftime-like template into a log object name.
 *
 * Recognized conversions:
 *   %%  literal '%'
 *   %Y  4-digit year          %y  2-digit year
 *   %m  month (01-12)         %d  day of month (01-31)
 *   %H  hour (00-23)          %k  hour (0-23), unpadded
 *   %I  hour (01-12)          %l  hour (1-12), unpadded
 *   %M  minute (00-59)
 *   %i  bucket_id             %n  bucket_name
 *
 * Any other "%x" is copied through unchanged, and a lone '%' at the end
 * of the format is kept literally.
 *
 * @param format       template string
 * @param dt           broken-down time supplying the date/time fields
 * @param bucket_id    text substituted for %i (not modified)
 * @param bucket_name  text substituted for %n
 * @return the expanded name
 */
std::string render_log_object_name(const std::string& format,
                                   struct tm *dt, std::string& bucket_id,
                                   const std::string& bucket_name)
{
  std::string o;
  for (size_t i = 0; i < format.size(); ++i) {
    /* ordinary character, or a '%' with nothing after it */
    if (format[i] != '%' || i + 1 >= format.size()) {
      o += format[i];
      continue;
    }

    const char code = format[++i];

    /* 12-hour clock: 0 -> 12, 1..12 -> 1..12, 13..23 -> 1..11 */
    const int hour12 = ((dt->tm_hour + 11) % 12) + 1;

    char buf[32];
    switch (code) {
    case '%':
      o += '%';
      continue;
    case 'i':
      o += bucket_id;
      continue;
    case 'n':
      o += bucket_name;
      continue;

    case 'Y':
      snprintf(buf, sizeof(buf), "%.4d", dt->tm_year + 1900);
      break;
    case 'y':
      snprintf(buf, sizeof(buf), "%.2d", dt->tm_year % 100);
      break;
    case 'm':
      snprintf(buf, sizeof(buf), "%.2d", dt->tm_mon + 1);
      break;
    case 'd':
      snprintf(buf, sizeof(buf), "%.2d", dt->tm_mday);
      break;
    case 'H':
      snprintf(buf, sizeof(buf), "%.2d", dt->tm_hour);
      break;
    case 'I':
      snprintf(buf, sizeof(buf), "%.2d", hour12);
      break;
    case 'k':
      snprintf(buf, sizeof(buf), "%d", dt->tm_hour);
      break;
    case 'l':
      snprintf(buf, sizeof(buf), "%d", hour12);
      break;
    case 'M':
      snprintf(buf, sizeof(buf), "%.2d", dt->tm_min);
      break;

    default:
      // unknown code: emit it verbatim, including the '%'
      snprintf(buf, sizeof(buf), "%%%c", code);
      break;
    }
    o += buf;
  }
  return o;
}
flush() { + map old_map; + lock.Lock(); + old_map.swap(usage_map); + num_entries = 0; + lock.Unlock(); + + store->log_usage(old_map); + } +}; + +static UsageLogger *usage_logger = NULL; + +void rgw_log_usage_init(CephContext *cct, RGWRados *store) +{ + usage_logger = new UsageLogger(cct, store); +} + +void rgw_log_usage_finalize() +{ + delete usage_logger; + usage_logger = NULL; +} + +static void log_usage(struct req_state *s, const string& op_name) +{ + if (s->system_request) /* don't log system user operations */ + return; + + if (!usage_logger) + return; + + rgw_user user; + rgw_user payer; + string bucket_name; + + bucket_name = s->bucket_name; + + if (!bucket_name.empty()) { + user = s->bucket_owner.get_id(); + if (s->bucket_info.requester_pays) { + payer = s->user->user_id; + } + } else { + user = s->user->user_id; + } + + bool error = s->err.is_err(); + if (error && s->err.http_ret == 404) { + bucket_name = "-"; /* bucket not found, use the invalid '-' as bucket name */ + } + + string u = user.to_str(); + string p = payer.to_str(); + rgw_usage_log_entry entry(u, p, bucket_name); + + uint64_t bytes_sent = ACCOUNTING_IO(s)->get_bytes_sent(); + uint64_t bytes_received = ACCOUNTING_IO(s)->get_bytes_received(); + + rgw_usage_data data(bytes_sent, bytes_received); + + data.ops = 1; + if (!s->is_err()) + data.successful_ops = 1; + + ldout(s->cct, 30) << "log_usage: bucket_name=" << bucket_name + << " tenant=" << s->bucket_tenant + << ", bytes_sent=" << bytes_sent << ", bytes_received=" + << bytes_received << ", success=" << data.successful_ops << dendl; + + entry.add(op_name, data); + + utime_t ts = ceph_clock_now(); + + usage_logger->insert(ts, entry); +} + +void rgw_format_ops_log_entry(struct rgw_log_entry& entry, Formatter *formatter) +{ + formatter->open_object_section("log_entry"); + formatter->dump_string("bucket", entry.bucket); + { + auto t = utime_t{entry.time}; + t.gmtime(formatter->dump_stream("time")); // UTC + 
t.localtime(formatter->dump_stream("time_local")); + } + formatter->dump_string("remote_addr", entry.remote_addr); + string obj_owner = entry.object_owner.to_str(); + if (obj_owner.length()) + formatter->dump_string("object_owner", obj_owner); + formatter->dump_string("user", entry.user); + formatter->dump_string("operation", entry.op); + formatter->dump_string("uri", entry.uri); + formatter->dump_string("http_status", entry.http_status); + formatter->dump_string("error_code", entry.error_code); + formatter->dump_int("bytes_sent", entry.bytes_sent); + formatter->dump_int("bytes_received", entry.bytes_received); + formatter->dump_int("object_size", entry.obj_size); + { + using namespace std::chrono; + uint64_t total_time = duration_cast(entry.total_time).count(); + formatter->dump_int("total_time", total_time); + } + formatter->dump_string("user_agent", entry.user_agent); + formatter->dump_string("referrer", entry.referrer); + if (entry.x_headers.size() > 0) { + formatter->open_array_section("http_x_headers"); + for (const auto& iter: entry.x_headers) { + formatter->open_object_section(iter.first.c_str()); + formatter->dump_string(iter.first.c_str(), iter.second); + formatter->close_section(); + } + formatter->close_section(); + } + formatter->close_section(); +} + +void OpsLogSocket::formatter_to_bl(bufferlist& bl) +{ + stringstream ss; + formatter->flush(ss); + const string& s = ss.str(); + + bl.append(s); +} + +void OpsLogSocket::init_connection(bufferlist& bl) +{ + bl.append("["); +} + +OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog), lock("OpsLogSocket") +{ + formatter = new JSONFormatter; + delim.append(",\n"); +} + +OpsLogSocket::~OpsLogSocket() +{ + delete formatter; +} + +void OpsLogSocket::log(struct rgw_log_entry& entry) +{ + bufferlist bl; + + lock.Lock(); + rgw_format_ops_log_entry(entry, formatter); + formatter_to_bl(bl); + lock.Unlock(); + + append_output(bl); +} + +int rgw_log_op(RGWRados *store, 
RGWREST* const rest, struct req_state *s, + const string& op_name, OpsLogSocket *olog) +{ + struct rgw_log_entry entry; + string bucket_id; + + if (s->enable_usage_log) + log_usage(s, op_name); + + if (!s->enable_ops_log) + return 0; + + if (s->bucket_name.empty()) { + ldout(s->cct, 5) << "nothing to log for operation" << dendl; + return -EINVAL; + } + if (s->err.ret == -ERR_NO_SUCH_BUCKET) { + if (!s->cct->_conf->rgw_log_nonexistent_bucket) { + ldout(s->cct, 5) << "bucket " << s->bucket << " doesn't exist, not logging" << dendl; + return 0; + } + bucket_id = ""; + } else { + bucket_id = s->bucket.bucket_id; + } + rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, entry.bucket); + + if (check_utf8(entry.bucket.c_str(), entry.bucket.size()) != 0) { + ldout(s->cct, 5) << "not logging op on bucket with non-utf8 name" << dendl; + return 0; + } + + if (!s->object.empty()) { + entry.obj = s->object; + } else { + entry.obj = rgw_obj_key("-"); + } + + entry.obj_size = s->obj_size; + + if (s->cct->_conf->rgw_remote_addr_param.length()) + set_param_str(s, s->cct->_conf->rgw_remote_addr_param.c_str(), + entry.remote_addr); + else + set_param_str(s, "REMOTE_ADDR", entry.remote_addr); + set_param_str(s, "HTTP_USER_AGENT", entry.user_agent); + // legacy apps are still using misspelling referer, such as curl -e option + if (s->info.env->exists("HTTP_REFERRER")) + set_param_str(s, "HTTP_REFERRER", entry.referrer); + else + set_param_str(s, "HTTP_REFERER", entry.referrer); + + std::string uri; + if (s->info.env->exists("REQUEST_METHOD")) { + uri.append(s->info.env->get("REQUEST_METHOD")); + uri.append(" "); + } + + if (s->info.env->exists("REQUEST_URI")) { + uri.append(s->info.env->get("REQUEST_URI")); + } + + if (s->info.env->exists("QUERY_STRING")) { + const char* qs = s->info.env->get("QUERY_STRING"); + if(qs && (*qs != '\0')) { + uri.append("?"); + uri.append(qs); + } + } + + if (s->info.env->exists("HTTP_VERSION")) { + uri.append(" "); + uri.append("HTTP/"); + 
uri.append(s->info.env->get("HTTP_VERSION")); + } + + entry.uri = std::move(uri); + + entry.op = op_name; + + /* custom header logging */ + if (rest) { + if (rest->log_x_headers()) { + for (const auto& iter : s->info.env->get_map()) { + if (rest->log_x_header(iter.first)) { + entry.x_headers.insert( + rgw_log_entry::headers_map::value_type(iter.first, iter.second)); + } + } + } + } + + entry.user = s->user->user_id.to_str(); + if (s->object_acl) + entry.object_owner = s->object_acl->get_owner().get_id(); + entry.bucket_owner = s->bucket_owner.get_id(); + + uint64_t bytes_sent = ACCOUNTING_IO(s)->get_bytes_sent(); + uint64_t bytes_received = ACCOUNTING_IO(s)->get_bytes_received(); + + entry.time = s->time; + entry.total_time = s->time_elapsed(); + entry.bytes_sent = bytes_sent; + entry.bytes_received = bytes_received; + if (s->err.http_ret) { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", s->err.http_ret); + entry.http_status = buf; + } else + entry.http_status = "200"; // default + + entry.error_code = s->err.err_code; + entry.bucket_id = bucket_id; + + bufferlist bl; + encode(entry, bl); + + struct tm bdt; + time_t t = req_state::Clock::to_time_t(entry.time); + if (s->cct->_conf->rgw_log_object_name_utc) + gmtime_r(&t, &bdt); + else + localtime_r(&t, &bdt); + + int ret = 0; + + if (s->cct->_conf->rgw_ops_log_rados) { + string oid = render_log_object_name(s->cct->_conf->rgw_log_object_name, &bdt, + s->bucket.bucket_id, entry.bucket); + + rgw_raw_obj obj(store->svc.zone->get_zone_params().log_pool, oid); + + ret = store->append_async(obj, bl.length(), bl); + if (ret == -ENOENT) { + ret = store->create_pool(store->svc.zone->get_zone_params().log_pool); + if (ret < 0) + goto done; + // retry + ret = store->append_async(obj, bl.length(), bl); + } + } + + if (olog) { + olog->log(entry); + } +done: + if (ret < 0) + ldout(s->cct, 0) << "ERROR: failed to log entry" << dendl; + + return ret; +} + diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h new file mode 100644 
index 00000000..9614624d --- /dev/null +++ b/src/rgw/rgw_log.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_LOG_H +#define CEPH_RGW_LOG_H +#include +#include "rgw_common.h" +#include "common/Formatter.h" +#include "common/OutputDataSocket.h" + +class RGWRados; + +struct rgw_log_entry { + + using headers_map = boost::container::flat_map; + using Clock = req_state::Clock; + + rgw_user object_owner; + rgw_user bucket_owner; + string bucket; + Clock::time_point time; + string remote_addr; + string user; + rgw_obj_key obj; + string op; + string uri; + string http_status; + string error_code; + uint64_t bytes_sent; + uint64_t bytes_received; + uint64_t obj_size; + Clock::duration total_time; + string user_agent; + string referrer; + string bucket_id; + headers_map x_headers; + + void encode(bufferlist &bl) const { + ENCODE_START(9, 5, bl); + encode(object_owner.id, bl); + encode(bucket_owner.id, bl); + encode(bucket, bl); + encode(time, bl); + encode(remote_addr, bl); + encode(user, bl); + encode(obj.name, bl); + encode(op, bl); + encode(uri, bl); + encode(http_status, bl); + encode(error_code, bl); + encode(bytes_sent, bl); + encode(obj_size, bl); + encode(total_time, bl); + encode(user_agent, bl); + encode(referrer, bl); + encode(bytes_received, bl); + encode(bucket_id, bl); + encode(obj, bl); + encode(object_owner, bl); + encode(bucket_owner, bl); + encode(x_headers, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &p) { + DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, p); + decode(object_owner.id, p); + if (struct_v > 3) + decode(bucket_owner.id, p); + decode(bucket, p); + decode(time, p); + decode(remote_addr, p); + decode(user, p); + decode(obj.name, p); + decode(op, p); + decode(uri, p); + decode(http_status, p); + decode(error_code, p); + decode(bytes_sent, p); + decode(obj_size, p); + decode(total_time, p); + decode(user_agent, p); + decode(referrer, p); + 
if (struct_v >= 2) + decode(bytes_received, p); + else + bytes_received = 0; + + if (struct_v >= 3) { + if (struct_v <= 5) { + uint64_t id; + decode(id, p); + char buf[32]; + snprintf(buf, sizeof(buf), "%" PRIu64, id); + bucket_id = buf; + } else { + decode(bucket_id, p); + } + } else { + bucket_id = ""; + } + if (struct_v >= 7) { + decode(obj, p); + } + if (struct_v >= 8) { + decode(object_owner, p); + decode(bucket_owner, p); + } + if (struct_v >= 9) { + decode(x_headers, p); + } + DECODE_FINISH(p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(rgw_log_entry) + +class OpsLogSocket : public OutputDataSocket { + Formatter *formatter; + Mutex lock; + + void formatter_to_bl(bufferlist& bl); + +protected: + void init_connection(bufferlist& bl) override; + +public: + OpsLogSocket(CephContext *cct, uint64_t _backlog); + ~OpsLogSocket() override; + + void log(struct rgw_log_entry& entry); +}; + +class RGWREST; + +int rgw_log_op(RGWRados *store, RGWREST* const rest, struct req_state *s, + const string& op_name, OpsLogSocket *olog); +void rgw_log_usage_init(CephContext *cct, RGWRados *store); +void rgw_log_usage_finalize(); +void rgw_format_ops_log_entry(struct rgw_log_entry& entry, + Formatter *formatter); + +#endif /* CEPH_RGW_LOG_H */ + diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc new file mode 100644 index 00000000..c0c43a6e --- /dev/null +++ b/src/rgw/rgw_main.cc @@ -0,0 +1,637 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "global/signal_handler.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/safe_io.h" +#include "common/TracepointProvider.h" +#include "include/compat.h" +#include "include/str_list.h" +#include "include/stringify.h" +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_otp.h" 
+#include "rgw_period_pusher.h" +#include "rgw_realm_reloader.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_swift.h" +#include "rgw_rest_admin.h" +#include "rgw_rest_usage.h" +#include "rgw_rest_user.h" +#include "rgw_rest_bucket.h" +#include "rgw_rest_metadata.h" +#include "rgw_rest_log.h" +#include "rgw_rest_config.h" +#include "rgw_rest_realm.h" +#include "rgw_rest_sts.h" +#include "rgw_swift_auth.h" +#include "rgw_log.h" +#include "rgw_tools.h" +#include "rgw_resolve.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_frontend.h" +#include "rgw_http_client_curl.h" +#include "rgw_perf_counters.h" +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +#include "rgw_amqp.h" +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +#include "rgw_kafka.h" +#endif +#if defined(WITH_RADOSGW_BEAST_FRONTEND) +#include "rgw_asio_frontend.h" +#endif /* WITH_RADOSGW_BEAST_FRONTEND */ + +#include "rgw_dmclock_scheduler_ctx.h" + +#include "services/svc_zone.h" + +#ifdef HAVE_SYS_PRCTL_H +#include +#endif + +#define dout_subsys ceph_subsys_rgw + +namespace { +TracepointProvider::Traits rgw_op_tracepoint_traits("librgw_op_tp.so", + "rgw_op_tracing"); +TracepointProvider::Traits rgw_rados_tracepoint_traits("librgw_rados_tp.so", + "rgw_rados_tracing"); +} + +static sig_t sighandler_alrm; + +class RGWProcess; + +static int signal_fd[2] = {0, 0}; + +void signal_shutdown() +{ + int val = 0; + int ret = write(signal_fd[0], (char *)&val, sizeof(val)); + if (ret < 0) { + derr << "ERROR: " << __func__ << ": write() returned " + << cpp_strerror(errno) << dendl; + } +} + +static void wait_shutdown() +{ + int val; + int r = safe_read_exact(signal_fd[1], &val, sizeof(val)); + if (r < 0) { + derr << "safe_read_exact returned with error" << dendl; + } +} + +static int signal_fd_init() +{ + return socketpair(AF_UNIX, SOCK_STREAM, 0, signal_fd); +} + +static void signal_fd_finalize() +{ + close(signal_fd[0]); + close(signal_fd[1]); +} + +static void handle_sigterm(int signum) +{ + 
dout(1) << __func__ << dendl; +#if defined(WITH_RADOSGW_FCGI_FRONTEND) + FCGX_ShutdownPending(); +#endif + + // send a signal to make fcgi's accept(2) wake up. unfortunately the + // initial signal often isn't sufficient because we race with accept's + // check of the flag wet by ShutdownPending() above. + if (signum != SIGUSR1) { + signal_shutdown(); + + // safety net in case we get stuck doing an orderly shutdown. + uint64_t secs = g_ceph_context->_conf->rgw_exit_timeout_secs; + if (secs) + alarm(secs); + dout(1) << __func__ << " set alarm for " << secs << dendl; + } + +} + +static void godown_alarm(int signum) +{ + _exit(0); +} + + +class C_InitTimeout : public Context { +public: + C_InitTimeout() {} + void finish(int r) override { + derr << "Initialization timeout, failed to initialize" << dendl; + exit(1); + } +}; + +static int usage() +{ + cout << "usage: radosgw [options...]" << std::endl; + cout << "options:\n"; + cout << " --rgw-region= region in which radosgw runs\n"; + cout << " --rgw-zone= zone in which radosgw runs\n"; + cout << " --rgw-socket-path= specify a unix domain socket path\n"; + cout << " -m monaddress[:port] connect to specified monitor\n"; + cout << " --keyring= path to radosgw keyring\n"; + cout << " --logfile= file to log debug output\n"; + cout << " --debug-rgw=/ set radosgw debug level\n"; + generic_server_usage(); + + return 0; +} + +static RGWRESTMgr *set_logging(RGWRESTMgr *mgr) +{ + mgr->set_logging(true); + return mgr; +} + +static RGWRESTMgr *rest_filter(RGWRados *store, int dialect, RGWRESTMgr *orig) +{ + RGWSyncModuleInstanceRef sync_module = store->get_sync_module(); + if (sync_module) { + return sync_module->get_rest_filter(dialect, orig); + } else { + return orig; + } +} + +/* + * start up the RADOS connection and then handle HTTP messages as they come in + */ +int main(int argc, const char **argv) +{ + // dout() messages will be sent to stderr, but FCGX wants messages on stdout + // Redirect stderr to stdout. 
+ TEMP_FAILURE_RETRY(close(STDERR_FILENO)); + if (TEMP_FAILURE_RETRY(dup2(STDOUT_FILENO, STDERR_FILENO)) < 0) { + int err = errno; + cout << "failed to redirect stderr to stdout: " << cpp_strerror(err) + << std::endl; + return ENOSYS; + } + + /* alternative default for module */ + map defaults = { + { "debug_rgw", "1/5" }, + { "keyring", "$rgw_data/keyring" }, + { "objecter_inflight_ops", "24576" } + }; + + vector args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + // First, let's determine which frontends are configured. + int flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS; + global_pre_init( + &defaults, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_DAEMON, + flags); + + list frontends; + g_conf().early_expand_meta(g_conf()->rgw_frontends, &cerr); + get_str_list(g_conf()->rgw_frontends, ",", frontends); + multimap fe_map; + list configs; + if (frontends.empty()) { + frontends.push_back("civetweb"); + } + for (list::iterator iter = frontends.begin(); iter != frontends.end(); ++iter) { + string& f = *iter; + + if (f.find("civetweb") != string::npos || f.find("beast") != string::npos) { + // If civetweb or beast is configured as a frontend, prevent global_init() from + // dropping permissions by setting the appropriate flag. 
+ flags |= CINIT_FLAG_DEFER_DROP_PRIVILEGES; + if (f.find("port") != string::npos) { + // check for the most common ws problems + if ((f.find("port=") == string::npos) || + (f.find("port= ") != string::npos)) { + derr << "WARNING: radosgw frontend config found unexpected spacing around 'port' " + << "(ensure frontend port parameter has the form 'port=80' with no spaces " + << "before or after '=')" << dendl; + } + } + } + + RGWFrontendConfig *config = new RGWFrontendConfig(f); + int r = config->init(); + if (r < 0) { + delete config; + cerr << "ERROR: failed to init config: " << f << std::endl; + return EINVAL; + } + + configs.push_back(config); + + string framework = config->get_framework(); + fe_map.insert(pair(framework, config)); + } + + // Now that we've determined which frontend(s) to use, continue with global + // initialization. Passing false as the final argument ensures that + // global_pre_init() is not invoked twice. + // claim the reference and release it after subsequent destructors have fired + auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + flags, "rgw_data", false); + + // maintain existing region root pool for new multisite objects + if (!g_conf()->rgw_region_root_pool.empty()) { + const char *root_pool = g_conf()->rgw_region_root_pool.c_str(); + if (g_conf()->rgw_zonegroup_root_pool.empty()) { + g_conf().set_val_or_die("rgw_zonegroup_root_pool", root_pool); + } + if (g_conf()->rgw_period_root_pool.empty()) { + g_conf().set_val_or_die("rgw_period_root_pool", root_pool); + } + if (g_conf()->rgw_realm_root_pool.empty()) { + g_conf().set_val_or_die("rgw_realm_root_pool", root_pool); + } + } + + // for region -> zonegroup conversion (must happen before common_init_finish()) + if (!g_conf()->rgw_region.empty() && g_conf()->rgw_zonegroup.empty()) { + g_conf().set_val_or_die("rgw_zonegroup", g_conf()->rgw_region.c_str()); + } + + if (g_conf()->daemonize) { + global_init_daemonize(g_ceph_context); + } + Mutex 
mutex("main"); + SafeTimer init_timer(g_ceph_context, mutex); + init_timer.init(); + mutex.Lock(); + init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout); + mutex.Unlock(); + + common_init_finish(g_ceph_context); + + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + + TracepointProvider::initialize(g_ceph_context); + TracepointProvider::initialize(g_ceph_context); + + int r = rgw_tools_init(g_ceph_context); + if (r < 0) { + derr << "ERROR: unable to initialize rgw tools" << dendl; + return -r; + } + + rgw_init_resolver(); + rgw::curl::setup_curl(fe_map); + rgw_http_client_init(g_ceph_context); + +#if defined(WITH_RADOSGW_FCGI_FRONTEND) + FCGX_Init(); +#endif + + RGWRados *store = + RGWStoreManager::get_storage(g_ceph_context, + g_conf()->rgw_enable_gc_threads, + g_conf()->rgw_enable_lc_threads, + g_conf()->rgw_enable_quota_threads, + g_conf()->rgw_run_sync_thread, + g_conf().get_val("rgw_dynamic_resharding"), + g_conf()->rgw_cache_enabled); + if (!store) { + mutex.Lock(); + init_timer.cancel_all_events(); + init_timer.shutdown(); + mutex.Unlock(); + + derr << "Couldn't init storage provider (RADOS)" << dendl; + return EIO; + } + r = rgw_perf_start(g_ceph_context); + if (r < 0) { + derr << "ERROR: failed starting rgw perf" << dendl; + return -r; + } + + rgw_rest_init(g_ceph_context, store, store->svc.zone->get_zonegroup()); + + mutex.Lock(); + init_timer.cancel_all_events(); + init_timer.shutdown(); + mutex.Unlock(); + + rgw_user_init(store); + rgw_bucket_init(store->meta_mgr); + rgw_otp_init(store); + rgw_log_usage_init(g_ceph_context, store); + + RGWREST rest; + + list apis; + + get_str_list(g_conf()->rgw_enable_apis, apis); + + map apis_map; + for (list::iterator li = apis.begin(); li != apis.end(); ++li) { + apis_map[*li] = true; + } + + /* warn about insecure keystone secret config options */ + if (!(g_ceph_context->_conf->rgw_keystone_admin_token.empty() || + 
g_ceph_context->_conf->rgw_keystone_admin_password.empty())) { + dout(0) << "WARNING: rgw_keystone_admin_token and rgw_keystone_admin_password should be avoided as they can expose secrets. Prefer the new rgw_keystone_admin_token_path and rgw_keystone_admin_password_path options, which read their secrets from files." << dendl; + } + + // S3 website mode is a specialization of S3 + const bool s3website_enabled = apis_map.count("s3website") > 0; + const bool sts_enabled = apis_map.count("sts") > 0; + const bool iam_enabled = apis_map.count("iam") > 0; + const bool pubsub_enabled = apis_map.count("pubsub") > 0; + // Swift API entrypoint could be placed in the root instead of S3 + const bool swift_at_root = g_conf()->rgw_swift_url_prefix == "/"; + if (apis_map.count("s3") > 0 || s3website_enabled) { + if (! swift_at_root) { + rest.register_default_mgr(set_logging(rest_filter(store, RGW_REST_S3, + new RGWRESTMgr_S3(s3website_enabled, sts_enabled, iam_enabled, pubsub_enabled)))); + } else { + derr << "Cannot have the S3 or S3 Website enabled together with " + << "Swift API placed in the root of hierarchy" << dendl; + return EINVAL; + } + } + + if (pubsub_enabled) { +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + if (!rgw::amqp::init(cct.get())) { + dout(1) << "ERROR: failed to initialize AMQP manager" << dendl; + } +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + if (!rgw::kafka::init(cct.get())) { + dout(1) << "ERROR: failed to initialize Kafka manager" << dendl; + } +#endif + } + + if (apis_map.count("swift") > 0) { + RGWRESTMgr_SWIFT* const swift_resource = new RGWRESTMgr_SWIFT; + + if (! g_conf()->rgw_cross_domain_policy.empty()) { + swift_resource->register_resource("crossdomain.xml", + set_logging(new RGWRESTMgr_SWIFT_CrossDomain)); + } + + swift_resource->register_resource("healthcheck", + set_logging(new RGWRESTMgr_SWIFT_HealthCheck)); + + swift_resource->register_resource("info", + set_logging(new RGWRESTMgr_SWIFT_Info)); + + if (! 
swift_at_root) { + rest.register_resource(g_conf()->rgw_swift_url_prefix, + set_logging(rest_filter(store, RGW_REST_SWIFT, + swift_resource))); + } else { + if (store->svc.zone->get_zonegroup().zones.size() > 1) { + derr << "Placing Swift API in the root of URL hierarchy while running" + << " multi-site configuration requires another instance of RadosGW" + << " with S3 API enabled!" << dendl; + } + + rest.register_default_mgr(set_logging(swift_resource)); + } + } + + if (apis_map.count("swift_auth") > 0) { + rest.register_resource(g_conf()->rgw_swift_auth_entry, + set_logging(new RGWRESTMgr_SWIFT_Auth)); + } + + if (apis_map.count("admin") > 0) { + RGWRESTMgr_Admin *admin_resource = new RGWRESTMgr_Admin; + admin_resource->register_resource("usage", new RGWRESTMgr_Usage); + admin_resource->register_resource("user", new RGWRESTMgr_User); + admin_resource->register_resource("bucket", new RGWRESTMgr_Bucket); + + /*Registering resource for /admin/metadata */ + admin_resource->register_resource("metadata", new RGWRESTMgr_Metadata); + admin_resource->register_resource("log", new RGWRESTMgr_Log); + admin_resource->register_resource("config", new RGWRESTMgr_Config); + admin_resource->register_resource("realm", new RGWRESTMgr_Realm); + rest.register_resource(g_conf()->rgw_admin_entry, admin_resource); + } + + /* Initialize the registry of auth strategies which will coordinate + * the dynamic reconfiguration. 
*/ + rgw::auth::ImplicitTenants implicit_tenant_context{g_conf()}; + g_conf().add_observer(&implicit_tenant_context); + auto auth_registry = \ + rgw::auth::StrategyRegistry::create(g_ceph_context, implicit_tenant_context, store); + + /* Header custom behavior */ + rest.register_x_headers(g_conf()->rgw_log_http_headers); + + if (cct->_conf.get_val("rgw_scheduler_type") == "dmclock" && + !cct->check_experimental_feature_enabled("dmclock")){ + derr << "dmclock scheduler type is experimental and needs to be" + << "set in the option enable experimental data corrupting features" + << dendl; + return EINVAL; + } + + rgw::dmclock::SchedulerCtx sched_ctx{cct.get()}; + + OpsLogSocket *olog = NULL; + + if (!g_conf()->rgw_ops_log_socket_path.empty()) { + olog = new OpsLogSocket(g_ceph_context, g_conf()->rgw_ops_log_data_backlog); + olog->init(g_conf()->rgw_ops_log_socket_path); + } + + r = signal_fd_init(); + if (r < 0) { + derr << "ERROR: unable to initialize signal fds" << dendl; + exit(1); + } + + register_async_signal_handler(SIGTERM, handle_sigterm); + register_async_signal_handler(SIGINT, handle_sigterm); + register_async_signal_handler(SIGUSR1, handle_sigterm); + sighandler_alrm = signal(SIGALRM, godown_alarm); + + map service_map_meta; + service_map_meta["pid"] = stringify(getpid()); + + list fes; + + int fe_count = 0; + + for (multimap::iterator fiter = fe_map.begin(); + fiter != fe_map.end(); ++fiter, ++fe_count) { + RGWFrontendConfig *config = fiter->second; + string framework = config->get_framework(); + RGWFrontend *fe = NULL; + + if (framework == "civetweb" || framework == "mongoose") { + framework = "civetweb"; + std::string uri_prefix; + config->get_val("prefix", "", &uri_prefix); + + RGWProcessEnv env = { store, &rest, olog, 0, uri_prefix, auth_registry }; + //TODO: move all of scheduler initializations to frontends? 
+ + fe = new RGWCivetWebFrontend(env, config, sched_ctx); + } + else if (framework == "loadgen") { + int port; + config->get_val("port", 80, &port); + std::string uri_prefix; + config->get_val("prefix", "", &uri_prefix); + + RGWProcessEnv env = { store, &rest, olog, port, uri_prefix, auth_registry }; + + fe = new RGWLoadGenFrontend(env, config); + } +#if defined(WITH_RADOSGW_BEAST_FRONTEND) + else if (framework == "beast") { + int port; + config->get_val("port", 80, &port); + std::string uri_prefix; + config->get_val("prefix", "", &uri_prefix); + RGWProcessEnv env{ store, &rest, olog, port, uri_prefix, auth_registry }; + fe = new RGWAsioFrontend(env, config, sched_ctx); + } +#endif /* WITH_RADOSGW_BEAST_FRONTEND */ +#if defined(WITH_RADOSGW_FCGI_FRONTEND) + else if (framework == "fastcgi" || framework == "fcgi") { + framework = "fastcgi"; + std::string uri_prefix; + config->get_val("prefix", "", &uri_prefix); + RGWProcessEnv fcgi_pe = { store, &rest, olog, 0, uri_prefix, auth_registry }; + + fe = new RGWFCGXFrontend(fcgi_pe, config); + } +#endif /* WITH_RADOSGW_FCGI_FRONTEND */ + + service_map_meta["frontend_type#" + stringify(fe_count)] = framework; + service_map_meta["frontend_config#" + stringify(fe_count)] = config->get_config(); + + if (fe == NULL) { + dout(0) << "WARNING: skipping unknown framework: " << framework << dendl; + continue; + } + + dout(0) << "starting handler: " << fiter->first << dendl; + int r = fe->init(); + if (r < 0) { + derr << "ERROR: failed initializing frontend" << dendl; + return -r; + } + r = fe->run(); + if (r < 0) { + derr << "ERROR: failed run" << dendl; + return -r; + } + + fes.push_back(fe); + } + + r = store->register_to_service_map("rgw", service_map_meta); + if (r < 0) { + derr << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl; + + /* ignore error */ + } + + + // add a watcher to respond to realm configuration changes + RGWPeriodPusher pusher(store); + RGWFrontendPauser pauser(fes, 
implicit_tenant_context, &pusher); + auto reloader = std::make_unique(store, + service_map_meta, &pauser); + + RGWRealmWatcher realm_watcher(g_ceph_context, store->svc.zone->get_realm()); + realm_watcher.add_watcher(RGWRealmNotify::Reload, *reloader); + realm_watcher.add_watcher(RGWRealmNotify::ZonesNeedPeriod, pusher); + +#if defined(HAVE_SYS_PRCTL_H) + if (prctl(PR_SET_DUMPABLE, 1) == -1) { + cerr << "warning: unable to set dumpable flag: " << cpp_strerror(errno) << std::endl; + } +#endif + + wait_shutdown(); + + derr << "shutting down" << dendl; + + reloader.reset(); // stop the realm reloader + + for (list::iterator liter = fes.begin(); liter != fes.end(); + ++liter) { + RGWFrontend *fe = *liter; + fe->stop(); + } + + for (list::iterator liter = fes.begin(); liter != fes.end(); + ++liter) { + RGWFrontend *fe = *liter; + fe->join(); + delete fe; + } + + for (list::iterator liter = configs.begin(); + liter != configs.end(); ++liter) { + RGWFrontendConfig *fec = *liter; + delete fec; + } + + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGTERM, handle_sigterm); + unregister_async_signal_handler(SIGINT, handle_sigterm); + unregister_async_signal_handler(SIGUSR1, handle_sigterm); + shutdown_async_signal_handler(); + + rgw_log_usage_finalize(); + + delete olog; + + RGWStoreManager::close_storage(store); + rgw::auth::s3::LDAPEngine::shutdown(); + rgw_tools_cleanup(); + rgw_shutdown_resolver(); + rgw_http_client_cleanup(); + rgw::curl::cleanup_curl(); +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + rgw::amqp::shutdown(); +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + rgw::kafka::shutdown(); +#endif + g_conf().remove_observer(&implicit_tenant_context); + + rgw_perf_stop(g_ceph_context); + + dout(1) << "final shutdown" << dendl; + + signal_fd_finalize(); + + return 0; +} diff --git a/src/rgw/rgw_meta_sync_status.h b/src/rgw/rgw_meta_sync_status.h new file mode 100644 index 00000000..a3174e3e --- /dev/null +++ 
b/src/rgw/rgw_meta_sync_status.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_META_SYNC_STATUS_H +#define RGW_META_SYNC_STATUS_H + +#include + +#include "common/ceph_time.h" + +struct rgw_meta_sync_info { + enum SyncState { + StateInit = 0, + StateBuildingFullSyncMaps = 1, + StateSync = 2, + }; + + uint16_t state; + uint32_t num_shards; + std::string period; //< period id of current metadata log + epoch_t realm_epoch = 0; //< realm epoch of period + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(num_shards, bl); + encode(period, bl); + encode(realm_epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(state, bl); + decode(num_shards, bl); + if (struct_v >= 2) { + decode(period, bl); + decode(realm_epoch, bl); + } + DECODE_FINISH(bl); + } + + void decode_json(JSONObj *obj); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); + + rgw_meta_sync_info() : state((int)StateInit), num_shards(0) {} +}; +WRITE_CLASS_ENCODER(rgw_meta_sync_info) + +struct rgw_meta_sync_marker { + enum SyncState { + FullSync = 0, + IncrementalSync = 1, + }; + uint16_t state; + string marker; + string next_step_marker; + uint64_t total_entries; + uint64_t pos; + real_time timestamp; + epoch_t realm_epoch{0}; //< realm_epoch of period marker + + rgw_meta_sync_marker() : state(FullSync), total_entries(0), pos(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(state, bl); + encode(marker, bl); + encode(next_step_marker, bl); + encode(total_entries, bl); + encode(pos, bl); + encode(timestamp, bl); + encode(realm_epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(state, bl); + decode(marker, bl); + decode(next_step_marker, bl); + decode(total_entries, bl); + decode(pos, 
bl); + decode(timestamp, bl); + if (struct_v >= 2) { + decode(realm_epoch, bl); + } + DECODE_FINISH(bl); + } + + void decode_json(JSONObj *obj); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); +}; +WRITE_CLASS_ENCODER(rgw_meta_sync_marker) + +struct rgw_meta_sync_status { + rgw_meta_sync_info sync_info; + map sync_markers; + + rgw_meta_sync_status() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(sync_info, bl); + encode(sync_markers, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(sync_info, bl); + decode(sync_markers, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list& ls); +}; +WRITE_CLASS_ENCODER(rgw_meta_sync_status) + +#endif diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc new file mode 100644 index 00000000..9741cba9 --- /dev/null +++ b/src/rgw/rgw_metadata.cc @@ -0,0 +1,1178 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "common/ceph_json.h" +#include "common/errno.h" +#include "rgw_metadata.h" +#include "rgw_coroutine.h" +#include "cls/version/cls_version_types.h" + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_tools.h" + +#include "rgw_cr_rados.h" + +#include "services/svc_zone.h" + +#include "include/ceph_assert.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +void LogStatusDump::dump(Formatter *f) const { + string s; + switch (status) { + case MDLOG_STATUS_WRITE: + s = "write"; + break; + case MDLOG_STATUS_SETATTRS: + s = "set_attrs"; + break; + case MDLOG_STATUS_REMOVE: + s = "remove"; + break; + case MDLOG_STATUS_COMPLETE: + s = "complete"; + break; + case MDLOG_STATUS_ABORT: + s = "abort"; + break; + default: + s = "unknown"; + break; + } + encode_json("status", s, f); +} + +void 
RGWMetadataLogData::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(read_version, bl); + encode(write_version, bl); + uint32_t s = (uint32_t)status; + encode(s, bl); + ENCODE_FINISH(bl); +} + +void RGWMetadataLogData::decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(read_version, bl); + decode(write_version, bl); + uint32_t s; + decode(s, bl); + status = (RGWMDLogStatus)s; + DECODE_FINISH(bl); +} + +void RGWMetadataLogData::dump(Formatter *f) const { + encode_json("read_version", read_version, f); + encode_json("write_version", write_version, f); + encode_json("status", LogStatusDump(status), f); +} + +void decode_json_obj(RGWMDLogStatus& status, JSONObj *obj) { + string s; + JSONDecoder::decode_json("status", s, obj); + if (s == "complete") { + status = MDLOG_STATUS_COMPLETE; + } else if (s == "write") { + status = MDLOG_STATUS_WRITE; + } else if (s == "remove") { + status = MDLOG_STATUS_REMOVE; + } else if (s == "set_attrs") { + status = MDLOG_STATUS_SETATTRS; + } else if (s == "abort") { + status = MDLOG_STATUS_ABORT; + } else { + status = MDLOG_STATUS_UNKNOWN; + } +} + +void RGWMetadataLogData::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("read_version", read_version, obj); + JSONDecoder::decode_json("write_version", write_version, obj); + JSONDecoder::decode_json("status", status, obj); +} + + +int RGWMetadataLog::add_entry(RGWMetadataHandler *handler, const string& section, const string& key, bufferlist& bl) { + if (!store->svc.zone->need_to_log_metadata()) + return 0; + + string oid; + + string hash_key; + handler->get_hash_key(section, key, hash_key); + + int shard_id; + store->shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, &shard_id); + mark_modified(shard_id); + real_time now = real_clock::now(); + return store->time_log_add(oid, now, section, key, bl); +} + +int RGWMetadataLog::store_entries_in_shard(list& entries, int shard_id, librados::AioCompletion *completion) +{ + string oid; 
+ + mark_modified(shard_id); + store->shard_name(prefix, shard_id, oid); + return store->time_log_add(oid, entries, completion, false); +} + +void RGWMetadataLog::init_list_entries(int shard_id, const real_time& from_time, const real_time& end_time, + string& marker, void **handle) +{ + LogListCtx *ctx = new LogListCtx(); + + ctx->cur_shard = shard_id; + ctx->from_time = from_time; + ctx->end_time = end_time; + ctx->marker = marker; + + get_shard_oid(ctx->cur_shard, ctx->cur_oid); + + *handle = (void *)ctx; +} + +void RGWMetadataLog::complete_list_entries(void *handle) { + LogListCtx *ctx = static_cast(handle); + delete ctx; +} + +int RGWMetadataLog::list_entries(void *handle, + int max_entries, + list& entries, + string *last_marker, + bool *truncated) { + LogListCtx *ctx = static_cast(handle); + + if (!max_entries) { + *truncated = false; + return 0; + } + + std::string next_marker; + int ret = store->time_log_list(ctx->cur_oid, ctx->from_time, ctx->end_time, + max_entries, entries, ctx->marker, + &next_marker, truncated); + if ((ret < 0) && (ret != -ENOENT)) + return ret; + + ctx->marker = std::move(next_marker); + if (last_marker) { + *last_marker = ctx->marker; + } + + if (ret == -ENOENT) + *truncated = false; + + return 0; +} + +int RGWMetadataLog::get_info(int shard_id, RGWMetadataLogInfo *info) +{ + string oid; + get_shard_oid(shard_id, oid); + + cls_log_header header; + + int ret = store->time_log_info(oid, &header); + if ((ret < 0) && (ret != -ENOENT)) + return ret; + + info->marker = header.max_marker; + info->last_update = header.max_time.to_real_time(); + + return 0; +} + +static void _mdlog_info_completion(librados::completion_t cb, void *arg) +{ + auto infoc = static_cast(arg); + infoc->finish(cb); + infoc->put(); // drop the ref from get_info_async() +} + +RGWMetadataLogInfoCompletion::RGWMetadataLogInfoCompletion(info_callback_t cb) + : completion(librados::Rados::aio_create_completion((void *)this, nullptr, + _mdlog_info_completion)), + 
callback(cb) +{ +} + +RGWMetadataLogInfoCompletion::~RGWMetadataLogInfoCompletion() +{ + completion->release(); +} + +int RGWMetadataLog::get_info_async(int shard_id, RGWMetadataLogInfoCompletion *completion) +{ + string oid; + get_shard_oid(shard_id, oid); + + completion->get(); // hold a ref until the completion fires + + return store->time_log_info_async(completion->get_io_ctx(), oid, + &completion->get_header(), + completion->get_completion()); +} + +int RGWMetadataLog::trim(int shard_id, const real_time& from_time, const real_time& end_time, + const string& start_marker, const string& end_marker) +{ + string oid; + get_shard_oid(shard_id, oid); + + return store->time_log_trim(oid, from_time, end_time, start_marker, + end_marker, nullptr); +} + +int RGWMetadataLog::lock_exclusive(int shard_id, timespan duration, string& zone_id, string& owner_id) { + string oid; + get_shard_oid(shard_id, oid); + + return store->lock_exclusive(store->svc.zone->get_zone_params().log_pool, oid, duration, zone_id, owner_id); +} + +int RGWMetadataLog::unlock(int shard_id, string& zone_id, string& owner_id) { + string oid; + get_shard_oid(shard_id, oid); + + return store->unlock(store->svc.zone->get_zone_params().log_pool, oid, zone_id, owner_id); +} + +void RGWMetadataLog::mark_modified(int shard_id) +{ + lock.get_read(); + if (modified_shards.find(shard_id) != modified_shards.end()) { + lock.unlock(); + return; + } + lock.unlock(); + + RWLock::WLocker wl(lock); + modified_shards.insert(shard_id); +} + +void RGWMetadataLog::read_clear_modified(set &modified) +{ + RWLock::WLocker wl(lock); + modified.swap(modified_shards); + modified_shards.clear(); +} + +obj_version& RGWMetadataObject::get_version() +{ + return objv; +} + +class RGWMetadataTopHandler : public RGWMetadataHandler { + struct iter_data { + set sections; + set::iterator iter; + }; + +public: + RGWMetadataTopHandler() {} + + string get_type() override { return string(); } + + int get(RGWRados *store, string& entry, 
RGWMetadataObject **obj) override { return -ENOTSUP; } + int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t sync_type) override { return -ENOTSUP; } + + virtual void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override {} + + int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { return -ENOTSUP; } + + int list_keys_init(RGWRados *store, const string& marker, void **phandle) override { + iter_data *data = new iter_data; + list sections; + store->meta_mgr->get_sections(sections); + for (auto& s : sections) { + data->sections.insert(s); + } + data->iter = data->sections.lower_bound(marker); + + *phandle = data; + + return 0; + } + int list_keys_next(void *handle, int max, list& keys, bool *truncated) override { + iter_data *data = static_cast(handle); + for (int i = 0; i < max && data->iter != data->sections.end(); ++i, ++(data->iter)) { + keys.push_back(*data->iter); + } + + *truncated = (data->iter != data->sections.end()); + + return 0; + } + void list_keys_complete(void *handle) override { + iter_data *data = static_cast(handle); + + delete data; + } + + virtual string get_marker(void *handle) override { + iter_data *data = static_cast(handle); + + if (data->iter != data->sections.end()) { + return *(data->iter); + } + + return string(); + } +}; + +static RGWMetadataTopHandler md_top_handler; + + +RGWMetadataManager::RGWMetadataManager(CephContext *_cct, RGWRados *_store) + : cct(_cct), store(_store) +{ +} + +RGWMetadataManager::~RGWMetadataManager() +{ + map::iterator iter; + + for (iter = handlers.begin(); iter != handlers.end(); ++iter) { + delete iter->second; + } + + handlers.clear(); +} + +const std::string RGWMetadataLogHistory::oid = "meta.history"; + +namespace { + +int read_history(RGWRados *store, RGWMetadataLogHistory *state, + RGWObjVersionTracker *objv_tracker) +{ + auto obj_ctx = 
store->svc.sysobj->init_obj_ctx(); + auto& pool = store->svc.zone->get_zone_params().log_pool; + const auto& oid = RGWMetadataLogHistory::oid; + bufferlist bl; + int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv_tracker, nullptr); + if (ret < 0) { + return ret; + } + if (bl.length() == 0) { + /* bad history object, remove it */ + rgw_raw_obj obj(pool, oid); + auto sysobj = obj_ctx.get_obj(obj); + ret = sysobj.wop().remove(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: meta history is empty, but cannot remove it (" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + return -ENOENT; + } + try { + auto p = bl.cbegin(); + state->decode(p); + } catch (buffer::error& e) { + ldout(store->ctx(), 1) << "failed to decode the mdlog history: " + << e.what() << dendl; + return -EIO; + } + return 0; +} + +int write_history(RGWRados *store, const RGWMetadataLogHistory& state, + RGWObjVersionTracker *objv_tracker, bool exclusive = false) +{ + bufferlist bl; + state.encode(bl); + + auto& pool = store->svc.zone->get_zone_params().log_pool; + const auto& oid = RGWMetadataLogHistory::oid; + return rgw_put_system_obj(store, pool, oid, bl, + exclusive, objv_tracker, real_time{}); +} + +using Cursor = RGWPeriodHistory::Cursor; + +/// read the mdlog history and use it to initialize the given cursor +class ReadHistoryCR : public RGWCoroutine { + RGWRados *store; + Cursor *cursor; + RGWObjVersionTracker *objv_tracker; + RGWMetadataLogHistory state; + public: + ReadHistoryCR(RGWRados *store, Cursor *cursor, + RGWObjVersionTracker *objv_tracker) + : RGWCoroutine(store->ctx()), store(store), cursor(cursor), + objv_tracker(objv_tracker) + {} + + int operate() { + reenter(this) { + yield { + rgw_raw_obj obj{store->svc.zone->get_zone_params().log_pool, + RGWMetadataLogHistory::oid}; + constexpr bool empty_on_enoent = false; + + using ReadCR = RGWSimpleRadosReadCR; + call(new ReadCR(store->get_async_rados(), store->svc.sysobj, obj, + &state, empty_on_enoent, 
objv_tracker)); + } + if (retcode < 0) { + ldout(cct, 1) << "failed to read mdlog history: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + *cursor = store->period_history->lookup(state.oldest_realm_epoch); + if (!*cursor) { + return set_cr_error(cursor->get_error()); + } + + ldout(cct, 10) << "read mdlog history with oldest period id=" + << state.oldest_period_id << " realm_epoch=" + << state.oldest_realm_epoch << dendl; + return set_cr_done(); + } + return 0; + } +}; + +/// write the given cursor to the mdlog history +class WriteHistoryCR : public RGWCoroutine { + RGWRados *store; + Cursor cursor; + RGWObjVersionTracker *objv; + RGWMetadataLogHistory state; + public: + WriteHistoryCR(RGWRados *store, const Cursor& cursor, + RGWObjVersionTracker *objv) + : RGWCoroutine(store->ctx()), store(store), cursor(cursor), objv(objv) + {} + + int operate() { + reenter(this) { + state.oldest_period_id = cursor.get_period().get_id(); + state.oldest_realm_epoch = cursor.get_epoch(); + + yield { + rgw_raw_obj obj{store->svc.zone->get_zone_params().log_pool, + RGWMetadataLogHistory::oid}; + + using WriteCR = RGWSimpleRadosWriteCR; + call(new WriteCR(store->get_async_rados(), store->svc.sysobj, obj, state, objv)); + } + if (retcode < 0) { + ldout(cct, 1) << "failed to write mdlog history: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + ldout(cct, 10) << "wrote mdlog history with oldest period id=" + << state.oldest_period_id << " realm_epoch=" + << state.oldest_realm_epoch << dendl; + return set_cr_done(); + } + return 0; + } +}; + +/// update the mdlog history to reflect trimmed logs +class TrimHistoryCR : public RGWCoroutine { + RGWRados *store; + const Cursor cursor; //< cursor to trimmed period + RGWObjVersionTracker *objv; //< to prevent racing updates + Cursor next; //< target cursor for oldest log period + Cursor existing; //< existing cursor read from disk + + public: + TrimHistoryCR(RGWRados *store, Cursor 
cursor, RGWObjVersionTracker *objv) + : RGWCoroutine(store->ctx()), + store(store), cursor(cursor), objv(objv), next(cursor) + { + next.next(); // advance past cursor + } + + int operate() { + reenter(this) { + // read an existing history, and write the new history if it's newer + yield call(new ReadHistoryCR(store, &existing, objv)); + if (retcode < 0) { + return set_cr_error(retcode); + } + // reject older trims with ECANCELED + if (cursor.get_epoch() < existing.get_epoch()) { + ldout(cct, 4) << "found oldest log epoch=" << existing.get_epoch() + << ", rejecting trim at epoch=" << cursor.get_epoch() << dendl; + return set_cr_error(-ECANCELED); + } + // overwrite with updated history + yield call(new WriteHistoryCR(store, next, objv)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +// traverse all the way back to the beginning of the period history, and +// return a cursor to the first period in a fully attached history +Cursor find_oldest_period(RGWRados *store) +{ + auto cct = store->ctx(); + auto cursor = store->period_history->get_current(); + + while (cursor) { + // advance to the period's predecessor + if (!cursor.has_prev()) { + auto& predecessor = cursor.get_period().get_predecessor(); + if (predecessor.empty()) { + // this is the first period, so our logs must start here + ldout(cct, 10) << "find_oldest_period returning first " + "period " << cursor.get_period().get_id() << dendl; + return cursor; + } + // pull the predecessor and add it to our history + RGWPeriod period; + int r = store->period_puller->pull(predecessor, period); + if (r < 0) { + return cursor; + } + auto prev = store->period_history->insert(std::move(period)); + if (!prev) { + return prev; + } + ldout(cct, 20) << "find_oldest_period advancing to " + "predecessor period " << predecessor << dendl; + ceph_assert(cursor.has_prev()); + } + cursor.prev(); + } + ldout(cct, 10) << "find_oldest_period returning empty cursor" << dendl; + 
return cursor; +} + +} // anonymous namespace + +Cursor RGWMetadataManager::init_oldest_log_period() +{ + // read the mdlog history + RGWMetadataLogHistory state; + RGWObjVersionTracker objv; + int ret = read_history(store, &state, &objv); + + if (ret == -ENOENT) { + // initialize the mdlog history and write it + ldout(cct, 10) << "initializing mdlog history" << dendl; + auto cursor = find_oldest_period(store); + if (!cursor) { + return cursor; + } + + // write the initial history + state.oldest_realm_epoch = cursor.get_epoch(); + state.oldest_period_id = cursor.get_period().get_id(); + + constexpr bool exclusive = true; // don't overwrite + int ret = write_history(store, state, &objv, exclusive); + if (ret < 0 && ret != -EEXIST) { + ldout(cct, 1) << "failed to write mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + return cursor; + } else if (ret < 0) { + ldout(cct, 1) << "failed to read mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + + // if it's already in the history, return it + auto cursor = store->period_history->lookup(state.oldest_realm_epoch); + if (cursor) { + return cursor; + } else { + cursor = find_oldest_period(store); + state.oldest_realm_epoch = cursor.get_epoch(); + state.oldest_period_id = cursor.get_period().get_id(); + ldout(cct, 10) << "rewriting mdlog history" << dendl; + ret = write_history(store, state, &objv); + if (ret < 0 && ret != -ECANCELED) { + ldout(cct, 1) << "failed to write mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + return cursor; + } + + // pull the oldest period by id + RGWPeriod period; + ret = store->period_puller->pull(state.oldest_period_id, period); + if (ret < 0) { + ldout(cct, 1) << "failed to read period id=" << state.oldest_period_id + << " for mdlog history: " << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + // verify its realm_epoch + if (period.get_realm_epoch() != state.oldest_realm_epoch) { + ldout(cct, 1) << 
"inconsistent mdlog history: read period id=" + << period.get_id() << " with realm_epoch=" << period.get_realm_epoch() + << ", expected realm_epoch=" << state.oldest_realm_epoch << dendl; + return Cursor{-EINVAL}; + } + // attach the period to our history + return store->period_history->attach(std::move(period)); +} + +Cursor RGWMetadataManager::read_oldest_log_period() const +{ + RGWMetadataLogHistory state; + int ret = read_history(store, &state, nullptr); + if (ret < 0) { + ldout(store->ctx(), 1) << "failed to read mdlog history: " + << cpp_strerror(ret) << dendl; + return Cursor{ret}; + } + + ldout(store->ctx(), 10) << "read mdlog history with oldest period id=" + << state.oldest_period_id << " realm_epoch=" + << state.oldest_realm_epoch << dendl; + + return store->period_history->lookup(state.oldest_realm_epoch); +} + +RGWCoroutine* RGWMetadataManager::read_oldest_log_period_cr(Cursor *period, + RGWObjVersionTracker *objv) const +{ + return new ReadHistoryCR(store, period, objv); +} + +RGWCoroutine* RGWMetadataManager::trim_log_period_cr(Cursor period, + RGWObjVersionTracker *objv) const +{ + return new TrimHistoryCR(store, period, objv); +} + +int RGWMetadataManager::init(const std::string& current_period) +{ + // open a log for the current period + current_log = get_log(current_period); + return 0; +} + +RGWMetadataLog* RGWMetadataManager::get_log(const std::string& period) +{ + // construct the period's log in place if it doesn't exist + auto insert = md_logs.emplace(std::piecewise_construct, + std::forward_as_tuple(period), + std::forward_as_tuple(cct, store, period)); + return &insert.first->second; +} + +int RGWMetadataManager::register_handler(RGWMetadataHandler *handler) +{ + string type = handler->get_type(); + + if (handlers.find(type) != handlers.end()) + return -EINVAL; + + handlers[type] = handler; + + return 0; +} + +RGWMetadataHandler *RGWMetadataManager::get_handler(const string& type) +{ + map::iterator iter = handlers.find(type); + if (iter 
== handlers.end()) + return NULL; + + return iter->second; +} + +void RGWMetadataManager::parse_metadata_key(const string& metadata_key, string& type, string& entry) +{ + auto pos = metadata_key.find(':'); + if (pos == string::npos) { + type = metadata_key; + } else { + type = metadata_key.substr(0, pos); + entry = metadata_key.substr(pos + 1); + } +} + +int RGWMetadataManager::find_handler(const string& metadata_key, RGWMetadataHandler **handler, string& entry) +{ + string type; + + parse_metadata_key(metadata_key, type, entry); + + if (type.empty()) { + *handler = &md_top_handler; + return 0; + } + + map::iterator iter = handlers.find(type); + if (iter == handlers.end()) + return -ENOENT; + + *handler = iter->second; + + return 0; + +} + +int RGWMetadataManager::get(string& metadata_key, Formatter *f) +{ + RGWMetadataHandler *handler; + string entry; + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + RGWMetadataObject *obj; + + ret = handler->get(store, entry, &obj); + if (ret < 0) { + return ret; + } + + f->open_object_section("metadata_info"); + encode_json("key", metadata_key, f); + encode_json("ver", obj->get_version(), f); + real_time mtime = obj->get_mtime(); + if (!real_clock::is_zero(mtime)) { + utime_t ut(mtime); + encode_json("mtime", ut, f); + } + encode_json("data", *obj, f); + f->close_section(); + + delete obj; + + return 0; +} + +int RGWMetadataManager::put(string& metadata_key, bufferlist& bl, + RGWMetadataHandler::sync_type_t sync_type, + obj_version *existing_version) +{ + RGWMetadataHandler *handler; + string entry; + + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + JSONParser parser; + if (!parser.parse(bl.c_str(), bl.length())) { + return -EINVAL; + } + + RGWObjVersionTracker objv_tracker; + + obj_version *objv = &objv_tracker.write_version; + + utime_t mtime; + + try { + JSONDecoder::decode_json("key", metadata_key, &parser); + 
JSONDecoder::decode_json("ver", *objv, &parser); + JSONDecoder::decode_json("mtime", mtime, &parser); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + JSONObj *jo = parser.find_obj("data"); + if (!jo) { + return -EINVAL; + } + + ret = handler->put(store, entry, objv_tracker, mtime.to_real_time(), jo, sync_type); + if (existing_version) { + *existing_version = objv_tracker.read_version; + } + return ret; +} + +int RGWMetadataManager::prepare_mutate(RGWRados *store, + rgw_pool& pool, const string& oid, + const real_time& mtime, + RGWObjVersionTracker *objv_tracker, + RGWMetadataHandler::sync_type_t sync_mode) +{ + bufferlist bl; + real_time orig_mtime; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, + bl, objv_tracker, &orig_mtime, + nullptr, nullptr); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + if (ret != -ENOENT && + !RGWMetadataHandler::check_versions(objv_tracker->read_version, orig_mtime, + objv_tracker->write_version, mtime, sync_mode)) { + return STATUS_NO_APPLY; + } + + if (objv_tracker->write_version.tag.empty()) { + if (objv_tracker->read_version.tag.empty()) { + objv_tracker->generate_new_write_ver(store->ctx()); + } else { + objv_tracker->write_version = objv_tracker->read_version; + objv_tracker->write_version.ver++; + } + } + return 0; +} + +int RGWMetadataManager::remove(string& metadata_key) +{ + RGWMetadataHandler *handler; + string entry; + + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) { + return ret; + } + + RGWMetadataObject *obj; + ret = handler->get(store, entry, &obj); + if (ret < 0) { + return ret; + } + RGWObjVersionTracker objv_tracker; + objv_tracker.read_version = obj->get_version(); + delete obj; + + return handler->remove(store, entry, objv_tracker); +} + +int RGWMetadataManager::lock_exclusive(string& metadata_key, timespan duration, string& owner_id) { + RGWMetadataHandler *handler; + string entry; + string zone_id; + + int ret 
= find_handler(metadata_key, &handler, entry); + if (ret < 0) + return ret; + + rgw_pool pool; + string oid; + + handler->get_pool_and_oid(store, entry, pool, oid); + + return store->lock_exclusive(pool, oid, duration, zone_id, owner_id); +} + +int RGWMetadataManager::unlock(string& metadata_key, string& owner_id) { + librados::IoCtx io_ctx; + RGWMetadataHandler *handler; + string entry; + string zone_id; + + int ret = find_handler(metadata_key, &handler, entry); + if (ret < 0) + return ret; + + rgw_pool pool; + string oid; + + handler->get_pool_and_oid(store, entry, pool, oid); + + return store->unlock(pool, oid, zone_id, owner_id); +} + +struct list_keys_handle { + void *handle; + RGWMetadataHandler *handler; +}; + +int RGWMetadataManager::list_keys_init(const string& section, void **handle) +{ + return list_keys_init(section, string(), handle); +} + +int RGWMetadataManager::list_keys_init(const string& section, + const string& marker, void **handle) +{ + string entry; + RGWMetadataHandler *handler; + + int ret; + + ret = find_handler(section, &handler, entry); + if (ret < 0) { + return -ENOENT; + } + + list_keys_handle *h = new list_keys_handle; + h->handler = handler; + ret = handler->list_keys_init(store, marker, &h->handle); + if (ret < 0) { + delete h; + return ret; + } + + *handle = (void *)h; + + return 0; +} + +int RGWMetadataManager::list_keys_next(void *handle, int max, list& keys, bool *truncated) +{ + list_keys_handle *h = static_cast(handle); + + RGWMetadataHandler *handler = h->handler; + + return handler->list_keys_next(h->handle, max, keys, truncated); +} + +void RGWMetadataManager::list_keys_complete(void *handle) +{ + list_keys_handle *h = static_cast(handle); + + RGWMetadataHandler *handler = h->handler; + + handler->list_keys_complete(h->handle); + delete h; +} + +string RGWMetadataManager::get_marker(void *handle) +{ + list_keys_handle *h = static_cast(handle); + + return h->handler->get_marker(h->handle); +} + +void 
RGWMetadataManager::dump_log_entry(cls_log_entry& entry, Formatter *f) +{ + f->open_object_section("entry"); + f->dump_string("id", entry.id); + f->dump_string("section", entry.section); + f->dump_string("name", entry.name); + entry.timestamp.gmtime_nsec(f->dump_stream("timestamp")); + + try { + RGWMetadataLogData log_data; + auto iter = entry.data.cbegin(); + decode(log_data, iter); + + encode_json("data", log_data, f); + } catch (buffer::error& err) { + lderr(cct) << "failed to decode log entry: " << entry.section << ":" << entry.name<< " ts=" << entry.timestamp << dendl; + } + f->close_section(); +} + +void RGWMetadataManager::get_sections(list& sections) +{ + for (map::iterator iter = handlers.begin(); iter != handlers.end(); ++iter) { + sections.push_back(iter->first); + } +} + +int RGWMetadataManager::pre_modify(RGWMetadataHandler *handler, string& section, const string& key, + RGWMetadataLogData& log_data, RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type) +{ + section = handler->get_type(); + + /* if write version has not been set, and there's a read version, set it so that we can + * log it + */ + if (objv_tracker) { + if (objv_tracker->read_version.ver && !objv_tracker->write_version.ver) { + objv_tracker->write_version = objv_tracker->read_version; + objv_tracker->write_version.ver++; + } + log_data.read_version = objv_tracker->read_version; + log_data.write_version = objv_tracker->write_version; + } + + log_data.status = op_type; + + bufferlist logbl; + encode(log_data, logbl); + + ceph_assert(current_log); // must have called init() + int ret = current_log->add_entry(handler, section, key, logbl); + if (ret < 0) + return ret; + + return 0; +} + +int RGWMetadataManager::post_modify(RGWMetadataHandler *handler, const string& section, const string& key, RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, int ret) +{ + if (ret >= 0) + log_data.status = MDLOG_STATUS_COMPLETE; + else + log_data.status = MDLOG_STATUS_ABORT; + + 
bufferlist logbl; + encode(log_data, logbl); + + ceph_assert(current_log); // must have called init() + int r = current_log->add_entry(handler, section, key, logbl); + if (ret < 0) + return ret; + + if (r < 0) + return r; + + return 0; +} + +string RGWMetadataManager::heap_oid(RGWMetadataHandler *handler, const string& key, const obj_version& objv) +{ + char buf[objv.tag.size() + 32]; + snprintf(buf, sizeof(buf), "%s:%lld", objv.tag.c_str(), (long long)objv.ver); + return string(".meta:") + handler->get_type() + ":" + key + ":" + buf; +} + +int RGWMetadataManager::store_in_heap(RGWMetadataHandler *handler, const string& key, bufferlist& bl, + RGWObjVersionTracker *objv_tracker, real_time mtime, + map *pattrs) +{ + if (!objv_tracker) { + return -EINVAL; + } + + rgw_pool heap_pool(store->svc.zone->get_zone_params().metadata_heap); + + if (heap_pool.empty()) { + return 0; + } + + RGWObjVersionTracker otracker; + otracker.write_version = objv_tracker->write_version; + string oid = heap_oid(handler, key, objv_tracker->write_version); + int ret = rgw_put_system_obj(store, heap_pool, oid, + bl, false, &otracker, mtime, pattrs); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: rgw_put_system_obj() oid=" << oid << " returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWMetadataManager::remove_from_heap(RGWMetadataHandler *handler, const string& key, RGWObjVersionTracker *objv_tracker) +{ + if (!objv_tracker) { + return -EINVAL; + } + + rgw_pool heap_pool(store->svc.zone->get_zone_params().metadata_heap); + + if (heap_pool.empty()) { + return 0; + } + + string oid = heap_oid(handler, key, objv_tracker->write_version); + rgw_raw_obj obj(heap_pool, oid); + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + int ret = sysobj.wop().remove(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: sysobj.wop().remove() oid=" << oid << " returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int 
RGWMetadataManager::put_entry(RGWMetadataHandler *handler, const string& key, bufferlist& bl, bool exclusive, + RGWObjVersionTracker *objv_tracker, real_time mtime, map *pattrs) +{ + string section; + RGWMetadataLogData log_data; + int ret = pre_modify(handler, section, key, log_data, objv_tracker, MDLOG_STATUS_WRITE); + if (ret < 0) + return ret; + + string oid; + rgw_pool pool; + + handler->get_pool_and_oid(store, key, pool, oid); + + ret = store_in_heap(handler, key, bl, objv_tracker, mtime, pattrs); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": store_in_heap() key=" << key << " returned ret=" << ret << dendl; + goto done; + } + + ret = rgw_put_system_obj(store, pool, oid, bl, exclusive, + objv_tracker, mtime, pattrs); + + if (ret < 0) { + int r = remove_from_heap(handler, key, objv_tracker); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": remove_from_heap() key=" << key << " returned ret=" << r << dendl; + } + } +done: + /* cascading ret into post_modify() */ + + ret = post_modify(handler, section, key, log_data, objv_tracker, ret); + if (ret < 0) + return ret; + + return 0; +} + +int RGWMetadataManager::remove_entry(RGWMetadataHandler *handler, + const string& key, + RGWObjVersionTracker *objv_tracker) +{ + string section; + RGWMetadataLogData log_data; + int ret = pre_modify(handler, section, key, log_data, objv_tracker, MDLOG_STATUS_REMOVE); + if (ret < 0) { + return ret; + } + + string oid; + rgw_pool pool; + + handler->get_pool_and_oid(store, key, pool, oid); + + rgw_raw_obj obj(pool, oid); + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + ret = sysobj.wop() + .set_objv_tracker(objv_tracker) + .remove(); + /* cascading ret into post_modify() */ + + ret = post_modify(handler, section, key, log_data, objv_tracker, ret); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWMetadataManager::get_log_shard_id(const string& section, + const string& key, int 
*shard_id) +{ + RGWMetadataHandler *handler = get_handler(section); + if (!handler) { + return -EINVAL; + } + string hash_key; + handler->get_hash_key(section, key, hash_key); + *shard_id = store->key_to_shard_id(hash_key, cct->_conf->rgw_md_log_max_shards); + return 0; +} diff --git a/src/rgw/rgw_metadata.h b/src/rgw/rgw_metadata.h new file mode 100644 index 00000000..e4107677 --- /dev/null +++ b/src/rgw/rgw_metadata.h @@ -0,0 +1,426 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_METADATA_H +#define CEPH_RGW_METADATA_H + +#include +#include +#include + +#include "include/types.h" +#include "rgw_common.h" +#include "rgw_period_history.h" +#include "cls/version/cls_version_types.h" +#include "cls/log/cls_log_types.h" +#include "common/RWLock.h" +#include "common/RefCountedObj.h" +#include "common/ceph_time.h" + + +class RGWRados; +class RGWCoroutine; +class JSONObj; +struct RGWObjVersionTracker; + +struct obj_version; + + +enum RGWMDLogStatus { + MDLOG_STATUS_UNKNOWN, + MDLOG_STATUS_WRITE, + MDLOG_STATUS_SETATTRS, + MDLOG_STATUS_REMOVE, + MDLOG_STATUS_COMPLETE, + MDLOG_STATUS_ABORT, +}; + +class RGWMetadataObject { +protected: + obj_version objv; + ceph::real_time mtime; + +public: + RGWMetadataObject() {} + virtual ~RGWMetadataObject() {} + obj_version& get_version(); + real_time get_mtime() { return mtime; } + + virtual void dump(Formatter *f) const = 0; +}; + +class RGWMetadataManager; + +class RGWMetadataHandler { + friend class RGWMetadataManager; + +public: + enum sync_type_t { + APPLY_ALWAYS, + APPLY_UPDATES, + APPLY_NEWER + }; + static bool string_to_sync_type(const string& sync_string, + sync_type_t& type) { + if (sync_string.compare("update-by-version") == 0) + type = APPLY_UPDATES; + else if (sync_string.compare("update-by-timestamp") == 0) + type = APPLY_NEWER; + else if (sync_string.compare("always") == 0) + type = APPLY_ALWAYS; + else + return false; + return true; + } + + 
virtual ~RGWMetadataHandler() {} + virtual string get_type() = 0; + + virtual int get(RGWRados *store, string& entry, RGWMetadataObject **obj) = 0; + virtual int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t type) = 0; + virtual int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) = 0; + + virtual int list_keys_init(RGWRados *store, const string& marker, void **phandle) = 0; + virtual int list_keys_next(void *handle, int max, list& keys, bool *truncated) = 0; + virtual void list_keys_complete(void *handle) = 0; + + virtual string get_marker(void *handle) = 0; + + /* key to use for hashing entries for log shard placement */ + virtual void get_hash_key(const string& section, const string& key, string& hash_key) { + hash_key = section + ":" + key; + } + +protected: + virtual void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) = 0; + /** + * Compare an incoming versus on-disk tag/version+mtime combo against + * the sync mode to see if the new one should replace the on-disk one. + * + * @return true if the update should proceed, false otherwise. + */ + static bool check_versions(const obj_version& ondisk, const real_time& ondisk_time, + const obj_version& incoming, const real_time& incoming_time, + sync_type_t sync_mode) { + switch (sync_mode) { + case APPLY_UPDATES: + if ((ondisk.tag != incoming.tag) || + (ondisk.ver >= incoming.ver)) + return false; + break; + case APPLY_NEWER: + if (ondisk_time >= incoming_time) + return false; + break; + case APPLY_ALWAYS: //deliberate fall-thru -- we always apply! + default: break; + } + return true; + } + + /* + * The tenant_name is always returned on purpose. May be empty, of course. 
+ */ + static void parse_bucket(const string& bucket, + string *tenant_name, + string *bucket_name, + string *bucket_instance = nullptr /* optional */) + { + int pos = bucket.find('/'); + if (pos >= 0) { + *tenant_name = bucket.substr(0, pos); + } else { + tenant_name->clear(); + } + string bn = bucket.substr(pos + 1); + pos = bn.find (':'); + if (pos < 0) { + *bucket_name = std::move(bn); + return; + } + *bucket_name = bn.substr(0, pos); + if (bucket_instance) { + *bucket_instance = bn.substr(pos + 1); + } + } +}; + +#define META_LOG_OBJ_PREFIX "meta.log." + +struct RGWMetadataLogInfo { + string marker; + real_time last_update; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; + +class RGWCompletionManager; + +class RGWMetadataLogInfoCompletion : public RefCountedObject { + public: + using info_callback_t = std::function; + private: + cls_log_header header; + librados::IoCtx io_ctx; + librados::AioCompletion *completion; + std::mutex mutex; //< protects callback between cancel/complete + boost::optional callback; //< cleared on cancel + public: + explicit RGWMetadataLogInfoCompletion(info_callback_t callback); + ~RGWMetadataLogInfoCompletion() override; + + librados::IoCtx& get_io_ctx() { return io_ctx; } + cls_log_header& get_header() { return header; } + librados::AioCompletion* get_completion() { return completion; } + + void finish(librados::completion_t cb) { + std::lock_guard lock(mutex); + if (callback) { + (*callback)(completion->get_return_value(), header); + } + } + void cancel() { + std::lock_guard lock(mutex); + callback = boost::none; + } +}; + +class RGWMetadataLog { + CephContext *cct; + RGWRados *store; + const string prefix; + + static std::string make_prefix(const std::string& period) { + if (period.empty()) + return META_LOG_OBJ_PREFIX; + return META_LOG_OBJ_PREFIX + period + "."; + } + + RWLock lock; + set modified_shards; + + void mark_modified(int shard_id); +public: + RGWMetadataLog(CephContext *_cct, RGWRados *_store, 
const std::string& period) + : cct(_cct), store(_store), + prefix(make_prefix(period)), + lock("RGWMetaLog::lock") {} + + void get_shard_oid(int id, string& oid) const { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", id); + oid = prefix + buf; + } + + int add_entry(RGWMetadataHandler *handler, const string& section, const string& key, bufferlist& bl); + int store_entries_in_shard(list& entries, int shard_id, librados::AioCompletion *completion); + + struct LogListCtx { + int cur_shard; + string marker; + real_time from_time; + real_time end_time; + + string cur_oid; + + bool done; + + LogListCtx() : cur_shard(0), done(false) {} + }; + + void init_list_entries(int shard_id, const real_time& from_time, const real_time& end_time, string& marker, void **handle); + void complete_list_entries(void *handle); + int list_entries(void *handle, + int max_entries, + list& entries, + string *out_marker, + bool *truncated); + + int trim(int shard_id, const real_time& from_time, const real_time& end_time, const string& start_marker, const string& end_marker); + int get_info(int shard_id, RGWMetadataLogInfo *info); + int get_info_async(int shard_id, RGWMetadataLogInfoCompletion *completion); + int lock_exclusive(int shard_id, timespan duration, string&zone_id, string& owner_id); + int unlock(int shard_id, string& zone_id, string& owner_id); + + int update_shards(list& shards); + + void read_clear_modified(set &modified); +}; + +struct LogStatusDump { + RGWMDLogStatus status; + + explicit LogStatusDump(RGWMDLogStatus _status) : status(_status) {} + void dump(Formatter *f) const; +}; + +struct RGWMetadataLogData { + obj_version read_version; + obj_version write_version; + RGWMDLogStatus status; + + RGWMetadataLogData() : status(MDLOG_STATUS_UNKNOWN) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWMetadataLogData) + +struct RGWMetadataLogHistory 
{ + epoch_t oldest_realm_epoch; + std::string oldest_period_id; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(oldest_realm_epoch, bl); + encode(oldest_period_id, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(oldest_realm_epoch, p); + decode(oldest_period_id, p); + DECODE_FINISH(p); + } + + static const std::string oid; +}; +WRITE_CLASS_ENCODER(RGWMetadataLogHistory) + +class RGWMetadataManager { + map handlers; + CephContext *cct; + RGWRados *store; + + // maintain a separate metadata log for each period + std::map md_logs; + // use the current period's log for mutating operations + RGWMetadataLog* current_log = nullptr; + + int find_handler(const string& metadata_key, RGWMetadataHandler **handler, string& entry); + int pre_modify(RGWMetadataHandler *handler, string& section, const string& key, + RGWMetadataLogData& log_data, RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type); + int post_modify(RGWMetadataHandler *handler, const string& section, const string& key, RGWMetadataLogData& log_data, + RGWObjVersionTracker *objv_tracker, int ret); + + string heap_oid(RGWMetadataHandler *handler, const string& key, const obj_version& objv); + int store_in_heap(RGWMetadataHandler *handler, const string& key, bufferlist& bl, + RGWObjVersionTracker *objv_tracker, real_time mtime, + map *pattrs); + int remove_from_heap(RGWMetadataHandler *handler, const string& key, RGWObjVersionTracker *objv_tracker); + int prepare_mutate(RGWRados *store, rgw_pool& pool, const string& oid, + const real_time& mtime, + RGWObjVersionTracker *objv_tracker, + RGWMetadataHandler::sync_type_t sync_mode); + +public: + RGWMetadataManager(CephContext *_cct, RGWRados *_store); + ~RGWMetadataManager(); + + RGWRados* get_store() { return store; } + + int init(const std::string& current_period); + + /// initialize the oldest log period if it doesn't exist, and attach it to + /// our current history + 
RGWPeriodHistory::Cursor init_oldest_log_period(); + + /// read the oldest log period, and return a cursor to it in our existing + /// period history + RGWPeriodHistory::Cursor read_oldest_log_period() const; + + /// read the oldest log period asynchronously and write its result to the + /// given cursor pointer + RGWCoroutine* read_oldest_log_period_cr(RGWPeriodHistory::Cursor *period, + RGWObjVersionTracker *objv) const; + + /// try to advance the oldest log period when the given period is trimmed, + /// using a rados lock to provide atomicity + RGWCoroutine* trim_log_period_cr(RGWPeriodHistory::Cursor period, + RGWObjVersionTracker *objv) const; + + /// find or create the metadata log for the given period + RGWMetadataLog* get_log(const std::string& period); + + int register_handler(RGWMetadataHandler *handler); + + template + int mutate(RGWMetadataHandler *handler, const string& key, + const ceph::real_time& mtime, RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + RGWMetadataHandler::sync_type_t sync_mode, + F&& f); + + RGWMetadataHandler *get_handler(const string& type); + + int put_entry(RGWMetadataHandler *handler, const string& key, bufferlist& bl, bool exclusive, + RGWObjVersionTracker *objv_tracker, real_time mtime, map *pattrs = NULL); + int remove_entry(RGWMetadataHandler *handler, + const string& key, + RGWObjVersionTracker *objv_tracker); + int get(string& metadata_key, Formatter *f); + int put(string& metadata_key, bufferlist& bl, + RGWMetadataHandler::sync_type_t sync_mode, + obj_version *existing_version = NULL); + int remove(string& metadata_key); + + int list_keys_init(const string& section, void **phandle); + int list_keys_init(const string& section, const string& marker, void **phandle); + int list_keys_next(void *handle, int max, list& keys, bool *truncated); + void list_keys_complete(void *handle); + + string get_marker(void *handle); + + void dump_log_entry(cls_log_entry& entry, Formatter *f); + + void get_sections(list& 
sections); + int lock_exclusive(string& metadata_key, timespan duration, string& owner_id); + int unlock(string& metadata_key, string& owner_id); + + int get_log_shard_id(const string& section, const string& key, int *shard_id); + + void parse_metadata_key(const string& metadata_key, string& type, string& entry); +}; + +template +int RGWMetadataManager::mutate(RGWMetadataHandler *handler, const string& key, + const ceph::real_time& mtime, RGWObjVersionTracker *objv_tracker, + RGWMDLogStatus op_type, + RGWMetadataHandler::sync_type_t sync_mode, + F&& f) +{ + string oid; + rgw_pool pool; + + handler->get_pool_and_oid(store, key, pool, oid); + + int ret = prepare_mutate(store, pool, oid, mtime, objv_tracker, sync_mode); + if (ret < 0 || + ret == STATUS_NO_APPLY) { + return ret; + } + + string section; + RGWMetadataLogData log_data; + ret = pre_modify(handler, section, key, log_data, objv_tracker, MDLOG_STATUS_WRITE); + if (ret < 0) { + return ret; + } + + ret = std::forward(f)(); + + /* cascading ret into post_modify() */ + + ret = post_modify(handler, section, key, log_data, objv_tracker, ret); + if (ret < 0) + return ret; + + return 0; +} + +#endif diff --git a/src/rgw/rgw_multi.cc b/src/rgw/rgw_multi.cc new file mode 100644 index 00000000..d055d98b --- /dev/null +++ b/src/rgw/rgw_multi.cc @@ -0,0 +1,384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_xml.h" +#include "rgw_multi.h" +#include "rgw_op.h" + +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + + + +bool MultipartMetaFilter::filter(const string& name, string& key) { + // the length of the suffix so we can skip past it + static const size_t MP_META_SUFFIX_LEN = MP_META_SUFFIX.length(); + + size_t len = name.size(); + + // make sure there's room for suffix plus at least one more + // character + if (len <= MP_META_SUFFIX_LEN) + return false; + + 
size_t pos = name.find(MP_META_SUFFIX, len - MP_META_SUFFIX_LEN); + if (pos == string::npos) + return false; + + pos = name.rfind('.', pos - 1); + if (pos == string::npos) + return false; + + key = name.substr(0, pos); + + return true; +} + + +bool RGWMultiPart::xml_end(const char *el) +{ + RGWMultiPartNumber *num_obj = static_cast(find_first("PartNumber")); + RGWMultiETag *etag_obj = static_cast(find_first("ETag")); + + if (!num_obj || !etag_obj) + return false; + + string s = num_obj->get_data(); + if (s.empty()) + return false; + + num = atoi(s.c_str()); + + s = etag_obj->get_data(); + etag = s; + + return true; +} + +bool RGWMultiCompleteUpload::xml_end(const char *el) { + XMLObjIter iter = find("Part"); + RGWMultiPart *part = static_cast(iter.get_next()); + while (part) { + int num = part->get_num(); + string etag = part->get_etag(); + parts[num] = etag; + part = static_cast(iter.get_next()); + } + return true; +} + + +XMLObj *RGWMultiXMLParser::alloc_obj(const char *el) { + XMLObj *obj = NULL; + if (strcmp(el, "CompleteMultipartUpload") == 0 || + strcmp(el, "MultipartUpload") == 0) { + obj = new RGWMultiCompleteUpload(); + } else if (strcmp(el, "Part") == 0) { + obj = new RGWMultiPart(); + } else if (strcmp(el, "PartNumber") == 0) { + obj = new RGWMultiPartNumber(); + } else if (strcmp(el, "ETag") == 0) { + obj = new RGWMultiETag(); + } + + return obj; +} + +bool is_v2_upload_id(const string& upload_id) +{ + const char *uid = upload_id.c_str(); + + return (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX, sizeof(MULTIPART_UPLOAD_ID_PREFIX) - 1) == 0) || + (strncmp(uid, MULTIPART_UPLOAD_ID_PREFIX_LEGACY, sizeof(MULTIPART_UPLOAD_ID_PREFIX_LEGACY) - 1) == 0); +} + +int list_multipart_parts(RGWRados *store, RGWBucketInfo& bucket_info, + CephContext *cct, + const string& upload_id, + const string& meta_oid, int num_parts, + int marker, map& parts, + int *next_marker, bool *truncated, + bool assume_unsorted) +{ + map parts_map; + map::iterator iter; + + rgw_obj obj; + 
obj.init_ns(bucket_info.bucket, meta_oid, RGW_OBJ_NS_MULTIPART); + obj.set_in_extra_data(true); + + rgw_raw_obj raw_obj; + store->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj); + + bool sorted_omap = is_v2_upload_id(upload_id) && !assume_unsorted; + + parts.clear(); + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(raw_obj); + int ret; + if (sorted_omap) { + string p; + p = "part."; + char buf[32]; + + snprintf(buf, sizeof(buf), "%08d", marker); + p.append(buf); + + ret = sysobj.omap().get_vals(p, num_parts + 1, &parts_map, nullptr); + } else { + ret = sysobj.omap().get_all(&parts_map); + } + if (ret < 0) { + return ret; + } + + int i; + int last_num = 0; + + uint32_t expected_next = marker + 1; + + for (i = 0, iter = parts_map.begin(); + (i < num_parts || !sorted_omap) && iter != parts_map.end(); + ++iter, ++i) { + bufferlist& bl = iter->second; + auto bli = bl.cbegin(); + RGWUploadPartInfo info; + try { + decode(info, bli); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not part info, caught buffer::error" << + dendl; + return -EIO; + } + if (sorted_omap) { + if (info.num != expected_next) { + /* ouch, we expected a specific part num here, but we got a + * different one. Either a part is missing, or it could be a + * case of mixed rgw versions working on the same upload, + * where one gateway doesn't support correctly sorted omap + * keys for multipart upload just assume data is unsorted. 
+ */ + return list_multipart_parts(store, bucket_info, cct, upload_id, + meta_oid, num_parts, marker, parts, + next_marker, truncated, true); + } + expected_next++; + } + if (sorted_omap || + (int)info.num > marker) { + parts[info.num] = info; + last_num = info.num; + } + } + + if (sorted_omap) { + if (truncated) { + *truncated = (iter != parts_map.end()); + } + } else { + /* rebuild a map with only num_parts entries */ + map new_parts; + map::iterator piter; + for (i = 0, piter = parts.begin(); + i < num_parts && piter != parts.end(); + ++i, ++piter) { + new_parts[piter->first] = piter->second; + last_num = piter->first; + } + + if (truncated) { + *truncated = (piter != parts.end()); + } + + parts.swap(new_parts); + } + + if (next_marker) { + *next_marker = last_num; + } + + return 0; +} + +int list_multipart_parts(RGWRados *store, struct req_state *s, + const string& upload_id, + const string& meta_oid, int num_parts, + int marker, map& parts, + int *next_marker, bool *truncated, + bool assume_unsorted) +{ + return list_multipart_parts(store, s->bucket_info, s->cct, upload_id, + meta_oid, num_parts, marker, parts, + next_marker, truncated, assume_unsorted); +} + +int abort_multipart_upload(RGWRados *store, CephContext *cct, + RGWObjectCtx *obj_ctx, RGWBucketInfo& bucket_info, + RGWMPObj& mp_obj) +{ + rgw_obj meta_obj; + meta_obj.init_ns(bucket_info.bucket, mp_obj.get_meta(), RGW_OBJ_NS_MULTIPART); + meta_obj.set_in_extra_data(true); + meta_obj.index_hash_source = mp_obj.get_key(); + cls_rgw_obj_chain chain; + list remove_objs; + map obj_parts; + bool truncated; + int marker = 0; + int ret; + uint64_t parts_accounted_size = 0; + + do { + ret = list_multipart_parts(store, bucket_info, cct, + mp_obj.get_upload_id(), mp_obj.get_meta(), + 1000, marker, obj_parts, &marker, &truncated); + if (ret < 0) { + ldout(cct, 20) << __func__ << ": list_multipart_parts returned " << + ret << dendl; + return (ret == -ENOENT) ? 
-ERR_NO_SUCH_UPLOAD : ret; + } + + for (auto obj_iter = obj_parts.begin(); + obj_iter != obj_parts.end(); + ++obj_iter) { + RGWUploadPartInfo& obj_part = obj_iter->second; + rgw_obj obj; + if (obj_part.manifest.empty()) { + string oid = mp_obj.get_part(obj_iter->second.num); + obj.init_ns(bucket_info.bucket, oid, RGW_OBJ_NS_MULTIPART); + obj.index_hash_source = mp_obj.get_key(); + ret = store->delete_obj(*obj_ctx, bucket_info, obj, 0); + if (ret < 0 && ret != -ENOENT) + return ret; + } else { + store->update_gc_chain(meta_obj, obj_part.manifest, &chain); + RGWObjManifest::obj_iterator oiter = obj_part.manifest.obj_begin(); + if (oiter != obj_part.manifest.obj_end()) { + rgw_obj head; + rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store); + rgw_raw_obj_to_obj(bucket_info.bucket, raw_head, &head); + + rgw_obj_index_key key; + head.key.get_index_key(&key); + remove_objs.push_back(key); + } + } + parts_accounted_size += obj_part.accounted_size; + } + } while (truncated); + + /* use upload id as tag and do it asynchronously */ + ret = store->send_chain_to_gc(chain, mp_obj.get_upload_id(), false); + if (ret < 0) { + ldout(cct, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl; + return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret; + } + + RGWRados::Object del_target(store, bucket_info, *obj_ctx, meta_obj); + RGWRados::Object::Delete del_op(&del_target); + del_op.params.bucket_owner = bucket_info.owner; + del_op.params.versioning_status = 0; + if (!remove_objs.empty()) { + del_op.params.remove_objs = &remove_objs; + } + + del_op.params.abortmp = true; + del_op.params.parts_accounted_size = parts_accounted_size; + + // and also remove the metadata obj + ret = del_op.delete_obj(); + if (ret < 0) { + ldout(cct, 20) << __func__ << ": del_op.delete_obj returned " << + ret << dendl; + } + return (ret == -ENOENT) ? 
-ERR_NO_SUCH_UPLOAD : ret; +} + +int list_bucket_multiparts(RGWRados *store, RGWBucketInfo& bucket_info, + const string& prefix, const string& marker, + const string& delim, + const int& max_uploads, + vector *objs, + map *common_prefixes, bool *is_truncated) +{ + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + MultipartMetaFilter mp_filter; + + list_op.params.prefix = prefix; + list_op.params.delim = delim; + list_op.params.marker = marker; + list_op.params.ns = RGW_OBJ_NS_MULTIPART; + list_op.params.filter = &mp_filter; + + return(list_op.list_objects(max_uploads, objs, common_prefixes, is_truncated)); +} + +int abort_bucket_multiparts(RGWRados *store, CephContext *cct, RGWBucketInfo& bucket_info, + string& prefix, string& delim) +{ + constexpr int max = 1000; + int ret, num_deleted = 0; + vector objs; + RGWObjectCtx obj_ctx(store); + string marker; + bool is_truncated; + + do { + ret = list_bucket_multiparts(store, bucket_info, prefix, marker, delim, + max, &objs, nullptr, &is_truncated); + if (ret < 0) { + ldout(store->ctx(), 0) << __func__ << + " ERROR : calling list_bucket_multiparts; ret=" << ret << + "; bucket=\"" << bucket_info.bucket << "\"; prefix=\"" << + prefix << "\"; delim=\"" << delim << "\"" << dendl; + return ret; + } + ldout(store->ctx(), 20) << __func__ << + " INFO: aborting and cleaning up multipart upload(s); bucket=\"" << + bucket_info.bucket << "\"; objs.size()=" << objs.size() << + "; is_truncated=" << is_truncated << dendl; + + if (!objs.empty()) { + RGWMPObj mp; + for (const auto& obj : objs) { + rgw_obj_key key(obj.key); + if (!mp.from_meta(key.name)) + continue; + ret = abort_multipart_upload(store, cct, &obj_ctx, bucket_info, mp); + if (ret < 0) { + // we're doing a best-effort; if something cannot be found, + // log it and keep moving forward + if (ret != -ENOENT && ret != -ERR_NO_SUCH_UPLOAD) { + ldout(store->ctx(), 0) << __func__ << + " ERROR : failed to abort and clean-up multipart upload 
\"" << + key.get_oid() << "\"" << dendl; + return ret; + } else { + ldout(store->ctx(), 10) << __func__ << + " NOTE : unable to find part(s) of " + "aborted multipart upload of \"" << key.get_oid() << + "\" for cleaning up" << dendl; + } + } + num_deleted++; + } + if (num_deleted) { + ldout(store->ctx(), 0) << __func__ << + " WARNING : aborted " << num_deleted << + " incomplete multipart uploads" << dendl; + } + } + } while (is_truncated); + + return 0; +} diff --git a/src/rgw/rgw_multi.h b/src/rgw/rgw_multi.h new file mode 100644 index 00000000..8c682888 --- /dev/null +++ b/src/rgw/rgw_multi.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_MULTI_H +#define CEPH_RGW_MULTI_H + +#include +#include "rgw_xml.h" +#include "rgw_rados.h" + +#define MULTIPART_UPLOAD_ID_PREFIX_LEGACY "2/" +#define MULTIPART_UPLOAD_ID_PREFIX "2~" // must contain a unique char that may not come up in gen_rand_alpha() + +class RGWMultiCompleteUpload : public XMLObj +{ +public: + RGWMultiCompleteUpload() {} + ~RGWMultiCompleteUpload() override {} + bool xml_end(const char *el) override; + + std::map parts; +}; + +class RGWMultiPart : public XMLObj +{ + string etag; + int num; +public: + RGWMultiPart() : num(0) {} + ~RGWMultiPart() override {} + bool xml_end(const char *el) override; + + string& get_etag() { return etag; } + int get_num() { return num; } +}; + +class RGWMultiPartNumber : public XMLObj +{ +public: + RGWMultiPartNumber() {} + ~RGWMultiPartNumber() override {} +}; + +class RGWMultiETag : public XMLObj +{ +public: + RGWMultiETag() {} + ~RGWMultiETag() override {} +}; + +class RGWMultiXMLParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override; +public: + RGWMultiXMLParser() {} + ~RGWMultiXMLParser() override {} +}; + +/** + * A filter to a) test whether an object name is a multipart meta + * object, and b) filter out just the key used to determine the bucket + * index 
shard. + * + * Objects for multipart meta have names adorned with an upload id and + * other elements -- specifically a ".", MULTIPART_UPLOAD_ID_PREFIX, + * unique id, and MP_META_SUFFIX. This filter will return true when + * the name provided is such. It will also extract the key used for + * bucket index shard calculation from the adorned name. + */ +class MultipartMetaFilter : public RGWAccessListFilter { +public: + MultipartMetaFilter() {} + + /** + * @param name [in] The object name as it appears in the bucket index. + * @param key [out] An output parameter that will contain the bucket + * index key if this entry is in the form of a multipart meta object. + * @return true if the name provided is in the form of a multipart meta + * object, false otherwise + */ + bool filter(const string& name, string& key) override; +}; // class MultipartMetaFilter + +extern bool is_v2_upload_id(const string& upload_id); + +extern int list_multipart_parts(RGWRados *store, RGWBucketInfo& bucket_info, + CephContext *cct, + const string& upload_id, + const string& meta_oid, int num_parts, + int marker, map& parts, + int *next_marker, bool *truncated, + bool assume_unsorted = false); + +extern int list_multipart_parts(RGWRados *store, struct req_state *s, + const string& upload_id, + const string& meta_oid, int num_parts, + int marker, map& parts, + int *next_marker, bool *truncated, + bool assume_unsorted = false); + +extern int abort_multipart_upload(RGWRados *store, CephContext *cct, RGWObjectCtx *obj_ctx, + RGWBucketInfo& bucket_info, RGWMPObj& mp_obj); + +extern int list_bucket_multiparts(RGWRados *store, RGWBucketInfo& bucket_info, + const string& prefix, + const string& marker, + const string& delim, + const int& max_uploads, + vector *objs, + map *common_prefixes, bool *is_truncated); + +extern int abort_bucket_multiparts(RGWRados *store, CephContext *cct, RGWBucketInfo& bucket_info, + string& prefix, string& delim); +#endif diff --git a/src/rgw/rgw_multi_del.cc 
b/src/rgw/rgw_multi_del.cc new file mode 100644 index 00000000..2faa8069 --- /dev/null +++ b/src/rgw/rgw_multi_del.cc @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include + +#include "include/types.h" + +#include "rgw_xml.h" +#include "rgw_multi_del.h" + +#define dout_subsys ceph_subsys_rgw + + + +bool RGWMultiDelObject::xml_end(const char *el) +{ + RGWMultiDelKey *key_obj = static_cast(find_first("Key")); + RGWMultiDelVersionId *vid = static_cast(find_first("VersionId")); + + if (!key_obj) + return false; + + string s = key_obj->get_data(); + if (s.empty()) + return false; + + key = s; + + if (vid) { + version_id = vid->get_data(); + } + + return true; +} + +bool RGWMultiDelDelete::xml_end(const char *el) { + RGWMultiDelQuiet *quiet_set = static_cast(find_first("Quiet")); + if (quiet_set) { + string quiet_val = quiet_set->get_data(); + quiet = (strcasecmp(quiet_val.c_str(), "true") == 0); + } + + XMLObjIter iter = find("Object"); + RGWMultiDelObject *object = static_cast(iter.get_next()); + while (object) { + const string& key = object->get_key(); + const string& instance = object->get_version_id(); + rgw_obj_key k(key, instance); + objects.push_back(k); + object = static_cast(iter.get_next()); + } + return true; +} + +XMLObj *RGWMultiDelXMLParser::alloc_obj(const char *el) { + XMLObj *obj = NULL; + if (strcmp(el, "Delete") == 0) { + obj = new RGWMultiDelDelete(); + } else if (strcmp(el, "Quiet") == 0) { + obj = new RGWMultiDelQuiet(); + } else if (strcmp(el, "Object") == 0) { + obj = new RGWMultiDelObject (); + } else if (strcmp(el, "Key") == 0) { + obj = new RGWMultiDelKey(); + } else if (strcmp(el, "VersionId") == 0) { + obj = new RGWMultiDelVersionId(); + } + + return obj; +} + diff --git a/src/rgw/rgw_multi_del.h b/src/rgw/rgw_multi_del.h new file mode 100644 index 00000000..1ac8e491 --- /dev/null +++ b/src/rgw/rgw_multi_del.h @@ -0,0 +1,66 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_MULTI_DELETE_H_ +#define RGW_MULTI_DELETE_H_ + +#include +#include "rgw_xml.h" +#include "rgw_common.h" + +class RGWMultiDelDelete : public XMLObj +{ +public: + RGWMultiDelDelete() :quiet(false) {} + ~RGWMultiDelDelete() override {} + bool xml_end(const char *el) override; + + std::vector objects; + bool quiet; + bool is_quiet() { return quiet; } +}; + +class RGWMultiDelQuiet : public XMLObj +{ +public: + RGWMultiDelQuiet() {} + ~RGWMultiDelQuiet() override {} +}; + +class RGWMultiDelObject : public XMLObj +{ + string key; + string version_id; +public: + RGWMultiDelObject() {} + ~RGWMultiDelObject() override {} + bool xml_end(const char *el) override; + + const string& get_key() { return key; } + const string& get_version_id() { return version_id; } +}; + +class RGWMultiDelKey : public XMLObj +{ +public: + RGWMultiDelKey() {} + ~RGWMultiDelKey() override {} +}; + +class RGWMultiDelVersionId : public XMLObj +{ +public: + RGWMultiDelVersionId() {} + ~RGWMultiDelVersionId() override {} +}; + +class RGWMultiDelXMLParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override; +public: + RGWMultiDelXMLParser() {} + ~RGWMultiDelXMLParser() override {} +}; + + +#endif diff --git a/src/rgw/rgw_multiparser.cc b/src/rgw/rgw_multiparser.cc new file mode 100644 index 00000000..63d70d72 --- /dev/null +++ b/src/rgw/rgw_multiparser.cc @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_multi.h" + +#define dout_subsys ceph_subsys_rgw + + +int main(int argc, char **argv) { + RGWMultiXMLParser parser; + + if (!parser.init()) + exit(1); + + char buf[1024]; + + for (;;) { + int done; + int len; + + len = fread(buf, 1, sizeof(buf), stdin); + if (ferror(stdin)) { + fprintf(stderr, "Read error\n"); + exit(-1); + } + done = feof(stdin); 
+ + bool result = parser.parse(buf, len, done); + if (!result) { + cerr << "failed to parse!" << std::endl; + } + + if (done) + break; + } + + exit(0); +} + diff --git a/src/rgw/rgw_notify.cc b/src/rgw/rgw_notify.cc new file mode 100644 index 00000000..2104031a --- /dev/null +++ b/src/rgw/rgw_notify.cc @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_notify.h" +#include +#include +#include "rgw_pubsub.h" +#include "rgw_pubsub_push.h" +#include "rgw_perf_counters.h" +#include "common/dout.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::notify { + +// populate record from request +void populate_record_from_request(const req_state *s, + const rgw_obj_key& key, + uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + EventType event_type, + rgw_pubsub_s3_record& record) { + record.eventTime = mtime; + record.eventName = to_string(event_type); + record.userIdentity = s->user->user_id.id; // user that triggered the change + record.x_amz_request_id = s->req_id; // request ID of the original change + record.x_amz_id_2 = s->host_id; // RGW on which the change was made + // configurationId is filled from notification configuration + record.bucket_name = s->bucket_name; + record.bucket_ownerIdentity = s->bucket_owner.get_id().id; + record.bucket_arn = to_string(rgw::ARN(s->bucket)); + record.object_key = key.name; + record.object_size = size; + record.object_etag = etag; + record.object_versionId = key.instance; + // use timestamp as per key sequence id (hex encoded) + const utime_t ts(real_clock::now()); + boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t), + std::back_inserter(record.object_sequencer)); + set_event_id(record.id, etag, ts); + record.bucket_id = s->bucket.bucket_id; + // pass meta data + record.x_meta_map = s->info.x_meta_map; + // pass tags + record.tags = s->tagset.get_tags(); + // opaque data will be filled from 
topic configuration +} + +bool match(const rgw_pubsub_topic_filter& filter, const req_state* s, EventType event) { + if (!::match(filter.events, event)) { + return false; + } + if (!::match(filter.s3_filter.key_filter, s->object.name)) { + return false; + } + if (!::match(filter.s3_filter.metadata_filter, s->info.x_meta_map)) { + return false; + } + if (!::match(filter.s3_filter.tag_filter, s->tagset.get_tags())) { + return false; + } + return true; +} + +int publish(const req_state* s, + const rgw_obj_key& key, + uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + EventType event_type, + RGWRados* store) { + RGWUserPubSub ps_user(store, s->user->user_id); + RGWUserPubSub::Bucket ps_bucket(&ps_user, s->bucket); + rgw_pubsub_bucket_topics bucket_topics; + auto rc = ps_bucket.get_topics(&bucket_topics); + if (rc < 0) { + // failed to fetch bucket topics + return rc; + } + rgw_pubsub_s3_record record; + populate_record_from_request(s, key, size, mtime, etag, event_type, record); + bool event_handled = false; + bool event_should_be_handled = false; + for (const auto& bucket_topic : bucket_topics.topics) { + const rgw_pubsub_topic_filter& topic_filter = bucket_topic.second; + const rgw_pubsub_topic& topic_cfg = topic_filter.topic; + if (!match(topic_filter, s, event_type)) { + // topic does not apply to req_state + continue; + } + event_should_be_handled = true; + record.configurationId = topic_filter.s3_id; + record.opaque_data = topic_cfg.opaque_data; + ldout(s->cct, 20) << "notification: '" << topic_filter.s3_id << + "' on topic: '" << topic_cfg.dest.arn_topic << + "' and bucket: '" << s->bucket.name << + "' (unique topic: '" << topic_cfg.name << + "') apply to event of type: '" << to_string(event_type) << "'" << dendl; + try { + // TODO add endpoint LRU cache + const auto push_endpoint = RGWPubSubEndpoint::create(topic_cfg.dest.push_endpoint, + topic_cfg.dest.arn_topic, + RGWHTTPArgs(topic_cfg.dest.push_endpoint_args), + s->cct); + const 
std::string push_endpoint_str = push_endpoint->to_str(); + ldout(s->cct, 20) << "push endpoint created: " << push_endpoint_str << dendl; + auto rc = push_endpoint->send_to_completion_async(s->cct, record, s->yield); + if (rc < 0) { + // bail out on first error + // TODO: add conf for bail out policy + ldout(s->cct, 1) << "push to endpoint " << push_endpoint_str << " failed, with error: " << rc << dendl; + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed); + return rc; + } + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok); + ldout(s->cct, 20) << "successfull push to endpoint " << push_endpoint_str << dendl; + event_handled = true; + } catch (const RGWPubSubEndpoint::configuration_error& e) { + ldout(s->cct, 1) << "ERROR: failed to create push endpoint: " + << topic_cfg.dest.push_endpoint << " due to: " << e.what() << dendl; + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed); + return -EINVAL; + } + } + + if (event_should_be_handled) { + // not counting events with no notifications or events that are filtered + // counting a single event, regardless of the number of notifications it sends + if (perfcounter) perfcounter->inc(l_rgw_pubsub_event_triggered); + if (!event_handled) { + // all notifications for this event failed + if (perfcounter) perfcounter->inc(l_rgw_pubsub_event_lost); + } + } + + return 0; +} + +} + diff --git a/src/rgw/rgw_notify.h b/src/rgw/rgw_notify.h new file mode 100644 index 00000000..5b480c0e --- /dev/null +++ b/src/rgw/rgw_notify.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include "common/ceph_time.h" +#include "rgw_notify_event_type.h" + +// forward declarations +class RGWRados; +class req_state; +struct rgw_obj_key; + +namespace rgw::notify { + +// publish notification +int publish(const req_state* s, + const rgw_obj_key& key, + uint64_t size, + const ceph::real_time& mtime, + const std::string& etag, + 
EventType event_type, + RGWRados* store); + +} + diff --git a/src/rgw/rgw_notify_event_type.cc b/src/rgw/rgw_notify_event_type.cc new file mode 100644 index 00000000..10c77c28 --- /dev/null +++ b/src/rgw/rgw_notify_event_type.cc @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_notify_event_type.h" +#include "include/str_list.h" + +namespace rgw::notify { + + std::string to_string(EventType t) { + switch (t) { + case ObjectCreated: + return "s3:ObjectCreated:*"; + case ObjectCreatedPut: + return "s3:ObjectCreated:Put"; + case ObjectCreatedPost: + return "s3:ObjectCreated:Post"; + case ObjectCreatedCopy: + return "s3:ObjectCreated:Copy"; + case ObjectCreatedCompleteMultipartUpload: + return "s3:ObjectCreated:CompleteMultipartUpload"; + case ObjectRemoved: + return "s3:ObjectRemoved:*"; + case ObjectRemovedDelete: + return "s3:ObjectRemoved:Delete"; + case ObjectRemovedDeleteMarkerCreated: + return "s3:ObjectRemoved:DeleteMarkerCreated"; + case UnknownEvent: + return "s3:UnknownEvet"; + } + return "s3:UnknownEvent"; + } + + std::string to_ceph_string(EventType t) { + switch (t) { + case ObjectCreated: + case ObjectCreatedPut: + case ObjectCreatedPost: + case ObjectCreatedCopy: + case ObjectCreatedCompleteMultipartUpload: + return "OBJECT_CREATE"; + case ObjectRemovedDelete: + return "OBJECT_DELETE"; + case ObjectRemovedDeleteMarkerCreated: + return "DELETE_MARKER_CREATE"; + case ObjectRemoved: + case UnknownEvent: + return "UNKNOWN_EVENT"; + } + return "UNKNOWN_EVENT"; + } + + EventType from_string(const std::string& s) { + if (s == "s3:ObjectCreated:*" || s == "OBJECT_CREATE") + return ObjectCreated; + if (s == "s3:ObjectCreated:Put") + return ObjectCreatedPut; + if (s == "s3:ObjectCreated:Post") + return ObjectCreatedPost; + if (s == "s3:ObjectCreated:Copy") + return ObjectCreatedCopy; + if (s == "s3:ObjectCreated:CompleteMultipartUpload") + return 
ObjectCreatedCompleteMultipartUpload; + if (s == "s3:ObjectRemoved:*") + return ObjectRemoved; + if (s == "s3:ObjectRemoved:Delete" || s == "OBJECT_DELETE") + return ObjectRemovedDelete; + if (s == "s3:ObjectRemoved:DeleteMarkerCreated" || s == "DELETE_MARKER_CREATE") + return ObjectRemovedDeleteMarkerCreated; + return UnknownEvent; + } + +bool operator==(EventType lhs, EventType rhs) { + return lhs & rhs; +} + +void from_string_list(const std::string& string_list, EventTypeList& event_list) { + event_list.clear(); + ceph::for_each_substr(string_list, ",", [&event_list] (auto token) { + event_list.push_back(rgw::notify::from_string(std::string(token.begin(), token.end()))); + }); +} +} diff --git a/src/rgw/rgw_notify_event_type.h b/src/rgw/rgw_notify_event_type.h new file mode 100644 index 00000000..0d86bf3f --- /dev/null +++ b/src/rgw/rgw_notify_event_type.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +#include +#include + +namespace rgw::notify { + enum EventType { + ObjectCreated = 0xF, + ObjectCreatedPut = 0x1, + ObjectCreatedPost = 0x2, + ObjectCreatedCopy = 0x4, + ObjectCreatedCompleteMultipartUpload = 0x8, + ObjectRemoved = 0xF0, + ObjectRemovedDelete = 0x10, + ObjectRemovedDeleteMarkerCreated = 0x20, + UnknownEvent = 0x100 + }; + + using EventTypeList = std::vector; + + // two event types are considered equal if their bits intersect + bool operator==(EventType lhs, EventType rhs); + + std::string to_string(EventType t); + + std::string to_ceph_string(EventType t); + + EventType from_string(const std::string& s); + + // create a vector of event types from comma separated list of event types + void from_string_list(const std::string& string_list, EventTypeList& event_list); +} + diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc new file mode 100644 index 00000000..93258b42 --- /dev/null +++ b/src/rgw/rgw_object_expirer.cc @@ -0,0 +1,107 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_object_expirer_core.h" + +#define dout_subsys ceph_subsys_rgw + +static RGWRados *store = NULL; + +class StoreDestructor { + RGWRados *store; + +public: + explicit StoreDestructor(RGWRados *_s) : store(_s) {} + ~StoreDestructor() { + if (store) { + RGWStoreManager::close_storage(store); + } + } +}; + +static void usage() +{ + generic_server_usage(); +} + +int main(const int argc, const char **argv) +{ + vector args; + argv_to_vec(argc, argv, args); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS, "rgw_data"); + + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } + } + + if (g_conf()->daemonize) { + global_init_daemonize(g_ceph_context); + } + + common_init_finish(g_ceph_context); + + store = RGWStoreManager::get_storage(g_ceph_context, false, false, false, false, false); + if (!store) { + std::cerr << "couldn't init storage provider" << std::endl; + return EIO; + } + + rgw_user_init(store); + rgw_bucket_init(store->meta_mgr); + + /* Guard to not forget about closing the rados store. 
*/ + StoreDestructor store_dtor(store); + + RGWObjectExpirer objexp(store); + objexp.start_processor(); + + const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0); + while (true) { + interval.sleep(); + } + + /* unreachable */ + + return EXIT_SUCCESS; +} diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc new file mode 100644 index 00000000..b2e302ba --- /dev/null +++ b/src/rgw/rgw_object_expirer_core.cc @@ -0,0 +1,294 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" +#include "rgw_object_expirer_core.h" + +#include "services/svc_sys_obj.h" + +#include "cls/lock/cls_lock_client.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +static string objexp_lock_name = "gc_process"; + +int RGWObjectExpirer::init_bucket_info(const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + RGWBucketInfo& bucket_info) +{ + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + /* + * XXX Here's where it gets tricky. We went to all the trouble of + * punching the tenant through the objexp_hint_entry, but now we + * find that our instances do not actually have tenants. They are + * unique thanks to IDs. So the tenant string is not needed... + + * XXX reloaded: it turns out tenants were needed after all since bucket ids + * are ephemeral, good call encoding tenant info! 
+ */ + + return store->get_bucket_info(obj_ctx, tenant_name, bucket_name, + bucket_info, nullptr, nullptr); + +} + +int RGWObjectExpirer::garbage_single_object(objexp_hint_entry& hint) +{ + RGWBucketInfo bucket_info; + + int ret = init_bucket_info(hint.tenant, hint.bucket_name, + hint.bucket_id, bucket_info); + if (-ENOENT == ret) { + ldout(store->ctx(), 15) << "NOTICE: cannot find bucket = " \ + << hint.bucket_name << ". The object must be already removed" << dendl; + return -ERR_PRECONDITION_FAILED; + } else if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: could not init bucket = " \ + << hint.bucket_name << "due to ret = " << ret << dendl; + return ret; + } + + RGWObjectCtx rctx(store); + + rgw_obj_key key = hint.obj_key; + if (key.instance.empty()) { + key.instance = "null"; + } + + rgw_obj obj(bucket_info.bucket, key); + store->set_atomic(&rctx, obj); + ret = store->delete_obj(rctx, bucket_info, obj, + bucket_info.versioning_status(), 0, hint.exp_time); + + return ret; +} + +void RGWObjectExpirer::garbage_chunk(list& entries, /* in */ + bool& need_trim) /* out */ +{ + need_trim = false; + + for (list::iterator iter = entries.begin(); + iter != entries.end(); + ++iter) + { + objexp_hint_entry hint; + ldout(store->ctx(), 15) << "got removal hint for: " << iter->key_ts.sec() \ + << " - " << iter->key_ext << dendl; + + int ret = store->objexp_hint_parse(*iter, hint); + if (ret < 0) { + ldout(store->ctx(), 1) << "cannot parse removal hint for " << hint.obj_key << dendl; + continue; + } + + /* PRECOND_FAILED simply means that our hint is not valid. + * We can silently ignore that and move forward. 
*/ + ret = garbage_single_object(hint); + if (ret == -ERR_PRECONDITION_FAILED) { + ldout(store->ctx(), 15) << "not actual hint for object: " << hint.obj_key << dendl; + } else if (ret < 0) { + ldout(store->ctx(), 1) << "cannot remove expired object: " << hint.obj_key << dendl; + } + + need_trim = true; + } + + return; +} + +void RGWObjectExpirer::trim_chunk(const string& shard, + const utime_t& from, + const utime_t& to, + const string& from_marker, + const string& to_marker) +{ + ldout(store->ctx(), 20) << "trying to trim removal hints to=" << to + << ", to_marker=" << to_marker << dendl; + + real_time rt_from = from.to_real_time(); + real_time rt_to = to.to_real_time(); + + int ret = store->objexp_hint_trim(shard, rt_from, rt_to, + from_marker, to_marker); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR during trim: " << ret << dendl; + } + + return; +} + +bool RGWObjectExpirer::process_single_shard(const string& shard, + const utime_t& last_run, + const utime_t& round_start) +{ + string marker; + string out_marker; + bool truncated = false; + bool done = true; + + CephContext *cct = store->ctx(); + int num_entries = cct->_conf->rgw_objexp_chunk_size; + + int max_secs = cct->_conf->rgw_objexp_gc_interval; + utime_t end = ceph_clock_now(); + end += max_secs; + + rados::cls::lock::Lock l(objexp_lock_name); + + utime_t time(max_secs, 0); + l.set_duration(time); + + int ret = l.lock_exclusive(&store->objexp_pool_ctx, shard); + if (ret == -EBUSY) { /* already locked by another processor */ + dout(5) << __func__ << "(): failed to acquire lock on " << shard << dendl; + return false; + } + + do { + real_time rt_last = last_run.to_real_time(); + real_time rt_start = round_start.to_real_time(); + + list entries; + ret = store->objexp_hint_list(shard, rt_last, rt_start, + num_entries, marker, entries, + &out_marker, &truncated); + if (ret < 0) { + ldout(cct, 10) << "cannot get removal hints from shard: " << shard + << dendl; + continue; + } + + bool need_trim; + 
garbage_chunk(entries, need_trim); + + if (need_trim) { + trim_chunk(shard, last_run, round_start, marker, out_marker); + } + + utime_t now = ceph_clock_now(); + if (now >= end) { + done = false; + break; + } + + marker = out_marker; + } while (truncated); + + l.unlock(&store->objexp_pool_ctx, shard); + return done; +} + +/* Returns true if all shards have been processed successfully. */ +bool RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, + const utime_t& round_start) +{ + CephContext * const cct = store->ctx(); + int num_shards = cct->_conf->rgw_objexp_hints_num_shards; + bool all_done = true; + + for (int i = 0; i < num_shards; i++) { + string shard; + store->objexp_get_shard(i, shard); + + ldout(store->ctx(), 20) << "processing shard = " << shard << dendl; + + if (! process_single_shard(shard, last_run, round_start)) { + all_done = false; + } + } + + return all_done; +} + +bool RGWObjectExpirer::going_down() +{ + return down_flag; +} + +void RGWObjectExpirer::start_processor() +{ + worker = new OEWorker(store->ctx(), this); + worker->create("rgw_obj_expirer"); +} + +void RGWObjectExpirer::stop_processor() +{ + down_flag = true; + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = NULL; +} + +void *RGWObjectExpirer::OEWorker::entry() { + utime_t last_run; + do { + utime_t start = ceph_clock_now(); + ldout(cct, 2) << "object expiration: start" << dendl; + if (oe->inspect_all_shards(last_run, start)) { + /* All shards have been processed properly. Next time we can start + * from this moment. 
*/ + last_run = start; + } + ldout(cct, 2) << "object expiration: stop" << dendl; + + + if (oe->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + int secs = cct->_conf->rgw_objexp_gc_interval; + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + lock.Lock(); + cond.WaitInterval(lock, utime_t(secs, 0)); + lock.Unlock(); + } while (!oe->going_down()); + + return NULL; +} + +void RGWObjectExpirer::OEWorker::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} + diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h new file mode 100644 index 00000000..c3caff5c --- /dev/null +++ b/src/rgw/rgw_object_expirer_core.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OBJEXP_H +#define CEPH_OBJEXP_H + +#include +#include +#include +#include +#include + +#include "auth/Crypto.h" + +#include "common/armor.h" +#include "common/ceph_json.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "global/global_init.h" + +#include "include/utime.h" +#include "include/str_list.h" + +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_log.h" +#include "rgw_formats.h" +#include "rgw_usage.h" + +class RGWObjectExpirer { +protected: + RGWRados *store; + + int init_bucket_info(const std::string& tenant_name, + const std::string& bucket_name, + const std::string& bucket_id, + RGWBucketInfo& bucket_info); + + class OEWorker : public Thread { + CephContext *cct; + RGWObjectExpirer *oe; + Mutex lock; + Cond cond; + + public: + OEWorker(CephContext * const cct, + RGWObjectExpirer * const oe) + : cct(cct), + oe(oe), + lock("OEWorker") { + } + + void *entry() override; + void stop(); + }; + 
+ OEWorker *worker{nullptr}; + std::atomic down_flag = { false }; + +public: + explicit RGWObjectExpirer(RGWRados *_store) + : store(_store), worker(NULL) { + } + ~RGWObjectExpirer() { + stop_processor(); + } + + int garbage_single_object(objexp_hint_entry& hint); + + void garbage_chunk(std::list& entries, /* in */ + bool& need_trim); /* out */ + + void trim_chunk(const std::string& shard, + const utime_t& from, + const utime_t& to, + const string& from_marker, + const string& to_marker); + + bool process_single_shard(const std::string& shard, + const utime_t& last_run, + const utime_t& round_start); + + bool inspect_all_shards(const utime_t& last_run, + const utime_t& round_start); + + bool going_down(); + void start_processor(); + void stop_processor(); +}; +#endif /* CEPH_OBJEXP_H */ diff --git a/src/rgw/rgw_object_lock.cc b/src/rgw/rgw_object_lock.cc new file mode 100644 index 00000000..69da8881 --- /dev/null +++ b/src/rgw/rgw_object_lock.cc @@ -0,0 +1,96 @@ +#include "rgw_object_lock.h" + +void DefaultRetention::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Mode", mode, obj, true); + if (mode.compare("GOVERNANCE") != 0 && mode.compare("COMPLIANCE") != 0) { + throw RGWXMLDecoder::err("bad Mode in lock rule"); + } + bool days_exist = RGWXMLDecoder::decode_xml("Days", days, obj); + bool years_exist = RGWXMLDecoder::decode_xml("Years", years, obj); + if ((days_exist && years_exist) || (!days_exist && !years_exist)) { + throw RGWXMLDecoder::err("either Days or Years must be specified, but not both"); + } +} + +void DefaultRetention::dump_xml(Formatter *f) const { + encode_xml("Mode", mode, f); + if (days > 0) { + encode_xml("Days", days, f); + } else { + encode_xml("Years", years, f); + } +} + +void ObjectLockRule::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("DefaultRetention", defaultRetention, obj, true); +} + +void ObjectLockRule::dump_xml(Formatter *f) const { + encode_xml("DefaultRetention", defaultRetention, f); +} + +void 
RGWObjectLock::decode_xml(XMLObj *obj) { + string enabled_str; + RGWXMLDecoder::decode_xml("ObjectLockEnabled", enabled_str, obj, true); + if (enabled_str.compare("Enabled") != 0) { + throw RGWXMLDecoder::err("invalid ObjectLockEnabled value"); + } else { + enabled = true; + } + rule_exist = RGWXMLDecoder::decode_xml("Rule", rule, obj); +} + +void RGWObjectLock::dump_xml(Formatter *f) const { + if (enabled) { + encode_xml("ObjectLockEnabled", "Enabled", f); + } + if (rule_exist) { + encode_xml("Rule", rule, f); + } +} + +ceph::real_time RGWObjectLock::get_lock_until_date(const ceph::real_time& mtime) const { + if (!rule_exist) { + return ceph::real_time(); + } + int days = get_days(); + if (days <= 0) { + days = get_years()*365; + } + return mtime + make_timespan(days*24*60*60); +} + +void RGWObjectRetention::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Mode", mode, obj, true); + if (mode.compare("GOVERNANCE") != 0 && mode.compare("COMPLIANCE") != 0) { + throw RGWXMLDecoder::err("bad Mode in retention"); + } + string date_str; + RGWXMLDecoder::decode_xml("RetainUntilDate", date_str, obj, true); + boost::optional date = ceph::from_iso_8601(date_str); + if (boost::none == date) { + throw RGWXMLDecoder::err("invalid RetainUntilDate value"); + } + retain_until_date = *date; +} + +void RGWObjectRetention::dump_xml(Formatter *f) const { + encode_xml("Mode", mode, f); + string date = ceph::to_iso_8601(retain_until_date); + encode_xml("RetainUntilDate", date, f); +} + +void RGWObjectLegalHold::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Status", status, obj, true); + if (status.compare("ON") != 0 && status.compare("OFF") != 0) { + throw RGWXMLDecoder::err("bad status in legal hold"); + } +} + +void RGWObjectLegalHold::dump_xml(Formatter *f) const { + encode_xml("Status", status, f); +} + +bool RGWObjectLegalHold::is_enabled() const { + return status.compare("ON") == 0; +} diff --git a/src/rgw/rgw_object_lock.h b/src/rgw/rgw_object_lock.h new file mode 
100644 index 00000000..63990d62 --- /dev/null +++ b/src/rgw/rgw_object_lock.h @@ -0,0 +1,221 @@ +#ifndef CEPH_RGW_OBJECT_LOCK_H +#define CEPH_RGW_OBJECT_LOCK_H + +#include +#include "common/ceph_time.h" +#include "common/iso_8601.h" +#include "rgw_xml.h" + +class DefaultRetention +{ +protected: + string mode; + int days; + int years; + +public: + DefaultRetention(): days(0), years(0) {}; + + int get_days() const { + return days; + } + + int get_years() const { + return years; + } + + string get_mode() const { + return mode; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(mode, bl); + encode(days, bl); + encode(years, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(mode, bl); + decode(days, bl); + decode(years, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(DefaultRetention) + +class ObjectLockRule +{ +protected: + DefaultRetention defaultRetention; +public: + int get_days() const { + return defaultRetention.get_days(); + } + + int get_years() const { + return defaultRetention.get_years(); + } + + string get_mode() const { + return defaultRetention.get_mode(); + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(defaultRetention, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(defaultRetention, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(ObjectLockRule) + +class RGWObjectLock +{ +protected: + bool enabled; + bool rule_exist; + ObjectLockRule rule; + +public: + RGWObjectLock():enabled(true), rule_exist(false) {} + + int get_days() const { + return rule.get_days(); + } + + int get_years() const { + return rule.get_years(); + } + + string get_mode() const { + return rule.get_mode(); + } + + bool retention_period_valid() const { + 
// DefaultRetention requires either Days or Years. + // You can't specify both at the same time. + // see https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTObjectLockConfiguration.html + return (get_years() > 0) != (get_days() > 0); + } + + bool has_rule() const { + return rule_exist; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(enabled, bl); + encode(rule_exist, bl); + if (rule_exist) { + encode(rule, bl); + } + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(enabled, bl); + decode(rule_exist, bl); + if (rule_exist) { + decode(rule, bl); + } + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + ceph::real_time get_lock_until_date(const ceph::real_time& mtime) const; +}; +WRITE_CLASS_ENCODER(RGWObjectLock) + +class RGWObjectRetention +{ +protected: + string mode; + ceph::real_time retain_until_date; +public: + RGWObjectRetention() {} + RGWObjectRetention(string _mode, ceph::real_time _date): mode(_mode), retain_until_date(_date) {} + + void set_mode(string _mode) { + mode = _mode; + } + + string get_mode() const { + return mode; + } + + void set_retain_until_date(ceph::real_time _retain_until_date) { + retain_until_date = _retain_until_date; + } + + ceph::real_time get_retain_until_date() const { + return retain_until_date; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(mode, bl); + encode(retain_until_date, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(mode, bl); + decode(retain_until_date, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWObjectRetention) + +class RGWObjectLegalHold +{ +protected: + string status; +public: + RGWObjectLegalHold() {} + RGWObjectLegalHold(string _status): status(_status) {} + void set_status(string _status) { + 
status = _status; + } + + string get_status() const { + return status; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(status, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(status, bl); + DECODE_FINISH(bl); + } + + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + bool is_enabled() const; +}; +WRITE_CLASS_ENCODER(RGWObjectLegalHold) +#endif //CEPH_RGW_OBJECT_LOCK_H diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc new file mode 100644 index 00000000..c2501b78 --- /dev/null +++ b/src/rgw/rgw_op.cc @@ -0,0 +1,7942 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include "include/scope_guard.h" +#include "common/Clock.h" +#include "common/armor.h" +#include "common/errno.h" +#include "common/mime.h" +#include "common/utf8.h" +#include "common/ceph_json.h" +#include "common/static_ptr.h" + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_acl_swift.h" +#include "rgw_aio_throttle.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_log.h" +#include "rgw_multi.h" +#include "rgw_multi_del.h" +#include "rgw_cors.h" +#include "rgw_cors_s3.h" +#include "rgw_rest_conn.h" +#include "rgw_rest_s3.h" +#include "rgw_tar.h" +#include "rgw_client_io.h" +#include "rgw_compression.h" +#include "rgw_role.h" +#include "rgw_tag_s3.h" +#include "rgw_putobj_processor.h" +#include "rgw_crypt.h" +#include "rgw_perf_counters.h" +#include "rgw_notify.h" +#include "rgw_notify_event_type.h" + +#include "services/svc_zone.h" +#include "services/svc_quota.h" +#include "services/svc_sys_obj.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rgw/cls_rgw_client.h" + + +#include "include/ceph_assert.h" + 
+#include "compressor/Compressor.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/rgw_op.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace librados; +using ceph::crypto::MD5; +using boost::optional; +using boost::none; + +using rgw::ARN; +using rgw::IAM::Effect; +using rgw::IAM::Policy; + +static string mp_ns = RGW_OBJ_NS_MULTIPART; +static string shadow_ns = RGW_OBJ_NS_SHADOW; + +static void forward_req_info(CephContext *cct, req_info& info, const std::string& bucket_name); +static int forward_request_to_master(struct req_state *s, obj_version *objv, RGWRados *store, + bufferlist& in_data, JSONParser *jp, req_info *forward_info = nullptr); + +static MultipartMetaFilter mp_filter; + +// this probably should belong in the rgw_iam_policy_keywords, I'll get it to it +// at some point +static constexpr auto S3_EXISTING_OBJTAG = "s3:ExistingObjectTag"; + +int RGWGetObj::parse_range(void) +{ + int r = -ERANGE; + string rs(range_str); + string ofs_str; + string end_str; + + ignore_invalid_range = s->cct->_conf->rgw_ignore_get_invalid_range; + partial_content = false; + + size_t pos = rs.find("bytes="); + if (pos == string::npos) { + pos = 0; + while (isspace(rs[pos])) + pos++; + int end = pos; + while (isalpha(rs[end])) + end++; + if (strncasecmp(rs.c_str(), "bytes", end - pos) != 0) + return 0; + while (isspace(rs[end])) + end++; + if (rs[end] != '=') + return 0; + rs = rs.substr(end + 1); + } else { + rs = rs.substr(pos + 6); /* size of("bytes=") */ + } + pos = rs.find('-'); + if (pos == string::npos) + goto done; + + partial_content = true; + + ofs_str = rs.substr(0, pos); + end_str = rs.substr(pos + 1); + if (end_str.length()) { + end = atoll(end_str.c_str()); + if (end < 0) + goto done; + } + + if (ofs_str.length()) { + ofs = atoll(ofs_str.c_str()); + } 
else { // RFC2616 suffix-byte-range-spec + ofs = -end; + end = -1; + } + + if (end >= 0 && end < ofs) + goto done; + + range_parsed = true; + return 0; + +done: + if (ignore_invalid_range) { + partial_content = false; + ofs = 0; + end = -1; + range_parsed = false; // allow retry + r = 0; + } + + return r; +} + +static int decode_policy(CephContext *cct, + bufferlist& bl, + RGWAccessControlPolicy *policy) +{ + auto iter = bl.cbegin(); + try { + policy->decode(iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (cct->_conf->subsys.should_gather()) { + ldout(cct, 15) << __func__ << " Read AccessControlPolicy"; + RGWAccessControlPolicy_S3 *s3policy = static_cast(policy); + s3policy->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + + +static int get_user_policy_from_attr(CephContext * const cct, + RGWRados * const store, + map& attrs, + RGWAccessControlPolicy& policy /* out */) +{ + auto aiter = attrs.find(RGW_ATTR_ACL); + if (aiter != attrs.end()) { + int ret = decode_policy(cct, aiter->second, &policy); + if (ret < 0) { + return ret; + } + } else { + return -ENOENT; + } + + return 0; +} + +static int get_bucket_instance_policy_from_attr(CephContext *cct, + RGWRados *store, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy) +{ + map::iterator aiter = bucket_attrs.find(RGW_ATTR_ACL); + + if (aiter != bucket_attrs.end()) { + int ret = decode_policy(cct, aiter->second, policy); + if (ret < 0) + return ret; + } else { + ldout(cct, 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl; + RGWUserInfo uinfo; + /* object exists, but policy is broken */ + int r = rgw_get_user_info_by_uid(store, bucket_info.owner, uinfo); + if (r < 0) + return r; + + policy->create_default(bucket_info.owner, uinfo.display_name); + } + return 0; +} + +static int get_obj_policy_from_attr(CephContext *cct, + RGWRados *store, + 
RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy, + string *storage_class, + rgw_obj& obj) +{ + bufferlist bl; + int ret = 0; + + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Read rop(&op_target); + + ret = rop.get_attr(RGW_ATTR_ACL, bl); + if (ret >= 0) { + ret = decode_policy(cct, bl, policy); + if (ret < 0) + return ret; + } else if (ret == -ENODATA) { + /* object exists, but policy is broken */ + ldout(cct, 0) << "WARNING: couldn't find acl header for object, generating default" << dendl; + RGWUserInfo uinfo; + ret = rgw_get_user_info_by_uid(store, bucket_info.owner, uinfo); + if (ret < 0) + return ret; + + policy->create_default(bucket_info.owner, uinfo.display_name); + } + + if (storage_class) { + bufferlist scbl; + int r = rop.get_attr(RGW_ATTR_STORAGE_CLASS, scbl); + if (r >= 0) { + *storage_class = scbl.to_str(); + } else { + storage_class->clear(); + } + } + + return ret; +} + + +/** + * Get the AccessControlPolicy for an object off of disk. + * policy: must point to a valid RGWACL, and will be filled upon return. + * bucket: name of the bucket containing the object. + * object: name of the object to get the ACL for. + * Returns: 0 on success, -ERR# otherwise. 
+ */ +int rgw_op_get_bucket_policy_from_attr(CephContext *cct, + RGWRados *store, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy) +{ + return get_bucket_instance_policy_from_attr(cct, store, bucket_info, bucket_attrs, policy); +} + +static boost::optional get_iam_policy_from_attr(CephContext* cct, + RGWRados* store, + map& attrs, + const string& tenant) { + auto i = attrs.find(RGW_ATTR_IAM_POLICY); + if (i != attrs.end()) { + return Policy(cct, tenant, i->second); + } else { + return none; + } +} + +vector get_iam_user_policy_from_attr(CephContext* cct, + RGWRados* store, + map& attrs, + const string& tenant) { + vector policies; + if (auto it = attrs.find(RGW_ATTR_USER_POLICY); it != attrs.end()) { + bufferlist out_bl = attrs[RGW_ATTR_USER_POLICY]; + map policy_map; + decode(policy_map, out_bl); + for (auto& it : policy_map) { + bufferlist bl = bufferlist::static_from_string(it.second); + Policy p(cct, tenant, bl); + policies.push_back(std::move(p)); + } + } + return policies; +} + +static int get_obj_attrs(RGWRados *store, struct req_state *s, const rgw_obj& obj, map& attrs, rgw_obj *target_obj = nullptr) +{ + RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + read_op.params.target_obj = target_obj; + + return read_op.prepare(); +} + +static int get_obj_head(RGWRados *store, struct req_state *s, + const rgw_obj& obj, + map *attrs, + bufferlist *pbl) +{ + store->set_prefetch_data(s->obj_ctx, obj); + + RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = attrs; + + int ret = read_op.prepare(); + if (ret < 0) { + return ret; + } + + if (!pbl) { + return 0; + } + + ret = read_op.read(0, s->cct->_conf->rgw_max_chunk_size, *pbl); + + return 0; +} + +struct multipart_upload_info +{ + rgw_placement_rule dest_placement; + + 
void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(dest_placement, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(dest_placement, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(multipart_upload_info) + +static int get_multipart_info(RGWRados *store, struct req_state *s, + const rgw_obj& obj, + RGWAccessControlPolicy *policy, + map *attrs, + multipart_upload_info *upload_info) +{ + bufferlist header; + + bufferlist headbl; + bufferlist *pheadbl = (upload_info ? &headbl : nullptr); + + int op_ret = get_obj_head(store, s, obj, attrs, pheadbl); + if (op_ret < 0) { + if (op_ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + return op_ret; + } + + if (upload_info && headbl.length() > 0) { + auto hiter = headbl.cbegin(); + try { + decode(*upload_info, hiter); + } catch (buffer::error& err) { + ldpp_dout(s, 0) << "ERROR: failed to decode multipart upload info" << dendl; + return -EIO; + } + } + + if (policy && attrs) { + for (auto& iter : *attrs) { + string name = iter.first; + if (name.compare(RGW_ATTR_ACL) == 0) { + bufferlist& bl = iter.second; + auto bli = bl.cbegin(); + try { + decode(*policy, bli); + } catch (buffer::error& err) { + ldpp_dout(s, 0) << "ERROR: could not decode policy" << dendl; + return -EIO; + } + break; + } + } + } + + return 0; +} + +static int get_multipart_info(RGWRados *store, struct req_state *s, + const string& meta_oid, + RGWAccessControlPolicy *policy, + map *attrs, + multipart_upload_info *upload_info) +{ + map::iterator iter; + bufferlist header; + + rgw_obj meta_obj; + meta_obj.init_ns(s->bucket, meta_oid, mp_ns); + meta_obj.set_in_extra_data(true); + + return get_multipart_info(store, s, meta_obj, policy, attrs, upload_info); +} + +static int modify_obj_attr(RGWRados *store, struct req_state *s, const rgw_obj& obj, const char* attr_name, bufferlist& attr_val) +{ + map attrs; + RGWRados::Object op_target(store, s->bucket_info, 
*static_cast(s->obj_ctx), obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + + int r = read_op.prepare(); + if (r < 0) { + return r; + } + store->set_atomic(s->obj_ctx, read_op.state.obj); + attrs[attr_name] = attr_val; + return store->set_attrs(s->obj_ctx, s->bucket_info, read_op.state.obj, attrs, NULL); +} + +static int read_bucket_policy(RGWRados *store, + struct req_state *s, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy, + rgw_bucket& bucket) +{ + if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) { + ldpp_dout(s, 0) << "NOTICE: bucket " << bucket_info.bucket.name + << " is suspended" << dendl; + return -ERR_USER_SUSPENDED; + } + + if (bucket.name.empty()) { + return 0; + } + + int ret = rgw_op_get_bucket_policy_from_attr(s->cct, store, bucket_info, bucket_attrs, policy); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_BUCKET; + } + + return ret; +} + +static int read_obj_policy(RGWRados *store, + struct req_state *s, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy* acl, + string *storage_class, + boost::optional& policy, + rgw_bucket& bucket, + rgw_obj_key& object) +{ + string upload_id; + upload_id = s->info.args.get("uploadId"); + rgw_obj obj; + + if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) { + ldpp_dout(s, 0) << "NOTICE: bucket " << bucket_info.bucket.name + << " is suspended" << dendl; + return -ERR_USER_SUSPENDED; + } + + if (!upload_id.empty()) { + /* multipart upload */ + RGWMPObj mp(object.name, upload_id); + string oid = mp.get_meta(); + obj.init_ns(bucket, oid, mp_ns); + obj.set_in_extra_data(true); + } else { + obj = rgw_obj(bucket, object); + } + policy = get_iam_policy_from_attr(s->cct, store, bucket_attrs, bucket.tenant); + + RGWObjectCtx *obj_ctx = static_cast(s->obj_ctx); + int ret = get_obj_policy_from_attr(s->cct, store, *obj_ctx, + bucket_info, bucket_attrs, acl, storage_class, obj); + if (ret == -ENOENT) { + 
/* object does not exist checking the bucket's ACL to make sure + that we send a proper error code */ + RGWAccessControlPolicy bucket_policy(s->cct); + ret = rgw_op_get_bucket_policy_from_attr(s->cct, store, bucket_info, bucket_attrs, &bucket_policy); + if (ret < 0) { + return ret; + } + const rgw_user& bucket_owner = bucket_policy.get_owner().get_id(); + if (bucket_owner.compare(s->user->user_id) != 0 && + ! s->auth.identity->is_admin_of(bucket_owner)) { + if (policy) { + auto r = policy->eval(s->env, *s->auth.identity, rgw::IAM::s3ListBucket, ARN(bucket)); + if (r == Effect::Allow) + return -ENOENT; + if (r == Effect::Deny) + return -EACCES; + } + if (! bucket_policy.verify_permission(s, *s->auth.identity, s->perm_mask, RGW_PERM_READ)) + ret = -EACCES; + else + ret = -ENOENT; + } else { + ret = -ENOENT; + } + } + + return ret; +} + +/** + * Get the AccessControlPolicy for an user, bucket or object off of disk. + * s: The req_state to draw information from. + * only_bucket: If true, reads the user and bucket ACLs rather than the object ACL. + * Returns: 0 on success, -ERR# otherwise. + */ +int rgw_build_bucket_policies(RGWRados* store, struct req_state* s) +{ + int ret = 0; + rgw_obj_key obj; + RGWUserInfo bucket_owner_info; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + string bi = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance"); + if (!bi.empty()) { + ret = rgw_bucket_parse_bucket_instance(bi, &s->bucket_instance_id, &s->bucket_instance_shard_id); + if (ret < 0) { + return ret; + } + } + + if(s->dialect.compare("s3") == 0) { + s->bucket_acl = std::make_unique(s->cct); + } else if(s->dialect.compare("swift") == 0) { + /* We aren't allocating the account policy for those operations using + * the Swift's infrastructure that don't really need req_state::user. + * Typical example here is the implementation of /info. 
*/ + if (!s->user->user_id.empty()) { + s->user_acl = std::make_unique(s->cct); + } + s->bucket_acl = std::make_unique(s->cct); + } else { + s->bucket_acl = std::make_unique(s->cct); + } + + /* check if copy source is within the current domain */ + if (!s->src_bucket_name.empty()) { + RGWBucketInfo source_info; + + if (s->bucket_instance_id.empty()) { + ret = store->get_bucket_info(obj_ctx, s->src_tenant_name, s->src_bucket_name, source_info, NULL); + } else { + ret = store->get_bucket_instance_info(obj_ctx, s->bucket_instance_id, source_info, NULL, NULL); + } + if (ret == 0) { + string& zonegroup = source_info.zonegroup; + s->local_source = store->svc.zone->get_zonegroup().equals(zonegroup); + } + } + + struct { + rgw_user uid; + std::string display_name; + } acct_acl_user = { + s->user->user_id, + s->user->display_name, + }; + + if (!s->bucket_name.empty()) { + s->bucket_exists = true; + if (s->bucket_instance_id.empty()) { + ret = store->get_bucket_info(obj_ctx, s->bucket_tenant, s->bucket_name, + s->bucket_info, &s->bucket_mtime, + &s->bucket_attrs); + } else { + ret = store->get_bucket_instance_info(obj_ctx, s->bucket_instance_id, + s->bucket_info, &s->bucket_mtime, + &s->bucket_attrs); + } + if (ret < 0) { + if (ret != -ENOENT) { + string bucket_log; + rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, bucket_log); + ldpp_dout(s, 0) << "NOTICE: couldn't get bucket from bucket_name (name=" + << bucket_log << ")" << dendl; + return ret; + } + s->bucket_exists = false; + } + s->bucket = s->bucket_info.bucket; + + if (s->bucket_exists) { + ret = read_bucket_policy(store, s, s->bucket_info, s->bucket_attrs, + s->bucket_acl.get(), s->bucket); + acct_acl_user = { + s->bucket_info.owner, + s->bucket_acl->get_owner().get_display_name(), + }; + } else { + s->bucket_acl->create_default(s->user->user_id, s->user->display_name); + ret = -ERR_NO_SUCH_BUCKET; + } + + s->bucket_owner = s->bucket_acl->get_owner(); + + RGWZoneGroup zonegroup; + int r = 
store->svc.zone->get_zonegroup(s->bucket_info.zonegroup, zonegroup); + if (!r) { + if (!zonegroup.endpoints.empty()) { + s->zonegroup_endpoint = zonegroup.endpoints.front(); + } else { + // use zonegroup's master zone endpoints + auto z = zonegroup.zones.find(zonegroup.master_zone); + if (z != zonegroup.zones.end() && !z->second.endpoints.empty()) { + s->zonegroup_endpoint = z->second.endpoints.front(); + } + } + s->zonegroup_name = zonegroup.get_name(); + } + if (r < 0 && ret == 0) { + ret = r; + } + + if (s->bucket_exists && !store->svc.zone->get_zonegroup().equals(s->bucket_info.zonegroup)) { + ldpp_dout(s, 0) << "NOTICE: request for data in a different zonegroup (" + << s->bucket_info.zonegroup << " != " + << store->svc.zone->get_zonegroup().get_id() << ")" << dendl; + /* we now need to make sure that the operation actually requires copy source, that is + * it's a copy operation + */ + if (store->svc.zone->get_zonegroup().is_master_zonegroup() && s->system_request) { + /*If this is the master, don't redirect*/ + } else if (s->op_type == RGW_OP_GET_BUCKET_LOCATION ) { + /* If op is get bucket location, don't redirect */ + } else if (!s->local_source || + (s->op != OP_PUT && s->op != OP_COPY) || + s->object.empty()) { + return -ERR_PERMANENT_REDIRECT; + } + } + + /* init dest placement -- only if bucket exists, otherwise request is either not relevant, or + * it's a create_bucket request, in which case the op will deal with the placement later */ + if (s->bucket_exists) { + s->dest_placement.storage_class = s->info.storage_class; + s->dest_placement.inherit_from(s->bucket_info.placement_rule); + + if (!store->svc.zone->get_zone_params().valid_placement(s->dest_placement)) { + ldpp_dout(s, 0) << "NOTICE: invalid dest placement: " << s->dest_placement.to_str() << dendl; + return -EINVAL; + } + } + } + + /* handle user ACL only for those APIs which support it */ + if (s->user_acl) { + map uattrs; + ret = rgw_get_user_attrs_by_uid(store, acct_acl_user.uid, uattrs); + 
if (!ret) { + ret = get_user_policy_from_attr(s->cct, store, uattrs, *s->user_acl); + } + if (-ENOENT == ret) { + /* In already existing clusters users won't have ACL. In such case + * assuming that only account owner has the rights seems to be + * reasonable. That allows to have only one verification logic. + * NOTE: there is small compatibility kludge for global, empty tenant: + * 1. if we try to reach an existing bucket, its owner is considered + * as account owner. + * 2. otherwise account owner is identity stored in s->user->user_id. */ + s->user_acl->create_default(acct_acl_user.uid, + acct_acl_user.display_name); + ret = 0; + } else if (ret < 0) { + ldpp_dout(s, 0) << "NOTICE: couldn't get user attrs for handling ACL " + "(user_id=" << s->user->user_id << ", ret=" << ret << ")" << dendl; + return ret; + } + } + // We don't need user policies in case of STS token returned by AssumeRole, + // hence the check for user type + if (! s->user->user_id.empty() && s->auth.identity->get_identity_type() != TYPE_ROLE) { + try { + map uattrs; + if (ret = rgw_get_user_attrs_by_uid(store, s->user->user_id, uattrs); ! ret) { + if (s->iam_user_policies.empty()) { + s->iam_user_policies = get_iam_user_policy_from_attr(s->cct, store, uattrs, s->user->user_id.tenant); + } else { + // This scenario can happen when a STS token has a policy, then we need to append other user policies + // to the existing ones. (e.g. 
token returned by GetSessionToken) + auto user_policies = get_iam_user_policy_from_attr(s->cct, store, uattrs, s->user->user_id.tenant); + s->iam_user_policies.insert(s->iam_user_policies.end(), user_policies.begin(), user_policies.end()); + } + } else { + if (ret == -ENOENT) + ret = 0; + else ret = -EACCES; + } + } catch (const std::exception& e) { + lderr(s->cct) << "Error reading IAM User Policy: " << e.what() << dendl; + ret = -EACCES; + } + } + + try { + s->iam_policy = get_iam_policy_from_attr(s->cct, store, s->bucket_attrs, + s->bucket_tenant); + } catch (const std::exception& e) { + // Really this is a can't happen condition. We parse the policy + // when it's given to us, so perhaps we should abort or otherwise + // raise bloody murder. + ldpp_dout(s, 0) << "Error reading IAM Policy: " << e.what() << dendl; + ret = -EACCES; + } + + bool success = store->svc.zone->get_redirect_zone_endpoint(&s->redirect_zone_endpoint); + if (success) { + ldpp_dout(s, 20) << "redirect_zone_endpoint=" << s->redirect_zone_endpoint << dendl; + } + + return ret; +} + +/** + * Get the AccessControlPolicy for a bucket or object off of disk. + * s: The req_state to draw information from. + * only_bucket: If true, reads the bucket ACL rather than the object ACL. + * Returns: 0 on success, -ERR# otherwise. 
+ */
+int rgw_build_object_policies(RGWRados *store, struct req_state *s,
+                              bool prefetch_data)
+{
+  // Requests that do not name an object have no object policy to load.
+  if (s->object.empty()) {
+    return 0;
+  }
+  if (!s->bucket_exists) {
+    return -ERR_NO_SUCH_BUCKET;
+  }
+
+  s->object_acl = std::make_unique<RGWAccessControlPolicy>(s->cct);
+  rgw_obj obj(s->bucket, s->object);
+
+  store->set_atomic(s->obj_ctx, obj);
+  if (prefetch_data) {
+    store->set_prefetch_data(s->obj_ctx, obj);
+  }
+  return read_obj_policy(store, s, s->bucket_info, s->bucket_attrs,
+                         s->object_acl.get(), nullptr, s->iam_policy,
+                         s->bucket, s->object);
+}
+
+/**
+ * Add a key/value pair to an IAM policy evaluation environment.
+ * e: the environment to extend.
+ * key: condition key; the pair is dropped when this is empty.
+ * val: condition value; may legitimately be empty (e.g. an object tag
+ *      with no value).
+ */
+void rgw_add_to_iam_environment(rgw::IAM::Environment& e, std::string_view key, std::string_view val){
+  if (!key.empty())
+    e.emplace(key,val);
+}
+
+/**
+ * Decode an encoded tag set from bl into s->tagset and publish every tag
+ * to s->env as "s3:ExistingObjectTag/<tag key>".
+ * Returns: 0 on success, -EIO if the buffer cannot be decoded.
+ */
+static int rgw_iam_add_tags_from_bl(struct req_state* s, bufferlist& bl){
+  RGWObjTags& tagset = s->tagset;
+  try {
+    auto bliter = bl.cbegin();
+    tagset.decode(bliter);
+  } catch (const buffer::error& err) {
+    ldpp_dout(s, 0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl;
+    return -EIO;
+  }
+
+  for (const auto& tag: tagset.get_tags()){
+    rgw_add_to_iam_environment(s->env, "s3:ExistingObjectTag/" + tag.first, tag.second);
+  }
+  return 0;
+}
+
+/**
+ * Read the attributes of obj and, if it carries an RGW_ATTR_TAGS attribute,
+ * add its tags to the IAM environment of the request.
+ * action: currently unused; kept so existing callers keep compiling.
+ * Returns: 0 on success (including "object has no tags"), or the negative
+ *          error from get_obj_attrs() / tag decoding.
+ */
+static int rgw_iam_add_existing_objtags(RGWRados* store, struct req_state* s, rgw_obj& obj, std::uint64_t action){
+  std::map<std::string, bufferlist> attrs;
+  store->set_atomic(s->obj_ctx, obj);
+  int op_ret = get_obj_attrs(store, s, obj, attrs);
+  if (op_ret < 0)
+    return op_ret;
+  auto tags = attrs.find(RGW_ATTR_TAGS);
+  if (tags != attrs.end()){
+    return rgw_iam_add_tags_from_bl(s, tags->second);
+  }
+  return 0;
+}
+
+/**
+ * Copy the x-amz-grant-* request headers into the IAM environment e under
+ * their "s3:x-amz-grant-*" condition keys. Does nothing unless the request
+ * was flagged as carrying ACL headers (s->has_acl_header).
+ */
+static void rgw_add_grant_to_iam_environment(rgw::IAM::Environment& e, struct req_state *s){
+
+  // first: name of the header in the request environment,
+  // second: IAM condition key it is published under.
+  using header_pair_t = std::pair<const char*, const char*>;
+  static const std::initializer_list<header_pair_t> acl_header_conditionals {
+    {"HTTP_X_AMZ_GRANT_READ", "s3:x-amz-grant-read"},
+    {"HTTP_X_AMZ_GRANT_WRITE", "s3:x-amz-grant-write"},
+    {"HTTP_X_AMZ_GRANT_READ_ACP", "s3:x-amz-grant-read-acp"},
+    {"HTTP_X_AMZ_GRANT_WRITE_ACP", "s3:x-amz-grant-write-acp"},
+    {"HTTP_X_AMZ_GRANT_FULL_CONTROL", "s3:x-amz-grant-full-control"}
+  };
+
+  if (s->has_acl_header){
+    for (const auto& c: acl_header_conditionals){
+      auto hdr = s->info.env->get(c.first);
+      if(hdr) {
+        e[c.second] = hdr;
+      }
+    }
+  }
+}
+
+/**
+ * Populate s->env with the global condition keys used when evaluating IAM
+ * policies for this request: current time, principal type, referer,
+ * transport security, source IP, user agent, user name and whether an STS
+ * security token was supplied.
+ * store: currently unused.
+ */
+void rgw_build_iam_environment(RGWRados* store,
+                               struct req_state* s)
+{
+  const auto& m = s->info.env->get_map();
+  auto t = ceph::real_clock::now();
+  // aws:CurrentTime is a date-time (ISO 8601) and aws:EpochTime is the
+  // number of seconds since the Unix epoch; the two values used to be
+  // stored under each other's key.
+  s->env.emplace("aws:CurrentTime", ceph::to_iso_8601(t));
+  s->env.emplace("aws:EpochTime", std::to_string(ceph::real_clock::to_time_t(t)));
+  // TODO: This is fine for now, but once we have STS we'll need to
+  // look and see. Also this won't work with the IdentityApplier
+  // model, since we need to know the actual credential.
+  s->env.emplace("aws:PrincipalType", "User");
+
+  auto i = m.find("HTTP_REFERER");
+  if (i != m.end()) {
+    s->env.emplace("aws:Referer", i->second);
+  }
+
+  if (rgw_transport_is_secure(s->cct, *s->info.env)) {
+    s->env.emplace("aws:SecureTransport", "true");
+  }
+
+  // The client address comes from the configured header when one is set,
+  // otherwise from REMOTE_ADDR.
+  const auto remote_addr_param = s->cct->_conf->rgw_remote_addr_param;
+  if (!remote_addr_param.empty()) {
+    i = m.find(remote_addr_param);
+  } else {
+    i = m.find("REMOTE_ADDR");
+  }
+  if (i != m.end()) {
+    const string* ip = &(i->second);
+    string temp;
+    if (remote_addr_param == "HTTP_X_FORWARDED_FOR") {
+      // X-Forwarded-For may hold a comma separated list; only the first
+      // entry is used as the source address.
+      const auto comma = ip->find(',');
+      if (comma != string::npos) {
+        temp.assign(*ip, 0, comma);
+        ip = &temp;
+      }
+    }
+    s->env.emplace("aws:SourceIp", *ip);
+  }
+
+  i = m.find("HTTP_USER_AGENT");
+  if (i != m.end()) {
+    s->env.emplace("aws:UserAgent", i->second);
+  }
+
+  if (s->user) {
+    // What to do about aws:userid? One can have multiple access
+    // keys so that isn't really suitable. Do we have a durable
+    // identifier that can persist through name changes?
+    s->env.emplace("aws:username", s->user->user_id.id);
+  }
+
+  // The presence of a security token header marks the request as
+  // STS-authenticated.
+  i = m.find("HTTP_X_AMZ_SECURITY_TOKEN");
+  if (i != m.end()) {
+    s->env.emplace("sts:authentication", "true");
+  } else {
+    s->env.emplace("sts:authentication", "false");
+  }
+}
+
+/**
+ * Common pre-execution step for bucket/object operations: emit the interim
+ * "continue" response when the request expects one (s->expect_cont), then
+ * dump the bucket taken from the request state.
+ */
+void rgw_bucket_object_pre_exec(struct req_state *s)
+{
+  if (s->expect_cont)
+    dump_continue(s);
+
+  dump_bucket_from_state(s);
+}
+
+// So! Now and then when we try to update bucket information, the
+// bucket has changed during the course of the operation. (Or we have
+// a cache consistency problem that Watch/Notify isn't ruling out
+// completely.)
+//
+// When this happens, we need to update the bucket info and try
+// again. We have, however, to try the right *part* again. We can't
+// simply re-send, since that will obliterate the previous update.
+//
+// Thus, callers of this function should include everything that
+// merges information to be changed into the bucket information as
+// well as the call to set it.
+//
+// The called function must return an integer, negative on error. In
+// general, they should just return op_ret.
+namespace { +template +int retry_raced_bucket_write(RGWRados* g, req_state* s, const F& f) { + auto r = f(); + for (auto i = 0u; i < 15u && r == -ECANCELED; ++i) { + r = g->try_refresh_bucket_info(s->bucket_info, nullptr, + &s->bucket_attrs); + if (r >= 0) { + r = f(); + } + } + return r; +} +} + + +int RGWGetObj::verify_permission() +{ + obj = rgw_obj(s->bucket, s->object); + store->set_atomic(s->obj_ctx, obj); + if (get_data) { + store->set_prefetch_data(s->obj_ctx, obj); + } + + if (torrent.get_flag()) { + if (obj.key.instance.empty()) { + action = rgw::IAM::s3GetObjectTorrent; + } else { + action = rgw::IAM::s3GetObjectVersionTorrent; + } + } else { + if (obj.key.instance.empty()) { + action = rgw::IAM::s3GetObject; + } else { + action = rgw::IAM::s3GetObjectVersion; + } + if (s->iam_policy && s->iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG)) + rgw_iam_add_existing_objtags(store, s, obj, action); + if (! s->iam_user_policies.empty()) { + for (auto& user_policy : s->iam_user_policies) { + if (user_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) + rgw_iam_add_existing_objtags(store, s, obj, action); + } + } + } + + if (!verify_object_permission(this, s, action)) { + return -EACCES; + } + + if (s->bucket_info.obj_lock_enabled()) { + get_retention = verify_object_permission(this, s, rgw::IAM::s3GetObjectRetention); + get_legal_hold = verify_object_permission(this, s, rgw::IAM::s3GetObjectLegalHold); + } + + return 0; +} + +// cache the objects tags into the requests +// use inside try/catch as "decode()" may throw +void populate_tags_in_request(req_state* s, const std::map& attrs) { + const auto attr_iter = attrs.find(RGW_ATTR_TAGS); + if (attr_iter != attrs.end()) { + auto bliter = attr_iter->second.cbegin(); + decode(s->tagset, bliter); + } +} + +// cache the objects metadata into the request +void populate_metadata_in_request(req_state* s, std::map& attrs) { + for (auto& attr : attrs) { + if (boost::algorithm::starts_with(attr.first, 
RGW_ATTR_META_PREFIX)) { + std::string_view key(attr.first); + key.remove_prefix(sizeof(RGW_ATTR_PREFIX)-1); + s->info.x_meta_map.emplace(key, attr.second.c_str()); + } + } +} + +int RGWOp::verify_op_mask() +{ + uint32_t required_mask = op_mask(); + + ldpp_dout(this, 20) << "required_mask= " << required_mask + << " user.op_mask=" << s->user->op_mask << dendl; + + if ((s->user->op_mask & required_mask) != required_mask) { + return -EPERM; + } + + if (!s->system_request && (required_mask & RGW_OP_TYPE_MODIFY) && !store->svc.zone->zone_is_writeable()) { + ldpp_dout(this, 5) << "NOTICE: modify request to a read-only zone by a " + "non-system user, permission denied" << dendl; + return -EPERM; + } + + return 0; +} + +int RGWGetObjTags::verify_permission() +{ + auto iam_action = s->object.instance.empty()? + rgw::IAM::s3GetObjectTagging: + rgw::IAM::s3GetObjectVersionTagging; + // TODO since we are parsing the bl now anyway, we probably change + // the send_response function to accept RGWObjTag instead of a bl + if (s->iam_policy && s->iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG)){ + rgw_obj obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + if (! 
s->iam_user_policies.empty()) { + for (auto& user_policy : s->iam_user_policies) { + if (user_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) { + rgw_obj obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + } + } + if (!verify_object_permission(this, s,iam_action)) + return -EACCES; + + return 0; +} + +void RGWGetObjTags::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjTags::execute() +{ + rgw_obj obj; + map attrs; + + obj = rgw_obj(s->bucket, s->object); + + store->set_atomic(s->obj_ctx, obj); + + op_ret = get_obj_attrs(store, s, obj, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << obj + << " ret=" << op_ret << dendl; + return; + } + + auto tags = attrs.find(RGW_ATTR_TAGS); + if(tags != attrs.end()){ + has_tags = true; + tags_bl.append(tags->second); + } + send_response_data(tags_bl); +} + +int RGWPutObjTags::verify_permission() +{ + auto iam_action = s->object.instance.empty() ? + rgw::IAM::s3PutObjectTagging: + rgw::IAM::s3PutObjectVersionTagging; + + if(s->iam_policy && s->iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG)){ + auto obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + if (! 
s->iam_user_policies.empty()) { + for (auto& user_policy : s->iam_user_policies) { + if (user_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) { + rgw_obj obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + } + } + if (!verify_object_permission(this, s,iam_action)) + return -EACCES; + return 0; +} + +void RGWPutObjTags::execute() +{ + op_ret = get_params(); + if (op_ret < 0) + return; + + if (s->object.empty()){ + op_ret= -EINVAL; // we only support tagging on existing objects + return; + } + + rgw_obj obj; + obj = rgw_obj(s->bucket, s->object); + store->set_atomic(s->obj_ctx, obj); + op_ret = modify_obj_attr(store, s, obj, RGW_ATTR_TAGS, tags_bl); + if (op_ret == -ECANCELED){ + op_ret = -ERR_TAG_CONFLICT; + } +} + +void RGWDeleteObjTags::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + + +int RGWDeleteObjTags::verify_permission() +{ + if (!s->object.empty()) { + auto iam_action = s->object.instance.empty() ? + rgw::IAM::s3DeleteObjectTagging: + rgw::IAM::s3DeleteObjectVersionTagging; + + if (s->iam_policy && s->iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG)){ + auto obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + if (! 
s->iam_user_policies.empty()) { + for (auto& user_policy : s->iam_user_policies) { + if (user_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) { + auto obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + } + } + if (!verify_object_permission(this, s, iam_action)) + return -EACCES; + } + return 0; +} + +void RGWDeleteObjTags::execute() +{ + if (s->object.empty()) + return; + + rgw_obj obj; + obj = rgw_obj(s->bucket, s->object); + store->set_atomic(s->obj_ctx, obj); + map attrs; + map rmattr; + bufferlist bl; + rmattr[RGW_ATTR_TAGS] = bl; + op_ret = store->set_attrs(s->obj_ctx, s->bucket_info, obj, attrs, &rmattr); +} + +int RGWOp::do_aws4_auth_completion() +{ + ldpp_dout(this, 5) << "NOTICE: call to do_aws4_auth_completion" << dendl; + if (s->auth.completer) { + if (!s->auth.completer->complete()) { + return -ERR_AMZ_CONTENT_SHA256_MISMATCH; + } else { + ldpp_dout(this, 10) << "v4 auth ok -- do_aws4_auth_completion" << dendl; + } + + /* TODO(rzarzynski): yes, we're really called twice on PUTs. Only first + * call passes, so we disable second one. This is old behaviour, sorry! + * Plan for tomorrow: seek and destroy. 
*/ + s->auth.completer = nullptr; + } + + return 0; +} + +int RGWOp::init_quota() +{ + /* no quota enforcement for system requests */ + if (s->system_request) + return 0; + + /* init quota related stuff */ + if (!(s->user->op_mask & RGW_OP_TYPE_MODIFY)) { + return 0; + } + + /* only interested in object related ops */ + if (s->object.empty()) { + return 0; + } + + RGWUserInfo owner_info; + RGWUserInfo *uinfo; + + if (s->user->user_id == s->bucket_owner.get_id()) { + uinfo = s->user; + } else { + int r = rgw_get_user_info_by_uid(store, s->bucket_info.owner, owner_info); + if (r < 0) + return r; + uinfo = &owner_info; + } + + if (s->bucket_info.quota.enabled) { + bucket_quota = s->bucket_info.quota; + } else if (uinfo->bucket_quota.enabled) { + bucket_quota = uinfo->bucket_quota; + } else { + bucket_quota = store->svc.quota->get_bucket_quota(); + } + + if (uinfo->user_quota.enabled) { + user_quota = uinfo->user_quota; + } else { + user_quota = store->svc.quota->get_user_quota(); + } + + return 0; +} + +static bool validate_cors_rule_method(RGWCORSRule *rule, const char *req_meth) { + uint8_t flags = 0; + + if (!req_meth) { + dout(5) << "req_meth is null" << dendl; + return false; + } + + if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET; + else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST; + else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT; + else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE; + else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD; + + if (rule->get_allowed_methods() & flags) { + dout(10) << "Method " << req_meth << " is supported" << dendl; + } else { + dout(5) << "Method " << req_meth << " is not supported" << dendl; + return false; + } + + return true; +} + +static bool validate_cors_rule_header(RGWCORSRule *rule, const char *req_hdrs) { + if (req_hdrs) { + vector hdrs; + get_str_vec(req_hdrs, hdrs); + for (const auto& hdr : hdrs) { + if (!rule->is_header_allowed(hdr.c_str(), hdr.length())) { + 
dout(5) << "Header " << hdr << " is not registered in this rule" << dendl; + return false; + } + } + } + return true; +} + +int RGWOp::read_bucket_cors() +{ + bufferlist bl; + + map::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS); + if (aiter == s->bucket_attrs.end()) { + ldpp_dout(this, 20) << "no CORS configuration attr found" << dendl; + cors_exist = false; + return 0; /* no CORS configuration found */ + } + + cors_exist = true; + + bl = aiter->second; + + auto iter = bl.cbegin(); + try { + bucket_cors.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (s->cct->_conf->subsys.should_gather()) { + RGWCORSConfiguration_S3 *s3cors = static_cast(&bucket_cors); + ldpp_dout(this, 15) << "Read RGWCORSConfiguration"; + s3cors->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + +/** CORS 6.2.6. + * If any of the header field-names is not a ASCII case-insensitive match for + * any of the values in list of headers do not set any additional headers and + * terminate this set of steps. + * */ +static void get_cors_response_headers(RGWCORSRule *rule, const char *req_hdrs, string& hdrs, string& exp_hdrs, unsigned *max_age) { + if (req_hdrs) { + list hl; + get_str_list(req_hdrs, hl); + for(list::iterator it = hl.begin(); it != hl.end(); ++it) { + if (!rule->is_header_allowed((*it).c_str(), (*it).length())) { + dout(5) << "Header " << (*it) << " is not registered in this rule" << dendl; + } else { + if (hdrs.length() > 0) hdrs.append(","); + hdrs.append((*it)); + } + } + } + rule->format_exp_headers(exp_hdrs); + *max_age = rule->get_max_age(); +} + +/** + * Generate the CORS header response + * + * This is described in the CORS standard, section 6.2. + */ +bool RGWOp::generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age) +{ + /* CORS 6.2.1. 
*/ + const char *orig = s->info.env->get("HTTP_ORIGIN"); + if (!orig) { + return false; + } + + /* Custom: */ + origin = orig; + op_ret = read_bucket_cors(); + if (op_ret < 0) { + return false; + } + + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + return false; + } + + /* CORS 6.2.2. */ + RGWCORSRule *rule = bucket_cors.host_name_rule(orig); + if (!rule) + return false; + + /* + * Set the Allowed-Origin header to a asterisk if this is allowed in the rule + * and no Authorization was send by the client + * + * The origin parameter specifies a URI that may access the resource. The browser must enforce this. + * For requests without credentials, the server may specify "*" as a wildcard, + * thereby allowing any origin to access the resource. + */ + const char *authorization = s->info.env->get("HTTP_AUTHORIZATION"); + if (!authorization && rule->has_wildcard_origin()) + origin = "*"; + + /* CORS 6.2.3. */ + const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + req_meth = s->info.method; + } + + if (req_meth) { + method = req_meth; + /* CORS 6.2.5. */ + if (!validate_cors_rule_method(rule, req_meth)) { + return false; + } + } + + /* CORS 6.2.4. */ + const char *req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS"); + + /* CORS 6.2.6. 
*/ + get_cors_response_headers(rule, req_hdrs, headers, exp_headers, max_age); + + return true; +} + +int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const off_t start_ofs, + const off_t end_ofs, + bool swift_slo) +{ + ldpp_dout(this, 20) << "user manifest obj=" << ent.key.name + << "[" << ent.key.instance << "]" << dendl; + RGWGetObj_CB cb(this); + RGWGetObj_Filter* filter = &cb; + boost::optional decompress; + + int64_t cur_ofs = start_ofs; + int64_t cur_end = end_ofs; + + rgw_obj part(bucket, ent.key); + + map attrs; + + uint64_t obj_size; + RGWObjectCtx obj_ctx(store); + RGWAccessControlPolicy obj_policy(s->cct); + + ldpp_dout(this, 20) << "reading obj=" << part << " ofs=" << cur_ofs + << " end=" << cur_end << dendl; + + obj_ctx.set_atomic(part); + store->set_prefetch_data(&obj_ctx, part); + + RGWRados::Object op_target(store, s->bucket_info, obj_ctx, part); + RGWRados::Object::Read read_op(&op_target); + + if (!swift_slo) { + /* SLO etag is optional */ + read_op.conds.if_match = ent.meta.etag.c_str(); + } + read_op.params.attrs = &attrs; + read_op.params.obj_size = &obj_size; + + op_ret = read_op.prepare(); + if (op_ret < 0) + return op_ret; + op_ret = read_op.range_to_ofs(ent.meta.accounted_size, cur_ofs, cur_end); + if (op_ret < 0) + return op_ret; + bool need_decompress; + op_ret = rgw_compression_info_from_attrset(attrs, need_decompress, cs_info); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to decode compression info" << dendl; + return -EIO; + } + + if (need_decompress) + { + if (cs_info.orig_size != ent.meta.accounted_size) { + // hmm.. something wrong, object not as expected, abort! 
+ ldpp_dout(this, 0) << "ERROR: expected cs_info.orig_size=" << cs_info.orig_size + << ", actual read size=" << ent.meta.size << dendl; + return -EIO; + } + decompress.emplace(s->cct, &cs_info, partial_content, filter); + filter = &*decompress; + } + else + { + if (obj_size != ent.meta.size) { + // hmm.. something wrong, object not as expected, abort! + ldpp_dout(this, 0) << "ERROR: expected obj_size=" << obj_size + << ", actual read size=" << ent.meta.size << dendl; + return -EIO; + } + } + + op_ret = rgw_policy_from_attrset(s->cct, attrs, &obj_policy); + if (op_ret < 0) + return op_ret; + + /* We can use global user_acl because LOs cannot have segments + * stored inside different accounts. */ + if (s->system_request) { + ldpp_dout(this, 2) << "overriding permissions due to system operation" << dendl; + } else if (s->auth.identity->is_admin_of(s->user->user_id)) { + ldpp_dout(this, 2) << "overriding permissions due to admin operation" << dendl; + } else if (!verify_object_permission(this, s, part, s->user_acl.get(), bucket_acl, + &obj_policy, bucket_policy, s->iam_user_policies, action)) { + return -EPERM; + } + if (ent.meta.size == 0) { + return 0; + } + + perfcounter->inc(l_rgw_get_b, cur_end - cur_ofs); + filter->fixup_range(cur_ofs, cur_end); + op_ret = read_op.iterate(cur_ofs, cur_end, filter); + if (op_ret >= 0) + op_ret = filter->flush(); + return op_ret; +} + +static int iterate_user_manifest_parts(CephContext * const cct, + RGWRados * const store, + const off_t ofs, + const off_t end, + RGWBucketInfo *pbucket_info, + const string& obj_prefix, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + uint64_t * const ptotal_len, + uint64_t * const pobj_size, + string * const pobj_sum, + int (*cb)(rgw_bucket& bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + off_t start_ofs, + off_t end_ofs, + void *param, + bool swift_slo), + void * const cb_param) 
+{ + rgw_bucket& bucket = pbucket_info->bucket; + uint64_t obj_ofs = 0, len_count = 0; + bool found_start = false, found_end = false, handled_end = false; + string delim; + bool is_truncated; + vector objs; + + utime_t start_time = ceph_clock_now(); + + RGWRados::Bucket target(store, *pbucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = obj_prefix; + list_op.params.delim = delim; + + MD5 etag_sum; + do { +#define MAX_LIST_OBJS 100 + int r = list_op.list_objects(MAX_LIST_OBJS, &objs, NULL, &is_truncated); + if (r < 0) { + return r; + } + + for (rgw_bucket_dir_entry& ent : objs) { + const uint64_t cur_total_len = obj_ofs; + const uint64_t obj_size = ent.meta.accounted_size; + uint64_t start_ofs = 0, end_ofs = obj_size; + + if ((ptotal_len || cb) && !found_start && cur_total_len + obj_size > (uint64_t)ofs) { + start_ofs = ofs - obj_ofs; + found_start = true; + } + + obj_ofs += obj_size; + if (pobj_sum) { + etag_sum.Update((const unsigned char *)ent.meta.etag.c_str(), + ent.meta.etag.length()); + } + + if ((ptotal_len || cb) && !found_end && obj_ofs > (uint64_t)end) { + end_ofs = end - cur_total_len + 1; + found_end = true; + } + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now() - start_time)); + + if (found_start && !handled_end) { + len_count += end_ofs - start_ofs; + + if (cb) { + r = cb(bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs, + cb_param, false /* swift_slo */); + if (r < 0) { + return r; + } + } + } + + handled_end = found_end; + start_time = ceph_clock_now(); + } + } while (is_truncated); + + if (ptotal_len) { + *ptotal_len = len_count; + } + if (pobj_size) { + *pobj_size = obj_ofs; + } + if (pobj_sum) { + complete_etag(etag_sum, pobj_sum); + } + + return 0; +} + +struct rgw_slo_part { + RGWAccessControlPolicy *bucket_acl = nullptr; + Policy* bucket_policy = nullptr; + rgw_bucket bucket; + string obj_name; + uint64_t size = 0; + string etag; +}; + +static int iterate_slo_parts(CephContext *cct, + RGWRados 
*store, + off_t ofs, + off_t end, + map& slo_parts, + int (*cb)(rgw_bucket& bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy *bucket_acl, + const boost::optional& bucket_policy, + off_t start_ofs, + off_t end_ofs, + void *param, + bool swift_slo), + void *cb_param) +{ + bool found_start = false, found_end = false; + + if (slo_parts.empty()) { + return 0; + } + + utime_t start_time = ceph_clock_now(); + + map::iterator iter = slo_parts.upper_bound(ofs); + if (iter != slo_parts.begin()) { + --iter; + } + + uint64_t obj_ofs = iter->first; + + for (; iter != slo_parts.end() && !found_end; ++iter) { + rgw_slo_part& part = iter->second; + rgw_bucket_dir_entry ent; + + ent.key.name = part.obj_name; + ent.meta.accounted_size = ent.meta.size = part.size; + ent.meta.etag = part.etag; + + uint64_t cur_total_len = obj_ofs; + uint64_t start_ofs = 0, end_ofs = ent.meta.size - 1; + + if (!found_start && cur_total_len + ent.meta.size > (uint64_t)ofs) { + start_ofs = ofs - obj_ofs; + found_start = true; + } + + obj_ofs += ent.meta.size; + + if (!found_end && obj_ofs > (uint64_t)end) { + end_ofs = end - cur_total_len; + found_end = true; + } + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now() - start_time)); + + if (found_start) { + if (cb) { + dout(20) << "iterate_slo_parts()" + << " obj=" << part.obj_name + << " start_ofs=" << start_ofs + << " end_ofs=" << end_ofs + << dendl; + + // SLO is a Swift thing, and Swift has no knowledge of S3 Policies. + int r = cb(part.bucket, ent, part.bucket_acl, + (part.bucket_policy ? 
+ boost::optional(*part.bucket_policy) : none), + start_ofs, end_ofs, cb_param, true /* swift_slo */); + if (r < 0) + return r; + } + } + + start_time = ceph_clock_now(); + } + + return 0; +} + +static int get_obj_user_manifest_iterate_cb(rgw_bucket& bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const off_t start_ofs, + const off_t end_ofs, + void * const param, + bool swift_slo = false) +{ + RGWGetObj *op = static_cast(param); + return op->read_user_manifest_part( + bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs, swift_slo); +} + +int RGWGetObj::handle_user_manifest(const char *prefix) +{ + const boost::string_view prefix_view(prefix); + ldpp_dout(this, 2) << "RGWGetObj::handle_user_manifest() prefix=" + << prefix_view << dendl; + + const size_t pos = prefix_view.find('/'); + if (pos == string::npos) { + return -EINVAL; + } + + const std::string bucket_name = url_decode(prefix_view.substr(0, pos)); + const std::string obj_prefix = url_decode(prefix_view.substr(pos + 1)); + + rgw_bucket bucket; + + RGWAccessControlPolicy _bucket_acl(s->cct); + RGWAccessControlPolicy *bucket_acl; + boost::optional _bucket_policy; + boost::optional* bucket_policy; + RGWBucketInfo bucket_info; + RGWBucketInfo *pbucket_info; + + if (bucket_name.compare(s->bucket.name) != 0) { + map bucket_attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_info(obj_ctx, s->user->user_id.tenant, + bucket_name, bucket_info, NULL, + &bucket_attrs); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + bucket = bucket_info.bucket; + pbucket_info = &bucket_info; + bucket_acl = &_bucket_acl; + r = read_bucket_policy(store, s, bucket_info, bucket_attrs, bucket_acl, bucket); + if (r < 0) { + ldpp_dout(this, 0) << "failed to read bucket policy" << dendl; + return r; + } + _bucket_policy = 
get_iam_policy_from_attr(s->cct, store, bucket_attrs, + bucket_info.bucket.tenant); + bucket_policy = &_bucket_policy; + } else { + bucket = s->bucket; + pbucket_info = &s->bucket_info; + bucket_acl = s->bucket_acl.get(); + bucket_policy = &s->iam_policy; + } + + /* dry run to find out: + * - total length (of the parts we are going to send to client), + * - overall DLO's content size, + * - md5 sum of overall DLO's content (for etag of Swift API). */ + int r = iterate_user_manifest_parts(s->cct, store, ofs, end, + pbucket_info, obj_prefix, bucket_acl, *bucket_policy, + nullptr, &s->obj_size, &lo_etag, + nullptr /* cb */, nullptr /* cb arg */); + if (r < 0) { + return r; + } + + r = RGWRados::Object::Read::range_to_ofs(s->obj_size, ofs, end); + if (r < 0) { + return r; + } + + r = iterate_user_manifest_parts(s->cct, store, ofs, end, + pbucket_info, obj_prefix, bucket_acl, *bucket_policy, + &total_len, nullptr, nullptr, + nullptr, nullptr); + if (r < 0) { + return r; + } + + if (!get_data) { + bufferlist bl; + send_response_data(bl, 0, 0); + return 0; + } + + r = iterate_user_manifest_parts(s->cct, store, ofs, end, + pbucket_info, obj_prefix, bucket_acl, *bucket_policy, + nullptr, nullptr, nullptr, + get_obj_user_manifest_iterate_cb, (void *)this); + if (r < 0) { + return r; + } + + if (!total_len) { + bufferlist bl; + send_response_data(bl, 0, 0); + } + + return 0; +} + +int RGWGetObj::handle_slo_manifest(bufferlist& bl) +{ + RGWSLOInfo slo_info; + auto bliter = bl.cbegin(); + try { + decode(slo_info, bliter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode slo manifest" << dendl; + return -EIO; + } + ldpp_dout(this, 2) << "RGWGetObj::handle_slo_manifest()" << dendl; + + vector allocated_acls; + map>> policies; + map buckets; + + map slo_parts; + + MD5 etag_sum; + total_len = 0; + + for (const auto& entry : slo_info.entries) { + const string& path = entry.path; + + /* If the path starts with slashes, strip them all. 
*/ + const size_t pos_init = path.find_first_not_of('/'); + /* According to the documentation of std::string::find following check + * is not necessary as we should get the std::string::npos propagation + * here. This might be true with the accuracy to implementation's bugs. + * See following question on SO: + * http://stackoverflow.com/questions/1011790/why-does-stdstring-findtext-stdstringnpos-not-return-npos + */ + if (pos_init == string::npos) { + return -EINVAL; + } + + const size_t pos_sep = path.find('/', pos_init); + if (pos_sep == string::npos) { + return -EINVAL; + } + + string bucket_name = path.substr(pos_init, pos_sep - pos_init); + string obj_name = path.substr(pos_sep + 1); + + rgw_bucket bucket; + RGWAccessControlPolicy *bucket_acl; + Policy* bucket_policy; + + if (bucket_name.compare(s->bucket.name) != 0) { + const auto& piter = policies.find(bucket_name); + if (piter != policies.end()) { + bucket_acl = piter->second.first; + bucket_policy = piter->second.second.get_ptr(); + bucket = buckets[bucket_name]; + } else { + allocated_acls.push_back(RGWAccessControlPolicy(s->cct)); + RGWAccessControlPolicy& _bucket_acl = allocated_acls.back(); + + RGWBucketInfo bucket_info; + map bucket_attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + int r = store->get_bucket_info(obj_ctx, s->user->user_id.tenant, + bucket_name, bucket_info, nullptr, + &bucket_attrs); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + bucket = bucket_info.bucket; + bucket_acl = &_bucket_acl; + r = read_bucket_policy(store, s, bucket_info, bucket_attrs, bucket_acl, + bucket); + if (r < 0) { + ldpp_dout(this, 0) << "failed to read bucket ACL for bucket " + << bucket << dendl; + return r; + } + auto _bucket_policy = get_iam_policy_from_attr( + s->cct, store, bucket_attrs, bucket_info.bucket.tenant); + bucket_policy = _bucket_policy.get_ptr(); + buckets[bucket_name] = bucket; + policies[bucket_name] = 
make_pair(bucket_acl, _bucket_policy); + } + } else { + bucket = s->bucket; + bucket_acl = s->bucket_acl.get(); + bucket_policy = s->iam_policy.get_ptr(); + } + + rgw_slo_part part; + part.bucket_acl = bucket_acl; + part.bucket_policy = bucket_policy; + part.bucket = bucket; + part.obj_name = obj_name; + part.size = entry.size_bytes; + part.etag = entry.etag; + ldpp_dout(this, 20) << "slo_part: bucket=" << part.bucket + << " obj=" << part.obj_name + << " size=" << part.size + << " etag=" << part.etag + << dendl; + + etag_sum.Update((const unsigned char *)entry.etag.c_str(), + entry.etag.length()); + + slo_parts[total_len] = part; + total_len += part.size; + } /* foreach entry */ + + complete_etag(etag_sum, &lo_etag); + + s->obj_size = slo_info.total_size; + ldpp_dout(this, 20) << "s->obj_size=" << s->obj_size << dendl; + + int r = RGWRados::Object::Read::range_to_ofs(total_len, ofs, end); + if (r < 0) { + return r; + } + + total_len = end - ofs + 1; + ldpp_dout(this, 20) << "Requested: ofs=" << ofs + << " end=" << end + << " total=" << total_len + << dendl; + + r = iterate_slo_parts(s->cct, store, ofs, end, slo_parts, + get_obj_user_manifest_iterate_cb, (void *)this); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + /* garbage collection related handling */ + utime_t start_time = ceph_clock_now(); + if (start_time > gc_invalidate_time) { + int r = store->defer_gc(s->obj_ctx, s->bucket_info, obj); + if (r < 0) { + ldpp_dout(this, 0) << "WARNING: could not defer gc entry for obj" << dendl; + } + gc_invalidate_time = start_time; + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + } + return send_response_data(bl, bl_ofs, bl_len); +} + +bool RGWGetObj::prefetch_data() +{ + /* HEAD request, stop prefetch*/ + if (!get_data) { + return false; + } + + range_str = s->info.env->get("HTTP_RANGE"); + // TODO: add range prefetch + if (range_str) { + parse_range(); + return false; + } + 
+ return get_data; +} + +void RGWGetObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +static bool object_is_expired(map& attrs) { + map::iterator iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != attrs.end()) { + utime_t delete_at; + try { + decode(delete_at, iter->second); + } catch (buffer::error& err) { + dout(0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl; + return false; + } + + if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) { + return true; + } + } + + return false; +} + +static inline void rgw_cond_decode_objtags( + struct req_state *s, + const std::map &attrs) +{ + const auto& tags = attrs.find(RGW_ATTR_TAGS); + if (tags != attrs.end()) { + try { + bufferlist::const_iterator iter{&tags->second}; + s->tagset.decode(iter); + } catch (buffer::error& err) { + ldout(s->cct, 0) + << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + } + } +} + +void RGWGetObj::execute() +{ + bufferlist bl; + gc_invalidate_time = ceph_clock_now(); + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + + bool need_decompress; + int64_t ofs_x, end_x; + + RGWGetObj_CB cb(this); + RGWGetObj_Filter* filter = (RGWGetObj_Filter *)&cb; + boost::optional decompress; + std::unique_ptr decrypt; + map::iterator attr_iter; + + perfcounter->inc(l_rgw_get); + + RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), obj); + RGWRados::Object::Read read_op(&op_target); + + op_ret = get_params(); + if (op_ret < 0) + goto done_err; + + op_ret = init_common(); + if (op_ret < 0) + goto done_err; + + read_op.conds.mod_ptr = mod_ptr; + read_op.conds.unmod_ptr = unmod_ptr; + read_op.conds.high_precision_time = s->system_request; /* system request need to use high precision time */ + read_op.conds.mod_zone_id = mod_zone_id; + read_op.conds.mod_pg_ver = mod_pg_ver; + read_op.conds.if_match = if_match; + read_op.conds.if_nomatch = if_nomatch; + read_op.params.attrs = &attrs; + 
read_op.params.lastmod = &lastmod; + read_op.params.obj_size = &s->obj_size; + + op_ret = read_op.prepare(); + if (op_ret < 0) + goto done_err; + version_id = read_op.state.obj.key.instance; + + /* STAT ops don't need data, and do no i/o */ + if (get_type() == RGW_OP_STAT_OBJ) { + return; + } + + /* start gettorrent */ + if (torrent.get_flag()) + { + attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE); + if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") { + ldpp_dout(this, 0) << "ERROR: torrents are not supported for objects " + "encrypted with SSE-C" << dendl; + op_ret = -EINVAL; + goto done_err; + } + torrent.init(s, store); + op_ret = torrent.get_torrent_file(read_op, total_len, bl, obj); + if (op_ret < 0) + { + ldpp_dout(this, 0) << "ERROR: failed to get_torrent_file ret= " << op_ret + << dendl; + goto done_err; + } + op_ret = send_response_data(bl, 0, total_len); + if (op_ret < 0) + { + ldpp_dout(this, 0) << "ERROR: failed to send_response_data ret= " << op_ret << dendl; + goto done_err; + } + return; + } + /* end gettorrent */ + + op_ret = rgw_compression_info_from_attrset(attrs, need_decompress, cs_info); + if (op_ret < 0) { + ldpp_dout(s, 0) << "ERROR: failed to decode compression info, cannot decompress" << dendl; + goto done_err; + } + if (need_decompress) { + s->obj_size = cs_info.orig_size; + decompress.emplace(s->cct, &cs_info, partial_content, filter); + filter = &*decompress; + } + + attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST); + if (attr_iter != attrs.end() && !skip_manifest) { + op_ret = handle_user_manifest(attr_iter->second.c_str()); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to handle user manifest ret=" + << op_ret << dendl; + goto done_err; + } + return; + } + + attr_iter = attrs.find(RGW_ATTR_SLO_MANIFEST); + if (attr_iter != attrs.end() && !skip_manifest) { + is_slo = true; + op_ret = handle_slo_manifest(attr_iter->second); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to handle slo 
manifest ret=" << op_ret + << dendl; + goto done_err; + } + return; + } + + // for range requests with obj size 0 + if (range_str && !(s->obj_size)) { + total_len = 0; + op_ret = -ERANGE; + goto done_err; + } + + op_ret = read_op.range_to_ofs(s->obj_size, ofs, end); + if (op_ret < 0) + goto done_err; + total_len = (ofs <= end ? end + 1 - ofs : 0); + + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && object_is_expired(attrs)) { + op_ret = -ENOENT; + goto done_err; + } + + /* Decode S3 objtags, if any */ + rgw_cond_decode_objtags(s, attrs); + + start = ofs; + + attr_iter = attrs.find(RGW_ATTR_MANIFEST); + op_ret = this->get_decrypt_filter(&decrypt, filter, + attr_iter != attrs.end() ? &(attr_iter->second) : nullptr); + if (decrypt != nullptr) { + filter = decrypt.get(); + } + if (op_ret < 0) { + goto done_err; + } + + if (!get_data || ofs > end) { + send_response_data(bl, 0, 0); + return; + } + + perfcounter->inc(l_rgw_get_b, end - ofs); + + ofs_x = ofs; + end_x = end; + filter->fixup_range(ofs_x, end_x); + op_ret = read_op.iterate(ofs_x, end_x, filter); + + if (op_ret >= 0) + op_ret = filter->flush(); + + perfcounter->tinc(l_rgw_get_lat, s->time_elapsed()); + if (op_ret < 0) { + goto done_err; + } + + op_ret = send_response_data(bl, 0, 0); + if (op_ret < 0) { + goto done_err; + } + return; + +done_err: + send_response_data_error(); +} + +int RGWGetObj::init_common() +{ + if (range_str) { + /* range parsed error when prefetch */ + if (!range_parsed) { + int r = parse_range(); + if (r < 0) + return r; + } + } + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) + return -EINVAL; + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) + return -EINVAL; + unmod_ptr = &unmod_time; + } + + return 0; +} + +int RGWListBuckets::verify_permission() +{ + rgw::Partition partition = rgw::Partition::aws; + rgw::Service service = 
rgw::Service::s3; + + if (!verify_user_permission(this, s, ARN(partition, service, "", s->user->user_id.tenant, "*"), rgw::IAM::s3ListAllMyBuckets)) { + return -EACCES; + } + + return 0; +} + +int RGWGetUsage::verify_permission() +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + return 0; +} + +void RGWListBuckets::execute() +{ + bool done; + bool started = false; + uint64_t total_count = 0; + + const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + + op_ret = get_params(); + if (op_ret < 0) { + goto send_end; + } + + if (supports_account_metadata()) { + op_ret = rgw_get_user_attrs_by_uid(store, s->user->user_id, attrs); + if (op_ret < 0) { + goto send_end; + } + } + + is_truncated = false; + do { + RGWUserBuckets buckets; + uint64_t read_count; + if (limit >= 0) { + read_count = min(limit - total_count, max_buckets); + } else { + read_count = max_buckets; + } + + op_ret = rgw_read_user_buckets(store, s->user->user_id, buckets, + marker, end_marker, read_count, + should_get_stats(), &is_truncated, + get_default_max()); + if (op_ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldpp_dout(this, 10) << "WARNING: failed on rgw_get_user_buckets uid=" + << s->user->user_id << dendl; + break; + } + + /* We need to have stats for all our policies - even if a given policy + * isn't actually used in a given account. In such situation its usage + * stats would be simply full of zeros. 
*/ + for (const auto& policy : store->svc.zone->get_zonegroup().placement_targets) { + policies_stats.emplace(policy.second.name, + decltype(policies_stats)::mapped_type()); + } + + std::map& m = buckets.get_buckets(); + for (const auto& kv : m) { + const auto& bucket = kv.second; + + global_stats.bytes_used += bucket.size; + global_stats.bytes_used_rounded += bucket.size_rounded; + global_stats.objects_count += bucket.count; + + /* operator[] still can create a new entry for storage policy seen + * for first time. */ + auto& policy_stats = policies_stats[bucket.placement_rule.to_str()]; + policy_stats.bytes_used += bucket.size; + policy_stats.bytes_used_rounded += bucket.size_rounded; + policy_stats.buckets_count++; + policy_stats.objects_count += bucket.count; + } + global_stats.buckets_count += m.size(); + total_count += m.size(); + + done = (m.size() < read_count || (limit >= 0 && total_count >= (uint64_t)limit)); + + if (!started) { + send_response_begin(buckets.count() > 0); + started = true; + } + + if (!m.empty()) { + map::reverse_iterator riter = m.rbegin(); + marker = riter->first; + + handle_listing_chunk(std::move(buckets)); + } + } while (is_truncated && !done); + +send_end: + if (!started) { + send_response_begin(false); + } + send_response_end(); +} + +void RGWGetUsage::execute() +{ + uint64_t start_epoch = 0; + uint64_t end_epoch = (uint64_t)-1; + op_ret = get_params(); + if (op_ret < 0) + return; + + if (!start_date.empty()) { + op_ret = utime_t::parse_date(start_date, &start_epoch, NULL); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse start date" << dendl; + return; + } + } + + if (!end_date.empty()) { + op_ret = utime_t::parse_date(end_date, &end_epoch, NULL); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse end date" << dendl; + return; + } + } + + uint32_t max_entries = 1000; + + bool is_truncated = true; + + RGWUsageIter usage_iter; + + while (is_truncated) { + op_ret = 
store->read_usage(s->user->user_id, s->bucket_name, start_epoch, end_epoch, max_entries, + &is_truncated, usage_iter, usage); + + if (op_ret == -ENOENT) { + op_ret = 0; + is_truncated = false; + } + + if (op_ret < 0) { + return; + } + } + + op_ret = rgw_user_sync_all_stats(store, s->user->user_id); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to sync user stats" << dendl; + return; + } + + op_ret = rgw_user_get_all_buckets_stats(store, s->user->user_id, buckets_usage); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get user's buckets stats" << dendl; + return; + } + + string user_str = s->user->user_id.to_str(); + op_ret = store->cls_user_get_header(user_str, &header); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: can't read user header" << dendl; + return; + } + + return; +} + +int RGWStatAccount::verify_permission() +{ + if (!verify_user_permission_no_policy(this, s, RGW_PERM_READ)) { + return -EACCES; + } + + return 0; +} + +void RGWStatAccount::execute() +{ + string marker; + bool is_truncated = false; + uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + const string *lastmarker; + + do { + RGWUserBuckets buckets; + + lastmarker = nullptr; + op_ret = rgw_read_user_buckets(store, s->user->user_id, buckets, marker, + string(), max_buckets, true, &is_truncated); + if (op_ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldpp_dout(this, 10) << "WARNING: failed on rgw_read_user_buckets uid=" + << s->user->user_id << " ret=" << op_ret << dendl; + break; + } else { + /* We need to have stats for all our policies - even if a given policy + * isn't actually used in a given account. In such situation its usage + * stats would be simply full of zeros. 
*/ + for (const auto& policy : store->svc.zone->get_zonegroup().placement_targets) { + policies_stats.emplace(policy.second.name, + decltype(policies_stats)::mapped_type()); + } + + std::map& m = buckets.get_buckets(); + for (const auto& kv : m) { + const auto& bucket = kv.second; + lastmarker = &kv.first; + + global_stats.bytes_used += bucket.size; + global_stats.bytes_used_rounded += bucket.size_rounded; + global_stats.objects_count += bucket.count; + + /* operator[] still can create a new entry for storage policy seen + * for first time. */ + auto& policy_stats = policies_stats[bucket.placement_rule.to_str()]; + policy_stats.bytes_used += bucket.size; + policy_stats.bytes_used_rounded += bucket.size_rounded; + policy_stats.buckets_count++; + policy_stats.objects_count += bucket.count; + } + global_stats.buckets_count += m.size(); + + } + if (!lastmarker) { + lderr(s->cct) << "ERROR: rgw_read_user_buckets, stasis at marker=" + << marker << " uid=" << s->user->user_id << dendl; + break; + } + marker = *lastmarker; + } while (is_truncated); +} + +int RGWGetBucketVersioning::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketVersioning); +} + +void RGWGetBucketVersioning::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetBucketVersioning::execute() +{ + if (! s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + versioned = s->bucket_info.versioned(); + versioning_enabled = s->bucket_info.versioning_enabled(); + mfa_enabled = s->bucket_info.mfa_enabled(); +} + +int RGWSetBucketVersioning::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketVersioning); +} + +void RGWSetBucketVersioning::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetBucketVersioning::execute() +{ + op_ret = get_params(); + if (op_ret < 0) + return; + + if (! 
s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + if (s->bucket_info.obj_lock_enabled() && versioning_status != VersioningEnabled) { + op_ret = -ERR_INVALID_BUCKET_STATE; + return; + } + + bool cur_mfa_status = (s->bucket_info.flags & BUCKET_MFA_ENABLED) != 0; + + mfa_set_status &= (mfa_status != cur_mfa_status); + + if (mfa_set_status && + !s->mfa_verified) { + op_ret = -ERR_MFA_REQUIRED; + return; + } + //if mfa is enabled for bucket, make sure mfa code is validated in case versioned status gets changed + if (cur_mfa_status) { + bool req_versioning_status = false; + //if requested versioning status is not the same as the one set for the bucket, return error + if (versioning_status == VersioningEnabled) { + req_versioning_status = (s->bucket_info.flags & BUCKET_VERSIONS_SUSPENDED) != 0; + } else if (versioning_status == VersioningSuspended) { + req_versioning_status = (s->bucket_info.flags & BUCKET_VERSIONS_SUSPENDED) == 0; + } + if (req_versioning_status && !s->mfa_verified) { + op_ret = -ERR_MFA_REQUIRED; + return; + } + } + + if (!store->svc.zone->is_meta_master()) { + op_ret = forward_request_to_master(s, NULL, store, in_data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + bool modified = mfa_set_status; + + op_ret = retry_raced_bucket_write(store, s, [&] { + if (mfa_set_status) { + if (mfa_status) { + s->bucket_info.flags |= BUCKET_MFA_ENABLED; + } else { + s->bucket_info.flags &= ~BUCKET_MFA_ENABLED; + } + } + + if (versioning_status == VersioningEnabled) { + s->bucket_info.flags |= BUCKET_VERSIONED; + s->bucket_info.flags &= ~BUCKET_VERSIONS_SUSPENDED; + modified = true; + } else if (versioning_status == VersioningSuspended) { + s->bucket_info.flags |= (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); + modified = true; + } else { + return op_ret; + } + return store->put_bucket_instance_info(s->bucket_info, false, real_time(), + &s->bucket_attrs); + 
}); + + if (!modified) { + return; + } + + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWGetBucketWebsite::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketWebsite); +} + +void RGWGetBucketWebsite::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetBucketWebsite::execute() +{ + if (!s->bucket_info.has_website) { + op_ret = -ERR_NO_SUCH_WEBSITE_CONFIGURATION; + } +} + +int RGWSetBucketWebsite::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketWebsite); +} + +void RGWSetBucketWebsite::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetBucketWebsite::execute() +{ + op_ret = get_params(); + + if (op_ret < 0) + return; + + if (!store->svc.zone->is_meta_master()) { + op_ret = forward_request_to_master(s, NULL, store, in_data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << " forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = retry_raced_bucket_write(store, s, [this] { + s->bucket_info.has_website = true; + s->bucket_info.website_conf = website_conf; + op_ret = store->put_bucket_instance_info(s->bucket_info, false, + real_time(), &s->bucket_attrs); + return op_ret; + }); + + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWDeleteBucketWebsite::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3DeleteBucketWebsite); +} + +void RGWDeleteBucketWebsite::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteBucketWebsite::execute() +{ + + if (!store->svc.zone->is_meta_master()) { + bufferlist in_data; + op_ret = forward_request_to_master(s, nullptr, store, in_data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: forward_to_master failed on bucket=" 
<< s->bucket.name + << "returned err=" << op_ret << dendl; + return; + } + } + op_ret = retry_raced_bucket_write(store, s, [this] { + s->bucket_info.has_website = false; + s->bucket_info.website_conf = RGWBucketWebsiteConf(); + op_ret = store->put_bucket_instance_info(s->bucket_info, false, + real_time(), &s->bucket_attrs); + return op_ret; + }); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWStatBucket::verify_permission() +{ + // This (a HEAD request on a bucket) is governed by the s3:ListBucket permission. + if (!verify_bucket_permission(this, s, rgw::IAM::s3ListBucket)) { + return -EACCES; + } + + return 0; +} + +void RGWStatBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWStatBucket::execute() +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + RGWUserBuckets buckets; + bucket.bucket = s->bucket; + buckets.add(bucket); + map& m = buckets.get_buckets(); + op_ret = store->update_containers_stats(m); + if (! op_ret) + op_ret = -EEXIST; + if (op_ret > 0) { + op_ret = 0; + map::iterator iter = m.find(bucket.bucket.name); + if (iter != m.end()) { + bucket = iter->second; + } else { + op_ret = -EINVAL; + } + } +} + +int RGWListBucket::verify_permission() +{ + op_ret = get_params(); + if (op_ret < 0) { + return op_ret; + } + if (!prefix.empty()) + s->env.emplace("s3:prefix", prefix); + + if (!delimiter.empty()) + s->env.emplace("s3:delimiter", delimiter); + + s->env.emplace("s3:max-keys", std::to_string(max)); + + if (!verify_bucket_permission(this, + s, + list_versions ? 
+ rgw::IAM::s3ListBucketVersions : + rgw::IAM::s3ListBucket)) { + return -EACCES; + } + + return 0; +} + +int RGWListBucket::parse_max_keys() +{ + // Bound max value of max-keys to configured value for security + // Bound min value of max-keys to '0' + // Some S3 clients explicitly send max-keys=0 to detect if the bucket is + // empty without listing any items. + return parse_value_and_bound(max_keys, max, 0, + g_conf().get_val("rgw_max_listing_results"), + default_max); +} + +void RGWListBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWListBucket::execute() +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + if (allow_unordered && !delimiter.empty()) { + ldpp_dout(this, 0) << + "ERROR: unordered bucket listing requested with a delimiter" << dendl; + op_ret = -EINVAL; + return; + } + + if (need_container_stats()) { + map m; + m[s->bucket.name] = RGWBucketEnt(); + m.begin()->second.bucket = s->bucket; + op_ret = store->update_containers_stats(m); + if (op_ret > 0) { + bucket = m.begin()->second; + } + } + + RGWRados::Bucket target(store, s->bucket_info); + if (shard_id >= 0) { + target.set_shard_id(shard_id); + } + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = prefix; + list_op.params.delim = delimiter; + list_op.params.marker = marker; + list_op.params.end_marker = end_marker; + list_op.params.list_versions = list_versions; + list_op.params.allow_unordered = allow_unordered; + + op_ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated); + if (op_ret >= 0) { + next_marker = list_op.get_next_marker(); + } +} + +int RGWGetBucketLogging::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketLogging); +} + +int RGWGetBucketLocation::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketLocation); +} + +int RGWCreateBucket::verify_permission() +{ + /* This check is mostly needed for S3 that doesn't support account ACL. 
+ * Swift doesn't allow to delegate any permission to an anonymous user, + * so it will become an early exit in such case. */ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + rgw_bucket bucket; + bucket.name = s->bucket_name; + bucket.tenant = s->bucket_tenant; + ARN arn = ARN(bucket); + if (!verify_user_permission(this, s, arn, rgw::IAM::s3CreateBucket)) { + return -EACCES; + } + + if (s->user->user_id.tenant != s->bucket_tenant) { + ldpp_dout(this, 10) << "user cannot create a bucket in a different tenant" + << " (user_id.tenant=" << s->user->user_id.tenant + << " requested=" << s->bucket_tenant << ")" + << dendl; + return -EACCES; + } + if (s->user->max_buckets < 0) { + return -EPERM; + } + + if (s->user->max_buckets) { + RGWUserBuckets buckets; + string marker; + bool is_truncated = false; + op_ret = rgw_read_user_buckets(store, s->user->user_id, buckets, + marker, string(), s->user->max_buckets, + false, &is_truncated); + if (op_ret < 0) { + return op_ret; + } + + if ((int)buckets.count() >= s->user->max_buckets) { + return -ERR_TOO_MANY_BUCKETS; + } + } + + return 0; +} + +static int forward_request_to_master(struct req_state *s, obj_version *objv, + RGWRados *store, bufferlist& in_data, + JSONParser *jp, req_info *forward_info) +{ + if (!store->svc.zone->get_master_conn()) { + ldpp_dout(s, 0) << "rest connection is invalid" << dendl; + return -EINVAL; + } + ldpp_dout(s, 0) << "sending request to master zonegroup" << dendl; + bufferlist response; + string uid_str = s->user->user_id.to_str(); +#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response + int ret = store->svc.zone->get_master_conn()->forward(uid_str, (forward_info ? 
*forward_info : s->info), + objv, MAX_REST_RESPONSE, &in_data, &response); + if (ret < 0) + return ret; + + ldpp_dout(s, 20) << "response: " << response.c_str() << dendl; + if (jp && !jp->parse(response.c_str(), response.length())) { + ldpp_dout(s, 0) << "failed parsing response from master zonegroup" << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWCreateBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +static void prepare_add_del_attrs(const map& orig_attrs, + map& out_attrs, + map& out_rmattrs) +{ + for (const auto& kv : orig_attrs) { + const string& name = kv.first; + + /* Check if the attr is user-defined metadata item. */ + if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, + RGW_ATTR_META_PREFIX) == 0) { + /* For the objects all existing meta attrs have to be removed. */ + out_rmattrs[name] = kv.second; + } else if (out_attrs.find(name) == std::end(out_attrs)) { + out_attrs[name] = kv.second; + } + } +} + +/* Fuse resource metadata basing on original attributes in @orig_attrs, set + * of _custom_ attribute names to remove in @rmattr_names and attributes in + * @out_attrs. Place results in @out_attrs. + * + * NOTE: it's supposed that all special attrs already present in @out_attrs + * will be preserved without any change. Special attributes are those which + * names start with RGW_ATTR_META_PREFIX. They're complement to custom ones + * used for X-Account-Meta-*, X-Container-Meta-*, X-Amz-Meta and so on. */ +static void prepare_add_del_attrs(const map& orig_attrs, + const set& rmattr_names, + map& out_attrs) +{ + for (const auto& kv : orig_attrs) { + const string& name = kv.first; + + /* Check if the attr is user-defined metadata item. */ + if (name.compare(0, strlen(RGW_ATTR_META_PREFIX), + RGW_ATTR_META_PREFIX) == 0) { + /* For the buckets all existing meta attrs are preserved, + except those that are listed in rmattr_names. 
*/ + if (rmattr_names.find(name) != std::end(rmattr_names)) { + const auto aiter = out_attrs.find(name); + + if (aiter != std::end(out_attrs)) { + out_attrs.erase(aiter); + } + } else { + /* emplace() won't alter the map if the key is already present. + * This behaviour is fully intensional here. */ + out_attrs.emplace(kv); + } + } else if (out_attrs.find(name) == std::end(out_attrs)) { + out_attrs[name] = kv.second; + } + } +} + + +static void populate_with_generic_attrs(const req_state * const s, + map& out_attrs) +{ + for (const auto& kv : s->generic_attrs) { + bufferlist& attrbl = out_attrs[kv.first]; + const string& val = kv.second; + attrbl.clear(); + attrbl.append(val.c_str(), val.size() + 1); + } +} + + +static int filter_out_quota_info(std::map& add_attrs, + const std::set& rmattr_names, + RGWQuotaInfo& quota, + bool * quota_extracted = nullptr) +{ + bool extracted = false; + + /* Put new limit on max objects. */ + auto iter = add_attrs.find(RGW_ATTR_QUOTA_NOBJS); + std::string err; + if (std::end(add_attrs) != iter) { + quota.max_objects = + static_cast(strict_strtoll(iter->second.c_str(), 10, &err)); + if (!err.empty()) { + return -EINVAL; + } + add_attrs.erase(iter); + extracted = true; + } + + /* Put new limit on bucket (container) size. */ + iter = add_attrs.find(RGW_ATTR_QUOTA_MSIZE); + if (iter != add_attrs.end()) { + quota.max_size = + static_cast(strict_strtoll(iter->second.c_str(), 10, &err)); + if (!err.empty()) { + return -EINVAL; + } + add_attrs.erase(iter); + extracted = true; + } + + for (const auto& name : rmattr_names) { + /* Remove limit on max objects. */ + if (name.compare(RGW_ATTR_QUOTA_NOBJS) == 0) { + quota.max_objects = -1; + extracted = true; + } + + /* Remove limit on max bucket size. */ + if (name.compare(RGW_ATTR_QUOTA_MSIZE) == 0) { + quota.max_size = -1; + extracted = true; + } + } + + /* Swift requries checking on raw usage instead of the 4 KiB rounded one. 
*/ + quota.check_on_raw = true; + quota.enabled = quota.max_size > 0 || quota.max_objects > 0; + + if (quota_extracted) { + *quota_extracted = extracted; + } + + return 0; +} + + +static void filter_out_website(std::map& add_attrs, + const std::set& rmattr_names, + RGWBucketWebsiteConf& ws_conf) +{ + std::string lstval; + + /* Let's define a mapping between each custom attribute and the memory where + * attribute's value should be stored. The memory location is expressed by + * a non-const reference. */ + const auto mapping = { + std::make_pair(RGW_ATTR_WEB_INDEX, std::ref(ws_conf.index_doc_suffix)), + std::make_pair(RGW_ATTR_WEB_ERROR, std::ref(ws_conf.error_doc)), + std::make_pair(RGW_ATTR_WEB_LISTINGS, std::ref(lstval)), + std::make_pair(RGW_ATTR_WEB_LIST_CSS, std::ref(ws_conf.listing_css_doc)), + std::make_pair(RGW_ATTR_SUBDIR_MARKER, std::ref(ws_conf.subdir_marker)) + }; + + for (const auto& kv : mapping) { + const char * const key = kv.first; + auto& target = kv.second; + + auto iter = add_attrs.find(key); + + if (std::end(add_attrs) != iter) { + /* The "target" is a reference to ws_conf. */ + target = iter->second.c_str(); + add_attrs.erase(iter); + } + + if (rmattr_names.count(key)) { + target = std::string(); + } + } + + if (! 
lstval.empty()) { + ws_conf.listing_enabled = boost::algorithm::iequals(lstval, "true"); + } +} + + +void RGWCreateBucket::execute() +{ + RGWAccessControlPolicy old_policy(s->cct); + buffer::list aclbl; + buffer::list corsbl; + bool existed; + string bucket_name; + rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, bucket_name); + rgw_raw_obj obj(store->svc.zone->get_zone_params().domain_root, bucket_name); + obj_version objv, *pobjv = NULL; + + op_ret = get_params(); + if (op_ret < 0) + return; + + if (!relaxed_region_enforcement && + !location_constraint.empty() && + !store->svc.zone->has_zonegroup_api(location_constraint)) { + ldpp_dout(this, 0) << "location constraint (" << location_constraint << ")" + << " can't be found." << dendl; + op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; + s->err.message = "The specified location-constraint is not valid"; + return; + } + + if (!relaxed_region_enforcement && !store->svc.zone->get_zonegroup().is_master_zonegroup() && !location_constraint.empty() && + store->svc.zone->get_zonegroup().api_name != location_constraint) { + ldpp_dout(this, 0) << "location constraint (" << location_constraint << ")" + << " doesn't match zonegroup" << " (" << store->svc.zone->get_zonegroup().api_name << ")" + << dendl; + op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; + s->err.message = "The specified location-constraint is not valid"; + return; + } + + const auto& zonegroup = store->svc.zone->get_zonegroup(); + if (!placement_rule.name.empty() && + !zonegroup.placement_targets.count(placement_rule.name)) { + ldpp_dout(this, 0) << "placement target (" << placement_rule.name << ")" + << " doesn't exist in the placement targets of zonegroup" + << " (" << store->svc.zone->get_zonegroup().api_name << ")" << dendl; + op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; + s->err.message = "The specified placement target does not exist"; + return; + } + + /* we need to make sure we read bucket info, it's not read before for this + * specific request */ + 
op_ret = store->get_bucket_info(*s->sysobj_ctx, s->bucket_tenant, s->bucket_name, + s->bucket_info, nullptr, &s->bucket_attrs); + if (op_ret < 0 && op_ret != -ENOENT) + return; + s->bucket_exists = (op_ret != -ENOENT); + + s->bucket_owner.set_id(s->user->user_id); + s->bucket_owner.set_name(s->user->display_name); + if (s->bucket_exists) { + int r = rgw_op_get_bucket_policy_from_attr(s->cct, store, s->bucket_info, + s->bucket_attrs, &old_policy); + if (r >= 0) { + if (old_policy.get_owner().get_id().compare(s->user->user_id) != 0) { + op_ret = -EEXIST; + return; + } + } + } + + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket; + uint32_t *pmaster_num_shards; + real_time creation_time; + + if (!store->svc.zone->is_meta_master()) { + JSONParser jp; + op_ret = forward_request_to_master(s, NULL, store, in_data, &jp); + if (op_ret < 0) { + return; + } + + JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp); + JSONDecoder::decode_json("object_ver", objv, &jp); + JSONDecoder::decode_json("bucket_info", master_info, &jp); + ldpp_dout(this, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl; + ldpp_dout(this, 20) << "got creation time: << " << master_info.creation_time << dendl; + pmaster_bucket= &master_info.bucket; + creation_time = master_info.creation_time; + pmaster_num_shards = &master_info.num_shards; + pobjv = &objv; + obj_lock_enabled = master_info.obj_lock_enabled(); + } else { + pmaster_bucket = NULL; + pmaster_num_shards = NULL; + } + + string zonegroup_id; + + if (s->system_request) { + zonegroup_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup"); + if (zonegroup_id.empty()) { + zonegroup_id = store->svc.zone->get_zonegroup().get_id(); + } + } else { + zonegroup_id = store->svc.zone->get_zonegroup().get_id(); + } + + if (s->bucket_exists) { + rgw_placement_rule selected_placement_rule; + rgw_bucket bucket; + bucket.tenant = s->bucket_tenant; + bucket.name = s->bucket_name; + op_ret = 
store->svc.zone->select_bucket_placement(*(s->user), zonegroup_id, + placement_rule, + &selected_placement_rule, nullptr); + if (selected_placement_rule != s->bucket_info.placement_rule) { + op_ret = -EEXIST; + return; + } + } + + /* Encode special metadata first as we're using std::map::emplace under + * the hood. This method will add the new items only if the map doesn't + * contain such keys yet. */ + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + if (has_cors) { + cors_config.encode(corsbl); + emplace_attr(RGW_ATTR_CORS, std::move(corsbl)); + } + + RGWQuotaInfo quota_info; + const RGWQuotaInfo * pquota_info = nullptr; + if (need_metadata_upload()) { + /* It's supposed that following functions WILL NOT change any special + * attributes (like RGW_ATTR_ACL) if they are already present in attrs. */ + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false); + if (op_ret < 0) { + return; + } + prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + + op_ret = filter_out_quota_info(attrs, rmattr_names, quota_info); + if (op_ret < 0) { + return; + } else { + pquota_info = "a_info; + } + + /* Web site of Swift API. */ + filter_out_website(attrs, rmattr_names, s->bucket_info.website_conf); + s->bucket_info.has_website = !s->bucket_info.website_conf.is_empty(); + } + + s->bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */ + s->bucket.name = s->bucket_name; + + /* Handle updates of the metadata for Swift's object versioning. */ + if (swift_ver_location) { + s->bucket_info.swift_ver_location = *swift_ver_location; + s->bucket_info.swift_versioning = (! 
swift_ver_location->empty()); + } + if (obj_lock_enabled) { + info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED; + } + + + op_ret = store->create_bucket(*(s->user), s->bucket, zonegroup_id, + placement_rule, s->bucket_info.swift_ver_location, + pquota_info, attrs, + info, pobjv, &ep_objv, creation_time, + pmaster_bucket, pmaster_num_shards, true); + /* continue if EEXIST and create_bucket will fail below. this way we can + * recover from a partial create by retrying it. */ + ldpp_dout(this, 20) << "rgw_create_bucket returned ret=" << op_ret << " bucket=" << s->bucket << dendl; + + if (op_ret && op_ret != -EEXIST) + return; + + existed = (op_ret == -EEXIST); + + if (existed) { + /* bucket already existed, might have raced with another bucket creation, or + * might be partial bucket creation that never completed. Read existing bucket + * info, verify that the reported bucket owner is the current user. + * If all is ok then update the user's list of buckets. + * Otherwise inform client about a name conflict. + */ + if (info.owner.compare(s->user->user_id) != 0) { + op_ret = -EEXIST; + return; + } + s->bucket = info.bucket; + } + + op_ret = rgw_link_bucket(store, s->user->user_id, s->bucket, + info.creation_time, false); + if (op_ret && !existed && op_ret != -EEXIST) { + /* if it exists (or previously existed), don't remove it! */ + op_ret = rgw_unlink_bucket(store, s->user->user_id, s->bucket.tenant, + s->bucket.name); + if (op_ret < 0) { + ldpp_dout(this, 0) << "WARNING: failed to unlink bucket: ret=" << op_ret + << dendl; + } + } else if (op_ret == -EEXIST || (op_ret == 0 && existed)) { + op_ret = -ERR_BUCKET_EXISTS; + } + + if (need_metadata_upload() && existed) { + /* OK, it looks we lost race with another request. As it's required to + * handle metadata fusion and upload, the whole operation becomes very + * similar in nature to PutMetadataBucket. However, as the attrs may + * changed in the meantime, we have to refresh. 
*/ + short tries = 0; + do { + RGWBucketInfo binfo; + map battrs; + + op_ret = store->get_bucket_info(*s->sysobj_ctx, s->bucket_tenant, s->bucket_name, + binfo, nullptr, &battrs); + if (op_ret < 0) { + return; + } else if (binfo.owner.compare(s->user->user_id) != 0) { + /* New bucket doesn't belong to the account we're operating on. */ + op_ret = -EEXIST; + return; + } else { + s->bucket_info = binfo; + s->bucket_attrs = battrs; + } + + attrs.clear(); + + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false); + if (op_ret < 0) { + return; + } + prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket_info.quota); + if (op_ret < 0) { + return; + } + + /* Handle updates of the metadata for Swift's object versioning. */ + if (swift_ver_location) { + s->bucket_info.swift_ver_location = *swift_ver_location; + s->bucket_info.swift_versioning = (! swift_ver_location->empty()); + } + + /* Web site of Swift API. */ + filter_out_website(attrs, rmattr_names, s->bucket_info.website_conf); + s->bucket_info.has_website = !s->bucket_info.website_conf.is_empty(); + + /* This will also set the quota on the bucket. */ + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, + &s->bucket_info.objv_tracker); + } while (op_ret == -ECANCELED && tries++ < 20); + + /* Restore the proper return code. 
*/ + if (op_ret >= 0) { + op_ret = -ERR_BUCKET_EXISTS; + } + } +} + +int RGWDeleteBucket::verify_permission() +{ + if (!verify_bucket_permission(this, s, rgw::IAM::s3DeleteBucket)) { + return -EACCES; + } + + return 0; +} + +void RGWDeleteBucket::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteBucket::execute() +{ + if (s->bucket_name.empty()) { + op_ret = -EINVAL; + return; + } + + if (!s->bucket_exists) { + ldpp_dout(this, 0) << "ERROR: bucket " << s->bucket_name << " not found" << dendl; + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + RGWObjVersionTracker ot; + ot.read_version = s->bucket_info.ep_objv; + + if (s->system_request) { + string tag = s->info.args.get(RGW_SYS_PARAM_PREFIX "tag"); + string ver_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "ver"); + if (!tag.empty()) { + ot.read_version.tag = tag; + uint64_t ver; + string err; + ver = strict_strtol(ver_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 0) << "failed to parse ver param" << dendl; + op_ret = -EINVAL; + return; + } + ot.read_version.ver = ver; + } + } + + op_ret = rgw_bucket_sync_user_stats(store, s->user->user_id, s->bucket_info); + if ( op_ret < 0) { + ldpp_dout(this, 1) << "WARNING: failed to sync user stats before bucket delete: op_ret= " << op_ret << dendl; + } + + op_ret = store->check_bucket_empty(s->bucket_info); + if (op_ret < 0) { + return; + } + + if (!store->svc.zone->is_meta_master()) { + bufferlist in_data; + op_ret = forward_request_to_master(s, &ot.read_version, store, in_data, + NULL); + if (op_ret < 0) { + if (op_ret == -ENOENT) { + /* adjust error, we want to return with NoSuchBucket and not + * NoSuchKey */ + op_ret = -ERR_NO_SUCH_BUCKET; + } + return; + } + } + + string prefix, delimiter; + + if (s->prot_flags & RGW_REST_SWIFT) { + string path_args; + path_args = s->info.args.get("path"); + if (!path_args.empty()) { + if (!delimiter.empty() || !prefix.empty()) { + op_ret = -EINVAL; + return; + } + prefix = path_args; + delimiter="/"; + } + 
} + + op_ret = abort_bucket_multiparts(store, s->cct, s->bucket_info, prefix, delimiter); + + if (op_ret < 0) { + return; + } + + op_ret = store->delete_bucket(s->bucket_info, ot, false); + + if (op_ret == -ECANCELED) { + // lost a race, either with mdlog sync or another delete bucket operation. + // in either case, we've already called rgw_unlink_bucket() + op_ret = 0; + return; + } + + if (op_ret == 0) { + op_ret = rgw_unlink_bucket(store, s->bucket_info.owner, s->bucket.tenant, + s->bucket.name, false); + if (op_ret < 0) { + ldpp_dout(this, 0) << "WARNING: failed to unlink bucket: ret=" << op_ret + << dendl; + } + } +} + +int RGWPutObj::verify_permission() +{ + if (! copy_source.empty()) { + + RGWAccessControlPolicy cs_acl(s->cct); + boost::optional policy; + map cs_attrs; + rgw_bucket cs_bucket(copy_source_bucket_info.bucket); + rgw_obj_key cs_object(copy_source_object_name, copy_source_version_id); + + rgw_obj obj(cs_bucket, cs_object); + store->set_atomic(s->obj_ctx, obj); + store->set_prefetch_data(s->obj_ctx, obj); + + /* check source object permissions */ + if (read_obj_policy(store, s, copy_source_bucket_info, cs_attrs, &cs_acl, nullptr, + policy, cs_bucket, cs_object) < 0) { + return -EACCES; + } + + /* admin request overrides permission checks */ + if (! s->auth.identity->is_admin_of(cs_acl.get_owner().get_id())) { + if (policy || ! s->iam_user_policies.empty()) { + auto usr_policy_res = Effect::Pass; + for (auto& user_policy : s->iam_user_policies) { + if (usr_policy_res = user_policy.eval(s->env, *s->auth.identity, + cs_object.instance.empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + rgw::ARN(obj)); usr_policy_res == Effect::Deny) + return -EACCES; + else if (usr_policy_res == Effect::Allow) + break; + } + rgw::IAM::Effect e = Effect::Pass; + if (policy) { + e = policy->eval(s->env, *s->auth.identity, + cs_object.instance.empty() ? 
+ rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + rgw::ARN(obj)); + } + if (e == Effect::Deny) { + return -EACCES; + } else if (usr_policy_res == Effect::Pass && e == Effect::Pass && + !cs_acl.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + } else if (!cs_acl.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + } + } + + auto op_ret = get_params(); + if (op_ret < 0) { + ldpp_dout(this, 20) << "get_params() returned ret=" << op_ret << dendl; + return op_ret; + } + + if (s->iam_policy || ! s->iam_user_policies.empty()) { + rgw_add_grant_to_iam_environment(s->env, s); + + rgw_add_to_iam_environment(s->env, "s3:x-amz-acl", s->canned_acl); + + if (obj_tags != nullptr && obj_tags->count() > 0){ + auto tags = obj_tags->get_tags(); + for (const auto& kv: tags){ + rgw_add_to_iam_environment(s->env, "s3:RequestObjectTag/"+kv.first, kv.second); + } + } + + constexpr auto encrypt_attr = "x-amz-server-side-encryption"; + constexpr auto s3_encrypt_attr = "s3:x-amz-server-side-encryption"; + auto enc_header = s->info.x_meta_map.find(encrypt_attr); + if (enc_header != s->info.x_meta_map.end()){ + rgw_add_to_iam_environment(s->env, s3_encrypt_attr, enc_header->second); + } + + constexpr auto kms_attr = "x-amz-server-side-encryption-aws-kms-key-id"; + constexpr auto s3_kms_attr = "s3:x-amz-server-side-encryption-aws-kms-key-id"; + auto kms_header = s->info.x_meta_map.find(kms_attr); + if (kms_header != s->info.x_meta_map.end()){ + rgw_add_to_iam_environment(s->env, s3_kms_attr, kms_header->second); + } + + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + if (usr_policy_res == Effect::Deny) + return -EACCES; + + rgw::IAM::Effect e = Effect::Pass; + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); 
+ } + if (e == Effect::Allow) { + return 0; + } else if (e == Effect::Deny) { + return -EACCES; + } else if (usr_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + + +void RGWPutObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +class RGWPutObj_CB : public RGWGetObj_Filter +{ + RGWPutObj *op; +public: + explicit RGWPutObj_CB(RGWPutObj *_op) : op(_op) {} + ~RGWPutObj_CB() override {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + +int RGWPutObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + bufferlist bl_tmp; + bl.copy(bl_ofs, bl_len, bl_tmp); + + bl_aux.append(bl_tmp); + + return bl_len; +} + +int RGWPutObj::get_data(const off_t fst, const off_t lst, bufferlist& bl) +{ + RGWPutObj_CB cb(this); + RGWGetObj_Filter* filter = &cb; + boost::optional decompress; + std::unique_ptr decrypt; + RGWCompressionInfo cs_info; + map attrs; + map::iterator attr_iter; + int ret = 0; + + uint64_t obj_size; + int64_t new_ofs, new_end; + + new_ofs = fst; + new_end = lst; + + rgw_obj_key obj_key(copy_source_object_name, copy_source_version_id); + rgw_obj obj(copy_source_bucket_info.bucket, obj_key); + + RGWRados::Object op_target(store, copy_source_bucket_info, *static_cast(s->obj_ctx), obj); + RGWRados::Object::Read read_op(&op_target); + read_op.params.obj_size = &obj_size; + read_op.params.attrs = &attrs; + + ret = read_op.prepare(); + if (ret < 0) + return ret; + + bool need_decompress; + op_ret = rgw_compression_info_from_attrset(attrs, need_decompress, cs_info); + if (op_ret < 0) { + ldpp_dout(s, 0) << "ERROR: failed to decode compression info" << dendl; + return -EIO; + } + + bool partial_content = true; + if (need_decompress) + { + obj_size = cs_info.orig_size; + decompress.emplace(s->cct, &cs_info, partial_content, filter); + filter = &*decompress; + } + + 
attr_iter = attrs.find(RGW_ATTR_MANIFEST); + op_ret = this->get_decrypt_filter(&decrypt, + filter, + attrs, + attr_iter != attrs.end() ? &(attr_iter->second) : nullptr); + if (decrypt != nullptr) { + filter = decrypt.get(); + } + if (op_ret < 0) { + return ret; + } + + ret = read_op.range_to_ofs(obj_size, new_ofs, new_end); + if (ret < 0) + return ret; + + filter->fixup_range(new_ofs, new_end); + ret = read_op.iterate(new_ofs, new_end, filter); + + if (ret >= 0) + ret = filter->flush(); + + bl.claim_append(bl_aux); + + return ret; +} + +// special handling for compression type = "random" with multipart uploads +static CompressorRef get_compressor_plugin(const req_state *s, + const std::string& compression_type) +{ + if (compression_type != "random") { + return Compressor::create(s->cct, compression_type); + } + + bool is_multipart{false}; + const auto& upload_id = s->info.args.get("uploadId", &is_multipart); + + if (!is_multipart) { + return Compressor::create(s->cct, compression_type); + } + + // use a hash of the multipart upload id so all parts use the same plugin + const auto alg = std::hash{}(upload_id) % Compressor::COMP_ALG_LAST; + if (alg == Compressor::COMP_ALG_NONE) { + return nullptr; + } + return Compressor::create(s->cct, alg); +} + +void RGWPutObj::execute() +{ + char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1]; + char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + bufferlist bl, aclbl, bs; + int len; + + off_t fst; + off_t lst; + + bool need_calc_md5 = (dlo_manifest == NULL) && (slo_info == NULL); + perfcounter->inc(l_rgw_put); + // report latency on return + auto put_lat = make_scope_guard([&] { + perfcounter->tinc(l_rgw_put_lat, s->time_elapsed()); + }); + + op_ret = -EINVAL; + if (s->object.empty()) { + return; + } + + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + + op_ret = 
get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + ldpp_dout(this, 20) << "get_system_versioning_params() returned ret=" + << op_ret << dendl; + return; + } + + if (supplied_md5_b64) { + need_calc_md5 = true; + + ldpp_dout(this, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl; + op_ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1], + supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64)); + ldpp_dout(this, 15) << "ceph_armor ret=" << op_ret << dendl; + if (op_ret != CEPH_CRYPTO_MD5_DIGESTSIZE) { + op_ret = -ERR_INVALID_DIGEST; + return; + } + + buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5); + ldpp_dout(this, 15) << "supplied_md5=" << supplied_md5 << dendl; + } + + if (!chunked_upload) { /* with chunked upload we don't know how big is the upload. + we also check sizes at the end anyway */ + op_ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, s->content_length); + if (op_ret < 0) { + ldpp_dout(this, 20) << "check_quota() returned ret=" << op_ret << dendl; + return; + } + op_ret = store->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota); + if (op_ret < 0) { + ldpp_dout(this, 20) << "check_bucket_shards() returned ret=" << op_ret << dendl; + return; + } + } + + if (supplied_etag) { + strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1); + supplied_md5[sizeof(supplied_md5) - 1] = '\0'; + } + + const bool multipart = !multipart_upload_id.empty(); + auto& obj_ctx = *static_cast(s->obj_ctx); + rgw_obj obj{s->bucket, s->object}; + + /* Handle object versioning of Swift API. */ + if (! 
multipart) { + op_ret = store->swift_versioning_copy(obj_ctx, + s->bucket_owner.get_id(), + s->bucket_info, + obj); + if (op_ret < 0) { + return; + } + } + + // create the object processor + rgw::AioThrottle aio(store->ctx()->_conf->rgw_put_obj_min_window_size); + using namespace rgw::putobj; + constexpr auto max_processor_size = std::max({sizeof(MultipartObjectProcessor), + sizeof(AtomicObjectProcessor), + sizeof(AppendObjectProcessor)}); + ceph::static_ptr processor; + + rgw_placement_rule *pdest_placement; + + multipart_upload_info upload_info; + if (multipart) { + RGWMPObj mp(s->object.name, multipart_upload_id); + + op_ret = get_multipart_info(store, s, mp.get_meta(), nullptr, nullptr, &upload_info); + if (op_ret < 0) { + if (op_ret != -ENOENT) { + ldpp_dout(this, 0) << "ERROR: get_multipart_info returned " << op_ret << ": " << cpp_strerror(-op_ret) << dendl; + } else {// -ENOENT: raced with upload complete/cancel, no need to spam log + ldpp_dout(this, 20) << "failed to get multipart info (returned " << op_ret << ": " << cpp_strerror(-op_ret) << "): probably raced with upload complete / cancel" << dendl; + } + return; + } + pdest_placement = &upload_info.dest_placement; + ldpp_dout(this, 20) << "dest_placement for part=" << upload_info.dest_placement << dendl; + processor.emplace( + &aio, store, s->bucket_info, pdest_placement, + s->owner.get_id(), obj_ctx, obj, + multipart_upload_id, multipart_part_num, multipart_part_str); + } else if(append) { + if (s->bucket_info.versioned()) { + op_ret = -ERR_INVALID_BUCKET_STATE; + return; + } + pdest_placement = &s->dest_placement; + processor.emplace( + &aio, store, s->bucket_info, pdest_placement, s->bucket_owner.get_id(),obj_ctx, obj, + s->req_id, position, &cur_accounted_size); + } else { + if (s->bucket_info.versioning_enabled()) { + if (!version_id.empty()) { + obj.key.set_instance(version_id); + } else { + store->gen_rand_obj_instance_name(&obj); + version_id = obj.key.instance; + } + } + pdest_placement = 
&s->dest_placement; + processor.emplace( + &aio, store, s->bucket_info, pdest_placement, + s->bucket_owner.get_id(), obj_ctx, obj, olh_epoch, s->req_id); + } + + op_ret = processor->prepare(); + if (op_ret < 0) { + ldpp_dout(this, 20) << "processor->prepare() returned ret=" << op_ret + << dendl; + return; + } + + if ((! copy_source.empty()) && !copy_source_range) { + rgw_obj_key obj_key(copy_source_object_name, copy_source_version_id); + rgw_obj obj(copy_source_bucket_info.bucket, obj_key.name); + + RGWObjState *astate; + op_ret = store->get_obj_state(&obj_ctx, copy_source_bucket_info, obj, + &astate, true, false); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: get copy source obj state returned with error" << op_ret << dendl; + return; + } + if (!astate->exists){ + op_ret = -ENOENT; + return; + } + lst = astate->accounted_size - 1; + } else { + lst = copy_source_range_lst; + } + + fst = copy_source_range_fst; + + // no filters by default + DataProcessor *filter = processor.get(); + + const auto& compression_type = store->svc.zone->get_zone_params().get_compression_type(*pdest_placement); + CompressorRef plugin; + boost::optional compressor; + + std::unique_ptr encrypt; + + if (!append) { // compression and encryption only apply to full object uploads + op_ret = get_encrypt_filter(&encrypt, filter); + if (op_ret < 0) { + return; + } + if (encrypt != nullptr) { + filter = &*encrypt; + } else if (compression_type != "none") { + plugin = get_compressor_plugin(s, compression_type); + if (!plugin) { + ldpp_dout(this, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + } + } + } + tracepoint(rgw_op, before_data_transfer, s->req_id.c_str()); + do { + bufferlist data; + if (fst > lst) + break; + if (copy_source.empty()) { + len = get_data(data); + } else { + uint64_t cur_lst = min(fst + s->cct->_conf->rgw_max_chunk_size - 1, lst); + op_ret = get_data(fst, 
cur_lst, data); + if (op_ret < 0) + return; + len = data.length(); + s->content_length += len; + fst += len; + } + if (len < 0) { + op_ret = len; + ldpp_dout(this, 20) << "get_data() returned ret=" << op_ret << dendl; + return; + } else if (len == 0) { + break; + } + + if (need_calc_md5) { + hash.Update((const unsigned char *)data.c_str(), data.length()); + } + + /* update torrrent */ + torrent.update(data); + + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + ldpp_dout(this, 20) << "processor->process() returned ret=" + << op_ret << dendl; + return; + } + + ofs += len; + } while (len > 0); + tracepoint(rgw_op, after_data_transfer, s->req_id.c_str(), ofs); + + // flush any data in filters + op_ret = filter->process({}, ofs); + if (op_ret < 0) { + return; + } + + if (!chunked_upload && ofs != s->content_length) { + op_ret = -ERR_REQUEST_TIMEOUT; + return; + } + s->obj_size = ofs; + + perfcounter->inc(l_rgw_put_b, s->obj_size); + + op_ret = do_aws4_auth_completion(); + if (op_ret < 0) { + return; + } + + op_ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, s->obj_size); + if (op_ret < 0) { + ldpp_dout(this, 20) << "second check_quota() returned op_ret=" << op_ret << dendl; + return; + } + + op_ret = store->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota); + if (op_ret < 0) { + ldpp_dout(this, 20) << "check_bucket_shards() returned ret=" << op_ret << dendl; + return; + } + + hash.Final(m); + + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = s->obj_size; + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + ldpp_dout(this, 20) << "storing " << RGW_ATTR_COMPRESSION + << " with type=" << cs_info.compression_type + << ", orig_size=" << cs_info.orig_size + << ", blocks=" << cs_info.blocks.size() << dendl; + 
} + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + etag = calc_md5; + + if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) { + op_ret = -ERR_BAD_DIGEST; + return; + } + + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + if (dlo_manifest) { + op_ret = encode_dlo_manifest_attr(dlo_manifest, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "bad user manifest: " << dlo_manifest << dendl; + return; + } + } + + if (slo_info) { + bufferlist manifest_bl; + encode(*slo_info, manifest_bl); + emplace_attr(RGW_ATTR_SLO_MANIFEST, std::move(manifest_bl)); + } + + if (supplied_etag && etag.compare(supplied_etag) != 0) { + op_ret = -ERR_UNPROCESSABLE_ENTITY; + return; + } + bl.append(etag.c_str(), etag.size()); + emplace_attr(RGW_ATTR_ETAG, std::move(bl)); + + populate_with_generic_attrs(s, attrs); + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs); + if (op_ret < 0) { + return; + } + encode_delete_at_attr(delete_at, attrs); + encode_obj_tags_attr(obj_tags.get(), attrs); + rgw_cond_decode_objtags(s, attrs); + + /* Add a custom metadata to expose the information whether an object + * is an SLO or not. Appending the attribute must be performed AFTER + * processing any input from user in order to prohibit overwriting. */ + if (slo_info) { + bufferlist slo_userindicator_bl; + slo_userindicator_bl.append("True", 4); + emplace_attr(RGW_ATTR_SLO_UINDICATOR, std::move(slo_userindicator_bl)); + } + if (obj_legal_hold) { + bufferlist obj_legal_hold_bl; + obj_legal_hold->encode(obj_legal_hold_bl); + emplace_attr(RGW_ATTR_OBJECT_LEGAL_HOLD, std::move(obj_legal_hold_bl)); + } + if (obj_retention) { + bufferlist obj_retention_bl; + obj_retention->encode(obj_retention_bl); + emplace_attr(RGW_ATTR_OBJECT_RETENTION, std::move(obj_retention_bl)); + } + + tracepoint(rgw_op, processor_complete_enter, s->req_id.c_str()); + op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs, + (delete_at ? 
*delete_at : real_time()), if_match, if_nomatch, + (user_data.empty() ? nullptr : &user_data), nullptr, nullptr); + tracepoint(rgw_op, processor_complete_exit, s->req_id.c_str()); + + /* produce torrent */ + if (s->cct->_conf->rgw_torrent_flag && (ofs == torrent.get_data_len())) + { + torrent.init(s, store); + torrent.set_create_date(mtime); + op_ret = torrent.complete(); + if (0 != op_ret) + { + ldpp_dout(this, 0) << "ERROR: torrent.handle_data() returned " << op_ret << dendl; + return; + } + } + + // send request to notification manager + const auto ret = rgw::notify::publish(s, obj.key, s->obj_size, mtime, etag, rgw::notify::ObjectCreatedPut, store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } +} + +int RGWPostObj::verify_permission() +{ + return 0; +} + +void RGWPostObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPostObj::execute() +{ + boost::optional compressor; + CompressorRef plugin; + char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + + /* Read in the data from the POST form. */ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + op_ret = verify_params(); + if (op_ret < 0) { + return; + } + + if (s->iam_policy || ! 
s->iam_user_policies.empty()) { + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + if (usr_policy_res == Effect::Deny) { + op_ret = -EACCES; + return; + } + + rgw::IAM::Effect e = Effect::Pass; + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + } + if (e == Effect::Deny) { + op_ret = -EACCES; + return; + } else if (usr_policy_res == Effect::Pass && e == Effect::Pass && !verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + op_ret = -EACCES; + return; + } + } else if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + op_ret = -EACCES; + return; + } + + /* Start iteration over data fields. It's necessary as Swift's FormPost + * is capable to handle multiple files in single form. */ + do { + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + ceph::buffer::list bl, aclbl; + int len = 0; + + op_ret = store->check_quota(s->bucket_owner.get_id(), + s->bucket, + user_quota, + bucket_quota, + s->content_length); + if (op_ret < 0) { + return; + } + + op_ret = store->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota); + if (op_ret < 0) { + return; + } + + if (supplied_md5_b64) { + char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1]; + ldpp_dout(this, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl; + op_ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1], + supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64)); + ldpp_dout(this, 15) << "ceph_armor ret=" << op_ret << dendl; + if (op_ret != CEPH_CRYPTO_MD5_DIGESTSIZE) { + op_ret = -ERR_INVALID_DIGEST; + return; + } + + buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5); + ldpp_dout(this, 15) << "supplied_md5=" << supplied_md5 << dendl; + } + + rgw_obj 
obj(s->bucket, get_current_filename()); + if (s->bucket_info.versioning_enabled()) { + store->gen_rand_obj_instance_name(&obj); + } + + rgw::AioThrottle aio(s->cct->_conf->rgw_put_obj_min_window_size); + + using namespace rgw::putobj; + AtomicObjectProcessor processor(&aio, store, s->bucket_info, + &s->dest_placement, + s->bucket_owner.get_id(), + *static_cast(s->obj_ctx), + obj, 0, s->req_id); + op_ret = processor.prepare(); + if (op_ret < 0) { + return; + } + + /* No filters by default. */ + DataProcessor *filter = &processor; + + std::unique_ptr encrypt; + op_ret = get_encrypt_filter(&encrypt, filter); + if (op_ret < 0) { + return; + } + if (encrypt != nullptr) { + filter = encrypt.get(); + } else { + const auto& compression_type = store->svc.zone->get_zone_params().get_compression_type( + s->dest_placement); + if (compression_type != "none") { + plugin = Compressor::create(s->cct, compression_type); + if (!plugin) { + ldpp_dout(this, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + } + } + } + + bool again; + do { + ceph::bufferlist data; + len = get_data(data, again); + + if (len < 0) { + op_ret = len; + return; + } + + if (!len) { + break; + } + + hash.Update((const unsigned char *)data.c_str(), data.length()); + op_ret = filter->process(std::move(data), ofs); + + ofs += len; + + if (ofs > max_len) { + op_ret = -ERR_TOO_LARGE; + return; + } + } while (again); + + // flush + op_ret = filter->process({}, ofs); + if (op_ret < 0) { + return; + } + + if (len < min_len) { + op_ret = -ERR_TOO_SMALL; + return; + } + + s->obj_size = ofs; + + + op_ret = store->check_quota(s->bucket_owner.get_id(), s->bucket, + user_quota, bucket_quota, s->obj_size); + if (op_ret < 0) { + return; + } + + op_ret = store->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota); + if (op_ret < 0) { + return; + } + + hash.Final(m); + buf_to_hex(m, 
CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + etag = calc_md5; + + if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) { + op_ret = -ERR_BAD_DIGEST; + return; + } + + bl.append(etag.c_str(), etag.size()); + emplace_attr(RGW_ATTR_ETAG, std::move(bl)); + + policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + const std::string content_type = get_current_content_type(); + if (! content_type.empty()) { + ceph::bufferlist ct_bl; + ct_bl.append(content_type.c_str(), content_type.size() + 1); + emplace_attr(RGW_ATTR_CONTENT_TYPE, std::move(ct_bl)); + } + + if (compressor && compressor->is_compressed()) { + ceph::bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = s->obj_size; + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + emplace_attr(RGW_ATTR_COMPRESSION, std::move(tmp)); + } + + op_ret = processor.complete(s->obj_size, etag, nullptr, real_time(), attrs, + (delete_at ? 
*delete_at : real_time()), + nullptr, nullptr, nullptr, nullptr, nullptr); + if (op_ret < 0) { + return; + } + } while (is_next_file_to_upload()); + + const auto ret = rgw::notify::publish(s, s->object, s->obj_size, ceph::real_clock::now(), etag, rgw::notify::ObjectCreatedPost, store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } +} + + +void RGWPutMetadataAccount::filter_out_temp_url(map& add_attrs, + const set& rmattr_names, + map& temp_url_keys) +{ + map::iterator iter; + + iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY1); + if (iter != add_attrs.end()) { + temp_url_keys[0] = iter->second.c_str(); + add_attrs.erase(iter); + } + + iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY2); + if (iter != add_attrs.end()) { + temp_url_keys[1] = iter->second.c_str(); + add_attrs.erase(iter); + } + + for (const string& name : rmattr_names) { + if (name.compare(RGW_ATTR_TEMPURL_KEY1) == 0) { + temp_url_keys[0] = string(); + } + if (name.compare(RGW_ATTR_TEMPURL_KEY2) == 0) { + temp_url_keys[1] = string(); + } + } +} + +int RGWPutMetadataAccount::init_processing() +{ + /* First, go to the base class. At the time of writing the method was + * responsible only for initializing the quota. This isn't necessary + * here as we are touching metadata only. I'm putting this call only + * for the future. 
*/ + op_ret = RGWOp::init_processing(); + if (op_ret < 0) { + return op_ret; + } + + op_ret = get_params(); + if (op_ret < 0) { + return op_ret; + } + + op_ret = rgw_get_user_attrs_by_uid(store, s->user->user_id, orig_attrs, + &acct_op_tracker); + if (op_ret < 0) { + return op_ret; + } + + if (has_policy) { + bufferlist acl_bl; + policy.encode(acl_bl); + attrs.emplace(RGW_ATTR_ACL, std::move(acl_bl)); + } + + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false); + if (op_ret < 0) { + return op_ret; + } + prepare_add_del_attrs(orig_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); + + /* Try extract the TempURL-related stuff now to allow verify_permission + * evaluate whether we need FULL_CONTROL or not. */ + filter_out_temp_url(attrs, rmattr_names, temp_url_keys); + + /* The same with quota except a client needs to be reseller admin. */ + op_ret = filter_out_quota_info(attrs, rmattr_names, new_quota, + &new_quota_extracted); + if (op_ret < 0) { + return op_ret; + } + + return 0; +} +
/* (The line above is the unmodified tail of RGWPutMetadataAccount::init_processing().) */

/*
 * Permission check for account metadata updates.
 *
 * Returns -EACCES for anonymous callers and for callers without write
 * permission on the user, -EPERM when TempURL keys are being changed
 * without FULL_CONTROL, and -EACCES when a new quota was extracted from
 * the request. Returns 0 otherwise.
 */
int RGWPutMetadataAccount::verify_permission()
{
  if (s->auth.identity->is_anonymous()) {
    return -EACCES;
  }

  if (!verify_user_permission_no_policy(this, s, RGW_PERM_WRITE)) {
    return -EACCES;
  }

  /* Altering TempURL keys requires FULL_CONTROL. */
  const bool changes_temp_url = !temp_url_keys.empty();
  if (changes_temp_url && s->perm_mask != RGW_PERM_FULL_CONTROL) {
    return -EPERM;
  }

  /* We are failing this intentionally to allow system user/reseller admin
   * override in rgw_process.cc. This is the way to specify a given RGWOp
   * expect extra privileges. */
  if (new_quota_extracted) {
    return -EACCES;
  }

  return 0;
}

/*
 * Store the account metadata prepared by init_processing(): TempURL keys,
 * the user quota when one was extracted, and the attribute set. The result
 * is left in op_ret.
 */
void RGWPutMetadataAccount::execute()
{
  /* Params have been extracted earlier. See init_processing(). */
  RGWUserInfo new_uinfo;
  op_ret = rgw_get_user_info_by_uid(store, s->user->user_id, new_uinfo,
                                    &acct_op_tracker);
  if (op_ret < 0) {
    return;
  }

  /* Handle the TempURL-related stuff. An empty map makes this a no-op. */
  for (auto& slot : temp_url_keys) {
    new_uinfo.temp_url_keys[slot.first] = std::move(slot.second);
  }

  /* Handle the quota extracted in init_processing(). */
  if (new_quota_extracted) {
    new_uinfo.user_quota = std::move(new_quota);
  }

  /* We are passing here the current (old) user info to allow the function
   * optimize-out some operations. */
  op_ret = rgw_store_user_info(store, new_uinfo, s->user,
                               &acct_op_tracker, real_time(), false, &attrs);
}

/* Bucket metadata updates need write permission on the bucket (ACL only). */
int RGWPutMetadataBucket::verify_permission()
{
  const bool allowed = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE);
  return allowed ? 0 : -EACCES;
}

/* Runs the common bucket/object pre-execution hook. */
void RGWPutMetadataBucket::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}

/*
 * Apply the metadata carried by the request to the bucket. The result is
 * left in op_ret.
 *
 * A request that names a placement rule different from the bucket's
 * current one is rejected with -EEXIST. The update itself runs inside
 * retry_raced_bucket_write().
 */
void RGWPutMetadataBucket::execute()
{
  op_ret = get_params();
  if (op_ret < 0) {
    return;
  }

  op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
  if (op_ret < 0) {
    return;
  }

  const bool placement_conflict =
    !placement_rule.empty() && placement_rule != s->bucket_info.placement_rule;
  if (placement_conflict) {
    op_ret = -EEXIST;
    return;
  }

  op_ret = retry_raced_bucket_write(store, s, [this] {
      /* Encode special metadata first as we're using std::map::emplace under
       * the hood. This method will add the new items only if the map doesn't
       * contain such keys yet. */
      if (has_policy) {
        if (s->dialect.compare("swift") == 0) {
          /* NOTE(review): the target type of both casts was lost in
           * extraction; RGWAccessControlPolicy_SWIFT* is assumed because
           * filter_merge() is invoked on the result - confirm. */
          auto old_policy =
            static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl.get());
          auto new_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(&policy);
          new_policy->filter_merge(policy_rw_mask, old_policy);
          policy = *new_policy;
        }
        buffer::list bl;
        policy.encode(bl);
        emplace_attr(RGW_ATTR_ACL, std::move(bl));
      }

      if (has_cors) {
        buffer::list bl;
        cors_config.encode(bl);
        emplace_attr(RGW_ATTR_CORS, std::move(bl));
      }

      /* It's supposed that following functions WILL NOT change any
       * special attributes (like RGW_ATTR_ACL) if they are already
       * present in attrs. */
      prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
      populate_with_generic_attrs(s, attrs);

      /* According to the Swift's behaviour and its container_quota
       * WSGI middleware implementation: anyone with write permissions
       * is able to set the bucket quota. This stays in contrast to
       * account quotas that can be set only by clients holding
       * reseller admin privileges. */
      op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket_info.quota);
      if (op_ret < 0) {
        return op_ret;
      }

      if (swift_ver_location) {
        s->bucket_info.swift_ver_location = *swift_ver_location;
        s->bucket_info.swift_versioning = (!swift_ver_location->empty());
      }

      /* Web site of Swift API. */
      filter_out_website(attrs, rmattr_names, s->bucket_info.website_conf);
      s->bucket_info.has_website = !s->bucket_info.website_conf.is_empty();

      /* Setting attributes also stores the provided bucket info. Due
       * to this fact, the new quota settings can be serialized with
       * the same call. */
      op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs,
                                    &s->bucket_info.objv_tracker);
      return op_ret;
    });
}

/* (The line below is the unmodified head of RGWPutMetadataObject::verify_permission().) */
+int RGWPutMetadataObject::verify_permission() +{ + // This looks to be something specific to Swift. We could add + // operations like swift:PutMetadataObject to the Policy Engine.
+ if (!verify_object_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWPutMetadataObject::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutMetadataObject::execute() +{ + rgw_obj obj(s->bucket, s->object); + rgw_obj target_obj; + map attrs, orig_attrs, rmattrs; + + store->set_atomic(s->obj_ctx, obj); + + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs); + if (op_ret < 0) { + return; + } + + /* check if obj exists, read orig attrs */ + op_ret = get_obj_attrs(store, s, obj, orig_attrs, &target_obj); + if (op_ret < 0) { + return; + } + + /* Check whether the object has expired. Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && object_is_expired(orig_attrs)) { + op_ret = -ENOENT; + return; + } + + /* Filter currently existing attributes. */ + prepare_add_del_attrs(orig_attrs, attrs, rmattrs); + populate_with_generic_attrs(s, attrs); + encode_delete_at_attr(delete_at, attrs); + + if (dlo_manifest) { + op_ret = encode_dlo_manifest_attr(dlo_manifest, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "bad user manifest: " << dlo_manifest << dendl; + return; + } + } + + op_ret = store->set_attrs(s->obj_ctx, s->bucket_info, target_obj, attrs, &rmattrs); +} + +int RGWDeleteObj::handle_slo_manifest(bufferlist& bl) +{ + RGWSLOInfo slo_info; + auto bliter = bl.cbegin(); + try { + decode(slo_info, bliter); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode slo manifest" << dendl; + return -EIO; + } + + try { + deleter = std::unique_ptr(\ + new RGWBulkDelete::Deleter(this, store, s)); + } catch (const std::bad_alloc&) { + return -ENOMEM; + } + + list items; + for (const auto& iter : slo_info.entries) { + const string& path_str = iter.path; + + const size_t sep_pos = path_str.find('/', 1 /* skip first slash */); + if (boost::string_view::npos == 
sep_pos) { + return -EINVAL; + } + + RGWBulkDelete::acct_path_t path; + + path.bucket_name = url_decode(path_str.substr(1, sep_pos - 1)); + path.obj_key = url_decode(path_str.substr(sep_pos + 1)); + + items.push_back(path); + } + + /* Request removal of the manifest object itself. */ + RGWBulkDelete::acct_path_t path; + path.bucket_name = s->bucket_name; + path.obj_key = s->object; + items.push_back(path); + + int ret = deleter->delete_chunk(items); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWDeleteObj::verify_permission() +{ + int op_ret = get_params(); + if (op_ret) { + return op_ret; + } + if (s->iam_policy || ! s->iam_user_policies.empty()) { + if (s->bucket_info.obj_lock_enabled() && bypass_governance_mode) { + auto r = eval_user_policies(s->iam_user_policies, s->env, boost::none, + rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket, s->object.name)); + if (r == Effect::Deny) { + bypass_perm = false; + } else if (r == Effect::Pass && s->iam_policy) { + r = s->iam_policy->eval(s->env, *s->auth.identity, rgw::IAM::s3BypassGovernanceRetention, + ARN(s->bucket, s->object.name)); + if (r == Effect::Deny) { + bypass_perm = false; + } + } + } + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + s->object.instance.empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket, s->object.name)); + if (usr_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect r = Effect::Pass; + if (s->iam_policy) { + r = s->iam_policy->eval(s->env, *s->auth.identity, + s->object.instance.empty() ? 
+ rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket, s->object.name)); + } + if (r == Effect::Allow) + return 0; + else if (r == Effect::Deny) + return -EACCES; + else if (usr_policy_res == Effect::Allow) + return 0; + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + if (s->bucket_info.mfa_enabled() && + !s->object.instance.empty() && + !s->mfa_verified) { + ldpp_dout(this, 5) << "NOTICE: object delete request with a versioned object, mfa auth not provided" << dendl; + return -ERR_MFA_REQUIRED; + } + + return 0; +} + +void RGWDeleteObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteObj::execute() +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + rgw_obj obj(s->bucket, s->object); + map attrs; + + bool check_obj_lock = obj.key.have_instance() && s->bucket_info.obj_lock_enabled(); + + if (!s->object.empty()) { + /* check if obj exists, read orig attrs */ + op_ret = get_obj_attrs(store, s, obj, attrs); + + if (need_object_expiration() || multipart_delete) { + if (op_ret < 0) { + // failed to get attributes + return; + } + } + + if (check_obj_lock) { + if (op_ret < 0) { + if (op_ret == -ENOENT) { + /* object maybe delete_marker, skip check_obj_lock*/ + check_obj_lock = false; + } else { + // failed to get attributes and check_obj_lock is needed + return; + } + } + } + + if (check_obj_lock) { + auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (aiter != attrs.end()) { + RGWObjectRetention obj_retention; + try { + decode(obj_retention, aiter->second); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + op_ret = -EIO; + return; + } + if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) > ceph_clock_now()) { + if (obj_retention.get_mode().compare("GOVERNANCE") != 0 || !bypass_perm || !bypass_governance_mode) { + op_ret = -EACCES; + return; + } + } + } + aiter 
= attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (aiter != attrs.end()) { + RGWObjectLegalHold obj_legal_hold; + try { + decode(obj_legal_hold, aiter->second); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl; + op_ret = -EIO; + return; + } + if (obj_legal_hold.is_enabled()) { + op_ret = -EACCES; + return; + } + } + } + + if (multipart_delete) { + const auto slo_attr = attrs.find(RGW_ATTR_SLO_MANIFEST); + + if (slo_attr != attrs.end()) { + op_ret = handle_slo_manifest(slo_attr->second); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to handle slo manifest ret=" << op_ret << dendl; + } + } else { + op_ret = -ERR_NOT_SLO_MANIFEST; + } + + return; + } + + RGWObjectCtx *obj_ctx = static_cast(s->obj_ctx); + obj_ctx->set_atomic(obj); + + bool ver_restored = false; + op_ret = store->swift_versioning_restore(*s->sysobj_ctx, *obj_ctx, s->bucket_owner.get_id(), + s->bucket_info, obj, ver_restored); + if (op_ret < 0) { + return; + } + + if (!ver_restored) { + /* Swift's versioning mechanism hasn't found any previous version of + * the object that could be restored. This means we should proceed + * with the regular delete path. */ + RGWRados::Object del_target(store, s->bucket_info, *obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + op_ret = get_system_versioning_params(s, &del_op.params.olh_epoch, + &del_op.params.marker_version_id); + if (op_ret < 0) { + return; + } + + del_op.params.bucket_owner = s->bucket_owner.get_id(); + del_op.params.versioning_status = s->bucket_info.versioning_status(); + del_op.params.obj_owner = s->owner; + del_op.params.unmod_since = unmod_since; + del_op.params.high_precision_time = s->system_request; /* system request uses high precision time */ + + op_ret = del_op.delete_obj(); + if (op_ret >= 0) { + delete_marker = del_op.result.delete_marker; + version_id = del_op.result.version_id; + } + + /* Check whether the object has expired. 
Swift API documentation + * stands that we should return 404 Not Found in such case. */ + if (need_object_expiration() && object_is_expired(attrs)) { + op_ret = -ENOENT; + return; + } + } + + if (op_ret == -ECANCELED) { + op_ret = 0; + } + if (op_ret == -ERR_PRECONDITION_FAILED && no_precondition_error) { + op_ret = 0; + } + + // cache the objects tags and metadata into the requests + // so it could be used in the notification mechanism + try { + populate_tags_in_request(s, attrs); + } catch (buffer::error& err) { + ldpp_dout(this, 5) << "WARNING: failed to populate delete request with object tags: " << err.what() << dendl; + } + populate_metadata_in_request(s, attrs); + } else { + op_ret = -EINVAL; + } + + const auto ret = rgw::notify::publish(s, s->object, s->obj_size, ceph::real_clock::now(), attrs[RGW_ATTR_ETAG].to_str(), + delete_marker && s->object.instance.empty() ? rgw::notify::ObjectRemovedDeleteMarkerCreated : rgw::notify::ObjectRemovedDelete, + store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } +} + +bool RGWCopyObj::parse_copy_location(const boost::string_view& url_src, + string& bucket_name, + rgw_obj_key& key) +{ + boost::string_view name_str; + boost::string_view params_str; + + // search for ? 
before url-decoding so we don't accidentally match %3F + size_t pos = url_src.find('?'); + if (pos == string::npos) { + name_str = url_src; + } else { + name_str = url_src.substr(0, pos); + params_str = url_src.substr(pos + 1); + } + + boost::string_view dec_src{name_str}; + if (dec_src[0] == '/') + dec_src.remove_prefix(1); + + pos = dec_src.find('/'); + if (pos == string::npos) + return false; + + bucket_name = url_decode(dec_src.substr(0, pos)); + key.name = url_decode(dec_src.substr(pos + 1)); + + if (key.name.empty()) { + return false; + } + + if (! params_str.empty()) { + RGWHTTPArgs args; + args.set(params_str.to_string()); + args.parse(); + + key.instance = args.get("versionId", NULL); + } + + return true; +} + +int RGWCopyObj::verify_permission() +{ + RGWAccessControlPolicy src_acl(s->cct); + boost::optional src_policy; + op_ret = get_params(); + if (op_ret < 0) + return op_ret; + + op_ret = get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + return op_ret; + } + map src_attrs; + + if (s->bucket_instance_id.empty()) { + op_ret = store->get_bucket_info(*s->sysobj_ctx, src_tenant_name, src_bucket_name, src_bucket_info, NULL, &src_attrs); + } else { + /* will only happen in intra region sync where the source and dest bucket is the same */ + op_ret = store->get_bucket_instance_info(*s->sysobj_ctx, s->bucket_instance_id, src_bucket_info, NULL, &src_attrs); + } + if (op_ret < 0) { + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_BUCKET; + } + return op_ret; + } + + src_bucket = src_bucket_info.bucket; + + /* get buckets info (source and dest) */ + if (s->local_source && source_zone.empty()) { + rgw_obj src_obj(src_bucket, src_object); + store->set_atomic(s->obj_ctx, src_obj); + store->set_prefetch_data(s->obj_ctx, src_obj); + + rgw_placement_rule src_placement; + + /* check source object permissions */ + op_ret = read_obj_policy(store, s, src_bucket_info, src_attrs, &src_acl, &src_placement.storage_class, + src_policy, src_bucket, 
src_object); + if (op_ret < 0) { + return op_ret; + } + + /* follow up on previous checks that required reading source object head */ + if (need_to_check_storage_class) { + src_placement.inherit_from(src_bucket_info.placement_rule); + + op_ret = check_storage_class(src_placement); + if (op_ret < 0) { + return op_ret; + } + } + + /* admin request overrides permission checks */ + if (!s->auth.identity->is_admin_of(src_acl.get_owner().get_id())) { + if (src_policy) { + auto e = src_policy->eval(s->env, *s->auth.identity, + src_object.instance.empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion, + ARN(src_obj)); + if (e == Effect::Deny) { + return -EACCES; + } else if (e == Effect::Pass && + !src_acl.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + } else if (!src_acl.verify_permission(this, *s->auth.identity, + s->perm_mask, + RGW_PERM_READ)) { + return -EACCES; + } + } + } + + RGWAccessControlPolicy dest_bucket_policy(s->cct); + map dest_attrs; + + if (src_bucket_name.compare(dest_bucket_name) == 0) { /* will only happen if s->local_source + or intra region sync */ + dest_bucket_info = src_bucket_info; + dest_attrs = src_attrs; + } else { + op_ret = store->get_bucket_info(*s->sysobj_ctx, dest_tenant_name, dest_bucket_name, + dest_bucket_info, nullptr, &dest_attrs); + if (op_ret < 0) { + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_BUCKET; + } + return op_ret; + } + } + + dest_bucket = dest_bucket_info.bucket; + + rgw_obj dest_obj(dest_bucket, dest_object); + store->set_atomic(s->obj_ctx, dest_obj); + + /* check dest bucket permissions */ + op_ret = read_bucket_policy(store, s, dest_bucket_info, dest_attrs, + &dest_bucket_policy, dest_bucket); + if (op_ret < 0) { + return op_ret; + } + auto dest_iam_policy = get_iam_policy_from_attr(s->cct, store, dest_attrs, dest_bucket.tenant); + /* admin request overrides permission checks */ + if (! 
s->auth.identity->is_admin_of(dest_policy.get_owner().get_id())){ + if (dest_iam_policy != boost::none) { + rgw_add_to_iam_environment(s->env, "s3:x-amz-copy-source", copy_source); + if (md_directive) + rgw_add_to_iam_environment(s->env, "s3:x-amz-metadata-directive", + *md_directive); + + auto e = dest_iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + ARN(dest_obj)); + if (e == Effect::Deny) { + return -EACCES; + } else if (e == Effect::Pass && + ! dest_bucket_policy.verify_permission(this, + *s->auth.identity, + s->perm_mask, + RGW_PERM_WRITE)){ + return -EACCES; + } + } else if (! dest_bucket_policy.verify_permission(this, *s->auth.identity, s->perm_mask, + RGW_PERM_WRITE)) { + return -EACCES; + } + + } + + op_ret = init_dest_policy(); + if (op_ret < 0) { + return op_ret; + } + + return 0; +} + + +int RGWCopyObj::init_common() +{ + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) { + op_ret = -EINVAL; + return op_ret; + } + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) { + op_ret = -EINVAL; + return op_ret; + } + unmod_ptr = &unmod_time; + } + + bufferlist aclbl; + dest_policy.encode(aclbl); + emplace_attr(RGW_ATTR_ACL, std::move(aclbl)); + + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs); + if (op_ret < 0) { + return op_ret; + } + populate_with_generic_attrs(s, attrs); + + return 0; +} + +static void copy_obj_progress_cb(off_t ofs, void *param) +{ + RGWCopyObj *op = static_cast(param); + op->progress_cb(ofs); +} + +void RGWCopyObj::progress_cb(off_t ofs) +{ + if (!s->cct->_conf->rgw_copy_obj_progress) + return; + + if (ofs - last_ofs < s->cct->_conf->rgw_copy_obj_progress_every_bytes) + return; + + send_partial_response(ofs); + + last_ofs = ofs; +} + +void RGWCopyObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWCopyObj::execute() +{ + if (init_common() < 0) + return; + + rgw_obj src_obj(src_bucket, src_object); + rgw_obj dst_obj(dest_bucket, dest_object); + + 
RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + if ( ! version_id.empty()) { + dst_obj.key.set_instance(version_id); + } else if (dest_bucket_info.versioning_enabled()) { + store->gen_rand_obj_instance_name(&dst_obj); + } + + obj_ctx.set_atomic(src_obj); + obj_ctx.set_atomic(dst_obj); + + encode_delete_at_attr(delete_at, attrs); + + if (!s->system_request) { // no quota enforcement for system requests + // get src object size (cached in obj_ctx from verify_permission()) + RGWObjState* astate = nullptr; + op_ret = store->get_obj_state(s->obj_ctx, src_bucket_info, src_obj, + &astate, true, false); + if (op_ret < 0) { + return; + } + // enforce quota against the destination bucket owner + op_ret = store->check_quota(dest_bucket_info.owner, + dest_bucket_info.bucket, + user_quota, bucket_quota, + astate->accounted_size); + if (op_ret < 0) { + return; + } + } + + bool high_precision_time = (s->system_request); + + /* Handle object versioning of Swift API. In case of copying to remote this + * should fail gently (op_ret == 0) as the dst_obj will not exist here. */ + op_ret = store->swift_versioning_copy(obj_ctx, + dest_bucket_info.owner, + dest_bucket_info, + dst_obj); + if (op_ret < 0) { + return; + } + + op_ret = store->copy_obj(obj_ctx, + s->user->user_id, + &s->info, + source_zone, + dst_obj, + src_obj, + dest_bucket_info, + src_bucket_info, + s->dest_placement, + &src_mtime, + &mtime, + mod_ptr, + unmod_ptr, + high_precision_time, + if_match, + if_nomatch, + attrs_mod, + copy_if_newer, + attrs, RGWObjCategory::Main, + olh_epoch, + (delete_at ? *delete_at : real_time()), + (version_id.empty() ? 
NULL : &version_id), + &s->req_id, /* use req_id as tag */ + &etag, + copy_obj_progress_cb, (void *)this + ); + + const auto ret = rgw::notify::publish(s, s->object, s->obj_size, mtime, etag, rgw::notify::ObjectCreatedCopy, store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } +} + +int RGWGetACLs::verify_permission() +{ + bool perm; + if (!s->object.empty()) { + auto iam_action = s->object.instance.empty() ? + rgw::IAM::s3GetObjectAcl : + rgw::IAM::s3GetObjectVersionAcl; + + if (s->iam_policy && s->iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG)){ + rgw_obj obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + if (! s->iam_user_policies.empty()) { + for (auto& user_policy : s->iam_user_policies) { + if (user_policy.has_partial_conditional(S3_EXISTING_OBJTAG)) { + rgw_obj obj = rgw_obj(s->bucket, s->object); + rgw_iam_add_existing_objtags(store, s, obj, iam_action); + } + } + } + perm = verify_object_permission(this, s, iam_action); + } else { + if (!s->bucket_exists) { + return -ERR_NO_SUCH_BUCKET; + } + perm = verify_bucket_permission(this, s, rgw::IAM::s3GetBucketAcl); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWGetACLs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetACLs::execute() +{ + stringstream ss; + RGWAccessControlPolicy* const acl = \ + (!s->object.empty() ? 
s->object_acl.get() : s->bucket_acl.get()); + RGWAccessControlPolicy_S3* const s3policy = \ + static_cast(acl); + s3policy->to_xml(ss); + acls = ss.str(); +} + + + +int RGWPutACLs::verify_permission() +{ + bool perm; + + rgw_add_to_iam_environment(s->env, "s3:x-amz-acl", s->canned_acl); + + rgw_add_grant_to_iam_environment(s->env, s); + if (!s->object.empty()) { + auto iam_action = s->object.instance.empty() ? rgw::IAM::s3PutObjectAcl : rgw::IAM::s3PutObjectVersionAcl; + auto obj = rgw_obj(s->bucket, s->object); + op_ret = rgw_iam_add_existing_objtags(store, s, obj, iam_action); + perm = verify_object_permission(this, s, iam_action); + } else { + perm = verify_bucket_permission(this, s, rgw::IAM::s3PutBucketAcl); + } + if (!perm) + return -EACCES; + + return 0; +} + +int RGWGetLC::verify_permission() +{ + bool perm; + perm = verify_bucket_permission(this, s, rgw::IAM::s3GetLifecycleConfiguration); + if (!perm) + return -EACCES; + + return 0; +} + +int RGWPutLC::verify_permission() +{ + bool perm; + perm = verify_bucket_permission(this, s, rgw::IAM::s3PutLifecycleConfiguration); + if (!perm) + return -EACCES; + + return 0; +} + +int RGWDeleteLC::verify_permission() +{ + bool perm; + perm = verify_bucket_permission(this, s, rgw::IAM::s3PutLifecycleConfiguration); + if (!perm) + return -EACCES; + + return 0; +} + +void RGWPutACLs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetLC::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutLC::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteLC::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutACLs::execute() +{ + bufferlist bl; + + RGWAccessControlPolicy_S3 *policy = NULL; + RGWACLXMLParser_S3 parser(s->cct); + RGWAccessControlPolicy_S3 new_policy(s->cct); + stringstream ss; + rgw_obj obj; + + op_ret = 0; /* XXX redundant? */ + + if (!parser.init()) { + op_ret = -EINVAL; + return; + } + + + RGWAccessControlPolicy* const existing_policy = \ + (s->object.empty() ? 
s->bucket_acl.get() : s->object_acl.get()); + + owner = existing_policy->get_owner(); + + op_ret = get_params(); + if (op_ret < 0) { + if (op_ret == -ERANGE) { + ldpp_dout(this, 4) << "The size of request xml data is larger than the max limitation, data size = " + << s->length << dendl; + op_ret = -ERR_MALFORMED_XML; + s->err.message = "The XML you provided was larger than the maximum " + + std::to_string(s->cct->_conf->rgw_max_put_param_size) + + " bytes allowed."; + } + return; + } + + char* buf = data.c_str(); + ldpp_dout(this, 15) << "read len=" << data.length() << " data=" << (buf ? buf : "") << dendl; + + if (!s->canned_acl.empty() && data.length() > 0) { + op_ret = -EINVAL; + return; + } + + if (!s->canned_acl.empty() || s->has_acl_header) { + op_ret = get_policy_from_state(store, s, ss); + if (op_ret < 0) + return; + + data.clear(); + data.append(ss.str()); + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -EINVAL; + return; + } + policy = static_cast(parser.find_first("AccessControlPolicy")); + if (!policy) { + op_ret = -EINVAL; + return; + } + + const RGWAccessControlList& req_acl = policy->get_acl(); + const multimap& req_grant_map = req_acl.get_grant_map(); +#define ACL_GRANTS_MAX_NUM 100 + int max_num = s->cct->_conf->rgw_acl_grants_max_num; + if (max_num < 0) { + max_num = ACL_GRANTS_MAX_NUM; + } + + int grants_num = req_grant_map.size(); + if (grants_num > max_num) { + ldpp_dout(this, 4) << "An acl can have up to " << max_num + << " grants, request acl grants num: " << grants_num << dendl; + op_ret = -ERR_MALFORMED_ACL_ERROR; + s->err.message = "The request is rejected, because the acl grants number you requested is larger than the maximum " + + std::to_string(max_num) + + " grants allowed in an acl."; + return; + } + + // forward bucket acl requests to meta master zone + if (s->object.empty() && !store->svc.zone->is_meta_master()) { + bufferlist in_data; + // include acl data unless it was generated from a canned_acl + if 
(s->canned_acl.empty()) { + in_data.append(data); + } + op_ret = forward_request_to_master(s, NULL, store, in_data, NULL); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 15) << "Old AccessControlPolicy"; + policy->to_xml(*_dout); + *_dout << dendl; + } + + op_ret = policy->rebuild(store, &owner, new_policy); + if (op_ret < 0) + return; + + if (s->cct->_conf->subsys.should_gather()) { + ldpp_dout(this, 15) << "New AccessControlPolicy:"; + new_policy.to_xml(*_dout); + *_dout << dendl; + } + + new_policy.encode(bl); + map attrs; + + if (!s->object.empty()) { + obj = rgw_obj(s->bucket, s->object); + store->set_atomic(s->obj_ctx, obj); + //if instance is empty, we should modify the latest object + op_ret = modify_obj_attr(store, s, obj, RGW_ATTR_ACL, bl); + } else { + attrs = s->bucket_attrs; + attrs[RGW_ATTR_ACL] = bl; + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &s->bucket_info.objv_tracker); + } + if (op_ret == -ECANCELED) { + op_ret = 0; /* lost a race, but it's ok because acls are immutable */ + } +} + +void RGWPutLC::execute() +{ + bufferlist bl; + + RGWLifecycleConfiguration_S3 config(s->cct); + RGWXMLParser parser; + RGWLifecycleConfiguration_S3 new_config(s->cct); + + content_md5 = s->info.env->get("HTTP_CONTENT_MD5"); + if (content_md5 == nullptr) { + op_ret = -ERR_INVALID_REQUEST; + s->err.message = "Missing required header for this request: Content-MD5"; + ldpp_dout(this, 5) << s->err.message << dendl; + return; + } + + std::string content_md5_bin; + try { + content_md5_bin = rgw::from_base64(boost::string_view(content_md5)); + } catch (...) 
{ + s->err.message = "Request header Content-MD5 contains character " + "that is not base64 encoded."; + ldpp_dout(this, 5) << s->err.message << dendl; + op_ret = -ERR_BAD_DIGEST; + return; + } + + if (!parser.init()) { + op_ret = -EINVAL; + return; + } + + op_ret = get_params(); + if (op_ret < 0) + return; + + char* buf = data.c_str(); + ldpp_dout(this, 15) << "read len=" << data.length() << " data=" << (buf ? buf : "") << dendl; + + MD5 data_hash; + unsigned char data_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE]; + data_hash.Update(reinterpret_cast(buf), data.length()); + data_hash.Final(data_hash_res); + + if (memcmp(data_hash_res, content_md5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) { + op_ret = -ERR_BAD_DIGEST; + s->err.message = "The Content-MD5 you specified did not match what we received."; + ldpp_dout(this, 5) << s->err.message + << " Specified content md5: " << content_md5 + << ", calculated content md5: " << data_hash_res + << dendl; + return; + } + + if (!parser.parse(buf, data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("LifecycleConfiguration", config, &parser); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "Bad lifecycle configuration: " << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + op_ret = config.rebuild(store, new_config); + if (op_ret < 0) + return; + + if (s->cct->_conf->subsys.should_gather()) { + XMLFormatter xf; + new_config.dump_xml(&xf); + stringstream ss; + xf.flush(ss); + ldpp_dout(this, 15) << "New LifecycleConfiguration:" << ss.str() << dendl; + } + + if (!store->svc.zone->is_meta_master()) { + op_ret = forward_request_to_master(s, nullptr, store, data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = store->get_lc()->set_bucket_config(s->bucket_info, s->bucket_attrs, &new_config); + if (op_ret < 0) { + return; + } + return; +} + +void 
RGWDeleteLC::execute() +{ + if (!store->svc.zone->is_meta_master()) { + bufferlist data; + op_ret = forward_request_to_master(s, nullptr, store, data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = store->get_lc()->remove_bucket_config(s->bucket_info, s->bucket_attrs); + if (op_ret < 0) { + return; + } + return; +} + +int RGWGetCORS::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketCORS); +} + +void RGWGetCORS::execute() +{ + op_ret = read_bucket_cors(); + if (op_ret < 0) + return ; + + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + op_ret = -ERR_NO_CORS_FOUND; + return; + } +} + +int RGWPutCORS::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketCORS); +} + +void RGWPutCORS::execute() +{ + rgw_raw_obj obj; + + op_ret = get_params(); + if (op_ret < 0) + return; + + if (!store->svc.zone->is_meta_master()) { + op_ret = forward_request_to_master(s, NULL, store, in_data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = retry_raced_bucket_write(store, s, [this] { + map attrs = s->bucket_attrs; + attrs[RGW_ATTR_CORS] = cors_bl; + return rgw_bucket_set_attrs(store, s->bucket_info, attrs, &s->bucket_info.objv_tracker); + }); +} + +int RGWDeleteCORS::verify_permission() +{ + // No separate delete permission + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketCORS); +} + +void RGWDeleteCORS::execute() +{ + if (!store->svc.zone->is_meta_master()) { + bufferlist data; + op_ret = forward_request_to_master(s, nullptr, store, data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = retry_raced_bucket_write(store, s, [this] { + op_ret = read_bucket_cors(); 
+ if (op_ret < 0) + return op_ret; + + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + op_ret = -ENOENT; + return op_ret; + } + + map attrs = s->bucket_attrs; + attrs.erase(RGW_ATTR_CORS); + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, + &s->bucket_info.objv_tracker); + if (op_ret < 0) { + ldpp_dout(this, 0) << "RGWLC::RGWDeleteCORS() failed to set attrs on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + } + return op_ret; + }); +} + +void RGWOptionsCORS::get_response_params(string& hdrs, string& exp_hdrs, unsigned *max_age) { + get_cors_response_headers(rule, req_hdrs, hdrs, exp_hdrs, max_age); +} + +int RGWOptionsCORS::validate_cors_request(RGWCORSConfiguration *cc) { + rule = cc->host_name_rule(origin); + if (!rule) { + ldpp_dout(this, 10) << "There is no cors rule present for " << origin << dendl; + return -ENOENT; + } + + if (!validate_cors_rule_method(rule, req_meth)) { + return -ENOENT; + } + + if (!validate_cors_rule_header(rule, req_hdrs)) { + return -ENOENT; + } + + return 0; +} + +void RGWOptionsCORS::execute() +{ + op_ret = read_bucket_cors(); + if (op_ret < 0) + return; + + origin = s->info.env->get("HTTP_ORIGIN"); + if (!origin) { + ldpp_dout(this, 0) << "Missing mandatory Origin header" << dendl; + op_ret = -EINVAL; + return; + } + req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + ldpp_dout(this, 0) << "Missing mandatory Access-control-request-method header" << dendl; + op_ret = -EINVAL; + return; + } + if (!cors_exist) { + ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl; + op_ret = -ENOENT; + return; + } + req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS"); + op_ret = validate_cors_request(&bucket_cors); + if (!rule) { + origin = req_meth = NULL; + return; + } + return; +} + +int RGWGetRequestPayment::verify_permission() +{ + return verify_bucket_owner_or_policy(s, 
rgw::IAM::s3GetBucketRequestPayment); +} + +void RGWGetRequestPayment::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetRequestPayment::execute() +{ + requester_pays = s->bucket_info.requester_pays; +} + +int RGWSetRequestPayment::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketRequestPayment); +} + +void RGWSetRequestPayment::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWSetRequestPayment::execute() +{ + + if (!store->svc.zone->is_meta_master()) { + op_ret = forward_request_to_master(s, nullptr, store, in_data, nullptr); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = get_params(); + + if (op_ret < 0) + return; + + s->bucket_info.requester_pays = requester_pays; + op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), + &s->bucket_attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWInitMultipart::verify_permission() +{ + if (s->iam_policy || ! 
s->iam_user_policies.empty()) { + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + if (usr_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect e = Effect::Pass; + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + } + if (e == Effect::Allow) { + return 0; + } else if (e == Effect::Deny) { + return -EACCES; + } else if (usr_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWInitMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWInitMultipart::execute() +{ + bufferlist aclbl; + map attrs; + rgw_obj obj; + + if (get_params() < 0) + return; + + if (s->object.empty()) + return; + + policy.encode(aclbl); + attrs[RGW_ATTR_ACL] = aclbl; + + populate_with_generic_attrs(s, attrs); + + /* select encryption mode */ + op_ret = prepare_encryption(attrs); + if (op_ret != 0) + return; + + op_ret = rgw_get_request_metadata(s->cct, s->info, attrs); + if (op_ret < 0) { + return; + } + + do { + char buf[33]; + gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1); + upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */ + upload_id.append(buf); + + string tmp_obj_name; + RGWMPObj mp(s->object.name, upload_id); + tmp_obj_name = mp.get_meta(); + + obj.init_ns(s->bucket, tmp_obj_name, mp_ns); + // the meta object will be indexed with 0 size, we c + obj.set_in_extra_data(true); + obj.index_hash_source = s->object.name; + + RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), obj); + op_target.set_versioning_disabled(true); /* no versioning for multipart meta */ + + RGWRados::Object::Write obj_op(&op_target); + + obj_op.meta.owner = s->owner.get_id(); + obj_op.meta.category = RGWObjCategory::MultiMeta; + obj_op.meta.flags = 
PUT_OBJ_CREATE_EXCL; + + multipart_upload_info upload_info; + upload_info.dest_placement = s->dest_placement; + + bufferlist bl; + encode(upload_info, bl); + obj_op.meta.data = &bl; + + op_ret = obj_op.write_meta(bl.length(), 0, attrs); + } while (op_ret == -EEXIST); + + const auto ret = rgw::notify::publish(s, s->object, s->obj_size, ceph::real_clock::now(), attrs[RGW_ATTR_ETAG].to_str(), rgw::notify::ObjectCreatedPost, store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } +} + +int RGWCompleteMultipart::verify_permission() +{ + if (s->iam_policy || ! s->iam_user_policies.empty()) { + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + if (usr_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect e = Effect::Pass; + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, + rgw_obj(s->bucket, s->object)); + } + if (e == Effect::Allow) { + return 0; + } else if (e == Effect::Deny) { + return -EACCES; + } else if (usr_policy_res == Effect::Allow) { + return 0; + } + } + + if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + return 0; +} + +void RGWCompleteMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWCompleteMultipart::execute() +{ + RGWMultiCompleteUpload *parts; + map::iterator iter; + RGWMultiXMLParser parser; + string meta_oid; + map obj_parts; + map::iterator obj_iter; + map attrs; + off_t ofs = 0; + MD5 hash; + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + 
bufferlist etag_bl; + rgw_obj meta_obj; + rgw_obj target_obj; + RGWMPObj mp; + RGWObjManifest manifest; + uint64_t olh_epoch = 0; + + op_ret = get_params(); + if (op_ret < 0) + return; + op_ret = get_system_versioning_params(s, &olh_epoch, &version_id); + if (op_ret < 0) { + return; + } + + if (!data.length()) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + if (!parser.init()) { + op_ret = -EIO; + return; + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + parts = static_cast(parser.find_first("CompleteMultipartUpload")); + if (!parts || parts->parts.empty()) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + if ((int)parts->parts.size() > + s->cct->_conf->rgw_multipart_part_upload_limit) { + op_ret = -ERANGE; + return; + } + + mp.init(s->object.name, upload_id); + meta_oid = mp.get_meta(); + + int total_parts = 0; + int handled_parts = 0; + int max_parts = 1000; + int marker = 0; + bool truncated; + RGWCompressionInfo cs_info; + bool compressed = false; + uint64_t accounted_size = 0; + + uint64_t min_part_size = s->cct->_conf->rgw_multipart_min_part_size; + + list remove_objs; /* objects to be removed from index listing */ + + bool versioned_object = s->bucket_info.versioning_enabled(); + + iter = parts->parts.begin(); + + meta_obj.init_ns(s->bucket, meta_oid, mp_ns); + meta_obj.set_in_extra_data(true); + meta_obj.index_hash_source = s->object.name; + + /*take a cls lock on meta_obj to prevent racing completions (or retries) + from deleting the parts*/ + rgw_pool meta_pool; + rgw_raw_obj raw_obj; + int max_lock_secs_mp = + s->cct->_conf.get_val("rgw_mp_lock_max_time"); + utime_t dur(max_lock_secs_mp, 0); + + store->obj_to_raw((s->bucket_info).placement_rule, meta_obj, &raw_obj); + store->get_obj_data_pool((s->bucket_info).placement_rule, + meta_obj,&meta_pool); + store->open_pool_ctx(meta_pool, serializer.ioctx, true); + + op_ret = serializer.try_lock(raw_obj.oid, dur); + if (op_ret < 0) { + 
ldpp_dout(this, 0) << "failed to acquire lock" << dendl; + op_ret = -ERR_INTERNAL_ERROR; + s->err.message = "This multipart completion is already in progress"; + return; + } + + op_ret = get_obj_attrs(store, s, meta_obj, attrs); + + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << meta_obj + << " ret=" << op_ret << dendl; + return; + } + + do { + op_ret = list_multipart_parts(store, s, upload_id, meta_oid, max_parts, + marker, obj_parts, &marker, &truncated); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_UPLOAD; + } + if (op_ret < 0) + return; + + total_parts += obj_parts.size(); + if (!truncated && total_parts != (int)parts->parts.size()) { + ldpp_dout(this, 0) << "NOTICE: total parts mismatch: have: " << total_parts + << " expected: " << parts->parts.size() << dendl; + op_ret = -ERR_INVALID_PART; + return; + } + + for (obj_iter = obj_parts.begin(); iter != parts->parts.end() && obj_iter != obj_parts.end(); ++iter, ++obj_iter, ++handled_parts) { + uint64_t part_size = obj_iter->second.accounted_size; + if (handled_parts < (int)parts->parts.size() - 1 && + part_size < min_part_size) { + op_ret = -ERR_TOO_SMALL; + return; + } + + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + if (iter->first != (int)obj_iter->first) { + ldpp_dout(this, 0) << "NOTICE: parts num mismatch: next requested: " + << iter->first << " next uploaded: " + << obj_iter->first << dendl; + op_ret = -ERR_INVALID_PART; + return; + } + string part_etag = rgw_string_unquote(iter->second); + if (part_etag.compare(obj_iter->second.etag) != 0) { + ldpp_dout(this, 0) << "NOTICE: etag mismatch: part: " << iter->first + << " etag: " << iter->second << dendl; + op_ret = -ERR_INVALID_PART; + return; + } + + hex_to_buf(obj_iter->second.etag.c_str(), petag, + CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + + RGWUploadPartInfo& obj_part = obj_iter->second; + + /* update manifest for part */ + string oid = 
mp.get_part(obj_iter->second.num); + rgw_obj src_obj; + src_obj.init_ns(s->bucket, oid, mp_ns); + + if (obj_part.manifest.empty()) { + ldpp_dout(this, 0) << "ERROR: empty manifest for object part: obj=" + << src_obj << dendl; + op_ret = -ERR_INVALID_PART; + return; + } else { + manifest.append(obj_part.manifest, store->svc.zone); + } + + bool part_compressed = (obj_part.cs_info.compression_type != "none"); + if ((obj_iter != obj_parts.begin()) && + ((part_compressed != compressed) || + (cs_info.compression_type != obj_part.cs_info.compression_type))) { + ldpp_dout(this, 0) << "ERROR: compression type was changed during multipart upload (" + << cs_info.compression_type << ">>" << obj_part.cs_info.compression_type << ")" << dendl; + op_ret = -ERR_INVALID_PART; + return; + } + + if (part_compressed) { + int64_t new_ofs; // offset in compression data for new part + if (cs_info.blocks.size() > 0) + new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len; + else + new_ofs = 0; + for (const auto& block : obj_part.cs_info.blocks) { + compression_block cb; + cb.old_ofs = block.old_ofs + cs_info.orig_size; + cb.new_ofs = new_ofs; + cb.len = block.len; + cs_info.blocks.push_back(cb); + new_ofs = cb.new_ofs + cb.len; + } + if (!compressed) + cs_info.compression_type = obj_part.cs_info.compression_type; + cs_info.orig_size += obj_part.cs_info.orig_size; + compressed = true; + } + + rgw_obj_index_key remove_key; + src_obj.key.get_index_key(&remove_key); + + remove_objs.push_back(remove_key); + + ofs += obj_part.size; + accounted_size += obj_part.accounted_size; + } + } while (truncated); + hash.Final((unsigned char *)final_etag); + + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)parts->parts.size()); + etag = final_etag_str; + ldpp_dout(this, 10) << "calculated etag: " << final_etag_str << dendl; + + 
etag_bl.append(final_etag_str, strlen(final_etag_str)); + + attrs[RGW_ATTR_ETAG] = etag_bl; + + if (compressed) { + // write compression attribute to full object + bufferlist tmp; + encode(cs_info, tmp); + attrs[RGW_ATTR_COMPRESSION] = tmp; + } + + target_obj.init(s->bucket, s->object.name); + if (versioned_object) { + if (!version_id.empty()) { + target_obj.key.set_instance(version_id); + } else { + store->gen_rand_obj_instance_name(&target_obj); + version_id = target_obj.key.get_instance(); + } + } + + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + + obj_ctx.set_atomic(target_obj); + + RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), target_obj); + RGWRados::Object::Write obj_op(&op_target); + + obj_op.meta.manifest = &manifest; + obj_op.meta.remove_objs = &remove_objs; + + obj_op.meta.ptag = &s->req_id; /* use req_id as operation tag */ + obj_op.meta.owner = s->owner.get_id(); + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.modify_tail = true; + obj_op.meta.completeMultipart = true; + obj_op.meta.olh_epoch = olh_epoch; + op_ret = obj_op.write_meta(ofs, accounted_size, attrs); + if (op_ret < 0) + return; + + // remove the upload obj + int r = store->delete_obj(*static_cast(s->obj_ctx), + s->bucket_info, meta_obj, 0); + if (r >= 0) { + /* serializer's exclusive lock is released */ + serializer.clear_locked(); + } else { + ldpp_dout(this, 0) << "WARNING: failed to remove object " << meta_obj << dendl; + } + + const auto ret = rgw::notify::publish(s, s->object, s->obj_size, ceph::real_clock::now(), etag, rgw::notify::ObjectCreatedCompleteMultipartUpload, store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } +} 
+
+/**
+ * Try to take the exclusive lock used to serialize multipart completion.
+ *
+ * Remembers @p _oid, requires the object to exist (op.assert_exists()),
+ * applies @p dur as the lock duration and submits the exclusive-lock
+ * operation through ioctx.
+ *
+ * @param _oid  object id to lock; stored in `oid`
+ * @param dur   lock duration handed to lock.set_duration()
+ * @return the result of ioctx.operate(); `locked` is set only when it is 0
+ */
+int RGWCompleteMultipart::MPSerializer::try_lock(
+  const std::string& _oid,
+  utime_t dur)
+{
+  oid = _oid;
+  op.assert_exists();
+  lock.set_duration(dur);
+  lock.lock_exclusive(&op);
+  int ret = ioctx.operate(oid, &op);
+  if (! ret) {
+    locked = true;
+  }
+  return ret;
+}
+
+/**
+ * Finish the op: drop the serializer lock if it is still held (a failed
+ * unlock is only logged), then send the response.
+ */
+void RGWCompleteMultipart::complete()
+{
+  /* release exclusive lock iff not already */
+  if (unlikely(serializer.locked)) {
+    int r = serializer.unlock();
+    if (r < 0) {
+      ldpp_dout(this, 0) << "WARNING: failed to unlock " << serializer.oid << dendl;
+    }
+  }
+  send_response();
+}
+
+/**
+ * Authorize an abort-multipart request for s3AbortMultipartUpload.
+ *
+ * When a bucket policy or user policies are present they are evaluated
+ * first: a Deny from either one rejects the request, an Allow from either
+ * one accepts it.  If neither decides, the bucket ACL must grant
+ * RGW_PERM_WRITE.
+ *
+ * @return 0 when permitted, -EACCES otherwise
+ */
+int RGWAbortMultipart::verify_permission()
+{
+  if (s->iam_policy || ! s->iam_user_policies.empty()) {
+    auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env,
+                                             boost::none,
+                                             rgw::IAM::s3AbortMultipartUpload,
+                                             rgw_obj(s->bucket, s->object));
+    if (usr_policy_res == Effect::Deny) {
+      return -EACCES;
+    }
+
+    /* The bucket policy is optional; without one the effect stays Pass. */
+    rgw::IAM::Effect e = Effect::Pass;
+    if (s->iam_policy) {
+      e = s->iam_policy->eval(s->env, *s->auth.identity,
+                              rgw::IAM::s3AbortMultipartUpload,
+                              rgw_obj(s->bucket, s->object));
+    }
+    if (e == Effect::Allow) {
+      return 0;
+    } else if (e == Effect::Deny) {
+      return -EACCES;
+    } else if (usr_policy_res == Effect::Allow) {
+      return 0;
+    }
+  }
+
+  if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+/** Pre-execution hook; only calls rgw_bucket_object_pre_exec(s). */
+void RGWAbortMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+/**
+ * Abort the multipart upload named by the "uploadId" request argument.
+ *
+ * op_ret is -EINVAL when the upload id or the object name is empty,
+ * otherwise the result of get_multipart_info() or, if that succeeded, of
+ * abort_multipart_upload().
+ */
+void RGWAbortMultipart::execute()
+{
+  /* Default result, kept when the request lacks an upload id or object. */
+  op_ret = -EINVAL;
+
+  string upload_id = s->info.args.get("uploadId");
+  if (upload_id.empty() || s->object.empty())
+    return;
+
+  RGWMPObj mp;
+  mp.init(s->object.name, upload_id);
+  string meta_oid = mp.get_meta();
+
+  /* Only the return code is of interest here: all out-params are null. */
+  op_ret = get_multipart_info(store, s, meta_oid, nullptr, nullptr, nullptr);
+  if (op_ret < 0)
+    return;
+
+  /* The cast target is the declared type of obj_ctx. */
+  RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
+  op_ret = abort_multipart_upload(store, s->cct, obj_ctx, s->bucket_info, mp);
+}
+
+int 
RGWListMultipart::verify_permission() +{ /* Returns 0 when verify_object_permission() grants s3ListMultipartUploadParts, -EACCES otherwise. */ + if (!verify_object_permission(this, s, rgw::IAM::s3ListMultipartUploadParts)) + return -EACCES; + + return 0; +} + /* Pre-execution hook; only calls rgw_bucket_object_pre_exec(s). */ +void RGWListMultipart::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + /* Lists the parts of one multipart upload. After get_params(), the meta object name is derived from the object name and upload_id, get_multipart_info() fills `policy`, and list_multipart_parts() fills `parts` and `truncated`. The outcome is left in op_ret. */ +void RGWListMultipart::execute() +{ + string meta_oid; + RGWMPObj mp; + + op_ret = get_params(); + if (op_ret < 0) + return; + + mp.init(s->object.name, upload_id); + meta_oid = mp.get_meta(); + + op_ret = get_multipart_info(store, s, meta_oid, &policy, nullptr, nullptr); + if (op_ret < 0) + return; + + op_ret = list_multipart_parts(store, s, upload_id, meta_oid, max_parts, + marker, parts, NULL, &truncated); +} + /* Returns 0 when verify_bucket_permission() grants s3ListBucketMultipartUploads, -EACCES otherwise. */ +int RGWListBucketMultiparts::verify_permission() +{ + if (!verify_bucket_permission(this, + s, + rgw::IAM::s3ListBucketMultipartUploads)) + return -EACCES; + + return 0; +} + /* Pre-execution hook; only calls rgw_bucket_object_pre_exec(s). */ +void RGWListBucketMultiparts::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + /* Lists the multipart uploads of the bucket into `uploads`, `common_prefixes` and `is_truncated`; the outcome is left in op_ret. NOTE(review): the `vector` declarations below carry no element type in this copy of the file; the template arguments look stripped - confirm against the real source. */ +void RGWListBucketMultiparts::execute() +{ + vector objs; + string marker_meta; + + op_ret = get_params(); + if (op_ret < 0) + return; + /* Swift only: a non-empty "path" argument is turned into prefix=path, delimiter="/" and cannot be combined with an explicit prefix or delimiter (-EINVAL). */ + if (s->prot_flags & RGW_REST_SWIFT) { + string path_args; + path_args = s->info.args.get("path"); + if (!path_args.empty()) { + if (!delimiter.empty() || !prefix.empty()) { + op_ret = -EINVAL; + return; + } + prefix = path_args; + delimiter="/"; + } + } + marker_meta = marker.get_meta(); + + op_ret = list_bucket_multiparts(store, s->bucket_info, prefix, marker_meta, delimiter, + max_uploads, &objs, &common_prefixes, &is_truncated); + if (op_ret < 0) { + return; + } + /* Entries whose key name from_meta() rejects are skipped; next_marker receives `entry` as the loop left it. */ + if (!objs.empty()) { + vector::iterator iter; + RGWMultipartUploadEntry entry; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + rgw_obj_key key(iter->key); + if (!entry.mp.from_meta(key.name)) + continue; + entry.obj = *iter; + uploads.push_back(entry); + } + next_marker = entry; + } +} + /* Health check: op_ret becomes -ERR_SERVICE_UNAVAILABLE when rgw_healthcheck_disabling_path is set and that path exists, 0 otherwise. */ +void RGWGetHealthCheck::execute() +{ + if (!g_conf()->rgw_healthcheck_disabling_path.empty() && + (::access(g_conf()->rgw_healthcheck_disabling_path.c_str(), F_OK) == 0)) 
{ + /* Disabling path specified & existent in the filesystem. */ + op_ret = -ERR_SERVICE_UNAVAILABLE; /* 503 */ + } else { + op_ret = 0; /* 200 OK */ + } +} + +int RGWDeleteMultiObj::verify_permission() +{ + if (s->iam_policy || ! s->iam_user_policies.empty()) { + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + s->object.instance.empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket)); + if (usr_policy_res == Effect::Deny) { + return -EACCES; + } + + rgw::IAM::Effect r = Effect::Pass; + if (s->iam_policy) { + r = s->iam_policy->eval(s->env, *s->auth.identity, + s->object.instance.empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(s->bucket)); + } + if (r == Effect::Allow) + return 0; + else if (r == Effect::Deny) + return -EACCES; + else if (usr_policy_res == Effect::Allow) + return 0; + } + + acl_allowed = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE); + if (!acl_allowed) + return -EACCES; + + return 0; +} + +void RGWDeleteMultiObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDeleteMultiObj::execute() +{ + RGWMultiDelDelete *multi_delete; + vector::iterator iter; + RGWMultiDelXMLParser parser; + RGWObjectCtx *obj_ctx = static_cast(s->obj_ctx); + char* buf; + + op_ret = get_params(); + if (op_ret < 0) { + goto error; + } + + buf = data.c_str(); + if (!buf) { + op_ret = -EINVAL; + goto error; + } + + if (!parser.init()) { + op_ret = -EINVAL; + goto error; + } + + if (!parser.parse(buf, data.length(), 1)) { + op_ret = -EINVAL; + goto error; + } + + multi_delete = static_cast(parser.find_first("Delete")); + if (!multi_delete) { + op_ret = -EINVAL; + goto error; + } else { +#define DELETE_MULTI_OBJ_MAX_NUM 1000 + int max_num = s->cct->_conf->rgw_delete_multi_obj_max_num; + if (max_num < 0) { + max_num = DELETE_MULTI_OBJ_MAX_NUM; + } + int multi_delete_object_num = multi_delete->objects.size(); + if (multi_delete_object_num > max_num) { 
+ op_ret = -ERR_MALFORMED_XML; + goto error; + } + } + + if (multi_delete->is_quiet()) + quiet = true; + + if (s->bucket_info.mfa_enabled()) { + bool has_versioned = false; + for (auto i : multi_delete->objects) { + if (!i.instance.empty()) { + has_versioned = true; + break; + } + } + if (has_versioned && !s->mfa_verified) { + ldpp_dout(this, 5) << "NOTICE: multi-object delete request with a versioned object, mfa auth not provided" << dendl; + op_ret = -ERR_MFA_REQUIRED; + goto error; + } + } + + begin_response(); + if (multi_delete->objects.empty()) { + goto done; + } + + for (iter = multi_delete->objects.begin(); + iter != multi_delete->objects.end(); + ++iter) { + rgw_obj obj(bucket, *iter); + if (s->iam_policy || ! s->iam_user_policies.empty()) { + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + iter->instance.empty() ? + rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(obj)); + if (usr_policy_res == Effect::Deny) { + send_partial_response(*iter, false, "", -EACCES); + continue; + } + + rgw::IAM::Effect e = Effect::Pass; + if (s->iam_policy) { + e = s->iam_policy->eval(s->env, + *s->auth.identity, + iter->instance.empty() ? 
+ rgw::IAM::s3DeleteObject : + rgw::IAM::s3DeleteObjectVersion, + ARN(obj)); + } + if ((e == Effect::Deny) || + (usr_policy_res == Effect::Pass && e == Effect::Pass && !acl_allowed)) { + send_partial_response(*iter, false, "", -EACCES); + continue; + } + } + + obj_ctx->set_atomic(obj); + + RGWRados::Object del_target(store, s->bucket_info, *obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = s->bucket_owner.get_id(); + del_op.params.versioning_status = s->bucket_info.versioning_status(); + del_op.params.obj_owner = s->owner; + + op_ret = del_op.delete_obj(); + if (op_ret == -ENOENT) { + op_ret = 0; + } + + send_partial_response(*iter, del_op.result.delete_marker, + del_op.result.version_id, op_ret); + + const auto obj_state = obj_ctx->get_state(obj); + bufferlist etag_bl; + const auto etag = obj_state->get_attr(RGW_ATTR_ETAG, etag_bl) ? etag_bl.to_str() : ""; + + const auto ret = rgw::notify::publish(s, obj.key, obj_state->size, ceph::real_clock::now(), etag, + del_op.result.delete_marker && s->object.instance.empty() ? 
rgw::notify::ObjectRemovedDeleteMarkerCreated : rgw::notify::ObjectRemovedDelete, + store); + if (ret < 0) { + ldpp_dout(this, 5) << "WARNING: publishing notification failed, with error: " << ret << dendl; + // TODO: we should have conf to make send a blocking coroutine and reply with error in case sending failed + // this should be global conf (probably returnign a different handler) + // so we don't need to read the configured values before we perform it + } + } + + /* set the return code to zero, errors at this point will be + dumped to the response */ + op_ret = 0; + +done: + // will likely segfault if begin_response() has not been called + end_response(); + return; + +error: + send_status(); + return; + +} + +bool RGWBulkDelete::Deleter::verify_permission(RGWBucketInfo& binfo, + map& battrs, + ACLOwner& bucket_owner /* out */) +{ + RGWAccessControlPolicy bacl(store->ctx()); + int ret = read_bucket_policy(store, s, binfo, battrs, &bacl, binfo.bucket); + if (ret < 0) { + return false; + } + + auto policy = get_iam_policy_from_attr(s->cct, store, battrs, binfo.bucket.tenant); + + bucket_owner = bacl.get_owner(); + + /* We can use global user_acl because each BulkDelete request is allowed + * to work on entities from a single account only. 
*/ + return verify_bucket_permission(dpp, s, binfo.bucket, s->user_acl.get(), + &bacl, policy, s->iam_user_policies, rgw::IAM::s3DeleteBucket); +} + +bool RGWBulkDelete::Deleter::delete_single(const acct_path_t& path) +{ + auto& obj_ctx = *static_cast(s->obj_ctx); + + RGWBucketInfo binfo; + map battrs; + ACLOwner bowner; + + int ret = store->get_bucket_info(*s->sysobj_ctx, s->user->user_id.tenant, + path.bucket_name, binfo, nullptr, + &battrs); + if (ret < 0) { + goto binfo_fail; + } + + if (!verify_permission(binfo, battrs, bowner)) { + ret = -EACCES; + goto auth_fail; + } + + if (!path.obj_key.empty()) { + rgw_obj obj(binfo.bucket, path.obj_key); + obj_ctx.set_atomic(obj); + + RGWRados::Object del_target(store, binfo, obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = binfo.owner; + del_op.params.versioning_status = binfo.versioning_status(); + del_op.params.obj_owner = bowner; + + ret = del_op.delete_obj(); + if (ret < 0) { + goto delop_fail; + } + } else { + RGWObjVersionTracker ot; + ot.read_version = binfo.ep_objv; + + ret = store->delete_bucket(binfo, ot); + if (0 == ret) { + ret = rgw_unlink_bucket(store, binfo.owner, binfo.bucket.tenant, + binfo.bucket.name, false); + if (ret < 0) { + ldpp_dout(s, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl; + } + } + if (ret < 0) { + goto delop_fail; + } + + if (!store->svc.zone->is_meta_master()) { + bufferlist in_data; + ret = forward_request_to_master(s, &ot.read_version, store, in_data, + nullptr); + if (ret < 0) { + if (ret == -ENOENT) { + /* adjust error, we want to return with NoSuchBucket and not + * NoSuchKey */ + ret = -ERR_NO_SUCH_BUCKET; + } + goto delop_fail; + } + } + } + + num_deleted++; + return true; + + +binfo_fail: + if (-ENOENT == ret) { + ldpp_dout(s, 20) << "cannot find bucket = " << path.bucket_name << dendl; + num_unfound++; + } else { + ldpp_dout(s, 20) << "cannot get bucket info, ret = " << ret << dendl; + + fail_desc_t failed_item = 
{ + .err = ret, + .path = path + }; + failures.push_back(failed_item); + } + return false; + +auth_fail: + ldpp_dout(s, 20) << "wrong auth for " << path << dendl; + { + fail_desc_t failed_item = { + .err = ret, + .path = path + }; + failures.push_back(failed_item); + } + return false; + +delop_fail: + if (-ENOENT == ret) { + ldpp_dout(s, 20) << "cannot find entry " << path << dendl; + num_unfound++; + } else { + fail_desc_t failed_item = { + .err = ret, + .path = path + }; + failures.push_back(failed_item); + } + return false; +} + +bool RGWBulkDelete::Deleter::delete_chunk(const std::list& paths) +{ + ldpp_dout(s, 20) << "in delete_chunk" << dendl; + for (auto path : paths) { + ldpp_dout(s, 20) << "bulk deleting path: " << path << dendl; + delete_single(path); + } + + return true; +} + +int RGWBulkDelete::verify_permission() +{ + return 0; +} + +void RGWBulkDelete::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWBulkDelete::execute() +{ + deleter = std::unique_ptr(new Deleter(this, store, s)); + + bool is_truncated = false; + do { + list items; + + int ret = get_data(items, &is_truncated); + if (ret < 0) { + return; + } + + ret = deleter->delete_chunk(items); + } while (!op_ret && is_truncated); + + return; +} + + +constexpr std::array RGWBulkUploadOp::terminal_errors; + +int RGWBulkUploadOp::verify_permission() +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (! 
verify_user_permission_no_policy(this, s, RGW_PERM_WRITE)) { + return -EACCES; + } + + if (s->user->user_id.tenant != s->bucket_tenant) { + ldpp_dout(this, 10) << "user cannot create a bucket in a different tenant" + << " (user_id.tenant=" << s->user->user_id.tenant + << " requested=" << s->bucket_tenant << ")" << dendl; + return -EACCES; + } + + if (s->user->max_buckets < 0) { + return -EPERM; + } + + return 0; +} + +void RGWBulkUploadOp::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +boost::optional> +RGWBulkUploadOp::parse_path(const boost::string_ref& path) +{ + /* We need to skip all slashes at the beginning in order to preserve + * compliance with Swift. */ + const size_t start_pos = path.find_first_not_of('/'); + + if (boost::string_ref::npos != start_pos) { + /* Seperator is the first slash after the leading ones. */ + const size_t sep_pos = path.substr(start_pos).find('/'); + + if (boost::string_ref::npos != sep_pos) { + const auto bucket_name = path.substr(start_pos, sep_pos - start_pos); + const auto obj_name = path.substr(sep_pos + 1); + + return std::make_pair(bucket_name.to_string(), + rgw_obj_key(obj_name.to_string())); + } else { + /* It's guaranteed here that bucket name is at least one character + * long and is different than slash. */ + return std::make_pair(path.substr(start_pos).to_string(), + rgw_obj_key()); + } + } + + return none; +} + +std::pair +RGWBulkUploadOp::handle_upload_path(struct req_state *s) +{ + std::string bucket_path, file_prefix; + if (! s->init_state.url_bucket.empty()) { + file_prefix = bucket_path = s->init_state.url_bucket + "/"; + if (! s->object.empty()) { + std::string& object_name = s->object.name; + + /* As rgw_obj_key::empty() already verified emptiness of s->object.name, + * we can safely examine its last element. 
*/ + if (object_name.back() == '/') { + file_prefix.append(object_name); + } else { + file_prefix.append(object_name).append("/"); + } + } + } + return std::make_pair(bucket_path, file_prefix); +} + /* Bucket-count gate for creating a bucket from a directory entry. When the user has a positive max_buckets, the user's buckets are read with rgw_read_user_buckets() and -ERR_TOO_MANY_BUCKETS is returned once buckets.count() has reached that limit. A read failure is returned as is and also left in op_ret. Returns 0 in every other case, including a max_buckets that is not positive. */ +int RGWBulkUploadOp::handle_dir_verify_permission() +{ + if (s->user->max_buckets > 0) { + RGWUserBuckets buckets; + std::string marker; + bool is_truncated = false; + op_ret = rgw_read_user_buckets(store, s->user->user_id, buckets, + marker, std::string(), s->user->max_buckets, + false, &is_truncated); + if (op_ret < 0) { + return op_ret; + } + /* NOTE(review): this static_cast has no template argument in this copy of the file; it looks stripped - confirm against the real source. */ + if (buckets.count() >= static_cast(s->user->max_buckets)) { + return -ERR_TOO_MANY_BUCKETS; + } + } + + return 0; +} + /* Makes the URIs in `info` address `bucket_name`. Nothing happens when script_uri already contains bucket_name (plain substring search). Otherwise "/" plus bucket_name is appended to script_uri, the result is copied to request_uri and request_uri_aws4, and effective_uri becomes "/" plus bucket_name. */ +static void forward_req_info(CephContext *cct, req_info& info, const std::string& bucket_name) +{ + /* the request of container or object level will contain bucket name. + * only at account level need to append the bucket name */ + if (info.script_uri.find(bucket_name) != std::string::npos) { + return; + } + + ldout(cct, 20) << "append the bucket: "<< bucket_name << " to req_info" << dendl; + info.script_uri.append("/").append(bucket_name); + info.request_uri_aws4 = info.request_uri = info.script_uri; + info.effective_uri = "/" + bucket_name; +} + /* Initialises the RGWOp base and creates dir_ctx from the sysobj service's init_obj_ctx(). */ +void RGWBulkUploadOp::init(RGWRados* const store, + struct req_state* const s, + RGWHandler* const h) +{ + RGWOp::init(store, s, h); + dir_ctx.emplace(store->svc.sysobj->init_obj_ctx()); +} + /* Handles one directory entry of the upload; the bucket name is the first component of `path`. The result is returned and also left in op_ret. NOTE(review): the result of parse_path() is dereferenced below without a check; presumably the caller only passes paths that contain a bucket name - confirm. */ +int RGWBulkUploadOp::handle_dir(const boost::string_ref path) +{ + ldpp_dout(this, 20) << "got directory=" << path << dendl; + + op_ret = handle_dir_verify_permission(); + if (op_ret < 0) { + return op_ret; + } + + std::string bucket_name; + rgw_obj_key object_junk; + std::tie(bucket_name, object_junk) = *parse_path(path); + + rgw_raw_obj obj(store->svc.zone->get_zone_params().domain_root, + rgw_make_bucket_entry_name(s->bucket_tenant, bucket_name)); + + /* we need to make sure we read bucket info, it's not read before for this + * specific request */ + RGWBucketInfo binfo; 
+ std::map battrs; + op_ret = store->get_bucket_info(*dir_ctx, s->bucket_tenant, bucket_name, + binfo, nullptr, &battrs); + if (op_ret < 0 && op_ret != -ENOENT) { + return op_ret; + } + const bool bucket_exists = (op_ret != -ENOENT); + + if (bucket_exists) { + RGWAccessControlPolicy old_policy(s->cct); + int r = rgw_op_get_bucket_policy_from_attr(s->cct, store, binfo, + battrs, &old_policy); + if (r >= 0) { + if (old_policy.get_owner().get_id().compare(s->user->user_id) != 0) { + op_ret = -EEXIST; + return op_ret; + } + } + } + + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket = nullptr; + uint32_t *pmaster_num_shards = nullptr; + real_time creation_time; + obj_version objv, ep_objv, *pobjv = nullptr; + + if (! store->svc.zone->is_meta_master()) { + JSONParser jp; + ceph::bufferlist in_data; + req_info info = s->info; + forward_req_info(s->cct, info, bucket_name); + op_ret = forward_request_to_master(s, nullptr, store, in_data, &jp, &info); + if (op_ret < 0) { + return op_ret; + } + + JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp); + JSONDecoder::decode_json("object_ver", objv, &jp); + JSONDecoder::decode_json("bucket_info", master_info, &jp); + + ldpp_dout(this, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl; + ldpp_dout(this, 20) << "got creation_time="<< master_info.creation_time << dendl; + + pmaster_bucket= &master_info.bucket; + creation_time = master_info.creation_time; + pmaster_num_shards = &master_info.num_shards; + pobjv = &objv; + } else { + pmaster_bucket = nullptr; + pmaster_num_shards = nullptr; + } + + rgw_placement_rule placement_rule(binfo.placement_rule, s->info.storage_class); + + if (bucket_exists) { + rgw_placement_rule selected_placement_rule; + rgw_bucket bucket; + bucket.tenant = s->bucket_tenant; + bucket.name = s->bucket_name; + op_ret = store->svc.zone->select_bucket_placement(*(s->user), + store->svc.zone->get_zonegroup().get_id(), + placement_rule, + &selected_placement_rule, + 
nullptr); + if (selected_placement_rule != binfo.placement_rule) { + op_ret = -EEXIST; + ldpp_dout(this, 20) << "non-coherent placement rule" << dendl; + return op_ret; + } + } + + /* Create metadata: ACLs. */ + std::map attrs; + RGWAccessControlPolicy policy; + policy.create_default(s->user->user_id, s->user->display_name); + ceph::bufferlist aclbl; + policy.encode(aclbl); + attrs.emplace(RGW_ATTR_ACL, std::move(aclbl)); + + RGWQuotaInfo quota_info; + const RGWQuotaInfo * pquota_info = nullptr; + + rgw_bucket bucket; + bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */ + bucket.name = bucket_name; + + + RGWBucketInfo out_info; + op_ret = store->create_bucket(*(s->user), + bucket, + store->svc.zone->get_zonegroup().get_id(), + placement_rule, binfo.swift_ver_location, + pquota_info, attrs, + out_info, pobjv, &ep_objv, creation_time, + pmaster_bucket, pmaster_num_shards, true); + /* continue if EEXIST and create_bucket will fail below. this way we can + * recover from a partial create by retrying it. */ + ldpp_dout(this, 20) << "rgw_create_bucket returned ret=" << op_ret + << ", bucket=" << bucket << dendl; + + if (op_ret && op_ret != -EEXIST) { + return op_ret; + } + + const bool existed = (op_ret == -EEXIST); + if (existed) { + /* bucket already existed, might have raced with another bucket creation, or + * might be partial bucket creation that never completed. Read existing bucket + * info, verify that the reported bucket owner is the current user. + * If all is ok then update the user's list of buckets. + * Otherwise inform client about a name conflict. + */ + if (out_info.owner.compare(s->user->user_id) != 0) { + op_ret = -EEXIST; + ldpp_dout(this, 20) << "conflicting bucket name" << dendl; + return op_ret; + } + bucket = out_info.bucket; + } + + op_ret = rgw_link_bucket(store, s->user->user_id, bucket, + out_info.creation_time, false); + if (op_ret && !existed && op_ret != -EEXIST) { + /* if it exists (or previously existed), don't remove it! 
*/ + op_ret = rgw_unlink_bucket(store, s->user->user_id, + bucket.tenant, bucket.name); + if (op_ret < 0) { + ldpp_dout(this, 0) << "WARNING: failed to unlink bucket: ret=" << op_ret << dendl; + } + } else if (op_ret == -EEXIST || (op_ret == 0 && existed)) { + ldpp_dout(this, 20) << "containers already exists" << dendl; + op_ret = -ERR_BUCKET_EXISTS; + } + + return op_ret; +} + + +bool RGWBulkUploadOp::handle_file_verify_permission(RGWBucketInfo& binfo, + const rgw_obj& obj, + std::map& battrs, + ACLOwner& bucket_owner /* out */) +{ + RGWAccessControlPolicy bacl(store->ctx()); + op_ret = read_bucket_policy(store, s, binfo, battrs, &bacl, binfo.bucket); + if (op_ret < 0) { + ldpp_dout(this, 20) << "cannot read_policy() for bucket" << dendl; + return false; + } + + auto policy = get_iam_policy_from_attr(s->cct, store, battrs, binfo.bucket.tenant); + + bucket_owner = bacl.get_owner(); + if (policy || ! s->iam_user_policies.empty()) { + auto usr_policy_res = eval_user_policies(s->iam_user_policies, s->env, + boost::none, + rgw::IAM::s3PutObject, obj); + if (usr_policy_res == Effect::Deny) { + return false; + } + auto e = policy->eval(s->env, *s->auth.identity, + rgw::IAM::s3PutObject, obj); + if (e == Effect::Allow) { + return true; + } else if (e == Effect::Deny) { + return false; + } else if (usr_policy_res == Effect::Allow) { + return true; + } + } + + return verify_bucket_permission_no_policy(this, s, s->user_acl.get(), + &bacl, RGW_PERM_WRITE); +} + +int RGWBulkUploadOp::handle_file(const boost::string_ref path, + const size_t size, + AlignedStreamGetter& body) +{ + + ldpp_dout(this, 20) << "got file=" << path << ", size=" << size << dendl; + + if (size > static_cast(s->cct->_conf->rgw_max_put_size)) { + op_ret = -ERR_TOO_LARGE; + return op_ret; + } + + std::string bucket_name; + rgw_obj_key object; + std::tie(bucket_name, object) = *parse_path(path); + + auto& obj_ctx = *static_cast(s->obj_ctx); + RGWBucketInfo binfo; + std::map battrs; + ACLOwner bowner; + 
op_ret = store->get_bucket_info(*s->sysobj_ctx, s->user->user_id.tenant, + bucket_name, binfo, nullptr, &battrs); + if (op_ret == -ENOENT) { + ldpp_dout(this, 20) << "non existent directory=" << bucket_name << dendl; + } else if (op_ret < 0) { + return op_ret; + } + + if (! handle_file_verify_permission(binfo, + rgw_obj(binfo.bucket, object), + battrs, bowner)) { + ldpp_dout(this, 20) << "object creation unauthorized" << dendl; + op_ret = -EACCES; + return op_ret; + } + + op_ret = store->check_quota(bowner.get_id(), binfo.bucket, + user_quota, bucket_quota, size); + if (op_ret < 0) { + return op_ret; + } + + op_ret = store->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota); + if (op_ret < 0) { + return op_ret; + } + + rgw_obj obj(binfo.bucket, object); + if (s->bucket_info.versioning_enabled()) { + store->gen_rand_obj_instance_name(&obj); + } + + rgw_placement_rule dest_placement = s->dest_placement; + dest_placement.inherit_from(binfo.placement_rule); + + rgw::AioThrottle aio(store->ctx()->_conf->rgw_put_obj_min_window_size); + + using namespace rgw::putobj; + + AtomicObjectProcessor processor(&aio, store, binfo, &s->dest_placement, bowner.get_id(), + obj_ctx, obj, 0, s->req_id); + + op_ret = processor.prepare(); + if (op_ret < 0) { + ldpp_dout(this, 20) << "cannot prepare processor due to ret=" << op_ret << dendl; + return op_ret; + } + + /* No filters by default. */ + DataProcessor *filter = &processor; + + const auto& compression_type = store->svc.zone->get_zone_params().get_compression_type( + dest_placement); + CompressorRef plugin; + boost::optional compressor; + if (compression_type != "none") { + plugin = Compressor::create(s->cct, compression_type); + if (! plugin) { + ldpp_dout(this, 1) << "Cannot load plugin for rgw_compression_type " + << compression_type << dendl; + } else { + compressor.emplace(s->cct, plugin, filter); + filter = &*compressor; + } + } + + /* Upload file content. 
*/ + ssize_t len = 0; + size_t ofs = 0; + MD5 hash; + do { + ceph::bufferlist data; + len = body.get_at_most(s->cct->_conf->rgw_max_chunk_size, data); + + ldpp_dout(this, 20) << "body=" << data.c_str() << dendl; + if (len < 0) { + op_ret = len; + return op_ret; + } else if (len > 0) { + hash.Update((const unsigned char *)data.c_str(), data.length()); + op_ret = filter->process(std::move(data), ofs); + if (op_ret < 0) { + ldpp_dout(this, 20) << "filter->process() returned ret=" << op_ret << dendl; + return op_ret; + } + + ofs += len; + } + + } while (len > 0); + + // flush + op_ret = filter->process({}, ofs); + if (op_ret < 0) { + return op_ret; + } + + if (ofs != size) { + ldpp_dout(this, 10) << "real file size different from declared" << dendl; + op_ret = -EINVAL; + return op_ret; + } + + op_ret = store->check_quota(bowner.get_id(), binfo.bucket, + user_quota, bucket_quota, size); + if (op_ret < 0) { + ldpp_dout(this, 20) << "quota exceeded for path=" << path << dendl; + return op_ret; + } + + op_ret = store->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota); + if (op_ret < 0) { + return op_ret; + } + + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + hash.Final(m); + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + /* Create metadata: ETAG. */ + std::map attrs; + std::string etag = calc_md5; + ceph::bufferlist etag_bl; + etag_bl.append(etag.c_str(), etag.size() + 1); + attrs.emplace(RGW_ATTR_ETAG, std::move(etag_bl)); + + /* Create metadata: ACLs. */ + RGWAccessControlPolicy policy; + policy.create_default(s->user->user_id, s->user->display_name); + ceph::bufferlist aclbl; + policy.encode(aclbl); + attrs.emplace(RGW_ATTR_ACL, std::move(aclbl)); + + /* Create metadata: compression info. 
*/
+  if (compressor && compressor->is_compressed()) {
+    ceph::bufferlist tmp;
+    RGWCompressionInfo cs_info;
+    cs_info.compression_type = plugin->get_type_name();
+    /* Record the uncompressed length of *this* archive entry. `size` is the
+     * same value handed to processor.complete() below; s->obj_size is never
+     * assigned anywhere in handle_file and so does not describe this file. */
+    cs_info.orig_size = size;
+    cs_info.blocks = std::move(compressor->get_compression_blocks());
+    encode(cs_info, tmp);
+    attrs.emplace(RGW_ATTR_COMPRESSION, std::move(tmp));
+  }
+
+  /* Complete the transaction. */
+  op_ret = processor.complete(size, etag, nullptr, ceph::real_time(),
+                              attrs, ceph::real_time() /* delete_at */,
+                              nullptr, nullptr, nullptr, nullptr, nullptr);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "processor::complete returned op_ret=" << op_ret << dendl;
+  }
+
+  return op_ret;
+}
+
+/* Drive a bulk upload: read the request body block by block as a TAR
+ * archive and dispatch every header to handle_file() or handle_dir(). */
+void RGWBulkUploadOp::execute()
+{
+  ceph::bufferlist buffer(64 * 1024);
+
+  ldpp_dout(this, 20) << "start" << dendl;
+
+  /* Create an instance of stream-abstracting class. Having this indirection
+   * allows for easy introduction of decompressors like gzip and bzip2. */
+  auto stream = create_stream();
+  if (! stream) {
+    return;
+  }
+
+  /* Handle the $UPLOAD_PATH according to Swift's Bulk middleware. See:
+   * https://github.com/openstack/swift/blob/2.13.0/swift/common/middleware/bulk.py#L31-L41 */
+  std::string bucket_path, file_prefix;
+  std::tie(bucket_path, file_prefix) = handle_upload_path(s);
+
+  auto status = rgw::tar::StatusIndicator::create();
+  do {
+    op_ret = stream->get_exactly(rgw::tar::BLOCK_SIZE, buffer);
+    if (op_ret < 0) {
+      ldpp_dout(this, 2) << "cannot read header" << dendl;
+      return;
+    }
+
+    /* We need to re-interpret the buffer as a TAR block. Exactly two blocks
+     * must be tracked to detect the end-of-archive. It occurs when both of
+     * them are empty (zeroed). Tracking this particular inter-block dependency
+     * is the responsibility of the rgw::tar::StatusIndicator class. */
+    boost::optional header;
+    std::tie(status, header) = rgw::tar::interpret_block(status, buffer);
+
+    if (!
status.empty() && header) {
+      /* This specific block isn't empty (entirely zeroed), so we can parse
+       * it as a TAR header and dispatch. At the moment we do support only
+       * regular files and directories. Everything else (symlinks, devices)
+       * will be ignored but won't cease the whole upload. */
+      switch (header->get_filetype()) {
+        case rgw::tar::FileType::NORMAL_FILE: {
+          ldpp_dout(this, 2) << "handling regular file" << dendl;
+
+          /* Own the name as a std::string. Binding a boost::string_ref to
+           * the result of `file_prefix + ...` would leave it pointing into
+           * a temporary that dies at the end of the declaration. */
+          const std::string filename = bucket_path.empty()
+            ? header->get_filename().to_string()
+            : file_prefix + header->get_filename().to_string();
+          auto body = AlignedStreamGetter(0, header->get_filesize(),
+                                          rgw::tar::BLOCK_SIZE, *stream);
+          op_ret = handle_file(filename,
+                               header->get_filesize(),
+                               body);
+          if (! op_ret) {
+            /* Only regular files count. */
+            num_created++;
+          } else {
+            failures.emplace_back(op_ret, filename);
+          }
+          break;
+        }
+        case rgw::tar::FileType::DIRECTORY: {
+          ldpp_dout(this, 2) << "handling regular directory" << dendl;
+
+          /* Both operands outlive this statement (header data and the
+           * bucket_path local), so a non-owning view is fine here. */
+          boost::string_ref dirname = bucket_path.empty() ? header->get_filename() : bucket_path;
+          op_ret = handle_dir(dirname);
+          if (op_ret < 0 && op_ret != -ERR_BUCKET_EXISTS) {
+            failures.emplace_back(op_ret, dirname.to_string());
+          }
+          break;
+        }
+        default: {
+          /* Not recognized. Skip. */
+          op_ret = 0;
+          break;
+        }
+      }
+
+      /* In case of any problems with sub-request authorization Swift simply
+       * terminates whole upload immediately. */
+      /* NOTE(review): boost::algorithm::contains() looks for the second range
+       * as a contiguous sub-sequence of the first. If terminal_errors holds
+       * more than one code, a one-element range can never match it -- confirm
+       * against the declaration of terminal_errors. */
+      if (boost::algorithm::contains(std::initializer_list{ op_ret },
+                                     terminal_errors)) {
+        ldpp_dout(this, 2) << "terminating due to ret=" << op_ret << dendl;
+        break;
+      }
+    } else {
+      ldpp_dout(this, 2) << "an empty block" << dendl;
+      op_ret = 0;
+    }
+
+    buffer.clear();
+  } while (!
status.eof());
+
+  return;
+}
+
+/* Consume whatever is left of the current entry, including the padding up
+ * to the next multiple of `alignment`, by reading it into a throw-away
+ * buffer. */
+RGWBulkUploadOp::AlignedStreamGetter::~AlignedStreamGetter()
+{
+  /* Bytes needed to round `length` up to a multiple of `alignment`. Written
+   * without negating `length`, so the result does not depend on unsigned
+   * wrap-around and is correct for any non-zero alignment. */
+  const size_t padding = (alignment - length % alignment) % alignment;
+  const size_t aligned_length = length + padding;
+  ceph::bufferlist junk;
+
+  DecoratedStreamGetter::get_exactly(aligned_length - position, junk);
+}
+
+/* Read up to `want` bytes into `dst`, never going past the declared `length`
+ * of the entry. `position` advances by the number of bytes obtained. */
+ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_at_most(const size_t want,
+                                                          ceph::bufferlist& dst)
+{
+  const size_t max_to_read = std::min(want, length - position);
+  const auto len = DecoratedStreamGetter::get_at_most(max_to_read, dst);
+  if (len > 0) {
+    position += len;
+  }
+  return len;
+}
+
+/* Forward to the decorated getter and advance `position` on success. */
+ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_exactly(const size_t want,
+                                                          ceph::bufferlist& dst)
+{
+  const auto len = DecoratedStreamGetter::get_exactly(want, dst);
+  if (len > 0) {
+    position += len;
+  }
+  return len;
+}
+
+int RGWSetAttrs::verify_permission()
+{
+  // This looks to be part of the RGW-NFS machinery and has no S3 or
+  // Swift equivalent.
+  bool perm;
+  if (!s->object.empty()) {
+    perm = verify_object_permission_no_policy(this, s, RGW_PERM_WRITE);
+  } else {
+    perm = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE);
+  }
+  if (!perm)
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWSetAttrs::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+/* Apply `attrs` to the object when the request names one, otherwise merge
+ * them into the bucket attributes and persist those. */
+void RGWSetAttrs::execute()
+{
+  op_ret = get_params();
+  if (op_ret < 0)
+    return;
+
+  rgw_obj obj(s->bucket, s->object);
+
+  if (!s->object.empty()) {
+    store->set_atomic(s->obj_ctx, obj);
+    op_ret = store->set_attrs(s->obj_ctx, s->bucket_info, obj, attrs, nullptr);
+  } else {
+    for (auto& iter : attrs) {
+      s->bucket_attrs[iter.first] = std::move(iter.second);
+    }
+    op_ret = rgw_bucket_set_attrs(store, s->bucket_info, s->bucket_attrs,
+                                  &s->bucket_info.objv_tracker);
+  }
+}
+
+void RGWGetObjLayout::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWGetObjLayout::execute()
+{
+  rgw_obj obj(s->bucket, s->object);
+  RGWRados::Object target(store,
+                          s->bucket_info,
+                          *static_cast(s->obj_ctx),
+                          rgw_obj(s->bucket, s->object));
+
RGWRados::Object::Read stat_op(&target); + + op_ret = stat_op.prepare(); + if (op_ret < 0) { + return; + } + + head_obj = stat_op.state.head_obj; + + op_ret = target.get_manifest(&manifest); +} + + +int RGWConfigBucketMetaSearch::verify_permission() +{ + if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) { + return -EACCES; + } + + return 0; +} + +void RGWConfigBucketMetaSearch::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWConfigBucketMetaSearch::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + ldpp_dout(this, 20) << "NOTICE: get_params() returned ret=" << op_ret << dendl; + return; + } + + s->bucket_info.mdsearch_config = mdsearch_config; + + op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), &s->bucket_attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + return; + } +} + +int RGWGetBucketMetaSearch::verify_permission() +{ + if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) { + return -EACCES; + } + + return 0; +} + +void RGWGetBucketMetaSearch::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWDelBucketMetaSearch::verify_permission() +{ + if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) { + return -EACCES; + } + + return 0; +} + +void RGWDelBucketMetaSearch::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWDelBucketMetaSearch::execute() +{ + s->bucket_info.mdsearch_config.clear(); + + op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), &s->bucket_attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name + << " returned err=" << op_ret << dendl; + return; + } +} + + +RGWHandler::~RGWHandler() +{ +} + +int RGWHandler::init(RGWRados *_store, + struct req_state *_s, + rgw::io::BasicClient *cio) +{ + store = _store; + s = _s; + + return 0; +} + +int RGWHandler::do_init_permissions() +{ + 
int ret = rgw_build_bucket_policies(store, s); + if (ret < 0) { + ldpp_dout(s, 10) << "init_permissions on " << s->bucket + << " failed, ret=" << ret << dendl; + return ret==-ENODATA ? -EACCES : ret; + } + + rgw_build_iam_environment(store, s); + return ret; +} + +int RGWHandler::do_read_permissions(RGWOp *op, bool only_bucket) +{ + if (only_bucket) { + /* already read bucket info */ + return 0; + } + int ret = rgw_build_object_policies(store, s, op->prefetch_data()); + + if (ret < 0) { + ldpp_dout(op, 10) << "read_permissions on " << s->bucket << ":" + << s->object << " only_bucket=" << only_bucket + << " ret=" << ret << dendl; + if (ret == -ENODATA) + ret = -EACCES; + if (s->auth.identity->is_anonymous() && ret == -EACCES) + ret = -EPERM; + } + + return ret; +} + +int RGWOp::error_handler(int err_no, string *error_content) { + return dialect_handler->error_handler(err_no, error_content); +} + +int RGWHandler::error_handler(int err_no, string *error_content) { + // This is the do-nothing error handler + return err_no; +} + +std::ostream& RGWOp::gen_prefix(std::ostream& out) const +{ + // append : to the prefix + return s->gen_prefix(out) << s->dialect << ':' << name() << ' '; +} + +void RGWDefaultResponseOp::send_response() { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWPutBucketPolicy::send_response() +{ + if (!op_ret) { + /* A successful Put Bucket Policy should return a 204 on success */ + op_ret = STATUS_NO_CONTENT; + } + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWPutBucketPolicy::verify_permission() +{ + if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPolicy)) { + return -EACCES; + } + + return 0; +} + +int RGWPutBucketPolicy::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + // At some point when I have more time I want to make a version of + // rgw_rest_read_all_input that doesn't use malloc. 
+  std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false);
+
+  // And throws exceptions.
+  return op_ret;
+}
+
+/* Read the policy document from the request, forward the request to the
+ * metadata master when this zone is not the master, then parse the document
+ * and store its text in the bucket's RGW_ATTR_IAM_POLICY attribute. A policy
+ * that fails to parse yields -EINVAL. */
+void RGWPutBucketPolicy::execute()
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!store->svc.zone->is_meta_master()) {
+    op_ret = forward_request_to_master(s, NULL, store, data, nullptr);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "forward_request_to_master returned ret=" << op_ret << dendl;
+      return;
+    }
+  }
+
+  try {
+    const Policy p(s->cct, s->bucket_tenant, data);
+    op_ret = retry_raced_bucket_write(store, s, [&p, this] {
+	auto attrs = s->bucket_attrs;
+	attrs[RGW_ATTR_IAM_POLICY].clear();
+	attrs[RGW_ATTR_IAM_POLICY].append(p.text);
+	op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs,
+				      &s->bucket_info.objv_tracker);
+	return op_ret;
+      });
+  } catch (rgw::IAM::PolicyParseException& e) {
+    ldpp_dout(this, 20) << "failed to parse policy: " << e.what() << dendl;
+    op_ret = -EINVAL;
+  }
+}
+
+void RGWGetBucketPolicy::send_response()
+{
+  if (op_ret) {
+    set_req_state_err(s, op_ret);
+  }
+  dump_errno(s);
+  end_header(s, this, "application/json");
+  dump_body(s, policy);
+}
+
+int RGWGetBucketPolicy::verify_permission()
+{
+  if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicy)) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+/* Load the bucket's IAM policy text into `policy`. A missing or empty
+ * RGW_ATTR_IAM_POLICY attribute is reported as -ERR_NO_SUCH_BUCKET_POLICY. */
+void RGWGetBucketPolicy::execute()
+{
+  /* Look the attribute up in place: copying the whole attribute map and
+   * indexing it a second time just to read one entry is unnecessary. */
+  auto aiter = s->bucket_attrs.find(RGW_ATTR_IAM_POLICY);
+  if (aiter == s->bucket_attrs.end()) {
+    ldpp_dout(this, 0) << "can't find bucket IAM POLICY attr bucket_name = "
+        << s->bucket_name << dendl;
+    op_ret = -ERR_NO_SUCH_BUCKET_POLICY;
+    s->err.message = "The bucket policy does not exist";
+    return;
+  }
+
+  policy = aiter->second;
+
+  if (policy.length() == 0) {
+    ldpp_dout(this, 10) << "The bucket policy does not exist, bucket: "
+        << s->bucket_name << dendl;
+    op_ret = -ERR_NO_SUCH_BUCKET_POLICY;
+    s->err.message = "The bucket policy does not exist";
+    return;
+  }
+}
+
+void
RGWDeleteBucketPolicy::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWDeleteBucketPolicy::verify_permission() +{ + if (!verify_bucket_permission(this, s, rgw::IAM::s3DeleteBucketPolicy)) { + return -EACCES; + } + + return 0; +} + +void RGWDeleteBucketPolicy::execute() +{ + op_ret = retry_raced_bucket_write(store, s, [this] { + auto attrs = s->bucket_attrs; + attrs.erase(RGW_ATTR_IAM_POLICY); + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, + &s->bucket_info.objv_tracker); + return op_ret; + }); +} + +void RGWPutBucketObjectLock::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWPutBucketObjectLock::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketObjectLockConfiguration); +} + +void RGWPutBucketObjectLock::execute() +{ + if (!s->bucket_info.obj_lock_enabled()) { + ldpp_dout(this, 0) << "ERROR: object Lock configuration cannot be enabled on existing buckets" << dendl; + op_ret = -ERR_INVALID_BUCKET_STATE; + return; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + op_ret = get_params(); + if (op_ret < 0) { + return; + } + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("ObjectLockConfiguration", obj_lock, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldout(s->cct, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + if (obj_lock.has_rule() && !obj_lock.retention_period_valid()) { + ldpp_dout(this, 0) << "ERROR: retention period must be a positive integer value" << dendl; + op_ret = -ERR_INVALID_RETENTION_PERIOD; + return; + } + + if (!store->svc.zone->is_meta_master()) { + op_ret = forward_request_to_master(s, NULL, store, data, nullptr); + if (op_ret < 0) { + ldout(s->cct, 20) << 
__func__ << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + } + + op_ret = retry_raced_bucket_write(store, s, [this] { + s->bucket_info.obj_lock = obj_lock; + op_ret = store->put_bucket_instance_info(s->bucket_info, false, + real_time(), &s->bucket_attrs); + return op_ret; + }); + return; +} + +void RGWGetBucketObjectLock::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +int RGWGetBucketObjectLock::verify_permission() +{ + return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketObjectLockConfiguration); +} + +void RGWGetBucketObjectLock::execute() +{ + if (!s->bucket_info.obj_lock_enabled()) { + op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION; + return; + } +} + +int RGWPutObjRetention::verify_permission() +{ + if (!verify_object_permission(this, s, rgw::IAM::s3PutObjectRetention)) { + return -EACCES; + } + op_ret = get_params(); + if (op_ret) { + return op_ret; + } + if (bypass_governance_mode) { + bypass_perm = verify_object_permission(this, s, rgw::IAM::s3BypassGovernanceRetention); + } + return 0; +} + +void RGWPutObjRetention::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutObjRetention::execute() +{ + if (!s->bucket_info.obj_lock_enabled()) { + ldpp_dout(this, 0) << "ERROR: object retention can't be set if bucket object lock not configured" << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("Retention", obj_retention, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldpp_dout(this, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + + if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) < ceph_clock_now()) { + ldpp_dout(this, 0) << "ERROR: 
the retain until date must be in the future" << dendl; + op_ret = -EINVAL; + return; + } + bufferlist bl; + obj_retention.encode(bl); + rgw_obj obj(s->bucket, s->object); + + //check old retention + map attrs; + op_ret = get_obj_attrs(store, s, obj, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: get obj attr error"<< dendl; + return; + } + auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (aiter != attrs.end()) { + RGWObjectRetention old_obj_retention; + try { + decode(old_obj_retention, aiter->second); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + op_ret = -EIO; + return; + } + if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) < ceph::real_clock::to_time_t(old_obj_retention.get_retain_until_date())) { + if (old_obj_retention.get_mode().compare("GOVERNANCE") != 0 || !bypass_perm || !bypass_governance_mode) { + op_ret = -EACCES; + return; + } + } + } + + op_ret = modify_obj_attr(store, s, obj, RGW_ATTR_OBJECT_RETENTION, bl); + + return; +} + +int RGWGetObjRetention::verify_permission() +{ + if (!verify_object_permission(this, s, rgw::IAM::s3GetObjectRetention)) { + return -EACCES; + } + return 0; +} + +void RGWGetObjRetention::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjRetention::execute() +{ + if (!s->bucket_info.obj_lock_enabled()) { + ldpp_dout(this, 0) << "ERROR: bucket object lock not configured" << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + rgw_obj obj(s->bucket, s->object); + map attrs; + op_ret = get_obj_attrs(store, s, obj, attrs); + if (op_ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << obj + << " ret=" << op_ret << dendl; + return; + } + auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (aiter == attrs.end()) { + op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION; + return; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + obj_retention.decode(iter); + } catch 
(const buffer::error& e) { + ldout(s->cct, 0) << __func__ << "decode object retention config failed" << dendl; + op_ret = -EIO; + return; + } + return; +} + +int RGWPutObjLegalHold::verify_permission() +{ + if (!verify_object_permission(this, s, rgw::IAM::s3PutObjectLegalHold)) { + return -EACCES; + } + return 0; +} + +void RGWPutObjLegalHold::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWPutObjLegalHold::execute() { + if (!s->bucket_info.obj_lock_enabled()) { + ldpp_dout(this, 0) << "ERROR: object legal hold can't be set if bucket object lock not configured" << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl; + op_ret = -EINVAL; + return; + } + + op_ret = get_params(); + if (op_ret < 0) + return; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + op_ret = -ERR_MALFORMED_XML; + return; + } + + try { + RGWXMLDecoder::decode_xml("LegalHold", obj_legal_hold, &parser, true); + } catch (RGWXMLDecoder::err &err) { + ldout(s->cct, 5) << "unexpected xml:" << err << dendl; + op_ret = -ERR_MALFORMED_XML; + return; + } + bufferlist bl; + obj_legal_hold.encode(bl); + rgw_obj obj(s->bucket, s->object); + //if instance is empty, we should modify the latest object + op_ret = modify_obj_attr(store, s, obj, RGW_ATTR_OBJECT_LEGAL_HOLD, bl); + return; +} + +int RGWGetObjLegalHold::verify_permission() +{ + if (!verify_object_permission(this, s, rgw::IAM::s3GetObjectLegalHold)) { + return -EACCES; + } + return 0; +} + +void RGWGetObjLegalHold::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjLegalHold::execute() +{ + if (!s->bucket_info.obj_lock_enabled()) { + ldpp_dout(this, 0) << "ERROR: bucket object lock not configured" << dendl; + op_ret = -ERR_INVALID_REQUEST; + return; + } + rgw_obj obj(s->bucket, s->object); + map attrs; + op_ret = get_obj_attrs(store, s, obj, attrs); + if (op_ret < 0) { + 
ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << obj + << " ret=" << op_ret << dendl; + return; + } + auto aiter = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (aiter == attrs.end()) { + op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION; + return; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + obj_legal_hold.decode(iter); + } catch (const buffer::error& e) { + ldout(s->cct, 0) << __func__ << "decode object legal hold config failed" << dendl; + op_ret = -EIO; + return; + } + return; +} + +void RGWGetClusterStat::execute() +{ + op_ret = this->store->get_rados_handle()->cluster_stat(stats_op); +} + + diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h new file mode 100644 index 00000000..e76b1258 --- /dev/null +++ b/src/rgw/rgw_op.h @@ -0,0 +1,2346 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/** + * All operations via the rados gateway are carried out by + * small classes known as RGWOps. This class contains a req_state + * and each possible command is a subclass of this with a defined + * execute() method that does whatever the subclass name implies. + * These subclasses must be further subclassed (by interface type) + * to provide additional virtual methods such as send_response or get_params. 
+ */ + +#ifndef CEPH_RGW_OP_H +#define CEPH_RGW_OP_H + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "common/armor.h" +#include "common/mime.h" +#include "common/utf8.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" + +#include "rgw_common.h" +#include "rgw_dmclock.h" +#include "rgw_rados.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_acl.h" +#include "rgw_cors.h" +#include "rgw_quota.h" +#include "rgw_putobj.h" + +#include "rgw_lc.h" +#include "rgw_torrent.h" +#include "rgw_tag.h" +#include "rgw_object_lock.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/rgw/cls_rgw_client.h" + +#include "services/svc_sys_obj.h" + +#include "include/ceph_assert.h" + +using ceph::crypto::SHA1; + +struct req_state; +class RGWOp; + + +namespace rgw { +namespace auth { +namespace registry { + +class StrategyRegistry; + +} +} +} + +int rgw_op_get_bucket_policy_from_attr(CephContext *cct, + RGWRados *store, + RGWBucketInfo& bucket_info, + map& bucket_attrs, + RGWAccessControlPolicy *policy); + +class RGWHandler { +protected: + RGWRados* store; + struct req_state* s; + + int do_init_permissions(); + int do_read_permissions(RGWOp* op, bool only_bucket); + +public: + RGWHandler() + : store(nullptr), + s(nullptr) { + } + virtual ~RGWHandler(); + + virtual int init(RGWRados* store, + struct req_state* _s, + rgw::io::BasicClient* cio); + + virtual int init_permissions(RGWOp*) { + return 0; + } + + virtual int retarget(RGWOp* op, RGWOp** new_op) { + *new_op = op; + return 0; + } + + virtual int read_permissions(RGWOp* op) = 0; + virtual int authorize(const DoutPrefixProvider* dpp) = 0; + virtual int postauth_init() = 0; + virtual int error_handler(int err_no, std::string* error_content); + virtual void dump(const string& code, const string& message) const {} + + virtual bool supports_quota() { + return true; + } +}; + + + +void rgw_bucket_object_pre_exec(struct req_state *s); + 
+namespace dmc = rgw::dmclock; + +/** + * Provide the base class for all ops. + */ +class RGWOp : public DoutPrefixProvider { +protected: + struct req_state *s; + RGWHandler *dialect_handler; + RGWRados *store; + RGWCORSConfiguration bucket_cors; + bool cors_exist; + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + int op_ret; + int do_aws4_auth_completion(); + + virtual int init_quota(); + +public: + RGWOp() + : s(nullptr), + dialect_handler(nullptr), + store(nullptr), + cors_exist(false), + op_ret(0) { + } + + virtual ~RGWOp() = default; + + int get_ret() const { return op_ret; } + + virtual int init_processing() { + if (dialect_handler->supports_quota()) { + op_ret = init_quota(); + if (op_ret < 0) + return op_ret; + } + + return 0; + } + + virtual void init(RGWRados *store, struct req_state *s, RGWHandler *dialect_handler) { + this->store = store; + this->s = s; + this->dialect_handler = dialect_handler; + } + int read_bucket_cors(); + bool generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age); + + virtual int verify_params() { return 0; } + virtual bool prefetch_data() { return false; } + + /* Authenticate requester -- verify its identity. + * + * NOTE: typically the procedure is common across all operations of the same + * dialect (S3, Swift API). However, there are significant exceptions in + * both APIs: browser uploads, /info and OPTIONS handlers. All of them use + * different, specific authentication schema driving the need for per-op + * authentication. The alternative is to duplicate parts of the method- + * dispatch logic in RGWHandler::authorize() and pollute it with a lot + * of special cases. */ + virtual int verify_requester(const rgw::auth::StrategyRegistry& auth_registry) { + /* TODO(rzarzynski): rename RGWHandler::authorize to generic_authenticate. 
*/ + return dialect_handler->authorize(this); + } + virtual int verify_permission() = 0; + virtual int verify_op_mask(); + virtual void pre_exec() {} + virtual void execute() = 0; + virtual void send_response() {} + virtual void complete() { + send_response(); + } + virtual const char* name() const = 0; + virtual RGWOpType get_type() { return RGW_OP_UNKNOWN; } + + virtual uint32_t op_mask() { return 0; } + + virtual int error_handler(int err_no, string *error_content); + + // implements DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const override; + CephContext* get_cct() const override { return s->cct; } + unsigned get_subsys() const override { return ceph_subsys_rgw; } + + virtual dmc::client_id dmclock_client() { return dmc::client_id::metadata; } + virtual dmc::Cost dmclock_cost() { return 1; } +}; + +class RGWDefaultResponseOp : public RGWOp { +public: + void send_response() override; +}; + +class RGWGetObj_Filter : public RGWGetDataCB +{ +protected: + RGWGetObj_Filter *next{nullptr}; +public: + RGWGetObj_Filter() {} + explicit RGWGetObj_Filter(RGWGetObj_Filter *next): next(next) {} + ~RGWGetObj_Filter() override {} + /** + * Passes data through filter. + * Filter can modify content of bl. + * When bl_len == 0 , it means 'flush + */ + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override { + if (next) { + return next->handle_data(bl, bl_ofs, bl_len); + } + return 0; + } + /** + * Flushes any cached data. Used by RGWGetObjFilter. + * Return logic same as handle_data. 
+   */
+  virtual int flush() {
+    if (next) {
+      return next->flush();
+    }
+    return 0;
+  }
+  /**
+   * Allows filter to extend range required for successful filtering
+   */
+  virtual int fixup_range(off_t& ofs, off_t& end) {
+    if (next) {
+      return next->fixup_range(ofs, end);
+    }
+    return 0;
+  }
+};
+
+/**
+ * Base operation for reading an object. Dialect-specific subclasses supply
+ * get_params() and the send_response_data*() hooks.
+ */
+class RGWGetObj : public RGWOp {
+protected:
+  seed torrent; // get torrent
+  const char *range_str;
+  const char *if_mod;
+  const char *if_unmod;
+  const char *if_match;
+  const char *if_nomatch;
+  uint32_t mod_zone_id;
+  uint64_t mod_pg_ver;
+  off_t ofs;
+  uint64_t total_len;
+  off_t start;
+  off_t end;
+  ceph::real_time mod_time;
+  ceph::real_time lastmod;
+  ceph::real_time unmod_time;
+  ceph::real_time *mod_ptr;
+  ceph::real_time *unmod_ptr;
+  map attrs;
+  bool get_data;
+  bool partial_content;
+  bool ignore_invalid_range;
+  bool range_parsed;
+  bool skip_manifest;
+  bool skip_decrypt{false};
+  rgw_obj obj;
+  utime_t gc_invalidate_time;
+  bool is_slo;
+  string lo_etag;
+  bool rgwx_stat; /* extended rgw stat operation */
+  string version_id;
+
+  // compression attrs
+  RGWCompressionInfo cs_info;
+  off_t first_block, last_block;
+  off_t q_ofs, q_len;
+  bool first_data;
+  uint64_t cur_ofs;
+  bufferlist waiting;
+  uint64_t action = 0;
+
+  bool get_retention;
+  bool get_legal_hold;
+
+  int init_common();
+public:
+  /* Put every scalar member into a defined state. */
+  RGWGetObj() {
+    range_str = nullptr;
+    if_mod = nullptr;
+    if_unmod = nullptr;
+    if_match = nullptr;
+    if_nomatch = nullptr;
+    mod_zone_id = 0;
+    mod_pg_ver = 0;
+    start = 0;
+    ofs = 0;
+    total_len = 0;
+    end = -1;
+    mod_ptr = nullptr;
+    unmod_ptr = nullptr;
+    get_data = false;
+    partial_content = false;
+    /* These two flags were the only bools left without an initial value;
+     * reading them before a subclass assigns them would be undefined. */
+    ignore_invalid_range = false;
+    rgwx_stat = false;
+    range_parsed = false;
+    skip_manifest = false;
+    is_slo = false;
+    first_block = 0;
+    last_block = 0;
+    q_ofs = 0;
+    q_len = 0;
+    first_data = true;
+    cur_ofs = 0;
+    get_retention = false;
+    get_legal_hold = false;
+  }
+
+  bool prefetch_data() override;
+
+  void set_get_data(bool get_data) {
+    this->get_data = get_data;
+  }
+
+  int verify_permission() override;
+  void pre_exec()
override; + void execute() override; + int parse_range(); + int read_user_manifest_part( + rgw_bucket& bucket, + const rgw_bucket_dir_entry& ent, + RGWAccessControlPolicy * const bucket_acl, + const boost::optional& bucket_policy, + const off_t start_ofs, + const off_t end_ofs, + bool swift_slo); + int handle_user_manifest(const char *prefix); + int handle_slo_manifest(bufferlist& bl); + + int get_data_cb(bufferlist& bl, off_t ofs, off_t len); + + virtual int get_params() = 0; + virtual int send_response_data_error() = 0; + virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) = 0; + + const char* name() const override { return "get_obj"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + virtual bool need_object_expiration() { return false; } + /** + * calculates filter used to decrypt RGW objects data + */ + virtual int get_decrypt_filter(std::unique_ptr* filter, RGWGetObj_Filter* cb, bufferlist* manifest_bl) { + *filter = nullptr; + return 0; + } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +class RGWGetObj_CB : public RGWGetObj_Filter +{ + RGWGetObj *op; +public: + explicit RGWGetObj_CB(RGWGetObj *_op) : op(_op) {} + ~RGWGetObj_CB() override {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + +class RGWGetObjTags : public RGWOp { + protected: + bufferlist tags_bl; + bool has_tags{false}; + public: + int verify_permission() override; + void execute() override; + void pre_exec() override; + + virtual void send_response_data(bufferlist& bl) = 0; + const char* name() const override { return "get_obj_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_TAGGING; } + +}; + +class RGWPutObjTags : public RGWOp { + protected: + bufferlist tags_bl; + public: + int verify_permission() override; 
+ void execute() override; + + virtual void send_response() override = 0; + virtual int get_params() = 0; + const char* name() const override { return "put_obj_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ_TAGGING; } + +}; + +/* Delete-type op reporting RGW_OP_DELETE_OBJ_TAGGING; it declares no get_params() or send_response() of its own. */ class RGWDeleteObjTags: public RGWOp { + public: + void pre_exec() override; + int verify_permission() override; + void execute() override; + + const char* name() const override { return "delete_obj_tags"; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + RGWOpType get_type() override { return RGW_OP_DELETE_OBJ_TAGGING;} +}; + +/* Abstract delete-type op reporting RGW_OP_BULK_DELETE: get_data() and send_response() are pure virtual. The nested Deleter keeps the num_deleted / num_unfound counters (both start at 0) and the failures list. */ class RGWBulkDelete : public RGWOp { +public: + struct acct_path_t { + std::string bucket_name; + rgw_obj_key obj_key; + }; + + struct fail_desc_t { + int err; + acct_path_t path; + }; + + class Deleter { + protected: + const DoutPrefixProvider * dpp; + unsigned int num_deleted; + unsigned int num_unfound; + std::list failures; + + RGWRados * const store; + req_state * const s; + + public: + Deleter(const DoutPrefixProvider* dpp, RGWRados * const str, req_state * const s) + : dpp(dpp), + num_deleted(0), + num_unfound(0), + store(str), + s(s) { + } + + unsigned int get_num_deleted() const { + return num_deleted; + } + + unsigned int get_num_unfound() const { + return num_unfound; + } + + /* NOTE(review): returns the list by value, so every call copies it - confirm a const reference was not intended. */ const std::list get_failures() const { + return failures; + } + + bool verify_permission(RGWBucketInfo& binfo, + map& battrs, + ACLOwner& bucket_owner /* out */); + bool delete_single(const acct_path_t& path); + bool delete_chunk(const std::list& paths); + }; + /* End of Deleter subclass */ + + static const size_t MAX_CHUNK_ENTRIES = 1024; + +protected: + std::unique_ptr deleter; + +public: + RGWBulkDelete() + : deleter(nullptr) { + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_data(std::list& items, + bool * is_truncated) = 0; + void send_response() override = 
0; + + const char* name() const override { return "bulk_delete"; } + RGWOpType get_type() override { return RGW_OP_BULK_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +/* Streams an acct_path_t as bucket_name, a "/" separator, then obj_key. */ inline ostream& operator<<(ostream& out, const RGWBulkDelete::acct_path_t &o) { + return out << o.bucket_name << "/" << o.obj_key; +} + + +/* Abstract write-type op reporting RGW_OP_BULK_UPLOAD: create_stream() and send_response() are pure virtual. It keeps a failures vector and a num_created counter that the constructor sets to 0; terminal_errors lists -EACCES and -EPERM. */ class RGWBulkUploadOp : public RGWOp { + boost::optional dir_ctx; + +protected: + class fail_desc_t { + public: + fail_desc_t(const int err, std::string path) + : err(err), + path(std::move(path)) { + } + + const int err; + const std::string path; + }; + + static constexpr std::array terminal_errors = { + { -EACCES, -EPERM } + }; + + /* FIXME: boost::container::small_vector failures; */ + std::vector failures; + size_t num_created; + + class StreamGetter; + class DecoratedStreamGetter; + class AlignedStreamGetter; + + virtual std::unique_ptr create_stream() = 0; + virtual void send_response() override = 0; + + boost::optional> + parse_path(const boost::string_ref& path); + + std::pair + handle_upload_path(struct req_state *s); + + bool handle_file_verify_permission(RGWBucketInfo& binfo, + const rgw_obj& obj, + std::map& battrs, + ACLOwner& bucket_owner /* out */); + int handle_file(boost::string_ref path, + size_t size, + AlignedStreamGetter& body); + + int handle_dir_verify_permission(); + int handle_dir(boost::string_ref path); + +public: + RGWBulkUploadOp() + : num_created(0) { + } + + void init(RGWRados* const store, + struct req_state* const s, + RGWHandler* const h) override; + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + const char* name() const override { return "bulk_upload"; } + + RGWOpType get_type() override { + return RGW_OP_BULK_UPLOAD; + } + + uint32_t op_mask() override { + return RGW_OP_TYPE_WRITE; + } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; /* 
RGWBulkUploadOp */ + + +/* Pure interface: both get_at_most() and get_exactly() take a wanted size and a destination bufferlist and return ssize_t. */ class RGWBulkUploadOp::StreamGetter { +public: + StreamGetter() = default; + virtual ~StreamGetter() = default; + + virtual ssize_t get_at_most(size_t want, ceph::bufferlist& dst) = 0; + virtual ssize_t get_exactly(size_t want, ceph::bufferlist& dst) = 0; +}; /* End of nested subclass StreamGetter */ + + +/* Holds a reference to another StreamGetter and forwards get_at_most() / get_exactly() to it unchanged. */ class RGWBulkUploadOp::DecoratedStreamGetter : public StreamGetter { + StreamGetter& decoratee; + +protected: + StreamGetter& get_decoratee() { + return decoratee; + } + +public: + explicit DecoratedStreamGetter(StreamGetter& decoratee) + : decoratee(decoratee) { + } + virtual ~DecoratedStreamGetter() = default; + + ssize_t get_at_most(const size_t want, ceph::bufferlist& dst) override { + return get_decoratee().get_at_most(want, dst); + } + + ssize_t get_exactly(const size_t want, ceph::bufferlist& dst) override { + return get_decoratee().get_exactly(want, dst); + } +}; /* RGWBulkUploadOp::DecoratedStreamGetter */ + + +/* Adds position, length and alignment fields on top of the decorated getter; the destructor and both get_* overrides are only declared here. NOTE(review): the parameter list after "template" (and the argument of std::forward) is missing in this text - confirm against the upstream header. */ class RGWBulkUploadOp::AlignedStreamGetter + : public RGWBulkUploadOp::DecoratedStreamGetter { + size_t position; + size_t length; + size_t alignment; + +public: + template + AlignedStreamGetter(const size_t position, + const size_t length, + const size_t alignment, + U&& decoratee) + : DecoratedStreamGetter(std::forward(decoratee)), + position(position), + length(length), + alignment(alignment) { + } + virtual ~AlignedStreamGetter(); + ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override; + ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override; +}; /* RGWBulkUploadOp::AlignedStreamGetter */ + + +/* Four 64-bit counters, all default-initialised to 0. */ struct RGWUsageStats { + uint64_t bytes_used = 0; + uint64_t bytes_used_rounded = 0; + uint64_t buckets_count = 0; + uint64_t objects_count = 0; +}; + +#define RGW_LIST_BUCKETS_LIMIT_MAX 10000 + +/* Abstract read-type op reporting RGW_OP_LIST_BUCKETS. limit and limit_max both start at RGW_LIST_BUCKETS_LIMIT_MAX; send_response() is an empty override and the response is produced through the send_response_begin/data/end hooks instead. */ class RGWListBuckets : public RGWOp { +protected: + bool sent_data; + std::string marker; + std::string end_marker; + int64_t limit; + uint64_t limit_max; + std::map attrs; + bool is_truncated; + + RGWUsageStats 
global_stats; + std::map policies_stats; + + virtual uint64_t get_default_max() const { + return 1000; + } + +public: + RGWListBuckets() + : sent_data(false), + limit(RGW_LIST_BUCKETS_LIMIT_MAX), + limit_max(RGW_LIST_BUCKETS_LIMIT_MAX), + is_truncated(false) { + } + + int verify_permission() override; + void execute() override; + + virtual int get_params() = 0; + virtual void handle_listing_chunk(RGWUserBuckets&& buckets) { + /* The default implementation, used by e.g. S3, just generates a new + * part of listing and sends it client immediately. Swift can behave + * differently: when the reverse option is requested, all incoming + * instances of RGWUserBuckets are buffered and finally reversed. */ + return send_response_data(buckets); + } + virtual void send_response_begin(bool has_buckets) = 0; + virtual void send_response_data(RGWUserBuckets& buckets) = 0; + virtual void send_response_end() = 0; + void send_response() override {} + + virtual bool should_get_stats() { return false; } + virtual bool supports_account_metadata() { return false; } + + const char* name() const override { return "list_buckets"; } + RGWOpType get_type() override { return RGW_OP_LIST_BUCKETS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; // class RGWListBuckets + +/* Read-type op named "get_usage". show_log_entries and show_log_sum are ints initialised from true; send_response() is an empty override and the class does not override get_type(). */ class RGWGetUsage : public RGWOp { +protected: + bool sent_data; + string start_date; + string end_date; + int show_log_entries; + int show_log_sum; + map categories; + map usage; + map summary_map; + map buckets_usage; + cls_user_header header; +public: + RGWGetUsage() : sent_data(false), show_log_entries(true), show_log_sum(true){ + } + + int verify_permission() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override {} + + virtual bool should_get_stats() { return false; } + + const char* name() const override { return "get_usage"; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Read-type op reporting RGW_OP_STAT_ACCOUNT; send_response() is pure virtual. */ class RGWStatAccount : public RGWOp { +protected: + 
RGWUsageStats global_stats; + std::map policies_stats; + +public: + RGWStatAccount() = default; + + int verify_permission() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "stat_account"; } + RGWOpType get_type() override { return RGW_OP_STAT_ACCOUNT; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Abstract read-type op reporting RGW_OP_LIST_BUCKET. The constructor zeroes max and default_max, clears the three boolean flags and sets shard_id to -1; get_params() and send_response() are pure virtual. */ class RGWListBucket : public RGWOp { +protected: + RGWBucketEnt bucket; + string prefix; + rgw_obj_key marker; + rgw_obj_key next_marker; + rgw_obj_key end_marker; + string max_keys; + string delimiter; + string encoding_type; + bool list_versions; + int max; + vector objs; + map common_prefixes; + + int default_max; + bool is_truncated; + bool allow_unordered; + + int shard_id; + + int parse_max_keys(); + +public: + RGWListBucket() : list_versions(false), max(0), + default_max(0), is_truncated(false), + allow_unordered(false), shard_id(-1) {} + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "list_bucket"; } + RGWOpType get_type() override { return RGW_OP_LIST_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } + virtual bool need_container_stats() { return false; } +}; + +/* Read-type op reporting RGW_OP_GET_BUCKET_LOGGING; execute() is an empty override, so only send_response() is left to subclasses. */ class RGWGetBucketLogging : public RGWOp { +public: + RGWGetBucketLogging() {} + int verify_permission() override; + void execute() override { } + + void send_response() override = 0; + const char* name() const override { return "get_bucket_logging"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOGGING; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Read-type op reporting RGW_OP_GET_BUCKET_LOCATION; execute() is an empty override and send_response() is pure virtual. */ class RGWGetBucketLocation : public RGWOp { +public: + RGWGetBucketLocation() {} + ~RGWGetBucketLocation() override {} + int verify_permission() override; + void execute() override { } + + void send_response() override = 0; + const char* name() const 
override { return "get_bucket_location"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOCATION; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Read-type op reporting RGW_OP_GET_BUCKET_VERSIONING; its three result flags all start false and send_response() is pure virtual. */ class RGWGetBucketVersioning : public RGWOp { +protected: + bool versioned{false}; + bool versioning_enabled{false}; + bool mfa_enabled{false}; +public: + RGWGetBucketVersioning() = default; + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "get_bucket_versioning"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_VERSIONING; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Plain (unscoped) enum; RGWSetBucketVersioning stores VersioningNotChanged into its int versioning_status by default. */ enum BucketVersionStatus { + VersioningStatusInvalid = -1, + VersioningNotChanged = 0, + VersioningEnabled = 1, + VersioningSuspended =2, +}; + +/* Write-type op reporting RGW_OP_SET_BUCKET_VERSIONING; get_params() has a default that returns 0 and send_response() is pure virtual. */ class RGWSetBucketVersioning : public RGWOp { +protected: + int versioning_status; + bool mfa_set_status{false}; + bool mfa_status{false}; + bufferlist in_data; +public: + RGWSetBucketVersioning() : versioning_status(VersioningNotChanged) {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() { return 0; } + + void send_response() override = 0; + const char* name() const override { return "set_bucket_versioning"; } + RGWOpType get_type() override { return RGW_OP_SET_BUCKET_VERSIONING; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Read-type op reporting RGW_OP_GET_BUCKET_WEBSITE; send_response() is pure virtual. */ class RGWGetBucketWebsite : public RGWOp { +public: + RGWGetBucketWebsite() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "get_bucket_website"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_WEBSITE; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Write-type op reporting RGW_OP_SET_BUCKET_WEBSITE; get_params() defaults to returning 0. */ class RGWSetBucketWebsite : public RGWOp { +protected: + bufferlist in_data; + 
RGWBucketWebsiteConf website_conf; +public: + RGWSetBucketWebsite() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() { return 0; } + + void send_response() override = 0; + const char* name() const override { return "set_bucket_website"; } + RGWOpType get_type() override { return RGW_OP_SET_BUCKET_WEBSITE; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Write-type op named "delete_bucket_website"; send_response() is pure virtual. */ class RGWDeleteBucketWebsite : public RGWOp { +public: + RGWDeleteBucketWebsite() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "delete_bucket_website"; } + /* NOTE(review): a delete op that reports RGW_OP_SET_BUCKET_WEBSITE, the same type as RGWSetBucketWebsite - confirm this is intended and not a copy-paste slip. */ RGWOpType get_type() override { return RGW_OP_SET_BUCKET_WEBSITE; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Read-type op reporting RGW_OP_STAT_BUCKET; send_response() is pure virtual. */ class RGWStatBucket : public RGWOp { +protected: + RGWBucketEnt bucket; + +public: + RGWStatBucket() {} + ~RGWStatBucket() override {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "stat_bucket"; } + RGWOpType get_type() override { return RGW_OP_STAT_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Write-type op reporting RGW_OP_CREATE_BUCKET. The constructor clears has_cors, relaxed_region_enforcement and obj_lock_enabled; init() passes s->cct to policy and then reloads relaxed_region_enforcement from the rgw_relaxed_region_enforcement config value. */ class RGWCreateBucket : public RGWOp { +protected: + RGWAccessControlPolicy policy; + string location_constraint; + rgw_placement_rule placement_rule; + RGWBucketInfo info; + obj_version ep_objv; + bool has_cors; + bool relaxed_region_enforcement; + bool obj_lock_enabled; + RGWCORSConfiguration cors_config; + boost::optional swift_ver_location; + map attrs; + set rmattr_names; + + bufferlist in_data; + + virtual bool need_metadata_upload() const { return false; } + +public: + RGWCreateBucket() : has_cors(false), relaxed_region_enforcement(false), obj_lock_enabled(false) {} + + void emplace_attr(std::string&& key, buffer::list&& bl) { + 
attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + relaxed_region_enforcement = + s->cct->_conf.get_val("rgw_relaxed_region_enforcement"); + } + virtual int get_params() { return 0; } + void send_response() override = 0; + const char* name() const override { return "create_bucket"; } + RGWOpType get_type() override { return RGW_OP_CREATE_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Delete-type op reporting RGW_OP_DELETE_BUCKET; send_response() is pure virtual. */ class RGWDeleteBucket : public RGWOp { +protected: + RGWObjVersionTracker objv_tracker; + +public: + RGWDeleteBucket() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "delete_bucket"; } + RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +/* One entry with path, etag and size_bytes (starts at 0). encode() writes version 1 with the fields in that order and decode() reads them back in the same order; decode_json() is only declared here. */ struct rgw_slo_entry { + string path; + string etag; + uint64_t size_bytes; + + rgw_slo_entry() : size_bytes(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(path, bl); + encode(etag, bl); + encode(size_bytes, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(path, bl); + decode(etag, bl); + decode(size_bytes, bl); + DECODE_FINISH(bl); + } + + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_slo_entry) + +/* Only entries and total_size go through encode() / decode(); raw_data is not serialised by either. */ struct RGWSLOInfo { + vector entries; + uint64_t total_size; + + /* in memory only */ + bufferlist raw_data; + + RGWSLOInfo() : total_size(0) {} + ~RGWSLOInfo() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(entries, bl); + encode(total_size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + 
DECODE_START(1, bl); + decode(entries, bl); + decode(total_size, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(RGWSLOInfo) + +/* Abstract write-type op reporting RGW_OP_PUT_OBJ. slo_info, obj_retention and obj_legal_hold are raw pointers that start null and are deleted in the destructor. NOTE(review): no copy or move operations are declared next to that destructor - confirm instances are never copied, or a double delete is possible. */ class RGWPutObj : public RGWOp { +protected: + seed torrent; + off_t ofs; + const char *supplied_md5_b64; + const char *supplied_etag; + const char *if_match; + const char *if_nomatch; + std::string copy_source; + const char *copy_source_range; + RGWBucketInfo copy_source_bucket_info; + string copy_source_tenant_name; + string copy_source_bucket_name; + string copy_source_object_name; + string copy_source_version_id; + off_t copy_source_range_fst; + off_t copy_source_range_lst; + string etag; + bool chunked_upload; + RGWAccessControlPolicy policy; + std::unique_ptr obj_tags; + const char *dlo_manifest; + RGWSLOInfo *slo_info; + map attrs; + ceph::real_time mtime; + uint64_t olh_epoch; + string version_id; + bufferlist bl_aux; + map crypt_http_responses; + string user_data; + + std::string multipart_upload_id; + std::string multipart_part_str; + int multipart_part_num = 0; + + boost::optional delete_at; + //append obj + bool append; + uint64_t position; + uint64_t cur_accounted_size; + + //object lock + RGWObjectRetention *obj_retention; + RGWObjectLegalHold *obj_legal_hold; + +public: + RGWPutObj() : ofs(0), + supplied_md5_b64(NULL), + supplied_etag(NULL), + if_match(NULL), + if_nomatch(NULL), + copy_source_range(NULL), + copy_source_range_fst(0), + copy_source_range_lst(0), + chunked_upload(0), + dlo_manifest(NULL), + slo_info(NULL), + olh_epoch(0), + append(false), + position(0), + cur_accounted_size(0), + obj_retention(nullptr), + obj_legal_hold(nullptr) {} + + ~RGWPutObj() override { + delete slo_info; + delete obj_retention; + delete obj_legal_hold; + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + } + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ 
 + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + /* this is for cases when copying data from other object */ + virtual int get_decrypt_filter(std::unique_ptr* filter, + RGWGetObj_Filter* cb, + map& attrs, + bufferlist* manifest_bl) { + *filter = nullptr; + return 0; + } + /* NOTE(review): unlike get_decrypt_filter(), this default returns 0 without writing anything to *filter - callers presumably rely on the pointer being pre-initialised; confirm. */ virtual int get_encrypt_filter(std::unique_ptr *filter, + rgw::putobj::DataProcessor *cb) { + return 0; + } + + int get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len); + int get_data(const off_t fst, const off_t lst, bufferlist& bl); + + virtual int get_params() = 0; + virtual int get_data(bufferlist& bl) = 0; + void send_response() override = 0; + const char* name() const override { return "put_obj"; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +/* Abstract write-type op reporting RGW_OP_POST_OBJ. min_len starts at 0, max_len at LLONG_MAX, len and ofs at 0; get_params(), get_data() and send_response() are pure virtual, and the default get_encrypt_filter() returns 0 without touching *filter. */ class RGWPostObj : public RGWOp { +protected: + off_t min_len; + off_t max_len; + int len; + off_t ofs; + const char *supplied_md5_b64; + const char *supplied_etag; + string etag; + RGWAccessControlPolicy policy; + map attrs; + boost::optional delete_at; + + /* Must be called after get_data() or the result is undefined. 
*/ + virtual std::string get_current_filename() const = 0; + virtual std::string get_current_content_type() const = 0; + virtual bool is_next_file_to_upload() { + return false; + } +public: + RGWPostObj() : min_len(0), + max_len(LLONG_MAX), + len(0), + ofs(0), + supplied_md5_b64(nullptr), + supplied_etag(nullptr) { + } + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_encrypt_filter(std::unique_ptr *filter, + rgw::putobj::DataProcessor *cb) { + return 0; + } + virtual int get_params() = 0; + virtual int get_data(ceph::bufferlist& bl, bool& again) = 0; + void send_response() override = 0; + const char* name() const override { return "post_obj"; } + RGWOpType get_type() override { return RGW_OP_POST_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +/* Write-type op reporting RGW_OP_PUT_METADATA_ACCOUNT. new_quota_extracted and has_policy start false, pre_exec() is an empty override, and init() passes s->cct to policy. */ class RGWPutMetadataAccount : public RGWOp { +protected: + std::set rmattr_names; + std::map attrs, orig_attrs; + std::map temp_url_keys; + RGWQuotaInfo new_quota; + bool new_quota_extracted; + + RGWObjVersionTracker acct_op_tracker; + + RGWAccessControlPolicy policy; + bool has_policy; + +public: + RGWPutMetadataAccount() + : new_quota_extracted(false), + has_policy(false) { + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + } + int init_processing() override; + int verify_permission() override; + void pre_exec() override { } + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + virtual void filter_out_temp_url(map& add_attrs, + const set& rmattr_names, 
+ map& temp_url_keys); + const char* name() const override { return "put_account_metadata"; } + RGWOpType get_type() override { return RGW_OP_PUT_METADATA_ACCOUNT; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Write-type op reporting RGW_OP_PUT_METADATA_BUCKET. has_policy and has_cors start false and policy_rw_mask at 0; emplace_attr() moves the key and buffer into attrs, and init() passes s->cct to policy. */ class RGWPutMetadataBucket : public RGWOp { +protected: + map attrs; + set rmattr_names; + bool has_policy, has_cors; + uint32_t policy_rw_mask; + RGWAccessControlPolicy policy; + RGWCORSConfiguration cors_config; + rgw_placement_rule placement_rule; + boost::optional swift_ver_location; + +public: + RGWPutMetadataBucket() + : has_policy(false), has_cors(false), policy_rw_mask(0) + {} + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */ + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "put_bucket_metadata"; } + RGWOpType get_type() override { return RGW_OP_PUT_METADATA_BUCKET; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Write-type op reporting RGW_OP_PUT_METADATA_OBJECT; dlo_manifest starts as NULL and need_object_expiration() defaults to false. */ class RGWPutMetadataObject : public RGWOp { +protected: + RGWAccessControlPolicy policy; + boost::optional delete_at; + const char *dlo_manifest; + +public: + RGWPutMetadataObject() + : dlo_manifest(NULL) + {} + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + } + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "put_obj_metadata"; } + RGWOpType get_type() override { return RGW_OP_PUT_METADATA_OBJECT; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + virtual bool 
need_object_expiration() { return false; } +}; + +/* Delete-type op reporting RGW_OP_DELETE_OBJ. Note that bypass_perm is initialised to true while every other flag starts false; get_params() defaults to returning 0. */ class RGWDeleteObj : public RGWOp { +protected: + bool delete_marker; + bool multipart_delete; + string version_id; + ceph::real_time unmod_since; /* if unmodified since */ + bool no_precondition_error; + std::unique_ptr deleter; + bool bypass_perm; + bool bypass_governance_mode; + +public: + RGWDeleteObj() + : delete_marker(false), + multipart_delete(false), + no_precondition_error(false), + deleter(nullptr), + bypass_perm(true), + bypass_governance_mode(false) { + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + int handle_slo_manifest(bufferlist& bl); + + virtual int get_params() { return 0; } + void send_response() override = 0; + const char* name() const override { return "delete_obj"; } + RGWOpType get_type() override { return RGW_OP_DELETE_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } + virtual bool need_object_expiration() { return false; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +/* Abstract write-type op reporting RGW_OP_COPY_OBJ; get_params() and send_response() are pure virtual. NOTE(review): copy_source is a std::string_view, so it does not own its characters - confirm the viewed buffer outlives the op. */ class RGWCopyObj : public RGWOp { +protected: + RGWAccessControlPolicy dest_policy; + const char *if_mod; + const char *if_unmod; + const char *if_match; + const char *if_nomatch; + // Required or it is not a copy operation + std::string_view copy_source; + // Not actually required + std::optional md_directive; + + off_t ofs; + off_t len; + off_t end; + ceph::real_time mod_time; + ceph::real_time unmod_time; + ceph::real_time *mod_ptr; + ceph::real_time *unmod_ptr; + map attrs; + string src_tenant_name, src_bucket_name; + rgw_bucket src_bucket; + rgw_obj_key src_object; + string dest_tenant_name, dest_bucket_name; + rgw_bucket dest_bucket; + string dest_object; + ceph::real_time src_mtime; + ceph::real_time mtime; + RGWRados::AttrsMod attrs_mod; + RGWBucketInfo src_bucket_info; + RGWBucketInfo dest_bucket_info; + string source_zone; + string etag; + + off_t last_ofs; + + string version_id; + uint64_t olh_epoch; + + 
boost::optional delete_at; + bool copy_if_newer; + + bool need_to_check_storage_class = false; + + int init_common(); + +public: + /* Assigns defaults in the constructor body rather than an initialiser list: the pointer members become NULL, ofs / len / last_ofs / olh_epoch become 0, end becomes -1, attrs_mod becomes ATTRSMOD_NONE and copy_if_newer becomes false. */ RGWCopyObj() { + if_mod = NULL; + if_unmod = NULL; + if_match = NULL; + if_nomatch = NULL; + ofs = 0; + len = 0; + end = -1; + mod_ptr = NULL; + unmod_ptr = NULL; + attrs_mod = RGWRados::ATTRSMOD_NONE; + last_ofs = 0; + olh_epoch = 0; + copy_if_newer = false; + } + + static bool parse_copy_location(const boost::string_view& src, + string& bucket_name, + rgw_obj_key& object); + + void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + dest_policy.set_ctx(s->cct); + } + int verify_permission() override; + void pre_exec() override; + void execute() override; + void progress_cb(off_t ofs); + + virtual int check_storage_class(const rgw_placement_rule& src_placement) { + return 0; + } + + virtual int init_dest_policy() { return 0; } + virtual int get_params() = 0; + virtual void send_partial_response(off_t ofs) {} + void send_response() override = 0; + const char* name() const override { return "copy_obj"; } + RGWOpType get_type() override { return RGW_OP_COPY_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + dmc::client_id dmclock_client() override { return dmc::client_id::data; } +}; + +/* Read-type op reporting RGW_OP_GET_ACLS; send_response() is pure virtual. */ class RGWGetACLs : public RGWOp { +protected: + string acls; + +public: + RGWGetACLs() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "get_acls"; } + RGWOpType get_type() override { return RGW_OP_GET_ACLS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +/* Write-type op reporting RGW_OP_PUT_ACLS; the default get_policy_from_state() returns 0, while get_params() and send_response() are pure virtual. */ class RGWPutACLs : public RGWOp { +protected: + bufferlist data; + ACLOwner owner; + +public: + RGWPutACLs() {} + ~RGWPutACLs() override {} + + int verify_permission() override; + 
void pre_exec() override; + void execute() override; + + virtual int get_policy_from_state(RGWRados *store, struct req_state *s, stringstream& ss) { return 0; } + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "put_acls"; } + RGWOpType get_type() override { return RGW_OP_PUT_ACLS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Read-type op reporting RGW_OP_GET_LC; execute() is re-declared pure virtual here, so subclasses must supply both execute() and send_response(). */ class RGWGetLC : public RGWOp { +protected: + +public: + RGWGetLC() { } + ~RGWGetLC() override { } + + int verify_permission() override; + void pre_exec() override; + void execute() override = 0; + + void send_response() override = 0; + const char* name() const override { return "get_lifecycle"; } + RGWOpType get_type() override { return RGW_OP_GET_LC; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Write-type op reporting RGW_OP_PUT_LC; init() fills cookie from gen_rand_alphanumeric(). NOTE(review): buf holds COOKIE_LEN + 1 bytes but sizeof(buf) - 1 is what gets passed as the size, and buf is then read as a C string - confirm the terminator and the resulting cookie length against gen_rand_alphanumeric(). */ class RGWPutLC : public RGWOp { +protected: + bufferlist data; + const char *content_md5; + string cookie; + +public: + RGWPutLC() { + content_md5 = nullptr; + } + ~RGWPutLC() override {} + + void init(RGWRados *store, struct req_state *s, RGWHandler *dialect_handler) override { +#define COOKIE_LEN 16 + char buf[COOKIE_LEN + 1]; + + RGWOp::init(store, s, dialect_handler); + gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1); + cookie = buf; + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + +// virtual int get_policy_from_state(RGWRados *store, struct req_state *s, stringstream& ss) { return 0; } + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "put_lifecycle"; } + RGWOpType get_type() override { return RGW_OP_PUT_LC; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +/* Write-type op reporting RGW_OP_DELETE_LC; len starts at 0, data starts NULL and is released with free() in the destructor. */ class RGWDeleteLC : public RGWOp { +protected: + size_t len; + char *data; + +public: + RGWDeleteLC() { + len = 0; + data = NULL; + } + ~RGWDeleteLC() override { + free(data); + } + + int verify_permission() override; + void pre_exec() override; + 
void execute() override; + + void send_response() override = 0; + const char* name() const override { return "delete_lifecycle"; } + RGWOpType get_type() override { return RGW_OP_DELETE_LC; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetCORS : public RGWOp { +protected: + +public: + RGWGetCORS() {} + + int verify_permission() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "get_cors"; } + RGWOpType get_type() override { return RGW_OP_GET_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutCORS : public RGWOp { +protected: + bufferlist cors_bl; + bufferlist in_data; + +public: + RGWPutCORS() {} + ~RGWPutCORS() override {} + + int verify_permission() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "put_cors"; } + RGWOpType get_type() override { return RGW_OP_PUT_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWDeleteCORS : public RGWOp { +protected: + +public: + RGWDeleteCORS() {} + + int verify_permission() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "delete_cors"; } + RGWOpType get_type() override { return RGW_OP_DELETE_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWOptionsCORS : public RGWOp { +protected: + RGWCORSRule *rule; + const char *origin, *req_hdrs, *req_meth; + +public: + RGWOptionsCORS() : rule(NULL), origin(NULL), + req_hdrs(NULL), req_meth(NULL) { + } + + int verify_permission() override {return 0;} + int validate_cors_request(RGWCORSConfiguration *cc); + void execute() override; + void get_response_params(string& allowed_hdrs, string& exp_hdrs, unsigned *max_age); + void send_response() override = 0; + const char* name() const override { return "options_cors"; } 
+ RGWOpType get_type() override { return RGW_OP_OPTIONS_CORS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWGetRequestPayment : public RGWOp { +protected: + bool requester_pays; + +public: + RGWGetRequestPayment() : requester_pays(0) {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "get_request_payment"; } + RGWOpType get_type() override { return RGW_OP_GET_REQUEST_PAYMENT; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWSetRequestPayment : public RGWOp { +protected: + bool requester_pays; + bufferlist in_data; +public: + RGWSetRequestPayment() : requester_pays(false) {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() { return 0; } + + void send_response() override = 0; + const char* name() const override { return "set_request_payment"; } + RGWOpType get_type() override { return RGW_OP_SET_REQUEST_PAYMENT; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWInitMultipart : public RGWOp { +protected: + string upload_id; + RGWAccessControlPolicy policy; + +public: + RGWInitMultipart() {} + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy.set_ctx(s->cct); + } + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "init_multipart"; } + RGWOpType get_type() override { return RGW_OP_INIT_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + virtual int prepare_encryption(map& attrs) { return 0; } +}; + +class RGWCompleteMultipart : public RGWOp { +protected: + string upload_id; + string etag; + string version_id; + bufferlist data; + + struct 
MPSerializer { + librados::IoCtx ioctx; + rados::cls::lock::Lock lock; + librados::ObjectWriteOperation op; + std::string oid; + bool locked; + + MPSerializer() : lock("RGWCompleteMultipart"), locked(false) + {} + + int try_lock(const std::string& oid, utime_t dur); + + int unlock() { + return lock.unlock(&ioctx, oid); + } + + void clear_locked() { + locked = false; + } + } serializer; + +public: + RGWCompleteMultipart() {} + ~RGWCompleteMultipart() override {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + void complete() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "complete_multipart"; } + RGWOpType get_type() override { return RGW_OP_COMPLETE_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWAbortMultipart : public RGWOp { +public: + RGWAbortMultipart() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + void send_response() override = 0; + const char* name() const override { return "abort_multipart"; } + RGWOpType get_type() override { return RGW_OP_ABORT_MULTIPART; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +class RGWListMultipart : public RGWOp { +protected: + string upload_id; + map parts; + int max_parts; + int marker; + RGWAccessControlPolicy policy; + bool truncated; + +public: + RGWListMultipart() { + max_parts = 1000; + marker = 0; + truncated = false; + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + policy = RGWAccessControlPolicy(s->cct); + } + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "list_multipart"; } + RGWOpType get_type() override { return RGW_OP_LIST_MULTIPART; } + uint32_t op_mask() 
override { return RGW_OP_TYPE_READ; } +}; + +struct RGWMultipartUploadEntry { + rgw_bucket_dir_entry obj; + RGWMPObj mp; + + friend std::ostream& operator<<(std::ostream& out, + const RGWMultipartUploadEntry& e) { + constexpr char quote = '"'; + return out << "RGWMultipartUploadEntry{ obj.key=" << + quote << e.obj.key << quote << " mp=" << e.mp << " }"; + } +}; + +class RGWListBucketMultiparts : public RGWOp { +protected: + string prefix; + RGWMPObj marker; + RGWMultipartUploadEntry next_marker; + int max_uploads; + string delimiter; + vector uploads; + map common_prefixes; + bool is_truncated; + int default_max; + +public: + RGWListBucketMultiparts() { + max_uploads = 0; + is_truncated = false; + default_max = 0; + } + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWOp::init(store, s, h); + max_uploads = default_max; + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "list_bucket_multiparts"; } + RGWOpType get_type() override { return RGW_OP_LIST_BUCKET_MULTIPARTS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + + +class RGWGetCrossDomainPolicy : public RGWOp { +public: + RGWGetCrossDomainPolicy() = default; + ~RGWGetCrossDomainPolicy() override = default; + + int verify_permission() override { + return 0; + } + + void execute() override { + op_ret = 0; + } + + const char* name() const override { return "get_crossdomain_policy"; } + + RGWOpType get_type() override { + return RGW_OP_GET_CROSS_DOMAIN_POLICY; + } + + uint32_t op_mask() override { + return RGW_OP_TYPE_READ; + } +}; + + +class RGWGetHealthCheck : public RGWOp { +public: + RGWGetHealthCheck() = default; + ~RGWGetHealthCheck() override = default; + + int verify_permission() override { + return 0; + } + + void execute() override; + + const char* name() const override { return "get_health_check"; 
} + + RGWOpType get_type() override { + return RGW_OP_GET_HEALTH_CHECK; + } + + uint32_t op_mask() override { + return RGW_OP_TYPE_READ; + } +}; + + +class RGWDeleteMultiObj : public RGWOp { +protected: + bufferlist data; + rgw_bucket bucket; + bool quiet; + bool status_dumped; + bool acl_allowed = false; + +public: + RGWDeleteMultiObj() { + quiet = false; + status_dumped = false; + } + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + virtual void send_status() = 0; + virtual void begin_response() = 0; + virtual void send_partial_response(rgw_obj_key& key, bool delete_marker, + const string& marker_version_id, int ret) = 0; + virtual void end_response() = 0; + const char* name() const override { return "multi_object_delete"; } + RGWOpType get_type() override { return RGW_OP_DELETE_MULTI_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +class RGWInfo: public RGWOp { +public: + RGWInfo() = default; + ~RGWInfo() override = default; + + int verify_permission() override { return 0; } + const char* name() const override { return "get info"; } + RGWOpType get_type() override { return RGW_OP_GET_INFO; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +extern int rgw_build_bucket_policies(RGWRados* store, struct req_state* s); +extern int rgw_build_object_policies(RGWRados *store, struct req_state *s, + bool prefetch_data); +extern void rgw_build_iam_environment(RGWRados* store, + struct req_state* s); +extern vector get_iam_user_policy_from_attr(CephContext* cct, + RGWRados* store, + map& attrs, + const string& tenant); + +static inline int get_system_versioning_params(req_state *s, + uint64_t *olh_epoch, + string *version_id) +{ + if (!s->system_request) { + return 0; + } + + if (olh_epoch) { + string epoch_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "versioned-epoch"); + if (!epoch_str.empty()) { + string err; + *olh_epoch = 
strict_strtol(epoch_str.c_str(), 10, &err); + if (!err.empty()) { + lsubdout(s->cct, rgw, 0) << "failed to parse versioned-epoch param" + << dendl; + return -EINVAL; + } + } + } + + if (version_id) { + *version_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "version-id"); + } + + return 0; +} /* get_system_versioning_params */ + +static inline void format_xattr(std::string &xattr) +{ + /* If the extended attribute is not valid UTF-8, we encode it using + * quoted-printable encoding. + */ + if ((check_utf8(xattr.c_str(), xattr.length()) != 0) || + (check_for_control_characters(xattr.c_str(), xattr.length()) != 0)) { + static const char MIME_PREFIX_STR[] = "=?UTF-8?Q?"; + static const int MIME_PREFIX_LEN = sizeof(MIME_PREFIX_STR) - 1; + static const char MIME_SUFFIX_STR[] = "?="; + static const int MIME_SUFFIX_LEN = sizeof(MIME_SUFFIX_STR) - 1; + int mlen = mime_encode_as_qp(xattr.c_str(), NULL, 0); + char *mime = new char[MIME_PREFIX_LEN + mlen + MIME_SUFFIX_LEN + 1]; + strcpy(mime, MIME_PREFIX_STR); + mime_encode_as_qp(xattr.c_str(), mime + MIME_PREFIX_LEN, mlen); + strcpy(mime + MIME_PREFIX_LEN + (mlen - 1), MIME_SUFFIX_STR); + xattr.assign(mime); + delete [] mime; + } +} /* format_xattr */ + +/** + * Get the HTTP request metadata out of the req_state as a + * map(, where attr_name is RGW_ATTR_PREFIX.HTTP_NAME) + * s: The request state + * attrs: will be filled up with attrs mapped as + * On success returns 0. + * On failure returns a negative error code. 
+ * + */ +static inline int rgw_get_request_metadata(CephContext* const cct, + struct req_info& info, + std::map& attrs, + const bool allow_empty_attrs = true) +{ + static const std::set blacklisted_headers = { + "x-amz-server-side-encryption-customer-algorithm", + "x-amz-server-side-encryption-customer-key", + "x-amz-server-side-encryption-customer-key-md5", + "x-amz-storage-class" + }; + + size_t valid_meta_count = 0; + for (auto& kv : info.x_meta_map) { + const std::string& name = kv.first; + std::string& xattr = kv.second; + + if (blacklisted_headers.count(name) == 1) { + lsubdout(cct, rgw, 10) << "skipping x>> " << name << dendl; + continue; + } else if (allow_empty_attrs || !xattr.empty()) { + lsubdout(cct, rgw, 10) << "x>> " << name << ":" << xattr << dendl; + format_xattr(xattr); + + std::string attr_name(RGW_ATTR_PREFIX); + attr_name.append(name); + + /* Check roughly whether we aren't going behind the limit on attribute + * name. Passing here doesn't guarantee that an OSD will accept that + * as ObjectStore::get_max_attr_name_length() can set the limit even + * lower than the "osd_max_attr_name_len" configurable. */ + const auto max_attr_name_len = cct->_conf->rgw_max_attr_name_len; + if (max_attr_name_len && attr_name.length() > max_attr_name_len) { + return -ENAMETOOLONG; + } + + /* Similar remarks apply to the check for value size. We're veryfing + * it early at the RGW's side as it's being claimed in /info. */ + const auto max_attr_size = cct->_conf->rgw_max_attr_size; + if (max_attr_size && xattr.length() > max_attr_size) { + return -EFBIG; + } + + /* Swift allows administrators to limit the number of metadats items + * send _in a single request_. 
*/ + const auto max_attrs_num_in_req = cct->_conf->rgw_max_attrs_num_in_req; + if (max_attrs_num_in_req && + ++valid_meta_count > max_attrs_num_in_req) { + return -E2BIG; + } + + auto rval = attrs.emplace(std::move(attr_name), ceph::bufferlist()); + /* At the moment the value of the freshly created attribute key-value + * pair is an empty bufferlist. */ + + ceph::bufferlist& bl = rval.first->second; + bl.append(xattr.c_str(), xattr.size() + 1); + } + } + + return 0; +} /* rgw_get_request_metadata */ + +static inline void encode_delete_at_attr(boost::optional delete_at, + map& attrs) +{ + if (delete_at == boost::none) { + return; + } + + bufferlist delatbl; + encode(*delete_at, delatbl); + attrs[RGW_ATTR_DELETE_AT] = delatbl; +} /* encode_delete_at_attr */ + +static inline void encode_obj_tags_attr(RGWObjTags* obj_tags, map& attrs) +{ + if (obj_tags == nullptr){ + // we assume the user submitted a tag format which we couldn't parse since + // this wouldn't be parsed later by get/put obj tags, lets delete if the + // attr was populated + return; + } + + bufferlist tagsbl; + obj_tags->encode(tagsbl); + attrs[RGW_ATTR_TAGS] = tagsbl; +} + +static inline int encode_dlo_manifest_attr(const char * const dlo_manifest, + map& attrs) +{ + string dm = dlo_manifest; + + if (dm.find('/') == string::npos) { + return -EINVAL; + } + + bufferlist manifest_bl; + manifest_bl.append(dlo_manifest, strlen(dlo_manifest) + 1); + attrs[RGW_ATTR_USER_MANIFEST] = manifest_bl; + + return 0; +} /* encode_dlo_manifest_attr */ + +static inline void complete_etag(MD5& hash, string *etag) +{ + char etag_buf[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char etag_buf_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + + hash.Final((unsigned char *)etag_buf); + buf_to_hex((const unsigned char *)etag_buf, CEPH_CRYPTO_MD5_DIGESTSIZE, + etag_buf_str); + + *etag = etag_buf_str; +} /* complete_etag */ + +class RGWSetAttrs : public RGWOp { +protected: + map attrs; + +public: + RGWSetAttrs() {} + ~RGWSetAttrs() override {} + + 
void emplace_attr(std::string&& key, buffer::list&& bl) { + attrs.emplace(std::move(key), std::move(bl)); + } + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + void send_response() override = 0; + const char* name() const override { return "set_attrs"; } + RGWOpType get_type() override { return RGW_OP_SET_ATTRS; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetObjLayout : public RGWOp { +protected: + RGWObjManifest *manifest{nullptr}; + rgw_raw_obj head_obj; + +public: + RGWGetObjLayout() { + } + + int check_caps(RGWUserCaps& caps) { + return caps.check_cap("admin", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void pre_exec() override; + void execute() override; + + const char* name() const override { return "get_obj_layout"; } + virtual RGWOpType get_type() override { return RGW_OP_GET_OBJ_LAYOUT; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutBucketPolicy : public RGWOp { + bufferlist data; +public: + RGWPutBucketPolicy() = default; + ~RGWPutBucketPolicy() { + } + void send_response() override; + int verify_permission() override; + uint32_t op_mask() override { + return RGW_OP_TYPE_WRITE; + } + void execute() override; + int get_params(); + const char* name() const override { return "put_bucket_policy"; } + RGWOpType get_type() override { + return RGW_OP_PUT_BUCKET_POLICY; + } +}; + +class RGWGetBucketPolicy : public RGWOp { + buffer::list policy; +public: + RGWGetBucketPolicy() = default; + void send_response() override; + int verify_permission() override; + uint32_t op_mask() override { + return RGW_OP_TYPE_READ; + } + void execute() override; + const char* name() const override { return "get_bucket_policy"; } + RGWOpType get_type() override { + return RGW_OP_GET_BUCKET_POLICY; + } +}; + +class RGWDeleteBucketPolicy : public RGWOp { +public: + 
RGWDeleteBucketPolicy() = default; + void send_response() override; + int verify_permission() override; + uint32_t op_mask() override { + return RGW_OP_TYPE_WRITE; + } + void execute() override; + int get_params(); + const char* name() const override { return "delete_bucket_policy"; } + RGWOpType get_type() override { + return RGW_OP_DELETE_BUCKET_POLICY; + } +}; + +class RGWPutBucketObjectLock : public RGWOp { +protected: + bufferlist data; + bufferlist obj_lock_bl; + RGWObjectLock obj_lock; +public: + RGWPutBucketObjectLock() = default; + ~RGWPutBucketObjectLock() {} + int verify_permission() override; + void pre_exec() override; + void execute() override; + virtual void send_response() = 0; + virtual int get_params() = 0; + const char* name() const override { return "put_bucket_object_lock"; } + RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_OBJ_LOCK; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetBucketObjectLock : public RGWOp { +public: + int verify_permission() override; + void pre_exec() override; + void execute() override; + virtual void send_response() = 0; + const char* name() const override {return "get_bucket_object_lock"; } + RGWOpType get_type() override { return RGW_OP_GET_BUCKET_OBJ_LOCK; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutObjRetention : public RGWOp { +protected: + bufferlist data; + RGWObjectRetention obj_retention; + bool bypass_perm; + bool bypass_governance_mode; +public: + RGWPutObjRetention():bypass_perm(true), bypass_governance_mode(false) {} + int verify_permission() override; + void pre_exec() override; + void execute() override; + virtual void send_response() override = 0; + virtual int get_params() = 0; + const char* name() const override { return "put_obj_retention"; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ_RETENTION; } +}; + +class RGWGetObjRetention : public RGWOp { 
+protected: + RGWObjectRetention obj_retention; +public: + int verify_permission() override; + void pre_exec() override; + void execute() override; + virtual void send_response() = 0; + const char* name() const override {return "get_obj_retention"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_RETENTION; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +class RGWPutObjLegalHold : public RGWOp { +protected: + bufferlist data; + RGWObjectLegalHold obj_legal_hold; +public: + int verify_permission() override; + void pre_exec() override; + void execute() override; + virtual void send_response() override = 0; + virtual int get_params() = 0; + const char* name() const override { return "put_obj_legal_hold"; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } + RGWOpType get_type() override { return RGW_OP_PUT_OBJ_LEGAL_HOLD; } +}; + +class RGWGetObjLegalHold : public RGWOp { +protected: + RGWObjectLegalHold obj_legal_hold; +public: + int verify_permission() override; + void pre_exec() override; + void execute() override; + virtual void send_response() = 0; + const char* name() const override {return "get_obj_legal_hold"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_LEGAL_HOLD; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + + +class RGWConfigBucketMetaSearch : public RGWOp { +protected: + std::map mdsearch_config; +public: + RGWConfigBucketMetaSearch() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override; + + virtual int get_params() = 0; + const char* name() const override { return "config_bucket_meta_search"; } + virtual RGWOpType get_type() override { return RGW_OP_CONFIG_BUCKET_META_SEARCH; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +class RGWGetBucketMetaSearch : public RGWOp { +public: + RGWGetBucketMetaSearch() {} + + int verify_permission() override; + void pre_exec() override; + void execute() override {} + + const 
/**
 * Parse a decimal integer and clamp it into [lower_bound, upper_bound].
 *
 * input:       text to parse; trailing white space is tolerated
 * output:      receives the clamped value, or default_val when input is
 *              empty; it is left untouched when -EINVAL is returned
 * lower_bound: smallest value that may be stored
 * upper_bound: largest value that may be stored
 * default_val: value stored for an empty input (not clamped)
 *
 * Returns 0 on success, -EINVAL when input does not start with a number or
 * has non-space characters after it.
 */
static inline int parse_value_and_bound(
    const std::string &input,
    int &output,
    const long lower_bound,
    const long upper_bound,
    const long default_val)
{
  if (input.empty()) {
    output = static_cast<int>(default_val);
    return 0;
  }

  char *endptr = nullptr;
  /* Keep the full long result: narrowing to int before clamping lets a
   * value such as 4294967297 wrap around to 1 instead of hitting the upper
   * bound. */
  long parsed = strtol(input.c_str(), &endptr, 10);
  if (endptr == input.c_str()) {
    return -EINVAL; // no digits consumed
  }

  /* ignore trailing white space; the cast keeps isspace() defined for
   * bytes >= 0x80 */
  while (*endptr && isspace(static_cast<unsigned char>(*endptr))) {
    ++endptr;
  }
  if (*endptr) {
    return -EINVAL; // trailing garbage
  }

  /* same order as before: upper bound first, then lower bound */
  if (parsed > upper_bound) {
    parsed = upper_bound;
  }
  if (parsed < lower_bound) {
    parsed = lower_bound;
  }

  output = static_cast<int>(parsed);
  return 0;
}
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_opa.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +int rgw_opa_authorize(RGWOp *& op, + req_state * const s) +{ + + ldpp_dout(op, 2) << "authorizing request using OPA" << dendl; + + /* get OPA url */ + const string& opa_url = s->cct->_conf->rgw_opa_url; + if (opa_url == "") { + ldpp_dout(op, 2) << "OPA_URL not provided" << dendl; + return -ERR_INVALID_REQUEST; + } + ldpp_dout(op, 2) << "OPA URL= " << opa_url.c_str() << dendl; + + /* get authentication token for OPA */ + const string& opa_token = s->cct->_conf->rgw_opa_token; + + int ret; + bufferlist bl; + RGWHTTPTransceiver req(s->cct, "POST", opa_url.c_str(), &bl); + + /* set required headers for OPA request */ + req.append_header("X-Auth-Token", opa_token); + req.append_header("Content-Type", "application/json"); + + /* check if we want to verify OPA server SSL certificate */ + req.set_verify_ssl(s->cct->_conf->rgw_opa_verify_ssl); + + /* create json request body */ + JSONFormatter jf; + jf.open_object_section(""); + jf.open_object_section("input"); + jf.dump_string("method", s->info.env->get("REQUEST_METHOD")); + jf.dump_string("relative_uri", s->relative_uri.c_str()); + jf.dump_string("decoded_uri", s->decoded_uri.c_str()); + jf.dump_string("params", s->info.request_params.c_str()); + jf.dump_string("request_uri_aws4", s->info.request_uri_aws4.c_str()); + jf.dump_string("object_name", s->object.name.c_str()); + jf.dump_string("subuser", s->auth.identity->get_subuser().c_str()); + jf.dump_object("user_info", *s->user); + jf.dump_object("bucket_info", s->bucket_info); + jf.close_section(); + jf.close_section(); + + std::stringstream ss; + jf.flush(ss); + req.set_post_data(ss.str()); + req.set_send_length(ss.str().length()); + + /* send request */ + ret = req.process(); + if (ret < 0) { + ldpp_dout(op, 2) << "OPA process error:" << bl.c_str() << dendl; + return ret; + } + + /* check OPA response */ + JSONParser 
parser; + if (!parser.parse(bl.c_str(), bl.length())) { + ldpp_dout(op, 2) << "OPA parse error: malformed json" << dendl; + return -EINVAL; + } + + bool opa_result; + JSONDecoder::decode_json("result", opa_result, &parser); + + if (opa_result == false) { + ldpp_dout(op, 2) << "OPA rejecting request" << dendl; + return -EPERM; + } + + ldpp_dout(op, 2) << "OPA accepting request" << dendl; + return 0; +} diff --git a/src/rgw/rgw_opa.h b/src/rgw/rgw_opa.h new file mode 100644 index 00000000..2f87e45e --- /dev/null +++ b/src/rgw/rgw_opa.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_OPA_H +#define RGW_OPA_H + +#include "rgw_common.h" +#include "rgw_op.h" + +/* authorize request using OPA */ +int rgw_opa_authorize(RGWOp*& op, + req_state* s); + +#endif /* RGW_OPA_H */ diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/rgw_orphan.cc new file mode 100644 index 00000000..832076b7 --- /dev/null +++ b/src/rgw/rgw_orphan.cc @@ -0,0 +1,1523 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + + +#include "common/config.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "rgw_rados.h" +#include "rgw_op.h" +#include "rgw_multi.h" +#include "rgw_orphan.h" +#include "rgw_zone.h" +#include "rgw_bucket.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +#define DEFAULT_NUM_SHARDS 64 + +static string obj_fingerprint(const string& oid, const char *force_ns = NULL) +{ + ssize_t pos = oid.find('_'); + if (pos < 0) { + cerr << "ERROR: object does not have a bucket marker: " << oid << std::endl; + } + + string obj_marker = oid.substr(0, pos); + + rgw_obj_key key; + + rgw_obj_key::parse_raw_oid(oid.substr(pos + 1), &key); + + if (key.ns.empty()) { + return oid; + } + + string s = oid; + + if (force_ns) { + rgw_bucket b; + rgw_obj new_obj(b, key); + s = obj_marker + 
"_" + new_obj.get_oid(); + } + + /* cut out suffix */ + size_t i = s.size() - 1; + for (; i >= s.size() - 10; --i) { + char c = s[i]; + if (!isdigit(c) && c != '.' && c != '_') { + break; + } + } + + return s.substr(0, i + 1); +} + +int RGWOrphanStore::read_job(const string& job_name, RGWOrphanSearchState & state) +{ + set keys; + map vals; + keys.insert(job_name); + int r = ioctx.omap_get_vals_by_keys(oid, keys, &vals); + if (r < 0) { + return r; + } + + map::iterator iter = vals.find(job_name); + if (iter == vals.end()) { + return -ENOENT; + } + + try { + bufferlist& bl = iter->second; + decode(state, bl); + } catch (buffer::error& err) { + lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl; + return -EIO; + } + + return 0; +} + +int RGWOrphanStore::write_job(const string& job_name, const RGWOrphanSearchState& state) +{ + map vals; + bufferlist bl; + encode(state, bl); + vals[job_name] = bl; + int r = ioctx.omap_set(oid, vals); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWOrphanStore::remove_job(const string& job_name) +{ + set keys; + keys.insert(job_name); + + int r = ioctx.omap_rm_keys(oid, keys); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWOrphanStore::list_jobs(map & job_list) +{ + map vals; + int MAX_READ=1024; + string marker=""; + int r = 0; + + // loop through all the omap vals from index object, storing them to job_list, + // read in batches of 1024, we update the marker every iteration and exit the + // loop when we find that total size read out is less than batch size + do { + r = ioctx.omap_get_vals(oid, marker, MAX_READ, &vals); + if (r < 0) { + return r; + } + r = vals.size(); + + for (const auto &it : vals) { + marker=it.first; + RGWOrphanSearchState state; + try { + bufferlist bl = it.second; + decode(state, bl); + } catch (buffer::error& err) { + lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl; + return -EIO; + } + job_list[it.first] = state; + } + } while (r == MAX_READ); + + return 0; 
+} + +int RGWOrphanStore::init() +{ + const rgw_pool& log_pool = store->svc.zone->get_zone_params().log_pool; + int r = rgw_init_ioctx(store->get_rados_handle(), log_pool, ioctx); + if (r < 0) { + cerr << "ERROR: failed to open log pool (" << log_pool << " ret=" << r << std::endl; + return r; + } + + return 0; +} + +int RGWOrphanStore::store_entries(const string& oid, const map& entries) +{ + librados::ObjectWriteOperation op; + op.omap_set(entries); + cout << "storing " << entries.size() << " entries at " << oid << std::endl; + ldout(store->ctx(), 20) << "storing " << entries.size() << " entries at " << oid << ": " << dendl; + for (map::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) { + ldout(store->ctx(), 20) << " > " << iter->first << dendl; + } + int ret = ioctx.operate(oid, &op); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << dendl; + } + + return 0; +} + +int RGWOrphanStore::read_entries(const string& oid, const string& marker, map *entries, bool *truncated) +{ +#define MAX_OMAP_GET 100 + int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET, entries); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << cpp_strerror(-ret) << std::endl; + } + + *truncated = (entries->size() == MAX_OMAP_GET); + + return 0; +} + +int RGWOrphanSearch::init(const string& job_name, RGWOrphanSearchInfo *info, bool _detailed_mode) +{ + int r = orphan_store.init(); + if (r < 0) { + return r; + } + + constexpr int64_t MAX_LIST_OBJS_ENTRIES=100; + + max_list_bucket_entries = std::max(store->ctx()->_conf->rgw_list_bucket_min_readahead, + MAX_LIST_OBJS_ENTRIES); + + detailed_mode = _detailed_mode; + RGWOrphanSearchState state; + r = orphan_store.read_job(job_name, state); + if (r < 0 && r != -ENOENT) { + lderr(store->ctx()) << "ERROR: failed to read state ret=" << r << dendl; + return r; + } + + if (r == 0) { + search_info = state.info; + search_stage 
= state.stage; + } else if (info) { /* r == -ENOENT, initiate a new job if info was provided */ + search_info = *info; + search_info.job_name = job_name; + search_info.num_shards = (info->num_shards ? info->num_shards : DEFAULT_NUM_SHARDS); + search_info.start_time = ceph_clock_now(); + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_INIT); + + r = save_state(); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed to write state ret=" << r << dendl; + return r; + } + } else { + lderr(store->ctx()) << "ERROR: job not found" << dendl; + return r; + } + + index_objs_prefix = RGW_ORPHAN_INDEX_PREFIX + string("."); + index_objs_prefix += job_name; + + for (int i = 0; i < search_info.num_shards; i++) { + char buf[128]; + + snprintf(buf, sizeof(buf), "%s.rados.%d", index_objs_prefix.c_str(), i); + all_objs_index[i] = buf; + + snprintf(buf, sizeof(buf), "%s.buckets.%d", index_objs_prefix.c_str(), i); + buckets_instance_index[i] = buf; + + snprintf(buf, sizeof(buf), "%s.linked.%d", index_objs_prefix.c_str(), i); + linked_objs_index[i] = buf; + } + return 0; +} + +int RGWOrphanSearch::log_oids(map& log_shards, map >& oids) +{ + map >::iterator miter = oids.begin(); + + list liters; /* a list of iterator pairs for begin and end */ + + for (; miter != oids.end(); ++miter) { + log_iter_info info; + info.oid = log_shards[miter->first]; + info.cur = miter->second.begin(); + info.end = miter->second.end(); + liters.push_back(info); + } + + list::iterator list_iter; + while (!liters.empty()) { + list_iter = liters.begin(); + + while (list_iter != liters.end()) { + log_iter_info& cur_info = *list_iter; + + list::iterator& cur = cur_info.cur; + list::iterator& end = cur_info.end; + + map entries; +#define MAX_OMAP_SET_ENTRIES 100 + for (int j = 0; cur != end && j != MAX_OMAP_SET_ENTRIES; ++cur, ++j) { + ldout(store->ctx(), 20) << "adding obj: " << *cur << dendl; + entries[*cur] = bufferlist(); + } + + int ret = orphan_store.store_entries(cur_info.oid, entries); + if (ret < 
0) { + return ret; + } + list::iterator tmp = list_iter; + ++list_iter; + if (cur == end) { + liters.erase(tmp); + } + } + } + return 0; +} + +int RGWOrphanSearch::build_all_oids_index() +{ + librados::IoCtx ioctx; + + int ret = rgw_init_ioctx(store->get_rados_handle(), search_info.pool, ioctx); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": rgw_init_ioctx() returned ret=" << ret << dendl; + return ret; + } + + ioctx.set_namespace(librados::all_nspaces); + librados::NObjectIterator i = ioctx.nobjects_begin(); + librados::NObjectIterator i_end = ioctx.nobjects_end(); + + map > oids; + + int count = 0; + uint64_t total = 0; + + cout << "logging all objects in the pool" << std::endl; + + for (; i != i_end; ++i) { + string nspace = i->get_nspace(); + string oid = i->get_oid(); + string locator = i->get_locator(); + + ssize_t pos = oid.find('_'); + if (pos < 0) { + cout << "unidentified oid: " << oid << ", skipping" << std::endl; + /* what is this object, oids should be in the format of _, + * skip this entry + */ + continue; + } + string stripped_oid = oid.substr(pos + 1); + rgw_obj_key key; + if (!rgw_obj_key::parse_raw_oid(stripped_oid, &key)) { + cout << "cannot parse oid: " << oid << ", skipping" << std::endl; + continue; + } + + if (key.ns.empty()) { + /* skipping head objects, we don't want to remove these as they are mutable and + * cleaning them up is racy (can race with object removal and a later recreation) + */ + cout << "skipping head object: oid=" << oid << std::endl; + continue; + } + + string oid_fp = obj_fingerprint(oid); + + ldout(store->ctx(), 20) << "oid_fp=" << oid_fp << dendl; + + int shard = orphan_shard(oid_fp); + oids[shard].push_back(oid); + +#define COUNT_BEFORE_FLUSH 1000 + ++total; + if (++count >= COUNT_BEFORE_FLUSH) { + ldout(store->ctx(), 1) << "iterated through " << total << " objects" << dendl; + ret = log_oids(all_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << 
std::endl; + return ret; + } + count = 0; + oids.clear(); + } + } + ret = log_oids(all_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + + return 0; +} + +int RGWOrphanSearch::build_buckets_instance_index() +{ + void *handle; + int max = 1000; + string section = "bucket.instance"; + int ret = store->meta_mgr->list_keys_init(section, &handle); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl; + return ret; + } + + map > instances; + + bool truncated; + + RGWObjectCtx obj_ctx(store); + + int count = 0; + uint64_t total = 0; + + do { + list keys; + ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + ++total; + ldout(store->ctx(), 10) << "bucket_instance=" << *iter << " total=" << total << dendl; + int shard = orphan_shard(*iter); + instances[shard].push_back(*iter); + + if (++count >= COUNT_BEFORE_FLUSH) { + ret = log_oids(buckets_instance_index, instances); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + count = 0; + instances.clear(); + } + } + + } while (truncated); + + ret = log_oids(buckets_instance_index, instances); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + store->meta_mgr->list_keys_complete(handle); + + return 0; +} + +int RGWOrphanSearch::handle_stat_result(map >& oids, RGWRados::Object::Stat::Result& result) +{ + set obj_oids; + rgw_bucket& bucket = result.obj.bucket; + if (!result.has_manifest) { /* a very very old object, or part of a multipart upload during upload */ + const string loc = bucket.bucket_id + "_" + result.obj.get_oid(); + 
obj_oids.insert(obj_fingerprint(loc)); + + /* + * multipart parts don't have manifest on them, it's in the meta object. Instead of reading the + * meta object, just add a "shadow" object to the mix + */ + obj_oids.insert(obj_fingerprint(loc, "shadow")); + } else { + RGWObjManifest& manifest = result.manifest; + + if (!detailed_mode && + manifest.get_obj_size() <= manifest.get_head_size()) { + ldout(store->ctx(), 5) << "skipping object as it fits in a head" << dendl; + return 0; + } + + RGWObjManifest::obj_iterator miter; + for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) { + const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store); + string s = loc.oid; + obj_oids.insert(obj_fingerprint(s)); + } + } + + for (set::iterator iter = obj_oids.begin(); iter != obj_oids.end(); ++iter) { + ldout(store->ctx(), 20) << __func__ << ": oid for obj=" << result.obj << ": " << *iter << dendl; + + int shard = orphan_shard(*iter); + oids[shard].push_back(*iter); + } + + return 0; +} + +int RGWOrphanSearch::pop_and_handle_stat_op(map >& oids, std::deque& ops) +{ + RGWRados::Object::Stat& front_op = ops.front(); + + int ret = front_op.wait(); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + } + goto done; + } + ret = handle_stat_result(oids, front_op.result); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: handle_stat_response() returned error: " << cpp_strerror(-ret) << dendl; + } +done: + ops.pop_front(); + return ret; +} + +int RGWOrphanSearch::build_linked_oids_for_bucket(const string& bucket_instance_id, map >& oids) +{ + RGWObjectCtx obj_ctx(store); + auto sysobj_ctx = store->svc.sysobj->init_obj_ctx(); + + rgw_bucket orphan_bucket; + int shard_id; + int ret = rgw_bucket_parse_bucket_key(store->ctx(), bucket_instance_id, + &orphan_bucket, &shard_id); + if (ret < 0) { + ldout(store->ctx(),0) << __func__ << " failed to parse bucket instance: " + << 
bucket_instance_id << " skipping" << dendl; + return ret; + } + + RGWBucketInfo cur_bucket_info; + ret = store->get_bucket_info(sysobj_ctx, orphan_bucket.tenant, + orphan_bucket.name, cur_bucket_info, nullptr); + if (ret < 0) { + if (ret == -ENOENT) { + /* probably raced with bucket removal */ + return 0; + } + lderr(store->ctx()) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl; + return ret; + } + + if (cur_bucket_info.bucket.bucket_id != orphan_bucket.bucket_id) { + ldout(store->ctx(), 0) << __func__ << ": Skipping stale bucket instance: " + << orphan_bucket.name << ": " + << orphan_bucket.bucket_id << dendl; + return 0; + } + + if (cur_bucket_info.reshard_status == CLS_RGW_RESHARD_IN_PROGRESS) { + ldout(store->ctx(), 0) << __func__ << ": reshard in progress. Skipping " + << orphan_bucket.name << ": " + << orphan_bucket.bucket_id << dendl; + return 0; + } + + RGWBucketInfo bucket_info; + ret = store->get_bucket_instance_info(sysobj_ctx, bucket_instance_id, bucket_info, nullptr, nullptr); + if (ret < 0) { + if (ret == -ENOENT) { + /* probably raced with bucket removal */ + return 0; + } + lderr(store->ctx()) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl; + return ret; + } + + ldout(store->ctx(), 10) << "building linked oids for bucket instance: " << bucket_instance_id << dendl; + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + + string marker; + list_op.params.marker = rgw_obj_key(marker); + list_op.params.list_versions = true; + list_op.params.enforce_ns = false; + + bool truncated; + + deque stat_ops; + + do { + vector result; + + ret = list_op.list_objects(max_list_bucket_entries, + &result, nullptr, &truncated); + if (ret < 0) { + cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl; + return ret; + } + + for (vector::iterator iter = result.begin(); iter != result.end(); ++iter) { + rgw_bucket_dir_entry& 
entry = *iter; + if (entry.key.instance.empty()) { + ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << dendl; + } else { + ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << " [" << entry.key.instance << "]" << dendl; + } + + ldout(store->ctx(), 20) << __func__ << ": entry.key.name=" << entry.key.name << " entry.key.instance=" << entry.key.instance << dendl; + + if (!detailed_mode && + entry.meta.accounted_size <= (uint64_t)store->ctx()->_conf->rgw_max_chunk_size) { + ldout(store->ctx(),5) << __func__ << "skipping stat as the object " << entry.key.name + << "fits in a head" << dendl; + continue; + } + + rgw_obj obj(bucket_info.bucket, entry.key); + + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + + stat_ops.push_back(RGWRados::Object::Stat(&op_target)); + RGWRados::Object::Stat& op = stat_ops.back(); + + + ret = op.stat_async(); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + if (stat_ops.size() >= max_concurrent_ios) { + ret = pop_and_handle_stat_op(oids, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + } + } + } + if (oids.size() >= COUNT_BEFORE_FLUSH) { + ret = log_oids(linked_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + oids.clear(); + } + } + } while (truncated); + + while (!stat_ops.empty()) { + ret = pop_and_handle_stat_op(oids, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl; + } + } + } + + return 0; +} + +int RGWOrphanSearch::build_linked_oids_index() +{ + map > oids; + map::iterator iter = buckets_instance_index.find(search_stage.shard); + for (; iter != buckets_instance_index.end(); ++iter) { + ldout(store->ctx(), 0) << "building linked oids 
index: " << iter->first << "/" << buckets_instance_index.size() << dendl; + bool truncated; + + string oid = iter->second; + + do { + map entries; + int ret = orphan_store.read_entries(oid, search_stage.marker, &entries, &truncated); + if (ret == -ENOENT) { + truncated = false; + ret = 0; + } + + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: read_entries() oid=" << oid << " returned ret=" << ret << dendl; + return ret; + } + + if (entries.empty()) { + break; + } + + for (map::iterator eiter = entries.begin(); eiter != entries.end(); ++eiter) { + ldout(store->ctx(), 20) << " indexed entry: " << eiter->first << dendl; + ret = build_linked_oids_for_bucket(eiter->first, oids); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: build_linked_oids_for_bucket() indexed entry=" << eiter->first + << " returned ret=" << ret << dendl; + return ret; + } + } + + search_stage.shard = iter->first; + search_stage.marker = entries.rbegin()->first; /* last entry */ + } while (truncated); + + search_stage.marker.clear(); + } + + int ret = log_oids(linked_objs_index, oids); + if (ret < 0) { + cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl; + return ret; + } + + ret = save_state(); + if (ret < 0) { + cerr << __func__ << ": ERROR: failed to write state ret=" << ret << std::endl; + return ret; + } + + return 0; +} + +class OMAPReader { + librados::IoCtx ioctx; + string oid; + + map entries; + map::iterator iter; + string marker; + bool truncated; + +public: + OMAPReader(librados::IoCtx& _ioctx, const string& _oid) : ioctx(_ioctx), oid(_oid), truncated(true) { + iter = entries.end(); + } + + int get_next(string *key, bufferlist *pbl, bool *done); +}; + +int OMAPReader::get_next(string *key, bufferlist *pbl, bool *done) +{ + if (iter != entries.end()) { + *key = iter->first; + if (pbl) { + *pbl = iter->second; + } + ++iter; + *done = false; + marker = *key; + return 0; + } + + if (!truncated) { + *done = true; + return 0; + } + 
+#define MAX_OMAP_GET_ENTRIES 100 + int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET_ENTRIES, &entries); + if (ret < 0) { + if (ret == -ENOENT) { + *done = true; + return 0; + } + return ret; + } + + truncated = (entries.size() == MAX_OMAP_GET_ENTRIES); + iter = entries.begin(); + return get_next(key, pbl, done); +} + +int RGWOrphanSearch::compare_oid_indexes() +{ + ceph_assert(linked_objs_index.size() == all_objs_index.size()); + + librados::IoCtx& ioctx = orphan_store.get_ioctx(); + + librados::IoCtx data_ioctx; + + int ret = rgw_init_ioctx(store->get_rados_handle(), search_info.pool, data_ioctx); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": rgw_init_ioctx() returned ret=" << ret << dendl; + return ret; + } + + uint64_t time_threshold = search_info.start_time.sec() - stale_secs; + + map::iterator liter = linked_objs_index.begin(); + map::iterator aiter = all_objs_index.begin(); + + for (; liter != linked_objs_index.end(); ++liter, ++aiter) { + OMAPReader linked_entries(ioctx, liter->second); + OMAPReader all_entries(ioctx, aiter->second); + + bool done; + + string cur_linked; + bool linked_done = false; + + + do { + string key; + int r = all_entries.get_next(&key, NULL, &done); + if (r < 0) { + return r; + } + if (done) { + break; + } + + string key_fp = obj_fingerprint(key); + + while (cur_linked < key_fp && !linked_done) { + r = linked_entries.get_next(&cur_linked, NULL, &linked_done); + if (r < 0) { + return r; + } + } + + if (cur_linked == key_fp) { + ldout(store->ctx(), 20) << "linked: " << key << dendl; + continue; + } + + time_t mtime; + r = data_ioctx.stat(key, NULL, &mtime); + if (r < 0) { + if (r != -ENOENT) { + lderr(store->ctx()) << "ERROR: ioctx.stat(" << key << ") returned ret=" << r << dendl; + } + continue; + } + if (stale_secs && (uint64_t)mtime >= time_threshold) { + ldout(store->ctx(), 20) << "skipping: " << key << " (mtime=" << mtime << " threshold=" << time_threshold << ")" << dendl; + continue; + } + ldout(store->ctx(), 
20) << "leaked: " << key << dendl; + cout << "leaked: " << key << std::endl; + } while (!done); + } + + return 0; +} + +int RGWOrphanSearch::run() +{ + int r; + + switch (search_stage.stage) { + + case ORPHAN_SEARCH_STAGE_INIT: + ldout(store->ctx(), 0) << __func__ << "(): initializing state" << dendl; + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSPOOL); + r = save_state(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + case ORPHAN_SEARCH_STAGE_LSPOOL: + ldout(store->ctx(), 0) << __func__ << "(): building index of all objects in pool" << dendl; + r = build_all_oids_index(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSBUCKETS); + r = save_state(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + + case ORPHAN_SEARCH_STAGE_LSBUCKETS: + ldout(store->ctx(), 0) << __func__ << "(): building index of all bucket indexes" << dendl; + r = build_buckets_instance_index(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_ITERATE_BI); + r = save_state(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + + + case ORPHAN_SEARCH_STAGE_ITERATE_BI: + ldout(store->ctx(), 0) << __func__ << "(): building index of all linked objects" << dendl; + r = build_linked_oids_index(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_COMPARE); + r = save_state(); + if (r < 0) { + 
lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl; + return r; + } + // fall through + + case ORPHAN_SEARCH_STAGE_COMPARE: + r = compare_oid_indexes(); + if (r < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl; + return r; + } + + break; + + default: + ceph_abort(); + }; + + return 0; +} + + +int RGWOrphanSearch::remove_index(map& index) +{ + librados::IoCtx& ioctx = orphan_store.get_ioctx(); + + for (map::iterator iter = index.begin(); iter != index.end(); ++iter) { + int r = ioctx.remove(iter->second); + if (r < 0) { + if (r != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: couldn't remove " << iter->second << ": ret=" << r << dendl; + } + } + } + return 0; +} + +int RGWOrphanSearch::finish() +{ + int r = remove_index(all_objs_index); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: remove_index(" << all_objs_index << ") returned ret=" << r << dendl; + } + r = remove_index(buckets_instance_index); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: remove_index(" << buckets_instance_index << ") returned ret=" << r << dendl; + } + r = remove_index(linked_objs_index); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: remove_index(" << linked_objs_index << ") returned ret=" << r << dendl; + } + + r = orphan_store.remove_job(search_info.job_name); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: could not remove job name (" << search_info.job_name << ") ret=" << r << dendl; + } + + return r; +} + + +int RGWRadosList::handle_stat_result(RGWRados::Object::Stat::Result& result, + std::set& obj_oids) +{ + obj_oids.clear(); + + rgw_bucket& bucket = result.obj.bucket; + + ldout(store->ctx(), 20) << "RGWRadosList::" << __func__ << + " bucket=" << bucket << ", has_manifest=" << result.has_manifest << + dendl; + + // iterator to store result of dlo/slo attribute find + decltype(result.attrs)::iterator attr_it = result.attrs.end(); + const std::string oid = bucket.marker + "_" + 
result.obj.get_oid(); + ldout(store->ctx(), 20) << "radoslist processing object=\"" << + oid << "\"" << dendl; + if (visited_oids.find(oid) != visited_oids.end()) { + // apparently we hit a loop; don't continue with this oid + ldout(store->ctx(), 15) << + "radoslist stopped loop at already visited object=\"" << + oid << "\"" << dendl; + return 0; + } + + if (!result.has_manifest) { + /* a very very old object, or part of a multipart upload during upload */ + obj_oids.insert(oid); + + /* + * multipart parts don't have manifest on them, it's in the meta + * object; we'll process them in + * RGWRadosList::do_incomplete_multipart + */ + } else if ((attr_it = result.attrs.find(RGW_ATTR_USER_MANIFEST)) != + result.attrs.end()) { + // *** handle DLO object *** + + obj_oids.insert(oid); + visited_oids.insert(oid); // prevent dlo loops + ldout(store->ctx(), 15) << "radoslist added to visited list DLO=\"" << + oid << "\"" << dendl; + + char* prefix_path_c = attr_it->second.c_str(); + const std::string& prefix_path = prefix_path_c; + + const size_t sep_pos = prefix_path.find('/'); + if (string::npos == sep_pos) { + return -EINVAL; + } + + const std::string bucket_name = prefix_path.substr(0, sep_pos); + const std::string prefix = prefix_path.substr(sep_pos + 1); + + add_bucket_prefix(bucket_name, prefix); + ldout(store->ctx(), 25) << "radoslist DLO oid=\"" << oid << + "\" added bucket=\"" << bucket_name << "\" prefix=\"" << + prefix << "\" to process list" << dendl; + } else if ((attr_it = result.attrs.find(RGW_ATTR_SLO_MANIFEST)) != + result.attrs.end()) { + // *** handle SLO object *** + + obj_oids.insert(oid); + visited_oids.insert(oid); // prevent slo loops + ldout(store->ctx(), 15) << "radoslist added to visited list SLO=\"" << + oid << "\"" << dendl; + + RGWSLOInfo slo_info; + bufferlist::const_iterator bliter = attr_it->second.begin(); + try { + ::decode(slo_info, bliter); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << + "ERROR: failed to decode slo 
manifest for " << oid << dendl; + return -EIO; + } + + for (const auto& iter : slo_info.entries) { + const string& path_str = iter.path; + + const size_t sep_pos = path_str.find('/', 1 /* skip initial slash */); + if (string::npos == sep_pos) { + return -EINVAL; + } + + std::string bucket_name; + std::string obj_name; + + bucket_name = url_decode(path_str.substr(1, sep_pos - 1)); + obj_name = url_decode(path_str.substr(sep_pos + 1)); + + const rgw_obj_key obj_key(obj_name); + add_bucket_filter(bucket_name, obj_key); + ldout(store->ctx(), 25) << "radoslist SLO oid=\"" << oid << + "\" added bucket=\"" << bucket_name << "\" obj_key=\"" << + obj_key << "\" to process list" << dendl; + } + } else { + RGWObjManifest& manifest = result.manifest; + + // in multipart, the head object contains no data and just has the + // manifest AND empty objects have no manifest, but they're + // realized as empty rados objects + if (0 == manifest.get_max_head_size() || + manifest.obj_begin() == manifest.obj_end()) { + obj_oids.insert(oid); + // first_insert = true; + } + + RGWObjManifest::obj_iterator miter; + for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) { + const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store); + string s = loc.oid; + obj_oids.insert(s); + } + } + + return 0; +} // RGWRadosList::handle_stat_result + +int RGWRadosList::pop_and_handle_stat_op( + RGWObjectCtx& obj_ctx, + std::deque& ops) +{ + std::set obj_oids; + RGWRados::Object::Stat& front_op = ops.front(); + + int ret = front_op.wait(); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << + cpp_strerror(-ret) << dendl; + } + goto done; + } + + ret = handle_stat_result(front_op.result, obj_oids); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: handle_stat_result() returned error: " << + cpp_strerror(-ret) << dendl; + } + + // output results + for (const auto& o : obj_oids) { + std::cout << o << std::endl; + } + +done: + + // 
invalidate object context for this object to avoid memory leak + // (see pr https://github.com/ceph/ceph/pull/30174) + obj_ctx.invalidate(front_op.result.obj); + + ops.pop_front(); + return ret; +} + + +#if 0 // code that may be the basis for expansion +int RGWRadosList::build_buckets_instance_index() +{ + void *handle; + int max = 1000; + string section = "bucket.instance"; + int ret = store->meta_mgr->list_keys_init(section, &handle); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl; + return ret; + } + + map > instances; + + bool truncated; + + RGWObjectCtx obj_ctx(store); + + int count = 0; + uint64_t total = 0; + + do { + list keys; + ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + for (list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + ++total; + ldout(store->ctx(), 10) << "bucket_instance=" << *iter << " total=" << total << dendl; + int shard = orphan_shard(*iter); + instances[shard].push_back(*iter); + + if (++count >= COUNT_BEFORE_FLUSH) { + ret = log_oids(buckets_instance_index, instances); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + count = 0; + instances.clear(); + } + } + } while (truncated); + + ret = log_oids(buckets_instance_index, instances); + if (ret < 0) { + lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl; + return ret; + } + store->meta_mgr->list_keys_complete(handle); + + return 0; +} +#endif + + +int RGWRadosList::process_bucket( + const std::string& bucket_instance_id, + const std::string& prefix, + const std::set& entries_filter) +{ + ldout(store->ctx(), 10) << "RGWRadosList::" << __func__ << + " bucket_instance_id=" << bucket_instance_id << + ", prefix=" << prefix << + ", entries_filter.size=" << 
entries_filter.size() << dendl; + + RGWBucketInfo bucket_info; + RGWSysObjectCtx sys_obj_ctx = store->svc.sysobj->init_obj_ctx(); + int ret = store->get_bucket_instance_info(sys_obj_ctx, bucket_instance_id, + bucket_info, nullptr, nullptr); + if (ret < 0) { + if (ret == -ENOENT) { + // probably raced with bucket removal + return 0; + } + lderr(store->ctx()) << __func__ << + ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << + ret << dendl; + return ret; + } + + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + + std::string marker; + list_op.params.marker = rgw_obj_key(marker); + list_op.params.list_versions = true; + list_op.params.enforce_ns = false; + list_op.params.allow_unordered = false; + list_op.params.prefix = prefix; + + bool truncated; + + std::deque stat_ops; + std::string prev_versioned_key_name = ""; + + RGWObjectCtx obj_ctx(store); + + do { + std::vector result; + + constexpr int64_t LIST_OBJS_MAX_ENTRIES = 100; + ret = list_op.list_objects(LIST_OBJS_MAX_ENTRIES, &result, + NULL, &truncated); + if (ret == -ENOENT) { + // race with bucket delete? 
+ ret = 0; + break; + } else if (ret < 0) { + std::cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << + std::endl; + return ret; + } + + for (std::vector::iterator iter = result.begin(); + iter != result.end(); + ++iter) { + rgw_bucket_dir_entry& entry = *iter; + + if (entry.key.instance.empty()) { + ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << dendl; + } else { + ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << + " [" << entry.key.instance << "]" << dendl; + } + + ldout(store->ctx(), 20) << __func__ << ": entry.key.name=" << + entry.key.name << " entry.key.instance=" << entry.key.instance << + dendl; + + // ignore entries that are not in the filter if there is a filter + if (!entries_filter.empty() && + entries_filter.find(entry.key) == entries_filter.cend()) { + continue; + } + + // we need to do this in two cases below, so use a lambda + auto do_stat_key = + [&](const rgw_obj_key& key) -> int { + int ret; + + rgw_obj obj(bucket_info.bucket, key); + + RGWRados::Object op_target(store, bucket_info, obj_ctx, obj); + + stat_ops.push_back(RGWRados::Object::Stat(&op_target)); + RGWRados::Object::Stat& op = stat_ops.back(); + + ret = op.stat_async(); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << + cpp_strerror(-ret) << dendl; + return ret; + } + + if (stat_ops.size() >= max_concurrent_ios) { + ret = pop_and_handle_stat_op(obj_ctx, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << + "ERROR: pop_and_handle_stat_op() returned error: " << + cpp_strerror(-ret) << dendl; + } + + // clear error, so we'll continue processing directory + ret = 0; + } + } + + return ret; + }; // do_stat_key lambda + + // for versioned objects, make sure the head object is handled + // as well by ignoring the instance identifier + if (!entry.key.instance.empty() && + entry.key.name != prev_versioned_key_name) { + // don't do the same key twice; even though out bucket index + // listing 
allows unordered, since all versions of an object + // use the same bucket index key, they'll all end up together + // and sorted + prev_versioned_key_name = entry.key.name; + + rgw_obj_key uninstanced(entry.key.name); + + ret = do_stat_key(uninstanced); + if (ret < 0) { + return ret; + } + } + + ret = do_stat_key(entry.key); + if (ret < 0) { + return ret; + } + } // for iter loop + } while (truncated); + + while (!stat_ops.empty()) { + ret = pop_and_handle_stat_op(obj_ctx, stat_ops); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: stat_async() returned error: " << + cpp_strerror(-ret) << dendl; + } + } + } + + return 0; +} + + +int RGWRadosList::run() +{ + int ret; + void* handle = nullptr; + + ret = store->meta_mgr->list_keys_init("bucket", &handle); + if (ret < 0) { + lderr(store->ctx()) << "RGWRadosList::" << __func__ << + " ERROR: list_keys_init returned " << + cpp_strerror(-ret) << dendl; + return ret; + } + + const int max_keys = 1000; + bool truncated = true; + + do { + std::list buckets; + ret = store->meta_mgr->list_keys_next(handle, max_keys, buckets, &truncated); + + for (std::string& bucket_id : buckets) { + ret = run(bucket_id); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + return ret; + } + } + } while (truncated); + + return 0; +} // RGWRadosList::run() + + +int RGWRadosList::run(const std::string& start_bucket_name) +{ + RGWSysObjectCtx sys_obj_ctx = store->svc.sysobj->init_obj_ctx(); + RGWObjectCtx obj_ctx(store); + int ret; + + add_bucket_entire(start_bucket_name); + + while (! 
bucket_process_map.empty()) { + // pop item from map and capture its key data + auto front = bucket_process_map.begin(); + std::string bucket_name = front->first; + process_t process; + std::swap(process, front->second); + bucket_process_map.erase(front); + + RGWBucketInfo bucket_info; + ret = store->get_bucket_info(sys_obj_ctx, + tenant_name, bucket_name, bucket_info, + nullptr, nullptr); + if (ret == -ENOENT) { + std::cerr << "WARNING: bucket " << bucket_name << + " does not exist; could it have been deleted very recently?" << + std::endl; + continue; + } else if (ret < 0) { + std::cerr << "ERROR: could not get info for bucket " << bucket_name << + " -- " << cpp_strerror(-ret) << std::endl; + return ret; + } + + const std::string bucket_id = bucket_info.bucket.get_key(); + + static const std::set empty_filter; + static const std::string empty_prefix; + + auto do_process_bucket = + [&bucket_id, this] + (const std::string& prefix, + const std::set& entries_filter) -> int { + int ret = process_bucket(bucket_id, prefix, entries_filter); + if (ret == -ENOENT) { + // bucket deletion race? + return 0; + } if (ret < 0) { + lderr(store->ctx()) << "RGWRadosList::" << __func__ << + ": ERROR: process_bucket(); bucket_id=" << + bucket_id << " returned ret=" << ret << dendl; + } + + return ret; + }; + + // either process the whole bucket *or* process the filters and/or + // the prefixes + if (process.entire_container) { + ret = do_process_bucket(empty_prefix, empty_filter); + if (ret < 0) { + return ret; + } + } else { + if (! process.filter_keys.empty()) { + ret = do_process_bucket(empty_prefix, process.filter_keys); + if (ret < 0) { + return ret; + } + } + for (const auto& p : process.prefixes) { + ret = do_process_bucket(p, empty_filter); + if (ret < 0) { + return ret; + } + } + } + } // while (! 
bucket_process_map.empty()) + + // now handle incomplete multipart uploads by going back to the + // initial bucket + + RGWBucketInfo bucket_info; + ret = store->get_bucket_info(sys_obj_ctx, + tenant_name, start_bucket_name, bucket_info, + nullptr, nullptr); + if (ret == -ENOENT) { + // bucket deletion race? + return 0; + } else if (ret < 0) { + lderr(store->ctx()) << "RGWRadosList::" << __func__ << + ": ERROR: get_bucket_info returned ret=" << ret << dendl; + return ret; + } + + ret = do_incomplete_multipart(store, bucket_info); + if (ret < 0) { + lderr(store->ctx()) << "RGWRadosList::" << __func__ << + ": ERROR: do_incomplete_multipart returned ret=" << ret << dendl; + return ret; + } + + return 0; +} // RGWRadosList::run(string) + + +int RGWRadosList::do_incomplete_multipart( + RGWRados* store, + RGWBucketInfo& bucket_info) +{ + constexpr int max_uploads = 1000; + constexpr int max_parts = 1000; + static const std::string mp_ns = RGW_OBJ_NS_MULTIPART; + static MultipartMetaFilter mp_filter; + + int ret; + + RGWRados::Bucket target(store, bucket_info); + RGWRados::Bucket::List list_op(&target); + list_op.params.ns = mp_ns; + list_op.params.filter = &mp_filter; + // use empty string for initial list_op.params.marker + // use empty strings for list_op.params.{prefix,delim} + + bool is_listing_truncated; + + do { + std::vector objs; + std::map common_prefixes; + ret = list_op.list_objects(max_uploads, &objs, &common_prefixes, + &is_listing_truncated); + if (ret == -ENOENT) { + // could bucket have been removed while this is running? 
+ ldout(store->ctx(), 20) << "RGWRadosList::" << __func__ << + ": WARNING: call to list_objects of multipart namespace got ENOENT; " + "assuming bucket removal race" << dendl; + break; + } else if (ret < 0) { + lderr(store->ctx()) << "RGWRadosList::" << __func__ << + ": ERROR: list_objects op returned ret=" << ret << dendl; + return ret; + } + + if (!objs.empty()) { + std::vector uploads; + RGWMultipartUploadEntry entry; + for (const rgw_bucket_dir_entry& obj : objs) { + const rgw_obj_key& key = obj.key; + if (!entry.mp.from_meta(key.name)) { + // we only want the meta objects, so skip all the components + continue; + } + entry.obj = obj; + uploads.push_back(entry); + ldout(store->ctx(), 20) << "RGWRadosList::" << __func__ << + " processing incomplete multipart entry " << + entry << dendl; + } + + // now process the uploads vector + int parts_marker = 0; + bool is_parts_truncated = false; + do { + map parts; + + for (const auto& upload : uploads) { + const RGWMPObj& mp = upload.mp; + ret = list_multipart_parts(store, bucket_info, store->ctx(), + mp.get_upload_id(), mp.get_meta(), + max_parts, + parts_marker, parts, NULL, &is_parts_truncated); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + lderr(store->ctx()) << "RGWRadosList::" << __func__ << + ": ERROR: list_multipart_parts returned ret=" << ret << dendl; + return ret; + } + + for (auto& p : parts) { + RGWObjManifest& manifest = p.second.manifest; + for (auto obj_it = manifest.obj_begin(); + obj_it != manifest.obj_end(); + ++obj_it) { + const rgw_raw_obj& loc = obj_it.get_location().get_raw_obj(store); + std::cout << loc.oid << std::endl; + } + } + } + } while (is_parts_truncated); + } // if objs not empty + } while (is_listing_truncated); + + return 0; +} // RGWRadosList::do_incomplete_multipart diff --git a/src/rgw/rgw_orphan.h b/src/rgw/rgw_orphan.h new file mode 100644 index 00000000..fe737b4f --- /dev/null +++ b/src/rgw/rgw_orphan.h @@ -0,0 +1,290 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RGW_ORPHAN_H +#define CEPH_RGW_ORPHAN_H + +#include "common/config.h" +#include "common/Formatter.h" +#include "common/errno.h" + +#include "rgw_rados.h" + +#define dout_subsys ceph_subsys_rgw + +#define RGW_ORPHAN_INDEX_OID "orphan.index" +#define RGW_ORPHAN_INDEX_PREFIX "orphan.scan" + + +enum RGWOrphanSearchStageId { + ORPHAN_SEARCH_STAGE_UNKNOWN = 0, + ORPHAN_SEARCH_STAGE_INIT = 1, + ORPHAN_SEARCH_STAGE_LSPOOL = 2, + ORPHAN_SEARCH_STAGE_LSBUCKETS = 3, + ORPHAN_SEARCH_STAGE_ITERATE_BI = 4, + ORPHAN_SEARCH_STAGE_COMPARE = 5, +}; + + +struct RGWOrphanSearchStage { + RGWOrphanSearchStageId stage; + int shard; + string marker; + + RGWOrphanSearchStage() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN), shard(0) {} + explicit RGWOrphanSearchStage(RGWOrphanSearchStageId _stage) : stage(_stage), shard(0) {} + RGWOrphanSearchStage(RGWOrphanSearchStageId _stage, int _shard, const string& _marker) : stage(_stage), shard(_shard), marker(_marker) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode((int)stage, bl); + encode(shard, bl); + encode(marker, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + int s; + decode(s, bl); + stage = (RGWOrphanSearchStageId)s; + decode(shard, bl); + decode(marker, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOrphanSearchStage) + +struct RGWOrphanSearchInfo { + string job_name; + rgw_pool pool; + uint16_t num_shards; + utime_t start_time; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(job_name, bl); + 
encode(pool.to_str(), bl); + encode(num_shards, bl); + encode(start_time, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(job_name, bl); + string s; + decode(s, bl); + pool.from_str(s); + decode(num_shards, bl); + decode(start_time, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOrphanSearchInfo) + +struct RGWOrphanSearchState { + RGWOrphanSearchInfo info; + RGWOrphanSearchStage stage; + + RGWOrphanSearchState() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(info, bl); + encode(stage, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(info, bl); + decode(stage, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOrphanSearchState) + +class RGWOrphanStore { + RGWRados *store; + librados::IoCtx ioctx; + + string oid; + +public: + explicit RGWOrphanStore(RGWRados *_store) : store(_store), oid(RGW_ORPHAN_INDEX_OID) {} + + librados::IoCtx& get_ioctx() { return ioctx; } + + int init(); + + int read_job(const string& job_name, RGWOrphanSearchState& state); + int write_job(const string& job_name, const RGWOrphanSearchState& state); + int remove_job(const string& job_name); + int list_jobs(map &job_list); + + + int store_entries(const string& oid, const map& entries); + int read_entries(const string& oid, const string& marker, map *entries, bool *truncated); +}; + + +class RGWOrphanSearch { + RGWRados *store; + + RGWOrphanStore orphan_store; + + RGWOrphanSearchInfo search_info; + RGWOrphanSearchStage search_stage; + + map all_objs_index; + map buckets_instance_index; + map linked_objs_index; + + string index_objs_prefix; + + uint16_t max_concurrent_ios; + uint64_t stale_secs; + int64_t max_list_bucket_entries; + + bool detailed_mode; + + struct log_iter_info { + string oid; + list::iterator cur; + 
list::iterator end; + }; + + int log_oids(map& log_shards, map >& oids); + +#define RGW_ORPHANSEARCH_HASH_PRIME 7877 + int orphan_shard(const string& str) { + return ceph_str_hash_linux(str.c_str(), str.size()) % RGW_ORPHANSEARCH_HASH_PRIME % search_info.num_shards; + } + + int handle_stat_result(map >& oids, RGWRados::Object::Stat::Result& result); + int pop_and_handle_stat_op(map >& oids, std::deque& ops); + + + int remove_index(map& index); +public: + RGWOrphanSearch(RGWRados *_store, int _max_ios, uint64_t _stale_secs) : store(_store), orphan_store(store), max_concurrent_ios(_max_ios), stale_secs(_stale_secs) {} + + int save_state() { + RGWOrphanSearchState state; + state.info = search_info; + state.stage = search_stage; + return orphan_store.write_job(search_info.job_name, state); + } + + int init(const string& job_name, RGWOrphanSearchInfo *info, bool _detailed_mode=false); + + int create(const string& job_name, int num_shards); + + int build_all_oids_index(); + int build_buckets_instance_index(); + int build_linked_oids_for_bucket(const string& bucket_instance_id, map >& oids); + int build_linked_oids_index(); + int compare_oid_indexes(); + + int run(); + int finish(); +}; + + +class RGWRadosList { + + /* + * process_t describes how to process a directory, we will either + * process the whole thing (entire_container == true) or a portion + * of it (entire_container == false). When we only process a + * portion, we will list the specific keys and/or specific lexical + * prefixes. 
+ */ + struct process_t { + bool entire_container; + std::set filter_keys; + std::set prefixes; + + process_t() : + entire_container(false) + {} + }; + + std::map bucket_process_map; + std::set visited_oids; + + void add_bucket_entire(const std::string& bucket_name) { + auto p = bucket_process_map.emplace(std::make_pair(bucket_name, + process_t())); + p.first->second.entire_container = true; + } + + void add_bucket_prefix(const std::string& bucket_name, + const std::string& prefix) { + auto p = bucket_process_map.emplace(std::make_pair(bucket_name, + process_t())); + p.first->second.prefixes.insert(prefix); + } + + void add_bucket_filter(const std::string& bucket_name, + const rgw_obj_key& obj_key) { + auto p = bucket_process_map.emplace(std::make_pair(bucket_name, + process_t())); + p.first->second.filter_keys.insert(obj_key); + } + + RGWRados *store; + + uint16_t max_concurrent_ios; + uint64_t stale_secs; + std::string tenant_name; + + int handle_stat_result(RGWRados::Object::Stat::Result& result, + std::set& obj_oids); + int pop_and_handle_stat_op(RGWObjectCtx& obj_ctx, + std::deque& ops); + +public: + + RGWRadosList(RGWRados *_store, + int _max_ios, + uint64_t _stale_secs, + const std::string& _tenant_name) : + store(_store), + max_concurrent_ios(_max_ios), + stale_secs(_stale_secs), + tenant_name(_tenant_name) + {} + + int process_bucket(const std::string& bucket_instance_id, + const std::string& prefix, + const std::set& entries_filter); + + int do_incomplete_multipart(RGWRados* store, RGWBucketInfo& bucket_info); + + int build_linked_oids_index(); + + int run(const std::string& bucket_id); + int run(); +}; // class RGWRadosList + +#endif diff --git a/src/rgw/rgw_os_lib.cc b/src/rgw/rgw_os_lib.cc new file mode 100644 index 00000000..e43bf418 --- /dev/null +++ b/src/rgw/rgw_os_lib.cc @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include 
"rgw_rest_user.h" +#include "rgw_os_lib.h" +#include "rgw_file.h" +#include "rgw_lib_frontend.h" + +namespace rgw { + +/* static */ + int RGWHandler_Lib::init_from_header(struct req_state *s) + { + string req; + string first; + + const char *req_name = s->relative_uri.c_str(); + const char *p; + + /* skip request_params parsing, rgw_file should not be + * seeing any */ + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(); + + if (*req_name != '/') + return 0; + + req_name++; + + if (!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } + + if (s->bucket_name.empty()) { + s->bucket_name = std::move(first); + if (pos >= 0) { + // XXX ugh, another copy + string encoded_obj_str = req.substr(pos+1); + s->object = rgw_obj_key(encoded_obj_str, s->info.args.get("versionId")); + } + } else { + s->object = rgw_obj_key(req_name, s->info.args.get("versionId")); + } + return 0; + } /* init_from_header */ + +} /* namespace rgw */ diff --git a/src/rgw/rgw_os_lib.h b/src/rgw/rgw_os_lib.h new file mode 100644 index 00000000..78071b7d --- /dev/null +++ b/src/rgw/rgw_os_lib.h @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_OS_LIB_H +#define RGW_OS_LIB_H + +#include +#include "rgw_common.h" +#include "rgw_lib.h" + + +#endif /* RGW_OS_LIB_H */ diff --git a/src/rgw/rgw_otp.cc b/src/rgw/rgw_otp.cc new file mode 100644 index 00000000..e00a9344 --- /dev/null +++ b/src/rgw/rgw_otp.cc @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "rgw_rados.h" +#include "rgw_zone.h" + +#include "include/types.h" + +#include 
"rgw_common.h" +#include "rgw_tools.h" + +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + + +static RGWMetadataHandler *otp_meta_handler = NULL; + + +class RGWOTPMetadataObject : public RGWMetadataObject { + list result; +public: + RGWOTPMetadataObject(list& _result, obj_version& v, real_time m) { + result.swap(_result); + objv = v; + mtime = m; + } + + void dump(Formatter *f) const override { + encode_json("devices", result, f); + } +}; + +class RGWOTPMetadataHandler : public RGWMetadataHandler { +public: + string get_type() override { return "otp"; } + + int get(RGWRados *store, string& entry, RGWMetadataObject **obj) override { + RGWObjVersionTracker objv_tracker; + real_time mtime; + + list result; + int r = store->list_mfa(entry, &result, &objv_tracker, &mtime); + if (r < 0) { + return r; + } + RGWOTPMetadataObject *mdo = new RGWOTPMetadataObject(result, objv_tracker.read_version, mtime); + *obj = mdo; + return 0; + } + + int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t sync_mode) override { + + list devices; + try { + JSONDecoder::decode_json("devices", devices, obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + int ret = store->meta_mgr->mutate(this, entry, mtime, &objv_tracker, + MDLOG_STATUS_WRITE, sync_mode, + [&] { + return store->set_mfa(entry, devices, true, &objv_tracker, mtime); + }); + if (ret < 0) { + return ret; + } + + return STATUS_APPLIED; + } + + int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { + return store->meta_mgr->remove_entry(this, entry, &objv_tracker); + } + + void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override { + oid = key; + pool = store->svc.zone->get_zone_params().otp_pool; + } + + struct list_keys_info { + RGWRados *store; + RGWListRawObjsCtx ctx; + }; + + int list_keys_init(RGWRados *store, const string& marker, 
void **phandle) override + { + auto info = std::make_unique(); + + info->store = store; + + int ret = store->list_raw_objects_init(store->svc.zone->get_zone_params().otp_pool, marker, + &info->ctx); + if (ret < 0) { + return ret; + } + + *phandle = (void *)info.release(); + + return 0; + } + + int list_keys_next(void *handle, int max, list& keys, bool *truncated) override { + list_keys_info *info = static_cast(handle); + + string no_filter; + + keys.clear(); + + RGWRados *store = info->store; + + int ret = store->list_raw_objects_next(no_filter, max, info->ctx, + keys, truncated); + if (ret < 0 && ret != -ENOENT) + return ret; + if (ret == -ENOENT) { + if (truncated) + *truncated = false; + return 0; + } + + return 0; + } + + void list_keys_complete(void *handle) override { + list_keys_info *info = static_cast(handle); + delete info; + } + + string get_marker(void *handle) override { + list_keys_info *info = static_cast(handle); + return info->store->list_raw_objs_get_cursor(info->ctx); + } +}; + +RGWMetadataHandler *rgw_otp_get_handler() +{ + return otp_meta_handler; +} + +void rgw_otp_init(RGWRados *store) +{ + otp_meta_handler = new RGWOTPMetadataHandler; + store->meta_mgr->register_handler(otp_meta_handler); +} diff --git a/src/rgw/rgw_otp.h b/src/rgw/rgw_otp.h new file mode 100644 index 00000000..54491343 --- /dev/null +++ b/src/rgw/rgw_otp.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_OTP_H +#define CEPH_RGW_OTP_H + +class RGWRados; + +class RGWMetadataHandler; + +RGWMetadataHandler *rgw_otp_get_handler(void); +void rgw_otp_init(RGWRados *store); + +#endif + diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc new file mode 100644 index 00000000..21d1a363 --- /dev/null +++ b/src/rgw/rgw_perf_counters.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"rgw_perf_counters.h" +#include "common/perf_counters.h" +#include "common/ceph_context.h" + +PerfCounters *perfcounter = NULL; + +int rgw_perf_start(CephContext *cct) +{ + PerfCountersBuilder plb(cct, "rgw", l_rgw_first, l_rgw_last); + + // RGW emits comparatively few metrics, so let's be generous + // and mark them all USEFUL to get transmission to ceph-mgr by default. + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + plb.add_u64_counter(l_rgw_req, "req", "Requests"); + plb.add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests"); + + plb.add_u64_counter(l_rgw_get, "get", "Gets"); + plb.add_u64_counter(l_rgw_get_b, "get_b", "Size of gets"); + plb.add_time_avg(l_rgw_get_lat, "get_initial_lat", "Get latency"); + plb.add_u64_counter(l_rgw_put, "put", "Puts"); + plb.add_u64_counter(l_rgw_put_b, "put_b", "Size of puts"); + plb.add_time_avg(l_rgw_put_lat, "put_initial_lat", "Put latency"); + + plb.add_u64(l_rgw_qlen, "qlen", "Queue length"); + plb.add_u64(l_rgw_qactive, "qactive", "Active requests queue"); + + plb.add_u64_counter(l_rgw_cache_hit, "cache_hit", "Cache hits"); + plb.add_u64_counter(l_rgw_cache_miss, "cache_miss", "Cache miss"); + + plb.add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit", "Keystone token cache hits"); + plb.add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss", "Keystone token cache miss"); + + plb.add_u64_counter(l_rgw_gc_retire, "gc_retire_object", "GC object retires"); + + plb.add_u64_counter(l_rgw_pubsub_event_triggered, "pubsub_event_triggered", "Pubsub events with at least one topic"); + plb.add_u64_counter(l_rgw_pubsub_event_lost, "pubsub_event_lost", "Pubsub events lost"); + plb.add_u64_counter(l_rgw_pubsub_store_ok, "pubsub_store_ok", "Pubsub events successfully stored"); + plb.add_u64_counter(l_rgw_pubsub_store_fail, "pubsub_store_fail", "Pubsub events failed to be stored"); + plb.add_u64(l_rgw_pubsub_events, "pubsub_events", "Pubsub events in store"); + 
plb.add_u64_counter(l_rgw_pubsub_push_ok, "pubsub_push_ok", "Pubsub events pushed to an endpoint"); + plb.add_u64_counter(l_rgw_pubsub_push_failed, "pubsub_push_failed", "Pubsub events failed to be pushed to an endpoint"); + plb.add_u64(l_rgw_pubsub_push_pending, "pubsub_push_pending", "Pubsub events pending reply from endpoint"); + plb.add_u64_counter(l_rgw_pubsub_missing_conf, "pubsub_missing_conf", "Pubsub events could not be handled because of missing configuration"); + + perfcounter = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perfcounter); + return 0; +} + +void rgw_perf_stop(CephContext *cct) +{ + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; +} + diff --git a/src/rgw/rgw_perf_counters.h b/src/rgw/rgw_perf_counters.h new file mode 100644 index 00000000..1f0b6fc3 --- /dev/null +++ b/src/rgw/rgw_perf_counters.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +class CephContext; +class PerfCounters; + +extern PerfCounters *perfcounter; + +extern int rgw_perf_start(CephContext *cct); +extern void rgw_perf_stop(CephContext *cct); + +enum { + l_rgw_first = 15000, + l_rgw_req, + l_rgw_failed_req, + + l_rgw_get, + l_rgw_get_b, + l_rgw_get_lat, + + l_rgw_put, + l_rgw_put_b, + l_rgw_put_lat, + + l_rgw_qlen, + l_rgw_qactive, + + l_rgw_cache_hit, + l_rgw_cache_miss, + + l_rgw_keystone_token_cache_hit, + l_rgw_keystone_token_cache_miss, + + l_rgw_gc_retire, + + l_rgw_pubsub_event_triggered, + l_rgw_pubsub_event_lost, + l_rgw_pubsub_store_ok, + l_rgw_pubsub_store_fail, + l_rgw_pubsub_events, + l_rgw_pubsub_push_ok, + l_rgw_pubsub_push_failed, + l_rgw_pubsub_push_pending, + l_rgw_pubsub_missing_conf, + + l_rgw_last, +}; + diff --git a/src/rgw/rgw_period_history.cc b/src/rgw/rgw_period_history.cc new file mode 100644 index 00000000..cf0f3cfc --- /dev/null +++ b/src/rgw/rgw_period_history.cc @@ -0,0 
+1,354 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_period_history.h" +#include "rgw_rados.h" +#include "rgw_zone.h" + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw period history: ") + +/// an ordered history of consecutive periods +class RGWPeriodHistory::History : public bi::avl_set_base_hook<> { + public: + std::deque periods; + + epoch_t get_oldest_epoch() const { + return periods.front().get_realm_epoch(); + } + epoch_t get_newest_epoch() const { + return periods.back().get_realm_epoch(); + } + bool contains(epoch_t epoch) const { + return get_oldest_epoch() <= epoch && epoch <= get_newest_epoch(); + } + RGWPeriod& get(epoch_t epoch) { + return periods[epoch - get_oldest_epoch()]; + } + const RGWPeriod& get(epoch_t epoch) const { + return periods[epoch - get_oldest_epoch()]; + } + const std::string& get_predecessor_id() const { + return periods.front().get_predecessor(); + } +}; + +/// value comparison for avl_set +bool operator<(const RGWPeriodHistory::History& lhs, + const RGWPeriodHistory::History& rhs) +{ + return lhs.get_newest_epoch() < rhs.get_newest_epoch(); +} + +/// key-value comparison for avl_set +struct NewestEpochLess { + bool operator()(const RGWPeriodHistory::History& value, epoch_t key) const { + return value.get_newest_epoch() < key; + } +}; + + +using Cursor = RGWPeriodHistory::Cursor; + +const RGWPeriod& Cursor::get_period() const +{ + std::lock_guard lock(*mutex); + return history->get(epoch); +} +bool Cursor::has_prev() const +{ + std::lock_guard lock(*mutex); + return epoch > history->get_oldest_epoch(); +} +bool Cursor::has_next() const +{ + std::lock_guard lock(*mutex); + return epoch < history->get_newest_epoch(); +} + +bool operator==(const Cursor& lhs, const Cursor& rhs) +{ + return lhs.history == rhs.history && lhs.epoch == rhs.epoch; +} + +bool operator!=(const Cursor& 
lhs, const Cursor& rhs) +{ + return !(lhs == rhs); +} + +class RGWPeriodHistory::Impl final { + public: + Impl(CephContext* cct, Puller* puller, const RGWPeriod& current_period); + ~Impl(); + + Cursor get_current() const { return current_cursor; } + Cursor attach(RGWPeriod&& period); + Cursor insert(RGWPeriod&& period); + Cursor lookup(epoch_t realm_epoch); + + private: + /// an intrusive set of histories, ordered by their newest epoch. although + /// the newest epoch of each history is mutable, the ordering cannot change + /// because we prevent the histories from overlapping + using Set = bi::avl_set; + + /// insert the given period into the period history, creating new unconnected + /// histories or merging existing histories as necessary. expects the caller + /// to hold a lock on mutex. returns a valid cursor regardless of whether it + /// ends up in current_history, though cursors in other histories are only + /// valid within the context of the lock + Cursor insert_locked(RGWPeriod&& period); + + /// merge the periods from the src history onto the end of the dst history, + /// and return an iterator to the merged history + Set::iterator merge(Set::iterator dst, Set::iterator src); + + /// construct a Cursor object using Cursor's private constructor + Cursor make_cursor(Set::const_iterator history, epoch_t epoch); + + CephContext *const cct; + Puller *const puller; //< interface for pulling missing periods + Cursor current_cursor; //< Cursor to realm's current period + + mutable std::mutex mutex; //< protects the histories + + /// set of disjoint histories that are missing intermediate periods needed to + /// connect them together + Set histories; + + /// iterator to the history that contains the realm's current period + Set::const_iterator current_history; +}; + +RGWPeriodHistory::Impl::Impl(CephContext* cct, Puller* puller, + const RGWPeriod& current_period) + : cct(cct), puller(puller) +{ + if (!current_period.get_id().empty()) { + // copy the current 
period into a new history + auto history = new History; + history->periods.push_back(current_period); + + // insert as our current history + current_history = histories.insert(*history).first; + + // get a cursor to the current period + current_cursor = make_cursor(current_history, current_period.get_realm_epoch()); + } else { + current_history = histories.end(); + } +} + +RGWPeriodHistory::Impl::~Impl() +{ + // clear the histories and delete each entry + histories.clear_and_dispose(std::default_delete{}); +} + +Cursor RGWPeriodHistory::Impl::attach(RGWPeriod&& period) +{ + if (current_history == histories.end()) { + return Cursor{-EINVAL}; + } + + const auto epoch = period.get_realm_epoch(); + + std::string predecessor_id; + for (;;) { + { + // hold the lock over insert, and while accessing the unsafe cursor + std::lock_guard lock(mutex); + + auto cursor = insert_locked(std::move(period)); + if (!cursor) { + return cursor; + } + if (current_history->contains(epoch)) { + break; // the history is complete + } + + // take the predecessor id of the most recent history + if (cursor.get_epoch() > current_cursor.get_epoch()) { + predecessor_id = cursor.history->get_predecessor_id(); + } else { + predecessor_id = current_history->get_predecessor_id(); + } + } + + if (predecessor_id.empty()) { + lderr(cct) << "reached a period with an empty predecessor id" << dendl; + return Cursor{-EINVAL}; + } + + // pull the period outside of the lock + int r = puller->pull(predecessor_id, period); + if (r < 0) { + return Cursor{r}; + } + } + + // return a cursor to the requested period + return make_cursor(current_history, epoch); +} + +Cursor RGWPeriodHistory::Impl::insert(RGWPeriod&& period) +{ + if (current_history == histories.end()) { + return Cursor{-EINVAL}; + } + + std::lock_guard lock(mutex); + + auto cursor = insert_locked(std::move(period)); + + if (cursor.get_error()) { + return cursor; + } + // we can only provide cursors that are safe to use outside of the mutex if + // 
they're within the current_history, because other histories can disappear + // in a merge. see merge() for the special handling of current_history + if (cursor.history == &*current_history) { + return cursor; + } + return Cursor{}; +} + +Cursor RGWPeriodHistory::Impl::lookup(epoch_t realm_epoch) +{ + if (current_history != histories.end() && + current_history->contains(realm_epoch)) { + return make_cursor(current_history, realm_epoch); + } + return Cursor{}; +} + +Cursor RGWPeriodHistory::Impl::insert_locked(RGWPeriod&& period) +{ + auto epoch = period.get_realm_epoch(); + + // find the first history whose newest epoch comes at or after this period + auto i = histories.lower_bound(epoch, NewestEpochLess{}); + + if (i == histories.end()) { + // epoch is past the end of our newest history + auto last = --Set::iterator{i}; // last = i - 1 + + if (epoch == last->get_newest_epoch() + 1) { + // insert at the back of the last history + last->periods.emplace_back(std::move(period)); + return make_cursor(last, epoch); + } + + // create a new history for this period + auto history = new History; + history->periods.emplace_back(std::move(period)); + histories.insert(last, *history); + + i = Set::s_iterator_to(*history); + return make_cursor(i, epoch); + } + + if (i->contains(epoch)) { + // already resident in this history + auto& existing = i->get(epoch); + // verify that the period ids match; otherwise we've forked the history + if (period.get_id() != existing.get_id()) { + lderr(cct) << "Got two different periods, " << period.get_id() + << " and " << existing.get_id() << ", with the same realm epoch " + << epoch << "! This indicates a fork in the period history." 
<< dendl; + return Cursor{-EEXIST}; + } + // update the existing period if we got a newer period epoch + if (period.get_epoch() > existing.get_epoch()) { + existing = std::move(period); + } + return make_cursor(i, epoch); + } + + if (epoch + 1 == i->get_oldest_epoch()) { + // insert at the front of this history + i->periods.emplace_front(std::move(period)); + + // try to merge with the previous history + if (i != histories.begin()) { + auto prev = --Set::iterator{i}; + if (epoch == prev->get_newest_epoch() + 1) { + i = merge(prev, i); + } + } + return make_cursor(i, epoch); + } + + if (i != histories.begin()) { + auto prev = --Set::iterator{i}; + if (epoch == prev->get_newest_epoch() + 1) { + // insert at the back of the previous history + prev->periods.emplace_back(std::move(period)); + return make_cursor(prev, epoch); + } + } + + // create a new history for this period + auto history = new History; + history->periods.emplace_back(std::move(period)); + histories.insert(i, *history); + + i = Set::s_iterator_to(*history); + return make_cursor(i, epoch); +} + +RGWPeriodHistory::Impl::Set::iterator +RGWPeriodHistory::Impl::merge(Set::iterator dst, Set::iterator src) +{ + ceph_assert(dst->get_newest_epoch() + 1 == src->get_oldest_epoch()); + + // always merge into current_history + if (src == current_history) { + // move the periods from dst onto the front of src + src->periods.insert(src->periods.begin(), + std::make_move_iterator(dst->periods.begin()), + std::make_move_iterator(dst->periods.end())); + histories.erase_and_dispose(dst, std::default_delete{}); + return src; + } + + // move the periods from src onto the end of dst + dst->periods.insert(dst->periods.end(), + std::make_move_iterator(src->periods.begin()), + std::make_move_iterator(src->periods.end())); + histories.erase_and_dispose(src, std::default_delete{}); + return dst; +} + +Cursor RGWPeriodHistory::Impl::make_cursor(Set::const_iterator history, + epoch_t epoch) { + return Cursor{&*history, &mutex, 
epoch}; +} + + +RGWPeriodHistory::RGWPeriodHistory(CephContext* cct, Puller* puller, + const RGWPeriod& current_period) + : impl(new Impl(cct, puller, current_period)) {} + +RGWPeriodHistory::~RGWPeriodHistory() = default; + +Cursor RGWPeriodHistory::get_current() const +{ + return impl->get_current(); +} +Cursor RGWPeriodHistory::attach(RGWPeriod&& period) +{ + return impl->attach(std::move(period)); +} +Cursor RGWPeriodHistory::insert(RGWPeriod&& period) +{ + return impl->insert(std::move(period)); +} +Cursor RGWPeriodHistory::lookup(epoch_t realm_epoch) +{ + return impl->lookup(realm_epoch); +} diff --git a/src/rgw/rgw_period_history.h b/src/rgw/rgw_period_history.h new file mode 100644 index 00000000..11a5c2aa --- /dev/null +++ b/src/rgw/rgw_period_history.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_PERIOD_HISTORY_H +#define RGW_PERIOD_HISTORY_H + +#include +#include +#include +#include +#include "include/ceph_assert.h" +#include "include/types.h" + +namespace bi = boost::intrusive; + +class RGWPeriod; + +/** + * RGWPeriodHistory tracks the relative history of all inserted periods, + * coordinates the pulling of missing intermediate periods, and provides a + * Cursor object for traversing through the connected history. + */ +class RGWPeriodHistory final { + private: + /// an ordered history of consecutive periods + class History; + + // comparisons for avl_set ordering + friend bool operator<(const History& lhs, const History& rhs); + friend struct NewestEpochLess; + + class Impl; + std::unique_ptr impl; + + public: + /** + * Puller is a synchronous interface for pulling periods from the master + * zone. The abstraction exists mainly to support unit testing. 
+ */ + class Puller { + public: + virtual ~Puller() = default; + + virtual int pull(const std::string& period_id, RGWPeriod& period) = 0; + }; + + RGWPeriodHistory(CephContext* cct, Puller* puller, + const RGWPeriod& current_period); + ~RGWPeriodHistory(); + + /** + * Cursor tracks a position in the period history and allows forward and + * backward traversal. Only periods that are fully connected to the + * current_period are reachable via a Cursor, because other histories are + * temporary and can be merged away. Cursors to periods in disjoint + * histories, as provided by insert() or lookup(), are therefore invalid and + * their operator bool() will return false. + */ + class Cursor final { + public: + Cursor() = default; + explicit Cursor(int error) : error(error) {} + + int get_error() const { return error; } + + /// return false for a default-constructed or error Cursor + operator bool() const { return history != nullptr; } + + epoch_t get_epoch() const { return epoch; } + const RGWPeriod& get_period() const; + + bool has_prev() const; + bool has_next() const; + + void prev() { epoch--; } + void next() { epoch++; } + + friend bool operator==(const Cursor& lhs, const Cursor& rhs); + friend bool operator!=(const Cursor& lhs, const Cursor& rhs); + + private: + // private constructors for RGWPeriodHistory + friend class RGWPeriodHistory::Impl; + + Cursor(const History* history, std::mutex* mutex, epoch_t epoch) + : history(history), mutex(mutex), epoch(epoch) {} + + int error{0}; + const History* history{nullptr}; + std::mutex* mutex{nullptr}; + epoch_t epoch{0}; //< realm epoch of cursor position + }; + + /// return a cursor to the current period + Cursor get_current() const; + + /// build up a connected period history that covers the span between + /// current_period and the given period, reading predecessor periods or + /// fetching them from the master as necessary. 
returns a cursor at the + /// given period that can be used to traverse the current_history + Cursor attach(RGWPeriod&& period); + + /// insert the given period into an existing history, or create a new + /// unconnected history. similar to attach(), but it doesn't try to fetch + /// missing periods. returns a cursor to the inserted period iff it's in + /// the current_history + Cursor insert(RGWPeriod&& period); + + /// search for a period by realm epoch, returning a valid Cursor iff it's in + /// the current_history + Cursor lookup(epoch_t realm_epoch); +}; + +#endif // RGW_PERIOD_HISTORY_H diff --git a/src/rgw/rgw_period_puller.cc b/src/rgw/rgw_period_puller.cc new file mode 100644 index 00000000..934eb000 --- /dev/null +++ b/src/rgw/rgw_period_puller.cc @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_rest_conn.h" +#include "common/ceph_json.h" +#include "common/errno.h" + +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw period puller: ") + +namespace { + +// pull the given period over the connection +int pull_period(RGWRESTConn* conn, const std::string& period_id, + const std::string& realm_id, RGWPeriod& period) +{ + rgw_user user; + RGWEnv env; + req_info info(conn->get_ctx(), &env); + info.method = "GET"; + info.request_uri = "/admin/realm/period"; + + auto& params = info.args.get_params(); + params["realm_id"] = realm_id; + params["period_id"] = period_id; + + bufferlist data; +#define MAX_REST_RESPONSE (128 * 1024) + int r = conn->forward(user, info, nullptr, MAX_REST_RESPONSE, nullptr, &data); + if (r < 0) { + return r; + } + + JSONParser parser; + r = parser.parse(data.c_str(), data.length()); + if (r < 0) { + lderr(conn->get_ctx()) << "request failed: " << cpp_strerror(-r) << dendl; + return r; + } + + try { + decode_json_obj(period, 
&parser); + } catch (JSONDecoder::err& e) { + lderr(conn->get_ctx()) << "failed to decode JSON input: " + << e.message << dendl; + return -EINVAL; + } + return 0; +} + +} // anonymous namespace + +int RGWPeriodPuller::pull(const std::string& period_id, RGWPeriod& period) +{ + // try to read the period from rados + period.set_id(period_id); + period.set_epoch(0); + int r = period.init(store->ctx(), store->svc.sysobj); + if (r < 0) { + if (store->svc.zone->is_meta_master()) { + // can't pull if we're the master + ldout(store->ctx(), 1) << "metadata master failed to read period " + << period_id << " from local storage: " << cpp_strerror(r) << dendl; + return r; + } + ldout(store->ctx(), 14) << "pulling period " << period_id + << " from master" << dendl; + // request the period from the master zone + r = pull_period(store->svc.zone->get_master_conn(), period_id, + store->svc.zone->get_realm().get_id(), period); + if (r < 0) { + lderr(store->ctx()) << "failed to pull period " << period_id << dendl; + return r; + } + // write the period to rados + r = period.store_info(true); + if (r == -EEXIST) { + r = 0; + } else if (r < 0) { + lderr(store->ctx()) << "failed to store period " << period_id << dendl; + return r; + } + // update latest epoch + r = period.update_latest_epoch(period.get_epoch()); + if (r == -EEXIST) { + // already have this epoch (or a more recent one) + return 0; + } + if (r < 0) { + lderr(store->ctx()) << "failed to update latest_epoch for period " + << period_id << dendl; + return r; + } + // reflect period objects if this is the latest version + if (store->svc.zone->get_realm().get_current_period() == period_id) { + r = period.reflect(); + if (r < 0) { + return r; + } + } + ldout(store->ctx(), 14) << "period " << period_id + << " pulled and written to local storage" << dendl; + } else { + ldout(store->ctx(), 14) << "found period " << period_id + << " in local storage" << dendl; + } + return 0; +} diff --git a/src/rgw/rgw_period_puller.h 
b/src/rgw/rgw_period_puller.h new file mode 100644 index 00000000..9018d584 --- /dev/null +++ b/src/rgw/rgw_period_puller.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_PERIOD_PULLER_H +#define CEPH_RGW_PERIOD_PULLER_H + +#include "rgw_period_history.h" + +class RGWRados; +class RGWPeriod; + +class RGWPeriodPuller : public RGWPeriodHistory::Puller { + RGWRados *const store; + public: + explicit RGWPeriodPuller(RGWRados* store) : store(store) {} + + int pull(const std::string& period_id, RGWPeriod& period) override; +}; + +#endif // CEPH_RGW_PERIOD_PULLER_H diff --git a/src/rgw/rgw_period_pusher.cc b/src/rgw/rgw_period_pusher.cc new file mode 100644 index 00000000..e3db85df --- /dev/null +++ b/src/rgw/rgw_period_pusher.cc @@ -0,0 +1,307 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "rgw_period_pusher.h" +#include "rgw_cr_rest.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include "common/errno.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw period pusher: ") + +/// A coroutine to post the period over the given connection. +using PushCR = RGWPostRESTResourceCR; + +/// A coroutine that calls PushCR, and retries with backoff until success. 
+class PushAndRetryCR : public RGWCoroutine { + const std::string& zone; + RGWRESTConn *const conn; + RGWHTTPManager *const http; + RGWPeriod& period; + const std::string epoch; //< epoch string for params + double timeout; //< current interval between retries + const double timeout_max; //< maximum interval between retries + uint32_t counter; //< number of failures since backoff increased + + public: + PushAndRetryCR(CephContext* cct, const std::string& zone, RGWRESTConn* conn, + RGWHTTPManager* http, RGWPeriod& period) + : RGWCoroutine(cct), zone(zone), conn(conn), http(http), period(period), + epoch(std::to_string(period.get_epoch())), + timeout(cct->_conf->rgw_period_push_interval), + timeout_max(cct->_conf->rgw_period_push_interval_max), + counter(0) + {} + + int operate() override; +}; + +int PushAndRetryCR::operate() +{ + reenter(this) { + for (;;) { + yield { + ldout(cct, 10) << "pushing period " << period.get_id() + << " to " << zone << dendl; + // initialize the http params + rgw_http_param_pair params[] = { + { "period", period.get_id().c_str() }, + { "epoch", epoch.c_str() }, + { nullptr, nullptr } + }; + call(new PushCR(cct, conn, http, "/admin/realm/period", + params, period, nullptr)); + } + + // stop on success + if (get_ret_status() == 0) { + ldout(cct, 10) << "push to " << zone << " succeeded" << dendl; + return set_cr_done(); + } + + // try each endpoint in the connection before waiting + if (++counter < conn->get_endpoint_count()) + continue; + counter = 0; + + // wait with exponential backoff up to timeout_max + yield { + utime_t dur; + dur.set_from_double(timeout); + + ldout(cct, 10) << "waiting " << dur << "s for retry.." << dendl; + wait(dur); + + timeout *= 2; + if (timeout > timeout_max) + timeout = timeout_max; + } + } + } + return 0; +} + +/** + * PushAllCR is a coroutine that sends the period over all of the given + * connections, retrying until they are all marked as completed. 
+ */ +class PushAllCR : public RGWCoroutine { + RGWHTTPManager *const http; + RGWPeriod period; //< period object to push + std::map conns; //< zones that need the period + + public: + PushAllCR(CephContext* cct, RGWHTTPManager* http, RGWPeriod&& period, + std::map&& conns) + : RGWCoroutine(cct), http(http), + period(std::move(period)), + conns(std::move(conns)) + {} + + int operate() override; +}; + +int PushAllCR::operate() +{ + reenter(this) { + // spawn a coroutine to push the period over each connection + yield { + ldout(cct, 4) << "sending " << conns.size() << " periods" << dendl; + for (auto& c : conns) + spawn(new PushAndRetryCR(cct, c.first, &c.second, http, period), false); + } + // wait for all to complete + drain_all(); + return set_cr_done(); + } + return 0; +} + +/// A background thread to run the PushAllCR coroutine and exit. +class RGWPeriodPusher::CRThread { + RGWCoroutinesManager coroutines; + RGWHTTPManager http; + boost::intrusive_ptr push_all; + std::thread thread; + + public: + CRThread(CephContext* cct, RGWPeriod&& period, + std::map&& conns) + : coroutines(cct, NULL), + http(cct, coroutines.get_completion_mgr()), + push_all(new PushAllCR(cct, &http, std::move(period), std::move(conns))) + { + http.start(); + // must spawn the CR thread after start + thread = std::thread([this] { coroutines.run(push_all.get()); }); + } + ~CRThread() + { + push_all.reset(); + coroutines.stop(); + http.stop(); + if (thread.joinable()) + thread.join(); + } +}; + + +RGWPeriodPusher::RGWPeriodPusher(RGWRados* store) + : cct(store->ctx()), store(store) +{ + const auto& realm = store->svc.zone->get_realm(); + auto& realm_id = realm.get_id(); + if (realm_id.empty()) // no realm configuration + return; + + // always send out the current period on startup + RGWPeriod period; + int r = period.init(cct, store->svc.sysobj, realm_id, realm.get_name()); + if (r < 0) { + lderr(cct) << "failed to load period for realm " << realm_id << dendl; + return; + } + + std::lock_guard 
lock(mutex); + handle_notify(std::move(period)); +} + +// destructor is here because CRThread is incomplete in the header +RGWPeriodPusher::~RGWPeriodPusher() = default; + +void RGWPeriodPusher::handle_notify(RGWRealmNotify type, + bufferlist::const_iterator& p) +{ + // decode the period + RGWZonesNeedPeriod info; + try { + decode(info, p); + } catch (buffer::error& e) { + lderr(cct) << "Failed to decode the period: " << e.what() << dendl; + return; + } + + std::lock_guard lock(mutex); + + // we can't process this notification without access to our current realm + // configuration. queue it until resume() + if (store == nullptr) { + pending_periods.emplace_back(std::move(info)); + return; + } + + handle_notify(std::move(info)); +} + +// expects the caller to hold a lock on mutex +void RGWPeriodPusher::handle_notify(RGWZonesNeedPeriod&& period) +{ + if (period.get_realm_epoch() < realm_epoch) { + ldout(cct, 10) << "period's realm epoch " << period.get_realm_epoch() + << " is not newer than current realm epoch " << realm_epoch + << ", discarding update" << dendl; + return; + } + if (period.get_realm_epoch() == realm_epoch && + period.get_epoch() <= period_epoch) { + ldout(cct, 10) << "period epoch " << period.get_epoch() << " is not newer " + "than current epoch " << period_epoch << ", discarding update" << dendl; + return; + } + + // find our zonegroup in the new period + auto& zonegroups = period.get_map().zonegroups; + auto i = zonegroups.find(store->svc.zone->get_zonegroup().get_id()); + if (i == zonegroups.end()) { + lderr(cct) << "The new period does not contain my zonegroup!" << dendl; + return; + } + auto& my_zonegroup = i->second; + + // if we're not a master zone, we're not responsible for pushing any updates + if (my_zonegroup.master_zone != store->svc.zone->get_zone_params().get_id()) + return; + + // construct a map of the zones that need this period. 
the map uses the same + // keys/ordering as the zone[group] map, so we can use a hint for insertions + std::map conns; + auto hint = conns.end(); + + // are we the master zonegroup in this period? + if (period.get_map().master_zonegroup == store->svc.zone->get_zonegroup().get_id()) { + // update other zonegroup endpoints + for (auto& zg : zonegroups) { + auto& zonegroup = zg.second; + if (zonegroup.get_id() == store->svc.zone->get_zonegroup().get_id()) + continue; + if (zonegroup.endpoints.empty()) + continue; + + hint = conns.emplace_hint( + hint, std::piecewise_construct, + std::forward_as_tuple(zonegroup.get_id()), + std::forward_as_tuple(cct, store->svc.zone, zonegroup.get_id(), zonegroup.endpoints)); + } + } + + // update other zone endpoints + for (auto& z : my_zonegroup.zones) { + auto& zone = z.second; + if (zone.id == store->svc.zone->get_zone_params().get_id()) + continue; + if (zone.endpoints.empty()) + continue; + + hint = conns.emplace_hint( + hint, std::piecewise_construct, + std::forward_as_tuple(zone.id), + std::forward_as_tuple(cct, store->svc.zone, zone.id, zone.endpoints)); + } + + if (conns.empty()) { + ldout(cct, 4) << "No zones to update" << dendl; + return; + } + + realm_epoch = period.get_realm_epoch(); + period_epoch = period.get_epoch(); + + ldout(cct, 4) << "Zone master pushing period " << period.get_id() + << " epoch " << period_epoch << " to " + << conns.size() << " other zones" << dendl; + + // spawn a new coroutine thread, destroying the previous one + cr_thread.reset(new CRThread(cct, std::move(period), std::move(conns))); +} + +void RGWPeriodPusher::pause() +{ + ldout(cct, 4) << "paused for realm update" << dendl; + std::lock_guard lock(mutex); + store = nullptr; +} + +void RGWPeriodPusher::resume(RGWRados* store) +{ + std::lock_guard lock(mutex); + this->store = store; + + ldout(cct, 4) << "resume with " << pending_periods.size() + << " periods pending" << dendl; + + // process notification queue + for (auto& info : 
pending_periods) { + handle_notify(std::move(info)); + } + pending_periods.clear(); +} diff --git a/src/rgw/rgw_period_pusher.h b/src/rgw/rgw_period_pusher.h new file mode 100644 index 00000000..fdadd226 --- /dev/null +++ b/src/rgw/rgw_period_pusher.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_PERIOD_PUSHER_H +#define RGW_PERIOD_PUSHER_H + +#include +#include +#include + +#include "rgw_realm_reloader.h" + +class RGWRados; +class RGWPeriod; + +// RGWRealmNotify payload for push coordination +using RGWZonesNeedPeriod = RGWPeriod; + +/** + * RGWPeriodPusher coordinates with other nodes via the realm watcher to manage + * the responsibility for pushing period updates to other zones or zonegroups. + */ +class RGWPeriodPusher final : public RGWRealmWatcher::Watcher, + public RGWRealmReloader::Pauser { + public: + explicit RGWPeriodPusher(RGWRados* store); + ~RGWPeriodPusher() override; + + /// respond to realm notifications by pushing new periods to other zones + void handle_notify(RGWRealmNotify type, bufferlist::const_iterator& p) override; + + /// avoid accessing RGWRados while dynamic reconfiguration is in progress. 
+ /// notifications will be enqueued until resume() + void pause() override; + + /// continue processing notifications with a new RGWRados instance + void resume(RGWRados* store) override; + + private: + void handle_notify(RGWZonesNeedPeriod&& period); + + CephContext *const cct; + RGWRados* store; + + std::mutex mutex; + epoch_t realm_epoch{0}; //< the current realm epoch being sent + epoch_t period_epoch{0}; //< the current period epoch being sent + + /// while paused for reconfiguration, we need to queue up notifications + std::vector pending_periods; + + class CRThread; //< contains thread, coroutine manager, http manager + std::unique_ptr cr_thread; //< thread to run the push coroutines +}; + +#endif // RGW_PERIOD_PUSHER_H diff --git a/src/rgw/rgw_policy_s3.cc b/src/rgw/rgw_policy_s3.cc new file mode 100644 index 00000000..17a4e953 --- /dev/null +++ b/src/rgw/rgw_policy_s3.cc @@ -0,0 +1,303 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "common/ceph_json.h" +#include "rgw_policy_s3.h" +#include "rgw_common.h" +#include "rgw_crypt_sanitize.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +class RGWPolicyCondition { +protected: + string v1; + string v2; + + virtual bool check(const string& first, const string& second, string& err_msg) = 0; + +public: + virtual ~RGWPolicyCondition() {} + + void set_vals(const string& _v1, const string& _v2) { + v1 = _v1; + v2 = _v2; + } + + bool check(RGWPolicyEnv *env, map& checked_vars, string& err_msg) { + string first, second; + env->get_value(v1, first, checked_vars); + env->get_value(v2, second, checked_vars); + dout(1) << "policy condition check " << v1 << " [" + << rgw::crypt_sanitize::s3_policy{v1, first} + << "] " << v2 << " [" + << rgw::crypt_sanitize::s3_policy{v2, second} + << "]" << dendl; + bool ret = check(first, second, err_msg); + if (!ret) { + err_msg.append(": "); + err_msg.append(v1); + 
err_msg.append(", "); + err_msg.append(v2); + } + return ret; + } + +}; + + +class RGWPolicyCondition_StrEqual : public RGWPolicyCondition { +protected: + bool check(const string& first, const string& second, string& msg) override { + bool ret = first.compare(second) == 0; + if (!ret) { + msg = "Policy condition failed: eq"; + } + return ret; + } +}; + +class RGWPolicyCondition_StrStartsWith : public RGWPolicyCondition { +protected: + bool check(const string& first, const string& second, string& msg) override { + bool ret = first.compare(0, second.size(), second) == 0; + if (!ret) { + msg = "Policy condition failed: starts-with"; + } + return ret; + } +}; + +void RGWPolicyEnv::add_var(const string& name, const string& value) +{ + vars[name] = value; +} + +bool RGWPolicyEnv::get_var(const string& name, string& val) +{ + map::iterator iter = vars.find(name); + if (iter == vars.end()) + return false; + + val = iter->second; + + return true; +} + +bool RGWPolicyEnv::get_value(const string& s, string& val, map& checked_vars) +{ + if (s.empty() || s[0] != '$') { + val = s; + return true; + } + + const string& var = s.substr(1); + checked_vars[var] = true; + + return get_var(var, val); +} + + +bool RGWPolicyEnv::match_policy_vars(map& policy_vars, string& err_msg) +{ + map::iterator iter; + string ignore_prefix = "x-ignore-"; + for (iter = vars.begin(); iter != vars.end(); ++iter) { + const string& var = iter->first; + if (strncasecmp(ignore_prefix.c_str(), var.c_str(), ignore_prefix.size()) == 0) + continue; + if (policy_vars.count(var) == 0) { + err_msg = "Policy missing condition: "; + err_msg.append(iter->first); + dout(1) << "env var missing in policy: " << iter->first << dendl; + return false; + } + } + return true; +} + +RGWPolicy::~RGWPolicy() +{ + list::iterator citer; + for (citer = conditions.begin(); citer != conditions.end(); ++citer) { + RGWPolicyCondition *cond = *citer; + delete cond; + } +} + +int RGWPolicy::set_expires(const string& e) +{ + struct tm t; 
+ if (!parse_iso8601(e.c_str(), &t)) + return -EINVAL; + + expires = internal_timegm(&t); + + return 0; +} + +int RGWPolicy::add_condition(const string& op, const string& first, const string& second, string& err_msg) +{ + RGWPolicyCondition *cond = NULL; + if (stringcasecmp(op, "eq") == 0) { + cond = new RGWPolicyCondition_StrEqual; + } else if (stringcasecmp(op, "starts-with") == 0) { + cond = new RGWPolicyCondition_StrStartsWith; + } else if (stringcasecmp(op, "content-length-range") == 0) { + off_t min, max; + int r = stringtoll(first, &min); + if (r < 0) { + err_msg = "Bad content-length-range param"; + dout(0) << "bad content-length-range param: " << first << dendl; + return r; + } + + r = stringtoll(second, &max); + if (r < 0) { + err_msg = "Bad content-length-range param"; + dout(0) << "bad content-length-range param: " << second << dendl; + return r; + } + + if (min > min_length) + min_length = min; + + if (max < max_length) + max_length = max; + + return 0; + } + + if (!cond) { + err_msg = "Invalid condition: "; + err_msg.append(op); + dout(0) << "invalid condition: " << op << dendl; + return -EINVAL; + } + + cond->set_vals(first, second); + + conditions.push_back(cond); + + return 0; +} + +int RGWPolicy::check(RGWPolicyEnv *env, string& err_msg) +{ + uint64_t now = ceph_clock_now().sec(); + if (expires <= now) { + dout(0) << "NOTICE: policy calculated as expired: " << expiration_str << dendl; + err_msg = "Policy expired"; + return -EACCES; // change to condition about expired policy following S3 + } + + list >::iterator viter; + for (viter = var_checks.begin(); viter != var_checks.end(); ++viter) { + pair& p = *viter; + const string& name = p.first; + const string& check_val = p.second; + string val; + if (!env->get_var(name, val)) { + dout(20) << " policy check failed, variable not found: '" << name << "'" << dendl; + err_msg = "Policy check failed, variable not found: "; + err_msg.append(name); + return -EACCES; + } + + set_var_checked(name); + + 
dout(20) << "comparing " << name << " [" << val << "], " << check_val << dendl; + if (val.compare(check_val) != 0) { + err_msg = "Policy check failed, variable not met condition: "; + err_msg.append(name); + dout(1) << "policy check failed, val=" << val << " != " << check_val << dendl; + return -EACCES; + } + } + + list::iterator citer; + for (citer = conditions.begin(); citer != conditions.end(); ++citer) { + RGWPolicyCondition *cond = *citer; + if (!cond->check(env, checked_vars, err_msg)) { + return -EACCES; + } + } + + if (!env->match_policy_vars(checked_vars, err_msg)) { + dout(1) << "missing policy condition" << dendl; + return -EACCES; + } + return 0; +} + + +int RGWPolicy::from_json(bufferlist& bl, string& err_msg) +{ + JSONParser parser; + + if (!parser.parse(bl.c_str(), bl.length())) { + err_msg = "Malformed JSON"; + dout(0) << "malformed json" << dendl; + return -EINVAL; + } + + // as no time was included in the request, we hope that the user has included a short timeout + JSONObjIter iter = parser.find_first("expiration"); + if (iter.end()) { + err_msg = "Policy missing expiration"; + dout(0) << "expiration not found" << dendl; + return -EINVAL; // change to a "no expiration" error following S3 + } + + JSONObj *obj = *iter; + expiration_str = obj->get_data(); + int r = set_expires(expiration_str); + if (r < 0) { + err_msg = "Failed to parse policy expiration"; + return r; + } + + iter = parser.find_first("conditions"); + if (iter.end()) { + err_msg = "Policy missing conditions"; + dout(0) << "conditions not found" << dendl; + return -EINVAL; // change to a "no conditions" error following S3 + } + + obj = *iter; + + iter = obj->find_first(); + for (; !iter.end(); ++iter) { + JSONObj *child = *iter; + dout(20) << "data=" << child->get_data() << dendl; + dout(20) << "is_object=" << child->is_object() << dendl; + dout(20) << "is_array=" << child->is_array() << dendl; + JSONObjIter citer = child->find_first(); + if (child->is_array()) { + vector v; + int i; 
+ for (i = 0; !citer.end() && i < 3; ++citer, ++i) { + JSONObj *o = *citer; + v.push_back(o->get_data()); + } + if (i != 3 || !citer.end()) { /* we expect exactly 3 arguments here */ + err_msg = "Bad condition array, expecting 3 arguments"; + return -EINVAL; + } + + int r = add_condition(v[0], v[1], v[2], err_msg); + if (r < 0) + return r; + } else if (!citer.end()) { + JSONObj *c = *citer; + dout(20) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl; + + add_simple_check(c->get_name(), c->get_data()); + } else { + return -EINVAL; + } + } + return 0; +} diff --git a/src/rgw/rgw_policy_s3.h b/src/rgw/rgw_policy_s3.h new file mode 100644 index 00000000..9768055d --- /dev/null +++ b/src/rgw/rgw_policy_s3.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_POLICY_H +#define CEPH_RGW_POLICY_H + +#include + +#include +#include +#include + +#include "include/utime.h" + +#include "rgw_string.h" + + +class RGWPolicyEnv { + std::map vars; + +public: + void add_var(const string& name, const string& value); + bool get_var(const string& name, string& val); + bool get_value(const string& s, string& val, std::map& checked_vars); + bool match_policy_vars(map& policy_vars, string& err_msg); +}; + +class RGWPolicyCondition; + + +class RGWPolicy { + uint64_t expires; + string expiration_str; + std::list conditions; + std::list > var_checks; + std::map checked_vars; + +public: + off_t min_length; + off_t max_length; + + RGWPolicy() : expires(0), min_length(0), max_length(LLONG_MAX) {} + ~RGWPolicy(); + + int set_expires(const string& e); + + void set_var_checked(const std::string& var) { + checked_vars[var] = true; + } + + int add_condition(const std::string& op, const std::string& first, const std::string& second, string& err_msg); + void add_simple_check(const std::string& var, const std::string& value) { + var_checks.push_back(pair(var, value)); + } + + int 
check(RGWPolicyEnv *env, string& err_msg); + int from_json(bufferlist& bl, string& err_msg); +}; +#endif diff --git a/src/rgw/rgw_process.cc b/src/rgw/rgw_process.cc new file mode 100644 index 00000000..ad43b5d3 --- /dev/null +++ b/src/rgw/rgw_process.cc @@ -0,0 +1,323 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" +#include "common/Throttle.h" +#include "common/WorkQueue.h" +#include "include/scope_guard.h" + +#include "rgw_rados.h" +#include "rgw_dmclock_scheduler.h" +#include "rgw_rest.h" +#include "rgw_frontend.h" +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_loadgen.h" +#include "rgw_client_io.h" +#include "rgw_opa.h" +#include "rgw_perf_counters.h" + +#include "services/svc_zone_utils.h" + +#define dout_subsys ceph_subsys_rgw + +using rgw::dmclock::Scheduler; + +void RGWProcess::RGWWQ::_dump_queue() +{ + if (!g_conf()->subsys.should_gather()) { + return; + } + deque::iterator iter; + if (process->m_req_queue.empty()) { + dout(20) << "RGWWQ: empty" << dendl; + return; + } + dout(20) << "RGWWQ:" << dendl; + for (iter = process->m_req_queue.begin(); + iter != process->m_req_queue.end(); ++iter) { + dout(20) << "req: " << hex << *iter << dec << dendl; + } +} /* RGWProcess::RGWWQ::_dump_queue */ + +auto schedule_request(Scheduler *scheduler, req_state *s, RGWOp *op) +{ + using rgw::dmclock::SchedulerCompleter; + if (!scheduler) + return std::make_pair(0,SchedulerCompleter{}); + + const auto client = op->dmclock_client(); + const auto cost = op->dmclock_cost(); + ldpp_dout(op,10) << "scheduling with dmclock client=" << static_cast(client) + << " cost=" << cost << dendl; + return scheduler->schedule_request(client, {}, + req_state::Clock::to_double(s->time), + cost, + s->yield); +} + +bool RGWProcess::RGWWQ::_enqueue(RGWRequest* req) { + process->m_req_queue.push_back(req); + perfcounter->inc(l_rgw_qlen); + dout(20) << "enqueued request req=" << hex << req << 
dec << dendl; + _dump_queue(); + return true; +} + +RGWRequest* RGWProcess::RGWWQ::_dequeue() { + if (process->m_req_queue.empty()) + return NULL; + RGWRequest *req = process->m_req_queue.front(); + process->m_req_queue.pop_front(); + dout(20) << "dequeued request req=" << hex << req << dec << dendl; + _dump_queue(); + perfcounter->inc(l_rgw_qlen, -1); + return req; +} + +void RGWProcess::RGWWQ::_process(RGWRequest *req, ThreadPool::TPHandle &) { + perfcounter->inc(l_rgw_qactive); + process->handle_request(req); + process->req_throttle.put(1); + perfcounter->inc(l_rgw_qactive, -1); +} + +int rgw_process_authenticated(RGWHandler_REST * const handler, + RGWOp *& op, + RGWRequest * const req, + req_state * const s, + const bool skip_retarget) +{ + ldpp_dout(op, 2) << "init permissions" << dendl; + int ret = handler->init_permissions(op); + if (ret < 0) { + return ret; + } + + /** + * Only some accesses support website mode, and website mode does NOT apply + * if you are using the REST endpoint either (ergo, no authenticated access) + */ + if (! skip_retarget) { + ldpp_dout(op, 2) << "recalculating target" << dendl; + ret = handler->retarget(op, &op); + if (ret < 0) { + return ret; + } + req->op = op; + } else { + ldpp_dout(op, 2) << "retargeting skipped because of SubOp mode" << dendl; + } + + /* If necessary extract object ACL and put them into req_state. 
*/ + ldpp_dout(op, 2) << "reading permissions" << dendl; + ret = handler->read_permissions(op); + if (ret < 0) { + return ret; + } + + ldpp_dout(op, 2) << "init op" << dendl; + ret = op->init_processing(); + if (ret < 0) { + return ret; + } + + ldpp_dout(op, 2) << "verifying op mask" << dendl; + ret = op->verify_op_mask(); + if (ret < 0) { + return ret; + } + + /* Check if OPA is used to authorize requests */ + if (s->cct->_conf->rgw_use_opa_authz) { + ret = rgw_opa_authorize(op, s); + if (ret < 0) { + return ret; + } + } + + ldpp_dout(op, 2) << "verifying op permissions" << dendl; + ret = op->verify_permission(); + if (ret < 0) { + if (s->system_request) { + dout(2) << "overriding permissions due to system operation" << dendl; + } else if (s->auth.identity->is_admin_of(s->user->user_id)) { + dout(2) << "overriding permissions due to admin operation" << dendl; + } else { + return ret; + } + } + + ldpp_dout(op, 2) << "verifying op params" << dendl; + ret = op->verify_params(); + if (ret < 0) { + return ret; + } + + ldpp_dout(op, 2) << "pre-executing" << dendl; + op->pre_exec(); + + ldpp_dout(op, 2) << "executing" << dendl; + op->execute(); + + ldpp_dout(op, 2) << "completing" << dendl; + op->complete(); + + return 0; +} + +int process_request(RGWRados* const store, + RGWREST* const rest, + RGWRequest* const req, + const std::string& frontend_prefix, + const rgw_auth_registry_t& auth_registry, + RGWRestfulIO* const client_io, + OpsLogSocket* const olog, + optional_yield yield, + rgw::dmclock::Scheduler *scheduler, + int* http_ret) +{ + int ret = client_io->init(g_ceph_context); + + dout(1) << "====== starting new request req=" << hex << req << dec + << " =====" << dendl; + perfcounter->inc(l_rgw_req); + + RGWEnv& rgw_env = client_io->get_env(); + + RGWUserInfo userinfo; + + struct req_state rstate(g_ceph_context, &rgw_env, &userinfo, req->id); + struct req_state *s = &rstate; + + RGWObjectCtx rados_ctx(store, s); + s->obj_ctx = &rados_ctx; + + auto sysobj_ctx = 
store->svc.sysobj->init_obj_ctx(); + s->sysobj_ctx = &sysobj_ctx; + + if (ret < 0) { + s->cio = client_io; + abort_early(s, nullptr, ret, nullptr); + return ret; + } + + s->req_id = store->svc.zone_utils->unique_id(req->id); + s->trans_id = store->svc.zone_utils->unique_trans_id(req->id); + s->host_id = store->host_id; + s->yield = yield; + + ldpp_dout(s, 2) << "initializing for trans_id = " << s->trans_id << dendl; + + RGWOp* op = nullptr; + int init_error = 0; + bool should_log = false; + RGWRESTMgr *mgr; + RGWHandler_REST *handler = rest->get_handler(store, s, + auth_registry, + frontend_prefix, + client_io, &mgr, &init_error); + rgw::dmclock::SchedulerCompleter c; + if (init_error != 0) { + abort_early(s, nullptr, init_error, nullptr); + goto done; + } + dout(10) << "handler=" << typeid(*handler).name() << dendl; + + should_log = mgr->get_logging(); + + ldpp_dout(s, 2) << "getting op " << s->op << dendl; + op = handler->get_op(store); + if (!op) { + abort_early(s, NULL, -ERR_METHOD_NOT_ALLOWED, handler); + goto done; + } + std::tie(ret,c) = schedule_request(scheduler, s, op); + if (ret < 0) { + if (ret == -EAGAIN) { + ret = -ERR_RATE_LIMITED; + } + ldpp_dout(op,0) << "Scheduling request failed with " << ret << dendl; + abort_early(s, op, ret, handler); + goto done; + } + req->op = op; + dout(10) << "op=" << typeid(*op).name() << dendl; + + s->op_type = op->get_type(); + + try { + ldpp_dout(op, 2) << "verifying requester" << dendl; + ret = op->verify_requester(auth_registry); + if (ret < 0) { + dout(10) << "failed to authorize request" << dendl; + abort_early(s, op, ret, handler); + goto done; + } + + /* FIXME: remove this after switching all handlers to the new authentication + * infrastructure. 
*/ + if (nullptr == s->auth.identity) { + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } + + ldpp_dout(op, 2) << "normalizing buckets and tenants" << dendl; + ret = handler->postauth_init(); + if (ret < 0) { + dout(10) << "failed to run post-auth init" << dendl; + abort_early(s, op, ret, handler); + goto done; + } + + if (s->user->suspended) { + dout(10) << "user is suspended, uid=" << s->user->user_id << dendl; + abort_early(s, op, -ERR_USER_SUSPENDED, handler); + goto done; + } + + ret = rgw_process_authenticated(handler, op, req, s); + if (ret < 0) { + abort_early(s, op, ret, handler); + goto done; + } + } catch (const ceph::crypto::DigestException& e) { + dout(0) << "authentication failed" << e.what() << dendl; + abort_early(s, op, -ERR_INVALID_SECRET_KEY, handler); + } + +done: + try { + client_io->complete_request(); + } catch (rgw::io::Exception& e) { + dout(0) << "ERROR: client_io->complete_request() returned " + << e.what() << dendl; + } + + if (should_log) { + rgw_log_op(store, rest, s, (op ? op->name() : "unknown"), olog); + } + + if (http_ret != nullptr) { + *http_ret = s->err.http_ret; + } + int op_ret = 0; + if (op) { + op_ret = op->get_ret(); + ldpp_dout(op, 2) << "op status=" << op_ret << dendl; + ldpp_dout(op, 2) << "http status=" << s->err.http_ret << dendl; + } else { + ldpp_dout(s, 2) << "http status=" << s->err.http_ret << dendl; + } + if (handler) + handler->put_op(op); + rest->put_handler(handler); + + dout(1) << "====== req done req=" << hex << req << dec + << " op status=" << op_ret + << " http_status=" << s->err.http_ret + << " latency=" << s->time_elapsed() + << " ======" + << dendl; + + return (ret < 0 ? 
ret : s->err.ret); +} /* process_request */ diff --git a/src/rgw/rgw_process.h b/src/rgw/rgw_process.h new file mode 100644 index 00000000..c3b27bd7 --- /dev/null +++ b/src/rgw/rgw_process.h @@ -0,0 +1,199 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_PROCESS_H +#define RGW_PROCESS_H + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_auth_registry.h" +#include "rgw_user.h" +#include "rgw_op.h" +#include "rgw_rest.h" + +#include "include/ceph_assert.h" + +#include "common/WorkQueue.h" +#include "common/Throttle.h" + +#include + +#if !defined(dout_subsys) +#define dout_subsys ceph_subsys_rgw +#define def_dout_subsys +#endif + +#define dout_context g_ceph_context + +extern void signal_shutdown(); + +namespace rgw::dmclock { + class Scheduler; +} + +struct RGWProcessEnv { + RGWRados *store; + RGWREST *rest; + OpsLogSocket *olog; + int port; + std::string uri_prefix; + std::shared_ptr auth_registry; +}; + +class RGWFrontendConfig; + +class RGWProcess { + deque m_req_queue; +protected: + CephContext *cct; + RGWRados* store; + rgw_auth_registry_ptr_t auth_registry; + OpsLogSocket* olog; + ThreadPool m_tp; + Throttle req_throttle; + RGWREST* rest; + RGWFrontendConfig* conf; + int sock_fd; + std::string uri_prefix; + + struct RGWWQ : public ThreadPool::WorkQueue { + RGWProcess* process; + RGWWQ(RGWProcess* p, time_t timeout, time_t suicide_timeout, ThreadPool* tp) + : ThreadPool::WorkQueue("RGWWQ", timeout, suicide_timeout, + tp), process(p) {} + + bool _enqueue(RGWRequest* req) override; + + void _dequeue(RGWRequest* req) override { + ceph_abort(); + } + + bool _empty() override { + return process->m_req_queue.empty(); + } + + RGWRequest* _dequeue() override; + + using ThreadPool::WorkQueue::_process; + + void _process(RGWRequest *req, ThreadPool::TPHandle &) override; + + void _dump_queue(); + + void _clear() override { + ceph_assert(process->m_req_queue.empty()); 
+ } + } req_wq; + +public: + RGWProcess(CephContext* const cct, + RGWProcessEnv* const pe, + const int num_threads, + RGWFrontendConfig* const conf) + : cct(cct), + store(pe->store), + auth_registry(pe->auth_registry), + olog(pe->olog), + m_tp(cct, "RGWProcess::m_tp", "tp_rgw_process", num_threads), + req_throttle(cct, "rgw_ops", num_threads * 2), + rest(pe->rest), + conf(conf), + sock_fd(-1), + uri_prefix(pe->uri_prefix), + req_wq(this, g_conf()->rgw_op_thread_timeout, + g_conf()->rgw_op_thread_suicide_timeout, &m_tp) { + } + + virtual ~RGWProcess() = default; + + virtual void run() = 0; + virtual void handle_request(RGWRequest *req) = 0; + + void pause() { + m_tp.pause(); + } + + void unpause_with_new_config(RGWRados* const store, + rgw_auth_registry_ptr_t auth_registry) { + this->store = store; + this->auth_registry = std::move(auth_registry); + m_tp.unpause(); + } + + void close_fd() { + if (sock_fd >= 0) { + ::close(sock_fd); + sock_fd = -1; + } + } +}; /* RGWProcess */ + +class RGWFCGXProcess : public RGWProcess { + int max_connections; +public: + + /* have a bit more connections than threads so that requests are + * still accepted even if we're still processing older requests */ + RGWFCGXProcess(CephContext* const cct, + RGWProcessEnv* const pe, + const int num_threads, + RGWFrontendConfig* const conf) + : RGWProcess(cct, pe, num_threads, conf), + max_connections(num_threads + (num_threads >> 3)) { + } + + void run() override; + void handle_request(RGWRequest* req) override; +}; + +class RGWProcessControlThread : public Thread { + RGWProcess *pprocess; +public: + explicit RGWProcessControlThread(RGWProcess *_pprocess) : pprocess(_pprocess) {} + + void *entry() override { + pprocess->run(); + return NULL; + } +}; + +class RGWLoadGenProcess : public RGWProcess { + RGWAccessKey access_key; +public: + RGWLoadGenProcess(CephContext* cct, RGWProcessEnv* pe, int num_threads, + RGWFrontendConfig* _conf) : + RGWProcess(cct, pe, num_threads, _conf) {} + void run() 
override; + void checkpoint(); + void handle_request(RGWRequest* req) override; + void gen_request(const string& method, const string& resource, + int content_length, std::atomic* fail_flag); + + void set_access_key(RGWAccessKey& key) { access_key = key; } +}; +/* process stream request */ +extern int process_request(RGWRados* store, + RGWREST* rest, + RGWRequest* req, + const std::string& frontend_prefix, + const rgw_auth_registry_t& auth_registry, + RGWRestfulIO* client_io, + OpsLogSocket* olog, + optional_yield y, + rgw::dmclock::Scheduler *scheduler, + int* http_ret = nullptr); + +extern int rgw_process_authenticated(RGWHandler_REST* handler, + RGWOp*& op, + RGWRequest* req, + req_state* s, + bool skip_retarget = false); + +#if defined(def_dout_subsys) +#undef def_dout_subsys +#undef dout_subsys +#endif +#undef dout_context + +#endif /* RGW_PROCESS_H */ diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc new file mode 100644 index 00000000..f3ff342f --- /dev/null +++ b/src/rgw/rgw_pubsub.cc @@ -0,0 +1,872 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "services/svc_zone.h" +#include "rgw_b64.h" +#include "rgw_rados.h" +#include "rgw_pubsub.h" +#include "rgw_tools.h" +#include "rgw_xml.h" +#include "rgw_arn.h" +#include "rgw_pubsub_push.h" +#include "rgw_rados.h" +#include <regex> +#include <algorithm> + +#define dout_subsys ceph_subsys_rgw + +void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) { /* id becomes "SEC.USEC.HASH" (zero-padded sec/usec); left untouched if formatting fails */ + char buf[64]; + const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str()); + if (len > 0) { + id.assign(buf, std::min(static_cast<size_t>(len), sizeof(buf) - 1)); /* snprintf returns the untruncated length; clamp so a long hash never reads past buf */ + } +} + +bool rgw_s3_key_filter::decode_xml(XMLObj* obj) { + XMLObjIter iter = obj->find("FilterRule"); + XMLObj *o; + + const auto throw_if_missing = true; + auto prefix_not_set = true; + auto suffix_not_set = true; + auto regex_not_set = true; + std::string name; + + while ((o = iter.get_next())) { +
RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing); + if (name == "prefix" && prefix_not_set) { + prefix_not_set = false; + RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing); + } else if (name == "suffix" && suffix_not_set) { + suffix_not_set = false; + RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing); + } else if (name == "regex" && regex_not_set) { + regex_not_set = false; + RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing); + } else { + throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'"); + } + } + return true; +} + +void rgw_s3_key_filter::dump_xml(Formatter *f) const { + if (!prefix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "prefix", f); + ::encode_xml("Value", prefix_rule, f); + f->close_section(); + } + if (!suffix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "suffix", f); + ::encode_xml("Value", suffix_rule, f); + f->close_section(); + } + if (!regex_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "regex", f); + ::encode_xml("Value", regex_rule, f); + f->close_section(); + } +} + +bool rgw_s3_key_filter::has_content() const { + return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty()); +} + +bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) { + kvl.clear(); + XMLObjIter iter = obj->find("FilterRule"); + XMLObj *o; + + const auto throw_if_missing = true; + + std::string key; + std::string value; + + while ((o = iter.get_next())) { + RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing); + RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing); + kvl.emplace(key, value); + } + return true; +} + +void rgw_s3_key_value_filter::dump_xml(Formatter *f) const { + for (const auto& key_value : kvl) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", key_value.first, f); + ::encode_xml("Value", key_value.second, f); + 
f->close_section(); + } +} + +bool rgw_s3_key_value_filter::has_content() const { + return !kvl.empty(); +} + +bool rgw_s3_filter::decode_xml(XMLObj* obj) { + RGWXMLDecoder::decode_xml("S3Key", key_filter, obj); + RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj); + RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj); + return true; +} + +void rgw_s3_filter::dump_xml(Formatter *f) const { + if (key_filter.has_content()) { + ::encode_xml("S3Key", key_filter, f); + } + if (metadata_filter.has_content()) { + ::encode_xml("S3Metadata", metadata_filter, f); + } + if (tag_filter.has_content()) { + ::encode_xml("S3Tags", tag_filter, f); + } +} + +bool rgw_s3_filter::has_content() const { + return key_filter.has_content() || + metadata_filter.has_content() || + tag_filter.has_content(); +} + +bool match(const rgw_s3_key_filter& filter, const std::string& key) { + const auto key_size = key.size(); + const auto prefix_size = filter.prefix_rule.size(); + if (prefix_size != 0) { + // prefix rule exists + if (prefix_size > key_size) { + // if prefix is longer than key, we fail + return false; + } + if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) { + return false; + } + } + const auto suffix_size = filter.suffix_rule.size(); + if (suffix_size != 0) { + // suffix rule exists + if (suffix_size > key_size) { + // if suffix is longer than key, we fail + return false; + } + if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) { + return false; + } + } + if (!filter.regex_rule.empty()) { + // TODO add regex caching in the filter + const std::regex base_regex(filter.regex_rule); + if (!std::regex_match(key, base_regex)) { + return false; + } + } + return true; +} + +bool match(const rgw_s3_key_value_filter& filter, const KeyValueList& kvl) { + // all filter pairs must exist with the same value in the object's metadata/tags + // object metadata/tags may include items not in the filter + return
std::includes(kvl.begin(), kvl.end(), filter.kvl.begin(), filter.kvl.end()); +} + +bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) { + // if event list exists, and none of the events in the list matches the event type, filter the message + if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) { + return false; + } + return true; +} + +void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) { + l.clear(); + + XMLObjIter iter = obj->find(name); + XMLObj *o; + + while ((o = iter.get_next())) { + std::string val; + decode_xml_obj(val, o); + l.push_back(rgw::notify::from_string(val)); + } +} + +bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) { + const auto throw_if_missing = true; + RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing); + + RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing); + + RGWXMLDecoder::decode_xml("Filter", filter, obj); + + do_decode_xml_obj(events, "Event", obj); + if (events.empty()) { + // if no events are provided, we assume all events + events.push_back(rgw::notify::ObjectCreated); + events.push_back(rgw::notify::ObjectRemoved); + } + return true; +} + +void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const { + ::encode_xml("Id", id, f); + ::encode_xml("Topic", topic_arn.c_str(), f); + if (filter.has_content()) { + ::encode_xml("Filter", filter, f); + } + for (const auto& event : events) { + ::encode_xml("Event", rgw::notify::to_string(event), f); + } +} + +bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) { + do_decode_xml_obj(list, "TopicConfiguration", obj); + if (list.empty()) { + throw RGWXMLDecoder::err("at least one 'TopicConfiguration' must exist"); + } + return true; +} + +rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) : + id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), 
filter(topic_filter.s3_filter) {} + +void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const { + do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f); +} + +void rgw_pubsub_s3_record::dump(Formatter *f) const { + encode_json("eventVersion", eventVersion, f); + encode_json("eventSource", eventSource, f); + encode_json("awsRegion", awsRegion, f); + utime_t ut(eventTime); + encode_json("eventTime", ut, f); + encode_json("eventName", eventName, f); + { + Formatter::ObjectSection s(*f, "userIdentity"); + encode_json("principalId", userIdentity, f); + } + { + Formatter::ObjectSection s(*f, "requestParameters"); + encode_json("sourceIPAddress", sourceIPAddress, f); + } + { + Formatter::ObjectSection s(*f, "responseElements"); + encode_json("x-amz-request-id", x_amz_request_id, f); + encode_json("x-amz-id-2", x_amz_id_2, f); + } + { + Formatter::ObjectSection s(*f, "s3"); + encode_json("s3SchemaVersion", s3SchemaVersion, f); + encode_json("configurationId", configurationId, f); + { + Formatter::ObjectSection sub_s(*f, "bucket"); + encode_json("name", bucket_name, f); + { + Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity"); + encode_json("principalId", bucket_ownerIdentity, f); + } + encode_json("arn", bucket_arn, f); + encode_json("id", bucket_id, f); + } + { + Formatter::ObjectSection sub_s(*f, "object"); + encode_json("key", object_key, f); + encode_json("size", object_size, f); + encode_json("etag", object_etag, f); + encode_json("versionId", object_versionId, f); + encode_json("sequencer", object_sequencer, f); + encode_json("metadata", x_meta_map, f); + encode_json("tags", tags, f); + } + } + encode_json("eventId", id, f); + encode_json("opaqueData", opaque_data, f); +} + +void rgw_pubsub_event::dump(Formatter *f) const +{ + encode_json("id", id, f); + encode_json("event", event_name, f); + utime_t ut(timestamp); + encode_json("timestamp", ut, f); + encode_json("info", info, f); +} + +void rgw_pubsub_topic::dump(Formatter *f) const +{ 
+ encode_json("user", user, f); + encode_json("name", name, f); + encode_json("dest", dest, f); + encode_json("arn", arn, f); + encode_json("opaqueData", opaque_data, f); +} + +void rgw_pubsub_topic::dump_xml(Formatter *f) const +{ + encode_xml("User", user, f); + encode_xml("Name", name, f); + encode_xml("EndPoint", dest, f); + encode_xml("TopicArn", arn, f); + encode_xml("OpaqueData", opaque_data, f); +} + +void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f) +{ + f->open_array_section(name); + for (auto iter = l.cbegin(); iter != l.cend(); ++iter) { + f->dump_string("obj", rgw::notify::to_ceph_string(*iter)); + } + f->close_section(); +} + +void rgw_pubsub_topic_filter::dump(Formatter *f) const +{ + encode_json("topic", topic, f); + encode_json("events", events, f); +} + +void rgw_pubsub_topic_subs::dump(Formatter *f) const +{ + encode_json("topic", topic, f); + encode_json("subs", subs, f); +} + +void rgw_pubsub_bucket_topics::dump(Formatter *f) const +{ + Formatter::ArraySection s(*f, "topics"); + for (auto& t : topics) { + encode_json(t.first.c_str(), t.second, f); + } +} + +void rgw_pubsub_user_topics::dump(Formatter *f) const +{ + Formatter::ArraySection s(*f, "topics"); + for (auto& t : topics) { + encode_json(t.first.c_str(), t.second, f); + } +} + +void rgw_pubsub_user_topics::dump_xml(Formatter *f) const +{ + for (auto& t : topics) { + encode_xml("member", t.second.topic, f); + } +} + +void rgw_pubsub_sub_dest::dump(Formatter *f) const +{ + encode_json("bucket_name", bucket_name, f); + encode_json("oid_prefix", oid_prefix, f); + encode_json("push_endpoint", push_endpoint, f); + encode_json("push_endpoint_args", push_endpoint_args, f); + encode_json("push_endpoint_topic", arn_topic, f); +} + +void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const +{ + encode_xml("EndpointAddress", push_endpoint, f); + encode_xml("EndpointArgs", push_endpoint_args, f); + encode_xml("EndpointTopic", arn_topic, f); +} + +void 
rgw_pubsub_sub_config::dump(Formatter *f) const +{ + encode_json("user", user, f); + encode_json("name", name, f); + encode_json("topic", topic, f); + encode_json("dest", dest, f); + encode_json("s3_id", s3_id, f); +} + + +int RGWUserPubSub::remove(const rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker) +{ + int ret = rgw_delete_system_obj(store, obj.pool, obj.oid, objv_tracker); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWUserPubSub::read_user_topics(rgw_pubsub_user_topics *result, RGWObjVersionTracker *objv_tracker) +{ + int ret = read(user_meta_obj, result, objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWUserPubSub::write_user_topics(const rgw_pubsub_user_topics& topics, RGWObjVersionTracker *objv_tracker) +{ + int ret = write(user_meta_obj, topics, objv_tracker); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWUserPubSub::get_user_topics(rgw_pubsub_user_topics *result) +{ + return read_user_topics(result, nullptr); +} + +int RGWUserPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker) +{ + int ret = ps->read(bucket_meta_obj, result, objv_tracker); + if (ret < 0 && ret != -ENOENT) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWUserPubSub::Bucket::write_topics(const rgw_pubsub_bucket_topics& topics, RGWObjVersionTracker *objv_tracker) +{ + int ret = ps->write(bucket_meta_obj, topics, objv_tracker); + if (ret < 0) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWUserPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result) +{ + return read_topics(result, nullptr); +} + +int 
RGWUserPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result) +{ + rgw_pubsub_user_topics topics; + int ret = get_user_topics(&topics); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + auto iter = topics.topics.find(name); + if (iter == topics.topics.end()) { + ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl; + return -ENOENT; + } + + *result = iter->second; + return 0; +} + +int RGWUserPubSub::get_topic(const string& name, rgw_pubsub_topic *result) +{ + rgw_pubsub_user_topics topics; + int ret = get_user_topics(&topics); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + auto iter = topics.topics.find(name); + if (iter == topics.topics.end()) { + ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl; + return -ENOENT; + } + + *result = iter->second.topic; + return 0; +} + +int RGWUserPubSub::Bucket::create_notification(const string& topic_name, const rgw::notify::EventTypeList& events) { + return create_notification(topic_name, events, std::nullopt, ""); +} + +int RGWUserPubSub::Bucket::create_notification(const string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name) { + rgw_pubsub_topic_subs user_topic_info; + RGWRados *store = ps->store; + + int ret = ps->get_topic(topic_name, &user_topic_info); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl; + return ret; + } + ldout(store->ctx(), 20) << "successfully read topic '" << topic_name << "' info" << dendl; + + RGWObjVersionTracker objv_tracker; + rgw_pubsub_bucket_topics bucket_topics; + + ret = read_topics(&bucket_topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics from bucket '" << + bucket.name << "': ret=" << ret << dendl; + return ret; + } + 
ldout(store->ctx(), 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" << + bucket.name << "'" << dendl; + + auto& topic_filter = bucket_topics.topics[topic_name]; + topic_filter.topic = user_topic_info.topic; + topic_filter.events = events; + topic_filter.s3_id = notif_name; + if (s3_filter) { + topic_filter.s3_filter = *s3_filter; + } + + ret = write_topics(bucket_topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl; + return ret; + } + + ldout(store->ctx(), 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl; + + return 0; +} + +int RGWUserPubSub::Bucket::remove_notification(const string& topic_name) +{ + rgw_pubsub_topic_subs user_topic_info; + RGWRados *store = ps->store; + + int ret = ps->get_topic(topic_name, &user_topic_info); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topic info: ret=" << ret << dendl; + return ret; + } + + RGWObjVersionTracker objv_tracker; + rgw_pubsub_bucket_topics bucket_topics; + + ret = read_topics(&bucket_topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; + return ret; + } + + bucket_topics.topics.erase(topic_name); + + ret = write_topics(bucket_topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWUserPubSub::create_topic(const string& name) { + return create_topic(name, rgw_pubsub_sub_dest(), "", ""); +} + +int RGWUserPubSub::create_topic(const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data) { + RGWObjVersionTracker objv_tracker; + rgw_pubsub_user_topics topics; + + int ret = read_user_topics(&topics, &objv_tracker); + if (ret < 0 && ret != -ENOENT) { + 
// -ENOENT is not an error: if no topics exist yet, we create the first one + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + rgw_pubsub_topic_subs& new_topic = topics.topics[name]; + new_topic.topic.user = user; + new_topic.topic.name = name; + new_topic.topic.dest = dest; + new_topic.topic.arn = arn; + new_topic.topic.opaque_data = opaque_data; + + ret = write_user_topics(topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWUserPubSub::remove_topic(const string& name) +{ + RGWObjVersionTracker objv_tracker; + rgw_pubsub_user_topics topics; + + int ret = read_user_topics(&topics, &objv_tracker); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } else if (ret == -ENOENT) { + // it's not an error if no topics exist, just a no-op + ldout(store->ctx(), 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl; + return 0; + } + + topics.topics.erase(name); + + ret = write_user_topics(topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWUserPubSub::Sub::read_sub(rgw_pubsub_sub_config *result, RGWObjVersionTracker *objv_tracker) +{ + int ret = ps->read(sub_meta_obj, result, objv_tracker); + if (ret < 0 && ret != -ENOENT) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to read subscription info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWUserPubSub::Sub::write_sub(const rgw_pubsub_sub_config& sub_conf, RGWObjVersionTracker *objv_tracker) +{ + int ret = ps->write(sub_meta_obj, sub_conf, objv_tracker); + if (ret < 0) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to write subscription info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int
RGWUserPubSub::Sub::remove_sub(RGWObjVersionTracker *objv_tracker) +{ + int ret = ps->remove(sub_meta_obj, objv_tracker); + if (ret < 0) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to remove subscription info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWUserPubSub::Sub::get_conf(rgw_pubsub_sub_config *result) +{ + return read_sub(result, nullptr); +} + +int RGWUserPubSub::Sub::subscribe(const string& topic, const rgw_pubsub_sub_dest& dest, const std::string& s3_id) +{ + RGWObjVersionTracker user_objv_tracker; + rgw_pubsub_user_topics topics; + RGWRados *store = ps->store; + + int ret = ps->read_user_topics(&topics, &user_objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret != -ENOENT ? ret : -EINVAL; + } + + auto iter = topics.topics.find(topic); + if (iter == topics.topics.end()) { + ldout(store->ctx(), 1) << "ERROR: cannot add subscription to topic: topic not found" << dendl; + return -EINVAL; + } + + auto& t = iter->second; + + rgw_pubsub_sub_config sub_conf; + + sub_conf.user = ps->user; + sub_conf.name = sub; + sub_conf.topic = topic; + sub_conf.dest = dest; + sub_conf.s3_id = s3_id; + + t.subs.insert(sub); + + ret = ps->write_user_topics(topics, &user_objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + ret = write_sub(sub_conf, nullptr); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to write subscription info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWUserPubSub::Sub::unsubscribe(const string& _topic) +{ + string topic = _topic; + RGWObjVersionTracker sobjv_tracker; + RGWRados *store = ps->store; + + if (topic.empty()) { + rgw_pubsub_sub_config sub_conf; + int ret = read_sub(&sub_conf, &sobjv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read subscription info: ret=" << ret << dendl; + return ret; + } 
+ topic = sub_conf.topic; + } + + RGWObjVersionTracker objv_tracker; + rgw_pubsub_user_topics topics; + + int ret = ps->read_user_topics(&topics, &objv_tracker); + if (ret < 0) { + // not an error - could be that topic was already deleted + ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl; + } else { + auto iter = topics.topics.find(topic); + if (iter != topics.topics.end()) { + auto& t = iter->second; + + t.subs.erase(sub); + + ret = ps->write_user_topics(topics, &objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + } + } + + ret = remove_sub(&sobjv_tracker); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to delete subscription info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +template +void RGWUserPubSub::SubWithEvents::list_events_result::dump(Formatter *f) const +{ + encode_json("next_marker", next_marker, f); + encode_json("is_truncated", is_truncated, f); + + Formatter::ArraySection s(*f, EventType::json_type_plural); + for (auto& event : events) { + encode_json("", event, f); + } +} + +template +int RGWUserPubSub::SubWithEvents::list_events(const string& marker, int max_events) +{ + RGWRados *store = ps->store; + rgw_pubsub_sub_config sub_conf; + int ret = get_conf(&sub_conf); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read sub config: ret=" << ret << dendl; + return ret; + } + + RGWBucketInfo bucket_info; + string tenant; + RGWSysObjectCtx obj_ctx(store->svc.sysobj->init_obj_ctx()); + ret = store->get_bucket_info(obj_ctx, tenant, sub_conf.dest.bucket_name, bucket_info, nullptr, nullptr); + if (ret == -ENOENT) { + list.is_truncated = false; + return 0; + } + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read bucket info for events bucket: bucket=" << sub_conf.dest.bucket_name << " ret=" << ret << dendl; + return ret; + } + + RGWRados::Bucket target(store, bucket_info); + 
RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = sub_conf.dest.oid_prefix; + list_op.params.marker = marker; + + std::vector objs; + + ret = list_op.list_objects(max_events, &objs, nullptr, &list.is_truncated); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to list bucket: bucket=" << sub_conf.dest.bucket_name << " ret=" << ret << dendl; + return ret; + } + if (list.is_truncated) { + list.next_marker = list_op.get_next_marker().name; + } + + for (auto& obj : objs) { + bufferlist bl64; + bufferlist bl; + bl64.append(obj.meta.user_data); + try { + bl.decode_base64(bl64); + } catch (buffer::error& err) { + ldout(store->ctx(), 1) << "ERROR: failed to event (not a valid base64)" << dendl; + continue; + } + EventType event; + + auto iter = bl.cbegin(); + try { + decode(event, iter); + } catch (buffer::error& err) { + ldout(store->ctx(), 1) << "ERROR: failed to decode event" << dendl; + continue; + }; + + list.events.push_back(event); + } + return 0; +} + +template +int RGWUserPubSub::SubWithEvents::remove_event(const string& event_id) +{ + RGWRados *store = ps->store; + rgw_pubsub_sub_config sub_conf; + int ret = get_conf(&sub_conf); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read sub config: ret=" << ret << dendl; + return ret; + } + + RGWBucketInfo bucket_info; + string tenant; + RGWSysObjectCtx sysobj_ctx(store->svc.sysobj->init_obj_ctx()); + ret = store->get_bucket_info(sysobj_ctx, tenant, sub_conf.dest.bucket_name, bucket_info, nullptr, nullptr); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read bucket info for events bucket: bucket=" << sub_conf.dest.bucket_name << " ret=" << ret << dendl; + return ret; + } + + rgw_bucket& bucket = bucket_info.bucket; + + RGWObjectCtx obj_ctx(store); + rgw_obj obj(bucket, sub_conf.dest.oid_prefix + event_id); + + obj_ctx.set_atomic(obj); + + RGWRados::Object del_target(store, bucket_info, obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + 
del_op.params.bucket_owner = bucket_info.owner; + del_op.params.versioning_status = bucket_info.versioning_status(); + + ret = del_op.delete_obj(); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to remove event (obj=" << obj << "): ret=" << ret << dendl; + } + return 0; +} + +template +void RGWUserPubSub::SubWithEvents::dump(Formatter* f) const { + list.dump(f); +} + +// explicit instantiation for the only two possible types +// no need to move implementation to header +template class RGWUserPubSub::SubWithEvents; +template class RGWUserPubSub::SubWithEvents; + diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h new file mode 100644 index 00000000..d7b1758a --- /dev/null +++ b/src/rgw/rgw_pubsub.h @@ -0,0 +1,812 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#ifndef CEPH_RGW_PUBSUB_H +#define CEPH_RGW_PUBSUB_H + +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_zone.h" +#include "rgw_rados.h" +#include "rgw_notify_event_type.h" +#include "services/svc_sys_obj.h" +#include + +class XMLObj; + +struct rgw_s3_key_filter { + std::string prefix_rule; + std::string suffix_rule; + std::string regex_rule; + + bool has_content() const; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(prefix_rule, bl); + encode(suffix_rule, bl); + encode(regex_rule, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(prefix_rule, bl); + decode(suffix_rule, bl); + decode(regex_rule, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_key_filter) + +using KeyValueList = boost::container::flat_map; + +struct rgw_s3_key_value_filter { + KeyValueList kvl; + + bool has_content() const; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(kvl, bl); 
+ ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(kvl, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_key_value_filter) + +struct rgw_s3_filter { + rgw_s3_key_filter key_filter; + rgw_s3_key_value_filter metadata_filter; + rgw_s3_key_value_filter tag_filter; + + bool has_content() const; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(key_filter, bl); + encode(metadata_filter, bl); + encode(tag_filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(key_filter, bl); + decode(metadata_filter, bl); + if (struct_v >= 2) { + decode(tag_filter, bl); + } + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_filter) + +using OptionalFilter = std::optional; + +struct rgw_pubsub_topic_filter; +/* S3 notification configuration + * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html + + + + + + suffix + jpg + + + + + + + + + + + + + + + + notification1 + arn:aws:sns::: + s3:ObjectCreated:* + s3:ObjectRemoved:* + + +*/ +struct rgw_pubsub_s3_notification { + // notification id + std::string id; + // types of events + rgw::notify::EventTypeList events; + // topic ARN + std::string topic_arn; + // filter rules + rgw_s3_filter filter; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + rgw_pubsub_s3_notification() = default; + // construct from rgw_pubsub_topic_filter (used by get/list notifications) + explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter); +}; + +// return true if the key matches the prefix/suffix/regex rules of the key filter +bool match(const rgw_s3_key_filter& filter, const std::string& key); +// return true if the key matches the metadata/tags rules of the metadata/tags filter +bool match(const rgw_s3_key_value_filter& filter, const 
KeyValueList& kvl); +// return true if the event type matches (equal or contained in) one of the events in the list +bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event); + +struct rgw_pubsub_s3_notifications { + std::list list; + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +/* S3 event records structure + * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html +{ +"Records":[ + { + "eventVersion":"" + "eventSource":"", + "awsRegion":"", + "eventTime":"", + "eventName":"", + "userIdentity":{ + "principalId":"" + }, + "requestParameters":{ + "sourceIPAddress":"" + }, + "responseElements":{ + "x-amz-request-id":"", + "x-amz-id-2":"" + }, + "s3":{ + "s3SchemaVersion":"1.0", + "configurationId":"", + "bucket":{ + "name":"", + "ownerIdentity":{ + "principalId":"" + }, + "arn":"" + "id": "" + }, + "object":{ + "key":"", + "size": , + "eTag":"", + "versionId":"", + "sequencer": "", + "metadata": "" + "tags": "" + } + }, + "eventId":"", + } +] +}*/ + +struct rgw_pubsub_s3_record { + constexpr static const char* const json_type_plural = "Records"; + std::string eventVersion = "2.2"; + // aws:s3 + std::string eventSource = "ceph:s3"; + // zonegroup + std::string awsRegion; + // time of the request + ceph::real_time eventTime; + // type of the event + std::string eventName; + // user that sent the request + std::string userIdentity; + // IP address of source of the request (not implemented) + std::string sourceIPAddress; + // request ID (not implemented) + std::string x_amz_request_id; + // radosgw that received the request + std::string x_amz_id_2; + std::string s3SchemaVersion = "1.0"; + // ID received in the notification request + std::string configurationId; + // bucket name + std::string bucket_name; + // bucket owner + std::string bucket_ownerIdentity; + // bucket ARN + std::string bucket_arn; + // object key + std::string object_key; + // object size + uint64_t 
object_size = 0; + // object etag + std::string object_etag; + // object version id bucket is versioned + std::string object_versionId; + // hexadecimal value used to determine event order for specific key + std::string object_sequencer; + // this is an rgw extension (not S3 standard) + // used to store a globally unique identifier of the event + // that could be used for acking or any other identification of the event + std::string id; + // this is an rgw extension holding the internal bucket id + std::string bucket_id; + // meta data + KeyValueList x_meta_map; + // tags + KeyValueList tags; + // opaque data received from the topic + // could be used to identify the gateway + std::string opaque_data; + + void encode(bufferlist& bl) const { + ENCODE_START(4, 1, bl); + encode(eventVersion, bl); + encode(eventSource, bl); + encode(awsRegion, bl); + encode(eventTime, bl); + encode(eventName, bl); + encode(userIdentity, bl); + encode(sourceIPAddress, bl); + encode(x_amz_request_id, bl); + encode(x_amz_id_2, bl); + encode(s3SchemaVersion, bl); + encode(configurationId, bl); + encode(bucket_name, bl); + encode(bucket_ownerIdentity, bl); + encode(bucket_arn, bl); + encode(object_key, bl); + encode(object_size, bl); + encode(object_etag, bl); + encode(object_versionId, bl); + encode(object_sequencer, bl); + encode(id, bl); + encode(bucket_id, bl); + encode(x_meta_map, bl); + encode(tags, bl); + encode(opaque_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(4, bl); + decode(eventVersion, bl); + decode(eventSource, bl); + decode(awsRegion, bl); + decode(eventTime, bl); + decode(eventName, bl); + decode(userIdentity, bl); + decode(sourceIPAddress, bl); + decode(x_amz_request_id, bl); + decode(x_amz_id_2, bl); + decode(s3SchemaVersion, bl); + decode(configurationId, bl); + decode(bucket_name, bl); + decode(bucket_ownerIdentity, bl); + decode(bucket_arn, bl); + decode(object_key, bl); + decode(object_size, bl); + 
decode(object_etag, bl); + decode(object_versionId, bl); + decode(object_sequencer, bl); + decode(id, bl); + if (struct_v >= 2) { + decode(bucket_id, bl); + decode(x_meta_map, bl); + } + if (struct_v >= 3) { + decode(tags, bl); + } + if (struct_v >= 4) { + decode(opaque_data, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_s3_record) + +struct rgw_pubsub_event { + constexpr static const char* const json_type_plural = "events"; + std::string id; + std::string event_name; + std::string source; + ceph::real_time timestamp; + JSONFormattable info; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(event_name, bl); + encode(source, bl); + encode(timestamp, bl); + encode(info, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(event_name, bl); + decode(source, bl); + decode(timestamp, bl); + decode(info, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_event) + +// set a unique ID for an event/record based on the object hash and timestamp +void set_event_id(std::string& id, const std::string& hash, const utime_t& ts); + +struct rgw_pubsub_sub_dest { + std::string bucket_name; + std::string oid_prefix; + std::string push_endpoint; + std::string push_endpoint_args; + std::string arn_topic; + bool stored_secret = false; + + void encode(bufferlist& bl) const { + ENCODE_START(4, 1, bl); + encode(bucket_name, bl); + encode(oid_prefix, bl); + encode(push_endpoint, bl); + encode(push_endpoint_args, bl); + encode(arn_topic, bl); + encode(stored_secret, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(4, bl); + decode(bucket_name, bl); + decode(oid_prefix, bl); + decode(push_endpoint, bl); + if (struct_v >= 2) { + decode(push_endpoint_args, bl); + } + if (struct_v >= 3) { + decode(arn_topic, bl); + } + if
(struct_v >= 4) { + decode(stored_secret, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest) + +struct rgw_pubsub_sub_config { + rgw_user user; + std::string name; + std::string topic; + rgw_pubsub_sub_dest dest; + std::string s3_id; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(user, bl); + encode(name, bl); + encode(topic, bl); + encode(dest, bl); + encode(s3_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(user, bl); + decode(name, bl); + decode(topic, bl); + decode(dest, bl); + if (struct_v >= 2) { + decode(s3_id, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_sub_config) + +struct rgw_pubsub_topic { + rgw_user user; + std::string name; + rgw_pubsub_sub_dest dest; + std::string arn; + std::string opaque_data; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(user, bl); + encode(name, bl); + encode(dest, bl); + encode(arn, bl); + encode(opaque_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(user, bl); + decode(name, bl); + if (struct_v >= 2) { + decode(dest, bl); + decode(arn, bl); + } + if (struct_v >= 3) { + decode(opaque_data, bl); + } + DECODE_FINISH(bl); + } + + string to_str() const { // key used for ordering: the user string, a slash, then the topic name + return user.to_str() + "/" + name; + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + + bool operator<(const rgw_pubsub_topic& t) const { // orders topics by their to_str() key + return to_str().compare(t.to_str()) < 0; // compare() is three-way (negative, zero, positive); only a negative result means "less than" + } +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic) + +struct rgw_pubsub_topic_subs { + rgw_pubsub_topic topic; + std::set subs; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topic, bl); + encode(subs, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); +
decode(topic, bl); + decode(subs, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs) + +struct rgw_pubsub_topic_filter { + rgw_pubsub_topic topic; + rgw::notify::EventTypeList events; + std::string s3_id; + rgw_s3_filter s3_filter; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(topic, bl); + // events are stored as a vector of strings + std::vector tmp_events; + const auto converter = s3_id.empty() ? rgw::notify::to_ceph_string : rgw::notify::to_string; + std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), converter); + encode(tmp_events, bl); + encode(s3_id, bl); + encode(s3_filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(topic, bl); + // events are stored as a vector of strings + events.clear(); + std::vector tmp_events; + decode(tmp_events, bl); + std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string); + if (struct_v >= 2) { + decode(s3_id, bl); + } + if (struct_v >= 3) { + decode(s3_filter, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter) + +struct rgw_pubsub_bucket_topics { + std::map topics; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topics, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topics, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics) + +struct rgw_pubsub_user_topics { + std::map topics; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topics, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topics, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; 
+}; +WRITE_CLASS_ENCODER(rgw_pubsub_user_topics) + +static std::string pubsub_user_oid_prefix = "pubsub.user."; + +class RGWUserPubSub +{ + friend class Bucket; + + RGWRados *store; + rgw_user user; + RGWSysObjectCtx obj_ctx; + + rgw_raw_obj user_meta_obj; + + std::string user_meta_oid() const { + return pubsub_user_oid_prefix + user.to_str(); + } + + std::string bucket_meta_oid(const rgw_bucket& bucket) const { + return pubsub_user_oid_prefix + user.to_str() + ".bucket." + bucket.name + "/" + bucket.bucket_id; + } + + std::string sub_meta_oid(const string& name) const { + return pubsub_user_oid_prefix + user.to_str() + ".sub." + name; + } + + template + int read(const rgw_raw_obj& obj, T *data, RGWObjVersionTracker *objv_tracker); + + template + int write(const rgw_raw_obj& obj, const T& info, RGWObjVersionTracker *obj_tracker); + + int remove(const rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker); + + int read_user_topics(rgw_pubsub_user_topics *result, RGWObjVersionTracker *objv_tracker); + int write_user_topics(const rgw_pubsub_user_topics& topics, RGWObjVersionTracker *objv_tracker); + +public: + RGWUserPubSub(RGWRados *_store, const rgw_user& _user) : store(_store), + user(_user), + obj_ctx(store->svc.sysobj->init_obj_ctx()) { + get_user_meta_obj(&user_meta_obj); + } + + class Bucket { + friend class RGWUserPubSub; + RGWUserPubSub *ps; + rgw_bucket bucket; + rgw_raw_obj bucket_meta_obj; + + // read the list of topics associated with a bucket and populate into result + // use version tracker to enforce atomicity between read/write + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker); + // set the list of topics associated with a bucket + // use version tracker to enforce atomicity between read/write + // return 0 on success, error code otherwise + int write_topics(const rgw_pubsub_bucket_topics& topics, RGWObjVersionTracker
*objv_tracker); + public: + Bucket(RGWUserPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) { + ps->get_bucket_meta_obj(bucket, &bucket_meta_obj); + } + + // read the list of topics associated with a bucket and populate into result + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int get_topics(rgw_pubsub_bucket_topics *result); + // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket + // assigning a notification name is optional (needed for S3 compatible notifications) + // if the topic already exists on the bucket, the filter event list may be updated + // for S3 compliant notifications the version with: s3_filter and notif_name should be used + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int create_notification(const string& topic_name, const rgw::notify::EventTypeList& events); + int create_notification(const string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name); + // remove a topic and filter from bucket + // if the topic does not exist on the bucket it is a no-op (considered success) + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int remove_notification(const string& topic_name); + }; + + // base class for subscription + class Sub { + friend class RGWUserPubSub; + protected: + RGWUserPubSub* const ps; + const std::string sub; + rgw_raw_obj sub_meta_obj; + + int read_sub(rgw_pubsub_sub_config *result, RGWObjVersionTracker *objv_tracker); + int write_sub(const rgw_pubsub_sub_config& sub_conf, RGWObjVersionTracker *objv_tracker); + int remove_sub(RGWObjVersionTracker *objv_tracker); + public: + Sub(RGWUserPubSub *_ps, const std::string& _sub) : ps(_ps), sub(_sub) { + ps->get_sub_meta_obj(sub, &sub_meta_obj); + } + + virtual ~Sub() = default; + + int subscribe(const string& topic_name, const
rgw_pubsub_sub_dest& dest, const std::string& s3_id=""); + int unsubscribe(const string& topic_name); + int get_conf(rgw_pubsub_sub_config* result); + + static const int DEFAULT_MAX_EVENTS = 100; + // the following virtual methods should only be called on derived classes (the base versions assert) + virtual int list_events(const string& marker, int max_events) {ceph_assert(false);} + virtual int remove_event(const string& event_id) {ceph_assert(false);} + virtual void dump(Formatter* f) const {ceph_assert(false);} + }; + + // subscription with templated list of events to support both S3 compliant and Ceph specific events + template + class SubWithEvents : public Sub { + private: + struct list_events_result { + std::string next_marker; + bool is_truncated{false}; + void dump(Formatter *f) const; + std::vector events; + } list; + + public: + SubWithEvents(RGWUserPubSub *_ps, const string& _sub) : Sub(_ps, _sub) {} + + virtual ~SubWithEvents() = default; + + int list_events(const string& marker, int max_events) override; + int remove_event(const string& event_id) override; + void dump(Formatter* f) const override; + }; + + using BucketRef = std::shared_ptr; + using SubRef = std::shared_ptr; + + BucketRef get_bucket(const rgw_bucket& bucket) { + return std::make_shared(this, bucket); + } + + SubRef get_sub(const string& sub) { + return std::make_shared(this, sub); + } + + SubRef get_sub_with_events(const string& sub) { + auto tmpsub = Sub(this, sub); + rgw_pubsub_sub_config conf; + if (tmpsub.get_conf(&conf) < 0) { + return nullptr; + } + if (conf.s3_id.empty()) { + return std::make_shared>(this, sub); + } + return std::make_shared>(this, sub); + } + + void get_user_meta_obj(rgw_raw_obj *obj) const { + *obj = rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, user_meta_oid()); + } + + void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const { + *obj = rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, bucket_meta_oid(bucket)); + } + + void get_sub_meta_obj(const string& name,
rgw_raw_obj *obj) const { + *obj = rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sub_meta_oid(name)); + } + + // get all topics defined for the user and populate them into "result" + // return 0 on success or if no topics exist, error code otherwise + int get_user_topics(rgw_pubsub_user_topics *result); + // get a topic with its subscriptions by its name and populate it into "result" + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int get_topic(const string& name, rgw_pubsub_topic_subs *result); + // get a topic by its name and populate it into "result" + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int get_topic(const string& name, rgw_pubsub_topic *result); + // create a topic with a name only + // if the topic already exists it is a no-op (considered success) + // return 0 on success, error code otherwise + int create_topic(const string& name); + // create a topic with push destination information and ARN + // if the topic already exists the destination and ARN values may be updated (considered success) + // return 0 on success, error code otherwise + int create_topic(const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data); + // remove a topic according to its name + // if the topic does not exist it is a no-op (considered success) + // return 0 on success, error code otherwise + int remove_topic(const string& name); +}; + +template +int RGWUserPubSub::read(const rgw_raw_obj& obj, T *result, RGWObjVersionTracker *objv_tracker) +{ + bufferlist bl; + int ret = rgw_get_system_obj(store, obj_ctx, + obj.pool, obj.oid, + bl, + objv_tracker, + nullptr, nullptr, nullptr); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(*result, iter); + } catch (buffer::error& err) { + return -EIO; + } + + return 0; +} + +template +int RGWUserPubSub::write(const rgw_raw_obj& obj, const
T& info, RGWObjVersionTracker *objv_tracker) +{ + bufferlist bl; + encode(info, bl); + + int ret = rgw_put_system_obj(store, obj.pool, obj.oid, + bl, false, objv_tracker, + real_time()); + if (ret < 0) { + return ret; + } + + obj_ctx.invalidate(const_cast(obj)); + return 0; +} + +#endif diff --git a/src/rgw/rgw_pubsub_push.cc b/src/rgw/rgw_pubsub_push.cc new file mode 100644 index 00000000..a1719d9f --- /dev/null +++ b/src/rgw/rgw_pubsub_push.cc @@ -0,0 +1,749 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_pubsub_push.h" +#include +#include +#include +#include "include/buffer_fwd.h" +#include "common/Formatter.h" +#include "common/async/completion.h" +#include "rgw_common.h" +#include "rgw_data_sync.h" +#include "rgw_pubsub.h" +#include "acconfig.h" +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +#include "rgw_amqp.h" +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +#include "rgw_kafka.h" +#endif +#include +#include +#include +#include "rgw_perf_counters.h" + +using namespace rgw; + +template +std::string json_format_pubsub_event(const EventType& event) { + std::stringstream ss; + JSONFormatter f(false); + { + Formatter::ObjectSection s(f, EventType::json_type_plural); + { + Formatter::ArraySection s(f, EventType::json_type_plural); + encode_json("", event, &f); + } + } + f.flush(ss); + return ss.str(); +} + +class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint { +private: + const std::string endpoint; + std::string str_ack_level; + typedef unsigned ack_level_t; + ack_level_t ack_level; // TODO: not used for now + bool verify_ssl; + static const ack_level_t ACK_LEVEL_ANY = 0; + static const ack_level_t ACK_LEVEL_NON_ERROR = 1; + + // PostCR implements async execution of RGWPostHTTPData via coroutine + class PostCR : public RGWPostHTTPData, public RGWSimpleCoroutine { + private: + RGWDataSyncEnv* const sync_env; + bufferlist read_bl; + const ack_level_t ack_level; + + public: + PostCR(const std::string& 
_post_data, + RGWDataSyncEnv* _sync_env, + const std::string& endpoint, + ack_level_t _ack_level, + bool verify_ssl) : + RGWPostHTTPData(_sync_env->cct, "POST", endpoint, &read_bl, verify_ssl), + RGWSimpleCoroutine(_sync_env->cct), + sync_env(_sync_env), + ack_level (_ack_level) { + // ctor also set the data to send + set_post_data(_post_data); + set_send_length(_post_data.length()); + } + + // send message to endpoint + int send_request() override { + init_new_io(this); + const auto rc = sync_env->http_manager->add_request(this); + if (rc < 0) { + return rc; + } + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); + return 0; + } + + // wait for reply + int request_complete() override { + if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); + if (ack_level == ACK_LEVEL_ANY) { + return 0; + } else if (ack_level == ACK_LEVEL_NON_ERROR) { + // TODO check result code to be non-error + } else { + // TODO: check that result code == ack_level + } + return -1; + } + }; + +public: + RGWPubSubHTTPEndpoint(const std::string& _endpoint, + const RGWHTTPArgs& args) : endpoint(_endpoint) { + bool exists; + + str_ack_level = args.get("http-ack-level", &exists); + if (!exists || str_ack_level == "any") { + // "any" is default + ack_level = ACK_LEVEL_ANY; + } else if (str_ack_level == "non-error") { + ack_level = ACK_LEVEL_NON_ERROR; + } else { + ack_level = std::atoi(str_ack_level.c_str()); + if (ack_level < 100 || ack_level >= 600) { + throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level); + } + } + + auto str_verify_ssl = args.get("verify-ssl", &exists); + boost::algorithm::to_lower(str_verify_ssl); + // verify server certificate by default + if (!exists || str_verify_ssl == "true") { + verify_ssl = true; + } else if (str_verify_ssl == "false") { + verify_ssl = false; + } else { + throw configuration_error("HTTP/S: verify-ssl must be true/false, not: " + str_verify_ssl); + } + } + + RGWCoroutine* send_to_completion_async(const 
rgw_pubsub_event& event, RGWDataSyncEnv* env) override { + return new PostCR(json_format_pubsub_event(event), env, endpoint, ack_level, verify_ssl); + } + + RGWCoroutine* send_to_completion_async(const rgw_pubsub_s3_record& record, RGWDataSyncEnv* env) override { + return new PostCR(json_format_pubsub_event(record), env, endpoint, ack_level, verify_ssl); + } + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_record& record, optional_yield y) override { + bufferlist read_bl; + RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl); + const auto post_data = json_format_pubsub_event(record); + request.set_post_data(post_data); + request.set_send_length(post_data.length()); + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); + const auto rc = RGWHTTP::process(&request, y); + if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); + // TODO: use read_bl to process return code and handle according to ack level + return rc; + } + + std::string to_str() const override { + std::string str("HTTP/S Endpoint"); + str += "\nURI: " + endpoint; + str += "\nAck Level: " + str_ack_level; + str += (verify_ssl ? 
"\nverify SSL" : "\ndon't verify SSL"); + return str; + + } +}; + +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint { +private: + enum class ack_level_t { + None, + Broker, + Routable + }; + CephContext* const cct; + const std::string endpoint; + const std::string topic; + const std::string exchange; + amqp::connection_ptr_t conn; + ack_level_t ack_level; + std::string str_ack_level; + + static std::string get_exchange(const RGWHTTPArgs& args) { + bool exists; + const auto exchange = args.get("amqp-exchange", &exists); + if (!exists) { + throw configuration_error("AMQP: missing amqp-exchange"); + } + return exchange; + } + + // NoAckPublishCR implements async amqp publishing via coroutine + // This coroutine ends when it send the message and does not wait for an ack + class NoAckPublishCR : public RGWCoroutine { + private: + const std::string topic; + amqp::connection_ptr_t conn; + const std::string message; + + public: + NoAckPublishCR(CephContext* cct, + const std::string& _topic, + amqp::connection_ptr_t& _conn, + const std::string& _message) : + RGWCoroutine(cct), + topic(_topic), conn(_conn), message(_message) {} + + // send message to endpoint, without waiting for reply + int operate() override { + reenter(this) { + const auto rc = amqp::publish(conn, topic, message); + if (rc < 0) { + return set_cr_error(rc); + } + return set_cr_done(); + } + return 0; + } + }; + + // AckPublishCR implements async amqp publishing via coroutine + // This coroutine ends when an ack is received from the borker + // note that it does not wait for an ack fron the end client + class AckPublishCR : public RGWCoroutine, public RGWIOProvider { + private: + const std::string topic; + amqp::connection_ptr_t conn; + const std::string message; + [[maybe_unused]] const ack_level_t ack_level; // TODO not used for now + + public: + AckPublishCR(CephContext* cct, + const std::string& _topic, + amqp::connection_ptr_t& _conn, + const std::string& 
_message, + ack_level_t _ack_level) : + RGWCoroutine(cct), + topic(_topic), conn(_conn), message(_message), ack_level(_ack_level) {} + + // send message to endpoint, waiting for reply + int operate() override { + reenter(this) { + yield { + init_new_io(this); + const auto rc = amqp::publish_with_confirm(conn, + topic, + message, + std::bind(&AckPublishCR::request_complete, this, std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return set_cr_error(rc); + } + // mark as blocked on the amqp answer + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); + io_block(); + return 0; + } + return set_cr_done(); + } + return 0; + } + + // callback invoked from the amqp manager thread when ack/nack is received + void request_complete(int status) { + ceph_assert(!is_done()); + if (status != 0) { + // server replied with a nack + set_cr_error(status); + } + io_complete(); + if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); + } + + // TODO: why are these mandatory in RGWIOProvider? 
+ void set_io_user_info(void *_user_info) override { + } + + void *get_io_user_info() override { + return nullptr; + } + }; + +public: + RGWPubSubAMQPEndpoint(const std::string& _endpoint, + const std::string& _topic, + const RGWHTTPArgs& args, + CephContext* _cct) : + cct(_cct), + endpoint(_endpoint), + topic(_topic), + exchange(get_exchange(args)), + conn(amqp::connect(endpoint, exchange)) { + if (!conn) { + throw configuration_error("AMQP: failed to create connection to: " + endpoint); + } + bool exists; + // get ack level + str_ack_level = args.get("amqp-ack-level", &exists); + if (!exists || str_ack_level == "broker") { + // "broker" is default + ack_level = ack_level_t::Broker; + } else if (str_ack_level == "none") { + ack_level = ack_level_t::None; + } else if (str_ack_level == "routable") { + ack_level = ack_level_t::Routable; + } else { + throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level); + } + } + + RGWCoroutine* send_to_completion_async(const rgw_pubsub_event& event, RGWDataSyncEnv* env) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return new NoAckPublishCR(cct, topic, conn, json_format_pubsub_event(event)); + } else { + // TODO: currently broker and routable are the same - this will require different flags + // but the same mechanism + return new AckPublishCR(cct, topic, conn, json_format_pubsub_event(event), ack_level); + } + } + + RGWCoroutine* send_to_completion_async(const rgw_pubsub_s3_record& record, RGWDataSyncEnv* env) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return new NoAckPublishCR(cct, topic, conn, json_format_pubsub_event(record)); + } else { + // TODO: currently broker and routable are the same - this will require different flags + // but the same mechanism + return new AckPublishCR(cct, topic, conn, json_format_pubsub_event(record), ack_level); + } + } + + // this allows waiting untill "finish()" is called from a different thread + // waiting could be 
blocking the waiting thread or yielding, depending + // with compilation flag support and whether the optional_yield is set + class Waiter { + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + std::unique_ptr completion = nullptr; + int ret; + + mutable std::atomic done = false; + mutable std::mutex lock; + mutable std::condition_variable cond; + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + public: + int wait(optional_yield y) { + if (done) { + return ret; + } +#ifdef HAVE_BOOST_CONTEXT + if (y) { + auto& io_ctx = y.get_io_context(); + auto& yield_ctx = y.get_yield_context(); + boost::system::error_code ec; + async_wait(io_ctx, yield_ctx[ec]); + return -ec.value(); + } +#endif + std::unique_lock l(lock); + cond.wait(l, [this]{return (done==true);}); + return ret; + } + + void finish(int r) { + std::unique_lock l{lock}; + ret = r; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + }; + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_record& record, optional_yield y) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return amqp::publish(conn, topic, json_format_pubsub_event(record)); + } else { + // TODO: currently broker and routable are the same - this will require different flags but the same mechanism + // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine + auto w = std::unique_ptr(new Waiter); + const auto rc = amqp::publish_with_confirm(conn, + topic, + json_format_pubsub_event(record), + std::bind(&Waiter::finish, w.get(), 
std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return rc; + } + return w->wait(y); + } + } + + std::string to_str() const override { + std::string str("AMQP(0.9.1) Endpoint"); + str += "\nURI: " + endpoint; + str += "\nTopic: " + topic; + str += "\nExchange: " + exchange; + str += "\nAck Level: " + str_ack_level; + return str; + } +}; + +static const std::string AMQP_0_9_1("0-9-1"); +static const std::string AMQP_1_0("1-0"); +static const std::string AMQP_SCHEMA("amqp"); +#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT + + +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint { +private: + enum class ack_level_t { + None, + Broker, + }; + CephContext* const cct; + const std::string topic; + kafka::connection_ptr_t conn; + const ack_level_t ack_level; + + static bool get_verify_ssl(const RGWHTTPArgs& args) { + bool exists; + auto str_verify_ssl = args.get("verify-ssl", &exists); + if (!exists) { + // verify server certificate by default + return true; + } + boost::algorithm::to_lower(str_verify_ssl); + if (str_verify_ssl == "true") { + return true; + } + if (str_verify_ssl == "false") { + return false; + } + throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl); + } + + static bool get_use_ssl(const RGWHTTPArgs& args) { + bool exists; + auto str_use_ssl = args.get("use-ssl", &exists); + if (!exists) { + // by default ssl not used + return false; + } + boost::algorithm::to_lower(str_use_ssl); + if (str_use_ssl == "true") { + return true; + } + if (str_use_ssl == "false") { + return false; + } + throw configuration_error("'use-ssl' must be true/false, not: " + str_use_ssl); + } + + static ack_level_t get_ack_level(const RGWHTTPArgs& args) { + bool exists; + // get ack level + const auto str_ack_level = args.get("kafka-ack-level", &exists); + if (!exists || str_ack_level == "broker") { + // "broker" is default + return ack_level_t::Broker; + } + if 
(str_ack_level == "none") { + return ack_level_t::None; + } + throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level); + } + + // NoAckPublishCR implements async kafka publishing via coroutine + // This coroutine ends when it send the message and does not wait for an ack + class NoAckPublishCR : public RGWCoroutine { + private: + const std::string topic; + kafka::connection_ptr_t conn; + const std::string message; + + public: + NoAckPublishCR(CephContext* cct, + const std::string& _topic, + kafka::connection_ptr_t& _conn, + const std::string& _message) : + RGWCoroutine(cct), + topic(_topic), conn(_conn), message(_message) {} + + // send message to endpoint, without waiting for reply + int operate() override { + reenter(this) { + const auto rc = kafka::publish(conn, topic, message); + if (rc < 0) { + return set_cr_error(rc); + } + return set_cr_done(); + } + return 0; + } + }; + + // AckPublishCR implements async kafka publishing via coroutine + // This coroutine ends when an ack is received from the borker + // note that it does not wait for an ack fron the end client + class AckPublishCR : public RGWCoroutine, public RGWIOProvider { + private: + const std::string topic; + kafka::connection_ptr_t conn; + const std::string message; + + public: + AckPublishCR(CephContext* cct, + const std::string& _topic, + kafka::connection_ptr_t& _conn, + const std::string& _message) : + RGWCoroutine(cct), + topic(_topic), conn(_conn), message(_message) {} + + // send message to endpoint, waiting for reply + int operate() override { + reenter(this) { + yield { + init_new_io(this); + const auto rc = kafka::publish_with_confirm(conn, + topic, + message, + std::bind(&AckPublishCR::request_complete, this, std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return set_cr_error(rc); + } + // mark as blocked on the kafka answer + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); + io_block(); + return 0; + } + return 
set_cr_done(); + } + return 0; + } + + // callback invoked from the kafka manager thread when ack/nack is received + void request_complete(int status) { + ceph_assert(!is_done()); + if (status != 0) { + // server replied with a nack + set_cr_error(status); + } + io_complete(); + if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); + } + + // TODO: why are these mandatory in RGWIOProvider? + void set_io_user_info(void *_user_info) override { + } + + void *get_io_user_info() override { + return nullptr; + } + }; + +public: + RGWPubSubKafkaEndpoint(const std::string& _endpoint, + const std::string& _topic, + const RGWHTTPArgs& args, + CephContext* _cct) : + cct(_cct), + topic(_topic), + conn(kafka::connect(_endpoint, get_use_ssl(args), get_verify_ssl(args), args.get_optional("ca-location"))) , + ack_level(get_ack_level(args)) { + if (!conn) { + throw configuration_error("Kafka: failed to create connection to: " + _endpoint); + } + } + + RGWCoroutine* send_to_completion_async(const rgw_pubsub_event& event, RGWDataSyncEnv* env) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return new NoAckPublishCR(cct, topic, conn, json_format_pubsub_event(event)); + } else { + return new AckPublishCR(cct, topic, conn, json_format_pubsub_event(event)); + } + } + + RGWCoroutine* send_to_completion_async(const rgw_pubsub_s3_record& record, RGWDataSyncEnv* env) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return new NoAckPublishCR(cct, topic, conn, json_format_pubsub_event(record)); + } else { + return new AckPublishCR(cct, topic, conn, json_format_pubsub_event(record)); + } + } + + // this allows waiting untill "finish()" is called from a different thread + // waiting could be blocking the waiting thread or yielding, depending + // with compilation flag support and whether the optional_yield is set + class Waiter { + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + 
std::unique_ptr completion = nullptr; + int ret; + + mutable std::atomic done = false; + mutable std::mutex lock; + mutable std::condition_variable cond; + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + public: + int wait(optional_yield y) { + if (done) { + return ret; + } +#ifdef HAVE_BOOST_CONTEXT + if (y) { + auto& io_ctx = y.get_io_context(); + auto& yield_ctx = y.get_yield_context(); + boost::system::error_code ec; + async_wait(io_ctx, yield_ctx[ec]); + return -ec.value(); + } +#endif + std::unique_lock l(lock); + cond.wait(l, [this]{return (done==true);}); + return ret; + } + + void finish(int r) { + std::unique_lock l{lock}; + ret = r; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + }; + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_record& record, optional_yield y) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return kafka::publish(conn, topic, json_format_pubsub_event(record)); + } else { + // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine + auto w = std::unique_ptr(new Waiter); + const auto rc = kafka::publish_with_confirm(conn, + topic, + json_format_pubsub_event(record), + std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return rc; + } + return w->wait(y); + } + } + + std::string to_str() const override { + std::string str("Kafka Endpoint"); + str += kafka::to_string(conn); + str += "\nTopic: " + topic; + return str; + } +}; + +static const std::string KAFKA_SCHEMA("kafka"); 
+#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT + +static const std::string WEBHOOK_SCHEMA("webhook"); +static const std::string UNKNOWN_SCHEMA("unknown"); +static const std::string NO_SCHEMA(""); + +const std::string& get_schema(const std::string& endpoint) { + if (endpoint.empty()) { + return NO_SCHEMA; + } + const auto pos = endpoint.find(':'); + if (pos == std::string::npos) { + return UNKNOWN_SCHEMA; + } + const auto& schema = endpoint.substr(0,pos); + if (schema == "http" || schema == "https") { + return WEBHOOK_SCHEMA; +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + } else if (schema == "amqp") { + return AMQP_SCHEMA; +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + } else if (schema == "kafka") { + return KAFKA_SCHEMA; +#endif + } + return UNKNOWN_SCHEMA; +} + +RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint, + const std::string& topic, + const RGWHTTPArgs& args, + CephContext* cct) { + const auto& schema = get_schema(endpoint); + if (schema == WEBHOOK_SCHEMA) { + return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args)); +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + } else if (schema == AMQP_SCHEMA) { + bool exists; + std::string version = args.get("amqp-version", &exists); + if (!exists) { + version = AMQP_0_9_1; + } + if (version == AMQP_0_9_1) { + return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct)); + } else if (version == AMQP_1_0) { + throw configuration_error("AMQP: v1.0 not supported"); + return nullptr; + } else { + throw configuration_error("AMQP: unknown version: " + version); + return nullptr; + } + } else if (schema == "amqps") { + throw configuration_error("AMQP: ssl not supported"); + return nullptr; +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + } else if (schema == KAFKA_SCHEMA) { + return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct)); +#endif + } + + throw configuration_error("unknown schema in: " + endpoint); + return nullptr; +} + diff --git a/src/rgw/rgw_pubsub_push.h b/src/rgw/rgw_pubsub_push.h new file mode 
100644 index 00000000..8cfdeb5f --- /dev/null +++ b/src/rgw/rgw_pubsub_push.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include +#include +#include +#include "include/buffer_fwd.h" +#include "common/async/yield_context.h" + +// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes +class RGWDataSyncEnv; +class RGWCoroutine; +class RGWHTTPArgs; +class CephContext; +struct rgw_pubsub_event; +struct rgw_pubsub_s3_record; + +// endpoint base class all endpoint - types should derive from it +class RGWPubSubEndpoint { +public: + RGWPubSubEndpoint() = default; + // endpoint should not be copied + RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete; + const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete; + + typedef std::unique_ptr Ptr; + + // factory method for the actual notification endpoint + // derived class specific arguments are passed in http args format + // may throw a configuration_error if creation fails + static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr); + + // this method is used in order to send notification (Ceph specific) and wait for completion + // in async manner via a coroutine when invoked in the data sync environment + virtual RGWCoroutine* send_to_completion_async(const rgw_pubsub_event& event, RGWDataSyncEnv* env) = 0; + + // this method is used in order to send notification (S3 compliant) and wait for completion + // in async manner via a coroutine when invoked in the data sync environment + virtual RGWCoroutine* send_to_completion_async(const rgw_pubsub_s3_record& record, RGWDataSyncEnv* env) = 0; + + // this method is used in order to send notification (S3 compliant) and wait for completion + // in async manner via a coroutine when invoked in the frontend environment + virtual int 
send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_record& record, optional_yield y) = 0; + + // present as string + virtual std::string to_str() const { return ""; } + + virtual ~RGWPubSubEndpoint() = default; + + // exception object for configuration error + struct configuration_error : public std::logic_error { + configuration_error(const std::string& what_arg) : + std::logic_error("pubsub endpoint configuration error: " + what_arg) {} + }; +}; + diff --git a/src/rgw/rgw_putobj.cc b/src/rgw/rgw_putobj.cc new file mode 100644 index 00000000..39410972 --- /dev/null +++ b/src/rgw/rgw_putobj.cc @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "rgw_putobj.h" + +namespace rgw::putobj { + +int ChunkProcessor::process(bufferlist&& data, uint64_t offset) +{ + ceph_assert(offset >= chunk.length()); + uint64_t position = offset - chunk.length(); + + const bool flush = (data.length() == 0); + if (flush) { + if (chunk.length() > 0) { + int r = Pipe::process(std::move(chunk), position); + if (r < 0) { + return r; + } + } + return Pipe::process({}, offset); + } + chunk.claim_append(data); + + // write each full chunk + while (chunk.length() >= chunk_size) { + bufferlist bl; + chunk.splice(0, chunk_size, &bl); + + int r = Pipe::process(std::move(bl), position); + if (r < 0) { + return r; + } + position += chunk_size; + } + return 0; +} + + +int StripeProcessor::process(bufferlist&& data, uint64_t offset) +{ + ceph_assert(offset >= bounds.first); + + const bool flush = (data.length() == 0); + if (flush) { + return Pipe::process({}, offset - bounds.first); + } + + auto max = bounds.second - offset; + while (data.length() > max) { + if (max > 0) { + bufferlist bl; + data.splice(0, max, &bl); + + int r = Pipe::process(std::move(bl), offset - bounds.first); + if (r < 0) { + return r; + } + offset += max; + } + + // flush the current chunk + int r = Pipe::process({}, offset - bounds.first); + if (r < 0) { + return r; + } + // generate the next stripe + uint64_t stripe_size; + r = gen->next(offset, &stripe_size); + if (r < 0) { + return r; + } + ceph_assert(stripe_size > 0); + + bounds.first = offset; + bounds.second = offset + stripe_size; + + max = stripe_size; + } + + if (data.length() == 0) { // don't flush the chunk here + return 0; + } + return Pipe::process(std::move(data), offset - bounds.first); +} + +} // namespace rgw::putobj diff --git a/src/rgw/rgw_putobj.h b/src/rgw/rgw_putobj.h new file mode 100644 index 00000000..367bc5c0 --- /dev/null +++ b/src/rgw/rgw_putobj.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + 
* Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "include/buffer.h" + +namespace rgw::putobj { + +// a simple streaming data processing abstraction +class DataProcessor { + public: + virtual ~DataProcessor() {} + + // consume a bufferlist in its entirety at the given object offset. an + // empty bufferlist is given to request that any buffered data be flushed, + // though this doesn't wait for completions + virtual int process(bufferlist&& data, uint64_t offset) = 0; +}; + +// for composing data processors into a pipeline +class Pipe : public DataProcessor { + DataProcessor *next; + public: + explicit Pipe(DataProcessor *next) : next(next) {} + + // passes the data on to the next processor + int process(bufferlist&& data, uint64_t offset) override { + return next->process(std::move(data), offset); + } +}; + +// pipe that writes to the next processor in discrete chunks +class ChunkProcessor : public Pipe { + uint64_t chunk_size; + bufferlist chunk; // leftover bytes from the last call to process() + public: + ChunkProcessor(DataProcessor *next, uint64_t chunk_size) + : Pipe(next), chunk_size(chunk_size) + {} + + int process(bufferlist&& data, uint64_t offset) override; +}; + + +// interface to generate the next stripe description +class StripeGenerator { + public: + virtual ~StripeGenerator() {} + + virtual int next(uint64_t offset, uint64_t *stripe_size) = 0; +}; + +// pipe that respects stripe boundaries and restarts each stripe at offset 0 +class StripeProcessor : public Pipe { + StripeGenerator *gen; + std::pair bounds; // bounds of current stripe + public: + StripeProcessor(DataProcessor *next, StripeGenerator *gen, + uint64_t first_stripe_size) + : Pipe(next), gen(gen), 
bounds(0, first_stripe_size) + {} + + int process(bufferlist&& data, uint64_t data_offset) override; +}; + +} // namespace rgw::putobj diff --git a/src/rgw/rgw_putobj_processor.cc b/src/rgw/rgw_putobj_processor.cc new file mode 100644 index 00000000..3de30a82 --- /dev/null +++ b/src/rgw/rgw_putobj_processor.cc @@ -0,0 +1,670 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_aio.h" +#include "rgw_putobj_processor.h" +#include "rgw_multi.h" +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw::putobj { + +int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset) +{ + const bool flush = (data.length() == 0); + + // capture the first chunk for special handling + if (data_offset < head_chunk_size || data_offset == 0) { + if (flush) { + // flush partial chunk + return process_first_chunk(std::move(head_data), &processor); + } + + auto remaining = head_chunk_size - data_offset; + auto count = std::min(data.length(), remaining); + data.splice(0, count, &head_data); + data_offset += count; + + if (data_offset == head_chunk_size) { + // process the first complete chunk + ceph_assert(head_data.length() == head_chunk_size); + int r = process_first_chunk(std::move(head_data), &processor); + if (r < 0) { + return r; + } + } + if (data.length() == 0) { // avoid flushing stripe processor + return 0; + } + } + ceph_assert(processor); // process_first_chunk() must initialize + + // send everything else through the processor + auto write_offset = data_offset; + data_offset += data.length(); + return processor->process(std::move(data), 
write_offset); +} + + +static int process_completed(const AioResultList& completed, RawObjSet *written) +{ + std::optional error; + for (auto& r : completed) { + if (r.result >= 0) { + written->insert(r.obj.get_ref().obj); + } else if (!error) { // record first error code + error = r.result; + } + } + return error.value_or(0); +} + +int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj) +{ + stripe_obj = store->svc.rados->obj(raw_obj); + return stripe_obj.open(); +} + +int RadosWriter::process(bufferlist&& bl, uint64_t offset) +{ + bufferlist data = std::move(bl); + const uint64_t cost = data.length(); + if (cost == 0) { // no empty writes, use aio directly for creates + return 0; + } + librados::ObjectWriteOperation op; + if (offset == 0) { + op.write_full(data); + } else { + op.write(offset, data); + } + constexpr uint64_t id = 0; // unused + auto c = aio->submit(stripe_obj, &op, cost, id); + return process_completed(c, &written); +} + +int RadosWriter::write_exclusive(const bufferlist& data) +{ + const uint64_t cost = data.length(); + + librados::ObjectWriteOperation op; + op.create(true); // exclusive create + op.write_full(data); + + constexpr uint64_t id = 0; // unused + auto c = aio->submit(stripe_obj, &op, cost, id); + auto d = aio->drain(); + c.splice(c.end(), d); + return process_completed(c, &written); +} + +int RadosWriter::drain() +{ + return process_completed(aio->drain(), &written); +} + +RadosWriter::~RadosWriter() +{ + // wait on any outstanding aio completions + process_completed(aio->drain(), &written); + + bool need_to_remove_head = false; + std::optional raw_head; + if (!head_obj.empty()) { + raw_head.emplace(); + store->obj_to_raw(bucket_info.placement_rule, head_obj, &*raw_head); + } + + /** + * We should delete the object in the "multipart" namespace to avoid race condition. 
+ * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart + * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects + * written by the second upload may be deleted by the first upload. + * details is describled on #11749 + * + * The above comment still stands, but instead of searching for a specific object in the multipart + * namespace, we just make sure that we remove the object that is marked as the head object after + * we remove all the other raw objects. Note that we use different call to remove the head object, + * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme. + */ + for (const auto& obj : written) { + if (raw_head && obj == *raw_head) { + ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl; + need_to_remove_head = true; + continue; + } + + int r = store->delete_raw_obj(obj); + if (r < 0 && r != -ENOENT) { + ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl; + } + } + + if (need_to_remove_head) { + ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl; + int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0); + if (r < 0 && r != -ENOENT) { + ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl; + } + } +} + + +// advance to the next stripe +int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size) +{ + // advance the manifest + int r = manifest_gen.create_next(offset); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + uint64_t chunk_size = 0; + r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size); + if (r < 0) { + return r; + } + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + + chunk = ChunkProcessor(&writer, chunk_size); + *pstripe_size = 
manifest_gen.cur_stripe_max_size(); + return 0; +} + + +int AtomicObjectProcessor::process_first_chunk(bufferlist&& data, + DataProcessor **processor) +{ + first_chunk = std::move(data); + *processor = &stripe; + return 0; +} + +int AtomicObjectProcessor::prepare() +{ + uint64_t max_head_chunk_size; + uint64_t head_max_size; + uint64_t chunk_size = 0; + uint64_t alignment; + rgw_pool head_pool; + + if (!store->get_obj_data_pool(bucket_info.placement_rule, head_obj, &head_pool)) { + return -EIO; + } + + int r = store->get_max_chunk_size(head_pool, &max_head_chunk_size, &alignment); + if (r < 0) { + return r; + } + + bool same_pool = true; + + if (bucket_info.placement_rule != tail_placement_rule) { + rgw_pool tail_pool; + if (!store->get_obj_data_pool(tail_placement_rule, head_obj, &tail_pool)) { + return -EIO; + } + + if (tail_pool != head_pool) { + same_pool = false; + + r = store->get_max_chunk_size(tail_pool, &chunk_size); + if (r < 0) { + return r; + } + + head_max_size = 0; + } + } + + if (same_pool) { + head_max_size = max_head_chunk_size; + chunk_size = max_head_chunk_size; + } + + uint64_t stripe_size; + const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size; + + store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size); + + manifest.set_trivial_rule(head_max_size, stripe_size); + + r = manifest_gen.create_begin(store->ctx(), &manifest, + bucket_info.placement_rule, + &tail_placement_rule, + head_obj.bucket, head_obj); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + + set_head_chunk_size(head_max_size); + // initialize the processors + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, head_max_size); + return 0; +} + +int AtomicObjectProcessor::complete(size_t accounted_size, + const std::string& etag, + ceph::real_time *mtime, + ceph::real_time set_mtime, + 
std::map& attrs, + ceph::real_time delete_at, + const char *if_match, + const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, + bool *pcanceled) +{ + int r = writer.drain(); + if (r < 0) { + return r; + } + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + + obj_ctx.set_atomic(head_obj); + + RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj); + + /* some object types shouldn't be versioned, e.g., multipart parts */ + op_target.set_versioning_disabled(!bucket_info.versioning_enabled()); + + RGWRados::Object::Write obj_op(&op_target); + + obj_op.meta.data = &first_chunk; + obj_op.meta.manifest = &manifest; + obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ + obj_op.meta.if_match = if_match; + obj_op.meta.if_nomatch = if_nomatch; + obj_op.meta.mtime = mtime; + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.owner = owner; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.olh_epoch = olh_epoch; + obj_op.meta.delete_at = delete_at; + obj_op.meta.user_data = user_data; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + + r = obj_op.write_meta(actual_size, accounted_size, attrs); + if (r < 0) { + return r; + } + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + return 0; +} + + +int MultipartObjectProcessor::process_first_chunk(bufferlist&& data, + DataProcessor **processor) +{ + // write the first chunk of the head object as part of an exclusive create, + // then drain to wait for the result in case of EEXIST + int r = writer.write_exclusive(data); + if (r == -EEXIST) { + // randomize the oid prefix and reprepare the head/manifest + std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32); + + mp.init(target_obj.key.name, upload_id, oid_rand); + 
manifest.set_prefix(target_obj.key.name + "." + oid_rand); + + r = prepare_head(); + if (r < 0) { + return r; + } + // resubmit the write op on the new head object + r = writer.write_exclusive(data); + } + if (r < 0) { + return r; + } + *processor = &stripe; + return 0; +} + +int MultipartObjectProcessor::prepare_head() +{ + const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size; + uint64_t chunk_size; + uint64_t stripe_size; + uint64_t alignment; + + int r = store->get_max_chunk_size(tail_placement_rule, target_obj, &chunk_size, &alignment); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl; + return r; + } + store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size); + + manifest.set_multipart_part_rule(stripe_size, part_num); + + r = manifest_gen.create_begin(store->ctx(), &manifest, + bucket_info.placement_rule, + &tail_placement_rule, + target_obj.bucket, target_obj); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + rgw_raw_obj_to_obj(head_obj.bucket, stripe_obj, &head_obj); + head_obj.index_hash_source = target_obj.key.name; + + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + stripe_size = manifest_gen.cur_stripe_max_size(); + set_head_chunk_size(stripe_size); + + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, stripe_size); + return 0; +} + +int MultipartObjectProcessor::prepare() +{ + manifest.set_prefix(target_obj.key.name + "." 
+ upload_id); + + return prepare_head(); +} + +int MultipartObjectProcessor::complete(size_t accounted_size, + const std::string& etag, + ceph::real_time *mtime, + ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, + const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, + bool *pcanceled) +{ + int r = writer.drain(); + if (r < 0) { + return r; + } + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + + RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj); + op_target.set_versioning_disabled(true); + RGWRados::Object::Write obj_op(&op_target); + + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.mtime = mtime; + obj_op.meta.owner = owner; + obj_op.meta.delete_at = delete_at; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + + r = obj_op.write_meta(actual_size, accounted_size, attrs); + if (r < 0) + return r; + + bufferlist bl; + RGWUploadPartInfo info; + string p = "part."; + bool sorted_omap = is_v2_upload_id(upload_id); + + if (sorted_omap) { + char buf[32]; + snprintf(buf, sizeof(buf), "%08d", part_num); + p.append(buf); + } else { + p.append(part_num_str); + } + info.num = part_num; + info.etag = etag; + info.size = actual_size; + info.accounted_size = accounted_size; + info.modified = real_clock::now(); + info.manifest = manifest; + + bool compressed; + r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); + if (r < 0) { + ldout(store->ctx(), 1) << "cannot get compression info" << dendl; + return r; + } + + encode(info, bl); + + rgw_obj meta_obj; + meta_obj.init_ns(bucket_info.bucket, mp.get_meta(), RGW_OBJ_NS_MULTIPART); + meta_obj.set_in_extra_data(true); + + rgw_raw_obj raw_meta_obj; + + store->obj_to_raw(bucket_info.placement_rule, meta_obj, &raw_meta_obj); + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = 
obj_ctx.get_obj(raw_meta_obj); + + r = sysobj.omap() + .set_must_exist(true) + .set(p, bl); + if (r < 0) { + return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r; + } + + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + return 0; +} + +int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::putobj::DataProcessor **processor) +{ + int r = writer.write_exclusive(data); + if (r < 0) { + return r; + } + *processor = &stripe; + return 0; +} + +int AppendObjectProcessor::prepare() +{ + RGWObjState *astate; + int r = store->get_obj_state(&obj_ctx, bucket_info, head_obj, &astate); + if (r < 0) { + return r; + } + cur_size = astate->size; + *cur_accounted_size = astate->accounted_size; + if (!astate->exists) { + if (position != 0) { + ldout(store->ctx(), 5) << "ERROR: Append position should be zero" << dendl; + return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; + } else { + cur_part_num = 1; + //set the prefix + char buf[33]; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + string oid_prefix = head_obj.key.name; + oid_prefix.append("."); + oid_prefix.append(buf); + oid_prefix.append("_"); + manifest.set_prefix(oid_prefix); + } + } else { + // check whether the object appendable + map::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); + if (iter == astate->attrset.end()) { + ldout(store->ctx(), 5) << "ERROR: The object is not appendable" << dendl; + return -ERR_OBJECT_NOT_APPENDABLE; + } + if (position != *cur_accounted_size) { + ldout(store->ctx(), 5) << "ERROR: Append position should be equal to the obj size" << dendl; + return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; + } + try { + decode(cur_part_num, iter->second); + } catch (buffer::error& err) { + ldout(store->ctx(), 5) << "ERROR: failed to decode part num" << dendl; + return -EIO; + } + cur_part_num++; + //get the current obj etag + iter = 
astate->attrset.find(RGW_ATTR_ETAG); + if (iter != astate->attrset.end()) { + string s = rgw_string_unquote(iter->second.c_str()); + size_t pos = s.find("-"); + cur_etag = s.substr(0, pos); + } + + iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); + if (iter != astate->attrset.end()) { + tail_placement_rule.storage_class = iter->second.to_str(); + } + cur_manifest = &astate->manifest; + manifest.set_prefix(cur_manifest->get_prefix()); + astate->keep_tail = true; + } + manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num); + + r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, &tail_placement_rule, head_obj.bucket, head_obj); + if (r < 0) { + return r; + } + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + uint64_t chunk_size = 0; + r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size); + if (r < 0) { + return r; + } + r = writer.set_stripe_obj(std::move(stripe_obj)); + if (r < 0) { + return r; + } + + uint64_t stripe_size = manifest_gen.cur_stripe_max_size(); + + uint64_t max_head_size = std::min(chunk_size, stripe_size); + set_head_chunk_size(max_head_size); + + // initialize the processors + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, stripe_size); + + return 0; +} + +int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime, + ceph::real_time set_mtime, map &attrs, + ceph::real_time delete_at, const char *if_match, const char *if_nomatch, + const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled) +{ + int r = writer.drain(); + if (r < 0) + return r; + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + obj_ctx.set_atomic(head_obj); + RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj); + //For Append obj, disable versioning + op_target.set_versioning_disabled(true); + RGWRados::Object::Write 
obj_op(&op_target); + if (cur_manifest) { + cur_manifest->append(manifest, store->svc.zone); + obj_op.meta.manifest = cur_manifest; + } else { + obj_op.meta.manifest = &manifest; + } + obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ + obj_op.meta.mtime = mtime; + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.owner = owner; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.delete_at = delete_at; + obj_op.meta.user_data = user_data; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + obj_op.meta.appendable = true; + //Add the append part number + bufferlist cur_part_num_bl; + encode(cur_part_num, cur_part_num_bl); + attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl; + //calculate the etag + if (!cur_etag.empty()) { + MD5 hash; + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + hash.Final((unsigned char *)final_etag); + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)cur_part_num); + bufferlist etag_bl; + etag_bl.append(final_etag_str, strlen(final_etag_str) + 1); + attrs[RGW_ATTR_ETAG] = etag_bl; + } + r = obj_op.write_meta(actual_size + cur_size, accounted_size + *cur_accounted_size, attrs); + if (r < 0) { + return r; + } + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + *cur_accounted_size += accounted_size; + + return 0; +} + +} // namespace rgw::putobj diff --git 
a/src/rgw/rgw_putobj_processor.h b/src/rgw/rgw_putobj_processor.h
new file mode 100644
index 00000000..8d265f17
--- /dev/null
+++ b/src/rgw/rgw_putobj_processor.h
@@ -0,0 +1,263 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include  // NOTE(review): the header name is missing in this listing (angle-bracket text looks stripped; std::optional is used further down) - restore from upstream
+
+#include "rgw_putobj.h"
+#include "rgw_rados.h"
+#include "services/svc_rados.h"
+
+namespace rgw {
+
+class Aio;
+
+namespace putobj {
+
+// a data consumer that writes an object in a bucket
+class ObjectProcessor : public DataProcessor {
+ public:
+  // prepare to start processing object data
+  virtual int prepare() = 0;
+
+  // complete the operation and make its result visible to clients
+  virtual int complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map& attrs,  // non-const: AppendObjectProcessor::complete() adds entries to it. NOTE(review): template arguments missing in this listing
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled) = 0;  // canceled: optional out-flag (AppendObjectProcessor::complete() null-checks it before writing)
+};
+
+// an object processor with special handling for the first chunk of the head.
+// the virtual process_first_chunk() function returns a processor to handle the
+// rest of the object
+class HeadObjectProcessor : public ObjectProcessor {
+  uint64_t head_chunk_size;
+  // buffer to capture the first chunk of the head object
+  bufferlist head_data;
+  // initialized after process_first_chunk() to process everything else
+  DataProcessor *processor = nullptr;
+  uint64_t data_offset = 0; // maximum offset of data written (ie compressed)
+ protected:
+  uint64_t get_actual_size() const { return data_offset; }
+
+  // process the first chunk of data and return a processor for the rest
+  virtual int process_first_chunk(bufferlist&& data,
+                                  DataProcessor **processor) = 0;
+ public:
+  explicit HeadObjectProcessor(uint64_t head_chunk_size)  // explicit: no implicit integer conversion
+    : head_chunk_size(head_chunk_size)
+  {}
+
+  void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
+
+  // cache first chunk for process_first_chunk(), then forward everything else
+  // to the returned processor
+  int process(bufferlist&& data, uint64_t logical_offset) final override;
+};
+
+// set of raw objects, used by RadosWriter to remember what it has written. NOTE(review): the element type of std::set is missing in this listing
+using RawObjSet = std::set;
+
+// a data sink that writes to rados objects and deletes them on cancelation
+class RadosWriter : public DataProcessor {
+  Aio *const aio;
+  RGWRados *const store;
+  const RGWBucketInfo& bucket_info;
+  RGWObjectCtx& obj_ctx;
+  const rgw_obj head_obj;
+  RGWSI_RADOS::Obj stripe_obj; // current stripe object
+  RawObjSet written; // set of written objects for deletion
+
+ public:
+  RadosWriter(Aio *aio, RGWRados *store, const RGWBucketInfo& bucket_info,
+              RGWObjectCtx& obj_ctx, const rgw_obj& head_obj)
+    : aio(aio), store(store), bucket_info(bucket_info),
+      obj_ctx(obj_ctx), head_obj(head_obj)
+  {}
+  ~RadosWriter();
+
+  // change the current stripe object
+  int set_stripe_obj(const rgw_raw_obj& obj);
+
+  // write the data at the given offset of the current stripe object
+  int process(bufferlist&& data, uint64_t stripe_offset) override;
+
+  // write the data as an exclusive create and wait for it to complete
+  int write_exclusive(const bufferlist& data);
+
+  int drain();
+
+  // when the operation completes successfully, clear the set of written objects
+  // so they aren't deleted on destruction
+  void clear_written() { written.clear(); }
+};
+
+// a rados object processor that stripes according to RGWObjManifest
+class ManifestObjectProcessor : public HeadObjectProcessor,
+                                public StripeGenerator {
+ protected:
+  RGWRados *const store;
+  const RGWBucketInfo& bucket_info;
+  rgw_placement_rule tail_placement_rule;
+  const rgw_user& owner;
+  RGWObjectCtx& obj_ctx;
+  rgw_obj head_obj;
+
+  RadosWriter writer;
+  RGWObjManifest manifest;
+  RGWObjManifest::generator manifest_gen;
+  ChunkProcessor chunk;
+  StripeProcessor stripe;
+
+  // implements StripeGenerator
+  int next(uint64_t offset, uint64_t *stripe_size) override;
+
+ public:
+  ManifestObjectProcessor(Aio *aio, RGWRados *store,
+                          const RGWBucketInfo& bucket_info,
+                          const rgw_placement_rule *ptail_placement_rule,
+                          const rgw_user& owner, RGWObjectCtx& obj_ctx,
+                          const rgw_obj& head_obj)
+    : HeadObjectProcessor(0),
+      store(store), bucket_info(bucket_info),
+      owner(owner),
+      obj_ctx(obj_ctx), head_obj(head_obj),
+      writer(aio, store, bucket_info, obj_ctx, head_obj),
+      chunk(&writer, 0), stripe(&chunk, this, 0) {
+    if (ptail_placement_rule) {  // a null pointer leaves tail_placement_rule default-constructed
+      tail_placement_rule = *ptail_placement_rule;
+    }
+  }
+  // replace the tail placement rule; taken by const& so both lvalues and temporaries are accepted (the argument is copied either way)
+  void set_tail_placement(const rgw_placement_rule& tpr) {
+    tail_placement_rule = tpr;
+  }
+};
+
+
+// a processor that completes with an atomic write to the head object as part of
+// a bucket index transaction
+class AtomicObjectProcessor : public ManifestObjectProcessor {
+  const std::optional olh_epoch;  // NOTE(review): the value type of std::optional is missing in this listing
+  const std::string unique_tag;
+  bufferlist first_chunk; // written with the head in complete()
+
+  int process_first_chunk(bufferlist&& data, DataProcessor **processor) override;
+ public:
+  AtomicObjectProcessor(Aio *aio, RGWRados *store,
+                        const RGWBucketInfo& bucket_info,
+                        const rgw_placement_rule *ptail_placement_rule,
+                        const rgw_user& owner,
+                        RGWObjectCtx& obj_ctx, const rgw_obj& head_obj,
+                        std::optional olh_epoch,
+                        const std::string& unique_tag)
+    : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+                              owner, obj_ctx, head_obj),
+      olh_epoch(olh_epoch), unique_tag(unique_tag)
+  {}
+
+  // prepare a trivial manifest
+  int prepare() override;
+  // write the head object atomically in a bucket index transaction
+  int complete(size_t accounted_size, const std::string& etag,
+               ceph::real_time *mtime, ceph::real_time set_mtime,
+               std::map& attrs,
+               ceph::real_time delete_at,
+               const char *if_match, const char *if_nomatch,
+               const std::string *user_data,
+               rgw_zone_set *zones_trace, bool *canceled) override;
+
+};
+
+
+// a processor for multipart parts, which don't require atomic completion. the
+// part's head is written with an exclusive create to detect racing uploads of
+// the same part/upload id, which are restarted with a random oid prefix
+class MultipartObjectProcessor : public ManifestObjectProcessor {
+  const rgw_obj target_obj; // target multipart object
+  const std::string upload_id;
+  const int part_num;  // NOTE(review): initialized from a uint64_t constructor argument, i.e. an implicit narrowing conversion
+  const std::string part_num_str;
+  RGWMPObj mp;
+
+  // write the first chunk and wait on aio->drain() for its completion.
+  // on EEXIST, retry with random prefix
+  int process_first_chunk(bufferlist&& data, DataProcessor **processor) override;
+  // prepare the head stripe and manifest
+  int prepare_head();
+ public:
+  MultipartObjectProcessor(Aio *aio, RGWRados *store,
+                           const RGWBucketInfo& bucket_info,
+                           const rgw_placement_rule *ptail_placement_rule,
+                           const rgw_user& owner, RGWObjectCtx& obj_ctx,
+                           const rgw_obj& head_obj,
+                           const std::string& upload_id, uint64_t part_num,
+                           const std::string& part_num_str)
+    : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+                              owner, obj_ctx, head_obj),
+      target_obj(head_obj), upload_id(upload_id),
+      part_num(part_num), part_num_str(part_num_str),
+      mp(head_obj.key.name, upload_id)
+  {}
+
+  // prepare a multipart manifest
+  int prepare() override;
+  // write the head object attributes in a bucket index transaction, then
+  // register the completed part with the multipart meta object
+  int complete(size_t accounted_size, const std::string& etag,
+               ceph::real_time *mtime, ceph::real_time set_mtime,
+               std::map& attrs,
+               ceph::real_time delete_at,
+               const char *if_match, const char *if_nomatch,
+               const std::string *user_data,
+               rgw_zone_set *zones_trace, bool *canceled) override;
+};
+// a processor for append writes: complete() marks the object appendable and, when a previous manifest was found, appends the new manifest to it
+class AppendObjectProcessor : public ManifestObjectProcessor {
+  uint64_t cur_part_num = 0;  // not set by the constructor; defaulted so it never holds an indeterminate value
+  uint64_t position;
+  uint64_t cur_size;
+  uint64_t *cur_accounted_size;  // caller-owned running total; dereferenced and updated in complete()
+  std::string cur_etag;
+  const std::string unique_tag;
+
+  RGWObjManifest *cur_manifest;
+
+  int process_first_chunk(bufferlist&& data, DataProcessor **processor) override;
+
+ public:
+  AppendObjectProcessor(Aio *aio, RGWRados *store, const RGWBucketInfo& bucket_info,
+                        const rgw_placement_rule *ptail_placement_rule,
+                        const rgw_user& owner, RGWObjectCtx& obj_ctx,const rgw_obj& head_obj,
+                        const std::string& unique_tag, uint64_t position, uint64_t *cur_accounted_size)
+    : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule, owner, obj_ctx, head_obj),
+      position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
+      unique_tag(unique_tag), cur_manifest(nullptr)
+  {}
+  int prepare() override;
+  int complete(size_t accounted_size, const std::string& etag,
+               ceph::real_time *mtime, ceph::real_time set_mtime,
+               std::map& attrs, ceph::real_time delete_at,
+               const char *if_match, const char *if_nomatch, const std::string *user_data,
+               rgw_zone_set *zones_trace, bool *canceled) override;
+};
+
+} // namespace putobj
+} // namespace rgw
+
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
new file mode 100644
index 00000000..052bc7a5
--- /dev/null
+++ b/src/rgw/rgw_quota.cc
@@ -0,0 +1,1034 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "include/utime.h"
+#include "common/lru_map.h"
+#include "common/RefCountedObj.h"
+#include "common/Thread.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+
+#include "rgw_common.h"
+#include "rgw_rados.h"
+#include "rgw_quota.h"
+#include "rgw_bucket.h"
+#include "rgw_user.h"
+
+#include "services/svc_sys_obj.h"
+
+#include  /* NOTE(review): the header name is missing in this listing, and template argument lists look stripped throughout - restore from upstream before building */
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+/* one cache record: the stats, the time they expire, and the time from which an async refresh should be started */
+struct RGWQuotaCacheStats {
+  RGWStorageStats stats;
+  utime_t expiration;
+  utime_t async_refresh_time;  /* set to zero while a refresh is in flight, see StatsAsyncTestSet */
+};
+/* LRU cache of storage stats with a TTL and background refresh; subclasses pick the cache key through the map_*() hooks */
+template
+class RGWQuotaCache {
+protected:
+  RGWRados *store;
+  lru_map stats_map;
+  RefCountedWaitObject *async_refcount;  /* one reference per in-flight async refresh; the destructor waits on it */
+  /* update context that claims an entry for refresh: fails if async_refresh_time is already zero, otherwise zeroes it */
+  class StatsAsyncTestSet : public lru_map::UpdateContext {
+    int objs_delta;  /* these three members are initialized but not read anywhere in this class */
+    uint64_t added_bytes;
+    uint64_t removed_bytes;
+  public:
+    StatsAsyncTestSet() : objs_delta(0), added_bytes(0), removed_bytes(0) {}
+    bool update(RGWQuotaCacheStats *entry) override {
+      if (entry->async_refresh_time.sec() == 0)
+        return false;
+
+      entry->async_refresh_time = utime_t(0, 0);
+
+      return true;
+    }
+  };
+
+  virtual int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats) = 0;
+
+  virtual bool map_find(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
+
+  virtual bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, typename lru_map::UpdateContext *ctx) = 0;
+  virtual void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
+  /* hook called at the end of adjust_stats(); no-op by default */
+  virtual void data_modified(const rgw_user& user, rgw_bucket& bucket) {}
+public:
+  RGWQuotaCache(RGWRados *_store, int size) : store(_store), stats_map(size) {
+    async_refcount = new RefCountedWaitObject;
+  }
+  virtual ~RGWQuotaCache() {
+    async_refcount->put_wait(); /* wait for all pending async requests to complete */
+  }
+
+  int get_stats(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, RGWQuotaInfo& quota);
+  void adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes);
+
+  virtual bool can_use_cached_stats(RGWQuotaInfo& quota, RGWStorageStats& stats);
+
+  void set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats);
+  int async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs);
+  void async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats);
+  void async_refresh_fail(const rgw_user& user, rgw_bucket& bucket);
+  /* base for the objects that run one async refresh; handlers report back through async_refresh_response()/async_refresh_fail() */
+  class AsyncRefreshHandler {
+  protected:
+    RGWRados *store;
+    RGWQuotaCache *cache;
+  public:
+    AsyncRefreshHandler(RGWRados *_store, RGWQuotaCache *_cache) : store(_store), cache(_cache) {}
+    virtual ~AsyncRefreshHandler() {}
+
+    virtual int init_fetch() = 0;
+    virtual void drop_reference() = 0;
+  };
+
+  virtual AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) = 0;
+};
+/* cached stats are usable only while they stay below the soft thresholds; a negative threshold is derived (and stored back into `quota`) from max * rgw_bucket_quota_soft_threshold */
+template
+bool RGWQuotaCache::can_use_cached_stats(RGWQuotaInfo& quota, RGWStorageStats& cached_stats)
+{
+  if (quota.max_size >= 0) {
+    if (quota.max_size_soft_threshold < 0) {
+      quota.max_size_soft_threshold = quota.max_size * store->ctx()->_conf->rgw_bucket_quota_soft_threshold;
+    }
+
+    if (cached_stats.size_rounded >= (uint64_t)quota.max_size_soft_threshold) {
+      ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (size): "
+        << cached_stats.size_rounded << " >= " << quota.max_size_soft_threshold << dendl;
+      return false;
+    }
+  }
+
+  if (quota.max_objects >= 0) {
+    if (quota.max_objs_soft_threshold < 0) {
+      quota.max_objs_soft_threshold = quota.max_objects * store->ctx()->_conf->rgw_bucket_quota_soft_threshold;
+    }
+
+    if (cached_stats.num_objects >= (uint64_t)quota.max_objs_soft_threshold) {
+      ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (num objs): "
+        << cached_stats.num_objects << " >= " << quota.max_objs_soft_threshold << dendl;
+      return false;
+    }
+  }
+
+  return true;
+}
+/* start a background refresh for the entry unless one is already in flight; returns 0 or the negative init_fetch() result (`qs` is not used) */
+template
+int RGWQuotaCache::async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs)
+{
+  /* protect against multiple updates */
+  StatsAsyncTestSet test_update;
+  if (!map_find_and_update(user, bucket, &test_update)) {
+    /* most likely we just raced with another update */
+    return 0;
+  }
+
+  async_refcount->get();
+
+
+  AsyncRefreshHandler *handler = allocate_refresh_handler(user, bucket);
+
+  int ret = handler->init_fetch();
+  if (ret < 0) {
+    async_refcount->put();
+    handler->drop_reference();  /* NOTE(review): BucketAsyncRefreshHandler::init_fetch() states the reference was already dropped when get_bucket_stats_async() fails - verify this is not a second put */
+    return ret;
+  }
+
+  return 0;
+}
+/* failure path of an async refresh: nothing is cached, only the in-flight reference is released */
+template
+void RGWQuotaCache::async_refresh_fail(const rgw_user& user, rgw_bucket& bucket)
+{
+  ldout(store->ctx(), 20) << "async stats refresh failed for bucket=" << bucket << dendl;
+
+  async_refcount->put();
+}
+/* success path of an async refresh: store the fresh stats and release the in-flight reference */
+template
+void RGWQuotaCache::async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats)
+{
+  ldout(store->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl;
+
+  RGWQuotaCacheStats qs;
+
+  map_find(user, bucket, qs);  /* result ignored: qs is overwritten by set_stats() either way */
+
+  set_stats(user, bucket, qs, stats);
+
+  async_refcount->put();
+}
+/* fill `qs` from `stats`, stamp expiry at now + rgw_bucket_quota_ttl and refresh time at now + ttl / 2, then add it to the map */
+template
+void RGWQuotaCache::set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats)
+{
+  qs.stats = stats;
+  qs.expiration = ceph_clock_now();
+  qs.async_refresh_time = qs.expiration;
+  qs.expiration += store->ctx()->_conf->rgw_bucket_quota_ttl;
+  qs.async_refresh_time += store->ctx()->_conf->rgw_bucket_quota_ttl / 2;
+
+  map_add(user, bucket, qs);
+}
+/* return cached stats when present, usable and unexpired (possibly kicking off an async refresh); otherwise read them from storage and cache the result */
+template
+int RGWQuotaCache::get_stats(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, RGWQuotaInfo& quota) {
+  RGWQuotaCacheStats qs;
+  utime_t now = ceph_clock_now();
+  if (map_find(user, bucket, qs)) {
+    if (qs.async_refresh_time.sec() > 0 && now >= qs.async_refresh_time) {
+      int r = async_refresh(user, bucket, qs);
+      if (r < 0) {
+        ldout(store->ctx(), 0) << "ERROR: quota async refresh returned ret=" << r << dendl;
+
+        /* continue processing, might be a transient error, async refresh is just optimization */
+      }
+    }
+
+    if (can_use_cached_stats(quota, qs.stats) && qs.expiration >
+        ceph_clock_now()) {
+      stats = qs.stats;
+      return 0;
+    }
+  }
+
+  int ret = fetch_stats_from_storage(user, bucket, stats);
+  if (ret < 0 && ret != -ENOENT)  /* -ENOENT is not an error here: whatever `stats` holds is cached below and 0 is returned */
+    return ret;
+
+  set_stats(user, bucket, qs, stats);
+
+  return 0;
+}
+
+/* update context that applies object-count and byte deltas to a cached entry, clamping each counter at zero */
+template
+class RGWQuotaStatsUpdate : public lru_map::UpdateContext {
+  const int objs_delta;
+  const uint64_t added_bytes;
+  const uint64_t removed_bytes;
+public:
+  RGWQuotaStatsUpdate(const int objs_delta,
+                      const uint64_t added_bytes,
+                      const uint64_t removed_bytes)
+    : objs_delta(objs_delta),
+      added_bytes(added_bytes),
+      removed_bytes(removed_bytes) {
+  }
+
+  bool update(RGWQuotaCacheStats * const entry) override {
+    const uint64_t rounded_added = rgw_rounded_objsize(added_bytes);
+    const uint64_t rounded_removed = rgw_rounded_objsize(removed_bytes);
+    /* the signed cast detects a result that would go below zero */
+    if (((int64_t)(entry->stats.size + added_bytes - removed_bytes)) >= 0) {
+      entry->stats.size += added_bytes - removed_bytes;
+    } else {
+      entry->stats.size = 0;
+    }
+
+    if (((int64_t)(entry->stats.size_rounded + rounded_added - rounded_removed)) >= 0) {
+      entry->stats.size_rounded += rounded_added - rounded_removed;
+    } else {
+      entry->stats.size_rounded = 0;
+    }
+
+    if (((int64_t)(entry->stats.num_objects + objs_delta)) >= 0) {
+      entry->stats.num_objects += objs_delta;
+    } else {
+      entry->stats.num_objects = 0;
+    }
+
+    return true;
+  }
+};
+
+/* apply deltas to the cached entry (the lookup result is not checked), then notify the subclass via data_modified() */
+template
+void RGWQuotaCache::adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta,
+                                 uint64_t added_bytes, uint64_t removed_bytes)
+{
+  RGWQuotaStatsUpdate update(objs_delta, added_bytes, removed_bytes);
+  map_find_and_update(user, bucket, &update);
+
+  data_modified(user, bucket);
+}
+/* async refresh handler for the bucket stats cache; doubles as the completion callback passed to get_bucket_stats_async() */
+class BucketAsyncRefreshHandler : public RGWQuotaCache::AsyncRefreshHandler,
+                                  public RGWGetBucketStats_CB {
+  rgw_user user;
+public:
+  BucketAsyncRefreshHandler(RGWRados *_store, RGWQuotaCache *_cache,
+                            const rgw_user& _user, const rgw_bucket& _bucket) :
+                                      RGWQuotaCache::AsyncRefreshHandler(_store, _cache),
+                                      RGWGetBucketStats_CB(_bucket), user(_user) {}
+
+  void drop_reference() override { put(); }
+  void handle_response(int r) override;
+  int init_fetch() override;
+};
+/* load the bucket instance info, then issue the async stats request with this object as callback; returns 0 or a negative error */
+int BucketAsyncRefreshHandler::init_fetch()
+{
+  RGWBucketInfo bucket_info;
+
+  auto obj_ctx = store->svc.sysobj->init_obj_ctx();
+
+  int r = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket << " r=" << r << dendl;
+    return r;
+  }
+
+  ldout(store->ctx(), 20) << "initiating async quota refresh for bucket=" << bucket << dendl;
+
+  r = store->get_bucket_stats_async(bucket_info, RGW_NO_SHARD, this);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket stats for bucket=" << bucket.name << " r=" << r << dendl;
+
+    /* get_bucket_stats_async() dropped our reference already */
+    return r;
+  }
+
+  return 0;
+}
+/* completion callback: report failure to the cache, or sum size, size_rounded and num_objects over all entries of *stats and hand the totals back */
+void BucketAsyncRefreshHandler::handle_response(const int r)
+{
+  if (r < 0) {
+    ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
+    cache->async_refresh_fail(user, bucket);
+    return;
+  }
+
+  RGWStorageStats bs;
+
+  for (const auto& pair : *stats) {
+    const RGWStorageStats& s = pair.second;
+
+    bs.size += s.size;
+    bs.size_rounded += s.size_rounded;
+    bs.num_objects += s.num_objects;
+  }
+
+  cache->async_refresh_response(user, bucket, bs);
+}
+/* stats cache keyed by bucket: every map_*() hook ignores `user` */
+class RGWBucketStatsCache : public RGWQuotaCache {
+protected:
+  bool map_find(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
+    return stats_map.find(bucket, qs);
+  }
+
+  bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, lru_map::UpdateContext *ctx) override {
+    return stats_map.find_and_update(bucket, NULL, ctx);
+  }
+
+  void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
+    stats_map.add(bucket, qs);
+  }
+
+  int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats) override;
+
+public:
+  explicit RGWBucketStatsCache(RGWRados *_store) : RGWQuotaCache(_store, _store->ctx()->_conf->rgw_bucket_quota_cache_size) {
+  }
+
+  AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) override {
+    return new BucketAsyncRefreshHandler(store, this, user, bucket);
+  }
+};
+/* synchronous read: load the bucket info, fetch its stats and return the totals in `stats` (reset only after both calls succeed) */
+int RGWBucketStatsCache::fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats)
+{
+  RGWBucketInfo bucket_info;
+
+  RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx();
+
+  int r = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket << " r=" << r << dendl;
+    return r;
+  }
+
+  string bucket_ver;
+  string master_ver;
+
+  map bucket_stats;
+  r = store->get_bucket_stats(bucket_info, RGW_NO_SHARD, &bucket_ver,
+                              &master_ver, bucket_stats, nullptr);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket stats for bucket="
+                           << bucket.name << dendl;
+    return r;
+  }
+
+  stats = RGWStorageStats();
+
+  for (const auto& pair : bucket_stats) {
+    const RGWStorageStats& s = pair.second;
+
+    stats.size += s.size;
+    stats.size_rounded += s.size_rounded;
+    stats.num_objects += s.num_objects;
+  }
+
+  return 0;
+}
+/* async refresh handler for the user stats cache; doubles as the completion callback passed to get_user_stats_async() */
+class UserAsyncRefreshHandler : public RGWQuotaCache::AsyncRefreshHandler,
+                                public RGWGetUserStats_CB {
+  rgw_bucket bucket;
+public:
+  UserAsyncRefreshHandler(RGWRados *_store, RGWQuotaCache *_cache,
+                          const rgw_user& _user, const rgw_bucket& _bucket) :
+                          RGWQuotaCache::AsyncRefreshHandler(_store, _cache),
+                          RGWGetUserStats_CB(_user),
+                          bucket(_bucket) {}
+
+  void drop_reference() override { put(); }
+  int init_fetch() override;
+  void handle_response(int r) override;
+};
+
+int UserAsyncRefreshHandler::init_fetch()
+{
+  ldout(store->ctx(), 20) << "initiating async quota refresh for user=" << user << dendl;
+  int r = store->get_user_stats_async(user, this);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get user stats for user=" << user << " r=" << r << dendl;
+
+    /* the failed async call dropped our reference already (this comment originally named get_bucket_stats_async()) */
+    return r;
+  }
+
+  return 0;
+}
+/* completion callback: forward either the failure or the fetched stats to the cache */
+void UserAsyncRefreshHandler::handle_response(int r)
+{
+  if (r < 0) {
+    ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
+    cache->async_refresh_fail(user, bucket);
+    return;
+  }
+
+  cache->async_refresh_response(user, bucket, stats);
+}
+/* stats cache keyed by user; optionally runs two background threads that sync modified buckets and all users */
+class RGWUserStatsCache : public RGWQuotaCache {
+  std::atomic down_flag = { false };  /* NOTE(review): template arguments look stripped in this listing (here and on the map/list declarations below) */
+  RWLock rwlock;  /* guards modified_buckets */
+  map modified_buckets;
+
+  /* thread, sync recent modified buckets info */
+  class BucketsSyncThread : public Thread {
+    CephContext *cct;
+    RGWUserStatsCache *stats;
+
+    Mutex lock;
+    Cond cond;
+  public:
+
+    BucketsSyncThread(CephContext *_cct, RGWUserStatsCache *_s) : cct(_cct), stats(_s), lock("RGWUserStatsCache::BucketsSyncThread") {}
+
+    void *entry() override {
+      ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
+      do {
+        map buckets;
+        /* take over the pending set so it is processed without holding the cache lock */
+        stats->swap_modified_buckets(buckets);
+
+        for (map::iterator iter = buckets.begin(); iter != buckets.end(); ++iter) {
+          rgw_bucket bucket = iter->first;
+          rgw_user& user = iter->second;
+          ldout(cct, 20) << "BucketsSyncThread: sync user=" << user << " bucket=" << bucket << dendl;
+          int r = stats->sync_bucket(user, bucket);
+          if (r < 0) {
+            ldout(cct, 0) << "WARNING: sync_bucket() returned r=" << r << dendl;
+          }
+        }
+
+        if (stats->going_down())
+          break;
+
+        lock.Lock();
+        cond.WaitInterval(lock, utime_t(cct->_conf->rgw_user_quota_bucket_sync_interval, 0));
+        lock.Unlock();
+      } while (!stats->going_down());
+      ldout(cct, 20) << "BucketsSyncThread: done" << dendl;
+
+      return nullptr;
+    }
+    /* wake the thread out of its interval wait. NOTE(review): a signal sent before the thread reaches WaitInterval() looks like it would be missed - confirm */
+    void stop() {
+      Mutex::Locker l(lock);
+      cond.Signal();
+    }
+  };
+
+  /*
+   * thread, full sync all users stats periodically
+   *
+   * only sync non idle users or ones that never got synced before, this is needed so that
+   * users that didn't have quota turned on before (or existed before the user objclass
+   * tracked stats) need to get their backend stats up to date.
+   */
+  class UserSyncThread : public Thread {
+    CephContext *cct;
+    RGWUserStatsCache *stats;
+
+    Mutex lock;
+    Cond cond;
+  public:
+
+    UserSyncThread(CephContext *_cct, RGWUserStatsCache *_s) : cct(_cct), stats(_s), lock("RGWUserStatsCache::UserSyncThread") {}
+
+    void *entry() override {
+      ldout(cct, 20) << "UserSyncThread: start" << dendl;
+      do {
+        int ret = stats->sync_all_users();
+        if (ret < 0) {
+          ldout(cct, 5) << "ERROR: sync_all_users() returned ret=" << ret << dendl;
+        }
+
+        if (stats->going_down())
+          break;
+
+        lock.Lock();
+        cond.WaitInterval(lock, utime_t(cct->_conf->rgw_user_quota_sync_interval, 0));
+        lock.Unlock();
+      } while (!stats->going_down());
+      ldout(cct, 20) << "UserSyncThread: done" << dendl;
+
+      return nullptr;
+    }
+    /* wake the thread out of its interval wait (same caveat as BucketsSyncThread::stop()) */
+    void stop() {
+      Mutex::Locker l(lock);
+      cond.Signal();
+    }
+  };
+
+  BucketsSyncThread *buckets_sync_thread;
+  UserSyncThread *user_sync_thread;
+protected:
+  bool map_find(const rgw_user& user,const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
+    return stats_map.find(user, qs);
+  }
+
+  bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, lru_map::UpdateContext *ctx) override {
+    return stats_map.find_and_update(user, NULL, ctx);
+  }
+
+  void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
+    stats_map.add(user, qs);
+  }
+
+  int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats) override;
+  int sync_bucket(const rgw_user& rgw_user, rgw_bucket& bucket);
+  int sync_user(const rgw_user& user);
+  int sync_all_users();
+
+  void data_modified(const rgw_user& user, rgw_bucket& bucket) override;
+  /* exchange the pending modified-bucket set with `out` under the write lock */
+  void swap_modified_buckets(map& out) {
+    rwlock.get_write();
+    modified_buckets.swap(out);
+    rwlock.unlock();
+  }
+
+  template /* easier doing it as a template, Thread doesn't have ->stop() */
+  void stop_thread(T **pthr) {
+    T *thread = *pthr;
+    if (!thread)
+      return;
+
+    thread->stop();
+    thread->join();
+    delete thread;
+    *pthr = nullptr;
+  }
+
+public:
+  RGWUserStatsCache(RGWRados *_store, bool quota_threads) : RGWQuotaCache(_store, _store->ctx()->_conf->rgw_bucket_quota_cache_size),
+                                        rwlock("RGWUserStatsCache::rwlock") {
+    if (quota_threads) {
+      buckets_sync_thread = new BucketsSyncThread(store->ctx(), this);
+      buckets_sync_thread->create("rgw_buck_st_syn");
+      user_sync_thread = new UserSyncThread(store->ctx(), this);
+      user_sync_thread->create("rgw_user_st_syn");
+    } else {
+      buckets_sync_thread = nullptr;
+      user_sync_thread = nullptr;
+    }
+  }
+  ~RGWUserStatsCache() override {
+    stop();
+  }
+
+  AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) override {
+    return new UserAsyncRefreshHandler(store, this, user, bucket);
+  }
+
+  bool can_use_cached_stats(RGWQuotaInfo& quota, RGWStorageStats& stats) override {
+    /* in the user case, the cached stats may contain a better estimation of the totals, as
+     * the backend is only periodically getting updated.
+     */
+    return true;
+  }
+
+  bool going_down() {
+    return down_flag;
+  }
+  /* flag shutdown, then stop and join both background threads (a null thread pointer is skipped) */
+  void stop() {
+    down_flag = true;
+    rwlock.get_write();  /* NOTE(review): the write lock is held across join() while the bucket thread's loop takes the same lock in swap_modified_buckets() - check for a shutdown deadlock window */
+    stop_thread(&buckets_sync_thread);
+    rwlock.unlock();
+    stop_thread(&user_sync_thread);
+  }
+};
+/* synchronous read of the user's stats; `bucket` is not used */
+int RGWUserStatsCache::fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats)
+{
+  int r = store->get_user_stats(user, stats);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get user stats for user=" << user << dendl;
+    return r;
+  }
+
+  return 0;
+}
+/* load the bucket instance info and pass it to rgw_bucket_sync_user_stats(); returns 0 or the first negative result */
+int RGWUserStatsCache::sync_bucket(const rgw_user& user, rgw_bucket& bucket)
+{
+  RGWBucketInfo bucket_info;
+
+  RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx();
+
+  int r = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket << " r=" << r << dendl;
+    return r;
+  }
+
+  r = rgw_bucket_sync_user_stats(store, user, bucket_info);
+  if (r < 0) {
+    ldout(store->ctx(), 0) << "ERROR: rgw_bucket_sync_user_stats() for user=" << user << ", bucket=" << bucket << " returned " << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+/* full stats sync for one user; skipped for idle users unless rgw_user_quota_sync_idle_users is set */
+int RGWUserStatsCache::sync_user(const rgw_user& user)
+{
+  cls_user_header header;
+  string user_str = user.to_str();
+  int ret = store->cls_user_get_header(user_str, &header);
+  if (ret < 0) {
+    ldout(store->ctx(), 5) << "ERROR: can't read user header: ret=" << ret << dendl;
+    return ret;
+  }
+
+  if (!store->ctx()->_conf->rgw_user_quota_sync_idle_users &&
+      header.last_stats_update < header.last_stats_sync) {
+    ldout(store->ctx(), 20) << "user is idle, not doing a full sync (user=" << user << ")" << dendl;
+    return 0;
+  }
+
+  real_time when_need_full_sync = header.last_stats_sync;
+  when_need_full_sync += make_timespan(store->ctx()->_conf->rgw_user_quota_sync_wait_time);  /* computed but never compared against the current time, see the FIXME below */
+
+  // check if enough time passed since last full sync
+  /* FIXME: missing check? */
+
+  ret = rgw_user_sync_all_stats(store, user);
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "ERROR: failed user stats sync, ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+/* page through all "user" metadata keys and sync each user; per-user failures are logged and skipped, and listing stops once shutdown is flagged */
+int RGWUserStatsCache::sync_all_users()
+{
+  string key = "user";
+  void *handle;
+
+  int ret = store->meta_mgr->list_keys_init(key, &handle);
+  if (ret < 0) {
+    ldout(store->ctx(), 10) << "ERROR: can't get key: ret=" << ret << dendl;
+    return ret;
+  }
+
+  bool truncated;
+  int max = 1000;
+
+  do {
+    list keys;
+    ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+    if (ret < 0) {
+      ldout(store->ctx(), 0) << "ERROR: list_keys_next(): ret=" << ret << dendl;
+      goto done;
+    }
+    for (list::iterator iter = keys.begin();
+         iter != keys.end() && !going_down();
+         ++iter) {
+      rgw_user user(*iter);
+      ldout(store->ctx(), 20) << "RGWUserStatsCache: sync user=" << user << dendl;
+      int r = sync_user(user);  /* named `r` so it no longer shadows the function-level `ret` */
+      if (r < 0) {
+        ldout(store->ctx(), 5) << "ERROR: sync_user() failed, user=" << user << " ret=" << r << dendl;
+
+        /* continuing to next user */
+        continue;
+      }
+    }
+  } while (truncated && !going_down());  /* do not keep listing keys once shutdown was requested */
+
+  ret = 0;
+done:
+  store->meta_mgr->list_keys_complete(handle);
+  return ret;
+}
+/* remember that `bucket` (owned by `user`) changed so the bucket sync thread picks it up; takes the write lock only when the bucket is not already pending */
+void RGWUserStatsCache::data_modified(const rgw_user& user, rgw_bucket& bucket)
+{
+  /* racy, but it's ok */
+  rwlock.get_read();
+  bool need_update = modified_buckets.find(bucket) == modified_buckets.end();
+  rwlock.unlock();
+
+  if (need_update) {
+    rwlock.get_write();
+    modified_buckets[bucket] = user;
+    rwlock.unlock();
+  }
+}
+
+
+class RGWQuotaInfoApplier {
+  /* NOTE: no non-static field allowed as instances are supposed to live in
+   * the static memory only.
*/
+protected:
+  RGWQuotaInfoApplier() = default;
+
+public:
+  virtual ~RGWQuotaInfoApplier() {}
+  /* true when adding `size` bytes would exceed qinfo.max_size; `entity` is only used as a label in the log line */
+  virtual bool is_size_exceeded(const char * const entity,
+                                const RGWQuotaInfo& qinfo,
+                                const RGWStorageStats& stats,
+                                const uint64_t size) const = 0;
+  /* true when adding `num_objs` objects would exceed qinfo.max_objects */
+  virtual bool is_num_objs_exceeded(const char * const entity,
+                                    const RGWQuotaInfo& qinfo,
+                                    const RGWStorageStats& stats,
+                                    const uint64_t num_objs) const = 0;
+  /* returns one of two function-local static appliers, selected by qinfo.check_on_raw */
+  static const RGWQuotaInfoApplier& get_instance(const RGWQuotaInfo& qinfo);
+};
+/* default applier: size checks use the rounded sizes (stats.size_rounded and rgw_rounded_objsize(size)) */
+class RGWQuotaInfoDefApplier : public RGWQuotaInfoApplier {
+public:
+  bool is_size_exceeded(const char * const entity,
+                        const RGWQuotaInfo& qinfo,
+                        const RGWStorageStats& stats,
+                        const uint64_t size) const override;
+
+  bool is_num_objs_exceeded(const char * const entity,
+                            const RGWQuotaInfo& qinfo,
+                            const RGWStorageStats& stats,
+                            const uint64_t num_objs) const override;
+};
+/* raw applier: size checks use stats.size and `size` as given, without rounding */
+class RGWQuotaInfoRawApplier : public RGWQuotaInfoApplier {
+public:
+  bool is_size_exceeded(const char * const entity,
+                        const RGWQuotaInfo& qinfo,
+                        const RGWStorageStats& stats,
+                        const uint64_t size) const override;
+
+  bool is_num_objs_exceeded(const char * const entity,
+                            const RGWQuotaInfo& qinfo,
+                            const RGWStorageStats& stats,
+                            const uint64_t num_objs) const override;
+};
+
+/* rounded-size check: stats.size_rounded + rgw_rounded_objsize(size) against max_size; a negative max_size disables the limit */
+bool RGWQuotaInfoDefApplier::is_size_exceeded(const char * const entity,
+                                              const RGWQuotaInfo& qinfo,
+                                              const RGWStorageStats& stats,
+                                              const uint64_t size) const
+{
+  if (qinfo.max_size < 0) {
+    /* The limit is not enabled. */
+    return false;
+  }
+
+  const uint64_t cur_size = stats.size_rounded;
+  const uint64_t new_size = rgw_rounded_objsize(size);
+
+  if (cur_size + new_size > static_cast(qinfo.max_size)) {  /* NOTE(review): the static_cast target type is missing in this listing */
+    dout(10) << "quota exceeded: stats.size_rounded=" << stats.size_rounded
+             << " size=" << new_size << " "
+             << entity << "_quota.max_size=" << qinfo.max_size << dendl;
+    return true;
+  }
+
+  return false;
+}
+/* object-count check: stats.num_objects + num_objs against max_objects; a negative max_objects disables the limit */
+bool RGWQuotaInfoDefApplier::is_num_objs_exceeded(const char * const entity,
+                                                  const RGWQuotaInfo& qinfo,
+                                                  const RGWStorageStats& stats,
+                                                  const uint64_t num_objs) const
+{
+  if (qinfo.max_objects < 0) {
+    /* The limit is not enabled. */
+    return false;
+  }
+
+  if (stats.num_objects + num_objs > static_cast(qinfo.max_objects)) {
+    dout(10) << "quota exceeded: stats.num_objects=" << stats.num_objects
+             << " " << entity << "_quota.max_objects=" << qinfo.max_objects
+             << dendl;
+    return true;
+  }
+
+  return false;
+}
+/* raw-size check: stats.size + size against max_size; a negative max_size disables the limit */
+bool RGWQuotaInfoRawApplier::is_size_exceeded(const char * const entity,
+                                              const RGWQuotaInfo& qinfo,
+                                              const RGWStorageStats& stats,
+                                              const uint64_t size) const
+{
+  if (qinfo.max_size < 0) {
+    /* The limit is not enabled. */
+    return false;
+  }
+
+  const uint64_t cur_size = stats.size;
+
+  if (cur_size + size > static_cast(qinfo.max_size)) {
+    dout(10) << "quota exceeded: stats.size=" << stats.size
+             << " size=" << size << " "
+             << entity << "_quota.max_size=" << qinfo.max_size << dendl;
+    return true;
+  }
+
+  return false;
+}
+/* object-count check; the body is the same as RGWQuotaInfoDefApplier::is_num_objs_exceeded() */
+bool RGWQuotaInfoRawApplier::is_num_objs_exceeded(const char * const entity,
+                                                  const RGWQuotaInfo& qinfo,
+                                                  const RGWStorageStats& stats,
+                                                  const uint64_t num_objs) const
+{
+  if (qinfo.max_objects < 0) {
+    /* The limit is not enabled. */
+    return false;
+  }
+
+  if (stats.num_objects + num_objs > static_cast(qinfo.max_objects)) {
+    dout(10) << "quota exceeded: stats.num_objects=" << stats.num_objects
+             << " " << entity << "_quota.max_objects=" << qinfo.max_objects
+             << dendl;
+    return true;
+  }
+
+  return false;
+}
+/* pick the applier for `qinfo`: the raw one when check_on_raw is set, the default one otherwise */
+const RGWQuotaInfoApplier& RGWQuotaInfoApplier::get_instance(
+  const RGWQuotaInfo& qinfo)
+{
+  static RGWQuotaInfoDefApplier default_qapplier;
+  static RGWQuotaInfoRawApplier raw_qapplier;
+
+  if (qinfo.check_on_raw) {
+    return raw_qapplier;
+  } else {
+    return default_qapplier;
+  }
+}
+
+
+class RGWQuotaHandlerImpl : public RGWQuotaHandler {
+  RGWRados *store;
+  RGWBucketStatsCache bucket_stats_cache;
+  RGWUserStatsCache user_stats_cache;
+
+  int check_quota(const char * const entity,
+                  const RGWQuotaInfo& quota,
+                  const RGWStorageStats& stats,
+                  const uint64_t num_objs,
+                  const uint64_t size) {
+    if (!quota.enabled) {
+      return 0;
+    }
+
+    const auto& quota_applier = RGWQuotaInfoApplier::get_instance(quota);
+
+    ldout(store->ctx(), 20) << entity
+      << " quota: max_objects=" << quota.max_objects
+      << " max_size=" << quota.max_size << dendl;
+
+
+    if (quota_applier.is_num_objs_exceeded(entity, quota, stats, num_objs)) {
+      return -ERR_QUOTA_EXCEEDED;
+    }
+
+    if (quota_applier.is_size_exceeded(entity, quota, stats, size)) {
+      return -ERR_QUOTA_EXCEEDED;
+    }
+
+    ldout(store->ctx(), 20) << entity << " quota OK:"
+                            << " stats.num_objects=" << stats.num_objects
+                            << " stats.size=" << stats.size << dendl;
+    return 0;
+  }
+public:
+  RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store),
+                                    bucket_stats_cache(_store),
+                                    user_stats_cache(_store, quota_threads) {}
+
+  int check_quota(const rgw_user& user,
+                  rgw_bucket& bucket,
+                  RGWQuotaInfo& user_quota,
+                  RGWQuotaInfo& bucket_quota,
+                  uint64_t num_objs,
+                  uint64_t size) override {
+
+    if (!bucket_quota.enabled && !user_quota.enabled) {
+      return 0;
+    }
+
+    /*
+     * we need to fetch bucket stats if the user quota is enabled,
because + * the whole system relies on us periodically updating the user's bucket + * stats in the user's header, this happens in get_stats() if we actually + * fetch that info and not rely on cached data + */ + + if (bucket_quota.enabled) { + RGWStorageStats bucket_stats; + int ret = bucket_stats_cache.get_stats(user, bucket, bucket_stats, + bucket_quota); + if (ret < 0) { + return ret; + } + ret = check_quota("bucket", bucket_quota, bucket_stats, num_objs, size); + if (ret < 0) { + return ret; + } + } + + if (user_quota.enabled) { + RGWStorageStats user_stats; + int ret = user_stats_cache.get_stats(user, bucket, user_stats, user_quota); + if (ret < 0) { + return ret; + } + ret = check_quota("user", user_quota, user_stats, num_objs, size); + if (ret < 0) { + return ret; + } + } + return 0; + } + + void update_stats(const rgw_user& user, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) override { + bucket_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes); + user_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes); + } + + int check_bucket_shards(uint64_t max_objs_per_shard, uint64_t num_shards, + const rgw_user& user, const rgw_bucket& bucket, RGWQuotaInfo& bucket_quota, + uint64_t num_objs, bool& need_resharding, uint32_t *suggested_num_shards) override + { + RGWStorageStats bucket_stats; + int ret = bucket_stats_cache.get_stats(user, bucket, bucket_stats, + bucket_quota); + if (ret < 0) { + return ret; + } + + if (bucket_stats.num_objects + num_objs > num_shards * max_objs_per_shard) { + ldout(store->ctx(), 0) << __func__ << ": resharding needed: stats.num_objects=" << bucket_stats.num_objects + << " shard max_objects=" << max_objs_per_shard * num_shards << dendl; + need_resharding = true; + if (suggested_num_shards) { + *suggested_num_shards = (bucket_stats.num_objects + num_objs) * 2 / max_objs_per_shard; + } + } else { + need_resharding = false; + } + + return 0; + } + +}; + 
+ +RGWQuotaHandler *RGWQuotaHandler::generate_handler(RGWRados *store, bool quota_threads) +{ + return new RGWQuotaHandlerImpl(store, quota_threads); +} + +void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler) +{ + delete handler; +} + + +void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf) +{ + if (conf->rgw_bucket_default_quota_max_objects >= 0) { + quota.max_objects = conf->rgw_bucket_default_quota_max_objects; + quota.enabled = true; + } + if (conf->rgw_bucket_default_quota_max_size >= 0) { + quota.max_size = conf->rgw_bucket_default_quota_max_size; + quota.enabled = true; + } +} + +void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf) +{ + if (conf->rgw_user_default_quota_max_objects >= 0) { + quota.max_objects = conf->rgw_user_default_quota_max_objects; + quota.enabled = true; + } + if (conf->rgw_user_default_quota_max_size >= 0) { + quota.max_size = conf->rgw_user_default_quota_max_size; + quota.enabled = true; + } +} diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h new file mode 100644 index 00000000..a048aa7d --- /dev/null +++ b/src/rgw/rgw_quota.h @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_RGW_QUOTA_H +#define CEPH_RGW_QUOTA_H + +#include "include/utime.h" +#include "common/config_fwd.h" +#include "common/lru_map.h" + +#include + +static inline int64_t rgw_rounded_kb(int64_t bytes) +{ + return (bytes + 1023) / 1024; +} + +class RGWRados; +class JSONObj; + +struct RGWQuotaInfo { + template friend class RGWQuotaCache; +protected: + /* The quota thresholds after which comparing against cached storage stats + * is disallowed. Those fields may be accessed only by the RGWQuotaCache. + * They are not intended as tunables but rather as a mean to store results + * of repeating calculations in the quota cache subsystem. */ + int64_t max_size_soft_threshold; + int64_t max_objs_soft_threshold; + +public: + int64_t max_size; + int64_t max_objects; + bool enabled; + /* Do we want to compare with raw, not rounded RGWStorageStats::size (true) + * or maybe rounded-to-4KiB RGWStorageStats::size_rounded (false)? */ + bool check_on_raw; + + RGWQuotaInfo() + : max_size_soft_threshold(-1), + max_objs_soft_threshold(-1), + max_size(-1), + max_objects(-1), + enabled(false), + check_on_raw(false) { + } + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + if (max_size < 0) { + encode(-rgw_rounded_kb(abs(max_size)), bl); + } else { + encode(rgw_rounded_kb(max_size), bl); + } + encode(max_objects, bl); + encode(enabled, bl); + encode(max_size, bl); + encode(check_on_raw, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 1, 1, bl); + int64_t max_size_kb; + decode(max_size_kb, bl); + decode(max_objects, bl); + decode(enabled, bl); + if (struct_v < 2) { + max_size = max_size_kb * 1024; + } else { + decode(max_size, bl); + } + if (struct_v >= 3) { + decode(check_on_raw, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + + void decode_json(JSONObj *obj); + +}; +WRITE_CLASS_ENCODER(RGWQuotaInfo) + +struct rgw_bucket; + +class RGWQuotaHandler { +public: + 
RGWQuotaHandler() {} + virtual ~RGWQuotaHandler() { + } + virtual int check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, + uint64_t num_objs, uint64_t size) = 0; + + virtual int check_bucket_shards(uint64_t max_objs_per_shard, uint64_t num_shards, + const rgw_user& bucket_owner, const rgw_bucket& bucket, + RGWQuotaInfo& bucket_quota, uint64_t num_objs, bool& need_resharding, + uint32_t *suggested_num_shards) = 0; + + virtual void update_stats(const rgw_user& bucket_owner, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0; + + static RGWQuotaHandler *generate_handler(RGWRados *store, bool quota_threads); + static void free_handler(RGWQuotaHandler *handler); +}; + +// apply default quotas from configuration +void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf); +void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf); + +#endif diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc new file mode 100644 index 00000000..4d3ae0b8 --- /dev/null +++ b/src/rgw/rgw_rados.cc @@ -0,0 +1,10734 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "common/ceph_json.h" + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/Throttle.h" + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_cache.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */ +#include "rgw_aio_throttle.h" +#include "rgw_bucket.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_putobj_processor.h" + +#include "cls/rgw/cls_rgw_ops.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/rgw/cls_rgw_const.h" +#include 
"cls/refcount/cls_refcount_client.h" +#include "cls/version/cls_version_client.h" +#include "cls/log/cls_log_client.h" +#include "cls/timeindex/cls_timeindex_client.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/user/cls_user_client.h" +#include "cls/otp/cls_otp_client.h" +#include "osd/osd_types.h" + +#include "rgw_tools.h" +#include "rgw_coroutine.h" +#include "rgw_compression.h" +#include "rgw_etag_verifier.h" + +#undef fork // fails to compile RGWPeriod::fork() below + +#include "common/Clock.h" + +using namespace librados; + +#include +#include +#include +#include +#include +#include +#include "include/random.h" + +#include "rgw_gc.h" +#include "rgw_lc.h" + +#include "rgw_object_expirer_core.h" +#include "rgw_sync.h" +#include "rgw_sync_counters.h" +#include "rgw_sync_trace.h" +#include "rgw_data_sync.h" +#include "rgw_realm_watcher.h" +#include "rgw_reshard.h" + +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_quota.h" +#include "services/svc_sync_modules.h" +#include "services/svc_sys_obj.h" +#include "services/svc_sys_obj_cache.h" + +#include "compressor/Compressor.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/rgw_rados.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + + +static string shadow_ns = "shadow"; +static string dir_oid_prefix = ".dir."; +static string default_bucket_index_pool_suffix = "rgw.buckets.index"; +static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec"; + +static string log_lock_name = "rgw_log_lock"; +static RGWObjCategory main_category = RGWObjCategory::Main; +#define RGW_USAGE_OBJ_PREFIX "usage." 
+ +#define dout_subsys ceph_subsys_rgw + +const std::string MP_META_SUFFIX = ".meta"; + + +static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params, + const rgw_placement_rule& head_placement_rule, + const rgw_obj& obj, rgw_pool *pool) +{ + if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) { + RGWZonePlacementInfo placement; + if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) { + return false; + } + + if (!obj.in_extra_data) { + *pool = placement.get_data_pool(zonegroup.default_placement.storage_class); + } else { + *pool = placement.get_data_extra_pool(); + } + } + + return true; +} + +static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params, + const rgw_placement_rule& head_placement_rule, + const rgw_obj& obj, rgw_raw_obj *raw_obj) +{ + get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc); + + return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool); +} + +rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const +{ + if (!is_raw) { + rgw_raw_obj r; + rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r); + return r; + } + return raw_obj; +} + +rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const +{ + if (!is_raw) { + rgw_raw_obj r; + store->obj_to_raw(placement_rule, obj, &r); + return r; + } + return raw_obj; +} + +void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op) +{ + obj_version *check_objv = version_for_check(); + + if (check_objv) { + cls_version_check(*op, *check_objv, VER_COND_EQ); + } + + cls_version_read(*op, &read_version); +} + +void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op) +{ + obj_version *check_objv = version_for_check(); + obj_version *modify_version = version_for_write(); + + if (check_objv) { + cls_version_check(*op, *check_objv, VER_COND_EQ); + } + + if 
(modify_version) { + cls_version_set(*op, *modify_version); + } else { + cls_version_inc(*op); + } +} + +void RGWObjVersionTracker::apply_write() +{ + const bool checked = (read_version.ver != 0); + const bool incremented = (write_version.ver == 0); + + if (checked && incremented) { + // apply cls_version_inc() so our next operation can recheck it + ++read_version.ver; + } else { + read_version = write_version; + } + write_version = obj_version(); +} + +void RGWObjManifest::obj_iterator::operator++() +{ + if (manifest->explicit_objs) { + ++explicit_iter; + + update_explicit_pos(); + + update_location(); + return; + } + + uint64_t obj_size = manifest->get_obj_size(); + uint64_t head_size = manifest->get_head_size(); + + if (ofs == obj_size) { + return; + } + + if (manifest->rules.empty()) { + return; + } + + /* are we still pointing at the head? */ + if (ofs < head_size) { + rule_iter = manifest->rules.begin(); + RGWObjManifestRule *rule = &rule_iter->second; + ofs = std::min(head_size, obj_size); + stripe_ofs = ofs; + cur_stripe = 1; + stripe_size = std::min(obj_size - ofs, rule->stripe_max_size); + if (rule->part_size > 0) { + stripe_size = std::min(stripe_size, rule->part_size); + } + update_location(); + return; + } + + RGWObjManifestRule *rule = &rule_iter->second; + + stripe_ofs += rule->stripe_max_size; + cur_stripe++; + dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl; + + if (rule->part_size > 0) { + /* multi part, multi stripes object */ + + dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl; + + if (stripe_ofs >= part_ofs + rule->part_size) { + /* moved to the next part */ + cur_stripe = 0; + part_ofs += rule->part_size; + stripe_ofs = part_ofs; + + bool last_rule = (next_rule_iter == manifest->rules.end()); + /* move to the next rule? 
*/ + if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) { + rule_iter = next_rule_iter; + last_rule = (next_rule_iter == manifest->rules.end()); + if (!last_rule) { + ++next_rule_iter; + } + cur_part_id = rule_iter->second.start_part_num; + } else { + cur_part_id++; + } + + rule = &rule_iter->second; + } + + stripe_size = std::min(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size); + } + + cur_override_prefix = rule->override_prefix; + + ofs = stripe_ofs; + if (ofs > obj_size) { + ofs = obj_size; + stripe_ofs = ofs; + stripe_size = 0; + } + + dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl; + update_location(); +} + +int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, + const rgw_placement_rule& head_placement_rule, + const rgw_placement_rule *tail_placement_rule, + const rgw_bucket& _b, const rgw_obj& _obj) +{ + manifest = _m; + + if (!tail_placement_rule) { + manifest->set_tail_placement(head_placement_rule, _b); + } else { + rgw_placement_rule new_tail_rule = *tail_placement_rule; + new_tail_rule.inherit_from(head_placement_rule); + manifest->set_tail_placement(new_tail_rule, _b); + } + + manifest->set_head(head_placement_rule, _obj, 0); + last_ofs = 0; + + if (manifest->get_prefix().empty()) { + char buf[33]; + gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1); + + string oid_prefix = "."; + oid_prefix.append(buf); + oid_prefix.append("_"); + + manifest->set_prefix(oid_prefix); + } + + bool found = manifest->get_rule(0, &rule); + if (!found) { + derr << "ERROR: manifest->get_rule() could not find rule" << dendl; + return -EIO; + } + + uint64_t head_size = manifest->get_head_size(); + + if (head_size > 0) { + cur_stripe_size = head_size; + } else { + cur_stripe_size = rule.stripe_max_size; + } + + cur_part_id = rule.start_part_num; + + manifest->get_implicit_location(cur_part_id, 
cur_stripe, 0, NULL, &cur_obj); + + // Normal object which not generated through copy operation + manifest->set_tail_instance(_obj.key.instance); + + manifest->update_iterators(); + + return 0; +} + +int RGWObjManifest::generator::create_next(uint64_t ofs) +{ + if (ofs < last_ofs) /* only going forward */ + return -EINVAL; + + uint64_t max_head_size = manifest->get_max_head_size(); + + if (ofs < max_head_size) { + manifest->set_head_size(ofs); + } + + if (ofs >= max_head_size) { + manifest->set_head_size(max_head_size); + cur_stripe = (ofs - max_head_size) / rule.stripe_max_size; + cur_stripe_size = rule.stripe_max_size; + + if (cur_part_id == 0 && max_head_size > 0) { + cur_stripe++; + } + } + + last_ofs = ofs; + manifest->set_obj_size(ofs); + + manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj); + + manifest->update_iterators(); + + return 0; +} + +const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin() +{ + return begin_iter; +} + +const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end() +{ + return end_iter; +} + +RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs) +{ + if (ofs > obj_size) { + ofs = obj_size; + } + RGWObjManifest::obj_iterator iter(this); + iter.seek(ofs); + return iter; +} + +int RGWObjManifest::append(RGWObjManifest& m, const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone_params) +{ + if (explicit_objs || m.explicit_objs) { + return append_explicit(m, zonegroup, zone_params); + } + + if (rules.empty()) { + *this = m; + return 0; + } + + string override_prefix; + + if (prefix.empty()) { + prefix = m.prefix; + } + + if (prefix != m.prefix) { + override_prefix = m.prefix; + } + + map::iterator miter = m.rules.begin(); + if (miter == m.rules.end()) { + return append_explicit(m, zonegroup, zone_params); + } + + for (; miter != m.rules.end(); ++miter) { + map::reverse_iterator last_rule = rules.rbegin(); + + RGWObjManifestRule& rule = last_rule->second; + + if (rule.part_size == 0) { + 
rule.part_size = obj_size - rule.start_ofs; + } + + RGWObjManifestRule& next_rule = miter->second; + if (!next_rule.part_size) { + next_rule.part_size = m.obj_size - next_rule.start_ofs; + } + + string rule_prefix = prefix; + if (!rule.override_prefix.empty()) { + rule_prefix = rule.override_prefix; + } + + string next_rule_prefix = m.prefix; + if (!next_rule.override_prefix.empty()) { + next_rule_prefix = next_rule.override_prefix; + } + + if (rule.part_size != next_rule.part_size || + rule.stripe_max_size != next_rule.stripe_max_size || + rule_prefix != next_rule_prefix) { + if (next_rule_prefix != prefix) { + append_rules(m, miter, &next_rule_prefix); + } else { + append_rules(m, miter, NULL); + } + break; + } + + uint64_t expected_part_num = rule.start_part_num + 1; + if (rule.part_size > 0) { + expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size; + } + + if (expected_part_num != next_rule.start_part_num) { + append_rules(m, miter, NULL); + break; + } + } + + set_obj_size(obj_size + m.obj_size); + + return 0; +} + +int RGWObjManifest::append(RGWObjManifest& m, RGWSI_Zone *zone_svc) +{ + return append(m, zone_svc->get_zonegroup(), zone_svc->get_zone_params()); +} + +void RGWObjManifest::append_rules(RGWObjManifest& m, map::iterator& miter, + string *override_prefix) +{ + for (; miter != m.rules.end(); ++miter) { + RGWObjManifestRule rule = miter->second; + rule.start_ofs += obj_size; + if (override_prefix) + rule.override_prefix = *override_prefix; + rules[rule.start_ofs] = rule; + } +} + +void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) +{ + if (explicit_objs) { + return; + } + obj_iterator iter = obj_begin(); + + while (iter != obj_end()) { + RGWObjManifestPart& part = objs[iter.get_stripe_ofs()]; + const rgw_obj_select& os = iter.get_location(); + const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params); + part.loc_ofs = 0; + + uint64_t 
ofs = iter.get_stripe_ofs(); + + if (ofs == 0) { + part.loc = obj; + } else { + rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc); + } + ++iter; + uint64_t next_ofs = iter.get_stripe_ofs(); + + part.size = next_ofs - ofs; + } + + explicit_objs = true; + rules.clear(); + prefix.clear(); +} + +int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) +{ + if (!explicit_objs) { + convert_to_explicit(zonegroup, zone_params); + } + if (!m.explicit_objs) { + m.convert_to_explicit(zonegroup, zone_params); + } + map::iterator iter; + uint64_t base = obj_size; + for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) { + RGWObjManifestPart& part = iter->second; + objs[base + iter->first] = part; + } + obj_size += m.obj_size; + + return 0; +} + +bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) +{ + if (rules.empty()) { + return false; + } + + map::iterator iter = rules.upper_bound(ofs); + if (iter != rules.begin()) { + --iter; + } + + *rule = iter->second; + + return true; +} + +void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct) +{ + write_version.ver = 1; +#define TAG_LEN 24 + + write_version.tag.clear(); + append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN); +} + +class RGWMetaNotifierManager : public RGWCoroutinesManager { + RGWRados *store; + RGWHTTPManager http_manager; + +public: + RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store), + http_manager(store->ctx(), completion_mgr) { + http_manager.start(); + } + + int notify_all(map& conn_map, set& shards) { + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "notify", NULL }, + { NULL, NULL } }; + + list stacks; + for (map::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); + 
stack->call(new RGWPostRESTResourceCR, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL)); + + stacks.push_back(stack); + } + return run(stacks); + } +}; + +class RGWDataNotifierManager : public RGWCoroutinesManager { + RGWRados *store; + RGWHTTPManager http_manager; + +public: + RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store), + http_manager(store->ctx(), completion_mgr) { + http_manager.start(); + } + + int notify_all(map& conn_map, map >& shards) { + rgw_http_param_pair pairs[] = { { "type", "data" }, + { "notify", NULL }, + { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() }, + { NULL, NULL } }; + + list stacks; + for (map::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); + stack->call(new RGWPostRESTResourceCR >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL)); + + stacks.push_back(stack); + } + return run(stacks); + } +}; + +/* class RGWRadosThread */ + +void RGWRadosThread::start() +{ + worker = new Worker(cct, this); + worker->create(thread_name.c_str()); +} + +void RGWRadosThread::stop() +{ + down_flag = true; + stop_process(); + if (worker) { + worker->signal(); + worker->join(); + } + delete worker; + worker = NULL; +} + +void *RGWRadosThread::Worker::entry() { + uint64_t msec = processor->interval_msec(); + utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000); + + do { + utime_t start = ceph_clock_now(); + int r = processor->process(); + if (r < 0) { + dout(0) << "ERROR: processor->process() returned error r=" << r << dendl; + } + + if (processor->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + + uint64_t cur_msec = processor->interval_msec(); + if (cur_msec != msec) { /* was it reconfigured? 
*/ + msec = cur_msec; + interval = utime_t(msec / 1000, (msec % 1000) * 1000000); + } + + if (cur_msec > 0) { + if (interval <= end) + continue; // next round + + utime_t wait_time = interval; + wait_time -= end; + + wait_interval(wait_time); + } else { + wait(); + } + } while (!processor->going_down()); + + return NULL; +} + +class RGWMetaNotifier : public RGWRadosThread { + RGWMetaNotifierManager notify_mgr; + RGWMetadataLog *const log; + + uint64_t interval_msec() override { + return cct->_conf->rgw_md_notify_interval_msec; + } + void stop_process() override { + notify_mgr.stop(); + } +public: + RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log) + : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {} + + int process() override; +}; + +int RGWMetaNotifier::process() +{ + set shards; + + log->read_clear_modified(shards); + + if (shards.empty()) { + return 0; + } + + for (set::iterator iter = shards.begin(); iter != shards.end(); ++iter) { + ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl; + } + + notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards); + + return 0; +} + +class RGWDataNotifier : public RGWRadosThread { + RGWDataNotifierManager notify_mgr; + + uint64_t interval_msec() override { + return cct->_conf.get_val("rgw_data_notify_interval_msec"); + } + void stop_process() override { + notify_mgr.stop(); + } +public: + RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {} + + int process() override; +}; + +int RGWDataNotifier::process() +{ + if (!store->data_log) { + return 0; + } + + map > shards; + + store->data_log->read_clear_modified(shards); + + if (shards.empty()) { + return 0; + } + + for (map >::iterator iter = shards.begin(); iter != shards.end(); ++iter) { + ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl; + } + + 
notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards); + + return 0; +} + +class RGWSyncProcessorThread : public RGWRadosThread { +public: + RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {} + RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {} + ~RGWSyncProcessorThread() override {} + int init() override = 0 ; + int process() override = 0; +}; + +class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread +{ + RGWMetaSyncStatusManager sync; + + uint64_t interval_msec() override { + return 0; /* no interval associated, it'll run once until stopped */ + } + void stop_process() override { + sync.stop(); + } +public: + RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados) + : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {} + + void wakeup_sync_shards(set& shard_ids) { + for (set::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) { + sync.wakeup(*iter); + } + } + RGWMetaSyncStatusManager* get_manager() { return &sync; } + + int init() override { + int ret = sync.init(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl; + return ret; + } + return 0; + } + + int process() override { + sync.run(); + return 0; + } +}; + +class RGWDataSyncProcessorThread : public RGWSyncProcessorThread +{ + PerfCountersRef counters; + RGWDataSyncStatusManager sync; + bool initialized; + + uint64_t interval_msec() override { + if (initialized) { + return 0; /* no interval associated, it'll run once until stopped */ + } else { +#define DATA_SYNC_INIT_WAIT_SEC 20 + return DATA_SYNC_INIT_WAIT_SEC * 1000; + } + } + void stop_process() override { + sync.stop(); + } +public: + RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados, + const RGWZone* source_zone) + : RGWSyncProcessorThread(_store, "data-sync"), + 
counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)), + sync(_store, async_rados, source_zone->id, counters.get()), + initialized(false) {} + + void wakeup_sync_shards(map >& shard_ids) { + for (map >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) { + sync.wakeup(iter->first, iter->second); + } + } + RGWDataSyncStatusManager* get_manager() { return &sync; } + + int init() override { + return 0; + } + + int process() override { + while (!initialized) { + if (going_down()) { + return 0; + } + int ret = sync.init(); + if (ret >= 0) { + initialized = true; + break; + } + /* we'll be back! */ + return 0; + } + sync.run(); + return 0; + } +}; + +class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider +{ + RGWCoroutinesManager crs; + RGWRados *store; + rgw::BucketTrimManager *bucket_trim; + RGWHTTPManager http; + const utime_t trim_interval; + + uint64_t interval_msec() override { return 0; } + void stop_process() override { crs.stop(); } +public: + RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim, + int interval) + : RGWSyncProcessorThread(store, "sync-log-trim"), + crs(store->ctx(), store->get_cr_registry()), store(store), + bucket_trim(bucket_trim), + http(store->ctx(), crs.get_completion_mgr()), + trim_interval(interval, 0) + {} + + int init() override { + return http.start(); + } + int process() override { + list stacks; + auto meta = new RGWCoroutinesStack(store->ctx(), &crs); + meta->call(create_meta_log_trim_cr(this, store, &http, + cct->_conf->rgw_md_log_max_shards, + trim_interval)); + stacks.push_back(meta); + + auto data = new RGWCoroutinesStack(store->ctx(), &crs); + data->call(create_data_log_trim_cr(store, &http, + cct->_conf->rgw_data_log_num_shards, + trim_interval)); + stacks.push_back(data); + + auto bucket = new RGWCoroutinesStack(store->ctx(), &crs); + bucket->call(bucket_trim->create_bucket_trim_cr(&http)); + stacks.push_back(bucket); + + 
crs.run(stacks); + return 0; + } + + // implements DoutPrefixProvider + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const + { + return dout_subsys; + } + + std::ostream& gen_prefix(std::ostream& out) const + { + return out << "sync log trim: "; + } + +}; + +void RGWRados::wakeup_meta_sync_shards(set& shard_ids) +{ + Mutex::Locker l(meta_sync_thread_lock); + if (meta_sync_processor_thread) { + meta_sync_processor_thread->wakeup_sync_shards(shard_ids); + } +} + +void RGWRados::wakeup_data_sync_shards(const string& source_zone, map >& shard_ids) +{ + ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl; + Mutex::Locker l(data_sync_thread_lock); + map::iterator iter = data_sync_processor_threads.find(source_zone); + if (iter == data_sync_processor_threads.end()) { + ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl; + return; + } + + RGWDataSyncProcessorThread *thread = iter->second; + ceph_assert(thread); + thread->wakeup_sync_shards(shard_ids); +} + +RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager() +{ + Mutex::Locker l(meta_sync_thread_lock); + if (meta_sync_processor_thread) { + return meta_sync_processor_thread->get_manager(); + } + return nullptr; +} + +RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone) +{ + Mutex::Locker l(data_sync_thread_lock); + auto thread = data_sync_processor_threads.find(source_zone); + if (thread == data_sync_processor_threads.end()) { + return nullptr; + } + return thread->second->get_manager(); +} + +int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment) +{ + IoCtx ioctx; + int r = open_pool_ctx(pool, ioctx, false); + if (r < 0) { + ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl; + return r; + } + + bool requires; + r = ioctx.pool_requires_alignment2(&requires); + 
if (r < 0) { + ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned " + << r << dendl; + return r; + } + + if (!requires) { + *alignment = 0; + return 0; + } + + uint64_t align; + r = ioctx.pool_required_alignment2(&align); + if (r < 0) { + ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned " + << r << dendl; + return r; + } + if (align != 0) { + ldout(cct, 20) << "required alignment=" << align << dendl; + } + *alignment = align; + return 0; +} + +void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size) +{ + if (alignment == 0) { + *max_size = size; + return; + } + + if (size <= alignment) { + *max_size = alignment; + return; + } + + *max_size = size - (size % alignment); +} + +int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment) +{ + uint64_t alignment; + int r = get_required_alignment(pool, &alignment); + if (r < 0) { + return r; + } + + if (palignment) { + *palignment = alignment; + } + + uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size; + + get_max_aligned_size(config_chunk_size, alignment, max_chunk_size); + + ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl; + + return 0; +} + +int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, + uint64_t *max_chunk_size, uint64_t *palignment) +{ + rgw_pool pool; + if (!get_obj_data_pool(placement_rule, obj, &pool)) { + ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl; + return -EIO; + } + return get_max_chunk_size(pool, max_chunk_size, palignment); +} + +class RGWIndexCompletionManager; + +struct complete_op_data { + Mutex lock{"complete_op_data"}; + AioCompletion *rados_completion{nullptr}; + int manager_shard_id{-1}; + RGWIndexCompletionManager *manager{nullptr}; + rgw_obj obj; + RGWModifyOp op; + string tag; + rgw_bucket_entry_ver ver; + cls_rgw_obj_key key; + rgw_bucket_dir_entry_meta dir_meta; + list 
remove_objs; + bool log_op; + uint16_t bilog_op; + rgw_zone_set zones_trace; + + bool stopped{false}; + + void stop() { + Mutex::Locker l(lock); + stopped = true; + } +}; + +class RGWIndexCompletionThread : public RGWRadosThread { + RGWRados *store; + + uint64_t interval_msec() override { + return 0; + } + + list completions; + + Mutex completions_lock; +public: + RGWIndexCompletionThread(RGWRados *_store) + : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {} + + int process() override; + + void add_completion(complete_op_data *completion) { + { + Mutex::Locker l(completions_lock); + completions.push_back(completion); + } + + signal(); + } +}; + +int RGWIndexCompletionThread::process() +{ + list comps; + + { + Mutex::Locker l(completions_lock); + completions.swap(comps); + } + + for (auto c : comps) { + std::unique_ptr up{c}; + + if (going_down()) { + continue; + } + ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl; + + RGWRados::BucketShard bs(store); + RGWBucketInfo bucket_info; + + int r = bs.init(c->obj.bucket, c->obj, &bucket_info); + if (r < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl; + /* not much to do */ + continue; + } + + r = store->guard_reshard(&bs, c->obj, bucket_info, + [&](RGWRados::BucketShard *bs) -> int { + librados::ObjectWriteOperation o; + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs, + c->log_op, c->bilog_op, &c->zones_trace); + return bs->index_ctx.operate(bs->bucket_obj, &o); + }); + if (r < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl; + /* ignoring error, can't do anything about it */ + continue; + } + r = store->data_log->add_entry(bs.bucket, bs.shard_id); + 
if (r < 0) { + lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; + } + } + + return 0; +} + +class RGWIndexCompletionManager { + RGWRados *store{nullptr}; + vector locks; + vector > completions; + + RGWIndexCompletionThread *completion_thread{nullptr}; + + int num_shards; + + std::atomic cur_shard {0}; + + +public: + RGWIndexCompletionManager(RGWRados *_store) : store(_store) { + num_shards = store->ctx()->_conf->rgw_thread_pool_size; + + for (int i = 0; i < num_shards; i++) { + char buf[64]; + snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i); + locks.push_back(new Mutex(buf)); + } + + completions.resize(num_shards); + } + ~RGWIndexCompletionManager() { + stop(); + + for (auto l : locks) { + delete l; + } + } + + int next_shard() { + int result = cur_shard % num_shards; + cur_shard++; + return result; + } + + void create_completion(const rgw_obj& obj, + RGWModifyOp op, string& tag, + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op, + rgw_zone_set *zones_trace, + complete_op_data **result); + bool handle_completion(completion_t cb, complete_op_data *arg); + + int start() { + completion_thread = new RGWIndexCompletionThread(store); + int ret = completion_thread->init(); + if (ret < 0) { + return ret; + } + completion_thread->start(); + return 0; + } + void stop() { + if (completion_thread) { + completion_thread->stop(); + delete completion_thread; + } + + for (int i = 0; i < num_shards; ++i) { + Mutex::Locker l(*locks[i]); + for (auto c : completions[i]) { + c->stop(); + } + } + completions.clear(); + } +}; + +static void obj_complete_cb(completion_t cb, void *arg) +{ + complete_op_data *completion = (complete_op_data *)arg; + completion->lock.Lock(); + if (completion->stopped) { + completion->lock.Unlock(); /* can drop lock, no one else is referencing us */ + delete completion; + return; + } + bool need_delete = 
completion->manager->handle_completion(cb, completion); + completion->lock.Unlock(); + if (need_delete) { + delete completion; + } +} + + +void RGWIndexCompletionManager::create_completion(const rgw_obj& obj, + RGWModifyOp op, string& tag, + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op, + rgw_zone_set *zones_trace, + complete_op_data **result) +{ + complete_op_data *entry = new complete_op_data; + + int shard_id = next_shard(); + + entry->manager_shard_id = shard_id; + entry->manager = this; + entry->obj = obj; + entry->op = op; + entry->tag = tag; + entry->ver = ver; + entry->key = key; + entry->dir_meta = dir_meta; + entry->log_op = log_op; + entry->bilog_op = bilog_op; + + if (remove_objs) { + for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) { + entry->remove_objs.push_back(*iter); + } + } + + if (zones_trace) { + entry->zones_trace = *zones_trace; + } else { + entry->zones_trace.insert(store->svc.zone->get_zone().id); + } + + *result = entry; + + entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb); + + Mutex::Locker l(*locks[shard_id]); + completions[shard_id].insert(entry); +} + +bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg) +{ + int shard_id = arg->manager_shard_id; + { + Mutex::Locker l(*locks[shard_id]); + + auto& comps = completions[shard_id]; + + auto iter = comps.find(arg); + if (iter == comps.end()) { + return true; + } + + comps.erase(iter); + } + + int r = rados_aio_get_return_value(cb); + if (r != -ERR_BUSY_RESHARDING) { + return true; + } + completion_thread->add_completion(arg); + return false; +} + +void RGWRados::finalize() +{ + cct->get_admin_socket()->unregister_commands(this); + if (run_sync_thread) { + Mutex::Locker l(meta_sync_thread_lock); + meta_sync_processor_thread->stop(); + + Mutex::Locker dl(data_sync_thread_lock); + for 
(auto iter : data_sync_processor_threads) { + RGWDataSyncProcessorThread *thread = iter.second; + thread->stop(); + } + if (sync_log_trimmer) { + sync_log_trimmer->stop(); + } + } + if (async_rados) { + async_rados->stop(); + } + if (run_sync_thread) { + delete meta_sync_processor_thread; + meta_sync_processor_thread = NULL; + Mutex::Locker dl(data_sync_thread_lock); + for (auto iter : data_sync_processor_threads) { + RGWDataSyncProcessorThread *thread = iter.second; + delete thread; + } + data_sync_processor_threads.clear(); + delete sync_log_trimmer; + sync_log_trimmer = nullptr; + bucket_trim = boost::none; + } + if (meta_notifier) { + meta_notifier->stop(); + delete meta_notifier; + } + if (data_notifier) { + data_notifier->stop(); + delete data_notifier; + } + delete data_log; + delete sync_tracer; + if (async_rados) { + delete async_rados; + } + + delete lc; + lc = NULL; + + delete gc; + gc = NULL; + + delete obj_expirer; + obj_expirer = NULL; + + RGWQuotaHandler::free_handler(quota_handler); + if (cr_registry) { + cr_registry->put(); + } + + svc.shutdown(); + + delete meta_mgr; + delete binfo_cache; + delete obj_tombstone_cache; + + if (reshard_wait.get()) { + reshard_wait->stop(); + reshard_wait.reset(); + } + + if (run_reshard_thread) { + reshard->stop_processor(); + } + delete reshard; + delete index_completion_manager; +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. 
+ */ +int RGWRados::init_rados() +{ + int ret = 0; + auto admin_socket = cct->get_admin_socket(); + for (auto cmd : admin_commands) { + int r = admin_socket->register_command(cmd[0], cmd[1], this, + cmd[2]); + if (r < 0) { + lderr(cct) << "ERROR: fail to register admin socket command (r=" << r + << ")" << dendl; + return r; + } + } + + ret = rados.init_with_context(cct); + if (ret < 0) { + return ret; + } + ret = rados.connect(); + if (ret < 0) { + return ret; + } + + auto crs = std::unique_ptr{ + new RGWCoroutinesManagerRegistry(cct)}; + ret = crs->hook_to_admin_command("cr dump"); + if (ret < 0) { + return ret; + } + + meta_mgr = new RGWMetadataManager(cct, this); + data_log = new RGWDataChangesLog(cct, this); + cr_registry = crs.release(); + return ret; +} + +int RGWRados::register_to_service_map(const string& daemon_type, const map& meta) +{ + map metadata = meta; + metadata["num_handles"] = "1"s; + metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id(); + metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name(); + metadata["zone_name"] = svc.zone->zone_name(); + metadata["zone_id"] = svc.zone->zone_id(); + string name = cct->_conf->name.get_id(); + if (name.compare(0, 4, "rgw.") == 0) { + name = name.substr(4); + } + int ret = rados.service_daemon_register(daemon_type, name, metadata); + if (ret < 0) { + ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +int RGWRados::update_service_map(std::map&& status) +{ + int ret = rados.service_daemon_update_status(move(status)); + if (ret < 0) { + ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. 
+ */ +int RGWRados::init_complete() +{ + int ret; + + /* + * create sync module instance even if we don't run sync thread, might need it for radosgw-admin + */ + auto& zone_public_config = svc.zone->get_zone(); + ret = svc.sync_modules->get_manager()->create_instance(cct, zone_public_config.tier_type, svc.zone->get_zone_params().tier_config, &sync_module); + if (ret < 0) { + lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl; + if (ret == -ENOENT) { + lderr(cct) << "ERROR: " << zone_public_config.tier_type + << " sync module does not exist. valid sync modules: " + << svc.sync_modules->get_manager()->get_registered_module_names() + << dendl; + } + return ret; + } + + period_puller.reset(new RGWPeriodPuller(this)); + period_history.reset(new RGWPeriodHistory(cct, period_puller.get(), + svc.zone->get_current_period())); + + ret = open_root_pool_ctx(); + if (ret < 0) + return ret; + + ret = open_gc_pool_ctx(); + if (ret < 0) + return ret; + + ret = open_lc_pool_ctx(); + if (ret < 0) + return ret; + + ret = open_objexp_pool_ctx(); + if (ret < 0) + return ret; + + ret = open_reshard_pool_ctx(); + if (ret < 0) + return ret; + + pools_initialized = true; + + gc = new RGWGC(); + gc->initialize(cct, this); + + obj_expirer = new RGWObjectExpirer(this); + + if (use_gc_thread) { + gc->start_processor(); + obj_expirer->start_processor(); + } + + auto& current_period = svc.zone->get_current_period(); + auto& zonegroup = svc.zone->get_zonegroup(); + auto& zone_params = svc.zone->get_zone_params(); + auto& zone = svc.zone->get_zone(); + + /* no point of running sync thread if we don't have a master zone configured + or there is no rest_master_conn */ + if (zonegroup.master_zone.empty() || !svc.zone->get_master_conn() + || current_period.get_id().empty()) { + run_sync_thread = false; + } + + if (run_sync_thread) { + // initialize the log period history + meta_mgr->init_oldest_log_period(); + } + + async_rados = new RGWAsyncRadosProcessor(this, 
cct->_conf->rgw_num_async_rados_threads); + async_rados->start(); + + ret = meta_mgr->init(current_period.get_id()); + if (ret < 0) { + lderr(cct) << "ERROR: failed to initialize metadata log: " + << cpp_strerror(-ret) << dendl; + return ret; + } + + if (svc.zone->is_meta_master()) { + auto md_log = meta_mgr->get_log(current_period.get_id()); + meta_notifier = new RGWMetaNotifier(this, md_log); + meta_notifier->start(); + } + + /* init it anyway, might run sync through radosgw-admin explicitly */ + sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size); + sync_tracer->init(this); + ret = sync_tracer->hook_to_admin_command(); + if (ret < 0) { + return ret; + } + + if (run_sync_thread) { + for (const auto &pt: zonegroup.placement_targets) { + if (zone_params.placement_pools.find(pt.second.name) + == zone_params.placement_pools.end()){ + ldout(cct, 0) << "WARNING: This zone does not contain the placement target " + << pt.second.name << " present in zonegroup" << dendl; + } + } + Mutex::Locker l(meta_sync_thread_lock); + meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados); + ret = meta_sync_processor_thread->init(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl; + return ret; + } + meta_sync_processor_thread->start(); + + // configure the bucket trim manager + rgw::BucketTrimConfig config; + rgw::configure_bucket_trim(cct, config); + + bucket_trim.emplace(this, config); + ret = bucket_trim->init(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl; + return ret; + } + data_log->set_observer(&*bucket_trim); + + Mutex::Locker dl(data_sync_thread_lock); + for (auto source_zone : svc.zone->get_data_sync_source_zones()) { + ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl; + auto *thread = new RGWDataSyncProcessorThread(this, async_rados, source_zone); + ret = thread->init(); + if (ret < 0) { + 
ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl; + return ret; + } + thread->start(); + data_sync_processor_threads[source_zone->id] = thread; + } + auto interval = cct->_conf->rgw_sync_log_trim_interval; + if (interval > 0) { + sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval); + ret = sync_log_trimmer->init(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl; + return ret; + } + sync_log_trimmer->start(); + } + } + data_notifier = new RGWDataNotifier(this); + data_notifier->start(); + + binfo_cache = new RGWChainedCacheImpl; + binfo_cache->init(svc.cache); + + lc = new RGWLC(); + lc->initialize(cct, this); + + if (use_lc_thread) + lc->start_processor(); + + quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads); + + bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards : + zone.bucket_index_max_shards); + if (bucket_index_max_shards > get_max_bucket_shards()) { + bucket_index_max_shards = get_max_bucket_shards(); + ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: " + << get_max_bucket_shards() << dendl; + } + ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl; + + bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */ + + if (need_tombstone_cache) { + obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size); + } + + reshard_wait = std::make_shared(); + + reshard = new RGWReshard(this); + + /* only the master zone in the zonegroup reshards buckets */ + run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id); + if (run_reshard_thread) { + reshard->start_processor(); + } + + index_completion_manager = new RGWIndexCompletionManager(this); + ret = index_completion_manager->start(); + + return ret; +} + 
+int RGWRados::init_svc(bool raw) +{ + if (raw) { + return svc.init_raw(cct, use_cache); + } + + return svc.init(cct, use_cache); +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. + */ +int RGWRados::initialize() +{ + int ret; + + inject_notify_timeout_probability = + cct->_conf.get_val("rgw_inject_notify_timeout_probability"); + max_notify_retries = cct->_conf.get_val("rgw_max_notify_retries"); + + ret = init_svc(false); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + + host_id = svc.zone_utils->gen_host_id(); + + ret = init_rados(); + if (ret < 0) + return ret; + + return init_complete(); +} + +/** + * Open the pool used as root for this gateway + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::open_root_pool_ctx() +{ + return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true); +} + +int RGWRados::open_gc_pool_ctx() +{ + return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true); +} + +int RGWRados::open_lc_pool_ctx() +{ + return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true); +} + +int RGWRados::open_objexp_pool_ctx() +{ + return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true); +} + +int RGWRados::open_reshard_pool_ctx() +{ + return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true); +} + +int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx, + bool mostly_omap) +{ + constexpr bool create = true; // create the pool if it doesn't exist + return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create, mostly_omap); +} + +void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker, + string 
*marker) { + if (marker) { + *marker = shard_id_str; + marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR); + marker->append(shard_marker); + } +} + +int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx) +{ + const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool; + + if (!explicit_pool.empty()) { + return open_pool_ctx(explicit_pool, index_ctx, false); + } + + auto& zonegroup = svc.zone->get_zonegroup(); + auto& zone_params = svc.zone->get_zone_params(); + + const rgw_placement_rule *rule = &bucket_info.placement_rule; + if (rule->empty()) { + rule = &zonegroup.default_placement; + } + auto iter = zone_params.placement_pools.find(rule->name); + if (iter == zone_params.placement_pools.end()) { + ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl; + return -EINVAL; + } + + int r = open_pool_ctx(iter->second.index_pool, index_ctx, true); + if (r < 0) + return r; + + return 0; +} + +/**** logs ****/ + +struct log_list_state { + string prefix; + librados::IoCtx io_ctx; + librados::NObjectIterator obit; +}; + +int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle) +{ + log_list_state *state = new log_list_state; + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); + if (r < 0) { + delete state; + return r; + } + state->prefix = prefix; + state->obit = state->io_ctx.nobjects_begin(); + *handle = (RGWAccessHandle)state; + return 0; +} + +int RGWRados::log_list_next(RGWAccessHandle handle, string *name) +{ + log_list_state *state = static_cast(handle); + while (true) { + if (state->obit == state->io_ctx.nobjects_end()) { + delete state; + return -ENOENT; + } + if (state->prefix.length() && + state->obit->get_oid().find(state->prefix) != 0) { + state->obit++; + continue; + } + *name = state->obit->get_oid(); + state->obit++; + break; + } + return 0; +} + +int RGWRados::log_remove(const 
string& name) +{ + librados::IoCtx io_ctx; + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); + if (r < 0) + return r; + return io_ctx.remove(name); +} + +struct log_show_state { + librados::IoCtx io_ctx; + bufferlist bl; + bufferlist::const_iterator p; + string name; + uint64_t pos; + bool eof; + log_show_state() : pos(0), eof(false) {} +}; + +int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle) +{ + log_show_state *state = new log_show_state; + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); + if (r < 0) { + delete state; + return r; + } + state->name = name; + *handle = (RGWAccessHandle)state; + return 0; +} + +int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry) +{ + log_show_state *state = static_cast(handle); + off_t off = state->p.get_off(); + + ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length() + << " off " << off + << " eof " << (int)state->eof + << dendl; + // read some? + unsigned chunk = 1024*1024; + if ((state->bl.length() - off) < chunk/2 && !state->eof) { + bufferlist more; + int r = state->io_ctx.read(state->name, more, chunk, state->pos); + if (r < 0) + return r; + state->pos += r; + bufferlist old; + try { + old.substr_of(state->bl, off, state->bl.length() - off); + } catch (buffer::error& err) { + return -EINVAL; + } + state->bl.clear(); + state->bl.claim(old); + state->bl.claim_append(more); + state->p = state->bl.cbegin(); + if ((unsigned)r < chunk) + state->eof = true; + ldout(cct, 10) << " read " << r << dendl; + } + + if (state->p.end()) + return 0; // end of file + try { + decode(*entry, state->p); + } + catch (const buffer::error &e) { + return -EINVAL; + } + return 1; +} + +/** + * usage_log_hash: get usage log key hash, based on name and index + * + * Get the usage object name. 
Since a user may have more than 1 + * object holding that info (multiple shards), we use index to + * specify that shard number. Once index exceeds max shards it + * wraps. + * If name is not being set, results for all users will be returned + * and index will wrap only after total shards number. + * + * @param cct [in] ceph context + * @param name [in] user name + * @param hash [out] hash value + * @param index [in] shard index number + */ +static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index) +{ + uint32_t val = index; + + if (!name.empty()) { + int max_user_shards = cct->_conf->rgw_usage_max_user_shards; + val %= max_user_shards; + val += ceph_str_hash_linux(name.c_str(), name.size()); + } + char buf[17]; + int max_shards = cct->_conf->rgw_usage_max_shards; + snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards)); + hash = buf; +} + +int RGWRados::log_usage(map& usage_info) +{ + uint32_t index = 0; + + map log_objs; + + string hash; + string last_user; + + /* restructure usage map, zone by object hash */ + map::iterator iter; + for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + RGWUsageBatch& info = iter->second; + + if (ub.user.empty()) { + ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl; + continue; + } + + if (ub.user != last_user) { + /* index *should* be random, but why waste extra cycles + in most cases max user shards is not going to exceed 1, + so just incrementing it */ + usage_log_hash(cct, ub.user, hash, index++); + } + last_user = ub.user; + vector& v = log_objs[hash].entries; + + for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) { + v.push_back(miter->second); + } + } + + map::iterator liter; + + for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) { + int r = cls_obj_usage_log_add(liter->first, liter->second); + if (r < 0) + 
return r; + } + return 0; +} + +int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map& usage) +{ + uint32_t num = max_entries; + string hash, first_hash; + string user_str = user.to_str(); + usage_log_hash(cct, user_str, first_hash, 0); + + if (usage_iter.index) { + usage_log_hash(cct, user_str, hash, usage_iter.index); + } else { + hash = first_hash; + } + + usage.clear(); + + do { + map ret_usage; + map::iterator iter; + + int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num, + usage_iter.read_iter, ret_usage, is_truncated); + if (ret == -ENOENT) + goto next; + + if (ret < 0) + return ret; + + num -= ret_usage.size(); + + for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) { + usage[iter->first].aggregate(iter->second); + } + +next: + if (!*is_truncated) { + usage_iter.read_iter.clear(); + usage_log_hash(cct, user_str, hash, ++usage_iter.index); + } + } while (num && !*is_truncated && hash != first_hash); + return 0; +} + +int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch) +{ + uint32_t index = 0; + string hash, first_hash; + string user_str = user.to_str(); + usage_log_hash(cct, user_str, first_hash, index); + + hash = first_hash; + do { + int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch); + + if (ret < 0 && ret != -ENOENT) + return ret; + + usage_log_hash(cct, user_str, hash, ++index); + } while (hash != first_hash); + + return 0; +} + + +int RGWRados::clear_usage() +{ + auto max_shards = cct->_conf->rgw_usage_max_shards; + int ret=0; + for (unsigned i=0; i < max_shards; i++){ + string oid = RGW_USAGE_OBJ_PREFIX + to_string(i); + ret = cls_obj_usage_log_clear(oid); + if (ret < 0){ + ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl; + return ret; + 
} + } + return ret; +} + +int RGWRados::key_to_shard_id(const string& key, int max_shards) +{ + return rgw_shard_id(key, max_shards); +} + +void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id) +{ + uint32_t val = ceph_str_hash_linux(key.c_str(), key.size()); + char buf[16]; + if (shard_id) { + *shard_id = val % max_shards; + } + snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards)); + name = prefix + buf; +} + +void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name) +{ + uint32_t val = ceph_str_hash_linux(key.c_str(), key.size()); + val ^= ceph_str_hash_linux(section.c_str(), section.size()); + char buf[16]; + snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards)); + name = prefix + buf; +} + +void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name) +{ + char buf[16]; + snprintf(buf, sizeof(buf), "%u", shard_id); + name = prefix + buf; + +} + +void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl) +{ + cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl); +} + +int RGWRados::time_log_add_init(librados::IoCtx& io_ctx) +{ + return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx, true); + +} + +int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl) +{ + librados::IoCtx io_ctx; + + int r = time_log_add_init(io_ctx); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + utime_t t(ut); + cls_log_add(op, t, section, key, bl); + + return io_ctx.operate(oid, &op); +} + +int RGWRados::time_log_add(const string& oid, list& entries, + librados::AioCompletion *completion, bool monotonic_inc) +{ + librados::IoCtx io_ctx; + + int r = time_log_add_init(io_ctx); + if (r < 0) { + return r; + } + + 
ObjectWriteOperation op; + cls_log_add(op, entries, monotonic_inc); + + if (!completion) { + r = io_ctx.operate(oid, &op); + } else { + r = io_ctx.aio_operate(oid, completion, &op); + } + return r; +} + +int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time, + int max_entries, list& entries, + const string& marker, + string *out_marker, + bool *truncated) +{ + librados::IoCtx io_ctx; + + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); + if (r < 0) + return r; + librados::ObjectReadOperation op; + + utime_t st(start_time); + utime_t et(end_time); + + cls_log_list(op, st, et, marker, max_entries, entries, + out_marker, truncated); + + bufferlist obl; + + int ret = io_ctx.operate(oid, &op, &obl); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::time_log_info(const string& oid, cls_log_header *header) +{ + librados::IoCtx io_ctx; + + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); + if (r < 0) + return r; + librados::ObjectReadOperation op; + + cls_log_info(op, header); + + bufferlist obl; + + int ret = io_ctx.operate(oid, &op, &obl); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion) +{ + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); + if (r < 0) + return r; + + librados::ObjectReadOperation op; + + cls_log_info(op, header); + + int ret = io_ctx.aio_operate(oid, completion, &op, NULL); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time, + const string& from_marker, const string& to_marker, + librados::AioCompletion *completion) +{ + librados::IoCtx io_ctx; + + int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); 
+ if (r < 0) + return r; + + utime_t st(start_time); + utime_t et(end_time); + + ObjectWriteOperation op; + cls_log_trim(op, st, et, from_marker, to_marker); + + if (!completion) { + r = io_ctx.operate(oid, &op); + } else { + r = io_ctx.aio_operate(oid, completion, &op); + } + return r; +} + +string RGWRados::objexp_hint_get_shardname(int shard_num) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num); + + string objname("obj_delete_at_hint."); + return objname + buf; +} + +int RGWRados::objexp_key_shard(const rgw_obj_index_key& key) +{ + string obj_key = key.name + key.instance; + int num_shards = cct->_conf->rgw_objexp_hints_num_shards; + return rgw_bucket_shard_index(obj_key, num_shards); +} + +static string objexp_hint_get_keyext(const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_key& obj_key) +{ + return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id + + ":" + obj_key.name + ":" + obj_key.instance; +} + +int RGWRados::objexp_hint_add(const ceph::real_time& delete_at, + const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_index_key& obj_key) +{ + const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name, + bucket_id, obj_key); + objexp_hint_entry he = { + .tenant = tenant_name, + .bucket_name = bucket_name, + .bucket_id = bucket_id, + .obj_key = obj_key, + .exp_time = delete_at }; + bufferlist hebl; + encode(he, hebl); + ObjectWriteOperation op; + cls_timeindex_add(op, utime_t(delete_at), keyext, hebl); + + string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key)); + return objexp_pool_ctx.operate(shard_name, &op); +} + +void RGWRados::objexp_get_shard(int shard_num, + string& shard) /* out */ +{ + shard = objexp_hint_get_shardname(shard_num); +} + +int RGWRados::objexp_hint_list(const string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const int 
max_entries, + const string& marker, + list& entries, /* out */ + string *out_marker, /* out */ + bool *truncated) /* out */ +{ + librados::ObjectReadOperation op; + cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries, + out_marker, truncated); + + bufferlist obl; + int ret = objexp_pool_ctx.operate(oid, &op, &obl); + + if ((ret < 0 ) && (ret != -ENOENT)) { + return ret; + } + + if ((ret == -ENOENT) && truncated) { + *truncated = false; + } + + return 0; +} + +int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */ + objexp_hint_entry& hint_entry) /* out */ +{ + try { + auto iter = ti_entry.value.cbegin(); + decode(hint_entry, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl; + } + + return 0; +} + +int RGWRados::objexp_hint_trim(const string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const string& from_marker, + const string& to_marker) +{ + int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time), + from_marker, to_marker); + if ((ret < 0 ) && (ret != -ENOENT)) { + return ret; + } + + return 0; +} + +int RGWRados::lock_exclusive(const rgw_pool& pool, const string& oid, timespan& duration, + string& zone_id, string& owner_id) { + librados::IoCtx io_ctx; + + int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx); + if (r < 0) { + return r; + } + uint64_t msec = std::chrono::duration_cast(duration).count(); + utime_t ut(msec / 1000, msec % 1000); + + rados::cls::lock::Lock l(log_lock_name); + l.set_duration(ut); + l.set_cookie(owner_id); + l.set_tag(zone_id); + l.set_may_renew(true); + + return l.lock_exclusive(&io_ctx, oid); +} + +int RGWRados::unlock(const rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) { + librados::IoCtx io_ctx; + + int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx); + if (r < 0) { + return r; + } + + rados::cls::lock::Lock 
l(log_lock_name); + l.set_tag(zone_id); + l.set_cookie(owner_id); + + return l.unlock(&io_ctx, oid); +} + +int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner) +{ + auto i = bl.cbegin(); + RGWAccessControlPolicy policy(cct); + try { + policy.decode_owner(i); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + *owner = policy.get_owner(); + return 0; +} + +int rgw_policy_from_attrset(CephContext *cct, map& attrset, RGWAccessControlPolicy *policy) +{ + map::iterator aiter = attrset.find(RGW_ATTR_ACL); + if (aiter == attrset.end()) + return -EIO; + + bufferlist& bl = aiter->second; + auto iter = bl.cbegin(); + try { + policy->decode(iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (cct->_conf->subsys.should_gather()) { + RGWAccessControlPolicy_S3 *s3policy = static_cast(policy); + ldout(cct, 15) << __func__ << " Read AccessControlPolicy"; + s3policy->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + + +int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id) +{ + rgw_bucket bucket = bucket_info.bucket; + bucket.update_bucket_id(new_bucket_id); + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + bucket_info.objv_tracker.clear(); + int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr); + if (ret < 0) { + return ret; + } + + return 0; +} + + +static inline std::string after_delim(std::string_view delim) +{ + // assert: ! delim.empty() + std::string result{delim.data(), delim.length()}; + result += char(255); + return result; +} + + +/** + * Get ordered listing of the objects in a bucket. + * + * max: maximum number of results to return + * bucket: bucket to list contents of + * prefix: only return results that match this prefix + * delim: do not include results that match this string. 
+ * Any skipped results will have the matching portion of their name + * inserted in common_prefixes with a "true" mark. + * marker: if filled in, begin the listing with this object. + * end_marker: if filled in, end the listing with this object. + * result: the objects are put in here. + * common_prefixes: if delim is filled in, any matching prefixes are + * placed here. + * is_truncated: if number of objects in the bucket is bigger than + * max, then truncated. + */ +int RGWRados::Bucket::List::list_objects_ordered( + int64_t max_p, + vector *result, + map *common_prefixes, + bool *is_truncated) +{ + RGWRados *store = target->get_store(); + CephContext *cct = store->ctx(); + int shard_id = target->get_shard_id(); + + int count = 0; + bool truncated = true; + const int64_t max = // protect against memory issues and negative vals + std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); + int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max); + + result->clear(); + + // use a local marker; either the marker will have a previous entry + // or it will be empty; either way it's OK to copy + rgw_obj_key marker_obj(params.marker.name, + params.marker.instance, + params.ns.empty() ? params.marker.ns : params.ns); + rgw_obj_index_key cur_marker; + marker_obj.get_index_key(&cur_marker); + + rgw_obj_key end_marker_obj(params.end_marker.name, + params.end_marker.instance, + params.ns.empty() ? 
params.end_marker.ns : params.ns); + rgw_obj_index_key cur_end_marker; + end_marker_obj.get_index_key(&cur_end_marker); + const bool cur_end_marker_valid = !params.end_marker.empty(); + + rgw_obj_key prefix_obj(params.prefix); + prefix_obj.set_ns(params.ns); + string cur_prefix = prefix_obj.get_index_key_name(); + string after_delim_s; /* needed in !params.delim.empty() AND later */ + + if (!params.delim.empty()) { + after_delim_s = after_delim(params.delim); + /* if marker points at a common prefix, fast forward it into its + * upper bound string */ + int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size()); + if (delim_pos >= 0) { + string s = cur_marker.name.substr(0, delim_pos); + s.append(after_delim_s); + cur_marker = s; + } + } + + // allows us to skip over entries in two conditions: 1) when using a + // delimiter and we can skip over "subdirectories" and 2) when + // searching for elements in the empty namespace we can skip over + // namespaced elements + rgw_obj_index_key marker_skip_ahead; + + rgw_obj_index_key prev_marker; + for (uint16_t attempt = 1; /* empty */; ++attempt) { + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " starting attempt " << attempt << dendl; + + if (attempt > 1 && !(prev_marker < cur_marker)) { + // we've failed to make forward progress + ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ << + ": ERROR marker failed to make forward progress; attempt=" << attempt << + ", prev_marker=" << prev_marker << + ", cur_marker=" << cur_marker << dendl; + break; + } + prev_marker = cur_marker; + + // see whether we found a way to skip ahead in the previous + // iteration + if (marker_skip_ahead > cur_marker) { + cur_marker = marker_skip_ahead; + ldout(cct, 20) << "advancing cur_marker=" << cur_marker << dendl; + } + + std::map ent_map; + const size_t num_requested = read_ahead + 1 - count; + int r = store->cls_bucket_list_ordered(target->get_bucket_info(), + shard_id, + cur_marker, + cur_prefix, + 
num_requested, + params.list_versions, + attempt, + ent_map, + &truncated, + &cur_marker); + if (r < 0) { + return r; + } + + for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) { + const std::string& key = eiter->first; + rgw_bucket_dir_entry& entry = eiter->second; + rgw_obj_index_key index_key = entry.key; + rgw_obj_key obj(index_key); // NB: why is this re-set below? can't be const + + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " considering entry " << entry.key << dendl; + + /* note that parse_raw_oid() here will not set the correct + * object's instance, as rgw_obj_index_key encodes that + * separately. We don't need to set the instance because it's + * not needed for the checks here and we end up using the raw + * entry for the return vector + */ + bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj); + if (!valid) { + ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl; + continue; + } + + if (!params.list_versions && !entry.is_visible()) { + continue; + } + + const bool matched_ns = (obj.ns == params.ns); + if (params.enforce_ns && !matched_ns) { + if (!params.ns.empty()) { + /* we've iterated past the namespace we're searching -- done now */ + truncated = false; + goto done; + } else { + // we're enforcing an empty namespace, so we need to skip + // past the namespace block + marker_skip_ahead = rgw_obj_key::after_namespace_marker(key); + continue; + } + } + + if (cur_end_marker_valid && cur_end_marker <= index_key) { + truncated = false; + goto done; + } + + if (count < max) { + params.marker = index_key; + next_marker = index_key; + } + + if (params.filter && !params.filter->filter(obj.name, index_key.name)) { + continue; + } + + if (params.prefix.size() && + (obj.name.compare(0, params.prefix.size(), params.prefix) != 0)) { + continue; + } + + if (!params.delim.empty()) { + int delim_pos = obj.name.find(params.delim, params.prefix.size()); + + if (delim_pos >= 0) { + /* extract key 
*with* trailing delimiter for CommonPrefix */ + const std::string prefix_key = + obj.name.substr(0, delim_pos + params.delim.length()); + + if (common_prefixes && + common_prefixes->find(prefix_key) == common_prefixes->end()) { + if (count >= max) { + truncated = true; + goto done; + } + next_marker = prefix_key; + (*common_prefixes)[prefix_key] = true; + + // setting marker_skip_ahead allows the next call to + // cls_bucket_list_ordered to skip over unlisted entries; + // NOTE: after_delim_s + const std::string skip_name = obj.name.substr(0, delim_pos) + after_delim_s; + const rgw_obj_key skip_key(skip_name, "" /* empty instance*/ , obj.ns); + skip_key.get_index_key(&marker_skip_ahead); + ldout(cct, 20) << "marker_skip_ahead=" << marker_skip_ahead << dendl; + + count++; + } + + continue; + } + } + + if (count >= max) { + truncated = true; + goto done; + } + + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " adding entry " << entry.key << " to result" << dendl; + + result->emplace_back(std::move(entry)); + count++; + } // eiter for loop + + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " INFO end of outer loop, truncated=" << truncated << + ", count=" << count << ", attempt=" << attempt << dendl; + + if (!truncated || count >= (max + 1) / 2) { + // if we finished listing, or if we're returning at least half the + // requested entries, that's enough; S3 and swift protocols allow + // returning fewer than max entries + break; + } else if (attempt > 8 && count >= 1) { + // if we've made at least 8 attempts and we have some, but very + // few, results, return with what we have + break; + } + } // for (uint16_t attempt... + +done: + + auto csz = (common_prefixes) ? 
common_prefixes->size() : 0; + ldout(cct, 10) << "RGWRados::Bucket::List::" << __func__ << + " INFO returning " << result->size() << " entries and " + << csz << " common prefixes" << dendl; + + if (is_truncated) { + *is_truncated = truncated; + } + + return 0; +} // list_objects_ordered + + +/** + * Get listing of the objects in a bucket and allow the results to be out + * of order. + * + * Even though there are key differences with the ordered counterpart, + * the parameters are the same to maintain some compatability. + * + * max: maximum number of results to return + * bucket: bucket to list contents of + * prefix: only return results that match this prefix + * delim: should not be set; if it is we should have indicated an error + * marker: if filled in, begin the listing with this object. + * end_marker: if filled in, end the listing with this object. + * result: the objects are put in here. + * common_prefixes: this is never filled with an unordered list; the param + * is maintained for compatibility + * is_truncated: if number of objects in the bucket is bigger than max, then + * truncated. + */ +int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p, + vector *result, + map *common_prefixes, + bool *is_truncated) +{ + RGWRados *store = target->get_store(); + CephContext *cct = store->ctx(); + int shard_id = target->get_shard_id(); + + int count = 0; + bool truncated = true; + + const int64_t max = // protect against memory issues and negative vals + std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); + + // read a few extra in each call to cls_bucket_list_unordered in + // case some are filtered out due to namespace matching, versioning, + // filtering, etc. 
+ const int64_t max_read_ahead = 100; + const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead)); + + result->clear(); + + // use a local marker; either the marker will have a previous entry + // or it will be empty; either way it's OK to copy + rgw_obj_key marker_obj(params.marker.name, + params.marker.instance, + params.ns.empty() ? params.marker.ns : params.ns); + rgw_obj_index_key cur_marker; + marker_obj.get_index_key(&cur_marker); + + rgw_obj_key end_marker_obj(params.end_marker.name, + params.end_marker.instance, + params.ns.empty() ? params.end_marker.ns : params.ns); + rgw_obj_index_key cur_end_marker; + end_marker_obj.get_index_key(&cur_end_marker); + const bool cur_end_marker_valid = !params.end_marker.empty(); + + rgw_obj_key prefix_obj(params.prefix); + prefix_obj.set_ns(params.ns); + string cur_prefix = prefix_obj.get_index_key_name(); + + while (truncated && count <= max) { + std::vector ent_list; + ent_list.reserve(read_ahead); + + int r = store->cls_bucket_list_unordered(target->get_bucket_info(), + shard_id, + cur_marker, + cur_prefix, + read_ahead, + params.list_versions, + ent_list, + &truncated, + &cur_marker); + if (r < 0) + return r; + + // NB: while regions of ent_list will be sorted, we have no + // guarantee that all items will be sorted since they can cross + // shard boundaries + + for (auto& entry : ent_list) { + rgw_obj_index_key index_key = entry.key; + rgw_obj_key obj(index_key); + + if (count < max) { + params.marker.set(index_key); + next_marker.set(index_key); + } + + /* note that parse_raw_oid() here will not set the correct + * object's instance, as rgw_obj_index_key encodes that + * separately. 
We don't need to set the instance because it's + * not needed for the checks here and we end up using the raw + * entry for the return vector + */ + bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj); + if (!valid) { + ldout(cct, 0) << "ERROR: could not parse object name: " << + obj.name << dendl; + continue; + } + + if (!params.list_versions && !entry.is_visible()) { + continue; + } + + if (params.enforce_ns && obj.ns != params.ns) { + continue; + } + + if (cur_end_marker_valid && cur_end_marker <= index_key) { + // we're not guaranteed items will come in order, so we have + // to loop through all + continue; + } + + if (params.filter && !params.filter->filter(obj.name, index_key.name)) + continue; + + if (params.prefix.size() && + (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) + continue; + + if (count >= max) { + truncated = true; + goto done; + } + + result->emplace_back(std::move(entry)); + count++; + } // for (auto& entry : ent_list) + } // while (truncated && count <= max) + +done: + if (is_truncated) + *is_truncated = truncated; + + return 0; +} // list_objects_unordered + + +/** + * create a rados pool, associated meta info + * returns 0 on success, -ERR# otherwise. 
+ */ +int RGWRados::create_pool(const rgw_pool& pool) +{ + librados::IoCtx io_ctx; + constexpr bool create = true; + return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create); +} + +int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards) +{ + librados::IoCtx index_ctx; + + string dir_oid = dir_oid_prefix; + int r = open_bucket_index_ctx(bucket_info, index_ctx); + if (r < 0) { + return r; + } + + dir_oid.append(bucket_info.bucket.bucket_id); + + map bucket_objs; + get_bucket_index_objects(dir_oid, num_shards, bucket_objs); + + return CLSRGWIssueBucketIndexInit(index_ctx, + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards) +{ + librados::IoCtx index_ctx; + + std::string dir_oid = dir_oid_prefix; + int r = open_bucket_index_ctx(bucket_info, index_ctx); + if (r < 0) { + return r; + } + + dir_oid.append(bucket_info.bucket.bucket_id); + + std::map bucket_objs; + get_bucket_index_objects(dir_oid, num_shards, bucket_objs); + + return CLSRGWIssueBucketIndexClean(index_ctx, + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); +} + +void RGWRados::create_bucket_id(string *bucket_id) +{ + uint64_t iid = instance_id(); + uint64_t bid = next_bucket_id(); + char buf[svc.zone->get_zone_params().get_id().size() + 48]; + snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64, + svc.zone->get_zone_params().get_id().c_str(), iid, bid); + *bucket_id = buf; +} + +int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, + const string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + map& attrs, + RGWBucketInfo& info, + obj_version *pobjv, + obj_version *pep_objv, + real_time creation_time, + rgw_bucket *pmaster_bucket, + uint32_t *pmaster_num_shards, + bool exclusive) +{ +#define MAX_CREATE_RETRIES 20 /* need to bound retries */ + rgw_placement_rule 
selected_placement_rule; + RGWZonePlacementInfo rule_info; + + for (int i = 0; i < MAX_CREATE_RETRIES; i++) { + int ret = 0; + ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule, + &selected_placement_rule, &rule_info); + if (ret < 0) + return ret; + + if (!pmaster_bucket) { + create_bucket_id(&bucket.marker); + bucket.bucket_id = bucket.marker; + } else { + bucket.marker = pmaster_bucket->marker; + bucket.bucket_id = pmaster_bucket->bucket_id; + } + + RGWObjVersionTracker& objv_tracker = info.objv_tracker; + + if (pobjv) { + objv_tracker.write_version = *pobjv; + } else { + objv_tracker.generate_new_write_ver(cct); + } + + info.bucket = bucket; + info.owner = owner.user_id; + info.zonegroup = zonegroup_id; + info.placement_rule = selected_placement_rule; + info.index_type = rule_info.index_type; + info.swift_ver_location = swift_ver_location; + info.swift_versioning = (!swift_ver_location.empty()); + if (pmaster_num_shards) { + info.num_shards = *pmaster_num_shards; + } else { + info.num_shards = bucket_index_max_shards; + } + info.bucket_index_shard_hash_type = RGWBucketInfo::MOD; + info.requester_pays = false; + if (real_clock::is_zero(creation_time)) { + info.creation_time = ceph::real_clock::now(); + } else { + info.creation_time = creation_time; + } + if (pquota_info) { + info.quota = *pquota_info; + } + + int r = init_bucket_index(info, info.num_shards); + if (r < 0) { + return r; + } + + ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true); + if (ret == -EEXIST) { + librados::IoCtx index_ctx; + map bucket_objs; + int r = open_bucket_index(info, index_ctx, bucket_objs); + if (r < 0) + return r; + + /* we need to reread the info and return it, caller will have a use for it */ + RGWObjVersionTracker instance_ver = info.objv_tracker; + info.objv_tracker.clear(); + auto obj_ctx = svc.sysobj->init_obj_ctx(); + r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL); + if (r < 0) { + 
if (r == -ENOENT) { + continue; + } + ldout(cct, 0) << "get_bucket_info returned " << r << dendl; + return r; + } + + /* only remove it if it's a different bucket instance */ + if (info.bucket.bucket_id != bucket.bucket_id) { + /* remove bucket meta instance */ + r = rgw_bucket_instance_remove_entry(this, + bucket.get_key(), + &instance_ver); + if (r < 0) + return r; + + /* remove bucket index objects asynchronously by best effort */ + (void) CLSRGWIssueBucketIndexClean(index_ctx, + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); + } + /* ret == -ENOENT here */ + } + return ret; + } + + /* this is highly unlikely */ + ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl; + return -ENOENT; +} + +bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool) +{ + return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool); +} + +bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj) +{ + get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc); + + return get_obj_data_pool(placement_rule, obj, &raw_obj->pool); +} + +int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx) +{ + string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + + rgw_pool pool; + if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) { + ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl; + return -EIO; + } + + int r = open_pool_ctx(pool, *ioctx, false); + if (r < 0) { + return r; + } + + ioctx->locator_set_key(key); + + return 0; +} + +int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref) +{ + get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc); + + rgw_pool pool; + if 
(!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) { + ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl; + return -EIO; + } + + int r = open_pool_ctx(pool, ref->ioctx, false); + if (r < 0) { + return r; + } + + ref->ioctx.locator_set_key(ref->obj.loc); + + return 0; +} + +int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref) +{ + ref->obj = obj; + + int r; + + if (ref->obj.oid.empty()) { + ref->obj.oid = obj.pool.to_str(); + ref->obj.pool = svc.zone->get_zone_params().domain_root; + } + r = open_pool_ctx(ref->obj.pool, ref->ioctx, false); + if (r < 0) + return r; + + ref->ioctx.locator_set_key(ref->obj.loc); + + return 0; +} + +int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref) +{ + return get_raw_obj_ref(obj, ref); +} + +/* + * fixes an issue where head objects were supposed to have a locator created, but ended + * up without one + */ +int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key) +{ + const rgw_bucket& bucket = bucket_info.bucket; + string oid; + string locator; + + rgw_obj obj(bucket, key); + + get_obj_bucket_and_oid_loc(obj, oid, locator); + + if (locator.empty()) { + ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl; + return 0; + } + + librados::IoCtx ioctx; + + int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx); + if (ret < 0) { + cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl; + return ret; + } + ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */ + + uint64_t size; + bufferlist data; + + struct timespec mtime_ts; + map attrs; + librados::ObjectReadOperation op; + op.getxattrs(&attrs, NULL); + op.stat2(&size, &mtime_ts, NULL); +#define HEAD_SIZE 512 * 1024 + op.read(0, HEAD_SIZE, &data, NULL); + + ret = ioctx.operate(oid, &op, NULL); + if (ret < 0) { + lderr(cct) << "ERROR: 
ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl; + return ret; + } + + if (size > HEAD_SIZE) { + lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl; + return -EIO; + } + + if (size != data.length()) { + lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl; + return -EIO; + } + + if (copy_obj) { + librados::ObjectWriteOperation wop; + + wop.mtime2(&mtime_ts); + + map::iterator iter; + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + wop.setxattr(iter->first.c_str(), iter->second); + } + + wop.write(0, data); + + ioctx.locator_set_key(locator); + ioctx.operate(oid, &wop); + } + + if (remove_bad) { + ioctx.locator_set_key(string()); + + ret = ioctx.remove(oid); + if (ret < 0) { + lderr(cct) << "ERROR: failed to remove original bad object" << dendl; + return ret; + } + } + + return 0; +} + +int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx, + const string& src_oid, const string& src_locator, + librados::IoCtx& dst_ioctx, + const string& dst_oid, const string& dst_locator) +{ + +#define COPY_BUF_SIZE (4 * 1024 * 1024) + bool done = false; + uint64_t chunk_size = COPY_BUF_SIZE; + uint64_t ofs = 0; + int ret = 0; + real_time mtime; + struct timespec mtime_ts; + uint64_t size; + + if (src_oid == dst_oid && src_locator == dst_locator) { + return 0; + } + + src_ioctx.locator_set_key(src_locator); + dst_ioctx.locator_set_key(dst_locator); + + do { + bufferlist data; + ObjectReadOperation rop; + ObjectWriteOperation wop; + + if (ofs == 0) { + rop.stat2(&size, &mtime_ts, NULL); + mtime = real_clock::from_timespec(mtime_ts); + } + rop.read(ofs, chunk_size, &data, NULL); + ret = src_ioctx.operate(src_oid, &rop, NULL); + if (ret < 0) { + goto done_err; + } + + if (data.length() == 0) { + break; + } + + if (ofs == 0) { + wop.create(true); /* make it exclusive */ + wop.mtime2(&mtime_ts); + mtime = real_clock::from_timespec(mtime_ts); + } + 
wop.write(ofs, data); + ret = dst_ioctx.operate(dst_oid, &wop); + if (ret < 0) { + goto done_err; + } + ofs += data.length(); + done = data.length() != chunk_size; + } while (!done); + + if (ofs != size) { + lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid + << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl; + ret = -EIO; + goto done_err; + } + + src_ioctx.remove(src_oid); + + return 0; + +done_err: + // TODO: clean up dst_oid if we created it + lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl; + return ret; +} + +/* + * fixes an issue where head objects were supposed to have a locator created, but ended + * up without one + */ +int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix) +{ + const rgw_bucket& bucket = bucket_info.bucket; + rgw_obj obj(bucket, key); + + if (need_fix) { + *need_fix = false; + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + RGWObjState *astate = NULL; + RGWObjectCtx rctx(this); + r = get_obj_state(&rctx, bucket_info, obj, &astate, false); + if (r < 0) + return r; + + if (astate->has_manifest) { + RGWObjManifest::obj_iterator miter; + RGWObjManifest& manifest = astate->manifest; + for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) { + rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this); + rgw_obj loc; + string oid; + string locator; + + rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc); + + if (loc.key.ns.empty()) { + /* continue, we're only interested in tail objects */ + continue; + } + + get_obj_bucket_and_oid_loc(loc, oid, locator); + ref.ioctx.locator_set_key(locator); + + ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl; + + r = ref.ioctx.stat(oid, NULL, NULL); + if (r != -ENOENT) { + continue; + } + + string bad_loc; + 
prepend_bucket_marker(bucket, loc.key.name, bad_loc); + + /* create a new ioctx with the bad locator */ + librados::IoCtx src_ioctx; + src_ioctx.dup(ref.ioctx); + src_ioctx.locator_set_key(bad_loc); + + r = src_ioctx.stat(oid, NULL, NULL); + if (r != 0) { + /* cannot find a broken part */ + continue; + } + ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl; + if (need_fix) { + *need_fix = true; + } + if (fix) { + r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator); + if (r < 0) { + lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl; + } + } + } + } + + return 0; +} + +int RGWRados::BucketShard::init(const rgw_bucket& _bucket, + const rgw_obj& obj, + RGWBucketInfo* bucket_info_out) +{ + bucket = _bucket; + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + RGWBucketInfo bucket_info; + RGWBucketInfo* bucket_info_p = + bucket_info_out ? bucket_info_out : &bucket_info; + + int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL); + if (ret < 0) { + return ret; + } + + ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const rgw_bucket& _bucket, + int sid, + RGWBucketInfo* bucket_info_out) +{ + bucket = _bucket; + shard_id = sid; + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + RGWBucketInfo bucket_info; + RGWBucketInfo* bucket_info_p = + bucket_info_out ? 
bucket_info_out : &bucket_info; + int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL); + if (ret < 0) { + return ret; + } + + ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + bucket = bucket_info.bucket; + + int ret = store->open_bucket_index_shard(bucket_info, index_ctx, + obj.get_hash_object(), &bucket_obj, + &shard_id); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid) +{ + bucket = bucket_info.bucket; + shard_id = sid; + + int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + + +/* Execute @handler on last item in bucket listing for bucket specified + * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing + * to objects matching these criterias. 
*/ +int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info, + const std::string& obj_prefix, + const std::string& obj_delim, + std::function handler) +{ + RGWRados::Bucket target(this, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = obj_prefix; + list_op.params.delim = obj_delim; + + ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name + << ", obj_prefix=" << obj_prefix + << ", obj_delim=" << obj_delim + << dendl; + + bool is_truncated = false; + + boost::optional last_entry; + /* We need to rewind to the last object in a listing. */ + do { + /* List bucket entries in chunks. */ + static constexpr int MAX_LIST_OBJS = 100; + std::vector entries(MAX_LIST_OBJS); + + int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr, + &is_truncated); + if (ret < 0) { + return ret; + } else if (!entries.empty()) { + last_entry = entries.back(); + } + } while (is_truncated); + + if (last_entry) { + return handler(*last_entry); + } + + /* Empty listing - no items we can run handler on. */ + return 0; +} + + +int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx, + const rgw_user& user, + RGWBucketInfo& bucket_info, + rgw_obj& obj) +{ + if (! 
swift_versioning_enabled(bucket_info)) { + return 0; + } + + obj_ctx.set_atomic(obj); + + RGWObjState * state = nullptr; + int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false); + if (r < 0) { + return r; + } + + if (!state->exists) { + return 0; + } + + const string& src_name = obj.get_oid(); + char buf[src_name.size() + 32]; + struct timespec ts = ceph::real_clock::to_timespec(state->mtime); + snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(), + src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000); + + RGWBucketInfo dest_bucket_info; + + auto sysobj_ctx = svc.sysobj->init_obj_ctx(); + + r = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL); + if (r < 0) { + ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl; + if (r == -ENOENT) { + return -ERR_PRECONDITION_FAILED; + } + return r; + } + + if (dest_bucket_info.owner != bucket_info.owner) { + return -ERR_PRECONDITION_FAILED; + } + + rgw_obj dest_obj(dest_bucket_info.bucket, buf); + + if (dest_bucket_info.versioning_enabled()){ + gen_rand_obj_instance_name(&dest_obj); + } + + obj_ctx.set_atomic(dest_obj); + + string no_zone; + + r = copy_obj(obj_ctx, + user, + NULL, /* req_info *info */ + no_zone, + dest_obj, + obj, + dest_bucket_info, + bucket_info, + bucket_info.placement_rule, + NULL, /* time_t *src_mtime */ + NULL, /* time_t *mtime */ + NULL, /* const time_t *mod_ptr */ + NULL, /* const time_t *unmod_ptr */ + false, /* bool high_precision_time */ + NULL, /* const char *if_match */ + NULL, /* const char *if_nomatch */ + RGWRados::ATTRSMOD_NONE, + true, /* bool copy_if_newer */ + state->attrset, + RGWObjCategory::Main, + 0, /* uint64_t olh_epoch */ + real_time(), /* time_t delete_at */ + NULL, /* string *version_id */ + NULL, /* string *ptag */ + NULL, /* string *petag */ + NULL, /* void (*progress_cb)(off_t, void *) */ + NULL); /* void *progress_data */ + if (r == -ECANCELED || r == -ENOENT) { 
+ /* Has already been overwritten, meaning another rgw process already + * copied it out */ + return 0; + } + + return r; +} + +int RGWRados::swift_versioning_restore(RGWSysObjectCtx& sysobj_ctx, + RGWObjectCtx& obj_ctx, + const rgw_user& user, + RGWBucketInfo& bucket_info, + rgw_obj& obj, + bool& restored) /* out */ +{ + if (! swift_versioning_enabled(bucket_info)) { + return 0; + } + + /* Bucket info of the bucket that stores previous versions of our object. */ + RGWBucketInfo archive_binfo; + + int ret = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant, + bucket_info.swift_ver_location, archive_binfo, + nullptr, nullptr); + if (ret < 0) { + return ret; + } + + /* Abort the operation if the bucket storing our archive belongs to someone + * else. This is a limitation in comparison to Swift as we aren't taking ACLs + * into consideration. For we can live with that. + * + * TODO: delegate this check to un upper layer and compare with ACLs. */ + if (bucket_info.owner != archive_binfo.owner) { + return -EPERM; + } + + /* This code will be executed on latest version of the object. */ + const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int { + std::string no_zone; + + /* We don't support object versioning of Swift API on those buckets that + * are already versioned using the S3 mechanism. This affects also bucket + * storing archived objects. Otherwise the delete operation would create + * a deletion marker. */ + if (archive_binfo.versioned()) { + restored = false; + return -ERR_PRECONDITION_FAILED; + } + + /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly + * irrelevant and may be safely skipped. 
*/ + std::map no_attrs; + + rgw_obj archive_obj(archive_binfo.bucket, entry.key); + + if (bucket_info.versioning_enabled()){ + gen_rand_obj_instance_name(&obj); + } + + obj_ctx.set_atomic(archive_obj); + obj_ctx.set_atomic(obj); + + int ret = copy_obj(obj_ctx, + user, + nullptr, /* req_info *info */ + no_zone, + obj, /* dest obj */ + archive_obj, /* src obj */ + bucket_info, /* dest bucket info */ + archive_binfo, /* src bucket info */ + bucket_info.placement_rule, /* placement_rule */ + nullptr, /* time_t *src_mtime */ + nullptr, /* time_t *mtime */ + nullptr, /* const time_t *mod_ptr */ + nullptr, /* const time_t *unmod_ptr */ + false, /* bool high_precision_time */ + nullptr, /* const char *if_match */ + nullptr, /* const char *if_nomatch */ + RGWRados::ATTRSMOD_NONE, + true, /* bool copy_if_newer */ + no_attrs, + RGWObjCategory::Main, + 0, /* uint64_t olh_epoch */ + real_time(), /* time_t delete_at */ + nullptr, /* string *version_id */ + nullptr, /* string *ptag */ + nullptr, /* string *petag */ + nullptr, /* void (*progress_cb)(off_t, void *) */ + nullptr); /* void *progress_data */ + if (ret == -ECANCELED || ret == -ENOENT) { + /* Has already been overwritten, meaning another rgw process already + * copied it out */ + return 0; + } else if (ret < 0) { + return ret; + } else { + restored = true; + } + + /* Need to remove the archived copy. 
*/ + ret = delete_obj(obj_ctx, archive_binfo, archive_obj, + archive_binfo.versioning_status()); + + return ret; + }; + + const std::string& obj_name = obj.get_oid(); + const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size() + % obj_name); + + return on_last_entry_in_listing(archive_binfo, prefix, std::string(), + handler); +} + +int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size, + map& attrs, + bool assume_noent, bool modify_tail, + void *_index_op) +{ + RGWRados::Bucket::UpdateIndex *index_op = static_cast(_index_op); + RGWRados *store = target->get_store(); + + ObjectWriteOperation op; +#ifdef WITH_LTTNG + const struct req_state* s = get_req_state(); + string req_id; + if (!s) { + // fake req_id + req_id = store->svc.zone_utils->unique_id(store->get_new_req_id()); + } else { + req_id = s->req_id; + } +#endif + + RGWObjState *state; + int r = target->get_state(&state, false, assume_noent); + if (r < 0) + return r; + + rgw_obj& obj = target->get_obj(); + + if (obj.get_oid().empty()) { + ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl; + return -EIO; + } + + rgw_rados_ref ref; + r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref); + if (r < 0) + return r; + + bool is_olh = state->is_olh; + + bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0; + + const string *ptag = meta.ptag; + if (!ptag && !index_op->get_optag()->empty()) { + ptag = index_op->get_optag(); + } + r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail); + if (r < 0) + return r; + + if (real_clock::is_zero(meta.set_mtime)) { + meta.set_mtime = real_clock::now(); + } + + if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter == attrs.end()) { + real_time lock_until_date = 
target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime); + string mode = target->bucket_info.obj_lock.get_mode(); + RGWObjectRetention obj_retention(mode, lock_until_date); + bufferlist bl; + obj_retention.encode(bl); + op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl); + } + } + + if (state->is_olh) { + op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag); + } + + struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime); + op.mtime2(&mtime_ts); + + if (meta.data) { + /* if we want to overwrite the data, we also want to overwrite the + xattrs, so just remove the object */ + op.write_full(*meta.data); + } + + string etag; + string content_type; + bufferlist acl_bl; + string storage_class; + + map::iterator iter; + if (meta.rmattrs) { + for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + if (meta.manifest) { + storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class; + + /* remove existing manifest attr */ + iter = attrs.find(RGW_ATTR_MANIFEST); + if (iter != attrs.end()) + attrs.erase(iter); + + bufferlist bl; + encode(*meta.manifest, bl); + op.setxattr(RGW_ATTR_MANIFEST, bl); + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_ETAG) == 0) { + etag = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) { + content_type = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_ACL) == 0) { + acl_bl = bl; + } + } + if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) { + cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER); + } + + if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) { + bufferlist bl; + encode(store->svc.zone->get_zone_short_id(), bl); + op.setxattr(RGW_ATTR_SOURCE_ZONE, bl); + } + + if (!storage_class.empty()) { + bufferlist bl; + 
bl.append(storage_class); + op.setxattr(RGW_ATTR_STORAGE_CLASS, bl); + } + + if (!op.size()) + return 0; + + uint64_t epoch; + int64_t poolid; + bool orig_exists; + uint64_t orig_size; + + if (!reset_obj) { //Multipart upload, it has immutable head. + orig_exists = false; + orig_size = 0; + } else { + orig_exists = state->exists; + orig_size = state->accounted_size; + } + + bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) || + !obj.key.instance.empty(); + + bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target); + + if (versioned_op) { + index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP); + } + + if (!index_op->is_prepared()) { + tracepoint(rgw_rados, prepare_enter, req_id.c_str()); + r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag); + tracepoint(rgw_rados, prepare_exit, req_id.c_str()); + if (r < 0) + return r; + } + + tracepoint(rgw_rados, operate_enter, req_id.c_str()); + r = ref.ioctx.operate(ref.obj.oid, &op); + tracepoint(rgw_rados, operate_exit, req_id.c_str()); + if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under, + or -ENOENT if was removed, or -EEXIST if it did not exist + before and now it does */ + if (r == -EEXIST && assume_noent) { + target->invalidate_state(); + return r; + } + goto done_cancel; + } + + epoch = ref.ioctx.get_last_version(); + poolid = ref.ioctx.get_id(); + + r = target->complete_atomic_modification(); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl; + } + + tracepoint(rgw_rados, complete_enter, req_id.c_str()); + r = index_op->complete(poolid, epoch, size, accounted_size, + meta.set_mtime, etag, content_type, + storage_class, &acl_bl, + meta.category, meta.remove_objs, meta.user_data, meta.appendable); + tracepoint(rgw_rados, complete_exit, req_id.c_str()); + if (r < 0) + goto done_cancel; + + if (meta.mtime) { + *meta.mtime = meta.set_mtime; + } + + /* note that index_op was using state so we 
couldn't invalidate it earlier */ + target->invalidate_state(); + state = NULL; + + if (versioned_op && meta.olh_epoch) { + r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace); + if (r < 0) { + return r; + } + } + + if (!real_clock::is_zero(meta.delete_at)) { + rgw_obj_index_key obj_key; + obj.key.get_index_key(&obj_key); + + r = store->objexp_hint_add(meta.delete_at, + obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; + /* ignoring error, nothing we can do at this point */ + } + } + meta.canceled = false; + + /* update quota cache */ + if (meta.completeMultipart){ + store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1), + 0, orig_size); + } + else { + store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1), + accounted_size, orig_size); + } + return 0; + +done_cancel: + int ret = index_op->cancel(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl; + } + + meta.canceled = true; + + /* we lost in a race. 
There are a few options: + * - existing object was rewritten (ECANCELED) + * - non existing object was created (EEXIST) + * - object was removed (ENOENT) + * should treat it as a success + */ + if (meta.if_match == NULL && meta.if_nomatch == NULL) { + if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) { + r = 0; + } + } else { + if (meta.if_match != NULL) { + // only overwrite existing object + if (strcmp(meta.if_match, "*") == 0) { + if (r == -ENOENT) { + r = -ERR_PRECONDITION_FAILED; + } else if (r == -ECANCELED) { + r = 0; + } + } + } + + if (meta.if_nomatch != NULL) { + // only create a new object + if (strcmp(meta.if_nomatch, "*") == 0) { + if (r == -EEXIST) { + r = -ERR_PRECONDITION_FAILED; + } else if (r == -ENOENT) { + r = 0; + } + } + } + } + + return r; +} + +int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size, + map& attrs) +{ + RGWBucketInfo& bucket_info = target->get_bucket_info(); + + RGWRados::Bucket bop(target->get_store(), bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj()); + index_op.set_zones_trace(meta.zones_trace); + + bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL); + int r; + if (assume_noent) { + r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op); + if (r == -EEXIST) { + assume_noent = false; + } + } + if (!assume_noent) { + r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op); + } + return r; +} + +class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB +{ + CephContext* cct; + rgw_obj obj; + rgw::putobj::DataProcessor *filter; + boost::optional& compressor; + bool try_etag_verify; + rgw::putobj::etag_verifier_ptr etag_verifier; + boost::optional buffering; + CompressorRef& plugin; + rgw::putobj::ObjectProcessor *processor; + void (*progress_cb)(off_t, void *); + void *progress_data; + bufferlist extra_data_bl, manifest_bl; + std::optional compression_info; + 
uint64_t extra_data_left{0}; + bool need_to_process_attrs{true}; + uint64_t data_len{0}; + map src_attrs; + uint64_t ofs{0}; + uint64_t lofs{0}; /* logical ofs */ + std::function&)> attrs_handler; +public: + RGWRadosPutObj(CephContext* cct, + CompressorRef& plugin, + boost::optional& compressor, + rgw::putobj::ObjectProcessor *p, + void (*_progress_cb)(off_t, void *), + void *_progress_data, + std::function&)> _attrs_handler) : + cct(cct), + filter(p), + compressor(compressor), + try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify), + plugin(plugin), + processor(p), + progress_cb(_progress_cb), + progress_data(_progress_data), + attrs_handler(_attrs_handler) {} + + int process_attrs(void) { + if (extra_data_bl.length()) { + JSONParser jp; + if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) { + ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + + auto iter = src_attrs.find(RGW_ATTR_COMPRESSION); + if (iter != src_attrs.end()) { + const bufferlist bl = std::move(iter->second); + src_attrs.erase(iter); // don't preserve source compression info + + if (try_etag_verify) { + // if we're trying to verify etags, we need to convert compressed + // ranges in the manifest back into logical multipart part offsets + RGWCompressionInfo info; + bool compressed = false; + int r = rgw_compression_info_from_attr(bl, compressed, info); + if (r < 0) { + ldout(cct, 4) << "failed to decode compression info, " + "disabling etag verification" << dendl; + try_etag_verify = false; + } else if (compressed) { + compression_info = std::move(info); + } + } + } + /* We need the manifest to recompute the ETag for verification */ + iter = src_attrs.find(RGW_ATTR_MANIFEST); + if (iter != src_attrs.end()) { + manifest_bl = std::move(iter->second); + src_attrs.erase(iter); + } + + // filter out olh attributes + iter = 
src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX); + while (iter != src_attrs.end()) { + if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) { + break; + } + iter = src_attrs.erase(iter); + } + } + + int ret = attrs_handler(src_attrs); + if (ret < 0) { + return ret; + } + + if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { + //do not compress if object is encrypted + compressor = boost::in_place(cct, plugin, filter); + // add a filter that buffers data so we don't try to compress tiny blocks. + // libcurl reads in 16k at a time, and we need at least 64k to get a good + // compression ratio + constexpr unsigned buffer_size = 512 * 1024; + buffering = boost::in_place(&*compressor, buffer_size); + filter = &*buffering; + } + + /* + * Presently we don't support ETag based verification if encryption is + * requested. We can enable simultaneous support once we have a mechanism + * to know the sequence in which the filters must be applied. + */ + if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { + ret = rgw::putobj::create_etag_verifier(cct, filter, manifest_bl, + compression_info, + etag_verifier); + if (ret < 0) { + ldout(cct, 4) << "failed to initial etag verifier, " + "disabling etag verification" << dendl; + } else { + filter = etag_verifier.get(); + } + } + + need_to_process_attrs = false; + + return 0; + } + + int handle_data(bufferlist& bl, bool *pause) override { + if (progress_cb) { + progress_cb(data_len, progress_data); + } + if (extra_data_left) { + uint64_t extra_len = bl.length(); + if (extra_len > extra_data_left) + extra_len = extra_data_left; + + bufferlist extra; + bl.splice(0, extra_len, &extra); + extra_data_bl.append(extra); + + extra_data_left -= extra_len; + if (extra_data_left == 0) { + int res = process_attrs(); + if (res < 0) + return res; + } + ofs += extra_len; + if (bl.length() == 0) { + return 0; + } + } + if (need_to_process_attrs) { + /* need to call process_attrs() even if we 
don't get any attrs, + * need it to call attrs_handler(). + */ + int res = process_attrs(); + if (res < 0) { + return res; + } + } + + ceph_assert(uint64_t(ofs) >= extra_data_len); + + uint64_t size = bl.length(); + ofs += size; + + const uint64_t lofs = data_len; + data_len += size; + + return filter->process(std::move(bl), lofs); + } + + int flush() { + return filter->process({}, data_len); + } + + bufferlist& get_extra_data() { return extra_data_bl; } + + map& get_attrs() { return src_attrs; } + + void set_extra_data_len(uint64_t len) override { + extra_data_left = len; + RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len); + } + + uint64_t get_data_len() { + return data_len; + } + + std::string get_verifier_etag() { + if (etag_verifier) { + etag_verifier->calculate_etag(); + return etag_verifier->get_calculated_etag(); + } else { + return ""; + } + } +}; + +/* + * prepare attrset depending on attrs_mod. + */ +static void set_copy_attrs(map& src_attrs, + map& attrs, + RGWRados::AttrsMod attrs_mod) +{ + switch (attrs_mod) { + case RGWRados::ATTRSMOD_NONE: + attrs = src_attrs; + break; + case RGWRados::ATTRSMOD_REPLACE: + if (!attrs[RGW_ATTR_ETAG].length()) { + attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG]; + } + if (!attrs[RGW_ATTR_TAIL_TAG].length()) { + auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG); + if (ttiter != src_attrs.end()) { + attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG]; + } + } + break; + case RGWRados::ATTRSMOD_MERGE: + for (map::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) { + if (attrs.find(it->first) == attrs.end()) { + attrs[it->first] = it->second; + } + } + break; + } +} + +int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj) +{ + map attrset; + + real_time mtime; + uint64_t obj_size; + RGWObjectCtx rctx(this); + + RGWRados::Object op_target(this, dest_bucket_info, rctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrset; + 
read_op.params.lastmod = &mtime; + read_op.params.obj_size = &obj_size; + + int ret = read_op.prepare(); + if (ret < 0) + return ret; + + attrset.erase(RGW_ATTR_ID_TAG); + attrset.erase(RGW_ATTR_TAIL_TAG); + + return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule, + read_op, obj_size - 1, obj, NULL, mtime, attrset, + 0, real_time(), NULL); +} + +struct obj_time_weight { + real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + bool high_precision; + + obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {} + + bool compare_low_precision(const obj_time_weight& rhs) { + struct timespec l = ceph::real_clock::to_timespec(mtime); + struct timespec r = ceph::real_clock::to_timespec(rhs.mtime); + l.tv_nsec = 0; + r.tv_nsec = 0; + if (l > r) { + return false; + } + if (l < r) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + + } + + bool operator<(const obj_time_weight& rhs) { + if (!high_precision || !rhs.high_precision) { + return compare_low_precision(rhs); + } + if (mtime > rhs.mtime) { + return false; + } + if (mtime < rhs.mtime) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + } + + void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) { + mtime = _mtime; + zone_short_id = _short_id; + pg_ver = _pg_ver; + } + + void init(RGWObjState *state) { + mtime = state->mtime; + zone_short_id = state->zone_short_id; + pg_ver = state->pg_ver; + } +}; + +inline ostream& operator<<(ostream& out, const obj_time_weight &o) { + out << o.mtime; + + if (o.zone_short_id != 0 || o.pg_ver 
!= 0) { + out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]"; + } + + return out; +} + +class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB { + bufferlist extra_data; +public: + RGWGetExtraDataCB() {} + int handle_data(bufferlist& bl, bool *pause) override { + int bl_len = (int)bl.length(); + if (extra_data.length() < extra_data_len) { + off_t max = extra_data_len - extra_data.length(); + if (max > bl_len) { + max = bl_len; + } + bl.splice(0, max, &extra_data); + } + return bl_len; + } + + bufferlist& get_extra_data() { + return extra_data; + } +}; + +int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const string& source_zone, + rgw_obj& src_obj, + RGWBucketInfo& src_bucket_info, + real_time *src_mtime, + uint64_t *psize, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + map *pattrs, + map *pheaders, + string *version_id, + string *ptag, + string *petag) +{ + /* source is in a different zonegroup, copy from there */ + + RGWRESTStreamRWRequest *in_stream_req; + string tag; + map src_attrs; + append_rand_alpha(cct, tag, tag, 32); + obj_time_weight set_mtime_weight; + set_mtime_weight.high_precision = high_precision_time; + + RGWRESTConn *conn; + if (source_zone.empty()) { + if (src_bucket_info.zonegroup.empty()) { + /* source is in the master zonegroup */ + conn = svc.zone->get_master_conn(); + } else { + auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); + map::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup); + if (iter == zonegroup_conn_map.end()) { + ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + } else { + auto& zone_conn_map = svc.zone->get_zone_conn_map(); + map::iterator iter = zone_conn_map.find(source_zone); + if (iter == zone_conn_map.end()) { + ldout(cct, 0) << 
"could not find zone connection to zone: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + + RGWGetExtraDataCB cb; + map req_headers; + real_time set_mtime; + + const real_time *pmod = mod_ptr; + + obj_time_weight dest_mtime_weight; + + constexpr bool prepend_meta = true; + constexpr bool get_op = true; + constexpr bool rgwx_stat = true; + constexpr bool sync_manifest = true; + constexpr bool skip_decrypt = true; + int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr, + dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, + prepend_meta, get_op, rgwx_stat, + sync_manifest, skip_decrypt, + true, &cb, &in_stream_req); + if (ret < 0) { + return ret; + } + + ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders); + if (ret < 0) { + return ret; + } + + bufferlist& extra_data_bl = cb.get_extra_data(); + if (extra_data_bl.length()) { + JSONParser jp; + if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) { + ldout(cct, 0) << "failed to parse response extra data. 
len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + + src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout + } + + if (src_mtime) { + *src_mtime = set_mtime; + } + + if (petag) { + map::iterator iter = src_attrs.find(RGW_ATTR_ETAG); + if (iter != src_attrs.end()) { + bufferlist& etagbl = iter->second; + *petag = etagbl.to_str(); + while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') { + *petag = petag->substr(0, petag->size() - 1); + } + } + } + + if (pattrs) { + *pattrs = std::move(src_attrs); + } + + return 0; +} + +int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const string& source_zone, + const rgw_obj& dest_obj, + const rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo& src_bucket_info, + std::optional dest_placement_rule, + real_time *src_mtime, + real_time *mtime, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + map& attrs, + RGWObjCategory category, + std::optional olh_epoch, + real_time delete_at, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + rgw_zone_set *zones_trace, + std::optional* bytes_transferred) +{ + /* source is in a different zonegroup, copy from there */ + + RGWRESTStreamRWRequest *in_stream_req; + string tag; + int i; + append_rand_alpha(cct, tag, tag, 32); + obj_time_weight set_mtime_weight; + set_mtime_weight.high_precision = high_precision_time; + int ret; + + rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size); + using namespace rgw::putobj; + const rgw_placement_rule *ptail_rule = (dest_placement_rule ? 
&(*dest_placement_rule) : nullptr); + AtomicObjectProcessor processor(&aio, this, dest_bucket_info, ptail_rule, user_id, + obj_ctx, dest_obj, olh_epoch, tag); + RGWRESTConn *conn; + auto& zone_conn_map = svc.zone->get_zone_conn_map(); + auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); + if (source_zone.empty()) { + if (dest_bucket_info.zonegroup.empty()) { + /* source is in the master zonegroup */ + conn = svc.zone->get_master_conn(); + } else { + map::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup); + if (iter == zonegroup_conn_map.end()) { + ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + } else { + map::iterator iter = zone_conn_map.find(source_zone); + if (iter == zone_conn_map.end()) { + ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + + string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid(); + + boost::optional compressor; + CompressorRef plugin; + + rgw_placement_rule dest_rule; + RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data, + [&](const map& obj_attrs) { + if (!ptail_rule) { + auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != obj_attrs.end()) { + dest_rule.storage_class = iter->second.to_str(); + dest_rule.inherit_from(dest_bucket_info.placement_rule); + processor.set_tail_placement(std::move(dest_rule)); + ptail_rule = &dest_rule; + } else { + ptail_rule = &dest_bucket_info.placement_rule; + } + } + const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule); + if (compression_type != "none") { + plugin = Compressor::create(cct, compression_type); + if (!plugin) { + ldout(cct, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } + } + + int ret = processor.prepare(); + if (ret < 0) { + return ret; + } + return 0; + 
}); + + string etag; + real_time set_mtime; + uint64_t expected_size = 0; + + RGWObjState *dest_state = NULL; + + const real_time *pmod = mod_ptr; + + obj_time_weight dest_mtime_weight; + + if (copy_if_newer) { + /* need to get mtime for destination */ + ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false); + if (ret < 0) + goto set_err_state; + + if (!real_clock::is_zero(dest_state->mtime)) { + dest_mtime_weight.init(dest_state); + pmod = &dest_mtime_weight.mtime; + } + } + + static constexpr bool prepend_meta = true; + static constexpr bool get_op = true; + static constexpr bool rgwx_stat = false; + static constexpr bool sync_manifest = true; + static constexpr bool skip_decrypt = true; + ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr, + dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, + prepend_meta, get_op, rgwx_stat, + sync_manifest, skip_decrypt, + true, + &cb, &in_stream_req); + if (ret < 0) { + goto set_err_state; + } + + ret = conn->complete_request(in_stream_req, &etag, &set_mtime, + &expected_size, nullptr, nullptr); + if (ret < 0) { + goto set_err_state; + } + ret = cb.flush(); + if (ret < 0) { + goto set_err_state; + } + if (cb.get_data_len() != expected_size) { + ret = -EIO; + ldout(cct, 0) << "ERROR: object truncated during fetching, expected " + << expected_size << " bytes but received " << cb.get_data_len() << dendl; + goto set_err_state; + } + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = cb.get_data_len(); + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp; + } + + if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */ + cb.get_attrs().erase(RGW_ATTR_DELETE_AT); + } else { + map::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT); + if (iter != 
cb.get_attrs().end()) { + try { + decode(delete_at, iter->second); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; + } + } + } + + if (src_mtime) { + *src_mtime = set_mtime; + } + + if (petag) { + const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG); + if (iter != cb.get_attrs().end()) { + *petag = iter->second.to_str(); + } + } + + //erase the append attr + cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM); + + if (source_zone.empty()) { + set_copy_attrs(cb.get_attrs(), attrs, attrs_mod); + } else { + attrs = cb.get_attrs(); + } + + if (copy_if_newer) { + uint64_t pg_ver = 0; + auto i = attrs.find(RGW_ATTR_PG_VER); + if (i != attrs.end() && i->second.length() > 0) { + auto iter = i->second.cbegin(); + try { + decode(pg_ver, iter); + } catch (buffer::error& err) { + ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl; + /* non critical error */ + } + } + set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver); + } + + /* Perform ETag verification is we have computed the object's MD5 sum at our end */ + if (const auto& verifier_etag = cb.get_verifier_etag(); + !verifier_etag.empty()) { + string trimmed_etag = etag; + + /* Remove the leading and trailing double quotes from etag */ + trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'), + trimmed_etag.end()); + + if (verifier_etag != trimmed_etag) { + ret = -EIO; + ldout(cct, 0) << "ERROR: source and destination objects don't match. 
Expected etag:" + << trimmed_etag << " Computed etag:" << verifier_etag << dendl; + goto set_err_state; + } + } + +#define MAX_COMPLETE_RETRY 100 + for (i = 0; i < MAX_COMPLETE_RETRY; i++) { + bool canceled = false; + ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime, + attrs, delete_at, nullptr, nullptr, nullptr, + zones_trace, &canceled); + if (ret < 0) { + goto set_err_state; + } + + if (copy_if_newer && canceled) { + ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl; + obj_ctx.invalidate(dest_obj); /* object was overwritten */ + ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; + goto set_err_state; + } + dest_mtime_weight.init(dest_state); + dest_mtime_weight.high_precision = high_precision_time; + if (!dest_state->exists || + dest_mtime_weight < set_mtime_weight) { + ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + continue; + } else { + ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + } + } + break; + } + + if (i == MAX_COMPLETE_RETRY) { + ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" 
<< dendl; + ret = -EIO; + goto set_err_state; + } + + if (bytes_transferred) { + *bytes_transferred = cb.get_data_len(); + } + return 0; +set_err_state: + if (copy_if_newer && ret == -ERR_NOT_MODIFIED) { + // we may have already fetched during sync of OP_ADD, but were waiting + // for OP_LINK_OLH to call set_olh() with a real olh_epoch + if (olh_epoch && *olh_epoch > 0) { + constexpr bool log_data_change = true; + ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr, + *olh_epoch, real_time(), false, zones_trace, log_data_change); + } else { + // we already have the latest copy + ret = 0; + } + } + return ret; +} + + +int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate, + map& src_attrs, + RGWRados::Object::Read& read_op, + const rgw_user& user_id, + rgw_obj& dest_obj, + real_time *mtime) +{ + string etag; + + RGWRESTStreamS3PutObj *out_stream_req; + + auto rest_master_conn = svc.zone->get_master_conn(); + + int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req); + if (ret < 0) { + return ret; + } + + ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb()); + if (ret < 0) { + delete out_stream_req; + return ret; + } + + ret = rest_master_conn->complete_request(out_stream_req, etag, mtime); + if (ret < 0) + return ret; + + return 0; +} + +/** + * Copy an object. + * dest_obj: the object to copy into + * src_obj: the object to copy from + * attrs: usage depends on attrs_mod parameter + * attrs_mod: the modification mode of the attrs, may have the following values: + * ATTRSMOD_NONE - the attributes of the source object will be + * copied without modifications, attrs parameter is ignored; + * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs + * parameter, source object attributes are not copied; + * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes + * are overwritten by values contained in attrs parameter. 
+ * err: stores any errors resulting from the get of the original object + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const string& source_zone, + rgw_obj& dest_obj, + rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo& src_bucket_info, + const rgw_placement_rule& dest_placement, + real_time *src_mtime, + real_time *mtime, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + map& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + real_time delete_at, + string *version_id, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data) +{ + int ret; + uint64_t obj_size; + rgw_obj shadow_obj = dest_obj; + string shadow_oid; + + bool remote_src; + bool remote_dest; + + append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32); + shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns); + + auto& zonegroup = svc.zone->get_zonegroup(); + + remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup); + remote_src = !zonegroup.equals(src_bucket_info.zonegroup); + + if (remote_src && remote_dest) { + ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl; + return -EINVAL; + } + + ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl; + + if (remote_src || !source_zone.empty()) { + return fetch_remote_obj(obj_ctx, user_id, info, source_zone, + dest_obj, src_obj, dest_bucket_info, src_bucket_info, + dest_placement, src_mtime, mtime, mod_ptr, + unmod_ptr, high_precision_time, + if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category, + olh_epoch, delete_at, ptag, petag, progress_cb, progress_data); + } + + map src_attrs; + RGWRados::Object src_op_target(this, 
src_bucket_info, obj_ctx, src_obj); + RGWRados::Object::Read read_op(&src_op_target); + + read_op.conds.mod_ptr = mod_ptr; + read_op.conds.unmod_ptr = unmod_ptr; + read_op.conds.high_precision_time = high_precision_time; + read_op.conds.if_match = if_match; + read_op.conds.if_nomatch = if_nomatch; + read_op.params.attrs = &src_attrs; + read_op.params.lastmod = src_mtime; + read_op.params.obj_size = &obj_size; + + ret = read_op.prepare(); + if (ret < 0) { + return ret; + } + if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) { + // Current implementation does not follow S3 spec and even + // may result in data corruption silently when copying + // multipart objects acorss pools. So reject COPY operations + //on encrypted objects before it is fully functional. + ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj + << " has not been implemented." << dendl; + return -ERR_NOT_IMPLEMENTED; + } + + src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL]; + src_attrs.erase(RGW_ATTR_DELETE_AT); + + set_copy_attrs(src_attrs, attrs, attrs_mod); + attrs.erase(RGW_ATTR_ID_TAG); + attrs.erase(RGW_ATTR_PG_VER); + attrs.erase(RGW_ATTR_SOURCE_ZONE); + map::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION); + if (cmp != src_attrs.end()) + attrs[RGW_ATTR_COMPRESSION] = cmp->second; + + RGWObjManifest manifest; + RGWObjState *astate = NULL; + + ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate); + if (ret < 0) { + return ret; + } + + vector ref_objs; + + if (remote_dest) { + /* dest is in a different zonegroup, copy it there */ + return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime); + } + uint64_t max_chunk_size; + + ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl; + return ret; + } + + rgw_pool src_pool; + rgw_pool dest_pool; + + const rgw_placement_rule *src_rule{nullptr}; + + if 
(astate->has_manifest) { + src_rule = &astate->manifest.get_tail_placement().placement_rule; + ldout(cct, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl; + } + + if (!src_rule || src_rule->empty()) { + src_rule = &src_bucket_info.placement_rule; + } + + if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) { + ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl; + return -EIO; + } + + if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) { + ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl; + return -EIO; + } + + ldout(cct, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool + << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl; + + bool copy_data = !astate->has_manifest || + (*src_rule != dest_placement) || + (src_pool != dest_pool); + + bool copy_first = false; + if (astate->has_manifest) { + if (!astate->manifest.has_tail()) { + copy_data = true; + } else { + uint64_t head_size = astate->manifest.get_head_size(); + + if (head_size > 0) { + if (head_size > max_chunk_size) { + copy_data = true; + } else { + copy_first = true; + } + } + } + } + + if (petag) { + const auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + *petag = iter->second.to_str(); + } + } + + if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */ + attrs.erase(RGW_ATTR_TAIL_TAG); + return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj, + mtime, real_time(), attrs, olh_epoch, delete_at, petag); + } + + RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin(); + + if (copy_first) { // we need to copy first chunk, not increase refcount + ++miter; + } + + rgw_rados_ref ref; + ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref); + if (ret < 0) { + return ret; + } + + bufferlist first_chunk; + + bool copy_itself = (dest_obj == src_obj); + 
RGWObjManifest *pmanifest; + ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl; + + RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj); + RGWRados::Object::Write write_op(&dest_op_target); + + string tag; + + if (ptag) { + tag = *ptag; + } + + if (tag.empty()) { + append_rand_alpha(cct, tag, tag, 32); + } + + if (!copy_itself) { + attrs.erase(RGW_ATTR_TAIL_TAG); + manifest = astate->manifest; + const rgw_bucket_placement& tail_placement = manifest.get_tail_placement(); + if (tail_placement.bucket.name.empty()) { + manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket); + } + string ref_tag; + for (; miter != astate->manifest.obj_end(); ++miter) { + ObjectWriteOperation op; + ref_tag = tag + '\0'; + cls_refcount_get(op, ref_tag, true); + const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this); + ref.ioctx.locator_set_key(loc.loc); + + ret = ref.ioctx.operate(loc.oid, &op); + if (ret < 0) { + goto done_ret; + } + + ref_objs.push_back(loc); + } + + pmanifest = &manifest; + } else { + pmanifest = &astate->manifest; + /* don't send the object's tail for garbage collection */ + astate->keep_tail = true; + } + + if (copy_first) { + ret = read_op.read(0, max_chunk_size, first_chunk); + if (ret < 0) { + goto done_ret; + } + + pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length()); + } else { + pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0); + } + + write_op.meta.data = &first_chunk; + write_op.meta.manifest = pmanifest; + write_op.meta.ptag = &tag; + write_op.meta.owner = dest_bucket_info.owner; + write_op.meta.mtime = mtime; + write_op.meta.flags = PUT_OBJ_CREATE; + write_op.meta.category = category; + write_op.meta.olh_epoch = olh_epoch; + write_op.meta.delete_at = delete_at; + write_op.meta.modify_tail = !copy_itself; + + ret = write_op.write_meta(obj_size, astate->accounted_size, attrs); + if (ret < 0) { + 
goto done_ret; + } + + return 0; + +done_ret: + if (!copy_itself) { + vector::iterator riter; + + /* rollback reference */ + string ref_tag = tag + '\0'; + for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) { + ObjectWriteOperation op; + cls_refcount_put(op, ref_tag, true); + + ref.ioctx.locator_set_key(riter->loc); + + int r = ref.ioctx.operate(riter->oid, &op); + if (r < 0) { + ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl; + } + } + } + return ret; +} + + +int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx, + RGWBucketInfo& dest_bucket_info, + const rgw_placement_rule& dest_placement, + RGWRados::Object::Read& read_op, off_t end, + const rgw_obj& dest_obj, + real_time *mtime, + real_time set_mtime, + map& attrs, + uint64_t olh_epoch, + real_time delete_at, + string *petag) +{ + string tag; + append_rand_alpha(cct, tag, tag, 32); + + rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size); + using namespace rgw::putobj; + AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement, + dest_bucket_info.owner, obj_ctx, + dest_obj, olh_epoch, tag); + int ret = processor.prepare(); + if (ret < 0) + return ret; + + off_t ofs = 0; + + do { + bufferlist bl; + ret = read_op.read(ofs, end, bl); + if (ret < 0) { + ldout(cct, 0) << "ERROR: fail to read object data, ret = " << ret << dendl; + return ret; + } + + uint64_t read_len = ret; + ret = processor.process(std::move(bl), ofs); + if (ret < 0) { + return ret; + } + + ofs += read_len; + } while (ofs <= end); + + // flush + ret = processor.process({}, ofs); + if (ret < 0) { + return ret; + } + + string etag; + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + etag = bl.to_str(); + if (petag) { + *petag = etag; + } + } + + uint64_t accounted_size; + { + bool compressed{false}; + RGWCompressionInfo cs_info; + ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info); + if (ret 
< 0) { + ldout(cct, 0) << "ERROR: failed to read compression info" << dendl; + return ret; + } + // pass original size if compressed + accounted_size = compressed ? cs_info.orig_size : ofs; + } + + return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, + nullptr, nullptr, nullptr, nullptr, nullptr); +} + +int RGWRados::transition_obj(RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + rgw_obj& obj, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch) +{ + map attrs; + real_time read_mtime; + uint64_t obj_size; + + RGWRados::Object op_target(this, bucket_info, obj_ctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + read_op.params.lastmod = &read_mtime; + read_op.params.obj_size = &obj_size; + + int ret = read_op.prepare(); + if (ret < 0) { + return ret; + } + + if (read_mtime != mtime) { + /* raced */ + return -ECANCELED; + } + + ret = copy_obj_data(obj_ctx, + bucket_info, + placement_rule, + read_op, + obj_size - 1, + obj, + nullptr /* pmtime */, + mtime, + attrs, + olh_epoch, + real_time(), + nullptr /* petag */); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info) +{ + constexpr uint NUM_ENTRIES = 1000u; + + rgw_obj_index_key marker; + string prefix; + bool is_truncated; + + do { + std::vector ent_list; + ent_list.reserve(NUM_ENTRIES); + + int r = cls_bucket_list_unordered(bucket_info, + RGW_NO_SHARD, + marker, + prefix, + NUM_ENTRIES, + true, + ent_list, + &is_truncated, + &marker); + if (r < 0) { + return r; + } + + string ns; + for (auto const& dirent : ent_list) { + rgw_obj_key obj; + + if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) { + return -ENOTEMPTY; + } + } + } while (is_truncated); + + return 0; +} + +/** + * Delete a bucket. + * bucket: the name of the bucket to delete + * Returns 0 on success, -ERR# otherwise. 
+ */ +int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty) +{ + const rgw_bucket& bucket = bucket_info.bucket; + librados::IoCtx index_ctx; + map bucket_objs; + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs); + if (r < 0) + return r; + + if (check_empty) { + r = check_bucket_empty(bucket_info); + if (r < 0) { + return r; + } + } + + r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker); + if (r < 0) + return r; + + /* if the bucket is not synced we can remove the meta file */ + if (!svc.zone->is_syncing_bucket_meta(bucket)) { + RGWObjVersionTracker objv_tracker; + r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker); + if (r < 0) { + return r; + } + + /* remove bucket index objects asynchronously by best effort */ + (void) CLSRGWIssueBucketIndexClean(index_ctx, + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); + } + + return 0; +} + +int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner) +{ + RGWBucketInfo info; + map attrs; + auto obj_ctx = svc.sysobj->init_obj_ctx(); + int r; + if (bucket.bucket_id.empty()) { + r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs); + } else { + r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs); + } + if (r < 0) { + ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; + return r; + } + + info.owner = owner.get_id(); + + r = put_bucket_instance_info(info, false, real_time(), &attrs); + if (r < 0) { + ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; + return r; + } + + return 0; +} + + +int RGWRados::set_buckets_enabled(vector& buckets, bool enabled) +{ + int ret = 0; + + vector::iterator iter; + + for (iter = buckets.begin(); iter != buckets.end(); ++iter) { + rgw_bucket& bucket = *iter; + if (enabled) + ldout(cct, 20) << "enabling bucket name=" 
<< bucket.name << dendl; + else + ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl; + + RGWBucketInfo info; + map attrs; + auto obj_ctx = svc.sysobj->init_obj_ctx(); + int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs); + if (r < 0) { + ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + if (enabled) { + info.flags &= ~BUCKET_SUSPENDED; + } else { + info.flags |= BUCKET_SUSPENDED; + } + + r = put_bucket_instance_info(info, false, real_time(), &attrs); + if (r < 0) { + ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + } + return ret; +} + +int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended) +{ + RGWBucketInfo bucket_info; + auto obj_ctx = svc.sysobj->init_obj_ctx(); + int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL); + if (ret < 0) { + return ret; + } + + *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0); + return 0; +} + +int RGWRados::Object::complete_atomic_modification() +{ + if (!state->has_manifest || state->keep_tail) + return 0; + + cls_rgw_obj_chain chain; + store->update_gc_chain(obj, state->manifest, &chain); + + if (chain.empty()) { + return 0; + } + + string tag = (state->tail_tag.length() > 0 ? 
state->tail_tag.to_str() : state->obj_tag.to_str()); + return store->gc->send_chain(chain, tag, false); // do it async +} + +void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain) +{ + RGWObjManifest::obj_iterator iter; + rgw_raw_obj raw_head; + obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head); + for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) { + const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this); + if (mobj == raw_head) + continue; + cls_rgw_obj_key key(mobj.oid); + chain->push_obj(mobj.pool.to_str(), key, mobj.loc); + } +} + +int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync) +{ + return gc->send_chain(chain, tag, sync); +} + +int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, + librados::IoCtx& index_ctx, + string& bucket_oid) +{ + const rgw_bucket& bucket = bucket_info.bucket; + int r = open_bucket_index_ctx(bucket_info, index_ctx); + if (r < 0) + return r; + + if (bucket.bucket_id.empty()) { + ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl; + return -EIO; + } + + bucket_oid = dir_oid_prefix; + bucket_oid.append(bucket.bucket_id); + + return 0; +} + +int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, + librados::IoCtx& index_ctx, + string& bucket_oid_base) { + const rgw_bucket& bucket = bucket_info.bucket; + int r = open_bucket_index_ctx(bucket_info, index_ctx); + if (r < 0) + return r; + + if (bucket.bucket_id.empty()) { + ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl; + return -EIO; + } + + bucket_oid_base = dir_oid_prefix; + bucket_oid_base.append(bucket.bucket_id); + + return 0; + +} + +int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, + librados::IoCtx& index_ctx, + map& bucket_objs, + int shard_id, + map *bucket_instance_ids) { + string bucket_oid_base; + int ret = open_bucket_index_base(bucket_info, index_ctx, 
bucket_oid_base); + if (ret < 0) { + return ret; + } + + get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id); + if (bucket_instance_ids) { + get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids); + } + return 0; +} + +template +int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + map& oids, map& bucket_objs, + int shard_id, map *bucket_instance_ids) +{ + int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids); + if (ret < 0) + return ret; + + map::const_iterator iter = oids.begin(); + for (; iter != oids.end(); ++iter) { + bucket_objs[iter->first] = T(); + } + return 0; +} + +int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + const string& obj_key, string *bucket_obj, int *shard_id) +{ + string bucket_oid_base; + int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base); + if (ret < 0) + return ret; + + RGWObjectCtx obj_ctx(this); + + ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards, + (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id); + if (ret < 0) { + ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + int shard_id, string *bucket_obj) +{ + string bucket_oid_base; + int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base); + if (ret < 0) + return ret; + + RGWObjectCtx obj_ctx(this); + + get_bucket_index_object(bucket_oid_base, bucket_info.num_shards, + shard_id, bucket_obj); + return 0; +} + +static void accumulate_raw_stats(const rgw_bucket_dir_header& header, + map& stats) +{ + for (const auto& pair : header.stats) { + const RGWObjCategory category = static_cast(pair.first); + const rgw_bucket_category_stats& 
header_stats = pair.second; + + RGWStorageStats& s = stats[category]; + + s.category = category; + s.size += header_stats.total_size; + s.size_rounded += header_stats.total_size_rounded; + s.size_utilized += header_stats.actual_size; + s.num_objects += header_stats.num_entries; + } +} + +int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info, + map *existing_stats, + map *calculated_stats) +{ + librados::IoCtx index_ctx; + // key - bucket index object id + // value - bucket index check OP returned result with the given bucket index object (shard) + map oids; + map bucket_objs_ret; + + int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret); + if (ret < 0) { + return ret; + } + + ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)(); + if (ret < 0) { + return ret; + } + + // Aggregate results (from different shards if there is any) + map::iterator iter; + for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) { + accumulate_raw_stats(iter->second.existing_header, *existing_stats); + accumulate_raw_stats(iter->second.calculated_header, *calculated_stats); + } + + return 0; +} + +int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info) +{ + librados::IoCtx index_ctx; + map bucket_objs; + + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs); + if (r < 0) { + return r; + } + + return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry) +{ + librados::IoCtx index_ctx; + map bucket_objs; + + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs); + if (r < 0) { + return r; + } + + return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj) +{ + RGWObjectCtx 
*rctx = static_cast(ctx); + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + if (!rctx) + return 0; + + RGWObjState *state = NULL; + + int r = get_obj_state(rctx, bucket_info, obj, &state, false); + if (r < 0) + return r; + + if (!state->is_atomic) { + ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl; + return -EINVAL; + } + + string tag; + + if (state->tail_tag.length() > 0) { + tag = state->tail_tag.c_str(); + } else if (state->obj_tag.length() > 0) { + tag = state->obj_tag.c_str(); + } else { + ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl; + return -EINVAL; + } + + ldout(cct, 0) << "defer chain tag=" << tag << dendl; + + return gc->defer_chain(tag, false); +} + +void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op) +{ + list prefixes; + prefixes.push_back(RGW_ATTR_OLH_PREFIX); + cls_rgw_remove_obj(op, prefixes); +} + +void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist) +{ + cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist); +} + +void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type) +{ + cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type); +} + + +/** + * Delete an object. + * bucket: name of the bucket storing the object + * obj: name of the object to delete + * Returns: 0 on success, -ERR# otherwise. 
+ */ +int RGWRados::Object::Delete::delete_obj() +{ + RGWRados *store = target->get_store(); + rgw_obj& src_obj = target->get_obj(); + const string& instance = src_obj.key.instance; + rgw_obj obj = src_obj; + + if (instance == "null") { + obj.key.instance.clear(); + } + + bool explicit_marker_version = (!params.marker_version_id.empty()); + + if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) { + if (instance.empty() || explicit_marker_version) { + rgw_obj marker = obj; + + if (!params.marker_version_id.empty()) { + if (params.marker_version_id != "null") { + marker.key.set_instance(params.marker_version_id); + } + } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) { + store->gen_rand_obj_instance_name(&marker); + } + + result.version_id = marker.key.instance; + if (result.version_id.empty()) + result.version_id = "null"; + result.delete_marker = true; + + struct rgw_bucket_dir_entry_meta meta; + + meta.owner = params.obj_owner.get_id().to_str(); + meta.owner_display_name = params.obj_owner.get_display_name(); + + if (real_clock::is_zero(params.mtime)) { + meta.mtime = real_clock::now(); + } else { + meta.mtime = params.mtime; + } + + int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace); + if (r < 0) { + return r; + } + } else { + rgw_bucket_dir_entry dirent; + + int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent); + if (r < 0) { + return r; + } + result.delete_marker = dirent.is_delete_marker(); + r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace); + if (r < 0) { + return r; + } + result.version_id = instance; + } + + BucketShard *bs; + int r = target->get_bucket_shard(&bs); + if (r < 0) { + ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl; + return r; + } + + if 
(target->bucket_info.datasync_flag_enabled()) { + r = store->data_log->add_entry(bs->bucket, bs->shard_id); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; + return r; + } + } + + return 0; + } + + rgw_rados_ref ref; + int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref); + if (r < 0) { + return r; + } + + RGWObjState *state; + r = target->get_state(&state, false); + if (r < 0) + return r; + + ObjectWriteOperation op; + + if (!real_clock::is_zero(params.unmod_since)) { + struct timespec ctime = ceph::real_clock::to_timespec(state->mtime); + struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since); + if (!params.high_precision_time) { + ctime.tv_nsec = 0; + unmod.tv_nsec = 0; + } + + ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl; + if (ctime > unmod) { + return -ERR_PRECONDITION_FAILED; + } + + /* only delete object if mtime is less than or equal to params.unmod_since */ + store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE); + } + uint64_t obj_accounted_size = state->accounted_size; + + if(params.abortmp) { + obj_accounted_size = params.parts_accounted_size; + } + + if (!real_clock::is_zero(params.expiration_time)) { + bufferlist bl; + real_time delete_at; + + if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) { + try { + auto iter = bl.cbegin(); + decode(delete_at, iter); + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; + return -EIO; + } + + if (params.expiration_time != delete_at) { + return -ERR_PRECONDITION_FAILED; + } + } else { + return -ERR_PRECONDITION_FAILED; + } + } + + if (!state->exists) { + target->invalidate_state(); + return -ENOENT; + } + + r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false); + if (r < 0) + return r; + + RGWBucketInfo& bucket_info = 
target->get_bucket_info(); + + RGWRados::Bucket bop(store, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + index_op.set_zones_trace(params.zones_trace); + index_op.set_bilog_flags(params.bilog_flags); + + r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag); + if (r < 0) + return r; + + store->remove_rgw_head_obj(op); + r = ref.ioctx.operate(ref.obj.oid, &op); + + /* raced with another operation, object state is indeterminate */ + const bool need_invalidate = (r == -ECANCELED); + + int64_t poolid = ref.ioctx.get_id(); + if (r >= 0) { + tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache(); + if (obj_tombstone_cache) { + tombstone_entry entry{*state}; + obj_tombstone_cache->add(obj, entry); + } + r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs); + + int ret = target->complete_atomic_modification(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl; + } + /* other than that, no need to propagate error */ + } else { + int ret = index_op.cancel(); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; + } + } + + if (need_invalidate) { + target->invalidate_state(); + } + + if (r < 0) + return r; + + /* update quota cache */ + store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size); + + return 0; +} + +int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + int versioning_status, + uint16_t bilog_flags, + const real_time& expiration_time, + rgw_zone_set *zones_trace) +{ + RGWRados::Object del_target(this, bucket_info, obj_ctx, obj); + RGWRados::Object::Delete del_op(&del_target); + + del_op.params.bucket_owner = bucket_info.owner; + del_op.params.versioning_status = versioning_status; + del_op.params.bilog_flags = bilog_flags; + del_op.params.expiration_time = expiration_time; + 
del_op.params.zones_trace = zones_trace; + + return del_op.delete_obj(); +} + +int RGWRados::delete_raw_obj(const rgw_raw_obj& obj) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + + op.remove(); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime) +{ + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + + auto obj_ctx = svc.sysobj->init_obj_ctx(); + + RGWBucketInfo bucket_info; + int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl; + return ret; + } + + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + return index_op.complete_del(-1 /* pool */, 0, mtime, NULL); +} + +static void generate_fake_tag(RGWRados *store, map& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl) +{ + string tag; + + RGWObjManifest::obj_iterator mi = manifest.obj_begin(); + if (mi != manifest.obj_end()) { + if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part + ++mi; + tag = mi.get_location().get_raw_obj(store).oid; + tag.append("_"); + } + + unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + MD5 hash; + hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length()); + + map::iterator iter = attrset.find(RGW_ATTR_ETAG); + if (iter != attrset.end()) { + bufferlist& bl = iter->second; + hash.Update((const unsigned char *)bl.c_str(), bl.length()); + } + + hash.Final(md5); + buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str); + tag.append(md5_str); + + ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl; + + 
tag_bl.append(tag.c_str(), tag.size() + 1); +} + +static bool is_olh(map& attrs) +{ + map::iterator iter = attrs.find(RGW_ATTR_OLH_INFO); + return (iter != attrs.end()); +} + +static bool has_olh_tag(map& attrs) +{ + map::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG); + return (iter != attrs.end()); +} + +int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + RGWObjState *olh_state, RGWObjState **target_state) +{ + ceph_assert(olh_state->is_olh); + + rgw_obj target; + int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */ + if (r < 0) { + return r; + } + r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + RGWObjState **state, bool follow_olh, bool assume_noent) +{ + if (obj.empty()) { + return -EINVAL; + } + + bool need_follow_olh = follow_olh && obj.key.instance.empty(); + + RGWObjState *s = rctx->get_state(obj); + ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl; + *state = s; + if (s->has_attrs) { + if (s->is_olh && need_follow_olh) { + return get_olh_target_state(*rctx, bucket_info, obj, s, state); + } + return 0; + } + + s->obj = obj; + + rgw_raw_obj raw_obj; + obj_to_raw(bucket_info.placement_rule, obj, &raw_obj); + + int r = -ENOENT; + + if (!assume_noent) { + r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? 
&s->data : NULL), NULL); + } + + if (r == -ENOENT) { + s->exists = false; + s->has_attrs = true; + tombstone_entry entry; + if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) { + s->mtime = entry.mtime; + s->zone_short_id = entry.zone_short_id; + s->pg_ver = entry.pg_ver; + ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj + << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl; + } else { + s->mtime = real_time(); + } + return 0; + } + if (r < 0) + return r; + + s->exists = true; + s->has_attrs = true; + s->accounted_size = s->size; + + auto iter = s->attrset.find(RGW_ATTR_ETAG); + if (iter != s->attrset.end()) { + /* get rid of extra null character at the end of the etag, as we used to store it like that */ + bufferlist& bletag = iter->second; + if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') { + bufferlist newbl; + bletag.splice(0, bletag.length() - 1, &newbl); + bletag.claim(newbl); + } + } + + iter = s->attrset.find(RGW_ATTR_COMPRESSION); + const bool compressed = (iter != s->attrset.end()); + if (compressed) { + // use uncompressed size for accounted_size + try { + RGWCompressionInfo info; + auto p = iter->second.cbegin(); + decode(info, p); + s->accounted_size = info.orig_size; + } catch (buffer::error&) { + dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl; + return -EIO; + } + } + + iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); + if (iter != s->attrset.end()) { + bufferlist bl = iter->second; + bufferlist::iterator it = bl.begin(); + it.copy(bl.length(), s->shadow_obj); + s->shadow_obj[bl.length()] = '\0'; + } + s->obj_tag = s->attrset[RGW_ATTR_ID_TAG]; + auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG); + if (ttiter != s->attrset.end()) { + s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG]; + } + + bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST]; + if (manifest_bl.length()) { + auto miter = manifest_bl.cbegin(); + try { + decode(s->manifest, miter); + 
s->has_manifest = true; + s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be + broken due to old bugs */ + s->size = s->manifest.get_obj_size(); + if (!compressed) + s->accounted_size = s->size; + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl; + return -EIO; + } + ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl; + if (cct->_conf->subsys.should_gather() && \ + s->manifest.has_explicit_objs()) { + RGWObjManifest::obj_iterator mi; + for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) { + ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl; + } + } + + if (!s->obj_tag.length()) { + /* + * Uh oh, something's wrong, object with manifest should have tag. Let's + * create one out of the manifest, would be unique + */ + generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag); + s->fake_tag = true; + } + } + map::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER); + if (aiter != s->attrset.end()) { + bufferlist& pg_ver_bl = aiter->second; + if (pg_ver_bl.length()) { + auto pgbl = pg_ver_bl.cbegin(); + try { + decode(s->pg_ver, pgbl); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl; + } + } + } + aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE); + if (aiter != s->attrset.end()) { + bufferlist& zone_short_id_bl = aiter->second; + if (zone_short_id_bl.length()) { + auto zbl = zone_short_id_bl.cbegin(); + try { + decode(s->zone_short_id, zbl); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl; + } + } + } + if (s->obj_tag.length()) + ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << 
s->obj_tag.c_str() << dendl; + else + ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl; + + /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if + * it exist, and not only if is_olh() returns true + */ + iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG); + if (iter != s->attrset.end()) { + s->olh_tag = iter->second; + } + + if (is_olh(s->attrset)) { + s->is_olh = true; + + ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl; + + if (need_follow_olh) { + return get_olh_target_state(*rctx, bucket_info, obj, s, state); + } else if (obj.key.have_null_instance() && !s->has_manifest) { + // read null version, and the head object only have olh info + s->exists = false; + return -ENOENT; + } + } + + return 0; +} + +int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, + bool follow_olh, bool assume_noent) +{ + int ret; + + do { + ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent); + } while (ret == -EAGAIN); + + return ret; +} + +int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest) +{ + RGWObjState *astate; + int r = get_state(&astate, true); + if (r < 0) { + return r; + } + + *pmanifest = &astate->manifest; + + return 0; +} + +int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest) +{ + RGWObjState *state; + int r = source->get_state(&state, true); + if (r < 0) + return r; + if (!state->exists) + return -ENOENT; + if (!state->get_attr(name, dest)) + return -ENODATA; + + return 0; +} + + +int RGWRados::Object::Stat::stat_async() +{ + RGWObjectCtx& ctx = source->get_ctx(); + rgw_obj& obj = source->get_obj(); + RGWRados *store = source->get_store(); + + RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */ + result.obj = obj; + if (s->has_attrs) { + state.ret = 0; + 
result.size = s->size; + result.mtime = ceph::real_clock::to_timespec(s->mtime); + result.attrs = s->attrset; + result.has_manifest = s->has_manifest; + result.manifest = s->manifest; + return 0; + } + + string oid; + string loc; + get_obj_bucket_and_oid_loc(obj, oid, loc); + + int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + op.stat2(&result.size, &result.mtime, NULL); + op.getxattrs(&result.attrs, NULL); + state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL); + state.io_ctx.locator_set_key(loc); + r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL); + if (r < 0) { + ldout(store->ctx(), 5) << __func__ + << ": ERROR: aio_operate() returned ret=" << r + << dendl; + return r; + } + + return 0; +} + + +int RGWRados::Object::Stat::wait() +{ + if (!state.completion) { + return state.ret; + } + + state.completion->wait_for_safe(); + state.ret = state.completion->get_return_value(); + state.completion->release(); + + if (state.ret != 0) { + return state.ret; + } + + return finish(); +} + +int RGWRados::Object::Stat::finish() +{ + map::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST); + if (iter != result.attrs.end()) { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + try { + decode(result.manifest, biter); + } catch (buffer::error& err) { + RGWRados *store = source->get_store(); + ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl; + return -EIO; + } + result.has_manifest = true; + } + + return 0; +} + +int RGWRados::append_atomic_test(RGWObjectCtx *rctx, + const RGWBucketInfo& bucket_info, const rgw_obj& obj, + ObjectOperation& op, RGWObjState **pstate) +{ + if (!rctx) + return 0; + + int r = get_obj_state(rctx, bucket_info, obj, pstate, false); + if (r < 0) + return r; + + return append_atomic_test(*pstate, op); +} + +int RGWRados::append_atomic_test(const RGWObjState* state, + 
librados::ObjectOperation& op) +{ + if (!state->is_atomic) { + ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl; + return 0; + } + + if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility + op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); + } else { + ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl; + } + return 0; +} + +int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent) +{ + return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent); +} + +void RGWRados::Object::invalidate_state() +{ + ctx.invalidate(obj); +} + +int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag, + const char *if_match, const char *if_nomatch, bool removal_op, + bool modify_tail) +{ + int r = get_state(&state, false); + if (r < 0) + return r; + + bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) || + if_match != NULL || if_nomatch != NULL) && + (!state->fake_tag); + + if (!state->is_atomic) { + ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. 
state=" << (void *)state << dendl; + + if (reset_obj) { + op.create(false); + store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object + } + + return 0; + } + + if (need_guard) { + /* first verify that the object wasn't replaced under */ + if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) { + op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); + // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion + } + + if (if_match) { + if (strcmp(if_match, "*") == 0) { + // test the object is existing + if (!state->exists) { + return -ERR_PRECONDITION_FAILED; + } + } else { + bufferlist bl; + if (!state->get_attr(RGW_ATTR_ETAG, bl) || + strncmp(if_match, bl.c_str(), bl.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + } + + if (if_nomatch) { + if (strcmp(if_nomatch, "*") == 0) { + // test the object is NOT existing + if (state->exists) { + return -ERR_PRECONDITION_FAILED; + } + } else { + bufferlist bl; + if (!state->get_attr(RGW_ATTR_ETAG, bl) || + strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) { + return -ERR_PRECONDITION_FAILED; + } + } + } + } + + if (reset_obj) { + if (state->exists) { + op.create(false); + store->remove_rgw_head_obj(op); + } else { + op.create(true); + } + } + + if (removal_op) { + /* the object is being removed, no need to update its tag */ + return 0; + } + + if (ptag) { + state->write_tag = *ptag; + } else { + append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32); + } + bufferlist bl; + bl.append(state->write_tag.c_str(), state->write_tag.size() + 1); + + ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl; + + op.setxattr(RGW_ATTR_ID_TAG, bl); + if (modify_tail) { + op.setxattr(RGW_ATTR_TAIL_TAG, bl); + } + + return 0; +} + +/** + * Set an attr on an object. 
+ * bucket: name of the bucket holding the object + * obj: name of the object to set the attr on + * name: the attr to set + * bl: the contents of the attr + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl) +{ + map attrs; + attrs[name] = bl; + return set_attrs(ctx, bucket_info, obj, attrs, NULL); +} + +int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj, + map& attrs, + map* rmattrs) +{ + rgw_obj obj = src_obj; + if (obj.key.instance == "null") { + obj.key.instance.clear(); + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + RGWObjectCtx *rctx = static_cast(ctx); + + ObjectWriteOperation op; + RGWObjState *state = NULL; + + r = append_atomic_test(rctx, bucket_info, obj, op, &state); + if (r < 0) + return r; + + // ensure null version object exist + if (src_obj.key.instance == "null" && !state->has_manifest) { + return -ENOENT; + } + + map::iterator iter; + if (rmattrs) { + for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + const rgw_bucket& bucket = obj.bucket; + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_DELETE_AT) == 0) { + real_time ts; + try { + decode(ts, bl); + + rgw_obj_index_key obj_key; + obj.key.get_index_key(&obj_key); + + objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl; + } + } + } + + if (!op.size()) + return 0; + + RGWObjectCtx obj_ctx(this); + + bufferlist bl; + RGWRados::Bucket bop(this, bucket_info); + 
RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + if (state) { + string tag; + append_rand_alpha(cct, tag, tag, 32); + state->write_tag = tag; + r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag); + + if (r < 0) + return r; + + bl.append(tag.c_str(), tag.size() + 1); + op.setxattr(RGW_ATTR_ID_TAG, bl); + } + + + real_time mtime = real_clock::now(); + struct timespec mtime_ts = real_clock::to_timespec(mtime); + op.mtime2(&mtime_ts); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (state) { + if (r >= 0) { + bufferlist acl_bl = attrs[RGW_ATTR_ACL]; + bufferlist etag_bl = attrs[RGW_ATTR_ETAG]; + bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE]; + string etag = rgw_bl_str(etag_bl); + string content_type = rgw_bl_str(content_type_bl); + string storage_class; + auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != attrs.end()) { + storage_class = rgw_bl_str(iter->second); + } + uint64_t epoch = ref.ioctx.get_last_version(); + int64_t poolid = ref.ioctx.get_id(); + r = index_op.complete(poolid, epoch, state->size, state->accounted_size, + mtime, etag, content_type, storage_class, &acl_bl, + RGWObjCategory::Main, NULL); + } else { + int ret = index_op.cancel(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl; + } + } + } + if (r < 0) + return r; + + if (state) { + state->obj_tag.swap(bl); + if (rmattrs) { + for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { + state->attrset.erase(iter->first); + } + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + state->attrset[iter->first] = iter->second; + } + + auto iter = state->attrset.find(RGW_ATTR_ID_TAG); + if (iter != state->attrset.end()) { + iter->second = state->obj_tag; + } + } + + return 0; +} + +int RGWRados::Object::Read::prepare() +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + + bufferlist etag; + + map::iterator iter; + + RGWObjState *astate; + int r = 
source->get_state(&astate, true); + if (r < 0) + return r; + + if (!astate->exists) { + return -ENOENT; + } + + const RGWBucketInfo& bucket_info = source->get_bucket_info(); + + state.obj = astate->obj; + store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj); + + state.cur_pool = state.head_obj.pool; + state.cur_ioctx = &state.io_ctxs[state.cur_pool]; + + r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx); + if (r < 0) { + return r; + } + if (params.target_obj) { + *params.target_obj = state.obj; + } + if (params.attrs) { + *params.attrs = astate->attrset; + if (cct->_conf->subsys.should_gather()) { + for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) { + ldout(cct, 20) << "Read xattr: " << iter->first << dendl; + } + } + } + + /* Convert all times go GMT to make them compatible */ + if (conds.mod_ptr || conds.unmod_ptr) { + obj_time_weight src_weight; + src_weight.init(astate); + src_weight.high_precision = conds.high_precision_time; + + obj_time_weight dest_weight; + dest_weight.high_precision = conds.high_precision_time; + + if (conds.mod_ptr) { + dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver); + ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; + if (!(dest_weight < src_weight)) { + return -ERR_NOT_MODIFIED; + } + } + + if (conds.unmod_ptr) { + dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver); + ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; + if (dest_weight < src_weight) { + return -ERR_PRECONDITION_FAILED; + } + } + } + if (conds.if_match || conds.if_nomatch) { + r = get_attr(RGW_ATTR_ETAG, etag); + if (r < 0) + return r; + + + + if (conds.if_match) { + string if_match_str = rgw_string_unquote(conds.if_match); + ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl; + if 
(if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + + if (conds.if_nomatch) { + string if_nomatch_str = rgw_string_unquote(conds.if_nomatch); + ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl; + if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) { + return -ERR_NOT_MODIFIED; + } + } + } + + if (params.obj_size) + *params.obj_size = astate->size; + if (params.lastmod) + *params.lastmod = astate->mtime; + + return 0; +} + +int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) +{ + if (ofs < 0) { + ofs += obj_size; + if (ofs < 0) + ofs = 0; + end = obj_size - 1; + } else if (end < 0) { + end = obj_size - 1; + } + + if (obj_size > 0) { + if (ofs >= (off_t)obj_size) { + return -ERANGE; + } + if (end >= (off_t)obj_size) { + end = obj_size - 1; + } + } + return 0; +} + +int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function call) +{ + RGWRados *store = target->get_store(); + BucketShard *bs; + int r; + +#define NUM_RESHARD_RETRIES 10 + for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { + int ret = get_bucket_shard(&bs); + if (ret < 0) { + ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + r = call(bs); + if (r != -ERR_BUSY_RESHARDING) { + break; + } + ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl; + string new_bucket_id; + r = store->block_while_resharding(bs, &new_bucket_id, + target->bucket_info, null_yield); + if (r == -ERR_BUSY_RESHARDING) { + continue; + } + if (r < 0) { + return r; + } + ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl; + i = 0; /* resharding is finished, make sure we can retry */ + r = target->update_bucket_id(new_bucket_id); + if (r < 0) { + ldout(store->ctx(), 0) << 
"ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl; + return r; + } + invalidate_bs(); + } // for loop + + if (r < 0) { + return r; + } + + if (pbs) { + *pbs = bs; + } + + return 0; +} + +int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + + if (write_tag && write_tag->length()) { + optag = string(write_tag->c_str(), write_tag->length()); + } else { + if (optag.empty()) { + append_rand_alpha(store->ctx(), optag, optag, 32); + } + } + + int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int { + return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace); + }); + + if (r < 0) { + return r; + } + prepared = true; + + return 0; +} + +int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch, + uint64_t size, uint64_t accounted_size, + ceph::real_time& ut, const string& etag, + const string& content_type, const string& storage_class, + bufferlist *acl_bl, + RGWObjCategory category, + list *remove_objs, const string *user_data, + bool appendable) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs; + + int ret = get_bucket_shard(&bs); + if (ret < 0) { + ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + + rgw_bucket_dir_entry ent; + obj.key.get_index_key(&ent.key); + ent.meta.size = size; + ent.meta.accounted_size = accounted_size; + ent.meta.mtime = ut; + ent.meta.etag = etag; + ent.meta.storage_class = storage_class; + if (user_data) + ent.meta.user_data = *user_data; + + ACLOwner owner; + if (acl_bl && acl_bl->length()) { + int ret = store->decode_policy(*acl_bl, &owner); + if (ret < 0) { + ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl; + } + } + ent.meta.owner = owner.get_id().to_str(); + ent.meta.owner_display_name = owner.get_display_name(); + 
ent.meta.content_type = content_type; + ent.meta.appendable = appendable; + + ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace); + + if (target->bucket_info.datasync_flag_enabled()) { + int r = store->data_log->add_entry(bs->bucket, bs->shard_id); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; + } + } + + return ret; +} + +int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch, + real_time& removed_mtime, + list *remove_objs) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs; + + int ret = get_bucket_shard(&bs); + if (ret < 0) { + ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + + ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace); + + if (target->bucket_info.datasync_flag_enabled()) { + int r = store->data_log->add_entry(bs->bucket, bs->shard_id); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; + } + } + + return ret; +} + + +int RGWRados::Bucket::UpdateIndex::cancel() +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs; + + int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int { + return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace); + }); + + /* + * need to update data log anyhow, so that whoever follows needs to update its internal markers + * for following the specific bucket shard log. 
Otherwise they end up staying behind, and users + * have no way to tell that they're all caught up + */ + if (target->bucket_info.datasync_flag_enabled()) { + int r = store->data_log->add_entry(bs->bucket, bs->shard_id); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; + } + } + + return ret; +} + +int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl) +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + + rgw_raw_obj read_obj; + uint64_t read_ofs = ofs; + uint64_t len, read_len; + bool reading_from_head = true; + ObjectReadOperation op; + + bool merge_bl = false; + bufferlist *pbl = &bl; + bufferlist read_bl; + uint64_t max_chunk_size; + + RGWObjState *astate; + int r = source->get_state(&astate, true); + if (r < 0) + return r; + + if (astate->size == 0) { + end = 0; + } else if (end >= (int64_t)astate->size) { + end = astate->size - 1; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (astate->has_manifest && astate->manifest.has_tail()) { + /* now get the relevant object part */ + RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs); + + uint64_t stripe_ofs = iter.get_stripe_ofs(); + read_obj = iter.get_location().get_raw_obj(store); + len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); + read_ofs = iter.location_ofs() + (ofs - stripe_ofs); + reading_from_head = (read_obj == state.head_obj); + } else { + read_obj = state.head_obj; + } + + r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl; + return r; + } + + if (len > max_chunk_size) + len = max_chunk_size; + + + read_len = len; + + if (reading_from_head) { + /* only when reading from the head object do we need to do the atomic test */ + r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate); + if (r < 0) + return r; + 
+ if (astate && astate->prefetch_data) { + if (!ofs && astate->data.length() >= len) { + bl = astate->data; + return bl.length(); + } + + if (ofs < astate->data.length()) { + unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len); + astate->data.copy(ofs, copy_len, bl); + read_len -= copy_len; + read_ofs += copy_len; + if (!read_len) + return bl.length(); + + merge_bl = true; + pbl = &read_bl; + } + } + } + + ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl; + op.read(read_ofs, read_len, pbl, NULL); + + if (state.cur_pool != read_obj.pool) { + auto iter = state.io_ctxs.find(read_obj.pool); + if (iter == state.io_ctxs.end()) { + state.cur_ioctx = &state.io_ctxs[read_obj.pool]; + r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx, false); + if (r < 0) { + ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl; + return r; + } + } else { + state.cur_ioctx = &iter->second; + } + state.cur_pool = read_obj.pool; + } + + state.cur_ioctx->locator_set_key(read_obj.loc); + + r = state.cur_ioctx->operate(read_obj.oid, &op, NULL); + ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl; + + if (r < 0) { + return r; + } + + if (merge_bl) { + bl.append(read_bl); + } + + return bl.length(); +} + +struct get_obj_data { + RGWRados* store; + RGWGetDataCB* client_cb; + rgw::Aio* aio; + uint64_t offset; // next offset to write to client + rgw::AioResultList completed; // completed read results, sorted by offset + + get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio, uint64_t offset) + : store(store), client_cb(cb), aio(aio), offset(offset) {} + + int flush(rgw::AioResultList&& results) { + int r = rgw::check_for_errors(results); + if (r < 0) { + return r; + } + + auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; }; + results.sort(cmp); // merge() requires results to be sorted first + 
completed.merge(results, cmp); // merge results in sorted order + + while (!completed.empty() && completed.front().id == offset) { + auto bl = std::move(completed.front().data); + completed.pop_front_and_dispose(std::default_delete{}); + + offset += bl.length(); + int r = client_cb->handle_data(bl, 0, bl.length()); + if (r < 0) { + return r; + } + } + return 0; + } + + void cancel() { + // wait for all completions to drain and ignore the results + aio->drain(); + } + + int drain() { + auto c = aio->wait(); + while (!c.empty()) { + int r = flush(std::move(c)); + if (r < 0) { + cancel(); + return r; + } + c = aio->wait(); + } + return flush(std::move(c)); + } +}; + +static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) +{ + struct get_obj_data *d = (struct get_obj_data *)arg; + + return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len, + is_head_obj, astate, arg); +} + +int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) +{ + ObjectReadOperation op; + struct get_obj_data *d = (struct get_obj_data *)arg; + string oid, key; + + if (is_head_obj) { + /* only when reading from the head object do we need to do the atomic test */ + int r = append_atomic_test(astate, op); + if (r < 0) + return r; + + if (astate && + obj_ofs < astate->data.length()) { + unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len); + + r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len); + if (r < 0) + return r; + + len -= chunk_len; + d->offset += chunk_len; + read_ofs += chunk_len; + obj_ofs += chunk_len; + if (!len) + return 0; + } + } + + auto obj = d->store->svc.rados->obj(read_obj); + int r = obj.open(); + if (r < 0) { + ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl; + return r; + } + + ldout(cct, 20) << 
"rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl; + op.read(read_ofs, len, nullptr, nullptr); + + const uint64_t cost = len; + const uint64_t id = obj_ofs; // use logical object offset for sorting replies + + auto completed = d->aio->submit(obj, &op, cost, id); + + return d->flush(std::move(completed)); +} + +int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb) +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + RGWObjectCtx& obj_ctx = source->get_ctx(); + const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size; + const uint64_t window_size = cct->_conf->rgw_get_obj_window_size; + + rgw::AioThrottle aio(window_size); + get_obj_data data(store, cb, &aio, ofs); + + int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, + ofs, end, chunk_size, _get_obj_iterate_cb, &data); + if (r < 0) { + ldout(cct, 0) << "iterate_obj() failed with " << r << dendl; + data.cancel(); // drain completions without writing back to client + return r; + } + + return data.drain(); +} + +int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx, + const RGWBucketInfo& bucket_info, const rgw_obj& obj, + off_t ofs, off_t end, uint64_t max_chunk_size, + iterate_obj_cb cb, void *arg) +{ + rgw_raw_obj head_obj; + rgw_raw_obj read_obj; + uint64_t read_ofs = ofs; + uint64_t len; + bool reading_from_head = true; + RGWObjState *astate = NULL; + + obj_to_raw(bucket_info.placement_rule, obj, &head_obj); + + int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false); + if (r < 0) { + return r; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (astate->has_manifest) { + /* now get the relevant object stripe */ + RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs); + + RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end(); + + for (; iter != obj_end && ofs <= end; ++iter) { + off_t stripe_ofs = 
iter.get_stripe_ofs(); + off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size(); + + while (ofs < next_stripe_ofs && ofs <= end) { + read_obj = iter.get_location().get_raw_obj(this); + uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); + read_ofs = iter.location_ofs() + (ofs - stripe_ofs); + + if (read_len > max_chunk_size) { + read_len = max_chunk_size; + } + + reading_from_head = (read_obj == head_obj); + r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg); + if (r < 0) { + return r; + } + + len -= read_len; + ofs += read_len; + } + } + } else { + while (ofs <= end) { + read_obj = head_obj; + uint64_t read_len = std::min(len, max_chunk_size); + + r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg); + if (r < 0) { + return r; + } + + len -= read_len; + ofs += read_len; + } + } + + return 0; +} + +int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + return ref.ioctx.operate(ref.obj.oid, op); +} + +int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + bufferlist outbl; + + return ref.ioctx.operate(ref.obj.oid, op, &outbl); +} + +int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag) +{ + ObjectWriteOperation op; + + ceph_assert(olh_obj.key.instance.empty()); + + bool has_tag = (state.exists && has_olh_tag(state.attrset)); + + if (!state.exists) { + op.create(true); + } else { + op.assert_exists(); + struct timespec mtime_ts = real_clock::to_timespec(state.mtime); + op.mtime2(&mtime_ts); + } + + /* + * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object. 
+ * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two + * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to + * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh + * log will reflect that. + * + * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag + * is used for object data instance, olh_tag for olh instance. + */ + if (has_tag) { + /* guard against racing writes */ + bucket_index_guard_olh_op(state, op); + } + + if (!has_tag) { + /* obj tag */ + string obj_tag = gen_rand_alphanumeric_lower(cct, 32); + + bufferlist bl; + bl.append(obj_tag.c_str(), obj_tag.size()); + op.setxattr(RGW_ATTR_ID_TAG, bl); + + state.attrset[RGW_ATTR_ID_TAG] = bl; + state.obj_tag = bl; + + /* olh tag */ + string olh_tag = gen_rand_alphanumeric_lower(cct, 32); + + bufferlist olh_bl; + olh_bl.append(olh_tag.c_str(), olh_tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl); + + state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl; + state.olh_tag = olh_bl; + state.is_olh = true; + + bufferlist verbl; + op.setxattr(RGW_ATTR_OLH_VER, verbl); + } + + bufferlist bl; + RGWOLHPendingInfo pending_info; + pending_info.time = real_clock::now(); + encode(pending_info, bl); + +#define OLH_PENDING_TAG_LEN 32 + /* tag will start with current time epoch, this so that entries are sorted by time */ + char buf[32]; + utime_t ut(pending_info.time); + snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec()); + *op_tag = buf; + + string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size()); + + op_tag->append(s); + + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(*op_tag); + + op.setxattr(attr_name.c_str(), bl); + + int ret = obj_operate(bucket_info, olh_obj, &op); + if (ret < 0) { + return ret; + } + + state.exists = true; + state.attrset[attr_name] = bl; + + return 0; +} + +int 
RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag) +{ + int ret; + + ret = olh_init_modification_impl(bucket_info, state, obj, op_tag); + if (ret == -EEXIST) { + ret = -ECANCELED; + } + + return ret; +} + +int RGWRados::guard_reshard(BucketShard *bs, + const rgw_obj& obj_instance, + const RGWBucketInfo& bucket_info, + std::function call) +{ + rgw_obj obj; + const rgw_obj *pobj = &obj_instance; + int r; + + for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { + r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */); + if (r < 0) { + ldout(cct, 5) << "bs.init() returned ret=" << r << dendl; + return r; + } + r = call(bs); + if (r != -ERR_BUSY_RESHARDING) { + break; + } + ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl; + string new_bucket_id; + r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield); + if (r == -ERR_BUSY_RESHARDING) { + continue; + } + if (r < 0) { + return r; + } + ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl; + i = 0; /* resharding is finished, make sure we can retry */ + + obj = *pobj; + obj.bucket.update_bucket_id(new_bucket_id); + pobj = &obj; + } // for loop + + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, + string *new_bucket_id, + const RGWBucketInfo& bucket_info, + optional_yield y) +{ + int ret = 0; + cls_rgw_bucket_instance_entry entry; + + // since we want to run this recovery code from two distinct places, + // let's just put it in a lambda so we can easily re-use; if the + // lambda successfully fetches a new bucket id, it sets + // new_bucket_id and returns 0, otherwise it returns a negative + // error code + auto fetch_new_bucket_id = + [this, bucket_info](const std::string& log_tag, + std::string* new_bucket_id) -> int { + RGWBucketInfo fresh_bucket_info = bucket_info; + int ret = 
try_refresh_bucket_info(fresh_bucket_info, nullptr); + if (ret < 0) { + ldout(cct, 0) << __func__ << + " ERROR: failed to refresh bucket info after reshard at " << + log_tag << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + *new_bucket_id = fresh_bucket_info.bucket.bucket_id; + return 0; + }; + + constexpr int num_retries = 10; + for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop + ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry); + if (ret == -ENOENT) { + return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id); + } else if (ret < 0) { + ldout(cct, 0) << __func__ << + " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) << + dendl; + return ret; + } + + if (!entry.resharding_in_progress()) { + return fetch_new_bucket_id("get_bucket_resharding_succeeded", + new_bucket_id); + } + + ldout(cct, 20) << "NOTICE: reshard still in progress; " << + (i < num_retries ? "retrying" : "too many retries") << dendl; + + if (i == num_retries) { + break; + } + + // If bucket is erroneously marked as resharding (e.g., crash or + // other error) then fix it. If we can take the bucket reshard + // lock then it means no other resharding should be taking place, + // and we're free to clear the flags. 
+ { + // since we expect to do this rarely, we'll do our work in a + // block and erase our work after each try + + RGWObjectCtx obj_ctx(this); + const rgw_bucket& b = bs->bucket; + std::string bucket_id = b.get_key(); + RGWBucketReshardLock reshard_lock(this, bucket_info, true); + ret = reshard_lock.lock(); + if (ret < 0) { + ldout(cct, 20) << __func__ << + " INFO: failed to take reshard lock for bucket " << + bucket_id << "; expected if resharding underway" << dendl; + } else { + ldout(cct, 10) << __func__ << + " INFO: was able to take reshard lock for bucket " << + bucket_id << dendl; + ret = RGWBucketReshard::clear_resharding(this, bucket_info); + if (ret < 0) { + reshard_lock.unlock(); + ldout(cct, 0) << __func__ << + " ERROR: failed to clear resharding flags for bucket " << + bucket_id << dendl; + } else { + reshard_lock.unlock(); + ldout(cct, 5) << __func__ << + " INFO: apparently successfully cleared resharding flags for " + "bucket " << bucket_id << dendl; + continue; // if we apparently succeed immediately test again + } // if clear resharding succeeded + } // if taking of lock succeeded + } // block to encapsulate recovery from incomplete reshard + + ret = reshard_wait->wait(y); + if (ret < 0) { + ldout(cct, 0) << __func__ << + " ERROR: bucket is still resharding, please retry" << dendl; + return ret; + } + } // for loop + + ldout(cct, 0) << __func__ << + " ERROR: bucket is still resharding, please retry" << dendl; + return -ERR_BUSY_RESHARDING; +} + +int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance, + bool delete_marker, + const string& op_tag, + struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, + real_time unmod_since, bool high_precision_time, + rgw_zone_set *_zones_trace, bool log_data_change) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + rgw_zone_set zones_trace; + if (_zones_trace) { + 
zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id); + + BucketShard bs(this); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); + r = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + librados::ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_bucket_link_olh(bs->index_ctx, op, + bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch, + unmod_since, high_precision_time, + svc.zone->get_zone().log_data, zones_trace); + }); + if (r < 0) { + ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl; + return r; + } + + if (log_data_change && bucket_info.datasync_flag_enabled()) { + data_log->add_entry(bs.bucket, bs.shard_id); + } + + return 0; +} + +void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op) +{ + ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl; + op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag); +} + +int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance, + const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id); + + BucketShard bs(this); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); + r = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + librados::ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag, + 
olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace); + }); + if (r < 0) { + ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl; + return r; + } + + return 0; +} + +int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver_marker, + map > *log, + bool *is_truncated) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */); + if (ret < 0) { + ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + ret = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + ObjectReadOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op, + key, ver_marker, olh_tag, log, is_truncated); + }); + if (ret < 0) { + ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl; + return ret; + } + + return 0; +} + +// a multisite sync bug resulted in the OLH head attributes being overwritten by +// the attributes from another zone, causing link_olh() to fail endlessly due to +// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH +// attributes from the bucket index. 
see http://tracker.ceph.com/issues/37792 +int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + // fetch the current olh entry from the bucket index + rgw_bucket_olh_entry olh; + int r = bi_get_olh(bucket_info, obj, &olh); + if (r < 0) { + ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl; + return r; + } + if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved? + return 0; + } + + ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag + << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl; + + // rewrite OLH_ID_TAG and OLH_INFO from current olh + ObjectWriteOperation op; + // assert this is the same olh tag we think we're fixing + bucket_index_guard_olh_op(*state, op); + // preserve existing mtime + struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime); + op.mtime2(&mtime_ts); + { + bufferlist bl; + bl.append(olh.tag.c_str(), olh.tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, bl); + } + { + RGWOLHInfo info; + info.target = rgw_obj(bucket_info.bucket, olh.key); + info.removed = olh.delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + rgw_rados_ref ref; + r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) { + ldout(cct, 0) << "repair_olh failed to write olh attributes with " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */); + if (ret < 0) { + ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + string 
olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + ret = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_trim_olh_log(op, key, ver, olh_tag); + return pbs->index_ctx.operate(pbs->bucket_obj, &op); + }); + if (ret < 0) { + ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + int ret = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag); + }); + if (ret < 0) { + ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh) +{ + try { + auto biter = bl.cbegin(); + decode(*olh, biter); + return 0; + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl; + return -EIO; + } +} + +int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + bufferlist& olh_tag, map >& log, + uint64_t *plast_ver, rgw_zone_set* zones_trace) +{ + if (log.empty()) { + return 0; + } + + librados::ObjectWriteOperation op; + + uint64_t last_ver = log.rbegin()->first; + *plast_ver = last_ver; + + map >::iterator 
iter = log.begin(); + + op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); + op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver); + + bufferlist ver_bl; + string last_ver_s = to_string(last_ver); + ver_bl.append(last_ver_s.c_str(), last_ver_s.size()); + op.setxattr(RGW_ATTR_OLH_VER, ver_bl); + + struct timespec mtime_ts = real_clock::to_timespec(state.mtime); + op.mtime2(&mtime_ts); + + bool need_to_link = false; + uint64_t link_epoch = 0; + cls_rgw_obj_key key; + bool delete_marker = false; + list remove_instances; + bool need_to_remove = false; + + // decode current epoch and instance + auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER); + if (olh_ver != state.attrset.end()) { + std::string str = olh_ver->second.to_str(); + std::string err; + link_epoch = strict_strtoll(str.c_str(), 10, &err); + } + auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO); + if (olh_info != state.attrset.end()) { + RGWOLHInfo info; + int r = decode_olh_info(cct, olh_info->second, &info); + if (r < 0) { + return r; + } + info.target.key.get_index_key(&key); + delete_marker = info.removed; + } + + for (iter = log.begin(); iter != log.end(); ++iter) { + vector::iterator viter = iter->second.begin(); + for (; viter != iter->second.end(); ++viter) { + rgw_bucket_olh_log_entry& entry = *viter; + + ldout(cct, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op + << " key=" << entry.key.name << "[" << entry.key.instance << "] " + << (entry.delete_marker ? 
"(delete)" : "") << dendl; + switch (entry.op) { + case CLS_RGW_OLH_OP_REMOVE_INSTANCE: + remove_instances.push_back(entry.key); + break; + case CLS_RGW_OLH_OP_LINK_OLH: + // only overwrite a link of the same epoch if its key sorts before + if (link_epoch < iter->first || key.instance.empty() || + key.instance > entry.key.instance) { + ldout(cct, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker + << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; + need_to_link = true; + need_to_remove = false; + key = entry.key; + delete_marker = entry.delete_marker; + } else { + ldout(cct, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker + << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; + } + break; + case CLS_RGW_OLH_OP_UNLINK_OLH: + need_to_remove = true; + need_to_link = false; + break; + default: + ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl; + return -EIO; + } + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(entry.op_tag); + op.rmxattr(attr_name.c_str()); + } + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + const rgw_bucket& bucket = obj.bucket; + + if (need_to_link) { + rgw_obj target(bucket, key); + RGWOLHInfo info; + info.target = target; + info.removed = delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + + /* first remove object instances */ + for (list::iterator liter = remove_instances.begin(); + liter != remove_instances.end(); ++liter) { + cls_rgw_obj_key& key = *liter; + rgw_obj obj_instance(bucket, key); + int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace); + if (ret < 0 && ret != -ENOENT) 
{ + ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl; + return ret; + } + } + + /* update olh object */ + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r == -ECANCELED) { + r = 0; + } + if (r < 0) { + ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl; + return r; + } + + r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver); + if (r < 0) { + ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl; + return r; + } + + if (need_to_remove) { + ObjectWriteOperation rm_op; + + rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); + rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver); + cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */ + rm_op.remove(); + + r = ref.ioctx.operate(ref.obj.oid, &rm_op); + if (r == -ECANCELED) { + return 0; /* someone else won this race */ + } else { + /* + * only clear if was successful, otherwise we might clobber pending operations on this object + */ + r = bucket_index_clear_olh(bucket_info, state, obj); + if (r < 0) { + ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl; + return r; + } + } + } + + return 0; +} + +/* + * read olh log and apply it + */ +int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace) +{ + map > log; + bool is_truncated; + uint64_t ver_marker = 0; + + do { + int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated); + if (ret < 0) { + return ret; + } + ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace); + if (ret < 0) { + return ret; + } + } while (is_truncated); + + return 0; +} + +int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, 
rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, + rgw_zone_set *zones_trace, bool log_data_change) +{ + string op_tag; + + rgw_obj olh_obj = target_obj; + olh_obj.key.instance.clear(); + + RGWObjState *state = NULL; + + int ret = 0; + int i; + +#define MAX_ECANCELED_RETRY 100 + for (i = 0; i < MAX_ECANCELED_RETRY; i++) { + if (ret == -ECANCELED) { + obj_ctx.invalidate(olh_obj); + } + + ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */ + if (ret < 0) { + return ret; + } + + ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag); + if (ret < 0) { + ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, + op_tag, meta, olh_epoch, unmod_since, high_precision_time, + zones_trace, log_data_change); + if (ret < 0) { + ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; + if (ret == -ECANCELED) { + // the bucket index rejected the link_olh() due to olh tag mismatch; + // attempt to reconstruct olh head attributes based on the bucket index + int r2 = repair_olh(state, bucket_info, olh_obj); + if (r2 < 0 && r2 != -ECANCELED) { + return r2; + } + continue; + } + return ret; + } + break; + } + + if (i == MAX_ECANCELED_RETRY) { + ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; + return -EIO; + } + + ret = update_olh(obj_ctx, state, bucket_info, olh_obj); + if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ + ret = 0; + } + if (ret < 0) { + ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; + return ret; + } + + return 0; +} + 
+int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, + uint64_t olh_epoch, rgw_zone_set *zones_trace) +{ + string op_tag; + + rgw_obj olh_obj = target_obj; + olh_obj.key.instance.clear(); + + RGWObjState *state = NULL; + + int ret = 0; + int i; + + for (i = 0; i < MAX_ECANCELED_RETRY; i++) { + if (ret == -ECANCELED) { + obj_ctx.invalidate(olh_obj); + } + + ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */ + if (ret < 0) + return ret; + + ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag); + if (ret < 0) { + ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + + string olh_tag(state->olh_tag.c_str(), state->olh_tag.length()); + + ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace); + if (ret < 0) { + ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + break; + } + + if (i == MAX_ECANCELED_RETRY) { + ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; + return -EIO; + } + + ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace); + if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ + return 0; + } + if (ret < 0) { + ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; + return ret; + } + + return 0; +} + +void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key) +{ +#define OBJ_INSTANCE_LEN 32 + char buf[OBJ_INSTANCE_LEN + 1]; + + gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped, + no underscore for instance name due to the way we encode the raw keys */ + + 
target_key->set_instance(buf); +} + +void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj) +{ + gen_rand_obj_instance_name(&target_obj->key); +} + +int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh) +{ + map attrset; + + ObjectReadOperation op; + op.getxattrs(&attrset, NULL); + + int r = obj_operate(bucket_info, obj, &op); + if (r < 0) { + return r; + } + + auto iter = attrset.find(RGW_ATTR_OLH_INFO); + if (iter == attrset.end()) { /* not an olh */ + return -EINVAL; + } + + return decode_olh_info(cct, iter->second, olh); +} + +void RGWRados::check_pending_olh_entries(map& pending_entries, + map *rm_pending_entries) +{ + map::iterator iter = pending_entries.begin(); + + real_time now = real_clock::now(); + + while (iter != pending_entries.end()) { + auto biter = iter->second.cbegin(); + RGWOLHPendingInfo pending_info; + try { + decode(pending_info, biter); + } catch (buffer::error& err) { + /* skipping bad entry, we could remove it but it might hide a bug */ + ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl; + ++iter; + continue; + } + + map::iterator cur_iter = iter; + ++iter; + if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) { + (*rm_pending_entries)[cur_iter->first] = cur_iter->second; + pending_entries.erase(cur_iter); + } else { + /* entries names are sorted by time (rounded to a second) */ + break; + } + } +} + +int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map& pending_attrs) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(bucket_info, olh_obj, &ref); + if (r < 0) { + return r; + } + + // trim no more than 1000 entries per osd op + constexpr int max_entries = 1000; + + auto i = pending_attrs.begin(); + while (i != pending_attrs.end()) { + ObjectWriteOperation op; + bucket_index_guard_olh_op(state, op); + + for (int n = 0; n < max_entries && i != 
pending_attrs.end(); ++n, ++i) { + op.rmxattr(i->first.c_str()); + } + + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r == -ENOENT || r == -ECANCELED) { + /* raced with some other change, shouldn't sweat about it */ + return 0; + } + if (r < 0) { + ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl; + return r; + } + } + return 0; +} + +int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target) +{ + map pending_entries; + rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries); + + map rm_pending_entries; + check_pending_olh_entries(pending_entries, &rm_pending_entries); + + if (!rm_pending_entries.empty()) { + int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries); + if (ret < 0) { + ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl; + return ret; + } + } + if (!pending_entries.empty()) { + ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl; + + int ret = update_olh(obj_ctx, state, bucket_info, olh_obj); + if (ret < 0) { + return ret; + } + } + + auto iter = state->attrset.find(RGW_ATTR_OLH_INFO); + if (iter == state->attrset.end()) { + return -EINVAL; + } + + RGWOLHInfo olh; + int ret = decode_olh_info(cct, iter->second, &olh); + if (ret < 0) { + return ret; + } + + if (olh.removed) { + return -ENOENT; + } + + *target = olh.target; + + return 0; +} + +int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + map unfiltered_attrset; + uint64_t size = 0; + struct timespec mtime_ts; + + ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + if (attrs) { + 
op.getxattrs(&unfiltered_attrset, NULL); + } + if (psize || pmtime) { + op.stat2(&size, &mtime_ts, NULL); + } + if (first_chunk) { + op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL); + } + bufferlist outbl; + r = ref.ioctx.operate(ref.obj.oid, &op, &outbl); + + if (epoch) { + *epoch = ref.ioctx.get_last_version(); + } + + if (r < 0) + return r; + + if (psize) + *psize = size; + if (pmtime) + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + if (attrs) { + rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs); + } + + return 0; +} + +int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver, + map& stats, string *max_marker, bool *syncstopped) +{ + vector headers; + map bucket_instance_ids; + int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids); + if (r < 0) { + return r; + } + + ceph_assert(headers.size() == bucket_instance_ids.size()); + + auto iter = headers.begin(); + map::iterator viter = bucket_instance_ids.begin(); + BucketIndexShardsManager ver_mgr; + BucketIndexShardsManager master_ver_mgr; + BucketIndexShardsManager marker_mgr; + char buf[64]; + for(; iter != headers.end(); ++iter, ++viter) { + accumulate_raw_stats(*iter, stats); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver); + ver_mgr.add(viter->first, string(buf)); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver); + master_ver_mgr.add(viter->first, string(buf)); + if (shard_id >= 0) { + *max_marker = iter->max_marker; + } else { + marker_mgr.add(viter->first, iter->max_marker); + } + if (syncstopped != NULL) + *syncstopped = iter->syncstopped; + } + ver_mgr.to_string(bucket_ver); + master_ver_mgr.to_string(master_ver); + if (shard_id < 0) { + marker_mgr.to_string(max_marker); + } + return 0; +} + +int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id, + map& markers) +{ + vector headers; + map bucket_instance_ids; + int r = 
cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids); + if (r < 0) + return r; + + ceph_assert(headers.size() == bucket_instance_ids.size()); + + auto iter = headers.begin(); + map::iterator viter = bucket_instance_ids.begin(); + + for(; iter != headers.end(); ++iter, ++viter) { + if (shard_id >= 0) { + markers[shard_id] = iter->max_marker; + } else { + markers[viter->first] = iter->max_marker; + } + } + return 0; +} + +class RGWGetBucketStatsContext : public RGWGetDirHeader_CB { + RGWGetBucketStats_CB *cb; + uint32_t pendings; + map stats; + int ret_code; + bool should_cb; + Mutex lock; + +public: + RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings) + : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true), + lock("RGWGetBucketStatsContext") {} + + void handle_response(int r, rgw_bucket_dir_header& header) override { + Mutex::Locker l(lock); + if (should_cb) { + if ( r >= 0) { + accumulate_raw_stats(header, stats); + } else { + ret_code = r; + } + + // Are we all done? + if (--pendings == 0) { + if (!ret_code) { + cb->set_response(&stats); + } + cb->handle_response(ret_code); + cb->put(); + } + } + } + + void unset_cb() { + Mutex::Locker l(lock); + should_cb = false; + } +}; + +int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx) +{ + int num_aio = 0; + RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? 
: 1); + ceph_assert(get_ctx); + int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio); + if (r < 0) { + ctx->put(); + if (num_aio) { + get_ctx->unset_cb(); + } + } + get_ctx->put(); + return r; +} + +class RGWGetUserStatsContext : public RGWGetUserHeader_CB { + RGWGetUserStats_CB *cb; + +public: + explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb) + : cb(cb) {} + + void handle_response(int r, cls_user_header& header) override { + const cls_user_stats& hs = header.stats; + if (r >= 0) { + RGWStorageStats stats; + + stats.size = hs.total_bytes; + stats.size_rounded = hs.total_bytes_rounded; + stats.num_objects = hs.total_entries; + + cb->set_response(stats); + } + + cb->handle_response(r); + + cb->put(); + } +}; + +int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats) +{ + string user_str = user.to_str(); + + cls_user_header header; + int r = cls_user_get_header(user_str, &header); + if (r < 0) + return r; + + const cls_user_stats& hs = header.stats; + + stats.size = hs.total_bytes; + stats.size_rounded = hs.total_bytes_rounded; + stats.num_objects = hs.total_entries; + + return 0; +} + +int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx) +{ + string user_str = user.to_str(); + + RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx); + int r = cls_user_get_header_async(user_str, get_ctx); + if (r < 0) { + ctx->put(); + delete get_ctx; + return r; + } + + return 0; +} + +void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid) +{ + oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':'); +} + +void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj) +{ + if (!bucket.oid.empty()) { + obj.init(svc.zone->get_zone_params().domain_root, bucket.oid); + } else { + string oid; + get_bucket_meta_oid(bucket, oid); + obj.init(svc.zone->get_zone_params().domain_root, oid); + } +} + +int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& 
obj_ctx, const string& meta_key, RGWBucketInfo& info, + real_time *pmtime, map *pattrs) +{ + size_t pos = meta_key.find(':'); + if (pos == string::npos) { + return -EINVAL; + } + string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key; + rgw_bucket_instance_key_to_oid(oid); + + return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs); +} + +int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info, + real_time *pmtime, map *pattrs) +{ + string oid; + if (bucket.oid.empty()) { + get_bucket_meta_oid(bucket, oid); + } else { + oid = bucket.oid; + } + + return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs); +} + +int RGWRados::get_bucket_instance_from_oid(RGWSysObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info, + real_time *pmtime, map *pattrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version) +{ + auto& domain_root = svc.zone->get_zone_params().domain_root; + + ldout(cct, 20) << "reading from " << domain_root << ":" << oid << dendl; + + bufferlist epbl; + + int ret = rgw_get_system_obj(this, obj_ctx, domain_root, + oid, epbl, &info.objv_tracker, pmtime, pattrs, + cache_info, refresh_version); + if (ret < 0) { + return ret; + } + + auto iter = epbl.cbegin(); + try { + decode(info, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl; + return -EIO; + } + info.bucket.oid = oid; + return 0; +} + +int RGWRados::get_bucket_entrypoint_info(RGWSysObjectCtx& obj_ctx, + const string& tenant_name, + const string& bucket_name, + RGWBucketEntryPoint& entry_point, + RGWObjVersionTracker *objv_tracker, + real_time *pmtime, + map *pattrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version) +{ + bufferlist bl; + string bucket_entry; + + rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry); + int ret = rgw_get_system_obj(this, obj_ctx, 
svc.zone->get_zone_params().domain_root, + bucket_entry, bl, objv_tracker, pmtime, pattrs, + cache_info, refresh_version); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(entry_point, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl; + return -EIO; + } + return 0; +} + +int RGWRados::convert_old_bucket_info(RGWSysObjectCtx& obj_ctx, + const string& tenant_name, + const string& bucket_name) +{ + RGWBucketEntryPoint entry_point; + real_time ep_mtime; + RGWObjVersionTracker ot; + map attrs; + RGWBucketInfo info; + + ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl; + + int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs); + if (ret < 0) { + ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl; + return ret; + } + + if (!entry_point.has_bucket_info) { + /* already converted! */ + return 0; + } + + info = entry_point.old_bucket_info; + info.bucket.oid = bucket_name; + info.ep_objv = ot.read_version; + + ot.generate_new_write_ver(cct); + + ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::_get_bucket_info(RGWSysObjectCtx& obj_ctx, + const string& tenant, + const string& bucket_name, + RGWBucketInfo& info, + real_time *pmtime, + map *pattrs, + boost::optional refresh_version) +{ + string bucket_entry; + rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry); + + + if (auto e = binfo_cache->find(bucket_entry)) { + if (refresh_version && + e->info.objv_tracker.read_version.compare(&(*refresh_version))) { + lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is " + << "a failure that should be debugged. 
I am a nice machine, " + << "so I will try to recover." << dendl; + binfo_cache->invalidate(bucket_entry); + } else { + info = e->info; + if (pattrs) + *pattrs = e->attrs; + if (pmtime) + *pmtime = e->mtime; + return 0; + } + } + + bucket_info_entry e; + RGWBucketEntryPoint entry_point; + real_time ep_mtime; + RGWObjVersionTracker ot; + rgw_cache_entry_info entry_cache_info; + int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, + entry_point, &ot, &ep_mtime, pattrs, + &entry_cache_info, refresh_version); + if (ret < 0) { + /* only init these fields */ + info.bucket.tenant = tenant; + info.bucket.name = bucket_name; + return ret; + } + + if (entry_point.has_bucket_info) { + info = entry_point.old_bucket_info; + info.bucket.oid = bucket_name; + info.bucket.tenant = tenant; + info.ep_objv = ot.read_version; + ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl; + return 0; + } + + /* data is in the bucket instance object, we need to get attributes from there, clear everything + * that we got + */ + if (pattrs) { + pattrs->clear(); + } + + ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl; + + + /* read bucket instance info */ + + string oid; + get_bucket_meta_oid(entry_point.bucket, oid); + + rgw_cache_entry_info cache_info; + + ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs, + &cache_info, refresh_version); + e.info.ep_objv = ot.read_version; + info = e.info; + if (ret < 0) { + lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl; + info.bucket.tenant = tenant; + info.bucket.name = bucket_name; + // XXX and why return anything in case of an error anyway? 
+ return ret; + } + + if (pmtime) + *pmtime = e.mtime; + if (pattrs) + *pattrs = e.attrs; + + /* chain to both bucket entry point and bucket instance */ + if (!binfo_cache->put(svc.cache, bucket_entry, &e, {&entry_cache_info, &cache_info})) { + ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl; + } + + if (refresh_version && + refresh_version->compare(&info.objv_tracker.read_version)) { + lderr(cct) << "WARNING: The OSD has the same version I have. Something may " + << "have gone squirrelly. An administrator may have forced a " + << "change; otherwise there is a problem somewhere." << dendl; + } + + return 0; +} + +int RGWRados::get_bucket_info(RGWSysObjectCtx& obj_ctx, + const string& tenant, const string& bucket_name, + RGWBucketInfo& info, + real_time *pmtime, map *pattrs) +{ + return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime, + pattrs, boost::none); +} + +int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + map *pattrs) +{ + RGWSysObjectCtx obj_ctx = svc.sysobj->init_obj_ctx(); + + return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name, + info, pmtime, pattrs, info.objv_tracker.read_version); +} + +int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point, + bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime, + map *pattrs) +{ + bufferlist epbl; + encode(entry_point, epbl); + string bucket_entry; + rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry); + return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime); +} + +int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, + real_time mtime, map *pattrs) +{ + info.has_instance_obj = true; + bufferlist bl; + + encode(info, bl); + + string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */ + int ret = 
rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime); + if (ret == -EEXIST) { + /* well, if it's exclusive we shouldn't overwrite it, because we might race with another + * bucket operation on this specific bucket (e.g., being synced from the master), but + * since bucket instace meta object is unique for this specific bucket instace, we don't + * need to return an error. + * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the + * master, creating a bucket, sending bucket creation to the master, we create the bucket + * locally, while in the sync thread we sync the new bucket. + */ + ret = 0; + } + return ret; +} + +int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv, + map *pattrs, bool create_entry_point) +{ + bool create_head = !info.has_instance_obj || create_entry_point; + + int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs); + if (ret < 0) { + return ret; + } + + if (!create_head) + return 0; /* done! 
*/ + + RGWBucketEntryPoint entry_point; + entry_point.bucket = info.bucket; + entry_point.owner = info.owner; + entry_point.creation_time = info.creation_time; + entry_point.linked = true; + RGWObjVersionTracker ot; + if (pep_objv && !pep_objv->tag.empty()) { + ot.write_version = *pep_objv; + } else { + ot.generate_new_write_ver(cct); + if (pep_objv) { + *pep_objv = ot.write_version; + } + } + ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::update_containers_stats(map& m) +{ + auto obj_ctx = svc.sysobj->init_obj_ctx(); + + map::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt& ent = iter->second; + rgw_bucket& bucket = ent.bucket; + ent.count = 0; + ent.size = 0; + ent.size_rounded = 0; + + vector headers; + + RGWBucketInfo bucket_info; + int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL); + if (ret < 0) { + return ret; + } + + int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers); + if (r < 0) + return r; + + auto hiter = headers.begin(); + for (; hiter != headers.end(); ++hiter) { + RGWObjCategory category = main_category; + auto iter = (hiter->stats).find(category); + if (iter != hiter->stats.end()) { + struct rgw_bucket_category_stats& stats = iter->second; + ent.count += stats.num_entries; + ent.size += stats.total_size; + ent.size_rounded += stats.total_size_rounded; + } + } + + // fill in placement_rule from the bucket instance for use in swift's + // per-storage policy statistics + ent.placement_rule = std::move(bucket_info.placement_rule); + } + + return m.size(); +} + +int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + librados::Rados *rad = get_rados_handle(); + librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL); + + 
r = ref.ioctx.aio_append(ref.obj.oid, completion, bl, size); + completion->release(); + return r; +} + +int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + int r = open_pool_ctx(pool, io_ctx, false); + if (r < 0) + return r; + + iter = io_ctx.nobjects_begin(); + + return 0; +} + +int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + int r = open_pool_ctx(pool, io_ctx, false); + if (r < 0) + return r; + + librados::ObjectCursor oc; + if (!oc.from_str(cursor)) { + ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl; + return -EINVAL; + } + + try { + iter = io_ctx.nobjects_begin(oc); + return 0; + } catch (const std::system_error& e) { + r = -e.code().value(); + ldout(cct, 10) << "nobjects_begin threw " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldout(cct, 10) << "nobjects_begin threw " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx) +{ + return ctx.iter.get_cursor().to_str(); +} + +static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num, + vector& objs, + bool *is_truncated, RGWAccessListFilter *filter) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + if (iter == io_ctx.nobjects_end()) + return -ENOENT; + + uint32_t i; + + for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) { + rgw_bucket_dir_entry e; + + string oid = iter->get_oid(); + ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + + // fill it in with initial values; we may correct later + if (filter && !filter->filter(oid, oid)) + continue; + + e.key = oid; + objs.push_back(e); + } + + if (is_truncated) + 
*is_truncated = (iter != io_ctx.nobjects_end()); + + return objs.size(); +} + +int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector& objs, + bool *is_truncated, RGWAccessListFilter *filter) +{ + // catch exceptions from NObjectIterator::operator++() + try { + return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter); + } catch (const std::system_error& e) { + int r = -e.code().value(); + ldout(cct, 10) << "NObjectIterator threw exception " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldout(cct, 10) << "NObjectIterator threw exception " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx) +{ + if (!ctx->initialized) { + int r = pool_iterate_begin(pool, marker, ctx->iter_ctx); + if (r < 0) { + ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl; + return r; + } + ctx->initialized = true; + } + return 0; +} + +int RGWRados::list_raw_objects_next(const string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + return -EINVAL; + } + RGWAccessListFilterPrefix filter(prefix_filter); + vector objs; + int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter); + if (r < 0) { + if(r != -ENOENT) + ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl; + return r; + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + oids.push_back(iter->key.name); + } + + return oids.size(); +} + +int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter, + int max, RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + int r = list_raw_objects_init(pool, string(), &ctx); + if (r < 0) { + return r; + } + } + + return list_raw_objects_next(prefix_filter, max, ctx, oids, 
is_truncated); +} + +string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx) +{ + return pool_iterate_get_cursor(ctx.iter_ctx); +} + +int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max, + std::list& result, bool *truncated) +{ + ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl; + result.clear(); + + librados::IoCtx index_ctx; + map oids; + map bi_log_lists; + int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id); + if (r < 0) + return r; + + BucketIndexShardsManager marker_mgr; + bool has_shards = (oids.size() > 1 || shard_id >= 0); + // If there are multiple shards for the bucket index object, the marker + // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}# + // {shard_marker_2}...', if there is no sharding, the bi_log_list should + // only contain one record, and the key is the bucket instance id. + r = marker_mgr.from_string(marker, shard_id); + if (r < 0) + return r; + + r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) + return r; + + map::iterator> vcurrents; + map::iterator> vends; + if (truncated) { + *truncated = false; + } + map::iterator miter = bi_log_lists.begin(); + for (; miter != bi_log_lists.end(); ++miter) { + int shard_id = miter->first; + vcurrents[shard_id] = miter->second.entries.begin(); + vends[shard_id] = miter->second.entries.end(); + if (truncated) { + *truncated = (*truncated || miter->second.truncated); + } + } + + size_t total = 0; + bool has_more = true; + map::iterator>::iterator viter; + map::iterator>::iterator eiter; + while (total < max && has_more) { + has_more = false; + + viter = vcurrents.begin(); + eiter = vends.begin(); + + for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) { + assert (eiter != vends.end()); + + int shard_id = viter->first; + 
list::iterator& liter = viter->second; + + if (liter == eiter->second){ + continue; + } + rgw_bi_log_entry& entry = *(liter); + if (has_shards) { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + string tmp_id; + build_bucket_index_marker(buf, entry.id, &tmp_id); + entry.id.swap(tmp_id); + } + marker_mgr.add(shard_id, entry.id); + result.push_back(entry); + total++; + has_more = true; + ++liter; + } + } + + if (truncated) { + for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) { + assert (eiter != vends.end()); + *truncated = (*truncated || (viter->second != eiter->second)); + } + } + + // Refresh marker, if there are multiple shards, the output will look like + // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...', + // if there is no sharding, the simply marker (without oid) is returned + if (has_shards) { + marker_mgr.to_string(&marker); + } else { + if (!result.empty()) { + marker = result.rbegin()->id; + } + } + + return 0; +} + +int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker) +{ + librados::IoCtx index_ctx; + map bucket_objs; + + BucketIndexShardsManager start_marker_mgr; + BucketIndexShardsManager end_marker_mgr; + + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id); + if (r < 0) { + return r; + } + + r = start_marker_mgr.from_string(start_marker, shard_id); + if (r < 0) { + return r; + } + + r = end_marker_mgr.from_string(end_marker, shard_id); + if (r < 0) { + return r; + } + + return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id) +{ + librados::IoCtx index_ctx; + map bucket_objs; + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id); + if (r < 0) + return r; + + return CLSRGWIssueResyncBucketBILog(index_ctx, 
bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id) +{ + librados::IoCtx index_ctx; + map bucket_objs; + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id); + if (r < 0) + return r; + + return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); +} + +int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_dir_entry *dirent) +{ + rgw_cls_bi_entry bi_entry; + int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } + if (r < 0) { + return r; + } + auto iter = bi_entry.data.cbegin(); + try { + decode(*dirent, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} + +int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_olh_entry *olh) +{ + rgw_cls_bi_entry bi_entry; + int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } + if (r < 0) { + return r; + } + auto iter = bi_entry.data.cbegin(); + try { + decode(*olh, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} + +int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj, + BIIndexType index_type, rgw_cls_bi_entry *entry) +{ + BucketShard bs(this); + int ret = bs.init(bucket_info, obj); + if (ret < 0) { + ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); + + return cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry); +} + +void RGWRados::bi_put(ObjectWriteOperation& op, 
BucketShard& bs, rgw_cls_bi_entry& entry) +{ + cls_rgw_bi_put(op, bs.bucket_obj, entry); +} + +int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry) +{ + int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry) +{ + BucketShard bs(this); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */); + if (ret < 0) { + ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + return bi_put(bs, entry); +} + +int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list *entries, bool *is_truncated) +{ + rgw_obj obj(bucket, obj_name); + BucketShard bs(this); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */); + if (ret < 0) { + ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated); + if (ret == -ENOENT) { + *is_truncated = false; + } + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list *entries, bool *is_truncated) +{ + int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_remove(BucketShard& bs) +{ + int ret = bs.index_ctx.remove(bs.bucket_obj); + if (ret == -ENOENT) { + ret = 0; + } + if (ret < 0) { + ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list *entries, bool *is_truncated) +{ + BucketShard bs(this); + int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */); + if (ret < 0) { + 
ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + return bi_list(bs, filter_obj, marker, max, entries, is_truncated); +} + +int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op) +{ + return gc_pool_ctx.operate(oid, op); +} + +int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op, AioCompletion **pc) +{ + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + int r = gc_pool_ctx.aio_operate(oid, c, op); + if (!pc) { + c->release(); + } else { + *pc = c; + } + return r; +} + +int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl) +{ + return gc_pool_ctx.operate(oid, op, pbl); +} + +int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated) +{ + return gc->list(index, marker, max, expired_only, result, truncated); +} + +int RGWRados::process_gc(bool expired_only) +{ + return gc->process(expired_only); +} + +int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map *progress_map) +{ + return lc->list_lc_progress(marker, max_entries, progress_map); +} + +int RGWRados::process_lc() +{ + return lc->process(); +} + +bool RGWRados::process_expire_objects() +{ + return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now()); +} + +int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag, + rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace) +{ + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id); + + ObjectWriteOperation o; + cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace); + return bs.index_ctx.operate(bs.bucket_obj, &o); +} + +int 
RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag, + int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, + list *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace) +{ + ObjectWriteOperation o; + rgw_bucket_dir_entry_meta dir_meta; + dir_meta = ent.meta; + dir_meta.category = category; + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id); + + rgw_bucket_entry_ver ver; + ver.pool = pool; + ver.epoch = epoch; + cls_rgw_obj_key key(ent.key.name, ent.key.instance); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs, + svc.zone->get_zone().log_data, bilog_flags, &zones_trace); + complete_op_data *arg; + index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs, + svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg); + librados::AioCompletion *completion = arg->rados_completion; + int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o); + completion->release(); /* can't reference arg here, as it might have already been released */ + return ret; +} + +int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag, + int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, + list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace) +{ + return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace); +} + +int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag, + int64_t pool, uint64_t epoch, + rgw_obj& obj, + real_time& removed_mtime, + list *remove_objs, + uint16_t bilog_flags, + rgw_zone_set *zones_trace) +{ + rgw_bucket_dir_entry ent; + ent.meta.mtime = removed_mtime; + obj.key.get_index_key(&ent.key); + return cls_obj_complete_op(bs, 
obj, CLS_RGW_OP_DEL, tag, pool, epoch, + ent, RGWObjCategory::None, remove_objs, + bilog_flags, zones_trace); +} + +int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace) +{ + rgw_bucket_dir_entry ent; + obj.key.get_index_key(&ent.key); + return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, + -1 /* pool id */, 0, ent, + RGWObjCategory::None, NULL, bilog_flags, + zones_trace); +} + +int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout) +{ + librados::IoCtx index_ctx; + map bucket_objs; + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs); + if (r < 0) + return r; + + return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)(); +} + + +uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries, + uint32_t num_shards) +{ + // We want to minimize the chances that when num_shards >> + // num_entries that we return much fewer than num_entries to the + // client. Given all the overhead of making a cls call to the osd, + // returning a few entries is not much more work than returning one + // entry. This minimum might be better tuned based on future + // experiments where num_shards >> num_entries. (Note: ">>" should + // be interpreted as "much greater than".) + constexpr uint32_t min_read = 8; + + // The following is based on _"Balls into Bins" -- A Simple and + // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle + // cases when num_shards >> num_entries (it almost serves as a + // ceiling calculation). We also assume alpha is 1.0 and extract it + // from the calculation. Future work could involve memoizing some of + // the transcendental functions to minimize repeatedly re-calling + // them with the same parameters, which we expect to be the case the + // majority of the time. 
+ uint32_t calc_read = + 1 + + static_cast((num_entries / num_shards) + + sqrt((2 * num_entries) * + log(num_shards) / num_shards)); + + return std::max(min_read, calc_read); +} + + +int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, + const int shard_id, + const rgw_obj_index_key& start_after, + const string& prefix, + const uint32_t num_entries, + const bool list_versions, + const uint16_t expansion_factor, + map& m, + bool *is_truncated, + rgw_obj_index_key *last_entry, + bool (*force_check_filter)(const string& name)) +{ + /* expansion_factor allows the number of entries to read to grow + * exponentially; this is used when earlier reads are producing too + * few results, perhaps due to filtering or to a series of + * namespaced entries */ + + ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after.name << + "[" << start_after.instance << + "]\", prefix=\"" << prefix << + "\" num_entries=" << num_entries << + ", list_versions=" << list_versions << + ", expansion_factor=" << expansion_factor << dendl; + + m.clear(); + + librados::IoCtx index_ctx; + // key - oid (for different shards if there is any) + // value - list result for the corresponding oid (shard), it is filled by + // the AIO callback + map oids; + int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id); + if (r < 0) { + return r; + } + + const uint32_t shard_count = oids.size(); + uint32_t num_entries_per_shard; + if (expansion_factor == 0) { + num_entries_per_shard = + calc_ordered_bucket_list_per_shard(num_entries, shard_count); + } else if (expansion_factor <= 11) { + // we'll max out the exponential multiplication factor at 1024 (2<<10) + num_entries_per_shard = + std::min(num_entries, + (uint32_t(1 << (expansion_factor - 1)) * + calc_ordered_bucket_list_per_shard(num_entries, shard_count))); + } else { + num_entries_per_shard = num_entries; + } + + ldout(cct, 10) << "RGWRados::" << __func__ << + " request from each of 
" << shard_count << + " shard(s) for " << num_entries_per_shard << " entries to get " << + num_entries << " total entries" << dendl; + + map list_results; + cls_rgw_obj_key start_key(start_after.name, start_after.instance); + r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries_per_shard, + list_versions, oids, list_results, + cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) { + return r; + } + + // create a list of iterators that are used to iterate each shard + vector::iterator> vcurrents; + vector::iterator> vends; + vector vnames; + vcurrents.reserve(list_results.size()); + vends.reserve(list_results.size()); + vnames.reserve(list_results.size()); + for (auto& iter : list_results) { + vcurrents.push_back(iter.second.dir.m.begin()); + vends.push_back(iter.second.dir.m.end()); + vnames.push_back(oids[iter.first]); + } + + // create a map to track the next candidate entry from each shard, + // if the entry from a specified shard is selected/erased, the next + // entry from that shard will be inserted for next round selection + map candidates; + for (size_t i = 0; i < vcurrents.size(); ++i) { + if (vcurrents[i] != vends[i]) { + candidates[vcurrents[i]->first] = i; + } + } + + map updates; + uint32_t count = 0; + int pos = -1; + while (count < num_entries && !candidates.empty()) { + r = 0; + // Select the next one + pos = candidates.begin()->second; + const string& name = vcurrents[pos]->first; + struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second; + + ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " << + dirent.key << " from shard " << pos << dendl; + + bool force_check = + force_check_filter && force_check_filter(dirent.key.name); + + if ((!dirent.exists && !dirent.is_delete_marker()) || + !dirent.pending_map.empty() || + force_check) { + /* there are uncommitted ops. We need to check the current + * state, and if the tags are old we need to do clean-up as + * well. 
*/ + librados::IoCtx sub_ctx; + sub_ctx.dup(index_ctx); + r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, + updates[vnames[pos]]); + if (r < 0 && r != -ENOENT) { + return r; + } + } else { + r = 0; + } + + if (r >= 0) { + ldout(cct, 10) << "RGWRados::" << __func__ << ": got " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; + m[name] = std::move(dirent); + ++count; + } else { + ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; + } + + // refresh the candidates map + candidates.erase(candidates.begin()); + if (++vcurrents[pos] != vends[pos]) { // note: pre-increment + candidates[vcurrents[pos]->first] = pos; + } else if (list_results[pos].is_truncated) { + // once we exhaust one shard that is truncated, we need to stop, + // as we cannot be certain that one of the next entries needs to + // come from that shard; S3 and swift protocols allow returning + // fewer than what was requested + break; + } + } // while we haven't provided requested # of result entries + + // suggest updates if there are any + for (auto& miter : updates) { + if (miter.second.length()) { + ObjectWriteOperation o; + cls_rgw_suggest_changes(o, miter.second); + // we don't care if we lose suggested updates, send them off blindly + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + index_ctx.aio_operate(miter.first, c, &o); + c->release(); + } + } // updates loop + + *is_truncated = false; + // check if all the returned entries are consumed or not + for (size_t i = 0; i < vcurrents.size(); ++i) { + if (vcurrents[i] != vends[i] || list_results[i].is_truncated) { + *is_truncated = true; + break; + } + } + + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, count=" << count << ", is_truncated=" << *is_truncated << + dendl; + + if (*is_truncated && count < num_entries) { + ldout(cct, 10) << "RGWRados::" << __func__ << + ": INFO requested " << num_entries 
<< " entries but returning " << + count << ", which is truncated" << dendl; + } + + if (pos >= 0) { + *last_entry = std::move((--vcurrents[pos])->first); + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, last_entry=" << *last_entry << dendl; + } else { + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, last_entry NOT SET" << dendl; + } + + return 0; +} + + +int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info, + int shard_id, + const rgw_obj_index_key& start, + const string& prefix, + uint32_t num_entries, + bool list_versions, + std::vector& ent_list, + bool *is_truncated, + rgw_obj_index_key *last_entry, + bool (*force_check_filter)(const string& name)) { + ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket << + " start " << start.name << "[" << start.instance << + "] num_entries " << num_entries << dendl; + + ent_list.clear(); + static MultipartMetaFilter multipart_meta_filter; + + *is_truncated = false; + librados::IoCtx index_ctx; + + map oids; + int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id); + if (r < 0) + return r; + const uint32_t num_shards = oids.size(); + + rgw_obj_index_key marker = start; + uint32_t current_shard; + if (shard_id >= 0) { + current_shard = shard_id; + } else if (start.empty()) { + current_shard = 0u; + } else { + // at this point we have a marker (start) that has something in + // it, so we need to get to the bucket shard index, so we can + // start reading from there + + std::string key; + // test whether object name is a multipart meta name + if(! 
multipart_meta_filter.filter(start.name, key)) { + // if multipart_meta_filter fails, must be "regular" (i.e., + // unadorned) and the name is the key + key = start.name; + } + + // now convert the key (oid) to an rgw_obj_key since that will + // separate out the namespace, name, and instance + rgw_obj_key obj_key; + bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key); + if (!parsed) { + ldout(cct, 0) << + "ERROR: RGWRados::cls_bucket_list_unordered received an invalid " + "start marker: '" << start << "'" << dendl; + return -EINVAL; + } else if (obj_key.name.empty()) { + // if the name is empty that means the object name came in with + // a namespace only, and therefore we need to start our scan at + // the first bucket index shard + current_shard = 0u; + } else { + // so now we have the key used to compute the bucket index shard + // and can extract the specific shard from it + current_shard = rgw_bucket_shard_index(obj_key.name, num_shards); + } + } + + uint32_t count = 0u; + map updates; + rgw_obj_index_key last_added_entry; + while (count <= num_entries && + ((shard_id >= 0 && current_shard == uint32_t(shard_id)) || + current_shard < num_shards)) { + const std::string& oid = oids[current_shard]; + rgw_cls_list_ret result; + + librados::ObjectReadOperation op; + cls_rgw_bucket_list_op(op, marker, prefix, num_entries, + list_versions, &result); + r = index_ctx.operate(oid, &op, nullptr); + if (r < 0) + return r; + + for (auto& entry : result.dir.m) { + rgw_bucket_dir_entry& dirent = entry.second; + + bool force_check = force_check_filter && + force_check_filter(dirent.key.name); + if ((!dirent.exists && !dirent.is_delete_marker()) || + !dirent.pending_map.empty() || + force_check) { + /* there are uncommitted ops. We need to check the current state, + * and if the tags are old we need to do cleanup as well. 
*/ + librados::IoCtx sub_ctx; + sub_ctx.dup(index_ctx); + r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]); + if (r < 0 && r != -ENOENT) { + return r; + } + } else { + r = 0; + } + + // at this point either r >=0 or r == -ENOENT + if (r >= 0) { // i.e., if r != -ENOENT + ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; + + if (count < num_entries) { + marker = last_added_entry = dirent.key; // double assign + ent_list.emplace_back(std::move(dirent)); + ++count; + } else { + *is_truncated = true; + goto check_updates; + } + } else { // r == -ENOENT + // in the case of -ENOENT, make sure we're advancing marker + // for possible next call to CLSRGWIssueBucketList + marker = dirent.key; + } + } // entry for loop + + if (!result.is_truncated) { + // if we reached the end of the shard read next shard + ++current_shard; + marker = rgw_obj_index_key(); + } + } // shard loop + +check_updates: + + // suggest updates if there is any + map::iterator miter = updates.begin(); + for (; miter != updates.end(); ++miter) { + if (miter->second.length()) { + ObjectWriteOperation o; + cls_rgw_suggest_changes(o, miter->second); + // we don't care if we lose suggested updates, send them off blindly + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + index_ctx.aio_operate(miter->first, c, &o); + c->release(); + } + } + + if (last_entry && !ent_list.empty()) { + *last_entry = last_added_entry; + } + + return 0; +} // RGWRados::cls_bucket_list_unordered + + +int RGWRados::cls_obj_usage_log_add(const string& oid, + rgw_usage_log_info& info) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + cls_rgw_usage_log_add(op, info); + + r = ref.ioctx.operate(ref.obj.oid, &op); + return r; +} + +int 
RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket, + uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + string& read_iter, map& usage, + bool *is_truncated) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + *is_truncated = false; + + r = cls_rgw_usage_log_read(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch, + max_entries, read_iter, usage, is_truncated); + + return r; +} + +int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket, + uint64_t start_epoch, uint64_t end_epoch) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + r = cls_rgw_usage_log_trim(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch); + return r; +} + +int RGWRados::cls_obj_usage_log_clear(string& oid) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + librados::ObjectWriteOperation op; + cls_rgw_usage_log_clear(op); + r = ref.ioctx.operate(ref.obj.oid, &op); + return r; +} + + +int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list& oid_list) +{ + librados::IoCtx index_ctx; + string dir_oid; + + uint8_t suggest_flag = (svc.zone->get_zone().log_data ? 
CEPH_RGW_DIR_SUGGEST_LOG_OP : 0); + + int r = open_bucket_index(bucket_info, index_ctx, dir_oid); + if (r < 0) + return r; + + bufferlist updates; + + for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) { + rgw_bucket_dir_entry entry; + entry.key = *iter; + dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl; + entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request + updates.append(CEPH_RGW_REMOVE | suggest_flag); + encode(entry, updates); + } + + bufferlist out; + + r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out); + + return r; +} + +int RGWRados::check_disk_state(librados::IoCtx io_ctx, + const RGWBucketInfo& bucket_info, + rgw_bucket_dir_entry& list_state, + rgw_bucket_dir_entry& object, + bufferlist& suggested_updates) +{ + const rgw_bucket& bucket = bucket_info.bucket; + uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0); + + std::string loc; + + rgw_obj obj(bucket, list_state.key); + + string oid; + get_obj_bucket_and_oid_loc(obj, oid, loc); + + if (loc != list_state.locator) { + ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl; + } + + io_ctx.locator_set_key(list_state.locator); + + RGWObjState *astate = NULL; + RGWObjectCtx rctx(this); + int r = get_obj_state(&rctx, bucket_info, obj, &astate, false); + if (r < 0) + return r; + + list_state.pending_map.clear(); // we don't need this and it inflates size + if (!astate->exists) { + /* object doesn't exist right now -- hopefully because it's + * marked as !exists and got deleted */ + if (list_state.exists) { + /* FIXME: what should happen now? Work out if there are any + * non-bad ways this could happen (there probably are, but annoying + * to handle!) 
*/ + } + // encode a suggested removal of that key + list_state.ver.epoch = io_ctx.get_last_version(); + list_state.ver.pool = io_ctx.get_id(); + cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates); + return -ENOENT; + } + + string etag; + string content_type; + ACLOwner owner; + + object.meta.size = astate->size; + object.meta.accounted_size = astate->accounted_size; + object.meta.mtime = astate->mtime; + + map::iterator iter = astate->attrset.find(RGW_ATTR_ETAG); + if (iter != astate->attrset.end()) { + etag = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE); + if (iter != astate->attrset.end()) { + content_type = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_ACL); + if (iter != astate->attrset.end()) { + r = decode_policy(iter->second, &owner); + if (r < 0) { + dout(0) << "WARNING: could not decode policy for object: " << obj << dendl; + } + } + + if (astate->has_manifest) { + RGWObjManifest::obj_iterator miter; + RGWObjManifest& manifest = astate->manifest; + for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) { + const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this); + rgw_obj loc; + rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc); + + if (loc.key.ns == RGW_OBJ_NS_MULTIPART) { + dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl; + r = delete_obj_index(loc, astate->mtime); + if (r < 0) { + dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl; + } + } + } + } + + object.meta.etag = etag; + object.meta.content_type = content_type; + object.meta.owner = owner.get_id().to_str(); + object.meta.owner_display_name = owner.get_display_name(); + + // encode suggested updates + list_state.ver.pool = io_ctx.get_id(); + list_state.ver.epoch = astate->epoch; + list_state.meta.size = object.meta.size; + list_state.meta.accounted_size = object.meta.accounted_size; + list_state.meta.mtime = 
object.meta.mtime; + list_state.meta.category = main_category; + list_state.meta.etag = etag; + list_state.meta.content_type = content_type; + if (astate->obj_tag.length() > 0) + list_state.tag = astate->obj_tag.c_str(); + list_state.meta.owner = owner.get_id().to_str(); + list_state.meta.owner_display_name = owner.get_display_name(); + + list_state.exists = true; + cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates); + return 0; +} + +int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector& headers, map *bucket_instance_ids) +{ + librados::IoCtx index_ctx; + map oids; + map list_results; + int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids); + if (r < 0) + return r; + + r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) + return r; + + map::iterator iter = list_results.begin(); + for(; iter != list_results.end(); ++iter) { + headers.push_back(std::move(iter->second.dir.header)); + } + return 0; +} + +int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio) +{ + librados::IoCtx index_ctx; + map bucket_objs; + int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id); + if (r < 0) + return r; + + map::iterator iter = bucket_objs.begin(); + for (; iter != bucket_objs.end(); ++iter) { + r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast(ctx->get())); + if (r < 0) { + ctx->put(); + break; + } else { + (*num_aio)++; + } + } + return r; +} + +int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header) +{ + string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation 
op; + int rc; + ::cls_user_get_header(op, header, &rc); + bufferlist ibl; + r = ref.ioctx.operate(ref.obj.oid, &op, &ibl); + if (r < 0) + return r; + if (rc < 0) + return rc; + + return 0; +} + +int RGWRados::cls_user_reset_stats(const string& user_id) +{ + string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + ::cls_user_reset_stats(op); + return ref.ioctx.operate(ref.obj.oid, &op); +} + +int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx) +{ + string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + r = ::cls_user_get_header_async(ref.ioctx, ref.obj.oid, ctx); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, + const RGWBucketInfo& bucket_info) +{ + vector headers; + int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers); + if (r < 0) { + ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl; + return r; + } + + cls_user_bucket_entry entry; + + bucket_info.bucket.convert(&entry.bucket); + + for (const auto& hiter : headers) { + for (const auto& iter : hiter.stats) { + if (RGWObjCategory::Main == iter.first || + RGWObjCategory::MultiMeta == iter.first) { + const struct rgw_bucket_category_stats& header_stats = iter.second; + entry.size += header_stats.total_size; + entry.size_rounded += header_stats.total_size_rounded; + entry.count += header_stats.num_entries; + } + } + } + + list entries; + entries.push_back(entry); + + r = cls_user_update_buckets(user_obj, entries, false); + if (r < 0) { + ldout(cct, 20) << "cls_user_update_buckets() returned " 
<< r << dendl; + return r; + } + + return 0; +} + +int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry) +{ + vector headers; + RGWBucketInfo bucket_info; + auto obj_ctx = svc.sysobj->init_obj_ctx(); + int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL); + if (ret < 0) { + return ret; + } + + ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers); + if (ret < 0) { + ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl; + return ret; + } + + bucket.convert(&entry.bucket); + + for (const auto& hiter : headers) { + for (const auto& iter : hiter.stats) { + const struct rgw_bucket_category_stats& header_stats = iter.second; + entry.size += header_stats.total_size; + entry.size_rounded += header_stats.total_size_rounded; + entry.count += header_stats.num_entries; + } + } + + return 0; +} + +int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj, + const string& in_marker, + const string& end_marker, + const int max_entries, + list& entries, + string * const out_marker, + bool * const truncated) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + int rc; + + cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc); + bufferlist ibl; + r = ref.ioctx.operate(ref.obj.oid, &op, &ibl); + if (r < 0) + return r; + if (rc < 0) + return rc; + + return 0; +} + +int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list& entries, bool add) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + cls_user_set_buckets(op, entries, add); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::complete_sync_user_stats(const rgw_user& user_id) +{ + string buckets_obj_id; + rgw_get_buckets_obj(user_id, buckets_obj_id); + rgw_raw_obj 
obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + return cls_user_complete_stats_sync(obj); +} + +int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + ::cls_user_complete_stats_sync(op); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry) +{ + list l; + l.push_back(entry); + + return cls_user_update_buckets(obj, l, true); +} + +int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket) +{ + rgw_rados_ref ref; + int r = get_system_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + ::cls_user_remove_bucket(op, bucket); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket, + RGWQuotaInfo& bucket_quota) +{ + if (! cct->_conf.get_val("rgw_dynamic_resharding")) { + return 0; + } + + bool need_resharding = false; + int num_source_shards = (bucket_info.num_shards > 0 ? 
bucket_info.num_shards : 1); + uint32_t suggested_num_shards; + + const uint64_t max_objs_per_shard = + cct->_conf.get_val("rgw_max_objs_per_shard"); + int ret = + quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards, + bucket_info.owner, bucket, bucket_quota, + 1, need_resharding, &suggested_num_shards); + if (ret < 0) { + return ret; + } + + if (need_resharding) { + ldout(cct, 1) << __func__ << " bucket " << bucket.name << " need resharding " << + " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards << + dendl; + return add_bucket_to_reshard(bucket_info, suggested_num_shards); + } + + return ret; +} + +int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards) +{ + RGWReshard reshard(this); + + uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1); + + new_num_shards = std::min(new_num_shards, get_max_bucket_shards()); + if (new_num_shards <= num_source_shards) { + ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl; + return 0; + } + + cls_rgw_reshard_entry entry; + entry.time = real_clock::now(); + entry.tenant = bucket_info.owner.tenant; + entry.bucket_name = bucket_info.bucket.name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.old_num_shards = num_source_shards; + entry.new_num_shards = new_num_shards; + + return reshard.add(entry); +} + +int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only) +{ + // if we only check size, then num_objs will set to 0 + if(check_size_only) + return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size); + + return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size); +} + +void 
RGWRados::get_bucket_index_objects(const string& bucket_oid_base, + uint32_t num_shards, + map& bucket_objects, + int shard_id) { + if (!num_shards) { + bucket_objects[0] = bucket_oid_base; + } else { + char buf[bucket_oid_base.size() + 32]; + if (shard_id < 0) { + for (uint32_t i = 0; i < num_shards; ++i) { + snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i); + bucket_objects[i] = buf; + } + } else { + if ((uint32_t)shard_id > num_shards) { + return; + } + snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id); + bucket_objects[shard_id] = buf; + } + } +} + +void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map *result) +{ + const rgw_bucket& bucket = bucket_info.bucket; + string plain_id = bucket.name + ":" + bucket.bucket_id; + if (!bucket_info.num_shards) { + (*result)[0] = plain_id; + } else { + char buf[16]; + if (shard_id < 0) { + for (uint32_t i = 0; i < bucket_info.num_shards; ++i) { + snprintf(buf, sizeof(buf), ":%d", i); + (*result)[i] = plain_id + buf; + } + } else { + if ((uint32_t)shard_id > bucket_info.num_shards) { + return; + } + snprintf(buf, sizeof(buf), ":%d", shard_id); + (*result)[shard_id] = plain_id + buf; + } + } +} + +int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key, + int *shard_id) +{ + int r = 0; + switch (bucket_info.bucket_index_shard_hash_type) { + case RGWBucketInfo::MOD: + if (!bucket_info.num_shards) { + if (shard_id) { + *shard_id = -1; + } + } else { + uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards); + if (shard_id) { + *shard_id = (int)sid; + } + } + break; + default: + r = -ENOTSUP; + } + return r; +} + +void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards, + int shard_id, string *bucket_obj) +{ + if (!num_shards) { + // By default with no sharding, we use the bucket oid as itself + (*bucket_obj) = bucket_oid_base; + } else { + char buf[bucket_oid_base.size() + 
32]; + snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id); + (*bucket_obj) = buf; + } +} + +int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key, + uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id) +{ + int r = 0; + switch (hash_type) { + case RGWBucketInfo::MOD: + if (!num_shards) { + // By default with no sharding, we use the bucket oid as itself + (*bucket_obj) = bucket_oid_base; + if (shard_id) { + *shard_id = -1; + } + } else { + uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards); + char buf[bucket_oid_base.size() + 32]; + snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid); + (*bucket_obj) = buf; + if (shard_id) { + *shard_id = (int)sid; + } + } + break; + default: + r = -ENOTSUP; + } + return r; +} + +uint64_t RGWRados::instance_id() +{ + return get_rados_handle()->get_instance_id(); +} + +uint64_t RGWRados::next_bucket_id() +{ + Mutex::Locker l(bucket_id_lock); + return ++max_bucket_id; +} + +RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, + bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache) +{ + RGWRados *store = new RGWRados; + + if ((*store).set_use_cache(use_cache) + .set_run_gc_thread(use_gc_thread) + .set_run_lc_thread(use_lc_thread) + .set_run_quota_threads(quota_threads) + .set_run_sync_thread(run_sync_thread) + .set_run_reshard_thread(run_reshard_thread) + .initialize(cct) < 0) { + delete store; + return NULL; + } + + return store; +} + +RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct) +{ + RGWRados *store = NULL; + store = new RGWRados; + + store->set_context(cct); + + int ret = store->init_svc(true); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl; + return nullptr; + } + + if (store->init_rados() < 0) { + delete store; + return nullptr; + } + + 
return store; +} + +void RGWStoreManager::close_storage(RGWRados *store) +{ + if (!store) + return; + + store->finalize(); + + delete store; +} + +librados::Rados* RGWRados::get_rados_handle() +{ + return &rados; +} + +int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list& handles) +{ + rgw_rados_ref ref; + int ret = get_raw_obj_ref(obj, &ref); + if (ret < 0) { + lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op); + if (ret < 0) { + lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl; + c->release(); + return ret; + } + + handles.push_back(c); + + return 0; +} + +int RGWRados::delete_obj_aio(const rgw_obj& obj, + RGWBucketInfo& bucket_info, RGWObjState *astate, + list& handles, bool keep_index_consistent) +{ + rgw_rados_ref ref; + int ret = get_obj_head_ref(bucket_info, obj, &ref); + if (ret < 0) { + lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + if (keep_index_consistent) { + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag); + if (ret < 0) { + lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl; + return ret; + } + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op); + if (ret < 0) { + lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl; + c->release(); + return ret; + } + + handles.push_back(c); + + if (keep_index_consistent) { + ret = delete_obj_index(obj, astate->mtime); + if (ret < 0) { + lderr(cct) << "ERROR: failed to delete obj index 
with ret=" << ret << dendl; + return ret; + } + } + return ret; +} + +int rgw_compression_info_from_attrset(map& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) { + map::iterator value = attrs.find(RGW_ATTR_COMPRESSION); + if (value != attrs.end()) { + auto bliter = value->second.cbegin(); + try { + decode(cs_info, bliter); + } catch (buffer::error& err) { + return -EIO; + } + if (cs_info.blocks.size() == 0) { + return -EIO; + } + if (cs_info.compression_type != "none") + need_decompress = true; + else + need_decompress = false; + return 0; + } else { + need_decompress = false; + return 0; + } +} + +bool RGWRados::call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) +{ + if (command == "cache list"sv) { + std::optional filter; + if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) { + filter = boost::get(i->second); + } + std::unique_ptr f(ceph::Formatter::create(format, "table")); + if (f) { + f->open_array_section("cache_entries"); + call_list(filter, f.get()); + f->close_section(); + f->flush(out); + return true; + } else { + out.append("Unable to create Formatter.\n"); + return false; + } + } else if (command == "cache inspect"sv) { + std::unique_ptr f(ceph::Formatter::create(format, "json-pretty")); + if (f) { + const auto& target = boost::get(cmdmap.at("target")); + if (call_inspect(target, f.get())) { + f->flush(out); + return true; + } else { + out.append("Unable to find entry "s + target + ".\n"); + return false; + } + } else { + out.append("Unable to create Formatter.\n"); + return false; + } + } else if (command == "cache erase"sv) { + const auto& target = boost::get(cmdmap.at("target")); + if (call_erase(target)) { + return true; + } else { + out.append("Unable to find entry "s + target + ".\n"); + return false; + } + } else if (command == "cache zap"sv) { + call_zap(); + return true; + } + return false; +} + +void RGWRados::call_list(const std::optional& s, + ceph::Formatter *f) +{ + if 
(!svc.cache) { + return; + } + svc.cache->call_list(s, f); +} + +bool RGWRados::call_inspect(const std::string& s, Formatter *f) +{ + if (!svc.cache) { + return false; + } + return svc.cache->call_inspect(s, f); +} + +bool RGWRados::call_erase(const std::string& s) { + if (!svc.cache) { + return false; + } + return svc.cache->call_erase(s); +} + +void RGWRados::call_zap() { + if (svc.cache) { + return; + } + svc.cache->call_zap(); +} + +string RGWRados::get_mfa_oid(const rgw_user& user) +{ + return string("user:") + user.to_str(); +} + +int RGWRados::get_mfa_ref(const rgw_user& user, rgw_rados_ref *ref) +{ + string oid = get_mfa_oid(user); + rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid); + return get_system_obj_ref(obj, ref); +} + +int RGWRados::check_mfa(const rgw_user& user, const string& otp_id, const string& pin) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(user, &ref); + if (r < 0) { + return r; + } + + rados::cls::otp::otp_check_t result; + + r = rados::cls::otp::OTP::check(cct, ref.ioctx, ref.obj.oid, otp_id, pin, &result); + if (r < 0) + return r; + + ldout(cct, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl; + + return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 
0 : -EACCES); +} + +void RGWRados::prepare_mfa_write(librados::ObjectWriteOperation *op, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime) +{ + RGWObjVersionTracker ot; + + if (objv_tracker) { + ot = *objv_tracker; + } + + if (ot.write_version.tag.empty()) { + if (ot.read_version.tag.empty()) { + ot.generate_new_write_ver(cct); + } else { + ot.write_version = ot.read_version; + ot.write_version.ver++; + } + } + + ot.prepare_op_for_write(op); + struct timespec mtime_ts = real_clock::to_timespec(mtime); + op->mtime2(&mtime_ts); +} + +int RGWRados::create_mfa(const rgw_user& user, const rados::cls::otp::otp_info_t& config, + RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(user, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + prepare_mfa_write(&op, objv_tracker, mtime); + rados::cls::otp::OTP::create(&op, config); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) { + ldout(cct, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl; + return r; + } + + return 0; +} + +int RGWRados::remove_mfa(const rgw_user& user, const string& id, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(user, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + prepare_mfa_write(&op, objv_tracker, mtime); + rados::cls::otp::OTP::remove(&op, id); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) { + ldout(cct, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl; + return r; + } + + return 0; +} + +int RGWRados::get_mfa(const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(user, &ref); + if (r < 0) { + return r; + } + + r = rados::cls::otp::OTP::get(nullptr, ref.ioctx, ref.obj.oid, id, result); + if (r < 0) { + return r; + } + + return 0; +} + +int 
RGWRados::list_mfa(const rgw_user& user, list *result) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(user, &ref); + if (r < 0) { + return r; + } + + r = rados::cls::otp::OTP::get_all(nullptr, ref.ioctx, ref.obj.oid, result); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::otp_get_current_time(const rgw_user& user, ceph::real_time *result) +{ + rgw_rados_ref ref; + + int r = get_mfa_ref(user, &ref); + if (r < 0) { + return r; + } + + r = rados::cls::otp::OTP::get_current_time(ref.ioctx, ref.obj.oid, result); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::set_mfa(const string& oid, const list& entries, + bool reset_obj, RGWObjVersionTracker *objv_tracker, + const real_time& mtime) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid); + rgw_rados_ref ref; + int r = get_system_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + librados::ObjectWriteOperation op; + if (reset_obj) { + op.remove(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.create(false); + } + prepare_mfa_write(&op, objv_tracker, mtime); + rados::cls::otp::OTP::set(&op, entries); + r = ref.ioctx.operate(ref.obj.oid, &op); + if (r < 0) { + ldout(cct, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl; + return r; + } + + return 0; +} + +int RGWRados::list_mfa(const string& oid, list *result, + RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid); + rgw_rados_ref ref; + int r = get_system_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + librados::ObjectReadOperation op; + struct timespec mtime_ts; + if (pmtime) { + op.stat2(nullptr, &mtime_ts, nullptr); + } + objv_tracker->prepare_op_for_read(&op); + r = rados::cls::otp::OTP::get_all(&op, ref.ioctx, ref.obj.oid, result); + if (r < 0) { + return r; + } + if (pmtime) { + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + } + + return 0; +} diff --git a/src/rgw/rgw_rados.h 
b/src/rgw/rgw_rados.h new file mode 100644 index 00000000..395c574f --- /dev/null +++ b/src/rgw/rgw_rados.h @@ -0,0 +1,2633 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGWRADOS_H +#define CEPH_RGWRADOS_H + +#include +#include + +#include "include/rados/librados.hpp" +#include "include/Context.h" +#include "common/admin_socket.h" +#include "common/RefCountedObj.h" +#include "common/RWLock.h" +#include "common/ceph_time.h" +#include "common/lru_map.h" +#include "common/ceph_json.h" +#include "rgw_common.h" +#include "cls/rgw/cls_rgw_types.h" +#include "cls/version/cls_version_types.h" +#include "cls/log/cls_log_types.h" +#include "cls/timeindex/cls_timeindex_types.h" +#include "cls/otp/cls_otp_types.h" +#include "rgw_log.h" +#include "rgw_metadata.h" +#include "rgw_meta_sync_status.h" +#include "rgw_period_puller.h" +#include "rgw_sync_module.h" +#include "rgw_sync_log_trim.h" +#include "rgw_service.h" + +#include "services/svc_rados.h" +#include "services/svc_zone.h" + +class RGWWatcher; +class SafeTimer; +class ACLOwner; +class RGWGC; +class RGWMetaNotifier; +class RGWDataNotifier; +class RGWLC; +class RGWObjectExpirer; +class RGWMetaSyncProcessorThread; +class RGWDataSyncProcessorThread; +class RGWSyncLogTrimThread; +class RGWSyncTraceManager; +struct RGWZoneGroup; +struct RGWZoneParams; +class RGWReshard; +class RGWReshardWait; + +class RGWSysObjectCtx; + +/* flags for put_obj_meta() */ +#define PUT_OBJ_CREATE 0x01 +#define PUT_OBJ_EXCL 0x02 +#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL) + +#define RGW_OBJ_NS_MULTIPART "multipart" +#define RGW_OBJ_NS_SHADOW "shadow" + +#define RGW_BUCKET_INSTANCE_MD_PREFIX ".bucket.meta." 
+ +#define RGW_NO_SHARD -1 + +#define RGW_SHARDS_PRIME_0 7877 +#define RGW_SHARDS_PRIME_1 65521 + +extern const std::string MP_META_SUFFIX; + +// only called by rgw_shard_id and rgw_bucket_shard_index +static inline int rgw_shards_mod(unsigned hval, int max_shards) +{ + if (max_shards <= RGW_SHARDS_PRIME_0) { + return hval % RGW_SHARDS_PRIME_0 % max_shards; + } + return hval % RGW_SHARDS_PRIME_1 % max_shards; +} + +// used for logging and tagging +static inline int rgw_shard_id(const string& key, int max_shards) +{ + return rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()), + max_shards); +} + +// used for bucket indices +static inline uint32_t rgw_bucket_shard_index(const std::string& key, + int num_shards) { + uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size()); + uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); + return rgw_shards_mod(sid2, num_shards); +} + +static inline int rgw_shards_max() +{ + return RGW_SHARDS_PRIME_1; +} + +static inline void prepend_bucket_marker(const rgw_bucket& bucket, const string& orig_oid, string& oid) +{ + if (bucket.marker.empty() || orig_oid.empty()) { + oid = orig_oid; + } else { + oid = bucket.marker; + oid.append("_"); + oid.append(orig_oid); + } +} + +static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, string& oid, string& locator) +{ + const rgw_bucket& bucket = obj.bucket; + prepend_bucket_marker(bucket, obj.get_oid(), oid); + const string& loc = obj.key.get_loc(); + if (!loc.empty()) { + prepend_bucket_marker(bucket, loc, locator); + } else { + locator.clear(); + } +} + +int rgw_policy_from_attrset(CephContext *cct, map& attrset, RGWAccessControlPolicy *policy); + +static inline bool rgw_raw_obj_to_obj(const rgw_bucket& bucket, const rgw_raw_obj& raw_obj, rgw_obj *obj) +{ + ssize_t pos = raw_obj.oid.find('_'); + if (pos < 0) { + return false; + } + + if (!rgw_obj_key::parse_raw_oid(raw_obj.oid.substr(pos + 1), &obj->key)) { + return false; + } + obj->bucket = bucket; + + return true; +} + + 
+struct rgw_bucket_placement { + rgw_placement_rule placement_rule; + rgw_bucket bucket; + + void dump(Formatter *f) const; +}; + +class rgw_obj_select { + rgw_placement_rule placement_rule; + rgw_obj obj; + rgw_raw_obj raw_obj; + bool is_raw; + +public: + rgw_obj_select() : is_raw(false) {} + explicit rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {} + explicit rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {} + rgw_obj_select(const rgw_obj_select& rhs) { + placement_rule = rhs.placement_rule; + is_raw = rhs.is_raw; + if (is_raw) { + raw_obj = rhs.raw_obj; + } else { + obj = rhs.obj; + } + } + + rgw_raw_obj get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const; + rgw_raw_obj get_raw_obj(RGWRados *store) const; + + rgw_obj_select& operator=(const rgw_obj& rhs) { + obj = rhs; + is_raw = false; + return *this; + } + + rgw_obj_select& operator=(const rgw_raw_obj& rhs) { + raw_obj = rhs; + is_raw = true; + return *this; + } + + void set_placement_rule(const rgw_placement_rule& rule) { + placement_rule = rule; + } + void dump(Formatter *f) const; +}; + +struct compression_block { + uint64_t old_ofs; + uint64_t new_ofs; + uint64_t len; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(old_ofs, bl); + encode(new_ofs, bl); + encode(len, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(old_ofs, bl); + decode(new_ofs, bl); + decode(len, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(compression_block) + +struct RGWCompressionInfo { + string compression_type; + uint64_t orig_size; + vector blocks; + + RGWCompressionInfo() : compression_type("none"), orig_size(0) {} + RGWCompressionInfo(const RGWCompressionInfo& cs_info) : compression_type(cs_info.compression_type), + orig_size(cs_info.orig_size), + blocks(cs_info.blocks) {} + + void encode(bufferlist& bl) const { + 
ENCODE_START(1, 1, bl); + encode(compression_type, bl); + encode(orig_size, bl); + encode(blocks, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(compression_type, bl); + decode(orig_size, bl); + decode(blocks, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWCompressionInfo) + +int rgw_compression_info_from_attrset(map& attrs, bool& need_decompress, RGWCompressionInfo& cs_info); + +struct RGWOLHInfo { + rgw_obj target; + bool removed; + + RGWOLHInfo() : removed(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(target, bl); + encode(removed, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(target, bl); + decode(removed, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOLHInfo) + +struct RGWOLHPendingInfo { + ceph::real_time time; + + RGWOLHPendingInfo() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(time, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(time, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOLHPendingInfo) + +struct RGWUsageBatch { + map m; + + void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) { + bool exists = m.find(t) != m.end(); + *account = !exists; + m[t].aggregate(entry); + } +}; + +struct RGWUsageIter { + string read_iter; + uint32_t index; + + RGWUsageIter() : index(0) {} +}; + +class RGWGetDataCB { +public: + virtual int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) = 0; + RGWGetDataCB() {} + virtual ~RGWGetDataCB() {} +}; + +struct RGWCloneRangeInfo { + rgw_obj src; + off_t src_ofs; + off_t dst_ofs; + uint64_t len; +}; + +struct RGWObjManifestPart { + rgw_obj loc; /* the object 
where the data is located */ + uint64_t loc_ofs; /* the offset at that object where the data is located */ + uint64_t size; /* the part size */ + + RGWObjManifestPart() : loc_ofs(0), size(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(loc, bl); + encode(loc_ofs, bl); + encode(size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl); + decode(loc, bl); + decode(loc_ofs, bl); + decode(size, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(RGWObjManifestPart) + +/* + The manifest defines a set of rules for structuring the object parts. + There are a few terms to note: + - head: the head part of the object, which is the part that contains + the first chunk of data. An object might not have a head (as in the + case of multipart-part objects). + - stripe: data portion of a single rgw object that resides on a single + rados object. + - part: a collection of stripes that make a contiguous part of an + object. A regular object will only have one part (although might have + many stripes), a multipart object might have many parts. Each part + has a fixed stripe size, although the last stripe of a part might + be smaller than that. Consecutive parts may be merged if their stripe + value is the same. 
+*/ + +struct RGWObjManifestRule { + uint32_t start_part_num; + uint64_t start_ofs; + uint64_t part_size; /* each part size, 0 if there's no part size, meaning it's unlimited */ + uint64_t stripe_max_size; /* underlying obj max size */ + string override_prefix; + + RGWObjManifestRule() : start_part_num(0), start_ofs(0), part_size(0), stripe_max_size(0) {} + RGWObjManifestRule(uint32_t _start_part_num, uint64_t _start_ofs, uint64_t _part_size, uint64_t _stripe_max_size) : + start_part_num(_start_part_num), start_ofs(_start_ofs), part_size(_part_size), stripe_max_size(_stripe_max_size) {} + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(start_part_num, bl); + encode(start_ofs, bl); + encode(part_size, bl); + encode(stripe_max_size, bl); + encode(override_prefix, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(start_part_num, bl); + decode(start_ofs, bl); + decode(part_size, bl); + decode(stripe_max_size, bl); + if (struct_v >= 2) + decode(override_prefix, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWObjManifestRule) + +class RGWObjManifest { +protected: + bool explicit_objs; /* old manifest? 
*/ + map objs; + + uint64_t obj_size; + + rgw_obj obj; + uint64_t head_size; + rgw_placement_rule head_placement_rule; + + uint64_t max_head_size; + string prefix; + rgw_bucket_placement tail_placement; /* might be different than the original bucket, + as object might have been copied across pools */ + map rules; + + string tail_instance; /* tail object's instance */ + + void convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params); + int append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params); + void append_rules(RGWObjManifest& m, map::iterator& iter, string *override_prefix); + + void update_iterators() { + begin_iter.seek(0); + end_iter.seek(obj_size); + } +public: + + RGWObjManifest() : explicit_objs(false), obj_size(0), head_size(0), max_head_size(0), + begin_iter(this), end_iter(this) {} + RGWObjManifest(const RGWObjManifest& rhs) { + *this = rhs; + } + RGWObjManifest& operator=(const RGWObjManifest& rhs) { + explicit_objs = rhs.explicit_objs; + objs = rhs.objs; + obj_size = rhs.obj_size; + obj = rhs.obj; + head_size = rhs.head_size; + max_head_size = rhs.max_head_size; + prefix = rhs.prefix; + tail_placement = rhs.tail_placement; + rules = rhs.rules; + tail_instance = rhs.tail_instance; + + begin_iter.set_manifest(this); + end_iter.set_manifest(this); + + begin_iter.seek(rhs.begin_iter.get_ofs()); + end_iter.seek(rhs.end_iter.get_ofs()); + + return *this; + } + + map& get_explicit_objs() { + return objs; + } + + + void set_explicit(uint64_t _size, map& _objs) { + explicit_objs = true; + obj_size = _size; + objs.swap(_objs); + } + + void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs, string *override_prefix, rgw_obj_select *location); + + void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) { + RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size); + rules[0] = rule; + max_head_size = tail_ofs; + } + + void set_multipart_part_rule(uint64_t 
stripe_max_size, uint64_t part_num) { + RGWObjManifestRule rule(0, 0, 0, stripe_max_size); + rule.start_part_num = part_num; + rules[0] = rule; + max_head_size = 0; + } + + void encode(bufferlist& bl) const { + ENCODE_START(7, 6, bl); + encode(obj_size, bl); + encode(objs, bl); + encode(explicit_objs, bl); + encode(obj, bl); + encode(head_size, bl); + encode(max_head_size, bl); + encode(prefix, bl); + encode(rules, bl); + bool encode_tail_bucket = !(tail_placement.bucket == obj.bucket); + encode(encode_tail_bucket, bl); + if (encode_tail_bucket) { + encode(tail_placement.bucket, bl); + } + bool encode_tail_instance = (tail_instance != obj.key.instance); + encode(encode_tail_instance, bl); + if (encode_tail_instance) { + encode(tail_instance, bl); + } + encode(head_placement_rule, bl); + encode(tail_placement.placement_rule, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN_32(7, 2, 2, bl); + decode(obj_size, bl); + decode(objs, bl); + if (struct_v >= 3) { + decode(explicit_objs, bl); + decode(obj, bl); + decode(head_size, bl); + decode(max_head_size, bl); + decode(prefix, bl); + decode(rules, bl); + } else { + explicit_objs = true; + if (!objs.empty()) { + map::iterator iter = objs.begin(); + obj = iter->second.loc; + head_size = iter->second.size; + max_head_size = head_size; + } + } + + if (explicit_objs && head_size > 0 && !objs.empty()) { + /* patch up manifest due to issue 16435: + * the first object in the explicit objs list might not be the one we need to access, use the + * head object instead if set. This would happen if we had an old object that was created + * when the explicit objs manifest was around, and it got copied. 
+ */ + rgw_obj& obj_0 = objs[0].loc; + if (!obj_0.get_oid().empty() && obj_0.key.ns.empty()) { + objs[0].loc = obj; + objs[0].size = head_size; + } + } + + if (struct_v >= 4) { + if (struct_v < 6) { + decode(tail_placement.bucket, bl); + } else { + bool need_to_decode; + decode(need_to_decode, bl); + if (need_to_decode) { + decode(tail_placement.bucket, bl); + } else { + tail_placement.bucket = obj.bucket; + } + } + } + + if (struct_v >= 5) { + if (struct_v < 6) { + decode(tail_instance, bl); + } else { + bool need_to_decode; + decode(need_to_decode, bl); + if (need_to_decode) { + decode(tail_instance, bl); + } else { + tail_instance = obj.key.instance; + } + } + } else { // old object created before 'tail_instance' field added to manifest + tail_instance = obj.key.instance; + } + + if (struct_v >= 7) { + decode(head_placement_rule, bl); + decode(tail_placement.placement_rule, bl); + } + + update_iterators(); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + int append(RGWObjManifest& m, const RGWZoneGroup& zonegroup, + const RGWZoneParams& zone_params); + int append(RGWObjManifest& m, RGWSI_Zone *zone_svc); + + bool get_rule(uint64_t ofs, RGWObjManifestRule *rule); + + bool empty() { + if (explicit_objs) + return objs.empty(); + return rules.empty(); + } + + bool has_explicit_objs() { + return explicit_objs; + } + + bool has_tail() { + if (explicit_objs) { + if (objs.size() == 1) { + map::iterator iter = objs.begin(); + rgw_obj& o = iter->second.loc; + return !(obj == o); + } + return (objs.size() >= 2); + } + return (obj_size > head_size); + } + + void set_head(const rgw_placement_rule& placement_rule, const rgw_obj& _o, uint64_t _s) { + head_placement_rule = placement_rule; + obj = _o; + head_size = _s; + + if (explicit_objs && head_size > 0) { + objs[0].loc = obj; + objs[0].size = head_size; + } + } + + const rgw_obj& get_obj() { + return obj; + } + + void set_tail_placement(const rgw_placement_rule& 
placement_rule, const rgw_bucket& _b) { + tail_placement.placement_rule = placement_rule; + tail_placement.bucket = _b; + } + + const rgw_bucket_placement& get_tail_placement() { + return tail_placement; + } + + const rgw_placement_rule& get_head_placement_rule() { + return head_placement_rule; + } + + void set_prefix(const string& _p) { + prefix = _p; + } + + const string& get_prefix() { + return prefix; + } + + void set_tail_instance(const string& _ti) { + tail_instance = _ti; + } + + const string& get_tail_instance() { + return tail_instance; + } + + void set_head_size(uint64_t _s) { + head_size = _s; + } + + void set_obj_size(uint64_t s) { + obj_size = s; + + update_iterators(); + } + + uint64_t get_obj_size() { + return obj_size; + } + + uint64_t get_head_size() { + return head_size; + } + + uint64_t get_max_head_size() { + return max_head_size; + } + + class obj_iterator { + RGWObjManifest *manifest; + uint64_t part_ofs; /* where current part starts */ + uint64_t stripe_ofs; /* where current stripe starts */ + uint64_t ofs; /* current position within the object */ + uint64_t stripe_size; /* current part size */ + + int cur_part_id; + int cur_stripe; + string cur_override_prefix; + + rgw_obj_select location; + + map::iterator rule_iter; + map::iterator next_rule_iter; + + map::iterator explicit_iter; + + void init() { + part_ofs = 0; + stripe_ofs = 0; + ofs = 0; + stripe_size = 0; + cur_part_id = 0; + cur_stripe = 0; + } + + void update_explicit_pos(); + + + protected: + + void set_manifest(RGWObjManifest *m) { + manifest = m; + } + + public: + obj_iterator() : manifest(NULL) { + init(); + } + explicit obj_iterator(RGWObjManifest *_m) : manifest(_m) { + init(); + if (!manifest->empty()) { + seek(0); + } + } + obj_iterator(RGWObjManifest *_m, uint64_t _ofs) : manifest(_m) { + init(); + if (!manifest->empty()) { + seek(_ofs); + } + } + void seek(uint64_t ofs); + + void operator++(); + bool operator==(const obj_iterator& rhs) const { + return (ofs == rhs.ofs); + 
} + bool operator!=(const obj_iterator& rhs) const { + return (ofs != rhs.ofs); + } + const rgw_obj_select& get_location() const { + return location; + } + + /* where current part starts */ + uint64_t get_part_ofs() const { + return part_ofs; + } + + /* start of current stripe */ + uint64_t get_stripe_ofs() const { + if (manifest->explicit_objs) { + return explicit_iter->first; + } + return stripe_ofs; + } + + /* current ofs relative to start of rgw object */ + uint64_t get_ofs() const { + return ofs; + } + + /* stripe number */ + int get_cur_stripe() const { + return cur_stripe; + } + + /* current stripe size */ + uint64_t get_stripe_size() const { + if (manifest->explicit_objs) { + return explicit_iter->second.size; + } + return stripe_size; + } + + /* offset where data starts within current stripe */ + uint64_t location_ofs() const { + if (manifest->explicit_objs) { + return explicit_iter->second.loc_ofs; + } + return 0; /* all stripes start at zero offset */ + } + + void update_location(); + + friend class RGWObjManifest; + void dump(Formatter *f) const; + }; + + const obj_iterator& obj_begin(); + const obj_iterator& obj_end(); + obj_iterator obj_find(uint64_t ofs); + + obj_iterator begin_iter; + obj_iterator end_iter; + + /* + * simple object generator. Using a simple single rule manifest. 
+ */ + class generator { + RGWObjManifest *manifest; + uint64_t last_ofs; + uint64_t cur_part_ofs; + int cur_part_id; + int cur_stripe; + uint64_t cur_stripe_size; + string cur_oid; + + string oid_prefix; + + rgw_obj_select cur_obj; + + RGWObjManifestRule rule; + + public: + generator() : manifest(NULL), last_ofs(0), cur_part_ofs(0), cur_part_id(0), + cur_stripe(0), cur_stripe_size(0) {} + int create_begin(CephContext *cct, RGWObjManifest *manifest, + const rgw_placement_rule& head_placement_rule, + const rgw_placement_rule *tail_placement_rule, + const rgw_bucket& bucket, + const rgw_obj& obj); + + int create_next(uint64_t ofs); + + rgw_raw_obj get_cur_obj(RGWZoneGroup& zonegroup, RGWZoneParams& zone_params) { return cur_obj.get_raw_obj(zonegroup, zone_params); } + rgw_raw_obj get_cur_obj(RGWRados *store) const { return cur_obj.get_raw_obj(store); } + + /* total max size of current stripe (including head obj) */ + uint64_t cur_stripe_max_size() const { + return cur_stripe_size; + } + }; +}; +WRITE_CLASS_ENCODER(RGWObjManifest) + +struct RGWUploadPartInfo { + uint32_t num; + uint64_t size; + uint64_t accounted_size{0}; + string etag; + ceph::real_time modified; + RGWObjManifest manifest; + RGWCompressionInfo cs_info; + + RGWUploadPartInfo() : num(0), size(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(4, 2, bl); + encode(num, bl); + encode(size, bl); + encode(etag, bl); + encode(modified, bl); + encode(manifest, bl); + encode(cs_info, bl); + encode(accounted_size, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); + decode(num, bl); + decode(size, bl); + decode(etag, bl); + decode(modified, bl); + if (struct_v >= 3) + decode(manifest, bl); + if (struct_v >= 4) { + decode(cs_info, bl); + decode(accounted_size, bl); + } else { + accounted_size = size; + } + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; 
+WRITE_CLASS_ENCODER(RGWUploadPartInfo) + +struct RGWObjState { + rgw_obj obj; + bool is_atomic; + bool has_attrs; + bool exists; + uint64_t size; //< size of raw object + uint64_t accounted_size{0}; //< size before compression, encryption + ceph::real_time mtime; + uint64_t epoch; + bufferlist obj_tag; + bufferlist tail_tag; + string write_tag; + bool fake_tag; + RGWObjManifest manifest; + bool has_manifest; + string shadow_obj; + bool has_data; + bufferlist data; + bool prefetch_data; + bool keep_tail; + bool is_olh; + bufferlist olh_tag; + uint64_t pg_ver; + uint32_t zone_short_id; + + /* important! don't forget to update copy constructor */ + + RGWObjVersionTracker objv_tracker; + + map attrset; + RGWObjState() : is_atomic(false), has_attrs(0), exists(false), + size(0), epoch(0), fake_tag(false), has_manifest(false), + has_data(false), prefetch_data(false), keep_tail(false), is_olh(false), + pg_ver(0), zone_short_id(0) {} + RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) { + is_atomic = rhs.is_atomic; + has_attrs = rhs.has_attrs; + exists = rhs.exists; + size = rhs.size; + accounted_size = rhs.accounted_size; + mtime = rhs.mtime; + epoch = rhs.epoch; + if (rhs.obj_tag.length()) { + obj_tag = rhs.obj_tag; + } + if (rhs.tail_tag.length()) { + tail_tag = rhs.tail_tag; + } + write_tag = rhs.write_tag; + fake_tag = rhs.fake_tag; + if (rhs.has_manifest) { + manifest = rhs.manifest; + } + has_manifest = rhs.has_manifest; + shadow_obj = rhs.shadow_obj; + has_data = rhs.has_data; + if (rhs.data.length()) { + data = rhs.data; + } + prefetch_data = rhs.prefetch_data; + keep_tail = rhs.keep_tail; + is_olh = rhs.is_olh; + objv_tracker = rhs.objv_tracker; + pg_ver = rhs.pg_ver; + } + + bool get_attr(string name, bufferlist& dest) { + map::iterator iter = attrset.find(name); + if (iter != attrset.end()) { + dest = iter->second; + return true; + } + return false; + } +}; + +struct RGWRawObjState { + rgw_raw_obj obj; + bool has_attrs{false}; + bool exists{false}; + uint64_t 
size{0}; + ceph::real_time mtime; + uint64_t epoch{0}; + bufferlist obj_tag; + bool has_data{false}; + bufferlist data; + bool prefetch_data{false}; + uint64_t pg_ver{0}; + + /* important! don't forget to update copy constructor */ + + RGWObjVersionTracker objv_tracker; + + map attrset; + RGWRawObjState() {} + RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) { + has_attrs = rhs.has_attrs; + exists = rhs.exists; + size = rhs.size; + mtime = rhs.mtime; + epoch = rhs.epoch; + if (rhs.obj_tag.length()) { + obj_tag = rhs.obj_tag; + } + has_data = rhs.has_data; + if (rhs.data.length()) { + data = rhs.data; + } + prefetch_data = rhs.prefetch_data; + pg_ver = rhs.pg_ver; + objv_tracker = rhs.objv_tracker; + } +}; + +struct RGWPoolIterCtx { + librados::IoCtx io_ctx; + librados::NObjectIterator iter; +}; + +struct RGWListRawObjsCtx { + bool initialized; + RGWPoolIterCtx iter_ctx; + + RGWListRawObjsCtx() : initialized(false) {} +}; + +struct objexp_hint_entry { + string tenant; + string bucket_name; + string bucket_id; + rgw_obj_key obj_key; + ceph::real_time exp_time; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(bucket_name, bl); + encode(bucket_id, bl); + encode(obj_key, bl); + encode(exp_time, bl); + encode(tenant, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ? 
+ DECODE_START(2, bl); + decode(bucket_name, bl); + decode(bucket_id, bl); + decode(obj_key, bl); + decode(exp_time, bl); + if (struct_v >= 2) { + decode(tenant, bl); + } else { + tenant.clear(); + } + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(objexp_hint_entry) + +class RGWDataChangesLog; +class RGWMetaSyncStatusManager; +class RGWDataSyncStatusManager; +class RGWCoroutinesManagerRegistry; + +class RGWGetBucketStats_CB : public RefCountedObject { +protected: + rgw_bucket bucket; + map *stats; +public: + explicit RGWGetBucketStats_CB(const rgw_bucket& _bucket) : bucket(_bucket), stats(NULL) {} + ~RGWGetBucketStats_CB() override {} + virtual void handle_response(int r) = 0; + virtual void set_response(map *_stats) { + stats = _stats; + } +}; + +class RGWGetUserStats_CB : public RefCountedObject { +protected: + rgw_user user; + RGWStorageStats stats; +public: + explicit RGWGetUserStats_CB(const rgw_user& _user) : user(_user) {} + ~RGWGetUserStats_CB() override {} + virtual void handle_response(int r) = 0; + virtual void set_response(RGWStorageStats& _stats) { + stats = _stats; + } +}; + +class RGWGetDirHeader_CB; +class RGWGetUserHeader_CB; + +class RGWObjectCtx { + RGWRados *store; + RWLock lock{"RGWObjectCtx"}; + void *s{nullptr}; + + std::map objs_state; +public: + explicit RGWObjectCtx(RGWRados *_store) : store(_store) {} + explicit RGWObjectCtx(RGWRados *_store, void *_s) : store(_store), s(_s) {} + + void *get_private() { + return s; + } + + RGWRados *get_store() { + return store; + } + + RGWObjState *get_state(const rgw_obj& obj) { + RGWObjState *result; + typename std::map::iterator iter; + lock.get_read(); + assert (!obj.empty()); + iter = objs_state.find(obj); + if (iter != objs_state.end()) { + result = &iter->second; + lock.unlock(); + } else { + lock.unlock(); + lock.get_write(); + result = &objs_state[obj]; + lock.unlock(); + } + return result; + } + + void set_atomic(rgw_obj& obj) { + RWLock::WLocker wl(lock); + assert (!obj.empty()); + 
objs_state[obj].is_atomic = true; + } + void set_prefetch_data(const rgw_obj& obj) { + RWLock::WLocker wl(lock); + assert (!obj.empty()); + objs_state[obj].prefetch_data = true; + } + + void invalidate(const rgw_obj& obj) { + RWLock::WLocker wl(lock); + auto iter = objs_state.find(obj); + if (iter == objs_state.end()) { + return; + } + bool is_atomic = iter->second.is_atomic; + bool prefetch_data = iter->second.prefetch_data; + + objs_state.erase(iter); + + if (is_atomic || prefetch_data) { + auto& state = objs_state[obj]; + state.is_atomic = is_atomic; + state.prefetch_data = prefetch_data; + } + } +}; + +class RGWAsyncRadosProcessor; + +template +class RGWChainedCacheImpl; + +struct bucket_info_entry { + RGWBucketInfo info; + real_time mtime; + map attrs; +}; + +struct tombstone_entry { + ceph::real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + + tombstone_entry() = default; + explicit tombstone_entry(const RGWObjState& state) + : mtime(state.mtime), zone_short_id(state.zone_short_id), + pg_ver(state.pg_ver) {} +}; + +class RGWIndexCompletionManager; + +class RGWRados : public AdminSocketHook +{ + friend class RGWGC; + friend class RGWMetaNotifier; + friend class RGWDataNotifier; + friend class RGWLC; + friend class RGWObjectExpirer; + friend class RGWMetaSyncProcessorThread; + friend class RGWDataSyncProcessorThread; + friend class RGWReshard; + friend class RGWBucketReshard; + friend class RGWBucketReshardLock; + friend class BucketIndexLockGuard; + friend class RGWCompleteMultipart; + + static constexpr const char* admin_commands[4][3] = { + { "cache list", + "cache list name=filter,type=CephString,req=false", + "cache list [filter_str]: list object cache, possibly matching substrings" }, + { "cache inspect", + "cache inspect name=target,type=CephString,req=true", + "cache inspect target: print cache element" }, + { "cache erase", + "cache erase name=target,type=CephString,req=true", + "cache erase target: erase element from cache" }, + { "cache 
zap", + "cache zap", + "cache zap: erase all elements from cache" } + }; + + /** Open the pool used as root for this gateway */ + int open_root_pool_ctx(); + int open_gc_pool_ctx(); + int open_lc_pool_ctx(); + int open_objexp_pool_ctx(); + int open_reshard_pool_ctx(); + + int open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx, + bool mostly_omap); + int open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx); + int open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid); + int open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + string& bucket_oid_base); + int open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + const string& obj_key, string *bucket_obj, int *shard_id); + int open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + int shard_id, string *bucket_obj); + int open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + map& bucket_objs, int shard_id = -1, map *bucket_instance_ids = NULL); + template + int open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, + map& oids, map& bucket_objs, + int shard_id = -1, map *bucket_instance_ids = NULL); + void build_bucket_index_marker(const string& shard_id_str, const string& shard_marker, + string *marker); + + void get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map *result); + + std::atomic max_req_id = { 0 }; + Mutex lock; + SafeTimer *timer; + + RGWGC *gc; + RGWLC *lc; + RGWObjectExpirer *obj_expirer; + bool use_gc_thread; + bool use_lc_thread; + bool quota_threads; + bool run_sync_thread; + bool run_reshard_thread; + + RGWAsyncRadosProcessor* async_rados; + + RGWMetaNotifier *meta_notifier; + RGWDataNotifier *data_notifier; + RGWMetaSyncProcessorThread *meta_sync_processor_thread; + RGWSyncTraceManager *sync_tracer = nullptr; + map 
data_sync_processor_threads; + + boost::optional bucket_trim; + RGWSyncLogTrimThread *sync_log_trimmer{nullptr}; + + Mutex meta_sync_thread_lock; + Mutex data_sync_thread_lock; + + librados::IoCtx root_pool_ctx; // .rgw + + double inject_notify_timeout_probability = 0; + unsigned max_notify_retries = 0; + + friend class RGWWatcher; + + Mutex bucket_id_lock; + + // This field represents the number of bucket index object shards + uint32_t bucket_index_max_shards; + + int get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx); + int get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref); + int get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref); + uint64_t max_bucket_id; + + int get_olh_target_state(RGWObjectCtx& rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + RGWObjState *olh_state, RGWObjState **target_state); + int get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, + bool follow_olh, bool assume_noent = false); + int append_atomic_test(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + librados::ObjectOperation& op, RGWObjState **state); + int append_atomic_test(const RGWObjState* astate, librados::ObjectOperation& op); + + int update_placement_map(); + int store_bucket_info(RGWBucketInfo& info, map *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive); + + void remove_rgw_head_obj(librados::ObjectWriteOperation& op); + void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const string& prefix, bool fail_if_exist); + void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type); +protected: + CephContext *cct; + + librados::Rados rados; + + using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl; + RGWChainedCacheImpl_bucket_info_entry *binfo_cache; + + using tombstone_cache_t = 
lru_map; + tombstone_cache_t *obj_tombstone_cache; + + librados::IoCtx gc_pool_ctx; // .rgw.gc + librados::IoCtx lc_pool_ctx; // .rgw.lc + librados::IoCtx objexp_pool_ctx; + librados::IoCtx reshard_pool_ctx; + + bool pools_initialized; + + RGWQuotaHandler *quota_handler; + + RGWCoroutinesManagerRegistry *cr_registry; + + RGWSyncModuleInstanceRef sync_module; + bool writeable_zone{false}; + + RGWIndexCompletionManager *index_completion_manager{nullptr}; + + bool use_cache{false}; +public: + RGWRados(): lock("rados_timer_lock"), timer(NULL), + gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false), + run_sync_thread(false), run_reshard_thread(false), async_rados(nullptr), meta_notifier(NULL), + data_notifier(NULL), meta_sync_processor_thread(NULL), + meta_sync_thread_lock("meta_sync_thread_lock"), data_sync_thread_lock("data_sync_thread_lock"), + bucket_id_lock("rados_bucket_id"), + bucket_index_max_shards(0), + max_bucket_id(0), cct(NULL), + binfo_cache(NULL), obj_tombstone_cache(nullptr), + pools_initialized(false), + quota_handler(NULL), + cr_registry(NULL), + meta_mgr(NULL), data_log(NULL), reshard(NULL) {} + + RGWRados& set_use_cache(bool status) { + use_cache = status; + return *this; + } + + RGWLC *get_lc() { + return lc; + } + + RGWRados& set_run_gc_thread(bool _use_gc_thread) { + use_gc_thread = _use_gc_thread; + return *this; + } + + RGWRados& set_run_lc_thread(bool _use_lc_thread) { + use_lc_thread = _use_lc_thread; + return *this; + } + + RGWRados& set_run_quota_threads(bool _run_quota_threads) { + quota_threads = _run_quota_threads; + return *this; + } + + RGWRados& set_run_sync_thread(bool _run_sync_thread) { + run_sync_thread = _run_sync_thread; + return *this; + } + + RGWRados& set_run_reshard_thread(bool _run_reshard_thread) { + run_reshard_thread = _run_reshard_thread; + return *this; + } + + uint64_t get_new_req_id() { + return ++max_req_id; + } + + librados::IoCtx* get_lc_pool_ctx() { + return 
&lc_pool_ctx; + } + void set_context(CephContext *_cct) { + cct = _cct; + } + + RGWServices svc; + + /** + * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we + * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed. + */ + string host_id; + + // pulls missing periods for period_history + std::unique_ptr period_puller; + // maintains a connected history of periods + std::unique_ptr period_history; + + RGWAsyncRadosProcessor* get_async_rados() const { return async_rados; }; + + RGWMetadataManager *meta_mgr; + + RGWDataChangesLog *data_log; + + RGWReshard *reshard; + std::shared_ptr reshard_wait; + + virtual ~RGWRados() = default; + + tombstone_cache_t *get_tombstone_cache() { + return obj_tombstone_cache; + } + const RGWSyncModuleInstanceRef& get_sync_module() { + return sync_module; + } + RGWSyncTraceManager *get_sync_tracer() { + return sync_tracer; + } + + int get_required_alignment(const rgw_pool& pool, uint64_t *alignment); + void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size); + int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment = nullptr); + int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, uint64_t *palignment = nullptr); + + uint32_t get_max_bucket_shards() { + return rgw_shards_max(); + } + + + int get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref); + + int list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx); + int list_raw_objects_next(const string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated); + int list_raw_objects(const rgw_pool& pool, const string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated); + string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx); + + CephContext *ctx() { return cct; } + /** do all necessary setup of the 
storage device */ + int initialize(CephContext *_cct) { + set_context(_cct); + return initialize(); + } + /** Initialize the RADOS instance and prepare to do other ops */ + int init_svc(bool raw); + int init_rados(); + int init_complete(); + int initialize(); + void finalize(); + + int register_to_service_map(const string& daemon_type, const map& meta); + int update_service_map(std::map&& status); + + /// list logs + int log_list_init(const string& prefix, RGWAccessHandle *handle); + int log_list_next(RGWAccessHandle handle, string *name); + + /// remove log + int log_remove(const string& name); + + /// show log + int log_show_init(const string& name, RGWAccessHandle *handle); + int log_show_next(RGWAccessHandle handle, rgw_log_entry *entry); + + // log bandwidth info + int log_usage(map& usage_info); + int read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, map& usage); + int trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch); + int clear_usage(); + + int create_pool(const rgw_pool& pool); + + int init_bucket_index(RGWBucketInfo& bucket_info, int num_shards); + int clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards); + void create_bucket_id(string *bucket_id); + + bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool); + bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj); + + int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, + const string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + map& attrs, + RGWBucketInfo& bucket_info, + obj_version *pobjv, + obj_version *pep_objv, + ceph::real_time creation_time, + rgw_bucket *master_bucket, + uint32_t *master_num_shards, + bool exclusive = true); + + 
RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; } + + struct BucketShard { + RGWRados *store; + rgw_bucket bucket; + int shard_id; + librados::IoCtx index_ctx; + string bucket_obj; + + explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {} + int init(const rgw_bucket& _bucket, const rgw_obj& obj, RGWBucketInfo* out); + int init(const rgw_bucket& _bucket, int sid, RGWBucketInfo* out); + int init(const RGWBucketInfo& bucket_info, const rgw_obj& obj); + int init(const RGWBucketInfo& bucket_info, int sid); + }; + + class Object { + RGWRados *store; + RGWBucketInfo bucket_info; + RGWObjectCtx& ctx; + rgw_obj obj; + + BucketShard bs; + + RGWObjState *state; + + bool versioning_disabled; + + bool bs_initialized; + + protected: + int get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent = false); + void invalidate_state(); + + int prepare_atomic_modification(librados::ObjectWriteOperation& op, bool reset_obj, const string *ptag, + const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail); + int complete_atomic_modification(); + + public: + Object(RGWRados *_store, const RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info), + ctx(_ctx), obj(_obj), bs(store), + state(NULL), versioning_disabled(false), + bs_initialized(false) {} + + RGWRados *get_store() { return store; } + rgw_obj& get_obj() { return obj; } + RGWObjectCtx& get_ctx() { return ctx; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + int get_manifest(RGWObjManifest **pmanifest); + + int get_bucket_shard(BucketShard **pbs) { + if (!bs_initialized) { + int r = + bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */); + if (r < 0) { + return r; + } + bs_initialized = true; + } + *pbs = &bs; + return 0; + } + + void set_versioning_disabled(bool status) { + versioning_disabled = status; + } + + bool versioning_enabled() { + return (!versioning_disabled && 
bucket_info.versioning_enabled()); + } + + struct Read { + RGWRados::Object *source; + + struct GetObjState { + map io_ctxs; + rgw_pool cur_pool; + librados::IoCtx *cur_ioctx{nullptr}; + rgw_obj obj; + rgw_raw_obj head_obj; + } state; + + struct ConditionParams { + const ceph::real_time *mod_ptr; + const ceph::real_time *unmod_ptr; + bool high_precision_time; + uint32_t mod_zone_id; + uint64_t mod_pg_ver; + const char *if_match; + const char *if_nomatch; + + ConditionParams() : + mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0), + if_match(NULL), if_nomatch(NULL) {} + } conds; + + struct Params { + ceph::real_time *lastmod; + uint64_t *obj_size; + map *attrs; + rgw_obj *target_obj; + + Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr), + target_obj(nullptr) {} + } params; + + explicit Read(RGWRados::Object *_source) : source(_source) {} + + int prepare(); + static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); + int read(int64_t ofs, int64_t end, bufferlist& bl); + int iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb); + int get_attr(const char *name, bufferlist& dest); + }; + + struct Write { + RGWRados::Object *target; + + struct MetaParams { + ceph::real_time *mtime; + map* rmattrs; + const bufferlist *data; + RGWObjManifest *manifest; + const string *ptag; + list *remove_objs; + ceph::real_time set_mtime; + rgw_user owner; + RGWObjCategory category; + int flags; + const char *if_match; + const char *if_nomatch; + std::optional olh_epoch; + ceph::real_time delete_at; + bool canceled; + const string *user_data; + rgw_zone_set *zones_trace; + bool modify_tail; + bool completeMultipart; + bool appendable; + + MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL), + remove_objs(NULL), category(RGWObjCategory::Main), flags(0), + if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr), + modify_tail(false), completeMultipart(false), 
appendable(false) {} + } meta; + + explicit Write(RGWRados::Object *_target) : target(_target) {} + + int _do_write_meta(uint64_t size, uint64_t accounted_size, + map& attrs, + bool modify_tail, bool assume_noent, + void *index_op); + int write_meta(uint64_t size, uint64_t accounted_size, + map& attrs); + int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive); + const req_state* get_req_state() { + return (req_state *)target->get_ctx().get_private(); + } + }; + + struct Delete { + RGWRados::Object *target; + + struct DeleteParams { + rgw_user bucket_owner; + int versioning_status; + ACLOwner obj_owner; /* needed for creation of deletion marker */ + uint64_t olh_epoch; + string marker_version_id; + uint32_t bilog_flags; + list *remove_objs; + ceph::real_time expiration_time; + ceph::real_time unmod_since; + ceph::real_time mtime; /* for setting delete marker mtime */ + bool high_precision_time; + rgw_zone_set *zones_trace; + bool abortmp; + uint64_t parts_accounted_size; + + DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {} + } params; + + struct DeleteResult { + bool delete_marker; + string version_id; + + DeleteResult() : delete_marker(false) {} + } result; + + explicit Delete(RGWRados::Object *_target) : target(_target) {} + + int delete_obj(); + }; + + struct Stat { + RGWRados::Object *source; + + struct Result { + rgw_obj obj; + RGWObjManifest manifest; + bool has_manifest; + uint64_t size; + struct timespec mtime; + map attrs; + + Result() : has_manifest(false), size(0) {} + } result; + + struct State { + librados::IoCtx io_ctx; + librados::AioCompletion *completion; + int ret; + + State() : completion(NULL), ret(0) {} + } state; + + + explicit Stat(RGWRados::Object *_source) : source(_source) {} + + int stat_async(); + int wait(); + int stat(); + private: + int finish(); + }; + }; + + class Bucket { + 
RGWRados *store; + RGWBucketInfo bucket_info; + rgw_bucket& bucket; + int shard_id; + + public: + Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket), + shard_id(RGW_NO_SHARD) {} + RGWRados *get_store() { return store; } + rgw_bucket& get_bucket() { return bucket; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + + int update_bucket_id(const string& new_bucket_id); + + int get_shard_id() { return shard_id; } + void set_shard_id(int id) { + shard_id = id; + } + + class UpdateIndex { + RGWRados::Bucket *target; + string optag; + rgw_obj obj; + uint16_t bilog_flags{0}; + BucketShard bs; + bool bs_initialized{false}; + bool blind; + bool prepared{false}; + rgw_zone_set *zones_trace{nullptr}; + + int init_bs() { + int r = + bs.init(target->get_bucket(), obj, nullptr /* no RGWBucketInfo */); + if (r < 0) { + return r; + } + bs_initialized = true; + return 0; + } + + void invalidate_bs() { + bs_initialized = false; + } + + int guard_reshard(BucketShard **pbs, std::function call); + public: + + UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj), + bs(target->get_store()) { + blind = (target->get_bucket_info().index_type == RGWBIType_Indexless); + } + + int get_bucket_shard(BucketShard **pbs) { + if (!bs_initialized) { + int r = init_bs(); + if (r < 0) { + return r; + } + } + *pbs = &bs; + return 0; + } + + void set_bilog_flags(uint16_t flags) { + bilog_flags = flags; + } + + void set_zones_trace(rgw_zone_set *_zones_trace) { + zones_trace = _zones_trace; + } + + int prepare(RGWModifyOp, const string *write_tag); + int complete(int64_t poolid, uint64_t epoch, uint64_t size, + uint64_t accounted_size, ceph::real_time& ut, + const string& etag, const string& content_type, + const string& storage_class, + bufferlist *acl_bl, RGWObjCategory category, + list *remove_objs, const string *user_data = nullptr, bool appendable = false); + int 
complete_del(int64_t poolid, uint64_t epoch, + ceph::real_time& removed_mtime, /* mtime of removed object */ + list *remove_objs); + int cancel(); + + const string *get_optag() { return &optag; } + + bool is_prepared() { return prepared; } + }; // class UpdateIndex + + class List { + protected: + // absolute maximum number of objects that + // list_objects_(un)ordered can return + static constexpr int64_t bucket_list_objects_absolute_max = 25000; + + RGWRados::Bucket *target; + rgw_obj_key next_marker; + + int list_objects_ordered(int64_t max, + vector *result, + map *common_prefixes, + bool *is_truncated); + int list_objects_unordered(int64_t max, + vector *result, + map *common_prefixes, + bool *is_truncated); + + public: + + struct Params { + string prefix; + string delim; + rgw_obj_key marker; + rgw_obj_key end_marker; + string ns; + bool enforce_ns; + RGWAccessListFilter *filter; + bool list_versions; + bool allow_unordered; + + Params() : + enforce_ns(true), + filter(NULL), + list_versions(false), + allow_unordered(false) + {} + } params; + + explicit List(RGWRados::Bucket *_target) : target(_target) {} + + int list_objects(int64_t max, + vector *result, + map *common_prefixes, + bool *is_truncated) { + if (params.allow_unordered) { + return list_objects_unordered(max, result, common_prefixes, + is_truncated); + } else { + return list_objects_ordered(max, result, common_prefixes, + is_truncated); + } + } + rgw_obj_key& get_next_marker() { + return next_marker; + } + }; // class List + }; // class Bucket + + int on_last_entry_in_listing(RGWBucketInfo& bucket_info, + const std::string& obj_prefix, + const std::string& obj_delim, + std::function handler); + + bool swift_versioning_enabled(const RGWBucketInfo& bucket_info) const { + return bucket_info.has_swift_versioning() && + bucket_info.swift_ver_location.size(); + } + + int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */ + const rgw_user& user, /* in */ + RGWBucketInfo& bucket_info, /* in */ + 
rgw_obj& obj); /* in */ + int swift_versioning_restore(RGWSysObjectCtx& sysobj_ctx, + RGWObjectCtx& obj_ctx, /* in/out */ + const rgw_user& user, /* in */ + RGWBucketInfo& bucket_info, /* in */ + rgw_obj& obj, /* in */ + bool& restored); /* out */ + int copy_obj_to_remote_dest(RGWObjState *astate, + map& src_attrs, + RGWRados::Object::Read& read_op, + const rgw_user& user_id, + rgw_obj& dest_obj, + ceph::real_time *mtime); + + enum AttrsMod { + ATTRSMOD_NONE = 0, + ATTRSMOD_REPLACE = 1, + ATTRSMOD_MERGE = 2 + }; + + int rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj); + + int stat_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const string& source_zone, + rgw_obj& src_obj, + RGWBucketInfo& src_bucket_info, + real_time *src_mtime, + uint64_t *psize, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + map *pattrs, + map *pheaders, + string *version_id, + string *ptag, + string *petag); + + int fetch_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const string& source_zone, + const rgw_obj& dest_obj, + const rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo& src_bucket_info, + std::optional dest_placement, + ceph::real_time *src_mtime, + ceph::real_time *mtime, + const ceph::real_time *mod_ptr, + const ceph::real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + map& attrs, + RGWObjCategory category, + std::optional olh_epoch, + ceph::real_time delete_at, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + rgw_zone_set *zones_trace= nullptr, + std::optional* bytes_transferred = 0); + /** + * Copy an object. 
+ * dest_obj: the object to copy into + * src_obj: the object to copy from + * attrs: usage depends on attrs_mod parameter + * attrs_mod: the modification mode of the attrs, may have the following values: + * ATTRSMOD_NONE - the attributes of the source object will be + * copied without modifications, attrs parameter is ignored; + * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs + * parameter, source object attributes are not copied; + * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes + * are overwritten by values contained in attrs parameter. + * Returns: 0 on success, -ERR# otherwise. + */ + int copy_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const string& source_zone, + rgw_obj& dest_obj, + rgw_obj& src_obj, + RGWBucketInfo& dest_bucket_info, + RGWBucketInfo& src_bucket_info, + const rgw_placement_rule& dest_placement, + ceph::real_time *src_mtime, + ceph::real_time *mtime, + const ceph::real_time *mod_ptr, + const ceph::real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + map& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + ceph::real_time delete_at, + string *version_id, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data); + + int copy_obj_data(RGWObjectCtx& obj_ctx, + RGWBucketInfo& dest_bucket_info, + const rgw_placement_rule& dest_placement, + RGWRados::Object::Read& read_op, off_t end, + const rgw_obj& dest_obj, + ceph::real_time *mtime, + ceph::real_time set_mtime, + map& attrs, + uint64_t olh_epoch, + ceph::real_time delete_at, + string *petag); + + int transition_obj(RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + rgw_obj& obj, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch); + + int check_bucket_empty(RGWBucketInfo& bucket_info); + + /** + * Delete a bucket. 
+ * bucket: the name of the bucket to delete + * Returns 0 on success, -ERR# otherwise. + */ + int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty = true); + + void wakeup_meta_sync_shards(set& shard_ids); + void wakeup_data_sync_shards(const string& source_zone, map >& shard_ids); + + RGWMetaSyncStatusManager* get_meta_sync_manager(); + RGWDataSyncStatusManager* get_data_sync_manager(const std::string& source_zone); + + int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner); + int set_buckets_enabled(std::vector& buckets, bool enabled); + int bucket_suspended(rgw_bucket& bucket, bool *suspended); + + /** Delete an object.*/ + int delete_obj(RGWObjectCtx& obj_ctx, + const RGWBucketInfo& bucket_owner, + const rgw_obj& src_obj, + int versioning_status, + uint16_t bilog_flags = 0, + const ceph::real_time& expiration_time = ceph::real_time(), + rgw_zone_set *zones_trace = nullptr); + + int delete_raw_obj(const rgw_raw_obj& obj); + + /** Remove an object from the bucket index */ + int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime); + + /** + * Set an attr on an object. + * bucket: name of the bucket holding the object + * obj: name of the object to set the attr on + * name: the attr to set + * bl: the contents of the attr + * Returns: 0 on success, -ERR# otherwise. 
+ */ + int set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl); + + int set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, + map& attrs, + map* rmattrs); + + int get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, + bool follow_olh, bool assume_noent = false); + int get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state) { + return get_obj_state(rctx, bucket_info, obj, state, true); + } + + using iterate_obj_cb = int (*)(const rgw_raw_obj&, off_t, off_t, + off_t, bool, RGWObjState*, void*); + + int iterate_obj(RGWObjectCtx& ctx, const RGWBucketInfo& bucket_info, + const rgw_obj& obj, off_t ofs, off_t end, + uint64_t max_chunk_size, iterate_obj_cb cb, void *arg); + + int flush_read_list(struct get_obj_data *d); + + int get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg); + + void get_obj_aio_completion_cb(librados::completion_t cb, void *arg); + + /** + * a simple object read without keeping state + */ + + int raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker); + + int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op); + int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op); + + int guard_reshard(BucketShard *bs, + const rgw_obj& obj_instance, + const RGWBucketInfo& bucket_info, + std::function call); + int block_while_resharding(RGWRados::BucketShard *bs, + string *new_bucket_id, + const RGWBucketInfo& bucket_info, + optional_yield y); + + void bucket_index_guard_olh_op(RGWObjState& olh_state, librados::ObjectOperation& op); + int olh_init_modification(const RGWBucketInfo& bucket_info, 
RGWObjState& state, const rgw_obj& olh_obj, string *op_tag); + int olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag); + int bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, + const rgw_obj& obj_instance, bool delete_marker, + const string& op_tag, struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, + ceph::real_time unmod_since, bool high_precision_time, + rgw_zone_set *zones_trace = nullptr, + bool log_data_change = false); + int bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr); + int bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver_marker, + map > *log, bool *is_truncated); + int bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver); + int bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance); + int apply_olh_log(RGWObjectCtx& ctx, RGWObjState& obj_state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + bufferlist& obj_tag, map >& log, + uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr); + int update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace = nullptr); + int set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time, + rgw_zone_set *zones_trace = nullptr, bool log_data_change = false); + int repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj); + int unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const 
rgw_obj& target_obj, + uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr); + + void check_pending_olh_entries(map& pending_entries, map *rm_pending_entries); + int remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map& pending_attrs); + int follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target); + int get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh); + + void gen_rand_obj_instance_name(rgw_obj_key *target_key); + void gen_rand_obj_instance_name(rgw_obj *target); + + int update_containers_stats(map& m); + int append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl); + +public: + void set_atomic(void *ctx, rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_atomic(obj); + } + void set_prefetch_data(void *ctx, const rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_prefetch_data(obj); + } + int decode_policy(bufferlist& bl, ACLOwner *owner); + int get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver, + map& stats, string *max_marker, bool* syncstopped = NULL); + int get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *cb); + int get_user_stats(const rgw_user& user, RGWStorageStats& stats); + int get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *cb); + void get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj); + void get_bucket_meta_oid(const rgw_bucket& bucket, string& oid); + + int put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point, + bool exclusive, RGWObjVersionTracker& objv_tracker, ceph::real_time mtime, + map *pattrs); + int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, map *pattrs); + int get_bucket_entrypoint_info(RGWSysObjectCtx& obj_ctx, const string& 
tenant_name, const string& bucket_name, + RGWBucketEntryPoint& entry_point, RGWObjVersionTracker *objv_tracker, + ceph::real_time *pmtime, map *pattrs, rgw_cache_entry_info *cache_info = NULL, + boost::optional refresh_version = boost::none); + int get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, map *pattrs); + int get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, map *pattrs); + int get_bucket_instance_from_oid(RGWSysObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info, ceph::real_time *pmtime, map *pattrs, + rgw_cache_entry_info *cache_info = NULL, + boost::optional refresh_version = boost::none); + + int convert_old_bucket_info(RGWSysObjectCtx& obj_ctx, const string& tenant_name, const string& bucket_name); + static void make_bucket_entry_name(const string& tenant_name, const string& bucket_name, string& bucket_entry); + + +private: + int _get_bucket_info(RGWSysObjectCtx& obj_ctx, const string& tenant, + const string& bucket_name, RGWBucketInfo& info, + real_time *pmtime, + map *pattrs, + boost::optional refresh_version); +public: + + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, + bufferlist& out) override final; + +protected: + // `call_list` must iterate over all cache entries and call + // `cache_list_dump_helper` with the supplied Formatter on any that + // include `filter` as a substring. + // + void call_list(const std::optional& filter, + Formatter* format); + // `call_inspect` must look up the requested target and, if found, + // dump it to the supplied Formatter and return true. If not found, + // it must return false. + // + bool call_inspect(const std::string& target, Formatter* format); + + // `call_erase` must erase the requested target and return true. If + // the requested target does not exist, it should return false. 
+ bool call_erase(const std::string& target); + + // `call_zap` must erase the cache. + void call_zap(); +public: + + int get_bucket_info(RGWSysObjectCtx& obj_ctx, + const string& tenant_name, const string& bucket_name, + RGWBucketInfo& info, + ceph::real_time *pmtime, map *pattrs = NULL); + + // Returns 0 on successful refresh. Returns error code if there was + // an error or the version stored on the OSD is the same as that + // presented in the BucketInfo structure. + // + int try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + map *pattrs = nullptr); + + int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv, + map *pattrs, bool create_entry_point); + + int cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag, int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent, + RGWObjCategory category, list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_del(BucketShard& bs, string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj, + ceph::real_time& removed_mtime, list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout); + int cls_bucket_list_ordered(RGWBucketInfo& bucket_info, + const int shard_id, + const rgw_obj_index_key& start_after, + const string& prefix, + const uint32_t num_entries, + const bool 
list_versions, + const uint16_t exp_factor, // 0 means ignore + map& m, + bool *is_truncated, + rgw_obj_index_key *last_entry, + bool (*force_check_filter)(const string& name) = nullptr); + int cls_bucket_list_unordered(RGWBucketInfo& bucket_info, int shard_id, + const rgw_obj_index_key& start, + const string& prefix, + uint32_t num_entries, bool list_versions, + vector& ent_list, + bool *is_truncated, rgw_obj_index_key *last_entry, + bool (*force_check_filter)(const string& name) = nullptr); + int cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector& headers, map *bucket_instance_ids = NULL); + int cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio); + int list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max, std::list& result, bool *truncated); + int trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, string& end_marker); + int resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id); + int stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id); + int get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id, map& max_marker); + + int bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent); + int bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh); + int bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry); + void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry); + int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry); + int bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry); + int bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list *entries, bool *is_truncated); + int bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list *entries, 
bool *is_truncated); + int bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, + list *entries, bool *is_truncated); + int bi_remove(BucketShard& bs); + + int cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info); + int cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, string& read_iter, map& usage, bool *is_truncated); + int cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket, uint64_t start_epoch, + uint64_t end_epoch); + int cls_obj_usage_log_clear(string& oid); + + int key_to_shard_id(const string& key, int max_shards); + void shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id); + void shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name); + void shard_name(const string& prefix, unsigned shard_id, string& name); + int get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key, int *shard_id); + void time_log_prepare_entry(cls_log_entry& entry, const ceph::real_time& ut, const string& section, const string& key, bufferlist& bl); + int time_log_add_init(librados::IoCtx& io_ctx); + int time_log_add(const string& oid, list& entries, + librados::AioCompletion *completion, bool monotonic_inc = true); + int time_log_add(const string& oid, const ceph::real_time& ut, const string& section, const string& key, bufferlist& bl); + int time_log_list(const string& oid, const ceph::real_time& start_time, const ceph::real_time& end_time, + int max_entries, list& entries, + const string& marker, string *out_marker, bool *truncated); + int time_log_info(const string& oid, cls_log_header *header); + int time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion); + int time_log_trim(const string& oid, const 
ceph::real_time& start_time, const ceph::real_time& end_time, + const string& from_marker, const string& to_marker, + librados::AioCompletion *completion = nullptr); + + string objexp_hint_get_shardname(int shard_num); + int objexp_key_shard(const rgw_obj_index_key& key); + void objexp_get_shard(int shard_num, + string& shard); /* out */ + int objexp_hint_add(const ceph::real_time& delete_at, + const string& tenant_name, + const string& bucket_name, + const string& bucket_id, + const rgw_obj_index_key& obj_key); + int objexp_hint_list(const string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const int max_entries, + const string& marker, + list& entries, /* out */ + string *out_marker, /* out */ + bool *truncated); /* out */ + int objexp_hint_parse(cls_timeindex_entry &ti_entry, + objexp_hint_entry& hint_entry); /* out */ + int objexp_hint_trim(const string& oid, + const ceph::real_time& start_time, + const ceph::real_time& end_time, + const string& from_marker = std::string(), + const string& to_marker = std::string()); + + int lock_exclusive(const rgw_pool& pool, const string& oid, ceph::timespan& duration, string& zone_id, string& owner_id); + int unlock(const rgw_pool& pool, const string& oid, string& zone_id, string& owner_id); + + void update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain); + int send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync); + int gc_operate(string& oid, librados::ObjectWriteOperation *op); + int gc_aio_operate(string& oid, librados::ObjectWriteOperation *op, librados::AioCompletion **pc = nullptr); + int gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl); + + int list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated); + int process_gc(bool expired_only); + bool process_expire_objects(); + int defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj); + + 
int process_lc(); + int list_lc_progress(const string& marker, uint32_t max_entries, map *progress_map); + + int bucket_check_index(RGWBucketInfo& bucket_info, + map *existing_stats, + map *calculated_stats); + int bucket_rebuild_index(RGWBucketInfo& bucket_info); + int bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry); + int remove_objs_from_index(RGWBucketInfo& bucket_info, list& oid_list); + int move_rados_obj(librados::IoCtx& src_ioctx, + const string& src_oid, const string& src_locator, + librados::IoCtx& dst_ioctx, + const string& dst_oid, const string& dst_locator); + int fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key); + int fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix); + + int cls_user_get_header(const string& user_id, cls_user_header *header); + int cls_user_reset_stats(const string& user_id); + int cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx); + int cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info); + int cls_user_list_buckets(rgw_raw_obj& obj, + const string& in_marker, + const string& end_marker, + int max_entries, + list& entries, + string *out_marker, + bool *truncated); + int cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry); + int cls_user_update_buckets(rgw_raw_obj& obj, list& entries, bool add); + int cls_user_complete_stats_sync(rgw_raw_obj& obj); + int complete_sync_user_stats(const rgw_user& user_id); + int cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket); + int cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry); + + int check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only = false); + + int check_bucket_shards(const RGWBucketInfo& bucket_info, 
const rgw_bucket& bucket, + RGWQuotaInfo& bucket_quota); + + int add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards); + + uint64_t instance_id(); + + librados::Rados* get_rados_handle(); + + int delete_raw_obj_aio(const rgw_raw_obj& obj, list& handles); + int delete_obj_aio(const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate, + list& handles, bool keep_index_consistent); + + /* mfa/totp stuff */ + private: + void prepare_mfa_write(librados::ObjectWriteOperation *op, + RGWObjVersionTracker *objv_tracker, + const ceph::real_time& mtime); + public: + string get_mfa_oid(const rgw_user& user); + int get_mfa_ref(const rgw_user& user, rgw_rados_ref *ref); + int check_mfa(const rgw_user& user, const string& otp_id, const string& pin); + int create_mfa(const rgw_user& user, const rados::cls::otp::otp_info_t& config, + RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime); + int remove_mfa(const rgw_user& user, const string& id, + RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime); + int get_mfa(const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result); + int list_mfa(const rgw_user& user, list *result); + int otp_get_current_time(const rgw_user& user, ceph::real_time *result); + + /* mfa interfaces used by metadata engine */ + int set_mfa(const string& oid, const list& entries, bool reset_obj, + RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime); + int list_mfa(const string& oid, list *result, + RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime); + private: + /** + * This is a helper method, it generates a list of bucket index objects with the given + * bucket base oid and number of shards. + * + * bucket_oid_base [in] - base name of the bucket index object; + * num_shards [in] - number of bucket index object shards. + * bucket_objs [out] - filled by this method, a list of bucket index objects. 
+ */ + void get_bucket_index_objects(const string& bucket_oid_base, uint32_t num_shards, + map& bucket_objs, int shard_id = -1); + + /** + * Get the bucket index object with the given base bucket index object and object key, + * and the number of bucket index shards. + * + * bucket_oid_base [in] - bucket object base name. + * obj_key [in] - object key. + * num_shards [in] - number of bucket index shards. + * hash_type [in] - type of hash to find the shard ID. + * bucket_obj [out] - the bucket index object for the given object. + * + * Return 0 on success, a failure code otherwise. + */ + int get_bucket_index_object(const string& bucket_oid_base, const string& obj_key, + uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard); + + void get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards, + int shard_id, string *bucket_obj); + + /** + * Check the actual on-disk state of the object specified + * by list_state, and fill in the time and size of object. + * Then append any changes to suggested_updates for + * the rgw class' dir_suggest_changes function. + * + * Note that this can maul list_state; don't use it afterwards. Also + * it expects object to already be filled in from list_state; it only + * sets the size and mtime. + * + * Returns 0 on success, -ENOENT if the object doesn't exist on disk, + * and -errno on other failures. (-ENOENT is not a failure, and it + * will encode that info as a suggested update.) + */ + int check_disk_state(librados::IoCtx io_ctx, + const RGWBucketInfo& bucket_info, + rgw_bucket_dir_entry& list_state, + rgw_bucket_dir_entry& object, + bufferlist& suggested_updates); + + /** + * Init pool iteration + * pool: pool to use for the ctx initialization + * ctx: context object to use for the iteration + * Returns: 0 on success, -ERR# otherwise. 
+ */ + int pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx); + + /** + * Init pool iteration + * pool: pool to use + * cursor: position to start iteration + * ctx: context object to use for the iteration + * Returns: 0 on success, -ERR# otherwise. + */ + int pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx); + + /** + * Get pool iteration position + * ctx: context object to use for the iteration + * Returns: string representation of position + */ + string pool_iterate_get_cursor(RGWPoolIterCtx& ctx); + + /** + * Iterate over pool return object names, use optional filter + * ctx: iteration context, initialized with pool_iterate_begin() + * num: max number of objects to return + * objs: a vector that the results will append into + * is_truncated: if not NULL, will hold true iff iteration is complete + * filter: if not NULL, will be used to filter returned objects + * Returns: 0 on success, -ERR# otherwise. + */ + int pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector& objs, + bool *is_truncated, RGWAccessListFilter *filter); + + uint64_t next_bucket_id(); + + /** + * This is broken out to facilitate unit testing. 
+ */ + static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries, + uint32_t num_shards); +}; + +class RGWStoreManager { +public: + RGWStoreManager() {} + static RGWRados *get_storage(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, + bool run_sync_thread, bool run_reshard_thread, bool use_cache = true) { + RGWRados *store = init_storage_provider(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, + run_reshard_thread, use_cache); + return store; + } + static RGWRados *get_raw_storage(CephContext *cct) { + RGWRados *store = init_raw_storage_provider(cct); + return store; + } + static RGWRados *init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_metadata_cache); + static RGWRados *init_raw_storage_provider(CephContext *cct); + static void close_storage(RGWRados *store); + +}; + +class RGWMPObj { + string oid; + string prefix; + string meta; + string upload_id; +public: + RGWMPObj() {} + RGWMPObj(const string& _oid, const string& _upload_id) { + init(_oid, _upload_id, _upload_id); + } + void init(const string& _oid, const string& _upload_id) { + init(_oid, _upload_id, _upload_id); + } + void init(const string& _oid, const string& _upload_id, const string& part_unique_str) { + if (_oid.empty()) { + clear(); + return; + } + oid = _oid; + upload_id = _upload_id; + prefix = oid + "."; + meta = prefix + upload_id + MP_META_SUFFIX; + prefix.append(part_unique_str); + } + const string& get_meta() const { return meta; } + string get_part(int num) const { + char buf[16]; + snprintf(buf, 16, ".%d", num); + string s = prefix; + s.append(buf); + return s; + } + string get_part(const string& part) const { + string s = prefix; + s.append("."); + s.append(part); + return s; + } + const string& get_upload_id() const { + return upload_id; + } + const string& get_key() const { + return oid; + } + bool from_meta(const string& 
meta) { + int end_pos = meta.rfind('.'); // search for ".meta" + if (end_pos < 0) + return false; + int mid_pos = meta.rfind('.', end_pos - 1); // . + if (mid_pos < 0) + return false; + oid = meta.substr(0, mid_pos); + upload_id = meta.substr(mid_pos + 1, end_pos - mid_pos - 1); + init(oid, upload_id, upload_id); + return true; + } + void clear() { + oid = ""; + prefix = ""; + meta = ""; + upload_id = ""; + } + friend std::ostream& operator<<(std::ostream& out, const RGWMPObj& obj) { + return out << "RGWMPObj:{ prefix=" << std::quoted(obj.prefix) << + ", meta=" << std::quoted(obj.meta) << " }"; + } +}; // class RGWMPObj + + +class RGWRadosThread { + class Worker : public Thread { + CephContext *cct; + RGWRadosThread *processor; + Mutex lock; + Cond cond; + + void wait() { + Mutex::Locker l(lock); + cond.Wait(lock); + }; + + void wait_interval(const utime_t& wait_time) { + Mutex::Locker l(lock); + cond.WaitInterval(lock, wait_time); + } + + public: + Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {} + void *entry() override; + void signal() { + Mutex::Locker l(lock); + cond.Signal(); + } + }; + + Worker *worker; + +protected: + CephContext *cct; + RGWRados *store; + + std::atomic down_flag = { false }; + + string thread_name; + + virtual uint64_t interval_msec() = 0; + virtual void stop_process() {} +public: + RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw") + : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {} + virtual ~RGWRadosThread() { + stop(); + } + + virtual int init() { return 0; } + virtual int process() = 0; + + bool going_down() { return down_flag; } + + void start(); + void stop(); + + void signal() { + if (worker) { + worker->signal(); + } + } +}; + +#endif diff --git a/src/rgw/rgw_realm_reloader.cc b/src/rgw/rgw_realm_reloader.cc new file mode 100644 index 00000000..1fd48db0 --- /dev/null +++ b/src/rgw/rgw_realm_reloader.cc @@ -0,0 +1,176 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_realm_reloader.h" +#include "rgw_rados.h" + +#include "rgw_bucket.h" +#include "rgw_log.h" +#include "rgw_rest.h" +#include "rgw_user.h" + +#include "services/svc_zone.h" + +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw realm reloader: ") + + +// safe callbacks from SafeTimer are unneccessary. reload() can take a long +// time, so we don't want to hold the mutex and block handle_notify() for the +// duration +static constexpr bool USE_SAFE_TIMER_CALLBACKS = false; + + +RGWRealmReloader::RGWRealmReloader(RGWRados*& store, std::map& service_map_meta, + Pauser* frontends) + : store(store), + service_map_meta(service_map_meta), + frontends(frontends), + timer(store->ctx(), mutex, USE_SAFE_TIMER_CALLBACKS), + mutex("RGWRealmReloader"), + reload_scheduled(nullptr) +{ + timer.init(); +} + +RGWRealmReloader::~RGWRealmReloader() +{ + Mutex::Locker lock(mutex); + timer.shutdown(); +} + +class RGWRealmReloader::C_Reload : public Context { + RGWRealmReloader* reloader; + public: + explicit C_Reload(RGWRealmReloader* reloader) : reloader(reloader) {} + void finish(int r) override { reloader->reload(); } +}; + +void RGWRealmReloader::handle_notify(RGWRealmNotify type, + bufferlist::const_iterator& p) +{ + if (!store) { + /* we're in the middle of reload */ + return; + } + + CephContext *const cct = store->ctx(); + + Mutex::Locker lock(mutex); + if (reload_scheduled) { + ldout(cct, 4) << "Notification on realm, reconfiguration " + "already scheduled" << dendl; + return; + } + + reload_scheduled = new C_Reload(this); + cond.SignalOne(); // wake reload() if it blocked on a bad configuration + + // schedule reload() without delay + timer.add_event_after(0, reload_scheduled); + + ldout(cct, 4) << "Notification on realm, reconfiguration scheduled" << dendl; +} + +void RGWRealmReloader::reload() +{ 
+ CephContext *const cct = store->ctx(); + ldout(cct, 1) << "Pausing frontends for realm update..." << dendl; + + frontends->pause(); + + ldout(cct, 1) << "Frontends paused" << dendl; + + // TODO: make RGWRados responsible for rgw_log_usage lifetime + rgw_log_usage_finalize(); + + // destroy the existing store + RGWStoreManager::close_storage(store); + store = nullptr; + + ldout(cct, 1) << "Store closed" << dendl; + { + // allow a new notify to reschedule us. it's important that we do this + // before we start loading the new realm, or we could miss some updates + Mutex::Locker lock(mutex); + reload_scheduled = nullptr; + } + + while (!store) { + // recreate and initialize a new store + store = + RGWStoreManager::get_storage(cct, + cct->_conf->rgw_enable_gc_threads, + cct->_conf->rgw_enable_lc_threads, + cct->_conf->rgw_enable_quota_threads, + cct->_conf->rgw_run_sync_thread, + cct->_conf.get_val("rgw_dynamic_resharding"), + cct->_conf->rgw_cache_enabled); + + ldout(cct, 1) << "Creating new store" << dendl; + + RGWRados* store_cleanup = nullptr; + { + Mutex::Locker lock(mutex); + + // failure to recreate RGWRados is not a recoverable error, but we + // don't want to assert or abort the entire cluster. instead, just + // sleep until we get another notification, and retry until we get + // a working configuration + if (store == nullptr) { + lderr(cct) << "Failed to reinitialize RGWRados after a realm " + "configuration update. Waiting for a new update." << dendl; + + // sleep until another event is scheduled + while (!reload_scheduled) + cond.Wait(mutex); + + ldout(cct, 1) << "Woke up with a new configuration, retrying " + "RGWRados initialization." 
<< dendl; + } + + if (reload_scheduled) { + // cancel the event; we'll handle it now + timer.cancel_event(reload_scheduled); + reload_scheduled = nullptr; + + // if we successfully created a store, clean it up outside of the lock, + // then continue to loop and recreate another + std::swap(store, store_cleanup); + } + } + + if (store_cleanup) { + ldout(cct, 4) << "Got another notification, restarting RGWRados " + "initialization." << dendl; + + RGWStoreManager::close_storage(store_cleanup); + } + } + + int r = store->register_to_service_map("rgw", service_map_meta); + if (r < 0) { + lderr(cct) << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl; + + /* ignore error */ + } + + ldout(cct, 1) << "Finishing initialization of new store" << dendl; + // finish initializing the new store + ldout(cct, 1) << " - REST subsystem init" << dendl; + rgw_rest_init(cct, store, store->svc.zone->get_zonegroup()); + ldout(cct, 1) << " - user subsystem init" << dendl; + rgw_user_init(store); + ldout(cct, 1) << " - user subsystem init" << dendl; + rgw_bucket_init(store->meta_mgr); + ldout(cct, 1) << " - usage subsystem init" << dendl; + rgw_log_usage_init(cct, store); + + ldout(cct, 1) << "Resuming frontends with new realm configuration." << dendl; + + frontends->resume(store); +} diff --git a/src/rgw/rgw_realm_reloader.h b/src/rgw/rgw_realm_reloader.h new file mode 100644 index 00000000..1277429e --- /dev/null +++ b/src/rgw/rgw_realm_reloader.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_REALM_RELOADER_H +#define RGW_REALM_RELOADER_H + +#include "rgw_realm_watcher.h" +#include "common/Cond.h" + +class RGWRados; + +/** + * RGWRealmReloader responds to new period notifications by recreating RGWRados + * with the updated realm configuration. + */ +class RGWRealmReloader : public RGWRealmWatcher::Watcher { + public: + /** + * Pauser is an interface to pause/resume frontends. 
Frontend cooperation + * is required to ensure that they stop issuing requests on the old + * RGWRados instance, and restart with the updated configuration. + * + * This abstraction avoids a dependency on class RGWFrontend. + */ + class Pauser { + public: + virtual ~Pauser() = default; + + /// pause all frontends while realm reconfiguration is in progress + virtual void pause() = 0; + /// resume all frontends with the given RGWRados instance + virtual void resume(RGWRados* store) = 0; + }; + + RGWRealmReloader(RGWRados*& store, std::map& service_map_meta, + Pauser* frontends); + ~RGWRealmReloader() override; + + /// respond to realm notifications by scheduling a reload() + void handle_notify(RGWRealmNotify type, bufferlist::const_iterator& p) override; + + private: + /// pause frontends and replace the RGWRados instance + void reload(); + + class C_Reload; //< Context that calls reload() + + /// main()'s RGWRados pointer as a reference, modified by reload() + RGWRados*& store; + std::map& service_map_meta; + Pauser *const frontends; + + /// reload() takes a significant amount of time, so we don't want to run + /// it in the handle_notify() thread. 
we choose a timer thread instead of a + /// Finisher because it allows us to cancel events that were scheduled while + /// reload() is still running + SafeTimer timer; + Mutex mutex; //< protects access to timer and reload_scheduled + Cond cond; //< to signal reload() after an invalid realm config + C_Reload* reload_scheduled; //< reload() context if scheduled +}; + +#endif // RGW_REALM_RELOADER_H diff --git a/src/rgw/rgw_realm_watcher.cc b/src/rgw/rgw_realm_watcher.cc new file mode 100644 index 00000000..ee154f0f --- /dev/null +++ b/src/rgw/rgw_realm_watcher.cc @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" + +#include "rgw_realm_watcher.h" +#include "rgw_tools.h" +#include "rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "rgw realm watcher: ") + + +RGWRealmWatcher::RGWRealmWatcher(CephContext* cct, const RGWRealm& realm) + : cct(cct) +{ + // no default realm, nothing to watch + if (realm.get_id().empty()) { + ldout(cct, 4) << "No realm, disabling dynamic reconfiguration." << dendl; + return; + } + + // establish the watch on RGWRealm + int r = watch_start(realm); + if (r < 0) { + lderr(cct) << "Failed to establish a watch on RGWRealm, " + "disabling dynamic reconfiguration." 
<< dendl; + return; + } +} + +RGWRealmWatcher::~RGWRealmWatcher() +{ + watch_stop(); +} + +void RGWRealmWatcher::add_watcher(RGWRealmNotify type, Watcher& watcher) +{ + watchers.emplace(type, watcher); +} + +void RGWRealmWatcher::handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) +{ + if (cookie != watch_handle) + return; + + // send an empty notify ack + bufferlist reply; + pool_ctx.notify_ack(watch_oid, notify_id, cookie, reply); + + try { + auto p = bl.cbegin(); + while (!p.end()) { + RGWRealmNotify notify; + decode(notify, p); + auto watcher = watchers.find(notify); + if (watcher == watchers.end()) { + lderr(cct) << "Failed to find a watcher for notify type " + << static_cast(notify) << dendl; + break; + } + watcher->second.handle_notify(notify, p); + } + } catch (const buffer::error &e) { + lderr(cct) << "Failed to decode realm notifications." << dendl; + } +} + +void RGWRealmWatcher::handle_error(uint64_t cookie, int err) +{ + lderr(cct) << "RGWRealmWatcher::handle_error oid=" << watch_oid << " err=" << err << dendl; + if (cookie != watch_handle) + return; + + watch_restart(); +} + +int RGWRealmWatcher::watch_start(const RGWRealm& realm) +{ + // initialize a Rados client + int r = rados.init_with_context(cct); + if (r < 0) { + lderr(cct) << "Rados client initialization failed with " + << cpp_strerror(-r) << dendl; + return r; + } + r = rados.connect(); + if (r < 0) { + lderr(cct) << "Rados client connection failed with " + << cpp_strerror(-r) << dendl; + return r; + } + + // open an IoCtx for the realm's pool + rgw_pool pool(realm.get_pool(cct)); + r = rgw_init_ioctx(&rados, pool, pool_ctx); + if (r < 0) { + lderr(cct) << "Failed to open pool " << pool + << " with " << cpp_strerror(-r) << dendl; + rados.shutdown(); + return r; + } + + // register a watch on the realm's control object + auto oid = realm.get_control_oid(); + r = pool_ctx.watch2(oid, &watch_handle, this); + if (r < 0) { + lderr(cct) << "Failed to watch " 
<< oid + << " with " << cpp_strerror(-r) << dendl; + pool_ctx.close(); + rados.shutdown(); + return r; + } + + ldout(cct, 10) << "Watching " << oid << dendl; + std::swap(watch_oid, oid); + return 0; +} + +int RGWRealmWatcher::watch_restart() +{ + ceph_assert(!watch_oid.empty()); + int r = pool_ctx.unwatch2(watch_handle); + if (r < 0) { + lderr(cct) << "Failed to unwatch on " << watch_oid + << " with " << cpp_strerror(-r) << dendl; + } + r = pool_ctx.watch2(watch_oid, &watch_handle, this); + if (r < 0) { + lderr(cct) << "Failed to restart watch on " << watch_oid + << " with " << cpp_strerror(-r) << dendl; + pool_ctx.close(); + watch_oid.clear(); + } + return r; +} + +void RGWRealmWatcher::watch_stop() +{ + if (!watch_oid.empty()) { + pool_ctx.unwatch2(watch_handle); + pool_ctx.close(); + watch_oid.clear(); + } +} diff --git a/src/rgw/rgw_realm_watcher.h b/src/rgw/rgw_realm_watcher.h new file mode 100644 index 00000000..03d7e939 --- /dev/null +++ b/src/rgw/rgw_realm_watcher.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_REALM_WATCHER_H +#define RGW_REALM_WATCHER_H + +#include "include/rados/librados.hpp" +#include "include/ceph_assert.h" +#include "common/Timer.h" +#include "common/Cond.h" + +class RGWRados; +class RGWRealm; + +enum class RGWRealmNotify { + Reload, + ZonesNeedPeriod, +}; +WRITE_RAW_ENCODER(RGWRealmNotify); + +/** + * RGWRealmWatcher establishes a watch on the current RGWRealm's control object, + * and forwards notifications to registered observers. + */ +class RGWRealmWatcher : public librados::WatchCtx2 { + public: + /** + * Watcher is an interface that allows the RGWRealmWatcher to pass + * notifications on to other interested objects. 
+ */ + class Watcher { + public: + virtual ~Watcher() = default; + + virtual void handle_notify(RGWRealmNotify type, + bufferlist::const_iterator& p) = 0; + }; + + RGWRealmWatcher(CephContext* cct, const RGWRealm& realm); + ~RGWRealmWatcher() override; + + /// register a watcher for the given notification type + void add_watcher(RGWRealmNotify type, Watcher& watcher); + + /// respond to realm notifications by calling the appropriate watcher + void handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) override; + + /// reestablish the watch if it gets disconnected + void handle_error(uint64_t cookie, int err) override; + + private: + CephContext *const cct; + + /// keep a separate Rados client whose lifetime is independent of RGWRados + /// so that we don't miss notifications during realm reconfiguration + librados::Rados rados; + librados::IoCtx pool_ctx; + uint64_t watch_handle = 0; + std::string watch_oid; + + int watch_start(const RGWRealm& realm); + int watch_restart(); + void watch_stop(); + + std::map watchers; +}; + +#endif // RGW_REALM_WATCHER_H diff --git a/src/rgw/rgw_request.h b/src/rgw/rgw_request.h new file mode 100644 index 00000000..23483208 --- /dev/null +++ b/src/rgw/rgw_request.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_REQUEST_H +#define RGW_REQUEST_H + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_acl.h" +#include "rgw_user.h" +#include "rgw_op.h" +#if defined(WITH_RADOSGW_FCGI_FRONTEND) +#include "rgw_fcgi.h" +#endif + +#include "common/QueueRing.h" + +#include + +struct RGWRequest +{ + uint64_t id; + struct req_state *s; + RGWOp *op; + + explicit RGWRequest(uint64_t id) : id(id), s(NULL), op(NULL) {} + + virtual ~RGWRequest() {} + + void init_state(req_state *_s) { + s = _s; + } +}; /* RGWRequest */ + +#if defined(WITH_RADOSGW_FCGI_FRONTEND) +struct RGWFCGXRequest : public RGWRequest { + FCGX_Request 
*fcgx; + QueueRing *qr; + + RGWFCGXRequest(uint64_t req_id, QueueRing *_qr) + : RGWRequest(req_id), qr(_qr) { + qr->dequeue(&fcgx); + } + + ~RGWFCGXRequest() override { + FCGX_Finish_r(fcgx); + qr->enqueue(fcgx); + } +}; +#endif + +struct RGWLoadGenRequest : public RGWRequest { + string method; + string resource; + int content_length; + std::atomic* fail_flag = nullptr; + +RGWLoadGenRequest(uint64_t req_id, const string& _m, const string& _r, int _cl, + std::atomic *ff) + : RGWRequest(req_id), method(_m), resource(_r), content_length(_cl), + fail_flag(ff) {} +}; + +#endif /* RGW_REQUEST_H */ diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc new file mode 100644 index 00000000..eb86b220 --- /dev/null +++ b/src/rgw/rgw_reshard.cc @@ -0,0 +1,1177 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_bucket.h" +#include "rgw_reshard.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/lock/cls_lock_client.h" +#include "common/errno.h" +#include "common/ceph_json.h" + +#include "common/dout.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +const string reshard_oid_prefix = "reshard."; +const string reshard_lock_name = "reshard_process"; +const string bucket_instance_lock_name = "bucket_instance_lock"; + + +class BucketReshardShard { + RGWRados *store; + const RGWBucketInfo& bucket_info; + int num_shard; + RGWRados::BucketShard bs; + vector entries; + map stats; + deque& aio_completions; + uint64_t max_aio_completions; + uint64_t reshard_shard_batch_size; + + int wait_next_completion() { + librados::AioCompletion *c = aio_completions.front(); + aio_completions.pop_front(); + + c->wait_for_safe(); + + int ret = c->get_return_value(); + c->release(); + + if (ret < 0) { + derr << "ERROR: reshard rados operation failed: " << 
cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; + } + + int get_completion(librados::AioCompletion **c) { + if (aio_completions.size() >= max_aio_completions) { + int ret = wait_next_completion(); + if (ret < 0) { + return ret; + } + } + + *c = librados::Rados::aio_create_completion(nullptr, nullptr, nullptr); + aio_completions.push_back(*c); + + return 0; + } + +public: + BucketReshardShard(RGWRados *_store, const RGWBucketInfo& _bucket_info, + int _num_shard, + deque& _completions) : + store(_store), bucket_info(_bucket_info), bs(store), + aio_completions(_completions) + { + num_shard = (bucket_info.num_shards > 0 ? _num_shard : -1); + bs.init(bucket_info.bucket, num_shard, nullptr /* no RGWBucketInfo */); + + max_aio_completions = + store->ctx()->_conf.get_val("rgw_reshard_max_aio"); + reshard_shard_batch_size = + store->ctx()->_conf.get_val("rgw_reshard_batch_size"); + } + + int get_num_shard() { + return num_shard; + } + + int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, + const rgw_bucket_category_stats& entry_stats) { + entries.push_back(entry); + if (account) { + rgw_bucket_category_stats& target = stats[category]; + target.num_entries += entry_stats.num_entries; + target.total_size += entry_stats.total_size; + target.total_size_rounded += entry_stats.total_size_rounded; + target.actual_size += entry_stats.actual_size; + } + if (entries.size() >= reshard_shard_batch_size) { + int ret = flush(); + if (ret < 0) { + return ret; + } + } + + return 0; + } + + int flush() { + if (entries.size() == 0) { + return 0; + } + + librados::ObjectWriteOperation op; + for (auto& entry : entries) { + store->bi_put(op, bs, entry); + } + cls_rgw_bucket_update_stats(op, false, stats); + + librados::AioCompletion *c; + int ret = get_completion(&c); + if (ret < 0) { + return ret; + } + ret = bs.index_ctx.aio_operate(bs.bucket_obj, c, &op); + if (ret < 0) { + derr << "ERROR: failed to store entries in target bucket shard (bs=" << 
bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl; + return ret; + } + entries.clear(); + stats.clear(); + return 0; + } + + int wait_all_aio() { + int ret = 0; + while (!aio_completions.empty()) { + int r = wait_next_completion(); + if (r < 0) { + ret = r; + } + } + return ret; + } +}; // class BucketReshardShard + + +class BucketReshardManager { + RGWRados *store; + const RGWBucketInfo& target_bucket_info; + deque completions; + int num_target_shards; + vector target_shards; + +public: + BucketReshardManager(RGWRados *_store, + const RGWBucketInfo& _target_bucket_info, + int _num_target_shards) : + store(_store), target_bucket_info(_target_bucket_info), + num_target_shards(_num_target_shards) + { + target_shards.resize(num_target_shards); + for (int i = 0; i < num_target_shards; ++i) { + target_shards[i] = new BucketReshardShard(store, target_bucket_info, i, completions); + } + } + + ~BucketReshardManager() { + for (auto& shard : target_shards) { + int ret = shard->wait_all_aio(); + if (ret < 0) { + ldout(store->ctx(), 20) << __func__ << + ": shard->wait_all_aio() returned ret=" << ret << dendl; + } + } + } + + int add_entry(int shard_index, + rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, + const rgw_bucket_category_stats& entry_stats) { + int ret = target_shards[shard_index]->add_entry(entry, account, category, + entry_stats); + if (ret < 0) { + derr << "ERROR: target_shards.add_entry(" << entry.idx << + ") returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; + } + + int finish() { + int ret = 0; + for (auto& shard : target_shards) { + int r = shard->flush(); + if (r < 0) { + derr << "ERROR: target_shards[" << shard->get_num_shard() << "].flush() returned error: " << cpp_strerror(-r) << dendl; + ret = r; + } + } + for (auto& shard : target_shards) { + int r = shard->wait_all_aio(); + if (r < 0) { + derr << "ERROR: target_shards[" << shard->get_num_shard() << "].wait_all_aio() returned 
error: " << cpp_strerror(-r) << dendl; + ret = r; + } + delete shard; + } + target_shards.clear(); + return ret; + } +}; // class BucketReshardManager + +RGWBucketReshard::RGWBucketReshard(RGWRados *_store, + const RGWBucketInfo& _bucket_info, + const map& _bucket_attrs, + RGWBucketReshardLock* _outer_reshard_lock) : + store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs), + reshard_lock(store, bucket_info, true), + outer_reshard_lock(_outer_reshard_lock) +{ } + +int RGWBucketReshard::set_resharding_status(RGWRados* store, + const RGWBucketInfo& bucket_info, + const string& new_instance_id, + int32_t num_shards, + cls_rgw_reshard_status status) +{ + if (new_instance_id.empty()) { + ldout(store->ctx(), 0) << __func__ << " missing new bucket instance id" << dendl; + return -EINVAL; + } + + cls_rgw_bucket_instance_entry instance_entry; + instance_entry.set_status(new_instance_id, num_shards, status); + + int ret = store->bucket_set_reshard(bucket_info, instance_entry); + if (ret < 0) { + ldout(store->ctx(), 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: " + << cpp_strerror(-ret) << dendl; + return ret; + } + return 0; +} + +// reshard lock assumes lock is held +int RGWBucketReshard::clear_resharding(RGWRados* store, + const RGWBucketInfo& bucket_info) +{ + int ret = clear_index_shard_reshard_status(store, bucket_info); + if (ret < 0) { + ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ << + " ERROR: error clearing reshard status from index shard " << + cpp_strerror(-ret) << dendl; + return ret; + } + + cls_rgw_bucket_instance_entry instance_entry; + ret = store->bucket_set_reshard(bucket_info, instance_entry); + if (ret < 0) { + ldout(store->ctx(), 0) << "RGWReshard::" << __func__ << + " ERROR: error setting bucket resharding flag on bucket index: " << + cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +int RGWBucketReshard::clear_index_shard_reshard_status(RGWRados* store, 
+ const RGWBucketInfo& bucket_info) +{ + uint32_t num_shards = bucket_info.num_shards; + + if (num_shards < std::numeric_limits::max()) { + int ret = set_resharding_status(store, bucket_info, + bucket_info.bucket.bucket_id, + (num_shards < 1 ? 1 : num_shards), + CLS_RGW_RESHARD_NOT_RESHARDING); + if (ret < 0) { + ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ << + " ERROR: error clearing reshard status from index shard " << + cpp_strerror(-ret) << dendl; + return ret; + } + } + + return 0; +} + +static int create_new_bucket_instance(RGWRados *store, + int new_num_shards, + const RGWBucketInfo& bucket_info, + map& attrs, + RGWBucketInfo& new_bucket_info) +{ + new_bucket_info = bucket_info; + + store->create_bucket_id(&new_bucket_info.bucket.bucket_id); + new_bucket_info.bucket.oid.clear(); + + new_bucket_info.num_shards = new_num_shards; + new_bucket_info.objv_tracker.clear(); + + new_bucket_info.new_bucket_instance_id.clear(); + new_bucket_info.reshard_status = 0; + + int ret = store->init_bucket_index(new_bucket_info, new_bucket_info.num_shards); + if (ret < 0) { + cerr << "ERROR: failed to init new bucket indexes: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + ret = store->put_bucket_instance_info(new_bucket_info, true, real_time(), &attrs); + if (ret < 0) { + cerr << "ERROR: failed to store new bucket instance info: " << cpp_strerror(-ret) << std::endl; + return ret; + } + + return 0; +} + +int RGWBucketReshard::create_new_bucket_instance(int new_num_shards, + RGWBucketInfo& new_bucket_info) +{ + return ::create_new_bucket_instance(store, new_num_shards, + bucket_info, bucket_attrs, new_bucket_info); +} + +int RGWBucketReshard::cancel() +{ + int ret = reshard_lock.lock(); + if (ret < 0) { + return ret; + } + + ret = clear_resharding(); + + reshard_lock.unlock(); + return ret; +} + +class BucketInfoReshardUpdate +{ + RGWRados *store; + RGWBucketInfo& bucket_info; + std::map bucket_attrs; + + bool in_progress{false}; + + int 
set_status(cls_rgw_reshard_status s) { + bucket_info.reshard_status = s; + int ret = store->put_bucket_instance_info(bucket_info, false, real_time(), &bucket_attrs); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: failed to write bucket info, ret=" << ret << dendl; + return ret; + } + return 0; + } + +public: + BucketInfoReshardUpdate(RGWRados *_store, + RGWBucketInfo& _bucket_info, + map& _bucket_attrs, + const string& new_bucket_id) : + store(_store), + bucket_info(_bucket_info), + bucket_attrs(_bucket_attrs) + { + bucket_info.new_bucket_instance_id = new_bucket_id; + } + + ~BucketInfoReshardUpdate() { + if (in_progress) { + // resharding must not have ended correctly, clean up + int ret = + RGWBucketReshard::clear_index_shard_reshard_status(store, bucket_info); + if (ret < 0) { + lderr(store->ctx()) << "Error: " << __func__ << + " clear_index_shard_status returned " << ret << dendl; + } + bucket_info.new_bucket_instance_id.clear(); + set_status(CLS_RGW_RESHARD_NOT_RESHARDING); // clears new_bucket_instance as well + } + } + + int start() { + int ret = set_status(CLS_RGW_RESHARD_IN_PROGRESS); + if (ret < 0) { + return ret; + } + in_progress = true; + return 0; + } + + int complete() { + int ret = set_status(CLS_RGW_RESHARD_DONE); + if (ret < 0) { + return ret; + } + in_progress = false; + return 0; + } +}; + + +RGWBucketReshardLock::RGWBucketReshardLock(RGWRados* _store, + const std::string& reshard_lock_oid, + bool _ephemeral) : + store(_store), + lock_oid(reshard_lock_oid), + ephemeral(_ephemeral), + internal_lock(reshard_lock_name) +{ + const int lock_dur_secs = store->ctx()->_conf.get_val( + "rgw_reshard_bucket_lock_duration"); + duration = std::chrono::seconds(lock_dur_secs); + +#define COOKIE_LEN 16 + char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1); + cookie_buf[COOKIE_LEN] = '\0'; + + internal_lock.set_cookie(cookie_buf); + internal_lock.set_duration(duration); +} + +int 
RGWBucketReshardLock::lock() { + internal_lock.set_must_renew(false); + int ret; + if (ephemeral) { + ret = internal_lock.lock_exclusive_ephemeral(&store->reshard_pool_ctx, + lock_oid); + } else { + ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid); + } + if (ret < 0) { + ldout(store->ctx(), 0) << "RGWReshardLock::" << __func__ << + " failed to acquire lock on " << lock_oid << " ret=" << ret << dendl; + return ret; + } + reset_time(Clock::now()); + + return 0; +} + +void RGWBucketReshardLock::unlock() { + int ret = internal_lock.unlock(&store->reshard_pool_ctx, lock_oid); + if (ret < 0) { + ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ << + " failed to drop lock on " << lock_oid << " ret=" << ret << dendl; + } +} + +int RGWBucketReshardLock::renew(const Clock::time_point& now) { + internal_lock.set_must_renew(true); + int ret; + if (ephemeral) { + ret = internal_lock.lock_exclusive_ephemeral(&store->reshard_pool_ctx, + lock_oid); + } else { + ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid); + } + if (ret < 0) { /* expired or already locked by another processor */ + std::stringstream error_s; + if (-ENOENT == ret) { + error_s << "ENOENT (lock expired or never initially locked)"; + } else { + error_s << ret << " (" << cpp_strerror(-ret) << ")"; + } + ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " << + lock_oid << " with error " << error_s.str() << dendl; + return ret; + } + internal_lock.set_must_renew(false); + + reset_time(now); + ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " << + lock_oid << dendl; + + return 0; +} + + +int RGWBucketReshard::do_reshard(int num_shards, + RGWBucketInfo& new_bucket_info, + int max_entries, + bool verbose, + ostream *out, + Formatter *formatter) +{ + rgw_bucket& bucket = bucket_info.bucket; + + int ret = 0; + + if (out) { + (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl; + (*out) << "bucket name: " 
<< bucket_info.bucket.name << std::endl; + (*out) << "old bucket instance id: " << bucket_info.bucket.bucket_id << + std::endl; + (*out) << "new bucket instance id: " << new_bucket_info.bucket.bucket_id << + std::endl; + } + + /* update bucket info -- in progress*/ + list entries; + + if (max_entries < 0) { + ldout(store->ctx(), 0) << __func__ << + ": can't reshard, negative max_entries" << dendl; + return -EINVAL; + } + + // NB: destructor cleans up sharding state if reshard does not + // complete successfully + BucketInfoReshardUpdate bucket_info_updater(store, bucket_info, bucket_attrs, new_bucket_info.bucket.bucket_id); + + ret = bucket_info_updater.start(); + if (ret < 0) { + ldout(store->ctx(), 0) << __func__ << ": failed to update bucket info ret=" << ret << dendl; + return ret; + } + + int num_target_shards = (new_bucket_info.num_shards > 0 ? new_bucket_info.num_shards : 1); + + BucketReshardManager target_shards_mgr(store, new_bucket_info, num_target_shards); + + bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr); + + if (verbose_json_out) { + formatter->open_array_section("entries"); + } + + uint64_t total_entries = 0; + + if (!verbose_json_out && out) { + (*out) << "total entries:"; + } + + const int num_source_shards = + (bucket_info.num_shards > 0 ? 
bucket_info.num_shards : 1); + string marker; + for (int i = 0; i < num_source_shards; ++i) { + bool is_truncated = true; + marker.clear(); + while (is_truncated) { + entries.clear(); + ret = store->bi_list(bucket, i, string(), marker, max_entries, &entries, &is_truncated); + if (ret < 0 && ret != -ENOENT) { + derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl; + return ret; + } + + for (auto iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_cls_bi_entry& entry = *iter; + if (verbose_json_out) { + formatter->open_object_section("entry"); + + encode_json("shard_id", i, formatter); + encode_json("num_entry", total_entries, formatter); + encode_json("entry", entry, formatter); + } + total_entries++; + + marker = entry.idx; + + int target_shard_id; + cls_rgw_obj_key cls_key; + RGWObjCategory category; + rgw_bucket_category_stats stats; + bool account = entry.get_info(&cls_key, &category, &stats); + rgw_obj_key key(cls_key); + rgw_obj obj(new_bucket_info.bucket, key); + RGWMPObj mp; + if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) { + // place the multipart .meta object on the same shard as its head object + obj.index_hash_source = mp.get_key(); + } + int ret = store->get_target_shard_id(new_bucket_info, obj.get_hash_object(), &target_shard_id); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl; + return ret; + } + + int shard_index = (target_shard_id > 0 ? 
target_shard_id : 0); + + ret = target_shards_mgr.add_entry(shard_index, entry, account, + category, stats); + if (ret < 0) { + return ret; + } + + Clock::time_point now = Clock::now(); + if (reshard_lock.should_renew(now)) { + // assume outer locks have timespans at least the size of ours, so + // can call inside conditional + if (outer_reshard_lock) { + ret = outer_reshard_lock->renew(now); + if (ret < 0) { + return ret; + } + } + ret = reshard_lock.renew(now); + if (ret < 0) { + lderr(store->ctx()) << "Error renewing bucket lock: " << ret << dendl; + return ret; + } + } + if (verbose_json_out) { + formatter->close_section(); + formatter->flush(*out); + } else if (out && !(total_entries % 1000)) { + (*out) << " " << total_entries; + } + } // entries loop + } + } + + if (verbose_json_out) { + formatter->close_section(); + formatter->flush(*out); + } else if (out) { + (*out) << " " << total_entries << std::endl; + } + + ret = target_shards_mgr.finish(); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: failed to reshard" << dendl; + return -EIO; + } + + ret = rgw_link_bucket(store, new_bucket_info.owner, new_bucket_info.bucket, bucket_info.creation_time); + if (ret < 0) { + lderr(store->ctx()) << "failed to link new bucket instance (bucket_id=" << new_bucket_info.bucket.bucket_id << ": " << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + + ret = bucket_info_updater.complete(); + if (ret < 0) { + ldout(store->ctx(), 0) << __func__ << ": failed to update bucket info ret=" << ret << dendl; + /* don't error out, reshard process succeeded */ + } + + return 0; + // NB: some error clean-up is done by ~BucketInfoReshardUpdate +} // RGWBucketReshard::do_reshard + +int RGWBucketReshard::get_status(list *status) +{ + librados::IoCtx index_ctx; + map bucket_objs; + + int r = store->open_bucket_index(bucket_info, index_ctx, bucket_objs); + if (r < 0) { + return r; + } + + for (auto i : bucket_objs) { + cls_rgw_bucket_instance_entry entry; + + int ret = 
cls_rgw_get_bucket_resharding(index_ctx, i.second, &entry); + if (ret < 0 && ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: " << __func__ << ": cls_rgw_get_bucket_resharding() returned ret=" << ret << dendl; + return ret; + } + + status->push_back(entry); + } + + return 0; +} + + +int RGWBucketReshard::execute(int num_shards, int max_op_entries, + bool verbose, ostream *out, Formatter *formatter, + RGWReshard* reshard_log) +{ + Clock::time_point now; + + int ret = reshard_lock.lock(); + if (ret < 0) { + return ret; + } + + RGWBucketInfo new_bucket_info; + ret = create_new_bucket_instance(num_shards, new_bucket_info); + if (ret < 0) { + // shard state is uncertain, but this will attempt to remove them anyway + goto error_out; + } + + if (reshard_log) { + ret = reshard_log->update(bucket_info, new_bucket_info); + if (ret < 0) { + goto error_out; + } + } + + // set resharding status of current bucket_info & shards with + // information about planned resharding + ret = set_resharding_status(new_bucket_info.bucket.bucket_id, + num_shards, CLS_RGW_RESHARD_IN_PROGRESS); + if (ret < 0) { + reshard_lock.unlock(); + return ret; + } + + ret = do_reshard(num_shards, + new_bucket_info, + max_op_entries, + verbose, out, formatter); + if (ret < 0) { + goto error_out; + } + + // at this point we've done the main work; we'll make a best-effort + // to clean-up but will not indicate any errors encountered + + reshard_lock.unlock(); + + // resharding successful, so remove old bucket index shards; use + // best effort and don't report out an error; the lock isn't needed + // at this point since all we're using a best effor to to remove old + // shard objects + ret = store->clean_bucket_index(bucket_info, bucket_info.num_shards); + if (ret < 0) { + lderr(store->ctx()) << "Error: " << __func__ << + " failed to clean up old shards; " << + "RGWRados::clean_bucket_index returned " << ret << dendl; + } + + ret = rgw_bucket_instance_remove_entry(store, + bucket_info.bucket.get_key(), + 
nullptr); + if (ret < 0) { + lderr(store->ctx()) << "Error: " << __func__ << + " failed to clean old bucket info object \"" << + bucket_info.bucket.get_key() << + "\"created after successful resharding with error " << ret << dendl; + } + + ldout(store->ctx(), 1) << __func__ << + " INFO: reshard of bucket \"" << bucket_info.bucket.name << "\" from \"" << + bucket_info.bucket.get_key() << "\" to \"" << + new_bucket_info.bucket.get_key() << "\" completed successfully" << dendl; + + return 0; + +error_out: + + reshard_lock.unlock(); + + // since the real problem is the issue that led to this error code + // path, we won't touch ret and instead use another variable to + // temporarily error codes + int ret2 = store->clean_bucket_index(new_bucket_info, + new_bucket_info.num_shards); + if (ret2 < 0) { + lderr(store->ctx()) << "Error: " << __func__ << + " failed to clean up shards from failed incomplete resharding; " << + "RGWRados::clean_bucket_index returned " << ret2 << dendl; + } + + ret2 = rgw_bucket_instance_remove_entry(store, + new_bucket_info.bucket.get_key(), + nullptr); + if (ret2 < 0) { + lderr(store->ctx()) << "Error: " << __func__ << + " failed to clean bucket info object \"" << + new_bucket_info.bucket.get_key() << + "\"created during incomplete resharding with error " << ret2 << dendl; + } + + return ret; +} // execute + + +RGWReshard::RGWReshard(RGWRados* _store, bool _verbose, ostream *_out, + Formatter *_formatter) : + store(_store), instance_lock(bucket_instance_lock_name), + verbose(_verbose), out(_out), formatter(_formatter) +{ + num_logshards = store->ctx()->_conf.get_val("rgw_reshard_num_logs"); +} + +string RGWReshard::get_logshard_key(const string& tenant, + const string& bucket_name) +{ + return tenant + ":" + bucket_name; +} + +#define MAX_RESHARD_LOGSHARDS_PRIME 7877 + +void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid) +{ + string key = get_logshard_key(tenant, bucket_name); + + uint32_t sid 
= ceph_str_hash_linux(key.c_str(), key.size()); + uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); + sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards; + + get_logshard_oid(int(sid), oid); +} + +int RGWReshard::add(cls_rgw_reshard_entry& entry) +{ + if (!store->svc.zone->can_reshard()) { + ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl; + return 0; + } + + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + librados::ObjectWriteOperation op; + cls_rgw_reshard_add(op, entry); + + int ret = store->reshard_pool_ctx.operate(logshard_oid, &op); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; + return ret; + } + return 0; +} + +int RGWReshard::update(const RGWBucketInfo& bucket_info, const RGWBucketInfo& new_bucket_info) +{ + cls_rgw_reshard_entry entry; + entry.bucket_name = bucket_info.bucket.name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.tenant = bucket_info.owner.tenant; + + int ret = get(entry); + if (ret < 0) { + return ret; + } + + entry.new_instance_id = new_bucket_info.bucket.name + ":" + new_bucket_info.bucket.bucket_id; + + ret = add(entry); + if (ret < 0) { + ldout(store->ctx(), 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " << + cpp_strerror(-ret) << dendl; + } + + return ret; +} + + +int RGWReshard::list(int logshard_num, string& marker, uint32_t max, std::list& entries, bool *is_truncated) +{ + string logshard_oid; + + get_logshard_oid(logshard_num, &logshard_oid); + + int ret = cls_rgw_reshard_list(store->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated); + + if (ret < 0) { + if (ret == -ENOENT) { + *is_truncated = false; + ret = 0; + } + lderr(store->ctx()) << "ERROR: failed to list reshard log entries, oid=" << logshard_oid << dendl; + if (ret == -EACCES) { + 
lderr(store->ctx()) << "access denied to pool " << store->svc.zone->get_zone_params().reshard_pool + << ". Fix the pool access permissions of your client" << dendl; + } + } + + return ret; +} + +int RGWReshard::get(cls_rgw_reshard_entry& entry) +{ + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + int ret = cls_rgw_reshard_get(store->reshard_pool_ctx, logshard_oid, entry); + if (ret < 0) { + if (ret != -ENOENT) { + lderr(store->ctx()) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << + " bucket=" << entry.bucket_name << dendl; + } + return ret; + } + + return 0; +} + +int RGWReshard::remove(cls_rgw_reshard_entry& entry) +{ + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + librados::ObjectWriteOperation op; + cls_rgw_reshard_remove(op, entry); + + int ret = store->reshard_pool_ctx.operate(logshard_oid, &op); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; + return ret; + } + + return ret; +} + +int RGWReshard::clear_bucket_resharding(const string& bucket_instance_oid, cls_rgw_reshard_entry& entry) +{ + int ret = cls_rgw_clear_bucket_resharding(store->reshard_pool_ctx, bucket_instance_oid); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl; + return ret; + } + + return 0; +} + +int RGWReshardWait::wait(optional_yield y) +{ + std::unique_lock lock(mutex); + + if (going_down) { + return -ECANCELED; + } + +#ifdef HAVE_BOOST_CONTEXT + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + + Waiter waiter(context); + waiters.push_back(waiter); + lock.unlock(); + + waiter.timer.expires_after(duration); + + boost::system::error_code ec; + 
waiter.timer.async_wait(yield[ec]); + + lock.lock(); + waiters.erase(waiters.iterator_to(waiter)); + return -ec.value(); + } +#endif + + cond.wait_for(lock, duration); + + if (going_down) { + return -ECANCELED; + } + + return 0; +} + +void RGWReshardWait::stop() +{ + std::scoped_lock lock(mutex); + going_down = true; + cond.notify_all(); + for (auto& waiter : waiters) { + // unblock any waiters with ECANCELED + waiter.timer.cancel(); + } +} + +int RGWReshard::process_single_logshard(int logshard_num) +{ + string marker; + bool truncated = true; + + CephContext *cct = store->ctx(); + constexpr uint32_t max_entries = 1000; + + string logshard_oid; + get_logshard_oid(logshard_num, &logshard_oid); + + RGWBucketReshardLock logshard_lock(store, logshard_oid, false); + + int ret = logshard_lock.lock(); + if (ret < 0) { + ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " << + logshard_oid << ", ret = " << ret < entries; + ret = list(logshard_num, marker, max_entries, entries, &truncated); + if (ret < 0) { + ldout(cct, 10) << "cannot list all reshards in logshard oid=" << + logshard_oid << dendl; + continue; + } + + for(auto& entry: entries) { // logshard entries + if(entry.new_instance_id.empty()) { + + ldout(store->ctx(), 20) << __func__ << " resharding " << + entry.bucket_name << dendl; + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + rgw_bucket bucket; + RGWBucketInfo bucket_info; + map attrs; + + ret = store->get_bucket_info(obj_ctx, entry.tenant, entry.bucket_name, + bucket_info, nullptr, &attrs); + if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) { + if (ret < 0) { + ldout(cct, 0) << __func__ << + ": Error in get_bucket_info for bucket " << entry.bucket_name << + ": " << cpp_strerror(-ret) << dendl; + if (ret != -ENOENT) { + // any error other than ENOENT will abort + return ret; + } + } else { + ldout(cct,0) << __func__ << + ": Bucket: " << entry.bucket_name << + " already resharded by someone, skipping " << dendl; + } + + 
// we've encountered a reshard queue entry for an apparently + // non-existent bucket; let's try to recover by cleaning up + ldout(cct, 0) << __func__ << + ": removing reshard queue entry for a resharded or non-existent bucket" << + entry.bucket_name << dendl; + + ret = remove(entry); + if (ret < 0) { + ldout(cct, 0) << __func__ << + ": Error removing non-existent bucket " << + entry.bucket_name << " from resharding queue: " << + cpp_strerror(-ret) << dendl; + return ret; + } + + // we cleaned up, move on to the next entry + goto finished_entry; + } + + RGWBucketReshard br(store, bucket_info, attrs, nullptr); + ret = br.execute(entry.new_num_shards, max_entries, false, nullptr, + nullptr, this); + if (ret < 0) { + ldout(store->ctx(), 0) << __func__ << + ": Error during resharding bucket " << entry.bucket_name << ":" << + cpp_strerror(-ret)<< dendl; + return ret; + } + + ldout(store->ctx(), 20) << __func__ << + " removing reshard queue entry for bucket " << entry.bucket_name << + dendl; + + ret = remove(entry); + if (ret < 0) { + ldout(cct, 0) << __func__ << ": Error removing bucket " << + entry.bucket_name << " from resharding queue: " << + cpp_strerror(-ret) << dendl; + return ret; + } + } // if new instance id is empty + + finished_entry: + + Clock::time_point now = Clock::now(); + if (logshard_lock.should_renew(now)) { + ret = logshard_lock.renew(now); + if (ret < 0) { + return ret; + } + } + + entry.get_key(&marker); + } // entry for loop + } while (truncated); + + logshard_lock.unlock(); + return 0; +} + + +void RGWReshard::get_logshard_oid(int shard_num, string *logshard) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num); + + string objname(reshard_oid_prefix); + *logshard = objname + buf; +} + +int RGWReshard::process_all_logshards() +{ + if (!store->svc.zone->can_reshard()) { + ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl; + return 0; + } + int ret = 0; + + for (int i = 0; i < num_logshards; i++) { + 
string logshard; + get_logshard_oid(i, &logshard); + + ldout(store->ctx(), 20) << "processing logshard = " << logshard << dendl; + + ret = process_single_logshard(i); + if (ret <0) { + return ret; + } + } + + return 0; +} + +bool RGWReshard::going_down() +{ + return down_flag; +} + +void RGWReshard::start_processor() +{ + worker = new ReshardWorker(store->ctx(), this); + worker->create("rgw_reshard"); +} + +void RGWReshard::stop_processor() +{ + down_flag = true; + if (worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = nullptr; +} + +void *RGWReshard::ReshardWorker::entry() { + utime_t last_run; + do { + utime_t start = ceph_clock_now(); + if (reshard->process_all_logshards()) { + /* All shards have been processed properly. Next time we can start + * from this moment. */ + last_run = start; + } + + if (reshard->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + int secs = cct->_conf.get_val("rgw_reshard_thread_interval"); + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + lock.Lock(); + cond.WaitInterval(lock, utime_t(secs, 0)); + lock.Unlock(); + } while (!reshard->going_down()); + + return NULL; +} + +void RGWReshard::ReshardWorker::stop() +{ + Mutex::Locker l(lock); + cond.Signal(); +} diff --git a/src/rgw/rgw_reshard.h b/src/rgw/rgw_reshard.h new file mode 100644 index 00000000..213fc238 --- /dev/null +++ b/src/rgw/rgw_reshard.h @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_RESHARD_H +#define RGW_RESHARD_H + +#include +#include + +#include + +#include "include/rados/librados.hpp" +#include "common/ceph_time.h" +#include "cls/rgw/cls_rgw_types.h" +#include "cls/lock/cls_lock_client.h" +#include "rgw_bucket.h" + + +class CephContext; +class RGWRados; + +class RGWBucketReshardLock { + using Clock = ceph::coarse_mono_clock; + + RGWRados* store; + const std::string lock_oid; + const bool ephemeral; + 
rados::cls::lock::Lock internal_lock; + std::chrono::seconds duration; + + Clock::time_point start_time; + Clock::time_point renew_thresh; + + void reset_time(const Clock::time_point& now) { + start_time = now; + renew_thresh = start_time + duration / 2; + } + +public: + RGWBucketReshardLock(RGWRados* _store, + const std::string& reshard_lock_oid, + bool _ephemeral); + RGWBucketReshardLock(RGWRados* _store, + const RGWBucketInfo& bucket_info, + bool _ephemeral) : + RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral) + {} + + int lock(); + void unlock(); + int renew(const Clock::time_point&); + + bool should_renew(const Clock::time_point& now) const { + return now >= renew_thresh; + } +}; // class RGWBucketReshardLock + +class RGWBucketReshard { +public: + + friend class RGWReshard; + + using Clock = ceph::coarse_mono_clock; + +private: + + RGWRados *store; + RGWBucketInfo bucket_info; + std::map bucket_attrs; + + RGWBucketReshardLock reshard_lock; + RGWBucketReshardLock* outer_reshard_lock; + + int create_new_bucket_instance(int new_num_shards, + RGWBucketInfo& new_bucket_info); + int do_reshard(int num_shards, + RGWBucketInfo& new_bucket_info, + int max_entries, + bool verbose, + ostream *os, + Formatter *formatter); +public: + + // pass nullptr for the final parameter if no outer reshard lock to + // manage + RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info, + const std::map& _bucket_attrs, + RGWBucketReshardLock* _outer_reshard_lock); + int execute(int num_shards, int max_op_entries, + bool verbose = false, ostream *out = nullptr, + Formatter *formatter = nullptr, + RGWReshard *reshard_log = nullptr); + int get_status(std::list *status); + int cancel(); + static int clear_resharding(RGWRados* store, + const RGWBucketInfo& bucket_info); + int clear_resharding() { + return clear_resharding(store, bucket_info); + } + static int clear_index_shard_reshard_status(RGWRados* store, + const RGWBucketInfo& bucket_info); + int 
clear_index_shard_reshard_status() { + return clear_index_shard_reshard_status(store, bucket_info); + } + static int set_resharding_status(RGWRados* store, + const RGWBucketInfo& bucket_info, + const string& new_instance_id, + int32_t num_shards, + cls_rgw_reshard_status status); + int set_resharding_status(const string& new_instance_id, + int32_t num_shards, + cls_rgw_reshard_status status) { + return set_resharding_status(store, bucket_info, + new_instance_id, num_shards, status); + } +}; // RGWBucketReshard + +class RGWReshard { +public: + using Clock = ceph::coarse_mono_clock; + +private: + RGWRados *store; + string lock_name; + rados::cls::lock::Lock instance_lock; + int num_logshards; + + bool verbose; + ostream *out; + Formatter *formatter; + + void get_logshard_oid(int shard_num, string *shard); +protected: + class ReshardWorker : public Thread { + CephContext *cct; + RGWReshard *reshard; + Mutex lock; + Cond cond; + + public: + ReshardWorker(CephContext * const _cct, + RGWReshard * const _reshard) + : cct(_cct), + reshard(_reshard), + lock("ReshardWorker") { + } + + void *entry() override; + void stop(); + }; + + ReshardWorker *worker = nullptr; + std::atomic down_flag = { false }; + + string get_logshard_key(const string& tenant, const string& bucket_name); + void get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid); + +public: + RGWReshard(RGWRados* _store, bool _verbose = false, ostream *_out = nullptr, Formatter *_formatter = nullptr); + int add(cls_rgw_reshard_entry& entry); + int update(const RGWBucketInfo& bucket_info, const RGWBucketInfo& new_bucket_info); + int get(cls_rgw_reshard_entry& entry); + int remove(cls_rgw_reshard_entry& entry); + int list(int logshard_num, string& marker, uint32_t max, std::list& entries, bool *is_truncated); + int clear_bucket_resharding(const string& bucket_instance_oid, cls_rgw_reshard_entry& entry); + + /* reshard thread */ + int process_single_logshard(int logshard_num); + int 
process_all_logshards(); + bool going_down(); + void start_processor(); + void stop_processor(); +}; + +class RGWReshardWait { + public: + // the blocking wait uses std::condition_variable::wait_for(), which uses the + // std::chrono::steady_clock. use that for the async waits as well + using Clock = std::chrono::steady_clock; + private: + const ceph::timespan duration; + ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock"); + ceph::condition_variable cond; + + struct Waiter : boost::intrusive::list_base_hook<> { +#if BOOST_VERSION < 107000 + using Timer = boost::asio::basic_waitable_timer; +#else + using Executor = boost::asio::io_context::executor_type; + using Timer = boost::asio::basic_waitable_timer, Executor>; +#endif + Timer timer; + explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {} + }; + boost::intrusive::list waiters; + + bool going_down{false}; + +public: + RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5)) + : duration(duration) {} + ~RGWReshardWait() { + ceph_assert(going_down); + } + int wait(optional_yield y); + // unblock any threads waiting on reshard + void stop(); +}; + +#endif diff --git a/src/rgw/rgw_resolve.cc b/src/rgw/rgw_resolve.cc new file mode 100644 index 00000000..0e515962 --- /dev/null +++ b/src/rgw/rgw_resolve.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +#include "acconfig.h" + +#ifdef HAVE_ARPA_NAMESER_COMPAT_H +#include +#endif + +#include "rgw_common.h" +#include "rgw_resolve.h" +#include "common/dns_resolve.h" + +#define dout_subsys ceph_subsys_rgw + + +RGWResolver::~RGWResolver() { +} + +RGWResolver::RGWResolver() { + resolver = DNSResolver::get_instance(); +} + +int RGWResolver::resolve_cname(const string& hostname, string& cname, bool *found) { + return resolver->resolve_cname(g_ceph_context, hostname, &cname, found); +} + +RGWResolver *rgw_resolver; + + +void 
rgw_init_resolver() +{ + rgw_resolver = new RGWResolver(); +} + +void rgw_shutdown_resolver() +{ + delete rgw_resolver; +} diff --git a/src/rgw/rgw_resolve.h b/src/rgw/rgw_resolve.h new file mode 100644 index 00000000..6e00aaa6 --- /dev/null +++ b/src/rgw/rgw_resolve.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_RESOLVE_H +#define CEPH_RGW_RESOLVE_H + +#include "rgw_common.h" + +namespace ceph { + class DNSResolver; +} + +class RGWResolver { + DNSResolver *resolver; + +public: + ~RGWResolver(); + RGWResolver(); + int resolve_cname(const string& hostname, string& cname, bool *found); +}; + + +extern void rgw_init_resolver(void); +extern void rgw_shutdown_resolver(void); +extern RGWResolver *rgw_resolver; + +#endif diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc new file mode 100644 index 00000000..9a52af5d --- /dev/null +++ b/src/rgw/rgw_rest.cc @@ -0,0 +1,2302 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#include +#include + +#include +#include +#include "common/Formatter.h" +#include "common/HTMLFormatter.h" +#include "common/utf8.h" +#include "include/str_list.h" +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_auth_s3.h" +#include "rgw_formats.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_swift.h" +#include "rgw_rest_s3.h" +#include "rgw_swift_auth.h" +#include "rgw_cors_s3.h" +#include "rgw_perf_counters.h" + +#include "rgw_client_io.h" +#include "rgw_resolve.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +struct rgw_http_status_code { + int code; + const char *name; +}; + +const static struct rgw_http_status_code http_codes[] = { + { 100, "Continue" }, + { 200, "OK" }, + { 201, "Created" }, + { 202, "Accepted" }, + { 204, "No Content" }, + { 205, "Reset Content" }, + { 206, "Partial Content" }, + { 207, "Multi Status" }, + { 208, 
"Already Reported" }, + { 300, "Multiple Choices" }, + { 301, "Moved Permanently" }, + { 302, "Found" }, + { 303, "See Other" }, + { 304, "Not Modified" }, + { 305, "Use Proxy" }, + { 306, "Switch Proxy" }, + { 307, "Temporary Redirect" }, + { 308, "Permanent Redirect" }, + { 400, "Bad Request" }, + { 401, "Unauthorized" }, + { 402, "Payment Required" }, + { 403, "Forbidden" }, + { 404, "Not Found" }, + { 405, "Method Not Allowed" }, + { 406, "Not Acceptable" }, + { 407, "Proxy Authentication Required" }, + { 408, "Request Timeout" }, + { 409, "Conflict" }, + { 410, "Gone" }, + { 411, "Length Required" }, + { 412, "Precondition Failed" }, + { 413, "Request Entity Too Large" }, + { 414, "Request-URI Too Long" }, + { 415, "Unsupported Media Type" }, + { 416, "Requested Range Not Satisfiable" }, + { 417, "Expectation Failed" }, + { 422, "Unprocessable Entity" }, + { 498, "Rate Limited"}, + { 500, "Internal Server Error" }, + { 501, "Not Implemented" }, + { 503, "Slow Down"}, + { 0, NULL }, +}; + +struct rgw_http_attr { + const char *rgw_attr; + const char *http_attr; +}; + +/* + * mapping between rgw object attrs and output http fields + */ +static const struct rgw_http_attr base_rgw_to_http_attrs[] = { + { RGW_ATTR_CONTENT_LANG, "Content-Language" }, + { RGW_ATTR_EXPIRES, "Expires" }, + { RGW_ATTR_CACHE_CONTROL, "Cache-Control" }, + { RGW_ATTR_CONTENT_DISP, "Content-Disposition" }, + { RGW_ATTR_CONTENT_ENC, "Content-Encoding" }, + { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" }, + { RGW_ATTR_X_ROBOTS_TAG , "X-Robots-Tag" }, + { RGW_ATTR_STORAGE_CLASS , "X-Amz-Storage-Class" }, + /* RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode: + * S3 endpoint: x-amz-website-redirect-location + * S3Website endpoint: Location + */ + { RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" }, +}; + + +struct generic_attr { + const char *http_header; + const char *rgw_attr; +}; + +/* + * mapping between http env fields and rgw object attrs + */ 
+static const struct generic_attr generic_attrs[] = { + { "CONTENT_TYPE", RGW_ATTR_CONTENT_TYPE }, + { "HTTP_CONTENT_LANGUAGE", RGW_ATTR_CONTENT_LANG }, + { "HTTP_EXPIRES", RGW_ATTR_EXPIRES }, + { "HTTP_CACHE_CONTROL", RGW_ATTR_CACHE_CONTROL }, + { "HTTP_CONTENT_DISPOSITION", RGW_ATTR_CONTENT_DISP }, + { "HTTP_CONTENT_ENCODING", RGW_ATTR_CONTENT_ENC }, + { "HTTP_X_ROBOTS_TAG", RGW_ATTR_X_ROBOTS_TAG }, +}; + +map rgw_to_http_attrs; +static map generic_attrs_map; +map http_status_names; + +/* + * make attrs look_like_this + * converts dashes to underscores + */ +string lowercase_underscore_http_attr(const string& orig) +{ + const char *s = orig.c_str(); + char buf[orig.size() + 1]; + buf[orig.size()] = '\0'; + + for (size_t i = 0; i < orig.size(); ++i, ++s) { + switch (*s) { + case '-': + buf[i] = '_'; + break; + default: + buf[i] = tolower(*s); + } + } + return string(buf); +} + +/* + * make attrs LOOK_LIKE_THIS + * converts dashes to underscores + */ +string uppercase_underscore_http_attr(const string& orig) +{ + const char *s = orig.c_str(); + char buf[orig.size() + 1]; + buf[orig.size()] = '\0'; + + for (size_t i = 0; i < orig.size(); ++i, ++s) { + switch (*s) { + case '-': + buf[i] = '_'; + break; + default: + buf[i] = toupper(*s); + } + } + return string(buf); +} + +/* avoid duplicate hostnames in hostnames lists */ +static set hostnames_set; +static set hostnames_s3website_set; + +void rgw_rest_init(CephContext *cct, RGWRados *store, const RGWZoneGroup& zone_group) +{ + for (const auto& rgw2http : base_rgw_to_http_attrs) { + rgw_to_http_attrs[rgw2http.rgw_attr] = rgw2http.http_attr; + } + + for (const auto& http2rgw : generic_attrs) { + generic_attrs_map[http2rgw.http_header] = http2rgw.rgw_attr; + } + + list extended_http_attrs; + get_str_list(cct->_conf->rgw_extended_http_attrs, extended_http_attrs); + + list::iterator iter; + for (iter = extended_http_attrs.begin(); iter != extended_http_attrs.end(); ++iter) { + string rgw_attr = RGW_ATTR_PREFIX; + 
rgw_attr.append(lowercase_underscore_http_attr(*iter)); + + rgw_to_http_attrs[rgw_attr] = camelcase_dash_http_attr(*iter); + + string http_header = "HTTP_"; + http_header.append(uppercase_underscore_http_attr(*iter)); + + generic_attrs_map[http_header] = rgw_attr; + } + + for (const struct rgw_http_status_code *h = http_codes; h->code; h++) { + http_status_names[h->code] = h->name; + } + + hostnames_set.insert(cct->_conf->rgw_dns_name); + hostnames_set.insert(zone_group.hostnames.begin(), zone_group.hostnames.end()); + hostnames_set.erase(""); // filter out empty hostnames + ldout(cct, 20) << "RGW hostnames: " << hostnames_set << dendl; + /* TODO: We should have a sanity check that no hostname matches the end of + * any other hostname, otherwise we will get ambiguous results from + * rgw_find_host_in_domains. + * Eg: + * Hostnames: [A, B.A] + * Inputs: [Z.A, X.B.A] + * Z.A clearly splits to subdomain=Z, domain=A + * X.B.A ambiguously splits to both {X, B.A} and {X.B, A} + */ + + hostnames_s3website_set.insert(cct->_conf->rgw_dns_s3website_name); + hostnames_s3website_set.insert(zone_group.hostnames_s3website.begin(), zone_group.hostnames_s3website.end()); + hostnames_s3website_set.erase(""); // filter out empty hostnames + ldout(cct, 20) << "RGW S3website hostnames: " << hostnames_s3website_set << dendl; + /* TODO: we should repeat the hostnames_set sanity check here + * and ALSO decide about overlap, if any + */ +} + +static bool str_ends_with_nocase(const string& s, const string& suffix, size_t *pos) +{ + size_t len = suffix.size(); + if (len > (size_t)s.size()) { + return false; + } + + ssize_t p = s.size() - len; + if (pos) { + *pos = p; + } + + return boost::algorithm::iends_with(s, suffix); +} + +static bool rgw_find_host_in_domains(const string& host, string *domain, string *subdomain, + const set& valid_hostnames_set) +{ + set::iterator iter; + /** TODO, Future optimization + * store hostnames_set elements _reversed_, and look for a prefix match, + * which 
is much faster than a suffix match. + */ + for (iter = valid_hostnames_set.begin(); iter != valid_hostnames_set.end(); ++iter) { + size_t pos; + if (!str_ends_with_nocase(host, *iter, &pos)) + continue; + + if (pos == 0) { + *domain = host; + subdomain->clear(); + } else { + if (host[pos - 1] != '.') { + continue; + } + + *domain = host.substr(pos); + *subdomain = host.substr(0, pos - 1); + } + return true; + } + return false; +} + +static void dump_status(struct req_state *s, int status, + const char *status_name) +{ + s->formatter->set_status(status, status_name); + try { + RESTFUL_IO(s)->send_status(status, status_name); + } catch (rgw::io::Exception& e) { + ldout(s->cct, 0) << "ERROR: s->cio->send_status() returned err=" + << e.what() << dendl; + } +} + +void rgw_flush_formatter_and_reset(struct req_state *s, Formatter *formatter) +{ + std::ostringstream oss; + formatter->output_footer(); + formatter->flush(oss); + std::string outs(oss.str()); + if (!outs.empty() && s->op != OP_HEAD) { + dump_body(s, outs); + } + + s->formatter->reset(); +} + +void rgw_flush_formatter(struct req_state *s, Formatter *formatter) +{ + std::ostringstream oss; + formatter->flush(oss); + std::string outs(oss.str()); + if (!outs.empty() && s->op != OP_HEAD) { + dump_body(s, outs); + } +} + +void dump_errno(int http_ret, string& out) { + stringstream ss; + + ss << http_ret << " " << http_status_names[http_ret]; + out = ss.str(); +} + +void dump_errno(const struct rgw_err &err, string& out) { + dump_errno(err.http_ret, out); +} + +void dump_errno(struct req_state *s) +{ + dump_status(s, s->err.http_ret, http_status_names[s->err.http_ret]); +} + +void dump_errno(struct req_state *s, int http_ret) +{ + dump_status(s, http_ret, http_status_names[http_ret]); +} + +void dump_header(struct req_state* const s, + const boost::string_ref& name, + const boost::string_ref& val) +{ + try { + RESTFUL_IO(s)->send_header(name, val); + } catch (rgw::io::Exception& e) { + ldout(s->cct, 0) << "ERROR: 
s->cio->send_header() returned err=" + << e.what() << dendl; + } +} + +void dump_header(struct req_state* const s, + const boost::string_ref& name, + ceph::buffer::list& bl) +{ + return dump_header(s, name, rgw_sanitized_hdrval(bl)); +} + +void dump_header(struct req_state* const s, + const boost::string_ref& name, + const long long val) +{ + char buf[32]; + const auto len = snprintf(buf, sizeof(buf), "%lld", val); + + return dump_header(s, name, boost::string_ref(buf, len)); +} + +void dump_header(struct req_state* const s, + const boost::string_ref& name, + const utime_t& ut) +{ + char buf[32]; + const auto len = snprintf(buf, sizeof(buf), "%lld.%05d", + static_cast(ut.sec()), + static_cast(ut.usec() / 10)); + + return dump_header(s, name, boost::string_ref(buf, len)); +} + +void dump_content_length(struct req_state* const s, const uint64_t len) +{ + try { + RESTFUL_IO(s)->send_content_length(len); + } catch (rgw::io::Exception& e) { + ldout(s->cct, 0) << "ERROR: s->cio->send_content_length() returned err=" + << e.what() << dendl; + } + dump_header(s, "Accept-Ranges", "bytes"); +} + +static void dump_chunked_encoding(struct req_state* const s) +{ + try { + RESTFUL_IO(s)->send_chunked_transfer_encoding(); + } catch (rgw::io::Exception& e) { + ldout(s->cct, 0) << "ERROR: RESTFUL_IO(s)->send_chunked_transfer_encoding()" + << " returned err=" << e.what() << dendl; + } +} + +void dump_etag(struct req_state* const s, + const boost::string_ref& etag, + const bool quoted) +{ + if (etag.empty()) { + return; + } + + if (s->prot_flags & RGW_REST_SWIFT && ! quoted) { + return dump_header(s, "etag", etag); + } else { + return dump_header_quoted(s, "ETag", etag); + } +} + +void dump_bucket_from_state(struct req_state *s) +{ + if (g_conf()->rgw_expose_bucket && ! s->bucket_name.empty()) { + if (! 
s->bucket_tenant.empty()) { + dump_header(s, "Bucket", + url_encode(s->bucket_tenant + "/" + s->bucket_name)); + } else { + dump_header(s, "Bucket", url_encode(s->bucket_name)); + } + } +} + +void dump_redirect(struct req_state * const s, const std::string& redirect) +{ + return dump_header_if_nonempty(s, "Location", redirect); +} + +static size_t dump_time_header_impl(char (×tr)[TIME_BUF_SIZE], + const real_time t) +{ + const utime_t ut(t); + time_t secs = static_cast(ut.sec()); + + struct tm result; + const struct tm * const tmp = gmtime_r(&secs, &result); + if (tmp == nullptr) { + return 0; + } + + return strftime(timestr, sizeof(timestr), + "%a, %d %b %Y %H:%M:%S %Z", tmp); +} + +void dump_time_header(struct req_state *s, const char *name, real_time t) +{ + char timestr[TIME_BUF_SIZE]; + + const size_t len = dump_time_header_impl(timestr, t); + if (len == 0) { + return; + } + + return dump_header(s, name, boost::string_ref(timestr, len)); +} + +std::string dump_time_to_str(const real_time& t) +{ + char timestr[TIME_BUF_SIZE]; + dump_time_header_impl(timestr, t); + + return timestr; +} + + +void dump_last_modified(struct req_state *s, real_time t) +{ + dump_time_header(s, "Last-Modified", t); +} + +void dump_epoch_header(struct req_state *s, const char *name, real_time t) +{ + utime_t ut(t); + char buf[65]; + const auto len = snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + (long long)ut.nsec()); + + return dump_header(s, name, boost::string_ref(buf, len)); +} + +void dump_time(struct req_state *s, const char *name, real_time *t) +{ + char buf[TIME_BUF_SIZE]; + rgw_to_iso8601(*t, buf, sizeof(buf)); + + s->formatter->dump_string(name, buf); +} + +void dump_owner(struct req_state *s, const rgw_user& id, string& name, + const char *section) +{ + if (!section) + section = "Owner"; + s->formatter->open_object_section(section); + s->formatter->dump_string("ID", id.to_str()); + s->formatter->dump_string("DisplayName", name); + 
s->formatter->close_section(); +} + +void dump_access_control(struct req_state *s, const char *origin, + const char *meth, + const char *hdr, const char *exp_hdr, + uint32_t max_age) { + if (origin && (origin[0] != '\0')) { + dump_header(s, "Access-Control-Allow-Origin", origin); + /* If the server specifies an origin host rather than "*", + * then it must also include Origin in the Vary response header + * to indicate to clients that server responses will differ + * based on the value of the Origin request header. + */ + if (strcmp(origin, "*") != 0) { + dump_header(s, "Vary", "Origin"); + } + + if (meth && (meth[0] != '\0')) { + dump_header(s, "Access-Control-Allow-Methods", meth); + } + if (hdr && (hdr[0] != '\0')) { + dump_header(s, "Access-Control-Allow-Headers", hdr); + } + if (exp_hdr && (exp_hdr[0] != '\0')) { + dump_header(s, "Access-Control-Expose-Headers", exp_hdr); + } + if (max_age != CORS_MAX_AGE_INVALID) { + dump_header(s, "Access-Control-Max-Age", max_age); + } + } +} + +void dump_access_control(req_state *s, RGWOp *op) +{ + string origin; + string method; + string header; + string exp_header; + unsigned max_age = CORS_MAX_AGE_INVALID; + + if (!op->generate_cors_headers(origin, method, header, exp_header, &max_age)) + return; + + dump_access_control(s, origin.c_str(), method.c_str(), header.c_str(), + exp_header.c_str(), max_age); +} + +void dump_start(struct req_state *s) +{ + if (!s->content_started) { + s->formatter->output_header(); + s->content_started = true; + } +} + +void dump_trans_id(req_state *s) +{ + if (s->prot_flags & RGW_REST_SWIFT) { + dump_header(s, "X-Trans-Id", s->trans_id); + dump_header(s, "X-Openstack-Request-Id", s->trans_id); + } else if (s->trans_id.length()) { + dump_header(s, "x-amz-request-id", s->trans_id); + } +} + +void end_header(struct req_state* s, RGWOp* op, const char *content_type, + const int64_t proposed_content_length, bool force_content_type, + bool force_no_error) +{ + string ctype; + + dump_trans_id(s); + + 
if ((!s->is_err()) && + (s->bucket_info.owner != s->user->user_id) && + (s->bucket_info.requester_pays)) { + dump_header(s, "x-amz-request-charged", "requester"); + } + + if (op) { + dump_access_control(s, op); + } + + if (s->prot_flags & RGW_REST_SWIFT && !content_type) { + force_content_type = true; + } + + /* do not send content type if content length is zero + and the content type was not set by the user */ + if (force_content_type || + (!content_type && s->formatter->get_len() != 0) || s->is_err()){ + switch (s->format) { + case RGW_FORMAT_XML: + ctype = "application/xml"; + break; + case RGW_FORMAT_JSON: + ctype = "application/json"; + break; + case RGW_FORMAT_HTML: + ctype = "text/html"; + break; + default: + ctype = "text/plain"; + break; + } + if (s->prot_flags & RGW_REST_SWIFT) + ctype.append("; charset=utf-8"); + content_type = ctype.c_str(); + } + if (!force_no_error && s->is_err()) { + dump_start(s); + dump(s); + dump_content_length(s, s->formatter->get_len()); + } else { + if (proposed_content_length == CHUNKED_TRANSFER_ENCODING) { + dump_chunked_encoding(s); + } else if (proposed_content_length != NO_CONTENT_LENGTH) { + dump_content_length(s, proposed_content_length); + } + } + + if (content_type) { + dump_header(s, "Content-Type", content_type); + } + dump_header_if_nonempty(s, "Server", g_conf()->rgw_service_provider_name); + + try { + RESTFUL_IO(s)->complete_header(); + } catch (rgw::io::Exception& e) { + ldout(s->cct, 0) << "ERROR: RESTFUL_IO(s)->complete_header() returned err=" + << e.what() << dendl; + } + + ACCOUNTING_IO(s)->set_account(true); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void build_redirect_url(req_state *s, const string& redirect_base, string *redirect_url) +{ + string& dest_uri = *redirect_url; + + dest_uri = redirect_base; + /* + * reqest_uri is always start with slash, so we need to remove + * the unnecessary slash at the end of dest_uri. 
+ */ + if (dest_uri[dest_uri.size() - 1] == '/') { + dest_uri = dest_uri.substr(0, dest_uri.size() - 1); + } + dest_uri += s->info.request_uri; + dest_uri += "?"; + dest_uri += s->info.request_params; +} + +void abort_early(struct req_state *s, RGWOp* op, int err_no, + RGWHandler* handler) +{ + string error_content(""); + if (!s->formatter) { + s->formatter = new JSONFormatter; + s->format = RGW_FORMAT_JSON; + } + + // op->error_handler is responsible for calling it's handler error_handler + if (op != NULL) { + int new_err_no; + new_err_no = op->error_handler(err_no, &error_content); + ldout(s->cct, 20) << "op->ERRORHANDLER: err_no=" << err_no + << " new_err_no=" << new_err_no << dendl; + err_no = new_err_no; + } else if (handler != NULL) { + int new_err_no; + new_err_no = handler->error_handler(err_no, &error_content); + ldout(s->cct, 20) << "handler->ERRORHANDLER: err_no=" << err_no + << " new_err_no=" << new_err_no << dendl; + err_no = new_err_no; + } + + // If the error handler(s) above dealt with it completely, they should have + // returned 0. If non-zero, we need to continue here. + if (err_no) { + // Watch out, we might have a custom error state already set! 
+ if (!s->err.http_ret || s->err.http_ret == 200) { + set_req_state_err(s, err_no); + } + + if (s->err.http_ret == 404 && !s->redirect_zone_endpoint.empty()) { + s->err.http_ret = 301; + err_no = -ERR_PERMANENT_REDIRECT; + build_redirect_url(s, s->redirect_zone_endpoint, &s->redirect); + } + + dump_errno(s); + dump_bucket_from_state(s); + if (err_no == -ERR_PERMANENT_REDIRECT || err_no == -ERR_WEBSITE_REDIRECT) { + string dest_uri; + if (!s->redirect.empty()) { + dest_uri = s->redirect; + } else if (!s->zonegroup_endpoint.empty()) { + build_redirect_url(s, s->zonegroup_endpoint, &dest_uri); + } + + if (!dest_uri.empty()) { + dump_redirect(s, dest_uri); + } + } + + if (!error_content.empty()) { + /* + * TODO we must add all error entries as headers here: + * when having a working errordoc, then the s3 error fields are + * rendered as HTTP headers, e.g.: + * x-amz-error-code: NoSuchKey + * x-amz-error-message: The specified key does not exist. + * x-amz-error-detail-Key: foo + */ + end_header(s, op, NULL, error_content.size(), false, true); + RESTFUL_IO(s)->send_body(error_content.c_str(), error_content.size()); + } else { + end_header(s, op); + } + } + perfcounter->inc(l_rgw_failed_req); +} + +void dump_continue(struct req_state * const s) +{ + try { + RESTFUL_IO(s)->send_100_continue(); + } catch (rgw::io::Exception& e) { + ldout(s->cct, 0) << "ERROR: RESTFUL_IO(s)->send_100_continue() returned err=" + << e.what() << dendl; + } +} + +void dump_range(struct req_state* const s, + const uint64_t ofs, + const uint64_t end, + const uint64_t total) +{ + /* dumping range into temp buffer first, as libfcgi will fail to digest + * %lld */ + char range_buf[128]; + size_t len; + + if (! 
total) { + len = snprintf(range_buf, sizeof(range_buf), "bytes */%lld", + static_cast(total)); + } else { + len = snprintf(range_buf, sizeof(range_buf), "bytes %lld-%lld/%lld", + static_cast(ofs), + static_cast(end), + static_cast(total)); + } + + return dump_header(s, "Content-Range", boost::string_ref(range_buf, len)); +} + + +int dump_body(struct req_state* const s, + const char* const buf, + const size_t len) +{ + try { + return RESTFUL_IO(s)->send_body(buf, len); + } catch (rgw::io::Exception& e) { + return -e.code().value(); + } +} + +int dump_body(struct req_state* const s, /* const */ ceph::buffer::list& bl) +{ + return dump_body(s, bl.c_str(), bl.length()); +} + +int dump_body(struct req_state* const s, const std::string& str) +{ + return dump_body(s, str.c_str(), str.length()); +} + +int recv_body(struct req_state* const s, + char* const buf, + const size_t max) +{ + try { + return RESTFUL_IO(s)->recv_body(buf, max); + } catch (rgw::io::Exception& e) { + return -e.code().value(); + } +} + +int RGWGetObj_ObjStore::get_params() +{ + range_str = s->info.env->get("HTTP_RANGE"); + if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_IF_NONE_MATCH"); + + if (s->system_request) { + mod_zone_id = s->info.env->get_int("HTTP_DEST_ZONE_SHORT_ID", 0); + mod_pg_ver = s->info.env->get_int("HTTP_DEST_PG_VER", 0); + rgwx_stat = s->info.args.exists(RGW_SYS_PARAM_PREFIX "stat"); + get_data &= (!rgwx_stat); + } + + if (s->info.args.exists(GET_TORRENT)) { + return torrent.get_params(); + } + return 0; +} + +int RESTArgs::get_string(struct req_state *s, const string& name, + const string& def_val, string *val, bool *existed) +{ + bool exists; + *val = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + return 0; +} + +int RESTArgs::get_uint64(struct req_state 
*s, const string& name, + uint64_t def_val, uint64_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtoull(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_int64(struct req_state *s, const string& name, + int64_t def_val, int64_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtoll(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_uint32(struct req_state *s, const string& name, + uint32_t def_val, uint32_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtoul(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_int32(struct req_state *s, const string& name, + int32_t def_val, int32_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + int r = stringtol(sval, val); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_time(struct req_state *s, const string& name, + const utime_t& def_val, utime_t *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + uint64_t epoch, nsec; + + int r = utime_t::parse_date(sval, &epoch, &nsec); + if (r < 0) + return r; + + *val = utime_t(epoch, nsec); + + return 0; +} + +int RESTArgs::get_epoch(struct req_state *s, const string& name, uint64_t def_val, uint64_t *epoch, bool *existed) +{ + bool exists; + string date = s->info.args.get(name, &exists); + + if 
(existed) + *existed = exists; + + if (!exists) { + *epoch = def_val; + return 0; + } + + int r = utime_t::parse_date(date, epoch, NULL); + if (r < 0) + return r; + + return 0; +} + +int RESTArgs::get_bool(struct req_state *s, const string& name, bool def_val, bool *val, bool *existed) +{ + bool exists; + string sval = s->info.args.get(name, &exists); + + if (existed) + *existed = exists; + + if (!exists) { + *val = def_val; + return 0; + } + + const char *str = sval.c_str(); + + if (sval.empty() || + strcasecmp(str, "true") == 0 || + sval.compare("1") == 0) { + *val = true; + return 0; + } + + if (strcasecmp(str, "false") != 0 && + sval.compare("0") != 0) { + *val = def_val; + return -EINVAL; + } + + *val = false; + return 0; +} + + +void RGWRESTFlusher::do_start(int ret) +{ + set_req_state_err(s, ret); /* no going back from here */ + dump_errno(s); + dump_start(s); + end_header(s, op); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWRESTFlusher::do_flush() +{ + rgw_flush_formatter(s, s->formatter); +} + +int RGWPutObj_ObjStore::verify_params() +{ + if (s->length) { + off_t len = atoll(s->length); + if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) { + return -ERR_TOO_LARGE; + } + } + + return 0; +} + +int RGWPutObj_ObjStore::get_params() +{ + /* start gettorrent */ + if (s->cct->_conf->rgw_torrent_flag) + { + int ret = 0; + ret = torrent.get_params(); + ldout(s->cct, 5) << "NOTICE: open produce torrent file " << dendl; + if (ret < 0) + { + return ret; + } + torrent.set_info_name((s->object).name); + } + /* end gettorrent */ + supplied_md5_b64 = s->info.env->get("HTTP_CONTENT_MD5"); + + return 0; +} + +int RGWPutObj_ObjStore::get_data(bufferlist& bl) +{ + size_t cl; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + if (s->length) { + cl = atoll(s->length) - ofs; + if (cl > chunk_size) + cl = chunk_size; + } else { + cl = chunk_size; + } + + int len = 0; + { + ACCOUNTING_IO(s)->set_account(true); + bufferptr bp(cl); + + const auto 
read_len = recv_body(s, bp.c_str(), cl); + if (read_len < 0) { + return read_len; + } + + len = read_len; + bl.append(bp, 0, len); + + ACCOUNTING_IO(s)->set_account(false); + } + + if ((uint64_t)ofs + len > s->cct->_conf->rgw_max_put_size) { + return -ERR_TOO_LARGE; + } + + return len; +} + + +/* + * parses params in the format: 'first; param1=foo; param2=bar' + */ +void RGWPostObj_ObjStore::parse_boundary_params(const std::string& params_str, + std::string& first, + std::map& params) +{ + size_t pos = params_str.find(';'); + if (std::string::npos == pos) { + first = rgw_trim_whitespace(params_str); + return; + } + + first = rgw_trim_whitespace(params_str.substr(0, pos)); + pos++; + + while (pos < params_str.size()) { + size_t end = params_str.find(';', pos); + if (std::string::npos == end) { + end = params_str.size(); + } + + std::string param = params_str.substr(pos, end - pos); + size_t eqpos = param.find('='); + + if (std::string::npos != eqpos) { + std::string param_name = rgw_trim_whitespace(param.substr(0, eqpos)); + std::string val = rgw_trim_quotes(param.substr(eqpos + 1)); + params[std::move(param_name)] = std::move(val); + } else { + params[rgw_trim_whitespace(param)] = ""; + } + + pos = end + 1; + } +} + +int RGWPostObj_ObjStore::parse_part_field(const std::string& line, + std::string& field_name, /* out */ + post_part_field& field) /* out */ +{ + size_t pos = line.find(':'); + if (pos == string::npos) + return -EINVAL; + + field_name = line.substr(0, pos); + if (pos >= line.size() - 1) + return 0; + + parse_boundary_params(line.substr(pos + 1), field.val, field.params); + + return 0; +} + +static bool is_crlf(const char *s) +{ + return (*s == '\r' && *(s + 1) == '\n'); +} + +/* + * find the index of the boundary, if exists, or optionally the next end of line + * also returns how many bytes to skip + */ +static int index_of(ceph::bufferlist& bl, + uint64_t max_len, + const std::string& str, + const bool check_crlf, + bool& reached_boundary, + int& skip) 
+{ + reached_boundary = false; + skip = 0; + + if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks) + return -EINVAL; + + if (bl.length() < str.size()) + return -1; + + const char *buf = bl.c_str(); + const char *s = str.c_str(); + + if (max_len > bl.length()) + max_len = bl.length(); + + for (uint64_t i = 0; i < max_len; i++, buf++) { + if (check_crlf && + i >= 1 && + is_crlf(buf - 1)) { + return i + 1; // skip the crlf + } + if ((i < max_len - str.size() + 1) && + (buf[0] == s[0] && buf[1] == s[1]) && + (strncmp(buf, s, str.size()) == 0)) { + reached_boundary = true; + skip = str.size(); + + /* oh, great, now we need to swallow the preceding crlf + * if exists + */ + if ((i >= 2) && + is_crlf(buf - 2)) { + i -= 2; + skip += 2; + } + return i; + } + } + + return -1; +} + +int RGWPostObj_ObjStore::read_with_boundary(ceph::bufferlist& bl, + uint64_t max, + const bool check_crlf, + bool& reached_boundary, + bool& done) +{ + uint64_t cl = max + 2 + boundary.size(); + + if (max > in_data.length()) { + uint64_t need_to_read = cl - in_data.length(); + + bufferptr bp(need_to_read); + + const auto read_len = recv_body(s, bp.c_str(), need_to_read); + if (read_len < 0) { + return read_len; + } + in_data.append(bp, 0, read_len); + } + + done = false; + int skip; + const int index = index_of(in_data, cl, boundary, check_crlf, + reached_boundary, skip); + if (index >= 0) { + max = index; + } + + if (max > in_data.length()) { + max = in_data.length(); + } + + bl.substr_of(in_data, 0, max); + + ceph::bufferlist new_read_data; + + /* + * now we need to skip boundary for next time, also skip any crlf, or + * check to see if it's the last final boundary (marked with "--" at the end + */ + if (reached_boundary) { + int left = in_data.length() - max; + if (left < skip + 2) { + int need = skip + 2 - left; + bufferptr boundary_bp(need); + const int r = recv_body(s, boundary_bp.c_str(), need); + if (r < 0) { + return r; + } + 
in_data.append(boundary_bp); + } + max += skip; // skip boundary for next time + if (in_data.length() >= max + 2) { + const char *data = in_data.c_str(); + if (is_crlf(data + max)) { + max += 2; + } else { + if (*(data + max) == '-' && + *(data + max + 1) == '-') { + done = true; + max += 2; + } + } + } + } + + new_read_data.substr_of(in_data, max, in_data.length() - max); + in_data = new_read_data; + + return 0; +} + +int RGWPostObj_ObjStore::read_line(ceph::bufferlist& bl, + const uint64_t max, + bool& reached_boundary, + bool& done) +{ + return read_with_boundary(bl, max, true, reached_boundary, done); +} + +int RGWPostObj_ObjStore::read_data(ceph::bufferlist& bl, + const uint64_t max, + bool& reached_boundary, + bool& done) +{ + return read_with_boundary(bl, max, false, reached_boundary, done); +} + + +int RGWPostObj_ObjStore::read_form_part_header(struct post_form_part* const part, + bool& done) +{ + bufferlist bl; + bool reached_boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + int r = read_line(bl, chunk_size, reached_boundary, done); + if (r < 0) { + return r; + } + + if (done) { + return 0; + } + + if (reached_boundary) { // skip the first boundary + r = read_line(bl, chunk_size, reached_boundary, done); + if (r < 0) { + return r; + } else if (done) { + return 0; + } + } + + while (true) { + /* + * iterate through fields + */ + std::string line = rgw_trim_whitespace(string(bl.c_str(), bl.length())); + + if (line.empty()) { + break; + } + + struct post_part_field field; + + string field_name; + r = parse_part_field(line, field_name, field); + if (r < 0) { + return r; + } + + part->fields[field_name] = field; + + if (stringcasecmp(field_name, "Content-Disposition") == 0) { + part->name = field.params["name"]; + } + + if (reached_boundary) { + break; + } + + r = read_line(bl, chunk_size, reached_boundary, done); + if (r < 0) { + return r; + } + } + + return 0; +} + +bool RGWPostObj_ObjStore::part_str(parts_collection_t& parts, + const 
std::string& name, + std::string* val) +{ + const auto iter = parts.find(name); + if (std::end(parts) == iter) { + return false; + } + + ceph::bufferlist& data = iter->second.data; + std::string str = string(data.c_str(), data.length()); + *val = rgw_trim_whitespace(str); + return true; +} + +std::string RGWPostObj_ObjStore::get_part_str(parts_collection_t& parts, + const std::string& name, + const std::string& def_val) +{ + std::string val; + + if (part_str(parts, name, &val)) { + return val; + } else { + return rgw_trim_whitespace(def_val); + } +} + +bool RGWPostObj_ObjStore::part_bl(parts_collection_t& parts, + const std::string& name, + ceph::bufferlist* pbl) +{ + const auto iter = parts.find(name); + if (std::end(parts) == iter) { + return false; + } + + *pbl = iter->second.data; + return true; +} + +int RGWPostObj_ObjStore::verify_params() +{ + /* check that we have enough memory to store the object + note that this test isn't exact and may fail unintentionally + for large requests is */ + if (!s->length) { + return -ERR_LENGTH_REQUIRED; + } + off_t len = atoll(s->length); + if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) { + return -ERR_TOO_LARGE; + } + + supplied_md5_b64 = s->info.env->get("HTTP_CONTENT_MD5"); + + return 0; +} + +int RGWPostObj_ObjStore::get_params() +{ + if (s->expect_cont) { + /* OK, here it really gets ugly. With POST, the params are embedded in the + * request body, so we need to continue before being able to actually look + * at them. This diverts from the usual request flow. 
*/ + dump_continue(s); + s->expect_cont = false; + } + + std::string req_content_type_str = s->info.env->get("CONTENT_TYPE", ""); + std::string req_content_type; + std::map params; + parse_boundary_params(req_content_type_str, req_content_type, params); + + if (req_content_type.compare("multipart/form-data") != 0) { + err_msg = "Request Content-Type is not multipart/form-data"; + return -EINVAL; + } + + if (s->cct->_conf->subsys.should_gather()) { + ldout(s->cct, 20) << "request content_type_str=" + << req_content_type_str << dendl; + ldout(s->cct, 20) << "request content_type params:" << dendl; + + for (const auto& pair : params) { + ldout(s->cct, 20) << " " << pair.first << " -> " << pair.second + << dendl; + } + } + + const auto iter = params.find("boundary"); + if (std::end(params) == iter) { + err_msg = "Missing multipart boundary specification"; + return -EINVAL; + } + + /* Create the boundary. */ + boundary = "--"; + boundary.append(iter->second); + + return 0; +} + + +int RGWPutACLs_ObjStore::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + ldout(s->cct, 0) << "RGWPutACLs_ObjStore::get_params read data is: " << data.c_str() << dendl; + return op_ret; +} + +int RGWPutLC_ObjStore::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + return op_ret; +} + +int RGWPutBucketObjectLock_ObjStore::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + return op_ret; +} + +int RGWPutObjLegalHold_ObjStore::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + return op_ret; +} + + +static std::tuple read_all_chunked_input(req_state *s, const uint64_t max_read) +{ +#define 
READ_CHUNK 4096 +#define MAX_READ_CHUNK (128 * 1024) + int need_to_read = READ_CHUNK; + int total = need_to_read; + bufferlist bl; + + int read_len = 0, len = 0; + do { + bufferptr bp(need_to_read + 1); + read_len = recv_body(s, bp.c_str(), need_to_read); + if (read_len < 0) { + return std::make_tuple(read_len, std::move(bl)); + } + + bp.c_str()[read_len] = '\0'; + bp.set_length(read_len); + bl.append(bp); + len += read_len; + + if (read_len == need_to_read) { + if (need_to_read < MAX_READ_CHUNK) + need_to_read *= 2; + + if ((unsigned)total > max_read) { + return std::make_tuple(-ERANGE, std::move(bl)); + } + total += need_to_read; + } else { + break; + } + } while (true); + + return std::make_tuple(0, std::move(bl)); +} + +std::tuple rgw_rest_read_all_input(struct req_state *s, + const uint64_t max_len, + const bool allow_chunked) +{ + size_t cl = 0; + int len = 0; + bufferlist bl; + + if (s->length) + cl = atoll(s->length); + else if (!allow_chunked) + return std::make_tuple(-ERR_LENGTH_REQUIRED, std::move(bl)); + + if (cl) { + if (cl > (size_t)max_len) { + return std::make_tuple(-ERANGE, std::move(bl)); + } + + bufferptr bp(cl + 1); + + len = recv_body(s, bp.c_str(), cl); + if (len < 0) { + return std::make_tuple(len, std::move(bl)); + } + + bp.c_str()[len] = '\0'; + bp.set_length(len); + bl.append(bp); + + } else if (allow_chunked && !s->length) { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + if (!encoding || strcmp(encoding, "chunked") != 0) + return std::make_tuple(-ERR_LENGTH_REQUIRED, std::move(bl)); + + int ret = 0; + std::tie(ret, bl) = read_all_chunked_input(s, max_len); + if (ret < 0) + return std::make_tuple(ret, std::move(bl)); + } + + return std::make_tuple(0, std::move(bl)); +} + +int RGWCompleteMultipart_ObjStore::get_params() +{ + upload_id = s->info.args.get("uploadId"); + + if (upload_id.empty()) { + op_ret = -ENOTSUP; + return op_ret; + } + +#define COMPLETE_MULTIPART_MAX_LEN (1024 * 1024) /* api defines max 10,000 
parts, this should be enough */ + std::tie(op_ret, data) = rgw_rest_read_all_input(s, COMPLETE_MULTIPART_MAX_LEN); + if (op_ret < 0) + return op_ret; + + return 0; +} + +int RGWListMultipart_ObjStore::get_params() +{ + upload_id = s->info.args.get("uploadId"); + + if (upload_id.empty()) { + op_ret = -ENOTSUP; + } + string marker_str = s->info.args.get("part-number-marker"); + + if (!marker_str.empty()) { + string err; + marker = strict_strtol(marker_str.c_str(), 10, &err); + if (!err.empty()) { + ldout(s->cct, 20) << "bad marker: " << marker << dendl; + op_ret = -EINVAL; + return op_ret; + } + } + + string str = s->info.args.get("max-parts"); + op_ret = parse_value_and_bound(str, max_parts, 0, + g_conf().get_val("rgw_max_listing_results"), + max_parts); + + return op_ret; +} + +int RGWListBucketMultiparts_ObjStore::get_params() +{ + delimiter = s->info.args.get("delimiter"); + prefix = s->info.args.get("prefix"); + string str = s->info.args.get("max-uploads"); + op_ret = parse_value_and_bound(str, max_uploads, 0, + g_conf().get_val("rgw_max_listing_results"), + default_max); + if (op_ret < 0) { + return op_ret; + } + + string key_marker = s->info.args.get("key-marker"); + string upload_id_marker = s->info.args.get("upload-id-marker"); + if (!key_marker.empty()) + marker.init(key_marker, upload_id_marker); + + return 0; +} + +int RGWDeleteMultiObj_ObjStore::get_params() +{ + + if (s->bucket_name.empty()) { + op_ret = -EINVAL; + return op_ret; + } + + // everything is probably fine, set the bucket + bucket = s->bucket; + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + return op_ret; +} + + +void RGWRESTOp::send_response() +{ + if (!flusher.did_start()) { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s, this); + } + flusher.flush(); +} + +int RGWRESTOp::verify_permission() +{ + return check_caps(s->user->caps); +} + +RGWOp* RGWHandler_REST::get_op(RGWRados* 
store) +{ + RGWOp *op; + switch (s->op) { + case OP_GET: + op = op_get(); + break; + case OP_PUT: + op = op_put(); + break; + case OP_DELETE: + op = op_delete(); + break; + case OP_HEAD: + op = op_head(); + break; + case OP_POST: + op = op_post(); + break; + case OP_COPY: + op = op_copy(); + break; + case OP_OPTIONS: + op = op_options(); + break; + default: + return NULL; + } + + if (op) { + op->init(store, s, this); + } + return op; +} /* get_op */ + +void RGWHandler_REST::put_op(RGWOp* op) +{ + delete op; +} /* put_op */ + +int RGWHandler_REST::allocate_formatter(struct req_state *s, + int default_type, + bool configurable) +{ + s->format = -1; // set to invalid value to allocation happens anyway + auto type = default_type; + if (configurable) { + string format_str = s->info.args.get("format"); + if (format_str.compare("xml") == 0) { + type = RGW_FORMAT_XML; + } else if (format_str.compare("json") == 0) { + type = RGW_FORMAT_JSON; + } else if (format_str.compare("html") == 0) { + type = RGW_FORMAT_HTML; + } else { + const char *accept = s->info.env->get("HTTP_ACCEPT"); + if (accept) { + char format_buf[64]; + unsigned int i = 0; + for (; i < sizeof(format_buf) - 1 && accept[i] && accept[i] != ';'; ++i) { + format_buf[i] = accept[i]; + } + format_buf[i] = 0; + if ((strcmp(format_buf, "text/xml") == 0) || (strcmp(format_buf, "application/xml") == 0)) { + type = RGW_FORMAT_XML; + } else if (strcmp(format_buf, "application/json") == 0) { + type = RGW_FORMAT_JSON; + } else if (strcmp(format_buf, "text/html") == 0) { + type = RGW_FORMAT_HTML; + } + } + } + } + return RGWHandler_REST::reallocate_formatter(s, type); +} + +int RGWHandler_REST::reallocate_formatter(struct req_state *s, int type) +{ + if (s->format == type) { + // do nothing, just reset + ceph_assert(s->formatter); + s->formatter->reset(); + return 0; + } + + delete s->formatter; + s->formatter = nullptr; + s->format = type; + + const string& mm = s->info.args.get("multipart-manifest"); + const bool 
multipart_delete = (mm.compare("delete") == 0); + const bool swift_bulkupload = s->prot_flags & RGW_REST_SWIFT && + s->info.args.exists("extract-archive"); + switch (s->format) { + case RGW_FORMAT_PLAIN: + { + const bool use_kv_syntax = s->info.args.exists("bulk-delete") || + multipart_delete || swift_bulkupload; + s->formatter = new RGWFormatter_Plain(use_kv_syntax); + break; + } + case RGW_FORMAT_XML: + { + const bool lowercase_underscore = s->info.args.exists("bulk-delete") || + multipart_delete || swift_bulkupload; + + s->formatter = new XMLFormatter(false, lowercase_underscore); + break; + } + case RGW_FORMAT_JSON: + s->formatter = new JSONFormatter(false); + break; + case RGW_FORMAT_HTML: + s->formatter = new HTMLFormatter(s->prot_flags & RGW_REST_WEBSITE); + break; + default: + return -EINVAL; + + }; + //s->formatter->reset(); // All formatters should reset on create already + + return 0; +} + +// This function enforces Amazon's spec for bucket names. +// (The requirements, not the recommendations.) +int RGWHandler_REST::validate_bucket_name(const string& bucket) +{ + int len = bucket.size(); + if (len < 3) { + if (len == 0) { + // This request doesn't specify a bucket at all + return 0; + } + // Name too short + return -ERR_INVALID_BUCKET_NAME; + } + else if (len > MAX_BUCKET_NAME_LEN) { + // Name too long + return -ERR_INVALID_BUCKET_NAME; + } + + const char *s = bucket.c_str(); + for (int i = 0; i < len; ++i, ++s) { + if (*(unsigned char *)s == 0xff) + return -ERR_INVALID_BUCKET_NAME; + if (*(unsigned char *)s == '/') + return -ERR_INVALID_BUCKET_NAME; + } + + return 0; +} + +// "The name for a key is a sequence of Unicode characters whose UTF-8 encoding +// is at most 1024 bytes long." +// However, we can still have control characters and other nasties in there. +// Just as long as they're utf-8 nasties. 
+int RGWHandler_REST::validate_object_name(const string& object) +{ + int len = object.size(); + if (len > MAX_OBJ_NAME_LEN) { + // Name too long + return -ERR_INVALID_OBJECT_NAME; + } + + if (check_utf8(object.c_str(), len)) { + // Object names must be valid UTF-8. + return -ERR_INVALID_OBJECT_NAME; + } + return 0; +} + +static http_op op_from_method(const char *method) +{ + if (!method) + return OP_UNKNOWN; + if (strcmp(method, "GET") == 0) + return OP_GET; + if (strcmp(method, "PUT") == 0) + return OP_PUT; + if (strcmp(method, "DELETE") == 0) + return OP_DELETE; + if (strcmp(method, "HEAD") == 0) + return OP_HEAD; + if (strcmp(method, "POST") == 0) + return OP_POST; + if (strcmp(method, "COPY") == 0) + return OP_COPY; + if (strcmp(method, "OPTIONS") == 0) + return OP_OPTIONS; + + return OP_UNKNOWN; +} + +int RGWHandler_REST::init_permissions(RGWOp* op) +{ + if (op->get_type() == RGW_OP_CREATE_BUCKET) { + // We don't need user policies in case of STS token returned by AssumeRole, hence the check for user type + if (! s->user->user_id.empty() && s->auth.identity->get_identity_type() != TYPE_ROLE) { + try { + map uattrs; + if (auto ret = rgw_get_user_attrs_by_uid(store, s->user->user_id, uattrs); ! ret) { + if (s->iam_user_policies.empty()) { + s->iam_user_policies = get_iam_user_policy_from_attr(s->cct, store, uattrs, s->user->user_id.tenant); + } else { + // This scenario can happen when a STS token has a policy, then we need to append other user policies + // to the existing ones. (e.g. 
token returned by GetSessionToken) + auto user_policies = get_iam_user_policy_from_attr(s->cct, store, uattrs, s->user->user_id.tenant); + s->iam_user_policies.insert(s->iam_user_policies.end(), user_policies.begin(), user_policies.end()); + } + } + } catch (const std::exception& e) { + lderr(s->cct) << "Error reading IAM User Policy: " << e.what() << dendl; + } + } + rgw_build_iam_environment(store, s); + return 0; + } + + return do_init_permissions(); +} + +int RGWHandler_REST::read_permissions(RGWOp* op_obj) +{ + bool only_bucket = false; + + switch (s->op) { + case OP_HEAD: + case OP_GET: + only_bucket = false; + break; + case OP_PUT: + case OP_POST: + case OP_COPY: + /* is it a 'multi-object delete' request? */ + if (s->info.args.exists("delete")) { + only_bucket = true; + break; + } + if (is_obj_update_op()) { + only_bucket = false; + break; + } + /* is it a 'create bucket' request? */ + if (op_obj->get_type() == RGW_OP_CREATE_BUCKET) + return 0; + only_bucket = true; + break; + case OP_DELETE: + if (!s->info.args.exists("tagging")){ + only_bucket = true; + } + break; + case OP_OPTIONS: + only_bucket = true; + break; + default: + return -EINVAL; + } + + return do_read_permissions(op_obj, only_bucket); +} + +void RGWRESTMgr::register_resource(string resource, RGWRESTMgr *mgr) +{ + string r = "/"; + r.append(resource); + + /* do we have a resource manager registered for this entry point? 
*/ + map::iterator iter = resource_mgrs.find(r); + if (iter != resource_mgrs.end()) { + delete iter->second; + } + resource_mgrs[r] = mgr; + resources_by_size.insert(pair(r.size(), r)); + + /* now build default resource managers for the path (instead of nested entry points) + * e.g., if the entry point is /auth/v1.0/ then we'd want to create a default + * manager for /auth/ + */ + + size_t pos = r.find('/', 1); + + while (pos != r.size() - 1 && pos != string::npos) { + string s = r.substr(0, pos); + + iter = resource_mgrs.find(s); + if (iter == resource_mgrs.end()) { /* only register it if one does not exist */ + resource_mgrs[s] = new RGWRESTMgr; /* a default do-nothing manager */ + resources_by_size.insert(pair(s.size(), s)); + } + + pos = r.find('/', pos + 1); + } +} + +void RGWRESTMgr::register_default_mgr(RGWRESTMgr *mgr) +{ + delete default_mgr; + default_mgr = mgr; +} + +RGWRESTMgr* RGWRESTMgr::get_resource_mgr(struct req_state* const s, + const std::string& uri, + std::string* const out_uri) +{ + *out_uri = uri; + + multimap::reverse_iterator iter; + + for (iter = resources_by_size.rbegin(); iter != resources_by_size.rend(); ++iter) { + string& resource = iter->second; + if (uri.compare(0, iter->first, resource) == 0 && + (uri.size() == iter->first || + uri[iter->first] == '/')) { + std::string suffix = uri.substr(iter->first); + return resource_mgrs[resource]->get_resource_mgr(s, suffix, out_uri); + } + } + + if (default_mgr) { + return default_mgr->get_resource_mgr_as_default(s, uri, out_uri); + } + + return this; +} + +void RGWREST::register_x_headers(const string& s_headers) +{ + std::vector hdrs = get_str_vec(s_headers); + for (auto& hdr : hdrs) { + boost::algorithm::to_upper(hdr); // XXX + (void) x_headers.insert(hdr); + } +} + +RGWRESTMgr::~RGWRESTMgr() +{ + map::iterator iter; + for (iter = resource_mgrs.begin(); iter != resource_mgrs.end(); ++iter) { + delete iter->second; + } + delete default_mgr; +} + +int64_t parse_content_length(const char 
*content_length) +{ + int64_t len = -1; + + if (*content_length == '\0') { + len = 0; + } else { + string err; + len = strict_strtoll(content_length, 10, &err); + if (!err.empty()) { + len = -1; + } + } + + return len; +} + +int RGWREST::preprocess(struct req_state *s, rgw::io::BasicClient* cio) +{ + req_info& info = s->info; + + /* save the request uri used to hash on the client side. request_uri may suffer + modifications as part of the bucket encoding in the subdomain calling format. + request_uri_aws4 will be used under aws4 auth */ + s->info.request_uri_aws4 = s->info.request_uri; + + s->cio = cio; + + // We need to know if this RGW instance is running the s3website API with a + // higher priority than regular S3 API, or possibly in place of the regular + // S3 API. + // Map the listing of rgw_enable_apis in REVERSE order, so that items near + // the front of the list have a higher number assigned (and -1 for items not in the list). + list apis; + get_str_list(g_conf()->rgw_enable_apis, apis); + int api_priority_s3 = -1; + int api_priority_s3website = -1; + auto api_s3website_priority_rawpos = std::find(apis.begin(), apis.end(), "s3website"); + auto api_s3_priority_rawpos = std::find(apis.begin(), apis.end(), "s3"); + if (api_s3_priority_rawpos != apis.end()) { + api_priority_s3 = apis.size() - std::distance(apis.begin(), api_s3_priority_rawpos); + } + if (api_s3website_priority_rawpos != apis.end()) { + api_priority_s3website = apis.size() - std::distance(apis.begin(), api_s3website_priority_rawpos); + } + ldout(s->cct, 10) << "rgw api priority: s3=" << api_priority_s3 << " s3website=" << api_priority_s3website << dendl; + bool s3website_enabled = api_priority_s3website >= 0; + + if (info.host.size()) { + ssize_t pos; + if (info.host.find('[') == 0) { + pos = info.host.find(']'); + if (pos >=1) { + info.host = info.host.substr(1, pos-1); + } + } else { + pos = info.host.find(':'); + if (pos >= 0) { + info.host = info.host.substr(0, pos); + } + } + 
ldout(s->cct, 10) << "host=" << info.host << dendl; + string domain; + string subdomain; + bool in_hosted_domain_s3website = false; + bool in_hosted_domain = rgw_find_host_in_domains(info.host, &domain, &subdomain, hostnames_set); + + string s3website_domain; + string s3website_subdomain; + + if (s3website_enabled) { + in_hosted_domain_s3website = rgw_find_host_in_domains(info.host, &s3website_domain, &s3website_subdomain, hostnames_s3website_set); + if (in_hosted_domain_s3website) { + in_hosted_domain = true; // TODO: should hostnames be a strict superset of hostnames_s3website? + domain = s3website_domain; + subdomain = s3website_subdomain; + } + } + + ldout(s->cct, 20) + << "subdomain=" << subdomain + << " domain=" << domain + << " in_hosted_domain=" << in_hosted_domain + << " in_hosted_domain_s3website=" << in_hosted_domain_s3website + << dendl; + + if (g_conf()->rgw_resolve_cname + && !in_hosted_domain + && !in_hosted_domain_s3website) { + string cname; + bool found; + int r = rgw_resolver->resolve_cname(info.host, cname, &found); + if (r < 0) { + ldout(s->cct, 0) + << "WARNING: rgw_resolver->resolve_cname() returned r=" << r + << dendl; + } + + if (found) { + ldout(s->cct, 5) << "resolved host cname " << info.host << " -> " + << cname << dendl; + in_hosted_domain = + rgw_find_host_in_domains(cname, &domain, &subdomain, hostnames_set); + + if (s3website_enabled + && !in_hosted_domain_s3website) { + in_hosted_domain_s3website = + rgw_find_host_in_domains(cname, &s3website_domain, + &s3website_subdomain, + hostnames_s3website_set); + if (in_hosted_domain_s3website) { + in_hosted_domain = true; // TODO: should hostnames be a + // strict superset of hostnames_s3website? 
+ domain = s3website_domain; + subdomain = s3website_subdomain; + } + } + + ldout(s->cct, 20) + << "subdomain=" << subdomain + << " domain=" << domain + << " in_hosted_domain=" << in_hosted_domain + << " in_hosted_domain_s3website=" << in_hosted_domain_s3website + << dendl; + } + } + + // Handle A/CNAME records that point to the RGW storage, but do match the + // CNAME test above, per issue http://tracker.ceph.com/issues/15975 + // If BOTH domain & subdomain variables are empty, then none of the above + // cases matched anything, and we should fall back to using the Host header + // directly as the bucket name. + // As additional checks: + // - if the Host header is an IP, we're using path-style access without DNS + // - Also check that the Host header is a valid bucket name before using it. + // - Don't enable virtual hosting if no hostnames are configured + if (subdomain.empty() + && (domain.empty() || domain != info.host) + && !looks_like_ip_address(info.host.c_str()) + && RGWHandler_REST::validate_bucket_name(info.host) == 0 + && !(hostnames_set.empty() && hostnames_s3website_set.empty())) { + subdomain.append(info.host); + in_hosted_domain = 1; + } + + if (s3website_enabled && api_priority_s3website > api_priority_s3) { + in_hosted_domain_s3website = 1; + } + + if (in_hosted_domain_s3website) { + s->prot_flags |= RGW_REST_WEBSITE; + } + + + if (in_hosted_domain && !subdomain.empty()) { + string encoded_bucket = "/"; + encoded_bucket.append(subdomain); + if (s->info.request_uri[0] != '/') + encoded_bucket.append("/"); + encoded_bucket.append(s->info.request_uri); + s->info.request_uri = encoded_bucket; + } + + if (!domain.empty()) { + s->info.domain = domain; + } + + ldout(s->cct, 20) + << "final domain/bucket" + << " subdomain=" << subdomain + << " domain=" << domain + << " in_hosted_domain=" << in_hosted_domain + << " in_hosted_domain_s3website=" << in_hosted_domain_s3website + << " s->info.domain=" << s->info.domain + << " s->info.request_uri=" << 
s->info.request_uri + << dendl; + } + + if (s->info.domain.empty()) { + s->info.domain = s->cct->_conf->rgw_dns_name; + } + + s->decoded_uri = url_decode(s->info.request_uri); + /* Validate for being free of the '\0' buried in the middle of the string. */ + if (std::strlen(s->decoded_uri.c_str()) != s->decoded_uri.length()) { + return -ERR_ZERO_IN_URL; + } + + /* FastCGI specification, section 6.3 + * http://www.fastcgi.com/devkit/doc/fcgi-spec.html#S6.3 + * === + * The Authorizer application receives HTTP request information from the Web + * server on the FCGI_PARAMS stream, in the same format as a Responder. The + * Web server does not send CONTENT_LENGTH, PATH_INFO, PATH_TRANSLATED, and + * SCRIPT_NAME headers. + * === + * Ergo if we are in Authorizer role, we MUST look at HTTP_CONTENT_LENGTH + * instead of CONTENT_LENGTH for the Content-Length. + * + * There is one slight wrinkle in this, and that's older versions of + * nginx/lighttpd/apache setting BOTH headers. As a result, we have to check + * both headers and can't always simply pick A or B. + */ + const char* content_length = info.env->get("CONTENT_LENGTH"); + const char* http_content_length = info.env->get("HTTP_CONTENT_LENGTH"); + if (!http_content_length != !content_length) { + /* Easy case: one or the other is missing */ + s->length = (content_length ? 
content_length : http_content_length); + } else if (s->cct->_conf->rgw_content_length_compat && + content_length && http_content_length) { + /* Hard case: Both are set, we have to disambiguate */ + int64_t content_length_i, http_content_length_i; + + content_length_i = parse_content_length(content_length); + http_content_length_i = parse_content_length(http_content_length); + + // Now check them: + if (http_content_length_i < 0) { + // HTTP_CONTENT_LENGTH is invalid, ignore it + } else if (content_length_i < 0) { + // CONTENT_LENGTH is invalid, and HTTP_CONTENT_LENGTH is valid + // Swap entries + content_length = http_content_length; + } else { + // both CONTENT_LENGTH and HTTP_CONTENT_LENGTH are valid + // Let's pick the larger size + if (content_length_i < http_content_length_i) { + // prefer the larger value + content_length = http_content_length; + } + } + s->length = content_length; + // End of: else if (s->cct->_conf->rgw_content_length_compat && + // content_length && + // http_content_length) + } else { + /* no content length was defined */ + s->length = NULL; + } + + if (s->length) { + if (*s->length == '\0') { + s->content_length = 0; + } else { + string err; + s->content_length = strict_strtoll(s->length, 10, &err); + if (!err.empty()) { + ldout(s->cct, 10) << "bad content length, aborting" << dendl; + return -EINVAL; + } + } + } + + if (s->content_length < 0) { + ldout(s->cct, 10) << "negative content length, aborting" << dendl; + return -EINVAL; + } + + map::iterator giter; + for (giter = generic_attrs_map.begin(); giter != generic_attrs_map.end(); + ++giter) { + const char *env = info.env->get(giter->first.c_str()); + if (env) { + s->generic_attrs[giter->second] = env; + } + } + + if (g_conf()->rgw_print_continue) { + const char *expect = info.env->get("HTTP_EXPECT"); + s->expect_cont = (expect && !strcasecmp(expect, "100-continue")); + } + s->op = op_from_method(info.method); + + info.init_meta_info(&s->has_bad_meta); + + return 0; +} + 
+RGWHandler_REST* RGWREST::get_handler( + RGWRados * const store, + struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix, + RGWRestfulIO* const rio, + RGWRESTMgr** const pmgr, + int* const init_error +) { + *init_error = preprocess(s, rio); + if (*init_error < 0) { + return nullptr; + } + + RGWRESTMgr *m = mgr.get_manager(s, frontend_prefix, s->decoded_uri, + &s->relative_uri); + if (! m) { + *init_error = -ERR_METHOD_NOT_ALLOWED; + return nullptr; + } + + if (pmgr) { + *pmgr = m; + } + + RGWHandler_REST* handler = m->get_handler(s, auth_registry, frontend_prefix); + if (! handler) { + *init_error = -ERR_METHOD_NOT_ALLOWED; + return NULL; + } + *init_error = handler->init(store, s, rio); + if (*init_error < 0) { + m->put_handler(handler); + return nullptr; + } + + return handler; +} /* get stream handler */ diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h new file mode 100644 index 00000000..f755af31 --- /dev/null +++ b/src/rgw/rgw_rest.h @@ -0,0 +1,816 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_H +#define CEPH_RGW_REST_H + +#define TIME_BUF_SIZE 128 + +#include +#include +#include "common/sstring.hh" +#include "common/ceph_json.h" +#include "include/ceph_assert.h" /* needed because of common/ceph_json.h */ +#include "rgw_op.h" +#include "rgw_formats.h" +#include "rgw_client_io.h" + +extern std::map rgw_to_http_attrs; + +extern void rgw_rest_init(CephContext *cct, RGWRados *store, const RGWZoneGroup& zone_group); + +extern void rgw_flush_formatter_and_reset(struct req_state *s, + ceph::Formatter *formatter); + +extern void rgw_flush_formatter(struct req_state *s, + ceph::Formatter *formatter); + +std::tuple rgw_rest_read_all_input(struct req_state *s, + const uint64_t max_len, + const bool allow_chunked=true); + +static inline boost::string_ref rgw_sanitized_hdrval(ceph::buffer::list& raw) +{ + /* std::string and 
thus boost::string_ref ARE OBLIGED to carry multiple + * 0x00 and count them to the length of a string. We need to take that + * into consideration and sanitize the size of a ceph::buffer::list used + * to store metadata values (x-amz-meta-*, X-Container-Meta-*, etags). + * Otherwise we might send 0x00 to clients. */ + const char* const data = raw.c_str(); + size_t len = raw.length(); + + if (len && data[len - 1] == '\0') { + /* That's the case - the null byte has been included at the last position + * of the bufferlist. We need to restore the proper string length we'll + * pass to string_ref. */ + len--; + } + + return boost::string_ref(data, len); +} + +template +int rgw_rest_get_json_input(CephContext *cct, req_state *s, T& out, + uint64_t max_len, bool *empty) +{ + if (empty) + *empty = false; + + int rv = 0; + bufferlist data; + std::tie(rv, data) = rgw_rest_read_all_input(s, max_len); + if (rv < 0) { + return rv; + } + + if (!data.length()) { + if (empty) { + *empty = true; + } + + return -EINVAL; + } + + JSONParser parser; + + if (!parser.parse(data.c_str(), data.length())) { + return -EINVAL; + } + + try { + decode_json_obj(out, &parser); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + return 0; +} + +template +std::tuple rgw_rest_get_json_input_keep_data(CephContext *cct, req_state *s, T& out, uint64_t max_len) +{ + int rv = 0; + bufferlist data; + std::tie(rv, data) = rgw_rest_read_all_input(s, max_len); + if (rv < 0) { + return std::make_tuple(rv, std::move(data)); + } + + if (!data.length()) { + return std::make_tuple(-EINVAL, std::move(data)); + } + + JSONParser parser; + + if (!parser.parse(data.c_str(), data.length())) { + return std::make_tuple(-EINVAL, std::move(data)); + } + + try { + decode_json_obj(out, &parser); + } catch (JSONDecoder::err& e) { + return std::make_tuple(-EINVAL, std::move(data)); + } + + return std::make_tuple(0, std::move(data)); +} + +class RESTArgs { +public: + static int get_string(struct req_state *s, const 
string& name, + const string& def_val, string *val, + bool *existed = NULL); + static int get_uint64(struct req_state *s, const string& name, + uint64_t def_val, uint64_t *val, bool *existed = NULL); + static int get_int64(struct req_state *s, const string& name, + int64_t def_val, int64_t *val, bool *existed = NULL); + static int get_uint32(struct req_state *s, const string& name, + uint32_t def_val, uint32_t *val, bool *existed = NULL); + static int get_int32(struct req_state *s, const string& name, + int32_t def_val, int32_t *val, bool *existed = NULL); + static int get_time(struct req_state *s, const string& name, + const utime_t& def_val, utime_t *val, + bool *existed = NULL); + static int get_epoch(struct req_state *s, const string& name, + uint64_t def_val, uint64_t *epoch, + bool *existed = NULL); + static int get_bool(struct req_state *s, const string& name, bool def_val, + bool *val, bool *existed = NULL); +}; + +class RGWRESTFlusher : public RGWFormatterFlusher { + struct req_state *s; + RGWOp *op; +protected: + void do_flush() override; + void do_start(int ret) override; +public: + RGWRESTFlusher(struct req_state *_s, RGWOp *_op) : + RGWFormatterFlusher(_s->formatter), s(_s), op(_op) {} + RGWRESTFlusher() : RGWFormatterFlusher(NULL), s(NULL), op(NULL) {} + + void init(struct req_state *_s, RGWOp *_op) { + s = _s; + op = _op; + set_formatter(s->formatter); + } +}; + +class RGWGetObj_ObjStore : public RGWGetObj +{ +protected: + bool sent_header; +public: + RGWGetObj_ObjStore() : sent_header(false) {} + + void init(RGWRados *store, struct req_state *s, RGWHandler *h) override { + RGWGetObj::init(store, s, h); + sent_header = false; + } + + int get_params() override; +}; + +class RGWGetObjTags_ObjStore : public RGWGetObjTags { +public: + RGWGetObjTags_ObjStore() {}; + ~RGWGetObjTags_ObjStore() {}; +}; + +class RGWPutObjTags_ObjStore: public RGWPutObjTags { +public: + RGWPutObjTags_ObjStore() {}; + ~RGWPutObjTags_ObjStore() {}; +}; + +class 
RGWListBuckets_ObjStore : public RGWListBuckets { +public: + RGWListBuckets_ObjStore() {} + ~RGWListBuckets_ObjStore() override {} +}; + +class RGWGetUsage_ObjStore : public RGWGetUsage { +public: + RGWGetUsage_ObjStore() {} + ~RGWGetUsage_ObjStore() override {} +}; + +class RGWListBucket_ObjStore : public RGWListBucket { +public: + RGWListBucket_ObjStore() {} + ~RGWListBucket_ObjStore() override {} +}; + +class RGWStatAccount_ObjStore : public RGWStatAccount { +public: + RGWStatAccount_ObjStore() {} + ~RGWStatAccount_ObjStore() override {} +}; + +class RGWStatBucket_ObjStore : public RGWStatBucket { +public: + RGWStatBucket_ObjStore() {} + ~RGWStatBucket_ObjStore() override {} +}; + +class RGWCreateBucket_ObjStore : public RGWCreateBucket { +public: + RGWCreateBucket_ObjStore() {} + ~RGWCreateBucket_ObjStore() override {} +}; + +class RGWDeleteBucket_ObjStore : public RGWDeleteBucket { +public: + RGWDeleteBucket_ObjStore() {} + ~RGWDeleteBucket_ObjStore() override {} +}; + +class RGWPutObj_ObjStore : public RGWPutObj +{ +public: + RGWPutObj_ObjStore() {} + ~RGWPutObj_ObjStore() override {} + + int verify_params() override; + int get_params() override; + int get_data(bufferlist& bl) override; +}; + +class RGWPostObj_ObjStore : public RGWPostObj +{ + std::string boundary; + +public: + struct post_part_field { + std::string val; + std::map params; + }; + + struct post_form_part { + std::string name; + std::map fields; + ceph::bufferlist data; + }; + +protected: + using parts_collection_t = \ + std::map; + + std::string err_msg; + ceph::bufferlist in_data; + + int read_with_boundary(ceph::bufferlist& bl, + uint64_t max, + bool check_eol, + bool& reached_boundary, + bool& done); + + int read_line(ceph::bufferlist& bl, + uint64_t max, + bool& reached_boundary, + bool& done); + + int read_data(ceph::bufferlist& bl, + uint64_t max, + bool& reached_boundary, + bool& done); + + int read_form_part_header(struct post_form_part *part, bool& done); + + int get_params() 
override; + + static int parse_part_field(const std::string& line, + std::string& field_name, /* out */ + post_part_field& field); /* out */ + + static void parse_boundary_params(const std::string& params_str, + std::string& first, + std::map& params); + + static bool part_str(parts_collection_t& parts, + const std::string& name, + std::string *val); + + static std::string get_part_str(parts_collection_t& parts, + const std::string& name, + const std::string& def_val = std::string()); + + static bool part_bl(parts_collection_t& parts, + const std::string& name, + ceph::bufferlist *pbl); + +public: + RGWPostObj_ObjStore() {} + ~RGWPostObj_ObjStore() override {} + + int verify_params() override; +}; + + +class RGWPutMetadataAccount_ObjStore : public RGWPutMetadataAccount +{ +public: + RGWPutMetadataAccount_ObjStore() {} + ~RGWPutMetadataAccount_ObjStore() override {} +}; + +class RGWPutMetadataBucket_ObjStore : public RGWPutMetadataBucket +{ +public: + RGWPutMetadataBucket_ObjStore() {} + ~RGWPutMetadataBucket_ObjStore() override {} +}; + +class RGWPutMetadataObject_ObjStore : public RGWPutMetadataObject +{ +public: + RGWPutMetadataObject_ObjStore() {} + ~RGWPutMetadataObject_ObjStore() override {} +}; + +class RGWDeleteObj_ObjStore : public RGWDeleteObj { +public: + RGWDeleteObj_ObjStore() {} + ~RGWDeleteObj_ObjStore() override {} +}; + +class RGWGetCrossDomainPolicy_ObjStore : public RGWGetCrossDomainPolicy { +public: + RGWGetCrossDomainPolicy_ObjStore() = default; + ~RGWGetCrossDomainPolicy_ObjStore() override = default; +}; + +class RGWGetHealthCheck_ObjStore : public RGWGetHealthCheck { +public: + RGWGetHealthCheck_ObjStore() = default; + ~RGWGetHealthCheck_ObjStore() override = default; +}; + +class RGWCopyObj_ObjStore : public RGWCopyObj { +public: + RGWCopyObj_ObjStore() {} + ~RGWCopyObj_ObjStore() override {} +}; + +class RGWGetACLs_ObjStore : public RGWGetACLs { +public: + RGWGetACLs_ObjStore() {} + ~RGWGetACLs_ObjStore() override {} +}; + +class 
RGWPutACLs_ObjStore : public RGWPutACLs { +public: + RGWPutACLs_ObjStore() {} + ~RGWPutACLs_ObjStore() override {} + + int get_params() override; +}; + +class RGWGetLC_ObjStore : public RGWGetLC { +public: + RGWGetLC_ObjStore() {} + ~RGWGetLC_ObjStore() override {} +}; + +class RGWPutLC_ObjStore : public RGWPutLC { +public: + RGWPutLC_ObjStore() {} + ~RGWPutLC_ObjStore() override {} + + int get_params() override; +}; + +class RGWDeleteLC_ObjStore : public RGWDeleteLC { +public: + RGWDeleteLC_ObjStore() {} + ~RGWDeleteLC_ObjStore() override {} + +}; + +class RGWGetCORS_ObjStore : public RGWGetCORS { +public: + RGWGetCORS_ObjStore() {} + ~RGWGetCORS_ObjStore() override {} +}; + +class RGWPutCORS_ObjStore : public RGWPutCORS { +public: + RGWPutCORS_ObjStore() {} + ~RGWPutCORS_ObjStore() override {} +}; + +class RGWDeleteCORS_ObjStore : public RGWDeleteCORS { +public: + RGWDeleteCORS_ObjStore() {} + ~RGWDeleteCORS_ObjStore() override {} +}; + +class RGWOptionsCORS_ObjStore : public RGWOptionsCORS { +public: + RGWOptionsCORS_ObjStore() {} + ~RGWOptionsCORS_ObjStore() override {} +}; + +class RGWInitMultipart_ObjStore : public RGWInitMultipart { +public: + RGWInitMultipart_ObjStore() {} + ~RGWInitMultipart_ObjStore() override {} +}; + +class RGWCompleteMultipart_ObjStore : public RGWCompleteMultipart { +public: + RGWCompleteMultipart_ObjStore() {} + ~RGWCompleteMultipart_ObjStore() override {} + + int get_params() override; +}; + +class RGWAbortMultipart_ObjStore : public RGWAbortMultipart { +public: + RGWAbortMultipart_ObjStore() {} + ~RGWAbortMultipart_ObjStore() override {} +}; + +class RGWListMultipart_ObjStore : public RGWListMultipart { +public: + RGWListMultipart_ObjStore() {} + ~RGWListMultipart_ObjStore() override {} + + int get_params() override; +}; + +class RGWListBucketMultiparts_ObjStore : public RGWListBucketMultiparts { +public: + RGWListBucketMultiparts_ObjStore() {} + ~RGWListBucketMultiparts_ObjStore() override {} + + int get_params() override; +}; + 
+class RGWBulkDelete_ObjStore : public RGWBulkDelete { +public: + RGWBulkDelete_ObjStore() {} + ~RGWBulkDelete_ObjStore() override {} +}; + +class RGWBulkUploadOp_ObjStore : public RGWBulkUploadOp { +public: + RGWBulkUploadOp_ObjStore() = default; + ~RGWBulkUploadOp_ObjStore() = default; +}; + +class RGWDeleteMultiObj_ObjStore : public RGWDeleteMultiObj { +public: + RGWDeleteMultiObj_ObjStore() {} + ~RGWDeleteMultiObj_ObjStore() override {} + + int get_params() override; +}; + +class RGWInfo_ObjStore : public RGWInfo { +public: + RGWInfo_ObjStore() = default; + ~RGWInfo_ObjStore() override = default; +}; + +class RGWPutBucketObjectLock_ObjStore : public RGWPutBucketObjectLock { +public: + RGWPutBucketObjectLock_ObjStore() = default; + ~RGWPutBucketObjectLock_ObjStore() = default; + int get_params() override; +}; + +class RGWGetBucketObjectLock_ObjStore : public RGWGetBucketObjectLock { +public: + RGWGetBucketObjectLock_ObjStore() = default; + ~RGWGetBucketObjectLock_ObjStore() override = default; +}; + +class RGWPutObjRetention_ObjStore : public RGWPutObjRetention { +public: + RGWPutObjRetention_ObjStore() = default; + ~RGWPutObjRetention_ObjStore() override = default; +}; + +class RGWGetObjRetention_ObjStore : public RGWGetObjRetention { +public: + RGWGetObjRetention_ObjStore() = default; + ~RGWGetObjRetention_ObjStore() = default; +}; + +class RGWPutObjLegalHold_ObjStore : public RGWPutObjLegalHold { +public: + RGWPutObjLegalHold_ObjStore() = default; + ~RGWPutObjLegalHold_ObjStore() override = default; + int get_params() override; +}; + +class RGWGetObjLegalHold_ObjStore : public RGWGetObjLegalHold { +public: + RGWGetObjLegalHold_ObjStore() = default; + ~RGWGetObjLegalHold_ObjStore() = default; +}; + +class RGWRESTOp : public RGWOp { +protected: + int http_ret; + RGWRESTFlusher flusher; +public: + RGWRESTOp() : http_ret(0) {} + void init(RGWRados *store, struct req_state *s, + RGWHandler *dialect_handler) override { + RGWOp::init(store, s, dialect_handler); + 
flusher.init(s, this); + } + void send_response() override; + virtual int check_caps(RGWUserCaps& caps) + { return -EPERM; } /* should to be implemented! */ + int verify_permission() override; + dmc::client_id dmclock_client() override { return dmc::client_id::admin; } +}; + +class RGWHandler_REST : public RGWHandler { +protected: + + virtual bool is_obj_update_op() { return false; } + virtual RGWOp *op_get() { return NULL; } + virtual RGWOp *op_put() { return NULL; } + virtual RGWOp *op_delete() { return NULL; } + virtual RGWOp *op_head() { return NULL; } + virtual RGWOp *op_post() { return NULL; } + virtual RGWOp *op_copy() { return NULL; } + virtual RGWOp *op_options() { return NULL; } + +public: + static int allocate_formatter(struct req_state *s, int default_formatter, + bool configurable); + + static constexpr int MAX_BUCKET_NAME_LEN = 255; + static constexpr int MAX_OBJ_NAME_LEN = 1024; + + RGWHandler_REST() {} + ~RGWHandler_REST() override {} + + static int validate_bucket_name(const string& bucket); + static int validate_object_name(const string& object); + static int reallocate_formatter(struct req_state *s, int type); + + int init_permissions(RGWOp* op) override; + int read_permissions(RGWOp* op) override; + + virtual RGWOp* get_op(RGWRados* store); + virtual void put_op(RGWOp* op); +}; + +class RGWHandler_REST_SWIFT; +class RGWHandler_SWIFT_Auth; +class RGWHandler_REST_S3; + +namespace rgw { +namespace auth { + +class StrategyRegistry; + +} +} + +class RGWRESTMgr { + bool should_log; + +protected: + std::map resource_mgrs; + std::multimap resources_by_size; + RGWRESTMgr* default_mgr; + + virtual RGWRESTMgr* get_resource_mgr(struct req_state* s, + const std::string& uri, + std::string* out_uri); + + virtual RGWRESTMgr* get_resource_mgr_as_default(struct req_state* const s, + const std::string& uri, + std::string* our_uri) { + return this; + } + +public: + RGWRESTMgr() + : should_log(false), + default_mgr(nullptr) { + } + virtual ~RGWRESTMgr(); + + void 
register_resource(std::string resource, RGWRESTMgr* mgr); + void register_default_mgr(RGWRESTMgr* mgr); + + virtual RGWRESTMgr* get_manager(struct req_state* const s, + /* Prefix to be concatenated with @uri + * during the lookup. */ + const std::string& frontend_prefix, + const std::string& uri, + std::string* out_uri) final { + return get_resource_mgr(s, frontend_prefix + uri, out_uri); + } + + virtual RGWHandler_REST* get_handler( + struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix + ) { + return nullptr; + } + + virtual void put_handler(RGWHandler_REST* const handler) { + delete handler; + } + + void set_logging(bool _should_log) { + should_log = _should_log; + } + + bool get_logging() const { + return should_log; + } +}; + +class RGWLibIO; +class RGWRestfulIO; + +class RGWREST { + using x_header = basic_sstring; + boost::container::flat_set x_headers; + RGWRESTMgr mgr; + + static int preprocess(struct req_state *s, rgw::io::BasicClient* rio); +public: + RGWREST() {} + RGWHandler_REST *get_handler(RGWRados *store, + struct req_state *s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix, + RGWRestfulIO *rio, + RGWRESTMgr **pmgr, + int *init_error); +#if 0 + RGWHandler *get_handler(RGWRados *store, struct req_state *s, + RGWLibIO *io, RGWRESTMgr **pmgr, + int *init_error); +#endif + + void put_handler(RGWHandler_REST *handler) { + mgr.put_handler(handler); + } + + void register_resource(string resource, RGWRESTMgr *m, + bool register_empty = false) { + if (!register_empty && resource.empty()) + return; + + mgr.register_resource(resource, m); + } + + void register_default_mgr(RGWRESTMgr *m) { + mgr.register_default_mgr(m); + } + + void register_x_headers(const std::string& headers); + + bool log_x_headers(void) { + return (x_headers.size() > 0); + } + + bool log_x_header(const std::string& header) { + return (x_headers.find(header) != x_headers.end()); + } +}; 
+ +static constexpr int64_t NO_CONTENT_LENGTH = -1; +static constexpr int64_t CHUNKED_TRANSFER_ENCODING = -2; + +extern void dump_errno(int http_ret, string& out); +extern void dump_errno(const struct rgw_err &err, string& out); +extern void dump_errno(struct req_state *s); +extern void dump_errno(struct req_state *s, int http_ret); +extern void end_header(struct req_state *s, + RGWOp* op = nullptr, + const char *content_type = nullptr, + const int64_t proposed_content_length = + NO_CONTENT_LENGTH, + bool force_content_type = false, + bool force_no_error = false); +extern void dump_start(struct req_state *s); +extern void list_all_buckets_start(struct req_state *s); +extern void dump_owner(struct req_state *s, const rgw_user& id, string& name, + const char *section = NULL); +extern void dump_header(struct req_state* s, + const boost::string_ref& name, + const boost::string_ref& val); +extern void dump_header(struct req_state* s, + const boost::string_ref& name, + ceph::buffer::list& bl); +extern void dump_header(struct req_state* s, + const boost::string_ref& name, + long long val); +extern void dump_header(struct req_state* s, + const boost::string_ref& name, + const utime_t& val); + +template +static inline void dump_header_prefixed(struct req_state* s, + const boost::string_ref& name_prefix, + const boost::string_ref& name, + Args&&... args) { + char full_name_buf[name_prefix.size() + name.size() + 1]; + const auto len = snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s", + static_cast(name_prefix.length()), + name_prefix.data(), + static_cast(name.length()), + name.data()); + boost::string_ref full_name(full_name_buf, len); + return dump_header(s, std::move(full_name), std::forward(args)...); +} + +template +static inline void dump_header_infixed(struct req_state* s, + const boost::string_ref& prefix, + const boost::string_ref& infix, + const boost::string_ref& sufix, + Args&&... 
args) { + char full_name_buf[prefix.size() + infix.size() + sufix.size() + 1]; + const auto len = snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s%.*s", + static_cast(prefix.length()), + prefix.data(), + static_cast(infix.length()), + infix.data(), + static_cast(sufix.length()), + sufix.data()); + boost::string_ref full_name(full_name_buf, len); + return dump_header(s, std::move(full_name), std::forward(args)...); +} + +template +static inline void dump_header_quoted(struct req_state* s, + const boost::string_ref& name, + const boost::string_ref& val) { + /* We need two extra bytes for quotes. */ + char qvalbuf[val.size() + 2 + 1]; + const auto len = snprintf(qvalbuf, sizeof(qvalbuf), "\"%.*s\"", + static_cast(val.length()), val.data()); + return dump_header(s, name, boost::string_ref(qvalbuf, len)); +} + +template +static inline void dump_header_if_nonempty(struct req_state* s, + const boost::string_ref& name, + const ValueT& value) { + if (name.length() > 0 && value.length() > 0) { + return dump_header(s, name, value); + } +} + +static inline std::string compute_domain_uri(const struct req_state *s) { + std::string uri = (!s->info.domain.empty()) ? s->info.domain : + [&s]() -> std::string { + RGWEnv const &env(*(s->info.env)); + std::string uri = + env.get("SERVER_PORT_SECURE") ? 
"https://" : "http://"; + if (env.exists("SERVER_NAME")) { + uri.append(env.get("SERVER_NAME", "")); + } else { + uri.append(env.get("HTTP_HOST", "")); + } + return uri; + }(); + return uri; +} + +extern void dump_content_length(struct req_state *s, uint64_t len); +extern int64_t parse_content_length(const char *content_length); +extern void dump_etag(struct req_state *s, + const boost::string_ref& etag, + bool quoted = false); +extern void dump_epoch_header(struct req_state *s, const char *name, real_time t); +extern void dump_time_header(struct req_state *s, const char *name, real_time t); +extern void dump_last_modified(struct req_state *s, real_time t); +extern void abort_early(struct req_state* s, RGWOp* op, int err, + RGWHandler* handler); +extern void dump_range(struct req_state* s, uint64_t ofs, uint64_t end, + uint64_t total_size); +extern void dump_continue(struct req_state *s); +extern void list_all_buckets_end(struct req_state *s); +extern void dump_time(struct req_state *s, const char *name, real_time *t); +extern std::string dump_time_to_str(const real_time& t); +extern void dump_bucket_from_state(struct req_state *s); +extern void dump_redirect(struct req_state *s, const string& redirect); +extern bool is_valid_url(const char *url); +extern void dump_access_control(struct req_state *s, const char *origin, + const char *meth, + const char *hdr, const char *exp_hdr, + uint32_t max_age); +extern void dump_access_control(req_state *s, RGWOp *op); + +extern int dump_body(struct req_state* s, const char* buf, size_t len); +extern int dump_body(struct req_state* s, /* const */ ceph::buffer::list& bl); +extern int dump_body(struct req_state* s, const std::string& str); +extern int recv_body(struct req_state* s, char* buf, size_t max); + +#endif /* CEPH_RGW_REST_H */ diff --git a/src/rgw/rgw_rest_admin.h b/src/rgw/rgw_rest_admin.h new file mode 100644 index 00000000..d23dd9d3 --- /dev/null +++ b/src/rgw/rgw_rest_admin.h @@ -0,0 +1,15 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_ADMIN_H +#define CEPH_RGW_REST_ADMIN_H + + +class RGWRESTMgr_Admin : public RGWRESTMgr { +public: + RGWRESTMgr_Admin() {} + ~RGWRESTMgr_Admin() override {} +}; + + +#endif diff --git a/src/rgw/rgw_rest_bucket.cc b/src/rgw/rgw_rest_bucket.cc new file mode 100644 index 00000000..857d0c9d --- /dev/null +++ b/src/rgw/rgw_rest_bucket.cc @@ -0,0 +1,350 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_op.h" +#include "rgw_bucket.h" +#include "rgw_rest_bucket.h" + +#include "include/str_list.h" + +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +class RGWOp_Bucket_Info : public RGWRESTOp { + +public: + RGWOp_Bucket_Info() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_READ); + } + + void execute() override; + + const char* name() const override { return "get_bucket_info"; } +}; + +void RGWOp_Bucket_Info::execute() +{ + RGWBucketAdminOpState op_state; + + bool fetch_stats; + + std::string bucket; + + string uid_str; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "stats", false, &fetch_stats); + + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_fetch_stats(fetch_stats); + + http_ret = RGWBucketAdminOp::info(store, op_state, flusher); +} + +class RGWOp_Get_Policy : public RGWRESTOp { + +public: + RGWOp_Get_Policy() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_READ); + } + + void execute() override; + + const char* name() const override { return "get_policy"; } +}; + +void RGWOp_Get_Policy::execute() +{ + RGWBucketAdminOpState op_state; + + std::string bucket; + std::string object; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + 
RESTArgs::get_string(s, "object", object, &object); + + op_state.set_bucket_name(bucket); + op_state.set_object(object); + + http_ret = RGWBucketAdminOp::get_policy(store, op_state, flusher); +} + +class RGWOp_Check_Bucket_Index : public RGWRESTOp { + +public: + RGWOp_Check_Bucket_Index() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "check_bucket_index"; } +}; + +void RGWOp_Check_Bucket_Index::execute() +{ + std::string bucket; + + bool fix_index; + bool check_objects; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "fix", false, &fix_index); + RESTArgs::get_bool(s, "check-objects", false, &check_objects); + + op_state.set_bucket_name(bucket); + op_state.set_fix_index(fix_index); + op_state.set_check_objects(check_objects); + + http_ret = RGWBucketAdminOp::check_index(store, op_state, flusher); +} + +class RGWOp_Bucket_Link : public RGWRESTOp { + +public: + RGWOp_Bucket_Link() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "link_bucket"; } +}; + +void RGWOp_Bucket_Link::execute() +{ + std::string uid_str; + std::string bucket; + std::string bucket_id; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id); + + rgw_user uid(uid_str); + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_bucket_id(bucket_id); + + http_ret = RGWBucketAdminOp::link(store, op_state); +} + +class RGWOp_Bucket_Unlink : public RGWRESTOp { + +public: + RGWOp_Bucket_Unlink() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + 
+ void execute() override; + + const char* name() const override { return "unlink_bucket"; } +}; + +void RGWOp_Bucket_Unlink::execute() +{ + std::string uid_str; + std::string bucket; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + + http_ret = RGWBucketAdminOp::unlink(store, op_state); +} + +class RGWOp_Bucket_Remove : public RGWRESTOp { + +public: + RGWOp_Bucket_Remove() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "remove_bucket"; } +}; + +void RGWOp_Bucket_Remove::execute() +{ + std::string bucket; + bool delete_children; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "purge-objects", false, &delete_children); + + op_state.set_bucket_name(bucket); + op_state.set_delete_children(delete_children); + + http_ret = RGWBucketAdminOp::remove_bucket(store, op_state); +} + +class RGWOp_Set_Bucket_Quota : public RGWRESTOp { + +public: + RGWOp_Set_Bucket_Quota() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "set_bucket_quota"; } +}; + +#define QUOTA_INPUT_MAX_LEN 1024 + +void RGWOp_Set_Bucket_Quota::execute() +{ + bool uid_arg_existed = false; + std::string uid_str; + RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed); + if (! uid_arg_existed) { + http_ret = -EINVAL; + return; + } + rgw_user uid(uid_str); + bool bucket_arg_existed = false; + std::string bucket; + RESTArgs::get_string(s, "bucket", bucket, &bucket, &bucket_arg_existed); + if (! 
bucket_arg_existed) { + http_ret = -EINVAL; + return; + } + + bool use_http_params; + + if (s->content_length > 0) { + use_http_params = false; + } else { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + use_http_params = (!encoding || strcmp(encoding, "chunked") != 0); + } + RGWQuotaInfo quota; + if (!use_http_params) { + bool empty; + http_ret = rgw_rest_get_json_input(store->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty); + if (http_ret < 0) { + if (!empty) + return; + /* was probably chunked input, but no content provided, configure via http params */ + use_http_params = true; + } + } + if (use_http_params) { + RGWBucketInfo bucket_info; + map attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + http_ret = store->get_bucket_info(obj_ctx, uid.tenant, bucket, bucket_info, NULL, &attrs); + if (http_ret < 0) { + return; + } + RGWQuotaInfo *old_quota = &bucket_info.quota; + int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size); + int64_t max_size_kb; + RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects); + RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb); + quota.max_size = max_size_kb * 1024; + RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled); + } + + RGWBucketAdminOpState op_state; + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_quota(quota); + + http_ret = RGWBucketAdminOp::set_quota(store, op_state); +} + +class RGWOp_Object_Remove: public RGWRESTOp { + +public: + RGWOp_Object_Remove() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "remove_object"; } +}; + +void RGWOp_Object_Remove::execute() +{ + std::string bucket; + std::string object; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "object", object, &object); + + 
op_state.set_bucket_name(bucket); + op_state.set_object(object); + + http_ret = RGWBucketAdminOp::remove_object(store, op_state); +} + +RGWOp *RGWHandler_Bucket::op_get() +{ + + if (s->info.args.sub_resource_exists("policy")) + return new RGWOp_Get_Policy; + + if (s->info.args.sub_resource_exists("index")) + return new RGWOp_Check_Bucket_Index; + + return new RGWOp_Bucket_Info; +} + +RGWOp *RGWHandler_Bucket::op_put() +{ + if (s->info.args.sub_resource_exists("quota")) + return new RGWOp_Set_Bucket_Quota; + return new RGWOp_Bucket_Link; +} + +RGWOp *RGWHandler_Bucket::op_post() +{ + return new RGWOp_Bucket_Unlink; +} + +RGWOp *RGWHandler_Bucket::op_delete() +{ + if (s->info.args.sub_resource_exists("object")) + return new RGWOp_Object_Remove; + + return new RGWOp_Bucket_Remove; +} + diff --git a/src/rgw/rgw_rest_bucket.h b/src/rgw/rgw_rest_bucket.h new file mode 100644 index 00000000..19bfd734 --- /dev/null +++ b/src/rgw/rgw_rest_bucket.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_BUCKET_H +#define CEPH_RGW_REST_BUCKET_H + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" + + +class RGWHandler_Bucket : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_put() override; + RGWOp *op_post() override; + RGWOp *op_delete() override; +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Bucket() override = default; + + int read_permissions(RGWOp*) override { + return 0; + } +}; + +class RGWRESTMgr_Bucket : public RGWRESTMgr { +public: + RGWRESTMgr_Bucket() = default; + ~RGWRESTMgr_Bucket() override = default; + + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Bucket(auth_registry); + } +}; + +#endif diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc new file mode 100644 index 00000000..dde6e29b --- 
/dev/null +++ b/src/rgw/rgw_rest_client.cc @@ -0,0 +1,999 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_rest_client.h" +#include "rgw_auth_s3.h" +#include "rgw_http_errors.h" +#include "rgw_rados.h" + +#include "common/ceph_crypto_cms.h" +#include "common/armor.h" +#include "common/strtol.h" +#include "include/str_list.h" +#include "rgw_crypt_sanitize.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +int RGWHTTPSimpleRequest::get_status() +{ + int retcode = get_req_retcode(); + if (retcode < 0) { + return retcode; + } + return status; +} + +int RGWHTTPSimpleRequest::handle_header(const string& name, const string& val) +{ + if (name == "CONTENT_LENGTH") { + string err; + long len = strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + ldout(cct, 0) << "ERROR: failed converting content length (" << val << ") to int " << dendl; + return -EINVAL; + } + + max_response = len; + } + + return 0; +} + +int RGWHTTPSimpleRequest::receive_header(void *ptr, size_t len) +{ + unique_lock guard(out_headers_lock); + + char line[len + 1]; + + char *s = (char *)ptr, *end = (char *)ptr + len; + char *p = line; + ldout(cct, 10) << "receive_http_header" << dendl; + + while (s != end) { + if (*s == '\r') { + s++; + continue; + } + if (*s == '\n') { + *p = '\0'; + ldout(cct, 10) << "received header:" << line << dendl; + // TODO: fill whatever data required here + char *l = line; + char *tok = strsep(&l, " \t:"); + if (tok && l) { + while (*l == ' ') + l++; + + if (strcmp(tok, "HTTP") == 0 || strncmp(tok, "HTTP/", 5) == 0) { + http_status = atoi(l); + if (http_status == 100) /* 100-continue response */ + continue; + status = rgw_http_error_to_errno(http_status); + } else { + /* convert header field name to upper case */ + char *src = tok; + char buf[len + 1]; + size_t i; + for (i = 0; i < len && *src; ++i, ++src) { + switch (*src) { + case '-': + buf[i] 
= '_'; + break; + default: + buf[i] = toupper(*src); + } + } + buf[i] = '\0'; + out_headers[buf] = l; + int r = handle_header(buf, l); + if (r < 0) + return r; + } + } + } + if (s != end) + *p++ = *s++; + } + return 0; +} + +static void get_new_date_str(string& date_str) +{ + date_str = rgw_to_asctime(ceph_clock_now()); +} + +static void get_gmt_date_str(string& date_str) +{ + auto now_time = ceph::real_clock::now(); + time_t rawtime = ceph::real_clock::to_time_t(now_time); + + char buffer[80]; + + struct tm timeInfo; + gmtime_r(&rawtime, &timeInfo); + strftime(buffer, sizeof(buffer), "%a, %d %b %Y %H:%M:%S %z", &timeInfo); + + date_str = buffer; +} + +int RGWRESTSimpleRequest::execute(RGWAccessKey& key, const char *_method, const char *resource) +{ + method = _method; + string new_url = url; + string new_resource = resource; + + if (new_url[new_url.size() - 1] == '/' && resource[0] == '/') { + new_url = new_url.substr(0, new_url.size() - 1); + } else if (resource[0] != '/') { + new_resource = "/"; + new_resource.append(resource); + } + new_url.append(new_resource); + url = new_url; + + string date_str; + get_new_date_str(date_str); + headers.push_back(pair("HTTP_DATE", date_str)); + + string canonical_header; + meta_map_t meta_map; + map sub_resources; + + rgw_create_s3_canonical_header(method.c_str(), NULL, NULL, date_str.c_str(), + meta_map, meta_map, url.c_str(), sub_resources, + canonical_header); + + string digest; + try { + digest = rgw::auth::s3::get_v2_signature(cct, key.key, canonical_header); + } catch (int ret) { + return ret; + } + + string auth_hdr = "AWS " + key.id + ":" + digest; + + ldout(cct, 15) << "generated auth header: " << auth_hdr << dendl; + + headers.push_back(pair("AUTHORIZATION", auth_hdr)); + int r = process(); + if (r < 0) + return r; + + return status; +} + +int RGWHTTPSimpleRequest::send_data(void *ptr, size_t len, bool* pause) +{ + if (!send_iter) + return 0; + + if (len > send_iter->get_remaining()) + len = 
send_iter->get_remaining(); + + send_iter->copy(len, (char *)ptr); + + return len; +} + +int RGWHTTPSimpleRequest::receive_data(void *ptr, size_t len, bool *pause) +{ + size_t cp_len, left_len; + + left_len = max_response > response.length() ? (max_response - response.length()) : 0; + if (left_len == 0) + return 0; /* don't read extra data */ + + cp_len = (len > left_len) ? left_len : len; + bufferptr p((char *)ptr, cp_len); + + response.append(p); + + return 0; +} + +static void append_param(string& dest, const string& name, const string& val) +{ + if (dest.empty()) { + dest.append("?"); + } else { + dest.append("&"); + } + string url_name; + url_encode(name, url_name); + dest.append(url_name); + + if (!val.empty()) { + string url_val; + url_encode(val, url_val); + dest.append("="); + dest.append(url_val); + } +} + +static void do_get_params_str(const param_vec_t& params, map& extra_args, string& dest) +{ + map::iterator miter; + for (miter = extra_args.begin(); miter != extra_args.end(); ++miter) { + append_param(dest, miter->first, miter->second); + } + for (auto iter = params.begin(); iter != params.end(); ++iter) { + append_param(dest, iter->first, iter->second); + } +} + +void RGWHTTPSimpleRequest::get_params_str(map& extra_args, string& dest) +{ + do_get_params_str(params, extra_args, dest); +} + +void RGWHTTPSimpleRequest::get_out_headers(map *pheaders) +{ + unique_lock guard(out_headers_lock); + pheaders->swap(out_headers); + out_headers.clear(); +} + +static int sign_request(CephContext *cct, RGWAccessKey& key, RGWEnv& env, req_info& info) +{ + /* don't sign if no key is provided */ + if (key.key.empty()) { + return 0; + } + + if (cct->_conf->subsys.should_gather()) { + for (const auto& i: env.get_map()) { + ldout(cct, 20) << "> " << i.first << " -> " << rgw::crypt_sanitize::x_meta_map{i.first, i.second} << dendl; + } + } + + string canonical_header; + if (!rgw_create_s3_canonical_header(info, NULL, canonical_header, false)) { + ldout(cct, 0) << "failed 
to create canonical s3 header" << dendl; + return -EINVAL; + } + + ldout(cct, 10) << "generated canonical header: " << canonical_header << dendl; + + string digest; + try { + digest = rgw::auth::s3::get_v2_signature(cct, key.key, canonical_header); + } catch (int ret) { + return ret; + } + + string auth_hdr = "AWS " + key.id + ":" + digest; + ldout(cct, 15) << "generated auth header: " << auth_hdr << dendl; + + env.set("AUTHORIZATION", auth_hdr); + + return 0; +} + +int RGWRESTSimpleRequest::forward_request(RGWAccessKey& key, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl) +{ + + string date_str; + get_new_date_str(date_str); + + RGWEnv new_env; + req_info new_info(cct, &new_env); + new_info.rebuild_from(info); + string bucket_encode; + string request_uri_encode; + size_t pos = new_info.request_uri.substr(1, new_info.request_uri.size() - 1).find("/"); + string bucket = new_info.request_uri.substr(1, pos); + url_encode(bucket, bucket_encode); + if (std::string::npos != pos) + request_uri_encode = string("/") + bucket_encode + new_info.request_uri.substr(pos + 1); + else + request_uri_encode = string("/") + bucket_encode; + new_info.request_uri = request_uri_encode; + new_env.set("HTTP_DATE", date_str.c_str()); + + int ret = sign_request(cct, key, new_env, new_info); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to sign request" << dendl; + return ret; + } + + for (const auto& kv: new_env.get_map()) { + headers.emplace_back(kv); + } + + meta_map_t& meta_map = new_info.x_meta_map; + for (const auto& kv: meta_map) { + headers.emplace_back(kv); + } + + string params_str; + get_params_str(info.args.get_params(), params_str); + + string new_url = url; + string& resource = new_info.request_uri; + string new_resource = resource; + if (new_url[new_url.size() - 1] == '/' && resource[0] == '/') { + new_url = new_url.substr(0, new_url.size() - 1); + } else if (resource[0] != '/') { + new_resource = "/"; + new_resource.append(resource); + } + 
new_url.append(new_resource + params_str); + + bufferlist::iterator bliter; + + if (inbl) { + bliter = inbl->begin(); + send_iter = &bliter; + + set_send_length(inbl->length()); + } + + method = new_info.method; + url = new_url; + + int r = process(); + if (r < 0){ + if (r == -EINVAL){ + // curl_easy has errored, generally means the service is not available + r = -ERR_SERVICE_UNAVAILABLE; + } + return r; + } + + response.append((char)0); /* NULL terminate response */ + + if (outbl) { + outbl->claim(response); + } + + return status; +} + +class RGWRESTStreamOutCB : public RGWGetDataCB { + RGWRESTStreamS3PutObj *req; +public: + explicit RGWRESTStreamOutCB(RGWRESTStreamS3PutObj *_req) : req(_req) {} + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override; /* callback for object iteration when sending data */ +}; + +int RGWRESTStreamOutCB::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + dout(20) << "RGWRESTStreamOutCB::handle_data bl.length()=" << bl.length() << " bl_ofs=" << bl_ofs << " bl_len=" << bl_len << dendl; + if (!bl_ofs && bl_len == bl.length()) { + req->add_send_data(bl); + return 0; + } + + bufferptr bp(bl.c_str() + bl_ofs, bl_len); + bufferlist new_bl; + new_bl.push_back(bp); + + req->add_send_data(new_bl); + return 0; +} + +RGWRESTStreamS3PutObj::~RGWRESTStreamS3PutObj() +{ + delete out_cb; +} + +static void grants_by_type_add_one_grant(map& grants_by_type, int perm, ACLGrant& grant) +{ + string& s = grants_by_type[perm]; + + if (!s.empty()) + s.append(", "); + + string id_type_str; + ACLGranteeType& type = grant.get_type(); + switch (type.get_type()) { + case ACL_TYPE_GROUP: + id_type_str = "uri"; + break; + case ACL_TYPE_EMAIL_USER: + id_type_str = "emailAddress"; + break; + default: + id_type_str = "id"; + } + rgw_user id; + grant.get_id(id); + s.append(id_type_str + "=\"" + id.to_str() + "\""); +} + +struct grant_type_to_header { + int type; + const char *header; +}; + +struct grant_type_to_header grants_headers_def[] = { + 
{ RGW_PERM_FULL_CONTROL, "x-amz-grant-full-control"}, + { RGW_PERM_READ, "x-amz-grant-read"}, + { RGW_PERM_WRITE, "x-amz-grant-write"}, + { RGW_PERM_READ_ACP, "x-amz-grant-read-acp"}, + { RGW_PERM_WRITE_ACP, "x-amz-grant-write-acp"}, + { 0, NULL} +}; + +static bool grants_by_type_check_perm(map& grants_by_type, int perm, ACLGrant& grant, int check_perm) +{ + if ((perm & check_perm) == check_perm) { + grants_by_type_add_one_grant(grants_by_type, check_perm, grant); + return true; + } + return false; +} + +static void grants_by_type_add_perm(map& grants_by_type, int perm, ACLGrant& grant) +{ + struct grant_type_to_header *t; + + for (t = grants_headers_def; t->header; t++) { + if (grants_by_type_check_perm(grants_by_type, perm, grant, t->type)) + return; + } +} + +static void add_grants_headers(map& grants, RGWEnv& env, meta_map_t& meta_map) +{ + struct grant_type_to_header *t; + + for (t = grants_headers_def; t->header; t++) { + map::iterator iter = grants.find(t->type); + if (iter != grants.end()) { + env.set(t->header,iter->second); + meta_map[t->header] = iter->second; + } + } +} + +void RGWRESTGenerateHTTPHeaders::init(const string& _method, const string& _url, const string& resource, const param_vec_t& params) +{ + string params_str; + map& args = new_info->args.get_params(); + do_get_params_str(params, args, params_str); + + /* merge params with extra args so that we can sign correctly */ + for (auto iter = params.begin(); iter != params.end(); ++iter) { + new_info->args.append(iter->first, iter->second); + } + + url = _url + resource + params_str; + + string date_str; + get_gmt_date_str(date_str); + + new_env->set("HTTP_DATE", date_str.c_str()); + + method = _method; + new_info->method = method.c_str(); + + new_info->script_uri = "/"; + new_info->script_uri.append(resource); + new_info->request_uri = new_info->script_uri; +} + +static bool is_x_amz(const string& s) { + return boost::algorithm::starts_with(s, "x-amz-"); +} + +void 
RGWRESTGenerateHTTPHeaders::set_extra_headers(const map& extra_headers) +{ + for (auto iter : extra_headers) { + const string& name = lowercase_dash_http_attr(iter.first); + new_env->set(name, iter.second.c_str()); + if (is_x_amz(name)) { + new_info->x_meta_map[name] = iter.second; + } + } +} + +int RGWRESTGenerateHTTPHeaders::set_obj_attrs(map& rgw_attrs) +{ + map new_attrs; + + /* merge send headers */ + for (auto& attr: rgw_attrs) { + bufferlist& bl = attr.second; + const string& name = attr.first; + string val = bl.c_str(); + if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) { + string header_name = RGW_AMZ_META_PREFIX; + header_name.append(name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1)); + new_attrs[header_name] = val; + } + } + + RGWAccessControlPolicy policy; + int ret = rgw_policy_from_attrset(cct, rgw_attrs, &policy); + if (ret < 0) { + ldout(cct, 0) << "ERROR: couldn't get policy ret=" << ret << dendl; + return ret; + } + + set_http_attrs(new_attrs); + set_policy(policy); + + return 0; +} + +static std::set keep_headers = { "content-type", + "content-encoding", + "content-disposition", + "content-language" }; + +void RGWRESTGenerateHTTPHeaders::set_http_attrs(const map& http_attrs) +{ + /* merge send headers */ + for (auto& attr: http_attrs) { + const string& val = attr.second; + const string& name = lowercase_dash_http_attr(attr.first); + if (is_x_amz(name)) { + new_env->set(name, val); + new_info->x_meta_map[name] = val; + } else { + new_env->set(attr.first, val); /* Ugh, using the uppercase representation, + as the signing function calls info.env.get("CONTENT_TYPE"). + This needs to be cleaned up! 
*/ + } + } +} + +void RGWRESTGenerateHTTPHeaders::set_policy(RGWAccessControlPolicy& policy) +{ + /* update acl headers */ + RGWAccessControlList& acl = policy.get_acl(); + multimap& grant_map = acl.get_grant_map(); + multimap::iterator giter; + map grants_by_type; + for (giter = grant_map.begin(); giter != grant_map.end(); ++giter) { + ACLGrant& grant = giter->second; + ACLPermission& perm = grant.get_permission(); + grants_by_type_add_perm(grants_by_type, perm.get_permissions(), grant); + } + add_grants_headers(grants_by_type, *new_env, new_info->x_meta_map); +} + +int RGWRESTGenerateHTTPHeaders::sign(RGWAccessKey& key) +{ + int ret = sign_request(cct, key, *new_env, *new_info); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to sign request" << dendl; + return ret; + } + + return 0; +} + +void RGWRESTStreamS3PutObj::send_init(rgw_obj& obj) +{ + string resource_str; + string resource; + string new_url = url; + + if (host_style == VirtualStyle) { + resource_str = obj.get_oid(); + new_url = obj.bucket.name + "." 
+ new_url; + } else { + resource_str = obj.bucket.name + "/" + obj.get_oid(); + } + + //do not encode slash in object key name + url_encode(resource_str, resource, false); + + if (new_url[new_url.size() - 1] != '/') + new_url.append("/"); + + method = "PUT"; + headers_gen.init(method, new_url, resource, params); + + url = headers_gen.get_url(); +} + +int RGWRESTStreamS3PutObj::send_ready(RGWAccessKey& key, map& rgw_attrs, bool send) +{ + headers_gen.set_obj_attrs(rgw_attrs); + + return send_ready(key, send); +} + +int RGWRESTStreamS3PutObj::send_ready(RGWAccessKey& key, const map& http_attrs, + RGWAccessControlPolicy& policy, bool send) +{ + headers_gen.set_http_attrs(http_attrs); + headers_gen.set_policy(policy); + + return send_ready(key, send); +} + +int RGWRESTStreamS3PutObj::send_ready(RGWAccessKey& key, bool send) +{ + headers_gen.sign(key); + + for (const auto& kv: new_env.get_map()) { + headers.emplace_back(kv); + } + + out_cb = new RGWRESTStreamOutCB(this); + + if (send) { + int r = RGWHTTP::send(this); + if (r < 0) + return r; + } + + return 0; +} + +int RGWRESTStreamS3PutObj::put_obj_init(RGWAccessKey& key, rgw_obj& obj, uint64_t obj_size, map& attrs, bool send) +{ + send_init(obj); + return send_ready(key, attrs, send); +} + +void set_str_from_headers(map& out_headers, const string& header_name, string& str) +{ + map::iterator iter = out_headers.find(header_name); + if (iter != out_headers.end()) { + str = iter->second; + } else { + str.clear(); + } +} + +static int parse_rgwx_mtime(CephContext *cct, const string& s, ceph::real_time *rt) +{ + string err; + vector vec; + + get_str_vec(s, ".", vec); + + if (vec.empty()) { + return -EINVAL; + } + + long secs = strict_strtol(vec[0].c_str(), 10, &err); + long nsecs = 0; + if (!err.empty()) { + ldout(cct, 0) << "ERROR: failed converting mtime (" << s << ") to real_time " << dendl; + return -EINVAL; + } + + if (vec.size() > 1) { + nsecs = strict_strtol(vec[1].c_str(), 10, &err); + if (!err.empty()) { + 
ldout(cct, 0) << "ERROR: failed converting mtime (" << s << ") to real_time " << dendl; + return -EINVAL; + } + } + + *rt = utime_t(secs, nsecs).to_real_time(); + + return 0; +} + +static void send_prepare_convert(const rgw_obj& obj, string *resource) +{ + string urlsafe_bucket, urlsafe_object; + url_encode(obj.bucket.get_key(':', 0), urlsafe_bucket); + url_encode(obj.key.name, urlsafe_object); + *resource = urlsafe_bucket + "/" + urlsafe_object; +} + +int RGWRESTStreamRWRequest::send_request(RGWAccessKey& key, map& extra_headers, const rgw_obj& obj, RGWHTTPManager *mgr) +{ + string resource; + send_prepare_convert(obj, &resource); + + return send_request(&key, extra_headers, resource, mgr); +} + +int RGWRESTStreamRWRequest::send_prepare(RGWAccessKey& key, map& extra_headers, const rgw_obj& obj) +{ + string resource; + send_prepare_convert(obj, &resource); + + return do_send_prepare(&key, extra_headers, resource); +} + +int RGWRESTStreamRWRequest::send_prepare(RGWAccessKey *key, map& extra_headers, const string& resource, + bufferlist *send_data) +{ + string new_resource; + //do not encode slash + url_encode(resource, new_resource, false); + + return do_send_prepare(key, extra_headers, new_resource, send_data); +} + +int RGWRESTStreamRWRequest::do_send_prepare(RGWAccessKey *key, map& extra_headers, const string& resource, + bufferlist *send_data) +{ + string new_url = url; + if (new_url[new_url.size() - 1] != '/') + new_url.append("/"); + + RGWEnv new_env; + req_info new_info(cct, &new_env); + + string new_resource; + string bucket_name; + string old_resource = resource; + + if (resource[0] == '/') { + new_resource = resource.substr(1); + } else { + new_resource = resource; + } + + size_t pos = new_resource.find("/"); + bucket_name = new_resource.substr(0, pos); + + //when dest is a bucket with out other params, uri should end up with '/' + if(pos == string::npos && params.size() == 0 && host_style == VirtualStyle) { + new_resource.append("/"); + } + + if 
(host_style == VirtualStyle) { + new_url = bucket_name + "." + new_url; + if(pos == string::npos) { + new_resource = ""; + } else { + new_resource = new_resource.substr(pos+1); + } + } + + RGWRESTGenerateHTTPHeaders headers_gen(cct, &new_env, &new_info); + + headers_gen.init(method, new_url, new_resource, params); + + headers_gen.set_http_attrs(extra_headers); + + if (key) { +#if 0 + new_info.init_meta_info(nullptr); +#endif + + int ret = headers_gen.sign(*key); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed to sign request" << dendl; + return ret; + } + } + + for (const auto& kv: new_env.get_map()) { + headers.emplace_back(kv); + } + + if (send_data) { + set_send_length(send_data->length()); + set_outbl(*send_data); + set_send_data_hint(true); + } + + + method = new_info.method; + url = headers_gen.get_url(); + + return 0; +} + +int RGWRESTStreamRWRequest::send_request(RGWAccessKey *key, map& extra_headers, const string& resource, + RGWHTTPManager *mgr, bufferlist *send_data) +{ + int ret = send_prepare(key, extra_headers, resource, send_data); + if (ret < 0) { + return ret; + } + + return send(mgr); +} + + +int RGWRESTStreamRWRequest::send(RGWHTTPManager *mgr) +{ + if (!mgr) { + return RGWHTTP::send(this); + } + + int r = mgr->add_request(this); + if (r < 0) + return r; + + return 0; +} + +int RGWRESTStreamRWRequest::complete_request(string *etag, + real_time *mtime, + uint64_t *psize, + map *pattrs, + map *pheaders) +{ + int ret = wait(); + if (ret < 0) { + return ret; + } + + unique_lock guard(out_headers_lock); + + if (etag) { + set_str_from_headers(out_headers, "ETAG", *etag); + } + if (status >= 0) { + if (mtime) { + string mtime_str; + set_str_from_headers(out_headers, "RGWX_MTIME", mtime_str); + if (!mtime_str.empty()) { + int ret = parse_rgwx_mtime(cct, mtime_str, mtime); + if (ret < 0) { + return ret; + } + } else { + *mtime = real_time(); + } + } + if (psize) { + string size_str; + set_str_from_headers(out_headers, "RGWX_OBJECT_SIZE", size_str); + 
string err; + *psize = strict_strtoll(size_str.c_str(), 10, &err); + if (!err.empty()) { + ldout(cct, 0) << "ERROR: failed parsing embedded metadata object size (" << size_str << ") to int " << dendl; + return -EIO; + } + } + } + + for (auto iter = out_headers.begin(); pattrs && iter != out_headers.end(); ++iter) { + const string& attr_name = iter->first; + if (attr_name.compare(0, sizeof(RGW_HTTP_RGWX_ATTR_PREFIX) - 1, RGW_HTTP_RGWX_ATTR_PREFIX) == 0) { + string name = attr_name.substr(sizeof(RGW_HTTP_RGWX_ATTR_PREFIX) - 1); + const char *src = name.c_str(); + char buf[name.size() + 1]; + char *dest = buf; + for (; *src; ++src, ++dest) { + switch(*src) { + case '_': + *dest = '-'; + break; + default: + *dest = tolower(*src); + } + } + *dest = '\0'; + (*pattrs)[buf] = iter->second; + } + } + + if (pheaders) { + *pheaders = std::move(out_headers); + } + return status; +} + +int RGWHTTPStreamRWRequest::handle_header(const string& name, const string& val) +{ + if (name == "RGWX_EMBEDDED_METADATA_LEN") { + string err; + long len = strict_strtol(val.c_str(), 10, &err); + if (!err.empty()) { + ldout(cct, 0) << "ERROR: failed converting embedded metadata len (" << val << ") to int " << dendl; + return -EINVAL; + } + + cb->set_extra_data_len(len); + } + return 0; +} + +int RGWHTTPStreamRWRequest::receive_data(void *ptr, size_t len, bool *pause) +{ + size_t orig_len = len; + + if (cb) { + in_data.append((const char *)ptr, len); + + size_t orig_in_data_len = in_data.length(); + + int ret = cb->handle_data(in_data, pause); + if (ret < 0) + return ret; + if (ret == 0) { + in_data.clear(); + } else { + /* partial read */ + ceph_assert(in_data.length() <= orig_in_data_len); + len = ret; + bufferlist bl; + size_t left_to_read = orig_in_data_len - len; + if (in_data.length() > left_to_read) { + in_data.splice(0, in_data.length() - left_to_read, &bl); + } + } + } + ofs += len; + return orig_len; +} + +void RGWHTTPStreamRWRequest::set_stream_write(bool s) { + Mutex::Locker 
wl(write_lock); + stream_writes = s; +} + +void RGWHTTPStreamRWRequest::unpause_receive() +{ + Mutex::Locker req_locker(get_req_lock()); + if (!read_paused) { + _set_read_paused(false); + } +} + +void RGWHTTPStreamRWRequest::add_send_data(bufferlist& bl) +{ + Mutex::Locker req_locker(get_req_lock()); + Mutex::Locker wl(write_lock); + outbl.claim_append(bl); + _set_write_paused(false); +} + +uint64_t RGWHTTPStreamRWRequest::get_pending_send_size() +{ + Mutex::Locker wl(write_lock); + return outbl.length(); +} + +void RGWHTTPStreamRWRequest::finish_write() +{ + Mutex::Locker req_locker(get_req_lock()); + Mutex::Locker wl(write_lock); + write_stream_complete = true; + _set_write_paused(false); +} + +int RGWHTTPStreamRWRequest::send_data(void *ptr, size_t len, bool *pause) +{ + uint64_t out_len; + uint64_t send_size; + { + Mutex::Locker wl(write_lock); + + if (outbl.length() == 0) { + if ((stream_writes && !write_stream_complete) || + (write_ofs < send_len)) { + *pause = true; + } + return 0; + } + + len = std::min(len, (size_t)outbl.length()); + + bufferlist bl; + outbl.splice(0, len, &bl); + send_size = bl.length(); + if (send_size > 0) { + memcpy(ptr, bl.c_str(), send_size); + write_ofs += send_size; + } + + out_len = outbl.length(); + } + /* don't need to be under write_lock here, avoid deadlocks in case notify callback + * needs to lock */ + if (write_drain_cb) { + write_drain_cb->notify(out_len); + } + return send_size; +} + +class StreamIntoBufferlist : public RGWGetDataCB { + bufferlist& bl; +public: + explicit StreamIntoBufferlist(bufferlist& _bl) : bl(_bl) {} + int handle_data(bufferlist& inbl, off_t bl_ofs, off_t bl_len) override { + bl.claim_append(inbl); + return bl_len; + } +}; + diff --git a/src/rgw/rgw_rest_client.h b/src/rgw/rgw_rest_client.h new file mode 100644 index 00000000..8f9b2c16 --- /dev/null +++ b/src/rgw/rgw_rest_client.h @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + 
+#ifndef CEPH_RGW_REST_CLIENT_H +#define CEPH_RGW_REST_CLIENT_H + +#include "rgw_http_client.h" + +class RGWGetDataCB; + +class RGWHTTPSimpleRequest : public RGWHTTPClient { +protected: + int http_status; + int status; + + using unique_lock = std::unique_lock; + + std::mutex out_headers_lock; + map out_headers; + param_vec_t params; + + bufferlist::iterator *send_iter; + + size_t max_response; /* we need this as we don't stream out response */ + bufferlist response; + + virtual int handle_header(const string& name, const string& val); + void get_params_str(map& extra_args, string& dest); + +public: + RGWHTTPSimpleRequest(CephContext *_cct, const string& _method, const string& _url, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPClient(_cct, _method, _url), + http_status(0), status(0), + send_iter(NULL), + max_response(0) { + set_headers(_headers); + set_params(_params); + } + + void set_headers(param_vec_t *_headers) { + if (_headers) + headers = *_headers; + } + + void set_params(param_vec_t *_params) { + if (_params) + params = *_params; + } + + int receive_header(void *ptr, size_t len) override; + int receive_data(void *ptr, size_t len, bool *pause) override; + int send_data(void *ptr, size_t len, bool* pause=nullptr) override; + + bufferlist& get_response() { return response; } + + void get_out_headers(map *pheaders); /* modifies out_headers */ + + int get_http_status() { return http_status; } + int get_status(); +}; + +class RGWRESTSimpleRequest : public RGWHTTPSimpleRequest { +public: + RGWRESTSimpleRequest(CephContext *_cct, const string& _method, const string& _url, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params) {} + + int execute(RGWAccessKey& key, const char *method, const char *resource); + int forward_request(RGWAccessKey& key, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl); +}; + +class RGWWriteDrainCB { +public: + RGWWriteDrainCB() = default; + virtual 
~RGWWriteDrainCB() = default; + virtual void notify(uint64_t pending_size) = 0; +}; + +class RGWRESTGenerateHTTPHeaders { + CephContext *cct; + RGWEnv *new_env; + req_info *new_info; + string method; + string url; + string resource; + +public: + RGWRESTGenerateHTTPHeaders(CephContext *_cct, RGWEnv *_env, req_info *_info) : cct(_cct), new_env(_env), new_info(_info) {} + void init(const string& method, const string& url, const string& resource, const param_vec_t& params); + void set_extra_headers(const map& extra_headers); + int set_obj_attrs(map& rgw_attrs); + void set_http_attrs(const map& http_attrs); + void set_policy(RGWAccessControlPolicy& policy); + int sign(RGWAccessKey& key); + + const string& get_url() { return url; } +}; + +class RGWHTTPStreamRWRequest : public RGWHTTPSimpleRequest { +public: + class ReceiveCB; + +private: + Mutex lock; + Mutex write_lock; + ReceiveCB *cb{nullptr}; + RGWWriteDrainCB *write_drain_cb{nullptr}; + bufferlist outbl; + bufferlist in_data; + size_t chunk_ofs{0}; + size_t ofs{0}; + uint64_t write_ofs{0}; + bool read_paused{false}; + bool send_paused{false}; + bool stream_writes{false}; + bool write_stream_complete{false}; +protected: + int handle_header(const string& name, const string& val) override; +public: + int send_data(void *ptr, size_t len, bool *pause) override; + int receive_data(void *ptr, size_t len, bool *pause) override; + + class ReceiveCB { + protected: + uint64_t extra_data_len{0}; + public: + ReceiveCB() = default; + virtual ~ReceiveCB() = default; + virtual int handle_data(bufferlist& bl, bool *pause = nullptr) = 0; + virtual void set_extra_data_len(uint64_t len) { + extra_data_len = len; + } + }; + + RGWHTTPStreamRWRequest(CephContext *_cct, const string& _method, const string& _url, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params), + lock("RGWHTTPStreamRWRequest"), write_lock("RGWHTTPStreamRWRequest::write_lock") { + } + 
RGWHTTPStreamRWRequest(CephContext *_cct, const string& _method, const string& _url, ReceiveCB *_cb, + param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params), + lock("RGWHTTPStreamRWRequest"), write_lock("RGWHTTPStreamRWRequest::write_lock"), cb(_cb) { + } + virtual ~RGWHTTPStreamRWRequest() override {} + + void set_outbl(bufferlist& _outbl) { + outbl.swap(_outbl); + } + + void set_in_cb(ReceiveCB *_cb) { cb = _cb; } + void set_write_drain_cb(RGWWriteDrainCB *_cb) { write_drain_cb = _cb; } + + void unpause_receive(); + + void add_send_data(bufferlist& bl); + + void set_stream_write(bool s); + + uint64_t get_pending_send_size(); + + /* finish streaming writes */ + void finish_write(); +}; + +class RGWRESTStreamRWRequest : public RGWHTTPStreamRWRequest { +protected: + HostStyle host_style; +public: + RGWRESTStreamRWRequest(CephContext *_cct, const string& _method, const string& _url, RGWHTTPStreamRWRequest::ReceiveCB *_cb, + param_vec_t *_headers, param_vec_t *_params, HostStyle _host_style = PathStyle) : RGWHTTPStreamRWRequest(_cct, _method, _url, _cb, _headers, _params), host_style(_host_style) { + } + virtual ~RGWRESTStreamRWRequest() override {} + + int send_prepare(RGWAccessKey *key, map& extra_headers, const string& resource, bufferlist *send_data = nullptr /* optional input data */); + int send_prepare(RGWAccessKey& key, map& extra_headers, const rgw_obj& obj); + int send(RGWHTTPManager *mgr); + + int send_request(RGWAccessKey& key, map& extra_headers, const rgw_obj& obj, RGWHTTPManager *mgr); + int send_request(RGWAccessKey *key, map& extra_headers, const string& resource, RGWHTTPManager *mgr, bufferlist *send_data = nullptr /* optional input data */); + + int complete_request(string *etag = nullptr, + real_time *mtime = nullptr, + uint64_t *psize = nullptr, + map *pattrs = nullptr, + map *pheaders = nullptr); + + void add_params(param_vec_t *params); + +private: + int do_send_prepare(RGWAccessKey *key, 
map& extra_headers, const string& resource, bufferlist *send_data = nullptr /* optional input data */); +}; + +class RGWRESTStreamReadRequest : public RGWRESTStreamRWRequest { +public: + RGWRESTStreamReadRequest(CephContext *_cct, const string& _url, ReceiveCB *_cb, param_vec_t *_headers, + param_vec_t *_params, HostStyle _host_style = PathStyle) : RGWRESTStreamRWRequest(_cct, "GET", _url, _cb, _headers, _params, _host_style) {} +}; + +class RGWRESTStreamHeadRequest : public RGWRESTStreamRWRequest { +public: + RGWRESTStreamHeadRequest(CephContext *_cct, const string& _url, ReceiveCB *_cb, param_vec_t *_headers, + param_vec_t *_params) : RGWRESTStreamRWRequest(_cct, "HEAD", _url, _cb, _headers, _params) {} +}; + +class RGWRESTStreamS3PutObj : public RGWRESTStreamRWRequest { + RGWGetDataCB *out_cb; + RGWEnv new_env; + req_info new_info; + RGWRESTGenerateHTTPHeaders headers_gen; +public: + RGWRESTStreamS3PutObj(CephContext *_cct, const string& _method, const string& _url, param_vec_t *_headers, + param_vec_t *_params, HostStyle _host_style) : RGWRESTStreamRWRequest(_cct, _method, _url, nullptr, _headers, _params, _host_style), + out_cb(NULL), new_info(cct, &new_env), headers_gen(_cct, &new_env, &new_info) {} + ~RGWRESTStreamS3PutObj() override; + + void send_init(rgw_obj& obj); + int send_ready(RGWAccessKey& key, map& rgw_attrs, bool send); + int send_ready(RGWAccessKey& key, const map& http_attrs, + RGWAccessControlPolicy& policy, bool send); + int send_ready(RGWAccessKey& key, bool send); + + int put_obj_init(RGWAccessKey& key, rgw_obj& obj, uint64_t obj_size, map& attrs, bool send); + + RGWGetDataCB *get_out_cb() { return out_cb; } +}; + +#endif + diff --git a/src/rgw/rgw_rest_config.cc b/src/rgw/rgw_rest_config.cc new file mode 100644 index 00000000..e5b863d0 --- /dev/null +++ b/src/rgw/rgw_rest_config.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file 
system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/ceph_json.h" +#include "common/strtol.h" +#include "rgw_rest.h" +#include "rgw_op.h" +#include "rgw_rados.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_config.h" +#include "rgw_client_io.h" +#include "common/errno.h" +#include "include/ceph_assert.h" + +#include "services/svc_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +void RGWOp_ZoneGroupMap_Get::execute() { + http_ret = zonegroup_map.read(g_ceph_context, store->svc.sysobj); + if (http_ret < 0) { + dout(5) << "failed to read zone_group map" << dendl; + } +} + +void RGWOp_ZoneGroupMap_Get::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret < 0) + return; + + if (old_format) { + RGWRegionMap region_map; + region_map.regions = zonegroup_map.zonegroups; + region_map.master_region = zonegroup_map.master_zonegroup; + region_map.bucket_quota = zonegroup_map.bucket_quota; + region_map.user_quota = zonegroup_map.user_quota; + encode_json("region-map", region_map, s->formatter); + } else { + encode_json("zonegroup-map", zonegroup_map, s->formatter); + } + flusher.flush(); +} + +void RGWOp_ZoneConfig_Get::send_response() { + const RGWZoneParams& zone_params = store->svc.zone->get_zone_params(); + + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret < 0) + return; + + encode_json("zone_params", zone_params, s->formatter); + flusher.flush(); +} + +RGWOp* RGWHandler_Config::op_get() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (type.compare("zonegroup-map") == 0) { + return new RGWOp_ZoneGroupMap_Get(false); + } else if (type.compare("zone") == 0) { + return new 
RGWOp_ZoneConfig_Get(); + } else { + return new RGWOp_ZoneGroupMap_Get(true); + } +} diff --git a/src/rgw/rgw_rest_config.h b/src/rgw/rgw_rest_config.h new file mode 100644 index 00000000..56ca129b --- /dev/null +++ b/src/rgw/rgw_rest_config.h @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RGW_REST_CONFIG_H +#define RGW_REST_CONFIG_H + +#include "rgw_zone.h" + +class RGWOp_ZoneGroupMap_Get : public RGWRESTOp { + RGWZoneGroupMap zonegroup_map; + bool old_format; +public: + explicit RGWOp_ZoneGroupMap_Get(bool _old_format):old_format(_old_format) {} + ~RGWOp_ZoneGroupMap_Get() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + if (old_format) { + return "get_region_map"; + } else { + return "get_zonegroup_map"; + } + } +}; + +class RGWOp_ZoneConfig_Get : public RGWRESTOp { + RGWZoneParams zone_params; +public: + RGWOp_ZoneConfig_Get() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override {} /* store already has the info we need, just need to send response */ + void send_response() override ; + const char* name() const override { + return "get_zone_config"; + } +}; + +class RGWHandler_Config : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + + int read_permissions(RGWOp*) 
override { + return 0; + } +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Config() override = default; +}; + + +class RGWRESTMgr_Config : public RGWRESTMgr { +public: + RGWRESTMgr_Config() = default; + ~RGWRESTMgr_Config() override = default; + + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Config(auth_registry); + } +}; + +#endif /* RGW_REST_CONFIG_H */ diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc new file mode 100644 index 00000000..08eb51cd --- /dev/null +++ b/src/rgw/rgw_rest_conn.cc @@ -0,0 +1,466 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_rest_conn.h" + +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +RGWRESTConn::RGWRESTConn(CephContext *_cct, RGWSI_Zone *zone_svc, + const string& _remote_id, + const list& remote_endpoints, + HostStyle _host_style) + : cct(_cct), + endpoints(remote_endpoints.begin(), remote_endpoints.end()), + remote_id(_remote_id), host_style(_host_style) +{ + if (zone_svc) { + key = zone_svc->get_zone_params().system_key; + self_zone_group = zone_svc->get_zonegroup().get_id(); + } +} + +RGWRESTConn::RGWRESTConn(CephContext *_cct, RGWSI_Zone *zone_svc, + const string& _remote_id, + const list& remote_endpoints, + RGWAccessKey _cred, + HostStyle _host_style) + : cct(_cct), + endpoints(remote_endpoints.begin(), remote_endpoints.end()), + key(std::move(_cred)), + remote_id(_remote_id), host_style(_host_style) +{ + if (zone_svc) { + self_zone_group = zone_svc->get_zonegroup().get_id(); + } +} + +RGWRESTConn::RGWRESTConn(RGWRESTConn&& other) + : cct(other.cct), + endpoints(std::move(other.endpoints)), + key(std::move(other.key)), + self_zone_group(std::move(other.self_zone_group)), + remote_id(std::move(other.remote_id)), + 
counter(other.counter.load()) +{ +} + +RGWRESTConn& RGWRESTConn::operator=(RGWRESTConn&& other) +{ + cct = other.cct; + endpoints = std::move(other.endpoints); + key = std::move(other.key); + self_zone_group = std::move(other.self_zone_group); + remote_id = std::move(other.remote_id); + counter = other.counter.load(); + return *this; +} + +int RGWRESTConn::get_url(string& endpoint) +{ + if (endpoints.empty()) { + ldout(cct, 0) << "ERROR: endpoints not configured for upstream zone" << dendl; + return -EIO; + } + + int i = ++counter; + endpoint = endpoints[i % endpoints.size()]; + + return 0; +} + +string RGWRESTConn::get_url() +{ + string endpoint; + if (endpoints.empty()) { + ldout(cct, 0) << "WARNING: endpoints not configured for upstream zone" << dendl; /* we'll catch this later */ + return endpoint; + } + + int i = ++counter; + endpoint = endpoints[i % endpoints.size()]; + + return endpoint; +} + +void RGWRESTConn::populate_params(param_vec_t& params, const rgw_user *uid, const string& zonegroup) +{ + populate_uid(params, uid); + populate_zonegroup(params, zonegroup); +} + +int RGWRESTConn::forward(const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + param_vec_t params; + populate_params(params, &uid, self_zone_group); + if (objv) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "tag", objv->tag)); + char buf[16]; + snprintf(buf, sizeof(buf), "%lld", (long long)objv->ver); + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "ver", buf)); + } + RGWRESTSimpleRequest req(cct, info.method, url, NULL, ¶ms); + return req.forward_request(key, info, max_response, inbl, outbl); +} + +class StreamObjData : public RGWGetDataCB { + rgw_obj obj; +public: + explicit StreamObjData(rgw_obj& _obj) : obj(_obj) {} +}; + +int RGWRESTConn::put_obj_send_init(rgw_obj& obj, const rgw_http_param_pair *extra_params, RGWRESTStreamS3PutObj **req) +{ 
+ string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + rgw_user uid; + param_vec_t params; + populate_params(params, &uid, self_zone_group); + + if (extra_params) { + append_param_list(params, extra_params); + } + + RGWRESTStreamS3PutObj *wr = new RGWRESTStreamS3PutObj(cct, "PUT", url, NULL, ¶ms, host_style); + wr->send_init(obj); + *req = wr; + return 0; +} + +int RGWRESTConn::put_obj_async(const rgw_user& uid, rgw_obj& obj, uint64_t obj_size, + map& attrs, bool send, + RGWRESTStreamS3PutObj **req) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + populate_params(params, &uid, self_zone_group); + RGWRESTStreamS3PutObj *wr = new RGWRESTStreamS3PutObj(cct, "PUT", url, NULL, ¶ms, host_style); + ret = wr->put_obj_init(key, obj, obj_size, attrs, send); + if (ret < 0) { + delete wr; + return ret; + } + *req = wr; + return 0; +} + +int RGWRESTConn::complete_request(RGWRESTStreamS3PutObj *req, string& etag, real_time *mtime) +{ + int ret = req->complete_request(&etag, mtime); + delete req; + + return ret; +} + +static void set_date_header(const real_time *t, map& headers, bool high_precision_time, const string& header_name) +{ + if (!t) { + return; + } + stringstream s; + utime_t tm = utime_t(*t); + if (high_precision_time) { + tm.gmtime_nsec(s); + } else { + tm.gmtime(s); + } + headers[header_name] = s.str(); +} + +template +static void set_header(T val, map& headers, const string& header_name) +{ + stringstream s; + s << val; + headers[header_name] = s.str(); +} + + +int RGWRESTConn::get_obj(const rgw_user& uid, req_info *info /* optional */, const rgw_obj& obj, + const real_time *mod_ptr, const real_time *unmod_ptr, + uint32_t mod_zone_id, uint64_t mod_pg_ver, + bool prepend_metadata, bool get_op, bool rgwx_stat, + bool sync_manifest, bool skip_decrypt, + bool send, RGWHTTPStreamRWRequest::ReceiveCB *cb, RGWRESTStreamRWRequest **req) +{ + get_obj_params params; + params.uid = uid; + params.info = info; + 
params.mod_ptr = mod_ptr; + params.mod_pg_ver = mod_pg_ver; + params.prepend_metadata = prepend_metadata; + params.get_op = get_op; + params.rgwx_stat = rgwx_stat; + params.sync_manifest = sync_manifest; + params.skip_decrypt = skip_decrypt; + params.cb = cb; + return get_obj(obj, params, send, req); +} + +int RGWRESTConn::get_obj(const rgw_obj& obj, const get_obj_params& in_params, bool send, RGWRESTStreamRWRequest **req) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + populate_params(params, &in_params.uid, self_zone_group); + if (in_params.prepend_metadata) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "prepend-metadata", "true")); + } + if (in_params.rgwx_stat) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "stat", "true")); + } + if (in_params.sync_manifest) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-manifest", "")); + } + if (in_params.skip_decrypt) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "skip-decrypt", "")); + } + if (!obj.key.instance.empty()) { + const string& instance = obj.key.instance; + params.push_back(param_pair_t("versionId", instance)); + } + if (in_params.get_op) { + *req = new RGWRESTStreamReadRequest(cct, url, in_params.cb, NULL, ¶ms, host_style); + } else { + *req = new RGWRESTStreamHeadRequest(cct, url, in_params.cb, NULL, ¶ms); + } + map extra_headers; + if (in_params.info) { + const auto& orig_map = in_params.info->env->get_map(); + + /* add original headers that start with HTTP_X_AMZ_ */ + static constexpr char SEARCH_AMZ_PREFIX[] = "HTTP_X_AMZ_"; + for (auto iter= orig_map.lower_bound(SEARCH_AMZ_PREFIX); iter != orig_map.end(); ++iter) { + const string& name = iter->first; + if (name == "HTTP_X_AMZ_DATE") /* don't forward date from original request */ + continue; + if (name.compare(0, strlen(SEARCH_AMZ_PREFIX), SEARCH_AMZ_PREFIX) != 0) + break; + extra_headers[iter->first] = iter->second; + } + } + + set_date_header(in_params.mod_ptr, 
extra_headers, in_params.high_precision_time, "HTTP_IF_MODIFIED_SINCE"); + set_date_header(in_params.unmod_ptr, extra_headers, in_params.high_precision_time, "HTTP_IF_UNMODIFIED_SINCE"); + if (!in_params.etag.empty()) { + set_header(in_params.etag, extra_headers, "HTTP_IF_MATCH"); + } + if (in_params.mod_zone_id != 0) { + set_header(in_params.mod_zone_id, extra_headers, "HTTP_DEST_ZONE_SHORT_ID"); + } + if (in_params.mod_pg_ver != 0) { + set_header(in_params.mod_pg_ver, extra_headers, "HTTP_DEST_PG_VER"); + } + if (in_params.range_is_set) { + char buf[64]; + snprintf(buf, sizeof(buf), "bytes=%lld-%lld", (long long)in_params.range_start, (long long)in_params.range_end); + set_header(buf, extra_headers, "RANGE"); + } + + int r = (*req)->send_prepare(key, extra_headers, obj); + if (r < 0) { + goto done_err; + } + + if (!send) { + return 0; + } + + r = (*req)->send(nullptr); + if (r < 0) { + goto done_err; + } + return 0; +done_err: + delete *req; + *req = nullptr; + return r; +} + +int RGWRESTConn::complete_request(RGWRESTStreamRWRequest *req, + string *etag, + real_time *mtime, + uint64_t *psize, + map *pattrs, + map *pheaders) +{ + int ret = req->complete_request(etag, mtime, psize, pattrs, pheaders); + delete req; + + return ret; +} + +int RGWRESTConn::get_resource(const string& resource, + param_vec_t *extra_params, + map *extra_headers, + bufferlist& bl, + bufferlist *send_data, + RGWHTTPManager *mgr) +{ + string url; + int ret = get_url(url); + if (ret < 0) + return ret; + + param_vec_t params; + + if (extra_params) { + params.insert(params.end(), extra_params->begin(), extra_params->end()); + } + + populate_params(params, nullptr, self_zone_group); + + RGWStreamIntoBufferlist cb(bl); + + RGWRESTStreamReadRequest req(cct, url, &cb, NULL, ¶ms, host_style); + + map headers; + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + ret = req.send_request(&key, headers, resource, mgr, send_data); + if (ret < 0) { + ldout(cct, 5) 
<< __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return req.complete_request(); +} + +RGWRESTReadResource::RGWRESTReadResource(RGWRESTConn *_conn, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), resource(_resource), + params(make_param_list(pp)), cb(bl), mgr(_mgr), + req(cct, conn->get_url(), &cb, NULL, NULL) +{ + init_common(extra_headers); +} + +RGWRESTReadResource::RGWRESTReadResource(RGWRESTConn *_conn, + const string& _resource, + param_vec_t& _params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), resource(_resource), params(_params), + cb(bl), mgr(_mgr), req(cct, conn->get_url(), &cb, NULL, NULL) +{ + init_common(extra_headers); +} + +void RGWRESTReadResource::init_common(param_vec_t *extra_headers) +{ + conn->populate_params(params, nullptr, conn->get_self_zonegroup()); + + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + req.set_params(¶ms); +} + +int RGWRESTReadResource::read() +{ + int ret = req.send_request(&conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldout(cct, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return req.complete_request(); +} + +int RGWRESTReadResource::aio_read() +{ + int ret = req.send_request(&conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldout(cct, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +RGWRESTSendResource::RGWRESTSendResource(RGWRESTConn *_conn, + const string& _method, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), method(_method), resource(_resource), + 
params(make_param_list(pp)), cb(bl), mgr(_mgr), + req(cct, method.c_str(), conn->get_url(), &cb, NULL, NULL, _conn->get_host_style()) +{ + init_common(extra_headers); +} + +RGWRESTSendResource::RGWRESTSendResource(RGWRESTConn *_conn, + const string& _method, + const string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) + : cct(_conn->get_ctx()), conn(_conn), method(_method), resource(_resource), params(params), + cb(bl), mgr(_mgr), req(cct, method.c_str(), conn->get_url(), &cb, NULL, NULL, _conn->get_host_style()) +{ + init_common(extra_headers); +} + +void RGWRESTSendResource::init_common(param_vec_t *extra_headers) +{ + conn->populate_params(params, nullptr, conn->get_self_zonegroup()); + + if (extra_headers) { + headers.insert(extra_headers->begin(), extra_headers->end()); + } + + req.set_params(¶ms); +} + +int RGWRESTSendResource::send(bufferlist& outbl) +{ + req.set_send_length(outbl.length()); + req.set_outbl(outbl); + + int ret = req.send_request(&conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldout(cct, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return req.complete_request(); +} + +int RGWRESTSendResource::aio_send(bufferlist& outbl) +{ + req.set_send_length(outbl.length()); + req.set_outbl(outbl); + + int ret = req.send_request(&conn->get_key(), headers, resource, mgr); + if (ret < 0) { + ldout(cct, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl; + return ret; + } + + return 0; +} diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h new file mode 100644 index 00000000..9a210292 --- /dev/null +++ b/src/rgw/rgw_rest_conn.h @@ -0,0 +1,521 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_CONN_H +#define CEPH_RGW_REST_CONN_H + +#include "rgw_rados.h" +#include "rgw_rest_client.h" +#include 
"common/ceph_json.h" +#include "common/RefCountedObj.h" + +#include + +class CephContext; +class RGWSI_Zone; + +template +static int parse_decode_json(T& t, bufferlist& bl) +{ + JSONParser p; + if (!p.parse(bl.c_str(), bl.length())) { + return -EINVAL; + } + + try { + decode_json_obj(t, &p); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + return 0; +} + +struct rgw_http_param_pair { + const char *key; + const char *val; +}; + +// append a null-terminated rgw_http_param_pair list into a list of string pairs +inline void append_param_list(param_vec_t& params, const rgw_http_param_pair* pp) +{ + while (pp && pp->key) { + string k = pp->key; + string v = (pp->val ? pp->val : ""); + params.emplace_back(make_pair(std::move(k), std::move(v))); + ++pp; + } +} + +// copy a null-terminated rgw_http_param_pair list into a list of string pairs +inline param_vec_t make_param_list(const rgw_http_param_pair* pp) +{ + param_vec_t params; + append_param_list(params, pp); + return params; +} + +inline param_vec_t make_param_list(const map *pp) +{ + param_vec_t params; + if (!pp) { + return params; + } + for (auto iter : *pp) { + params.emplace_back(make_pair(iter.first, iter.second)); + } + return params; +} + +class RGWRESTConn +{ + CephContext *cct; + vector endpoints; + RGWAccessKey key; + string self_zone_group; + string remote_id; + HostStyle host_style; + std::atomic counter = { 0 }; + +public: + + RGWRESTConn(CephContext *_cct, RGWSI_Zone *zone_svc, const string& _remote_id, const list& endpoints, HostStyle _host_style = PathStyle); + RGWRESTConn(CephContext *_cct, RGWSI_Zone *zone_svc, const string& _remote_id, const list& endpoints, RGWAccessKey _cred, HostStyle _host_style = PathStyle); + + // custom move needed for atomic + RGWRESTConn(RGWRESTConn&& other); + RGWRESTConn& operator=(RGWRESTConn&& other); + virtual ~RGWRESTConn() = default; + + int get_url(string& endpoint); + string get_url(); + const string& get_self_zonegroup() { + return self_zone_group; + } + 
const string& get_remote_id() { + return remote_id; + } + RGWAccessKey& get_key() { + return key; + } + + HostStyle get_host_style() { + return host_style; + } + + CephContext *get_ctx() { + return cct; + } + size_t get_endpoint_count() const { return endpoints.size(); } + + virtual void populate_params(param_vec_t& params, const rgw_user *uid, const string& zonegroup); + + /* sync request */ + int forward(const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl); + + + /* async requests */ + int put_obj_send_init(rgw_obj& obj, const rgw_http_param_pair *extra_params, RGWRESTStreamS3PutObj **req); + int put_obj_async(const rgw_user& uid, rgw_obj& obj, uint64_t obj_size, + map& attrs, bool send, RGWRESTStreamS3PutObj **req); + int complete_request(RGWRESTStreamS3PutObj *req, string& etag, ceph::real_time *mtime); + + struct get_obj_params { + rgw_user uid; + req_info *info{nullptr}; + const ceph::real_time *mod_ptr{nullptr}; + const ceph::real_time *unmod_ptr{nullptr}; + bool high_precision_time{true}; + + string etag; + + uint32_t mod_zone_id{0}; + uint64_t mod_pg_ver{0}; + + bool prepend_metadata{false}; + bool get_op{false}; + bool rgwx_stat{false}; + bool sync_manifest{false}; + + bool skip_decrypt{true}; + RGWHTTPStreamRWRequest::ReceiveCB *cb{nullptr}; + + bool range_is_set{false}; + uint64_t range_start{0}; + uint64_t range_end{0}; + }; + + int get_obj(const rgw_obj& obj, const get_obj_params& params, bool send, RGWRESTStreamRWRequest **req); + + int get_obj(const rgw_user& uid, req_info *info /* optional */, const rgw_obj& obj, + const ceph::real_time *mod_ptr, const ceph::real_time *unmod_ptr, + uint32_t mod_zone_id, uint64_t mod_pg_ver, + bool prepend_metadata, bool get_op, bool rgwx_stat, bool sync_manifest, + bool skip_decrypt, bool send, RGWHTTPStreamRWRequest::ReceiveCB *cb, RGWRESTStreamRWRequest **req); + int complete_request(RGWRESTStreamRWRequest *req, + string *etag, + ceph::real_time *mtime, 
+ uint64_t *psize, + map *pattrs, + map *pheaders); + + int get_resource(const string& resource, + param_vec_t *extra_params, + map* extra_headers, + bufferlist& bl, + bufferlist *send_data = nullptr, + RGWHTTPManager *mgr = nullptr); + + template + int get_json_resource(const string& resource, param_vec_t *params, bufferlist *in_data, T& t); + template + int get_json_resource(const string& resource, param_vec_t *params, T& t); + template + int get_json_resource(const string& resource, const rgw_http_param_pair *pp, T& t); + +private: + void populate_zonegroup(param_vec_t& params, const string& zonegroup) { + if (!zonegroup.empty()) { + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "zonegroup", zonegroup)); + } + } + void populate_uid(param_vec_t& params, const rgw_user *uid) { + if (uid) { + string uid_str = uid->to_str(); + if (!uid->empty()){ + params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "uid", uid_str)); + } + } + } +}; + +class S3RESTConn : public RGWRESTConn { + +public: + + S3RESTConn(CephContext *_cct, RGWSI_Zone *svc_zone, const string& _remote_id, const list& endpoints, HostStyle _host_style = PathStyle) : + RGWRESTConn(_cct, svc_zone, _remote_id, endpoints, _host_style) {} + + S3RESTConn(CephContext *_cct, RGWSI_Zone *svc_zone, const string& _remote_id, const list& endpoints, RGWAccessKey _cred, HostStyle _host_style = PathStyle): + RGWRESTConn(_cct, svc_zone, _remote_id, endpoints, _cred, _host_style) {} + ~S3RESTConn() override = default; + + void populate_params(param_vec_t& params, const rgw_user *uid, const string& zonegroup) override { + // do not populate any params in S3 REST Connection. 
+ return; + } +}; + + +template +int RGWRESTConn::get_json_resource(const string& resource, param_vec_t *params, bufferlist *in_data, T& t) +{ + bufferlist bl; + int ret = get_resource(resource, params, nullptr, bl, in_data); + if (ret < 0) { + return ret; + } + + ret = parse_decode_json(t, bl); + if (ret < 0) { + return ret; + } + + return 0; +} + +template +int RGWRESTConn::get_json_resource(const string& resource, param_vec_t *params, T& t) +{ + return get_json_resource(resource, params, nullptr, t); +} + +template +int RGWRESTConn::get_json_resource(const string& resource, const rgw_http_param_pair *pp, T& t) +{ + param_vec_t params = make_param_list(pp); + return get_json_resource(resource, ¶ms, t); +} + +class RGWStreamIntoBufferlist : public RGWHTTPStreamRWRequest::ReceiveCB { + bufferlist& bl; +public: + explicit RGWStreamIntoBufferlist(bufferlist& _bl) : bl(_bl) {} + int handle_data(bufferlist& inbl, bool *pause) override { + bl.claim_append(inbl); + return inbl.length(); + } +}; + +class RGWRESTReadResource : public RefCountedObject, public RGWIOProvider { + CephContext *cct; + RGWRESTConn *conn; + string resource; + param_vec_t params; + map headers; + bufferlist bl; + RGWStreamIntoBufferlist cb; + + RGWHTTPManager *mgr; + RGWRESTStreamReadRequest req; + + void init_common(param_vec_t *extra_headers); + +public: + RGWRESTReadResource(RGWRESTConn *_conn, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + + RGWRESTReadResource(RGWRESTConn *_conn, + const string& _resource, + param_vec_t& _params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + ~RGWRESTReadResource() = default; + + rgw_io_id get_io_id(int io_type) { + return req.get_io_id(io_type); + } + + void set_io_user_info(void *user_info) override { + req.set_io_user_info(user_info); + } + + void *get_io_user_info() override { + return req.get_io_user_info(); + } + + template + int decode_resource(T *dest); + + int read(); + + 
int aio_read(); + + string to_str() { + return req.to_str(); + } + + int get_http_status() { + return req.get_http_status(); + } + + int wait(bufferlist *pbl) { + int ret = req.wait(); + if (ret < 0) { + return ret; + } + + if (req.get_status() < 0) { + return req.get_status(); + } + *pbl = bl; + return 0; + } + + template + int wait(T *dest); + + template + int fetch(T *dest); +}; + + +template +int RGWRESTReadResource::decode_resource(T *dest) +{ + int ret = req.get_status(); + if (ret < 0) { + return ret; + } + ret = parse_decode_json(*dest, bl); + if (ret < 0) { + return ret; + } + return 0; +} + +template +int RGWRESTReadResource::fetch(T *dest) +{ + int ret = read(); + if (ret < 0) { + return ret; + } + + ret = decode_resource(dest); + if (ret < 0) { + return ret; + } + return 0; +} + +template +int RGWRESTReadResource::wait(T *dest) +{ + int ret = req.wait(); + if (ret < 0) { + return ret; + } + + ret = decode_resource(dest); + if (ret < 0) { + return ret; + } + return 0; +} + +class RGWRESTSendResource : public RefCountedObject, public RGWIOProvider { + CephContext *cct; + RGWRESTConn *conn; + string method; + string resource; + param_vec_t params; + map headers; + bufferlist bl; + RGWStreamIntoBufferlist cb; + + RGWHTTPManager *mgr; + RGWRESTStreamRWRequest req; + + void init_common(param_vec_t *extra_headers); + +public: + RGWRESTSendResource(RGWRESTConn *_conn, + const string& _method, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + + RGWRESTSendResource(RGWRESTConn *_conn, + const string& _method, + const string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr); + + ~RGWRESTSendResource() = default; + + rgw_io_id get_io_id(int io_type) { + return req.get_io_id(io_type); + } + + void set_io_user_info(void *user_info) override { + req.set_io_user_info(user_info); + } + + void *get_io_user_info() override { + return req.get_io_user_info(); + } + + 
int send(bufferlist& bl); + + int aio_send(bufferlist& bl); + + string to_str() { + return req.to_str(); + } + + int get_http_status() { + return req.get_http_status(); + } + + template + int wait(bufferlist *pbl, E *err_result = nullptr) { + int ret = req.wait(); + *pbl = bl; + + if (ret < 0 && err_result ) { + ret = parse_decode_json(*err_result, bl); + } + + return req.get_status(); + } + + template + int wait(T *dest, E *err_result = nullptr); +}; + +template +int RGWRESTSendResource::wait(T *dest, E *err_result) +{ + int ret = req.wait(); + if (ret >= 0) { + ret = req.get_status(); + } + + if (ret < 0 && err_result) { + ret = parse_decode_json(*err_result, bl); + } + + if (ret < 0) { + return ret; + } + + ret = parse_decode_json(*dest, bl); + if (ret < 0) { + return ret; + } + return 0; + +} + +class RGWRESTPostResource : public RGWRESTSendResource { +public: + RGWRESTPostResource(RGWRESTConn *_conn, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "POST", _resource, + pp, extra_headers, _mgr) {} + + RGWRESTPostResource(RGWRESTConn *_conn, + const string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "POST", _resource, + params, extra_headers, _mgr) {} + +}; + +class RGWRESTPutResource : public RGWRESTSendResource { +public: + RGWRESTPutResource(RGWRESTConn *_conn, + const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "PUT", _resource, + pp, extra_headers, _mgr) {} + + RGWRESTPutResource(RGWRESTConn *_conn, + const string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "PUT", _resource, + params, extra_headers, _mgr) {} + +}; + +class RGWRESTDeleteResource : public RGWRESTSendResource { +public: + RGWRESTDeleteResource(RGWRESTConn *_conn, + 
const string& _resource, + const rgw_http_param_pair *pp, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "DELETE", _resource, + pp, extra_headers, _mgr) {} + + RGWRESTDeleteResource(RGWRESTConn *_conn, + const string& _resource, + param_vec_t& params, + param_vec_t *extra_headers, + RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "DELETE", _resource, + params, extra_headers, _mgr) {} + +}; + + + +#endif diff --git a/src/rgw/rgw_rest_iam.cc b/src/rgw/rgw_rest_iam.cc new file mode 100644 index 00000000..ef0e958d --- /dev/null +++ b/src/rgw/rgw_rest_iam.cc @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "rgw_rest.h" +#include "rgw_rest_iam.h" + +#include "rgw_request.h" +#include "rgw_process.h" + +#include "rgw_rest_role.h" +#include "rgw_rest_user_policy.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +void RGWHandler_REST_IAM::rgw_iam_parse_input() +{ + if (post_body.size() > 0) { + ldout(s->cct, 10) << "Content of POST: " << post_body << dendl; + + if (post_body.find("Action") != string::npos) { + boost::char_separator sep("&"); + boost::tokenizer> tokens(post_body, sep); + for (const auto& t : tokens) { + auto pos = t.find("="); + if (pos != string::npos) { + std::string key = t.substr(0, pos); + std::string value = t.substr(pos + 1, t.size() - 1); + if (key == "AssumeRolePolicyDocument" || key == "Path" || key == "PolicyDocument") { + value = url_decode(value); + } + s->info.args.append(key, value); + } + } + } + } + auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body); + s->info.args.append("PayloadHash", payload_hash); +} + +RGWOp *RGWHandler_REST_IAM::op_post() +{ + rgw_iam_parse_input(); + + if (s->info.args.exists("Action")) { + string action = s->info.args.get("Action"); + if (action.compare("CreateRole") == 0) + return new RGWCreateRole; + if (action.compare("DeleteRole") == 0) 
+ return new RGWDeleteRole; + if (action.compare("GetRole") == 0) + return new RGWGetRole; + if (action.compare("UpdateAssumeRolePolicy") == 0) + return new RGWModifyRole; + if (action.compare("ListRoles") == 0) + return new RGWListRoles; + if (action.compare("PutRolePolicy") == 0) + return new RGWPutRolePolicy; + if (action.compare("GetRolePolicy") == 0) + return new RGWGetRolePolicy; + if (action.compare("ListRolePolicies") == 0) + return new RGWListRolePolicies; + if (action.compare("DeleteRolePolicy") == 0) + return new RGWDeleteRolePolicy; + if (action.compare("PutUserPolicy") == 0) + return new RGWPutUserPolicy; + if (action.compare("GetUserPolicy") == 0) + return new RGWGetUserPolicy; + if (action.compare("ListUserPolicies") == 0) + return new RGWListUserPolicies; + if (action.compare("DeleteUserPolicy") == 0) + return new RGWDeleteUserPolicy; + } + + return nullptr; +} + +int RGWHandler_REST_IAM::init(RGWRados *store, + struct req_state *s, + rgw::io::BasicClient *cio) +{ + s->dialect = "iam"; + + if (int ret = RGWHandler_REST_IAM::init_from_header(s, RGW_FORMAT_XML, true); ret < 0) { + ldout(s->cct, 10) << "init_from_header returned err=" << ret << dendl; + return ret; + } + + return RGWHandler_REST::init(store, s, cio); +} + +int RGWHandler_REST_IAM::authorize(const DoutPrefixProvider* dpp) +{ + return RGW_Auth_S3::authorize(dpp, store, auth_registry, s); +} + +int RGWHandler_REST_IAM::init_from_header(struct req_state* s, + int default_formatter, + bool configurable_format) +{ + string req; + string first; + + s->prot_flags = RGW_REST_IAM; + + const char *p, *req_name; + if (req_name = s->relative_uri.c_str(); *req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(); + + /* must be called after the args parsing */ + if (int ret = allocate_formatter(s, default_formatter, configurable_format); ret < 0) + return ret; + + if (*req_name != '/') + return 0; + + req_name++; + + if 
(!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } + + return 0; +} + +RGWHandler_REST* +RGWRESTMgr_IAM::get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + return new RGWHandler_REST_IAM(auth_registry); +} diff --git a/src/rgw/rgw_rest_iam.h b/src/rgw/rgw_rest_iam.h new file mode 100644 index 00000000..e9dbfcd0 --- /dev/null +++ b/src/rgw/rgw_rest_iam.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_IAM_H +#define CEPH_RGW_REST_IAM_H + +#include "rgw_auth.h" +#include "rgw_auth_filters.h" + +class RGWHandler_REST_IAM : public RGWHandler_REST { + const rgw::auth::StrategyRegistry& auth_registry; + const string& post_body; + RGWOp *op_post() override; + void rgw_iam_parse_input(); +public: + + static int init_from_header(struct req_state *s, int default_formatter, bool configurable_format); + + RGWHandler_REST_IAM(const rgw::auth::StrategyRegistry& auth_registry, const string& post_body="") + : RGWHandler_REST(), + auth_registry(auth_registry), + post_body(post_body) {} + ~RGWHandler_REST_IAM() override = default; + + int init(RGWRados *store, + struct req_state *s, + rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider* dpp) override; + int postauth_init() override { return 0; } +}; + +class RGWRESTMgr_IAM : public RGWRESTMgr { +public: + RGWRESTMgr_IAM() = default; + ~RGWRESTMgr_IAM() override = default; + + RGWRESTMgr *get_resource_mgr(struct req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry&, + const std::string&) override; +}; + +#endif /* CEPH_RGW_REST_STS_H */ + diff --git a/src/rgw/rgw_rest_log.cc 
b/src/rgw/rgw_rest_log.cc new file mode 100644 index 00000000..6daeca16 --- /dev/null +++ b/src/rgw/rgw_rest_log.cc @@ -0,0 +1,1060 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/ceph_json.h" +#include "common/strtol.h" +#include "rgw_rest.h" +#include "rgw_op.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_log.h" +#include "rgw_client_io.h" +#include "rgw_sync.h" +#include "rgw_data_sync.h" +#include "rgw_common.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define LOG_CLASS_LIST_MAX_ENTRIES (1000) +#define dout_subsys ceph_subsys_rgw + +static int parse_date_str(string& in, real_time& out) { + uint64_t epoch = 0; + uint64_t nsec = 0; + + if (!in.empty()) { + if (utime_t::parse_date(in, &epoch, &nsec) < 0) { + dout(5) << "Error parsing date " << in << dendl; + return -EINVAL; + } + } + out = utime_t(epoch, nsec).to_real_time(); + return 0; +} + +void RGWOp_MDLog_List::execute() { + string period = s->info.args.get("period"); + string shard = s->info.args.get("id"); + string max_entries_str = s->info.args.get("max-entries"); + string st = s->info.args.get("start-time"), + et = s->info.args.get("end-time"), + marker = s->info.args.get("marker"), + err; + real_time ut_st, + ut_et; + void *handle; + unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id " << shard << dendl; + http_ret = -EINVAL; + return; + } + + if (parse_date_str(st, 
ut_st) < 0) { + http_ret = -EINVAL; + return; + } + + if (parse_date_str(et, ut_et) < 0) { + http_ret = -EINVAL; + return; + } + + if (!max_entries_str.empty()) { + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing max-entries " << max_entries_str << dendl; + http_ret = -EINVAL; + return; + } + if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + } + } + + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id trying to use current" << dendl; + period = store->svc.zone->get_current_period_id(); + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id" << dendl; + http_ret = -EINVAL; + return; + } + } + + RGWMetadataLog meta_log{s->cct, store, period}; + + meta_log.init_list_entries(shard_id, ut_st, ut_et, marker, &handle); + + http_ret = meta_log.list_entries(handle, max_entries, entries, + &last_marker, &truncated); + + meta_log.complete_list_entries(handle); +} + +void RGWOp_MDLog_List::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret < 0) + return; + + s->formatter->open_object_section("log_entries"); + s->formatter->dump_string("marker", last_marker); + s->formatter->dump_bool("truncated", truncated); + { + s->formatter->open_array_section("entries"); + for (list::iterator iter = entries.begin(); + iter != entries.end(); ++iter) { + cls_log_entry& entry = *iter; + store->meta_mgr->dump_log_entry(entry, s->formatter); + flusher.flush(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_MDLog_Info::execute() { + num_objects = s->cct->_conf->rgw_md_log_max_shards; + period = store->meta_mgr->read_oldest_log_period(); + http_ret = period.get_error(); +} + +void RGWOp_MDLog_Info::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + s->formatter->open_object_section("mdlog"); + 
s->formatter->dump_unsigned("num_objects", num_objects); + if (period) { + s->formatter->dump_string("period", period.get_period().get_id()); + s->formatter->dump_unsigned("realm_epoch", period.get_epoch()); + } + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_MDLog_ShardInfo::execute() { + string period = s->info.args.get("period"); + string shard = s->info.args.get("id"); + string err; + + unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id " << shard << dendl; + http_ret = -EINVAL; + return; + } + + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id trying to use current" << dendl; + period = store->svc.zone->get_current_period_id(); + + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id" << dendl; + http_ret = -EINVAL; + return; + } + } + RGWMetadataLog meta_log{s->cct, store, period}; + + http_ret = meta_log.get_info(shard_id, &info); +} + +void RGWOp_MDLog_ShardInfo::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + encode_json("info", info, s->formatter); + flusher.flush(); +} + +void RGWOp_MDLog_Delete::execute() { + string st = s->info.args.get("start-time"), + et = s->info.args.get("end-time"), + start_marker = s->info.args.get("start-marker"), + end_marker = s->info.args.get("end-marker"), + period = s->info.args.get("period"), + shard = s->info.args.get("id"), + err; + real_time ut_st, + ut_et; + unsigned shard_id; + + http_ret = 0; + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id " << shard << dendl; + http_ret = -EINVAL; + return; + } + if (et.empty() && end_marker.empty()) { /* bounding end */ + http_ret = -EINVAL; + return; + } + + if (parse_date_str(st, ut_st) < 0) { + http_ret = -EINVAL; + return; + } + + if (parse_date_str(et, ut_et) < 0) { + http_ret = -EINVAL; + return; + } + + if (period.empty()) { + 
ldout(s->cct, 5) << "Missing period id trying to use current" << dendl; + period = store->svc.zone->get_current_period_id(); + + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id" << dendl; + http_ret = -EINVAL; + return; + } + } + RGWMetadataLog meta_log{s->cct, store, period}; + + http_ret = meta_log.trim(shard_id, ut_st, ut_et, start_marker, end_marker); +} + +void RGWOp_MDLog_Lock::execute() { + string period, shard_id_str, duration_str, locker_id, zone_id; + unsigned shard_id; + + http_ret = 0; + + period = s->info.args.get("period"); + shard_id_str = s->info.args.get("id"); + duration_str = s->info.args.get("length"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id trying to use current" << dendl; + period = store->svc.zone->get_current_period_id(); + } + + if (period.empty() || + shard_id_str.empty() || + (duration_str.empty()) || + locker_id.empty() || + zone_id.empty()) { + dout(5) << "Error invalid parameter list" << dendl; + http_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id param " << shard_id_str << dendl; + http_ret = -EINVAL; + return; + } + + RGWMetadataLog meta_log{s->cct, store, period}; + unsigned dur; + dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err); + if (!err.empty() || dur <= 0) { + dout(5) << "invalid length param " << duration_str << dendl; + http_ret = -EINVAL; + return; + } + http_ret = meta_log.lock_exclusive(shard_id, make_timespan(dur), zone_id, + locker_id); + if (http_ret == -EBUSY) + http_ret = -ERR_LOCKED; +} + +void RGWOp_MDLog_Unlock::execute() { + string period, shard_id_str, locker_id, zone_id; + unsigned shard_id; + + http_ret = 0; + + period = s->info.args.get("period"); + shard_id_str = s->info.args.get("id"); + locker_id = s->info.args.get("locker-id"); + zone_id = 
s->info.args.get("zone-id"); + + if (period.empty()) { + ldout(s->cct, 5) << "Missing period id trying to use current" << dendl; + period = store->svc.zone->get_current_period_id(); + } + + if (period.empty() || + shard_id_str.empty() || + locker_id.empty() || + zone_id.empty()) { + dout(5) << "Error invalid parameter list" << dendl; + http_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id param " << shard_id_str << dendl; + http_ret = -EINVAL; + return; + } + + RGWMetadataLog meta_log{s->cct, store, period}; + http_ret = meta_log.unlock(shard_id, zone_id, locker_id); +} + +void RGWOp_MDLog_Notify::execute() { +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + http_ret = r; + return; + } + + char* buf = data.c_str(); + ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl; + http_ret = r; + return; + } + + set updated_shards; + try { + decode_json_obj(updated_shards, &p); + } catch (JSONDecoder::err& err) { + ldout(s->cct, 0) << "ERROR: failed to decode JSON" << dendl; + http_ret = -EINVAL; + return; + } + + if (store->ctx()->_conf->subsys.should_gather()) { + for (set::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldout(s->cct, 20) << __func__ << "(): updated shard=" << *iter << dendl; + } + } + + store->wakeup_meta_sync_shards(updated_shards); + + http_ret = 0; +} + +void RGWOp_BILog_List::execute() { + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + marker = s->info.args.get("marker"), + max_entries_str = s->info.args.get("max-entries"), + bucket_instance = s->info.args.get("bucket-instance"); + RGWBucketInfo 
bucket_info; + unsigned max_entries; + + if (bucket_name.empty() && bucket_instance.empty()) { + dout(5) << "ERROR: neither bucket nor bucket instance specified" << dendl; + http_ret = -EINVAL; + return; + } + + int shard_id; + http_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bucket_instance, &shard_id); + if (http_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + http_ret = store->get_bucket_instance_info(*s->sysobj_ctx, bucket_instance, bucket_info, NULL, NULL); + if (http_ret < 0) { + dout(5) << "could not get bucket instance info for bucket instance id=" << bucket_instance << dendl; + return; + } + } else { /* !bucket_name.empty() */ + http_ret = store->get_bucket_info(*s->sysobj_ctx, tenant_name, bucket_name, bucket_info, NULL, NULL); + if (http_ret < 0) { + dout(5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + } + + bool truncated; + unsigned count = 0; + string err; + + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + send_response(); + do { + list entries; + int ret = store->list_bi_log_entries(bucket_info, shard_id, + marker, max_entries - count, + entries, &truncated); + if (ret < 0) { + dout(5) << "ERROR: list_bi_log_entries()" << dendl; + return; + } + + count += entries.size(); + + send_response(entries, marker); + } while (truncated && count < max_entries); + + send_response_end(); +} + +void RGWOp_BILog_List::send_response() { + if (sent_header) + return; + + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + sent_header = true; + + if (http_ret < 0) + return; + + s->formatter->open_array_section("entries"); +} + +void RGWOp_BILog_List::send_response(list& entries, string& marker) +{ + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_bi_log_entry& entry = *iter; + encode_json("entry", entry, s->formatter); + + marker = entry.id; + 
flusher.flush(); + } +} + +void RGWOp_BILog_List::send_response_end() { + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_BILog_Info::execute() { + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + bucket_instance = s->info.args.get("bucket-instance"); + RGWBucketInfo bucket_info; + + if (bucket_name.empty() && bucket_instance.empty()) { + dout(5) << "ERROR: neither bucket nor bucket instance specified" << dendl; + http_ret = -EINVAL; + return; + } + + int shard_id; + http_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bucket_instance, &shard_id); + if (http_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + http_ret = store->get_bucket_instance_info(*s->sysobj_ctx, bucket_instance, bucket_info, NULL, NULL); + if (http_ret < 0) { + dout(5) << "could not get bucket instance info for bucket instance id=" << bucket_instance << dendl; + return; + } + } else { /* !bucket_name.empty() */ + http_ret = store->get_bucket_info(*s->sysobj_ctx, tenant_name, bucket_name, bucket_info, NULL, NULL); + if (http_ret < 0) { + dout(5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + } + map stats; + int ret = store->get_bucket_stats(bucket_info, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped); + if (ret < 0 && ret != -ENOENT) { + http_ret = ret; + return; + } +} + +void RGWOp_BILog_Info::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret < 0) + return; + + s->formatter->open_object_section("info"); + encode_json("bucket_ver", bucket_ver, s->formatter); + encode_json("master_ver", master_ver, s->formatter); + encode_json("max_marker", max_marker, s->formatter); + encode_json("syncstopped", syncstopped, s->formatter); + s->formatter->close_section(); + + flusher.flush(); +} + +void RGWOp_BILog_Delete::execute() { + string tenant_name = s->info.args.get("tenant"), + bucket_name = 
s->info.args.get("bucket"), + start_marker = s->info.args.get("start-marker"), + end_marker = s->info.args.get("end-marker"), + bucket_instance = s->info.args.get("bucket-instance"); + + RGWBucketInfo bucket_info; + + http_ret = 0; + if ((bucket_name.empty() && bucket_instance.empty()) || + end_marker.empty()) { + dout(5) << "ERROR: one of bucket and bucket instance, and also end-marker is mandatory" << dendl; + http_ret = -EINVAL; + return; + } + + int shard_id; + http_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bucket_instance, &shard_id); + if (http_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + http_ret = store->get_bucket_instance_info(*s->sysobj_ctx, bucket_instance, bucket_info, NULL, NULL); + if (http_ret < 0) { + dout(5) << "could not get bucket instance info for bucket instance id=" << bucket_instance << dendl; + return; + } + } else { /* !bucket_name.empty() */ + http_ret = store->get_bucket_info(*s->sysobj_ctx, tenant_name, bucket_name, bucket_info, NULL, NULL); + if (http_ret < 0) { + dout(5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + } + http_ret = store->trim_bi_log_entries(bucket_info, shard_id, start_marker, end_marker); + if (http_ret < 0) { + dout(5) << "ERROR: trim_bi_log_entries() " << dendl; + } + return; +} + +void RGWOp_DATALog_List::execute() { + string shard = s->info.args.get("id"); + + string st = s->info.args.get("start-time"), + et = s->info.args.get("end-time"), + max_entries_str = s->info.args.get("max-entries"), + marker = s->info.args.get("marker"), + err; + real_time ut_st, + ut_et; + unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + s->info.args.get_bool("extra-info", &extra_info, false); + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id " << shard << dendl; + http_ret = -EINVAL; + return; + } + + if (parse_date_str(st, ut_st) < 0) { + http_ret = -EINVAL; + return; + } + + 
if (parse_date_str(et, ut_et) < 0) { + http_ret = -EINVAL; + return; + } + + if (!max_entries_str.empty()) { + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing max-entries " << max_entries_str << dendl; + http_ret = -EINVAL; + return; + } + if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + } + } + + // Note that last_marker is updated to be the marker of the last + // entry listed + http_ret = store->data_log->list_entries(shard_id, ut_st, ut_et, + max_entries, entries, marker, + &last_marker, &truncated); +} + +void RGWOp_DATALog_List::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret < 0) + return; + + s->formatter->open_object_section("log_entries"); + s->formatter->dump_string("marker", last_marker); + s->formatter->dump_bool("truncated", truncated); + { + s->formatter->open_array_section("entries"); + for (list::iterator iter = entries.begin(); + iter != entries.end(); ++iter) { + rgw_data_change_log_entry& entry = *iter; + if (!extra_info) { + encode_json("entry", entry.entry, s->formatter); + } else { + encode_json("entry", entry, s->formatter); + } + flusher.flush(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + flusher.flush(); +} + + +void RGWOp_DATALog_Info::execute() { + num_objects = s->cct->_conf->rgw_data_log_num_shards; + http_ret = 0; +} + +void RGWOp_DATALog_Info::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + s->formatter->open_object_section("num_objects"); + s->formatter->dump_unsigned("num_objects", num_objects); + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_DATALog_ShardInfo::execute() { + string shard = s->info.args.get("id"); + string err; + + unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id " << 
shard << dendl; + http_ret = -EINVAL; + return; + } + + http_ret = store->data_log->get_info(shard_id, &info); +} + +void RGWOp_DATALog_ShardInfo::send_response() { + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + encode_json("info", info, s->formatter); + flusher.flush(); +} + +void RGWOp_DATALog_Lock::execute() { + string shard_id_str, duration_str, locker_id, zone_id; + unsigned shard_id; + + http_ret = 0; + + shard_id_str = s->info.args.get("id"); + duration_str = s->info.args.get("length"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (shard_id_str.empty() || + (duration_str.empty()) || + locker_id.empty() || + zone_id.empty()) { + dout(5) << "Error invalid parameter list" << dendl; + http_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id param " << shard_id_str << dendl; + http_ret = -EINVAL; + return; + } + + unsigned dur; + dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err); + if (!err.empty() || dur <= 0) { + dout(5) << "invalid length param " << duration_str << dendl; + http_ret = -EINVAL; + return; + } + http_ret = store->data_log->lock_exclusive(shard_id, make_timespan(dur), zone_id, locker_id); + if (http_ret == -EBUSY) + http_ret = -ERR_LOCKED; +} + +void RGWOp_DATALog_Unlock::execute() { + string shard_id_str, locker_id, zone_id; + unsigned shard_id; + + http_ret = 0; + + shard_id_str = s->info.args.get("id"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (shard_id_str.empty() || + locker_id.empty() || + zone_id.empty()) { + dout(5) << "Error invalid parameter list" << dendl; + http_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id param " << shard_id_str << dendl; + http_ret = 
-EINVAL; + return; + } + + http_ret = store->data_log->unlock(shard_id, zone_id, locker_id); +} + +void RGWOp_DATALog_Notify::execute() { + string source_zone = s->info.args.get("source-zone"); +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + http_ret = r; + return; + } + + char* buf = data.c_str(); + ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl; + http_ret = r; + return; + } + + map > updated_shards; + try { + decode_json_obj(updated_shards, &p); + } catch (JSONDecoder::err& err) { + ldout(s->cct, 0) << "ERROR: failed to decode JSON" << dendl; + http_ret = -EINVAL; + return; + } + + if (store->ctx()->_conf->subsys.should_gather()) { + for (map >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldout(s->cct, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + set& keys = iter->second; + for (set::iterator kiter = keys.begin(); kiter != keys.end(); ++kiter) { + ldout(s->cct, 20) << __func__ << "(): modified key=" << *kiter << dendl; + } + } + } + + store->wakeup_data_sync_shards(source_zone, updated_shards); + + http_ret = 0; +} + +void RGWOp_DATALog_Delete::execute() { + string st = s->info.args.get("start-time"), + et = s->info.args.get("end-time"), + start_marker = s->info.args.get("start-marker"), + end_marker = s->info.args.get("end-marker"), + shard = s->info.args.get("id"), + err; + real_time ut_st, + ut_et; + unsigned shard_id; + + http_ret = 0; + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + dout(5) << "Error parsing shard_id " << shard << dendl; + http_ret = -EINVAL; + return; + } + if (et.empty() && end_marker.empty()) { /* bounding end */ + http_ret = -EINVAL; + return; + } + + if (parse_date_str(st, ut_st) < 
0) { + http_ret = -EINVAL; + return; + } + + if (parse_date_str(et, ut_et) < 0) { + http_ret = -EINVAL; + return; + } + + http_ret = store->data_log->trim_entries(shard_id, ut_st, ut_et, start_marker, end_marker); +} + +// not in header to avoid pulling in rgw_sync.h +class RGWOp_MDLog_Status : public RGWRESTOp { + rgw_meta_sync_status status; +public: + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { return "get_metadata_log_status"; } +}; + +void RGWOp_MDLog_Status::execute() +{ + auto sync = store->get_meta_sync_manager(); + if (sync == nullptr) { + ldout(s->cct, 1) << "no sync manager" << dendl; + http_ret = -ENOENT; + return; + } + http_ret = sync->read_sync_status(&status); +} + +void RGWOp_MDLog_Status::send_response() +{ + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret >= 0) { + encode_json("status", status, s->formatter); + } + flusher.flush(); +} + +// not in header to avoid pulling in rgw_data_sync.h +class RGWOp_BILog_Status : public RGWRESTOp { + std::vector status; +public: + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { return "get_bucket_index_log_status"; } +}; + +void RGWOp_BILog_Status::execute() +{ + const auto source_zone = s->info.args.get("source-zone"); + const auto key = s->info.args.get("bucket"); + if (key.empty()) { + ldout(s->cct, 4) << "no 'bucket' provided" << dendl; + http_ret = -EINVAL; + return; + } + + rgw_bucket bucket; + int shard_id{-1}; // unused + http_ret = rgw_bucket_parse_bucket_key(s->cct, key, &bucket, &shard_id); + if (http_ret < 0) { + 
ldout(s->cct, 4) << "no 'bucket' provided" << dendl; + http_ret = -EINVAL; + return; + } + + // read the bucket instance info for num_shards + auto ctx = store->svc.sysobj->init_obj_ctx(); + RGWBucketInfo info; + http_ret = store->get_bucket_instance_info(ctx, bucket, info, nullptr, nullptr); + if (http_ret < 0) { + ldout(s->cct, 4) << "failed to read bucket info: " << cpp_strerror(http_ret) << dendl; + return; + } + http_ret = rgw_bucket_sync_status(this, store, source_zone, info, &status); +} + +void RGWOp_BILog_Status::send_response() +{ + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret >= 0) { + encode_json("status", status, s->formatter); + } + flusher.flush(); +} + +// not in header to avoid pulling in rgw_data_sync.h +class RGWOp_DATALog_Status : public RGWRESTOp { + rgw_data_sync_status status; +public: + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override ; + void send_response() override; + const char* name() const override { return "get_data_changes_log_status"; } +}; + +void RGWOp_DATALog_Status::execute() +{ + const auto source_zone = s->info.args.get("source-zone"); + auto sync = store->get_data_sync_manager(source_zone); + if (sync == nullptr) { + ldout(s->cct, 1) << "no sync manager for source-zone " << source_zone << dendl; + http_ret = -ENOENT; + return; + } + http_ret = sync->read_sync_status(&status); +} + +void RGWOp_DATALog_Status::send_response() +{ + set_req_state_err(s, http_ret); + dump_errno(s); + end_header(s); + + if (http_ret >= 0) { + encode_json("status", status, s->formatter); + } + flusher.flush(); +} + + +RGWOp *RGWHandler_Log::op_get() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (!exists) { + return NULL; + } + + if (type.compare("metadata") == 0) { + if (s->info.args.exists("id")) { + if (s->info.args.exists("info")) 
{ + return new RGWOp_MDLog_ShardInfo; + } else { + return new RGWOp_MDLog_List; + } + } else if (s->info.args.exists("status")) { + return new RGWOp_MDLog_Status; + } else { + return new RGWOp_MDLog_Info; + } + } else if (type.compare("bucket-index") == 0) { + if (s->info.args.exists("info")) { + return new RGWOp_BILog_Info; + } else if (s->info.args.exists("status")) { + return new RGWOp_BILog_Status; + } else { + return new RGWOp_BILog_List; + } + } else if (type.compare("data") == 0) { + if (s->info.args.exists("id")) { + if (s->info.args.exists("info")) { + return new RGWOp_DATALog_ShardInfo; + } else { + return new RGWOp_DATALog_List; + } + } else if (s->info.args.exists("status")) { + return new RGWOp_DATALog_Status; + } else { + return new RGWOp_DATALog_Info; + } + } + return NULL; +} + +RGWOp *RGWHandler_Log::op_delete() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (!exists) { + return NULL; + } + + if (type.compare("metadata") == 0) + return new RGWOp_MDLog_Delete; + else if (type.compare("bucket-index") == 0) + return new RGWOp_BILog_Delete; + else if (type.compare("data") == 0) + return new RGWOp_DATALog_Delete; + return NULL; +} + +RGWOp *RGWHandler_Log::op_post() { + bool exists; + string type = s->info.args.get("type", &exists); + + if (!exists) { + return NULL; + } + + if (type.compare("metadata") == 0) { + if (s->info.args.exists("lock")) + return new RGWOp_MDLog_Lock; + else if (s->info.args.exists("unlock")) + return new RGWOp_MDLog_Unlock; + else if (s->info.args.exists("notify")) + return new RGWOp_MDLog_Notify; + } else if (type.compare("data") == 0) { + if (s->info.args.exists("lock")) + return new RGWOp_DATALog_Lock; + else if (s->info.args.exists("unlock")) + return new RGWOp_DATALog_Unlock; + else if (s->info.args.exists("notify")) + return new RGWOp_DATALog_Notify; + } + return NULL; +} + diff --git a/src/rgw/rgw_rest_log.h b/src/rgw/rgw_rest_log.h new file mode 100644 index 00000000..d5fbf814 --- /dev/null 
+++ b/src/rgw/rgw_rest_log.h @@ -0,0 +1,336 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef RGW_REST_LOG_H +#define RGW_REST_LOG_H + +#include "rgw_metadata.h" + +class RGWOp_BILog_List : public RGWRESTOp { + bool sent_header; +public: + RGWOp_BILog_List() : sent_header(false) {} + ~RGWOp_BILog_List() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void send_response() override; + virtual void send_response(list& entries, string& marker); + virtual void send_response_end(); + void execute() override; + const char* name() const override { + return "list_bucket_index_log"; + } +}; + +class RGWOp_BILog_Info : public RGWRESTOp { + string bucket_ver; + string master_ver; + string max_marker; + bool syncstopped; +public: + RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {} + ~RGWOp_BILog_Info() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void send_response() override; + void execute() override; + const char* name() const override { + return "bucket_index_log_info"; + } +}; + +class RGWOp_BILog_Delete : public RGWRESTOp { +public: + RGWOp_BILog_Delete() {} + ~RGWOp_BILog_Delete() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("bilog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "trim_bucket_index_log"; + 
} +}; + +class RGWOp_MDLog_List : public RGWRESTOp { + list entries; + string last_marker; + bool truncated; +public: + RGWOp_MDLog_List() : truncated(false) {} + ~RGWOp_MDLog_List() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + return "list_metadata_log"; + } +}; + +class RGWOp_MDLog_Info : public RGWRESTOp { + unsigned num_objects; + RGWPeriodHistory::Cursor period; +public: + RGWOp_MDLog_Info() : num_objects(0) {} + ~RGWOp_MDLog_Info() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + return "get_metadata_log_info"; + } +}; + +class RGWOp_MDLog_ShardInfo : public RGWRESTOp { + RGWMetadataLogInfo info; +public: + RGWOp_MDLog_ShardInfo() {} + ~RGWOp_MDLog_ShardInfo() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + return "get_metadata_log_shard_info"; + } +}; + +class RGWOp_MDLog_Lock : public RGWRESTOp { +public: + RGWOp_MDLog_Lock() {} + ~RGWOp_MDLog_Lock() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "lock_mdlog_object"; + } +}; + +class RGWOp_MDLog_Unlock : public RGWRESTOp { +public: + RGWOp_MDLog_Unlock() {} + ~RGWOp_MDLog_Unlock() override {} + + int check_caps(RGWUserCaps& caps) override { + return 
caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "unlock_mdlog_object"; + } +}; + +class RGWOp_MDLog_Notify : public RGWRESTOp { +public: + RGWOp_MDLog_Notify() {} + ~RGWOp_MDLog_Notify() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "mdlog_notify"; + } +}; + +class RGWOp_MDLog_Delete : public RGWRESTOp { +public: + RGWOp_MDLog_Delete() {} + ~RGWOp_MDLog_Delete() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "trim_metadata_log"; + } +}; + +class RGWOp_DATALog_List : public RGWRESTOp { + list entries; + string last_marker; + bool truncated; + bool extra_info; +public: + RGWOp_DATALog_List() : truncated(false), extra_info(false) {} + ~RGWOp_DATALog_List() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + return "list_data_changes_log"; + } +}; + +class RGWOp_DATALog_Info : public RGWRESTOp { + unsigned num_objects; +public: + RGWOp_DATALog_Info() : num_objects(0) {} + ~RGWOp_DATALog_Info() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + return "get_data_changes_log_info"; + } +}; + +class RGWOp_DATALog_ShardInfo : public RGWRESTOp { + RGWDataChangesLogInfo info; +public: + RGWOp_DATALog_ShardInfo() {} + ~RGWOp_DATALog_ShardInfo() override {} + 
+ int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { + return "get_data_changes_log_shard_info"; + } +}; + +class RGWOp_DATALog_Lock : public RGWRESTOp { +public: + RGWOp_DATALog_Lock() {} + ~RGWOp_DATALog_Lock() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "lock_datalog_object"; + } +}; + +class RGWOp_DATALog_Unlock : public RGWRESTOp { +public: + RGWOp_DATALog_Unlock() {} + ~RGWOp_DATALog_Unlock() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "unlock_datalog_object"; + } +}; + +class RGWOp_DATALog_Notify : public RGWRESTOp { +public: + RGWOp_DATALog_Notify() {} + ~RGWOp_DATALog_Notify() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "datalog_notify"; + } +}; + +class RGWOp_DATALog_Delete : public RGWRESTOp { +public: + RGWOp_DATALog_Delete() {} + ~RGWOp_DATALog_Delete() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "trim_data_changes_log"; + } +}; + +class RGWHandler_Log : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + + int read_permissions(RGWOp*) override { + return 0; + } +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Log() override = default; +}; + +class RGWRESTMgr_Log 
: public RGWRESTMgr { +public: + RGWRESTMgr_Log() = default; + ~RGWRESTMgr_Log() override = default; + + RGWHandler_REST* get_handler(struct req_state* const, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefixs) override { + return new RGWHandler_Log(auth_registry); + } +}; + +#endif /* RGW_REST_LOG_H */ diff --git a/src/rgw/rgw_rest_metadata.cc b/src/rgw/rgw_rest_metadata.cc new file mode 100644 index 00000000..0f81d54c --- /dev/null +++ b/src/rgw/rgw_rest_metadata.cc @@ -0,0 +1,363 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/page.h" + +#include "rgw_rest.h" +#include "rgw_op.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_metadata.h" +#include "rgw_client_io.h" +#include "common/errno.h" +#include "common/strtol.h" +#include "rgw/rgw_b64.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +static inline void frame_metadata_key(req_state *s, string& out) { + bool exists; + string key = s->info.args.get("key", &exists); + + string section; + if (!s->init_state.url_bucket.empty()) { + section = s->init_state.url_bucket; + } else { + section = key; + key.clear(); + } + + out = section; + + if (!key.empty()) { + out += string(":") + key; + } +} + +void RGWOp_Metadata_Get::execute() { + string metadata_key; + + frame_metadata_key(s, metadata_key); + + /* Get keys */ + http_ret = store->meta_mgr->get(metadata_key, s->formatter); + if (http_ret < 0) { + dout(5) << "ERROR: can't get key: " << cpp_strerror(http_ret) << dendl; + return; + } + + http_ret = 0; +} + +void 
RGWOp_Metadata_Get_Myself::execute() {
+  // Inject the caller's own user id as the "key" argument and then reuse the
+  // generic metadata GET implementation.
+  string owner_id;
+
+  owner_id = s->owner.get_id().to_str();
+  s->info.args.append("key", owner_id);
+
+  return RGWOp_Metadata_Get::execute();
+}
+
+/*
+ * List metadata keys of the section selected by the request.
+ *
+ * Request arguments read here:
+ *   marker      - optional base64-encoded continuation marker; a marker that
+ *                 fails to decode is treated as empty.
+ *   max-entries - optional; when present the "extended" response format
+ *                 (result object with truncated/count/marker) is produced,
+ *                 otherwise the legacy bare "keys" array is emitted.
+ *
+ * The result is written to s->formatter and the status to http_ret.
+ */
+void RGWOp_Metadata_List::execute() {
+  string marker;
+  ldout(s->cct, 16) << __func__
+		    << " raw marker " << s->info.args.get("marker")
+		    << dendl;
+
+  try {
+    marker = s->info.args.get("marker");
+    if (!marker.empty()) {
+      marker = rgw::from_base64(marker);
+    }
+    ldout(s->cct, 16) << __func__
+	      << " marker " << marker << dendl;
+  } catch (...) {
+    marker = std::string("");
+  }
+
+  bool max_entries_specified;
+  string max_entries_str =
+    s->info.args.get("max-entries", &max_entries_specified);
+
+  bool extended_response = (max_entries_specified); /* for backward compatibility, if max-entries is not specified
+                                                    we will send the old response format */
+  uint64_t max_entries = 0;
+
+  if (max_entries_specified) {
+    string err;
+    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      dout(5) << "Error parsing max-entries " << max_entries_str << dendl;
+      http_ret = -EINVAL;
+      return;
+    }
+  }
+
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+  /* List keys */
+  void *handle;
+  int max = 1000;
+
+  /* example markers:
+     marker = "3:b55a9110:root::bu_9:head";
+     marker = "3:b9a8b2a6:root::sorry_janefonda_890:head";
+     marker = "3:bf885d8f:root::sorry_janefonda_665:head";
+  */
+
+  http_ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle);
+  if (http_ret < 0) {
+    dout(5) << "ERROR: can't get key: " << cpp_strerror(http_ret) << dendl;
+    return;
+  }
+
+  // initialised so the loop condition never reads an indeterminate value
+  bool truncated = false;
+  uint64_t count = 0;
+
+  if (extended_response) {
+    s->formatter->open_object_section("result");
+  }
+
+  s->formatter->open_array_section("keys");
+
+  uint64_t left;
+  do {
+    list<string> keys;
+    left = (max_entries_specified ? max_entries - count : max);
+    http_ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated);
+    if (http_ret < 0) {
+      dout(5) << "ERROR: lists_keys_next(): " << cpp_strerror(http_ret)
+	      << dendl;
+      // release the listing handle obtained from list_keys_init(); returning
+      // without this leaked it on every failed listing
+      store->meta_mgr->list_keys_complete(handle);
+      return;
+    }
+
+    for (list<string>::iterator iter = keys.begin(); iter != keys.end();
+	 ++iter) {
+      s->formatter->dump_string("key", *iter);
+      ++count;
+    }
+
+  } while (truncated && left > 0);
+
+  s->formatter->close_section();
+
+  if (extended_response) {
+    encode_json("truncated", truncated, s->formatter);
+    encode_json("count", count, s->formatter);
+    if (truncated) {
+      string esc_marker =
+	rgw::to_base64(store->meta_mgr->get_marker(handle));
+      encode_json("marker", esc_marker, s->formatter);
+    }
+    s->formatter->close_section();
+  }
+  store->meta_mgr->list_keys_complete(handle);
+
+  http_ret = 0;
+}
+
+/*
+ * Read the request body into bl.
+ *
+ * With a non-zero Content-Length the body is read in one recv_body() call;
+ * otherwise the request must use "chunked" transfer encoding and is read in
+ * CEPH_PAGE_SIZE pieces until a short read.
+ *
+ * Returns 0 on success, -ENOMEM, -ERR_LENGTH_REQUIRED, or the negative
+ * value returned by recv_body().
+ */
+int RGWOp_Metadata_Put::get_data(bufferlist& bl) {
+  size_t cl = 0;
+  char *data;
+  int read_len;
+
+  if (s->length)
+    cl = atoll(s->length);
+  if (cl) {
+    data = (char *)malloc(cl + 1);
+    if (!data) {
+       return -ENOMEM;
+    }
+    read_len = recv_body(s, data, cl);
+    if (cl != (size_t)read_len) {
+      dout(10) << "recv_body incomplete" << dendl;
+    }
+    if (read_len < 0) {
+      free(data);
+      return read_len;
+    }
+    bl.append(data, read_len);
+  } else {
+    int chunk_size = CEPH_PAGE_SIZE;
+    const char *enc = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    if (!enc || strcmp(enc, "chunked")) {
+      return -ERR_LENGTH_REQUIRED;
+    }
+    data = (char *)malloc(chunk_size);
+    if (!data) {
+      return -ENOMEM;
+    }
+    do {
+      read_len = recv_body(s, data, chunk_size);
+      if (read_len < 0) {
+	free(data);
+	return read_len;
+      }
+      bl.append(data, read_len);
+    } while (read_len == chunk_size);
+  }
+
+  free(data);
+  return 0;
+}
+
+void RGWOp_Metadata_Put::execute() {
+  bufferlist bl;
+  string metadata_key;
+
+  http_ret = get_data(bl);
+  if (http_ret < 0) {
+    return;
+  }
+
+  http_ret = do_aws4_auth_completion();
+  if (http_ret < 0) {
+    return;
+  }
+
+  frame_metadata_key(s, metadata_key);
+
+
RGWMetadataHandler::sync_type_t sync_type = RGWMetadataHandler::APPLY_ALWAYS; + + bool mode_exists = false; + string mode_string = s->info.args.get("update-type", &mode_exists); + if (mode_exists) { + bool parsed = RGWMetadataHandler::string_to_sync_type(mode_string, + sync_type); + if (!parsed) { + http_ret = -EINVAL; + return; + } + } + + http_ret = store->meta_mgr->put(metadata_key, bl, sync_type, + &ondisk_version); + if (http_ret < 0) { + dout(5) << "ERROR: can't put key: " << cpp_strerror(http_ret) << dendl; + return; + } + // translate internal codes into return header + if (http_ret == STATUS_NO_APPLY) + update_status = "skipped"; + else if (http_ret == STATUS_APPLIED) + update_status = "applied"; +} + +void RGWOp_Metadata_Put::send_response() { + int http_return_code = http_ret; + if ((http_ret == STATUS_NO_APPLY) || (http_ret == STATUS_APPLIED)) + http_return_code = STATUS_NO_CONTENT; + set_req_state_err(s, http_return_code); + dump_errno(s); + stringstream ver_stream; + ver_stream << "ver:" << ondisk_version.ver + <<",tag:" << ondisk_version.tag; + dump_header_if_nonempty(s, "RGWX_UPDATE_STATUS", update_status); + dump_header_if_nonempty(s, "RGWX_UPDATE_VERSION", ver_stream.str()); + end_header(s); +} + +void RGWOp_Metadata_Delete::execute() { + string metadata_key; + + frame_metadata_key(s, metadata_key); + http_ret = store->meta_mgr->remove(metadata_key); + if (http_ret < 0) { + dout(5) << "ERROR: can't remove key: " << cpp_strerror(http_ret) << dendl; + return; + } + http_ret = 0; +} + +void RGWOp_Metadata_Lock::execute() { + string duration_str, lock_id; + string metadata_key; + + frame_metadata_key(s, metadata_key); + + http_ret = 0; + + duration_str = s->info.args.get("length"); + lock_id = s->info.args.get("lock_id"); + + if ((!s->info.args.exists("key")) || + (duration_str.empty()) || + lock_id.empty()) { + dout(5) << "Error invalid parameter list" << dendl; + http_ret = -EINVAL; + return; + } + + int dur; + string err; + + dur = 
strict_strtol(duration_str.c_str(), 10, &err); + if (!err.empty() || dur <= 0) { + dout(5) << "invalid length param " << duration_str << dendl; + http_ret = -EINVAL; + return; + } + http_ret = store->meta_mgr->lock_exclusive(metadata_key, make_timespan(dur), lock_id); + if (http_ret == -EBUSY) + http_ret = -ERR_LOCKED; +} + +void RGWOp_Metadata_Unlock::execute() { + string lock_id; + string metadata_key; + + frame_metadata_key(s, metadata_key); + + http_ret = 0; + + lock_id = s->info.args.get("lock_id"); + + if ((!s->info.args.exists("key")) || + lock_id.empty()) { + dout(5) << "Error invalid parameter list" << dendl; + http_ret = -EINVAL; + return; + } + + http_ret = store->meta_mgr->unlock(metadata_key, lock_id); +} + +RGWOp *RGWHandler_Metadata::op_get() { + if (s->info.args.exists("myself")) + return new RGWOp_Metadata_Get_Myself; + if (s->info.args.exists("key")) + return new RGWOp_Metadata_Get; + else + return new RGWOp_Metadata_List; +} + +RGWOp *RGWHandler_Metadata::op_put() { + return new RGWOp_Metadata_Put; +} + +RGWOp *RGWHandler_Metadata::op_delete() { + return new RGWOp_Metadata_Delete; +} + +RGWOp *RGWHandler_Metadata::op_post() { + if (s->info.args.exists("lock")) + return new RGWOp_Metadata_Lock; + else if (s->info.args.exists("unlock")) + return new RGWOp_Metadata_Unlock; + + return NULL; +} diff --git a/src/rgw/rgw_rest_metadata.h b/src/rgw/rgw_rest_metadata.h new file mode 100644 index 00000000..728813c7 --- /dev/null +++ b/src/rgw/rgw_rest_metadata.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef RGW_REST_METADATA_H +#define RGW_REST_METADATA_H + +class RGWOp_Metadata_List : public RGWRESTOp { +public: + RGWOp_Metadata_List() {} + ~RGWOp_Metadata_List() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("metadata", RGW_CAP_READ); + } + void execute() override; + const char* name() const override { return "list_metadata"; } +}; + +class RGWOp_Metadata_Get : public RGWRESTOp { +public: + RGWOp_Metadata_Get() {} + ~RGWOp_Metadata_Get() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("metadata", RGW_CAP_READ); + } + void execute() override; + const char* name() const override { return "get_metadata"; } +}; + +class RGWOp_Metadata_Get_Myself : public RGWOp_Metadata_Get { +public: + RGWOp_Metadata_Get_Myself() {} + ~RGWOp_Metadata_Get_Myself() override {} + + void execute() override; +}; + +class RGWOp_Metadata_Put : public RGWRESTOp { + int get_data(bufferlist& bl); + string update_status; + obj_version ondisk_version; +public: + RGWOp_Metadata_Put() {} + ~RGWOp_Metadata_Put() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("metadata", RGW_CAP_WRITE); + } + void execute() override; + void send_response() override; + const char* name() const override { return "set_metadata"; } + RGWOpType get_type() override { return RGW_OP_ADMIN_SET_METADATA; } +}; + +class RGWOp_Metadata_Delete : public RGWRESTOp { +public: + RGWOp_Metadata_Delete() {} + ~RGWOp_Metadata_Delete() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("metadata", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { return "remove_metadata"; } +}; + +class RGWOp_Metadata_Lock : public RGWRESTOp { +public: + RGWOp_Metadata_Lock() {} + ~RGWOp_Metadata_Lock() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("metadata", RGW_CAP_WRITE); + } + void execute() override; + const char* 
name() const override { + return "lock_metadata_object"; + } +}; + +class RGWOp_Metadata_Unlock : public RGWRESTOp { +public: + RGWOp_Metadata_Unlock() {} + ~RGWOp_Metadata_Unlock() override {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("metadata", RGW_CAP_WRITE); + } + void execute() override; + const char* name() const override { + return "unlock_metadata_object"; + } +}; + +class RGWHandler_Metadata : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + + int read_permissions(RGWOp*) override { + return 0; + } +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Metadata() override = default; +}; + +class RGWRESTMgr_Metadata : public RGWRESTMgr { +public: + RGWRESTMgr_Metadata() = default; + ~RGWRESTMgr_Metadata() override = default; + + RGWHandler_REST* get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override { + return new RGWHandler_Metadata(auth_registry); + } +}; + +#endif /* RGW_REST_METADATA_H */ diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc new file mode 100644 index 00000000..de4babd4 --- /dev/null +++ b/src/rgw/rgw_rest_pubsub.cc @@ -0,0 +1,729 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include "rgw_rest_pubsub_common.h" +#include "rgw_rest_pubsub.h" +#include "rgw_pubsub_push.h" +#include "rgw_pubsub.h" +#include "rgw_sync_module_pubsub.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_arn.h" +#include "rgw_auth_s3.h" +#include "services/svc_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + + +// command (AWS compliant): +// POST +// Action=CreateTopic&Name=[&push-endpoint=[&=]] +class RGWPSCreateTopic_ObjStore_AWS : public 
RGWPSCreateTopicOp { +public: + int get_params() override { + topic_name = s->info.args.get("Name"); + if (topic_name.empty()) { + ldout(s->cct, 1) << "CreateTopic Action 'Name' argument is missing" << dendl; + return -EINVAL; + } + + opaque_data = s->info.args.get("OpaqueData"); + + dest.push_endpoint = s->info.args.get("push-endpoint"); + + if (!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) { + return -EINVAL; + } + for (const auto param : s->info.args.get_params()) { + if (param.first == "Action" || param.first == "Name" || param.first == "PayloadHash") { + continue; + } + dest.push_endpoint_args.append(param.first+"="+param.second+"&"); + } + + if (!dest.push_endpoint_args.empty()) { + // remove last separator + dest.push_endpoint_args.pop_back(); + } + + // dest object only stores endpoint info + // bucket to store events/records will be set only when subscription is created + dest.bucket_name = ""; + dest.oid_prefix = ""; + dest.arn_topic = topic_name; + // the topic ARN will be sent in the reply + const rgw::ARN arn(rgw::Partition::aws, rgw::Service::sns, + store->svc.zone->get_zonegroup().get_name(), + s->user->user_id.tenant, topic_name); + topic_arn = arn.to_string(); + return 0; + } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section_in_ns("CreateTopicResponse", "https://sns.amazonaws.com/doc/2010-03-31/"); + f->open_object_section("CreateTopicResult"); + encode_xml("TopicArn", topic_arn, f); + f->close_section(); + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); + f->close_section(); + rgw_flush_formatter_and_reset(s, f); + } +}; + +// command (AWS compliant): +// POST +// Action=ListTopics +class RGWPSListTopics_ObjStore_AWS : public RGWPSListTopicsOp { +public: + void send_response() 
override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section_in_ns("ListTopicsResponse", "https://sns.amazonaws.com/doc/2010-03-31/"); + f->open_object_section("ListTopicsResult"); + encode_xml("Topics", result, f); + f->close_section(); + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); + f->close_section(); + rgw_flush_formatter_and_reset(s, f); + } +}; + +// command (extension to AWS): +// POST +// Action=GetTopic&TopicArn= +class RGWPSGetTopic_ObjStore_AWS : public RGWPSGetTopicOp { +public: + int get_params() override { + const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn"))); + + if (!topic_arn || topic_arn->resource.empty()) { + ldout(s->cct, 1) << "GetTopic Action 'TopicArn' argument is missing or invalid" << dendl; + return -EINVAL; + } + + topic_name = topic_arn->resource; + return 0; + } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + const auto f = s->formatter; + f->open_object_section("GetTopicResponse"); + f->open_object_section("GetTopicResult"); + encode_xml("Topic", result.topic, f); + f->close_section(); + f->open_object_section("ResponseMetadata"); + encode_xml("RequestId", s->req_id, f); + f->close_section(); + f->close_section(); + rgw_flush_formatter_and_reset(s, f); + } +}; + +// command (AWS compliant): +// POST +// Action=DeleteTopic&TopicArn= +class RGWPSDeleteTopic_ObjStore_AWS : public RGWPSDeleteTopicOp { +public: + int get_params() override { + const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn"))); + + if (!topic_arn || topic_arn->resource.empty()) { + ldout(s->cct, 1) << "DeleteTopic Action 'TopicArn' argument is missing or invalid" << dendl; + return 
-EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+    return 0;
+  }
+
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("DeleteTopicResponse", "https://sns.amazonaws.com/doc/2010-03-31/");
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f);
+    f->close_section();
+    f->close_section();
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+namespace {
+// utility classes and functions for handling parameters with the following format:
+// Attributes.entry.{N}.{key|value}={VALUE}
+// N - any unsigned number
+// VALUE - url encoded string
+
+// and Attribute is holding key and value
+// ctor and set are done according to the "type" argument
+// if type is not "key" or "value" its a no-op
+class Attribute {
+  std::string key;
+  std::string value;
+public:
+  Attribute(const std::string& type, const std::string& key_or_value) {
+    set(type, key_or_value);
+  }
+  void set(const std::string& type, const std::string& key_or_value) {
+    if (type == "key") {
+      key = key_or_value;
+    } else if (type == "value") {
+      value = key_or_value;
+    }
+  }
+  const std::string& get_key() const { return key; }
+  const std::string& get_value() const { return value; }
+};
+
+using AttributeMap = std::map<unsigned, Attribute>;
+
+// aggregate the attributes into a map
+// the key and value are associated by the index (N)
+// no assumptions are made on the order in which these parameters are added
+//
+// input is one raw "name=value" token of the POST body; anything that does
+// not have the exact shape Attributes.entry.{N}.{key|value}={VALUE} is
+// silently ignored. The input comes straight from the client, so every
+// step is checked before the token iterator is dereferenced.
+void update_attribute_map(const std::string& input, AttributeMap& map) {
+  const boost::char_separator<char> sep(".");
+  const boost::tokenizer tokens(input, sep);
+  auto token = tokens.begin();
+  if (token == tokens.end() || *token != "Attributes") {
+    return;
+  }
+  ++token;
+
+  if (token == tokens.end() || *token != "entry") {
+    return;
+  }
+  ++token;
+
+  if (token == tokens.end()) {
+    return;
+  }
+  unsigned idx;
+  try {
+    idx = std::stoul(*token);
+  } catch (const std::invalid_argument&) {
+    return;
+  } catch (const std::out_of_range&) {
+    // index too large for unsigned long: malformed request, not a crash
+    return;
+  }
+  ++token;
+
+  std::string key_or_value = "";
+  // get the rest of the string regardless of dots
+  // this is to allow dots in the value
+  while (token != tokens.end()) {
+    key_or_value.append(*token+".");
+    ++token;
+  }
+  if (key_or_value.empty()) {
+    // nothing followed the index (e.g. "Attributes.entry.5=x");
+    // pop_back() on an empty string would be undefined behaviour
+    return;
+  }
+  // remove last separator
+  key_or_value.pop_back();
+
+  auto pos = key_or_value.find("=");
+  if (pos != string::npos) {
+    const auto key_or_value_lhs = key_or_value.substr(0, pos);
+    const auto key_or_value_rhs = url_decode(key_or_value.substr(pos + 1, key_or_value.size() - 1));
+    const auto map_it = map.find(idx);
+    if (map_it == map.end()) {
+      // new entry
+      map.emplace(std::make_pair(idx, Attribute(key_or_value_lhs, key_or_value_rhs)));
+    } else {
+      // existing entry
+      map_it->second.set(key_or_value_lhs, key_or_value_rhs);
+    }
+  }
+}
+}
+
+// Parse the url-encoded POST body into s->info.args: "Action" is copied
+// verbatim, "Name" and "TopicArn" are url-decoded, and every other token is
+// offered to update_attribute_map(). The v4 payload hash of the body is
+// appended as "PayloadHash".
+void RGWHandler_REST_PSTopic_AWS::rgw_topic_parse_input() {
+  if (post_body.size() > 0) {
+    ldout(s->cct, 10) << "Content of POST: " << post_body << dendl;
+
+    if (post_body.find("Action") != string::npos) {
+      const boost::char_separator<char> sep("&");
+      const boost::tokenizer<boost::char_separator<char>> tokens(post_body, sep);
+      AttributeMap map;
+      for (const auto& t : tokens) {
+        auto pos = t.find("=");
+        if (pos != string::npos) {
+          const auto key = t.substr(0, pos);
+          if (key == "Action") {
+            s->info.args.append(key, t.substr(pos + 1, t.size() - 1));
+          } else if (key == "Name" || key == "TopicArn") {
+            const auto value = url_decode(t.substr(pos + 1, t.size() - 1));
+            s->info.args.append(key, value);
+          } else {
+            update_attribute_map(t, map);
+          }
+        }
+      }
+      // update the regular args with the content of the attribute map
+      for (const auto& attr : map) {
+          s->info.args.append(attr.second.get_key(), attr.second.get_value());
+      }
+    }
+    const auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body);
+    s->info.args.append("PayloadHash", payload_hash);
+  }
+}
+
+RGWOp* RGWHandler_REST_PSTopic_AWS::op_post() {
+  rgw_topic_parse_input();
+
+  if (s->info.args.exists("Action")) {
+    const auto action = s->info.args.get("Action");
+    if
(action.compare("CreateTopic") == 0)
+      return new RGWPSCreateTopic_ObjStore_AWS();
+    if (action.compare("DeleteTopic") == 0)
+      return new RGWPSDeleteTopic_ObjStore_AWS;
+    if (action.compare("ListTopics") == 0)
+      return new RGWPSListTopics_ObjStore_AWS();
+    if (action.compare("GetTopic") == 0)
+      return new RGWPSGetTopic_ObjStore_AWS();
+  }
+
+  return nullptr;
+}
+
+int RGWHandler_REST_PSTopic_AWS::authorize(const DoutPrefixProvider* dpp) {
+  /*if (s->info.args.exists("Action") && s->info.args.get("Action").find("Topic") != std::string::npos) {
+      // TODO: some topic specific authorization
+      return 0;
+  }*/
+  return RGW_Auth_S3::authorize(dpp, store, auth_registry, s);
+}
+
+
+namespace {
+// return a unique topic by prefixing with the notification name: <notification>_<topic>
+std::string topic_to_unique(const std::string& topic, const std::string& notification) {
+  return notification + "_" + topic;
+}
+
+// extract the topic from a unique topic of the form: <notification>_<topic>
+// returns an empty string when unique_topic does not start with
+// "<notification>_"
+[[maybe_unused]] std::string unique_to_topic(const std::string& unique_topic, const std::string& notification) {
+  const std::string prefix = notification + "_";
+  // must be a starts-with test: a match further inside the string would
+  // otherwise cause unrelated leading characters to be stripped
+  if (unique_topic.compare(0, prefix.length(), prefix) != 0) {
+    return "";
+  }
+  return unique_topic.substr(prefix.length());
+}
+
+// from list of bucket topics, find the one that was auto-generated by a notification
+auto find_unique_topic(const rgw_pubsub_bucket_topics& bucket_topics, const std::string& notif_name) {
+  auto it = std::find_if(bucket_topics.topics.begin(), bucket_topics.topics.end(), [&](const auto& val) { return notif_name == val.second.s3_id; });
+  return it != bucket_topics.topics.end() ?
+ std::optional>(it->second): + std::nullopt; +} +} + +// command (S3 compliant): PUT /?notification +// a "notification" and a subscription will be auto-generated +// actual configuration is XML encoded in the body of the message +class RGWPSCreateNotif_ObjStore_S3 : public RGWPSCreateNotifOp { + rgw_pubsub_s3_notifications configurations; + + int get_params_from_body() { + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + int r; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, max_size, false); + + if (r < 0) { + ldout(s->cct, 1) << "failed to read XML payload" << dendl; + return r; + } + if (data.length() == 0) { + ldout(s->cct, 1) << "XML payload missing" << dendl; + return -EINVAL; + } + + RGWXMLDecoder::XMLParser parser; + + if (!parser.init()){ + ldout(s->cct, 1) << "failed to initialize XML parser" << dendl; + return -EINVAL; + } + if (!parser.parse(data.c_str(), data.length(), 1)) { + ldout(s->cct, 1) << "failed to parse XML payload" << dendl; + return -ERR_MALFORMED_XML; + } + try { + // NotificationConfigurations is mandatory + RGWXMLDecoder::decode_xml("NotificationConfiguration", configurations, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldout(s->cct, 1) << "failed to parse XML payload. 
error: " << err << dendl; + return -ERR_MALFORMED_XML; + } + return 0; + } + + int get_params() override { + bool exists; + const auto no_value = s->info.args.get("notification", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'notification'" << dendl; + return -EINVAL; + } + if (no_value.length() > 0) { + ldout(s->cct, 1) << "param 'notification' should not have any value" << dendl; + return -EINVAL; + } + if (s->bucket_name.empty()) { + ldout(s->cct, 1) << "request must be on a bucket" << dendl; + return -EINVAL; + } + bucket_name = s->bucket_name; + return 0; + } + +public: + const char* name() const override { return "pubsub_notification_create_s3"; } + void execute() override; +}; + +void RGWPSCreateNotif_ObjStore_S3::execute() { + op_ret = get_params_from_body(); + if (op_ret < 0) { + return; + } + + ups.emplace(store, s->owner.get_id()); + auto b = ups->get_bucket(bucket_info.bucket); + ceph_assert(b); + std::string data_bucket_prefix = ""; + std::string data_oid_prefix = ""; + bool push_only = true; + if (store->get_sync_module()) { + const auto psmodule = dynamic_cast(store->get_sync_module().get()); + if (psmodule) { + const auto& conf = psmodule->get_effective_conf(); + data_bucket_prefix = conf["data_bucket_prefix"]; + data_oid_prefix = conf["data_oid_prefix"]; + // TODO: allow "push-only" on PS zone as well + push_only = false; + } + } + + for (const auto& c : configurations.list) { + const auto& notif_name = c.id; + if (notif_name.empty()) { + ldout(s->cct, 1) << "missing notification id" << dendl; + op_ret = -EINVAL; + return; + } + if (c.topic_arn.empty()) { + ldout(s->cct, 1) << "missing topic ARN in notification: '" << notif_name << "'" << dendl; + op_ret = -EINVAL; + return; + } + + const auto arn = rgw::ARN::parse(c.topic_arn); + if (!arn || arn->resource.empty()) { + ldout(s->cct, 1) << "topic ARN has invalid format: '" << c.topic_arn << "' in notification: '" << notif_name << "'" << dendl; + op_ret = -EINVAL; + return; 
+ } + + if (std::find(c.events.begin(), c.events.end(), rgw::notify::UnknownEvent) != c.events.end()) { + ldout(s->cct, 1) << "unknown event type in notification: '" << notif_name << "'" << dendl; + op_ret = -EINVAL; + return; + } + + const auto topic_name = arn->resource; + + // get topic information. destination information is stored in the topic + rgw_pubsub_topic topic_info; + op_ret = ups->get_topic(topic_name, &topic_info); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + // make sure that full topic configuration match + // TODO: use ARN match function + + // create unique topic name. this has 2 reasons: + // (1) topics cannot be shared between different S3 notifications because they hold the filter information + // (2) make topic clneaup easier, when notification is removed + const auto unique_topic_name = topic_to_unique(topic_name, notif_name); + // generate the internal topic. destination is stored here for the "push-only" case + // when no subscription exists + // ARN is cached to make the "GET" method faster + op_ret = ups->create_topic(unique_topic_name, topic_info.dest, topic_info.arn, topic_info.opaque_data); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to auto-generate unique topic '" << unique_topic_name << + "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully auto-generated unique topic '" << unique_topic_name << "'" << dendl; + // generate the notification + rgw::notify::EventTypeList events; + op_ret = b->create_notification(unique_topic_name, c.events, std::make_optional(c.filter), notif_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to auto-generate notification for unique topic '" << unique_topic_name << + "', ret=" << op_ret << dendl; + // rollback generated topic (ignore return value) + ups->remove_topic(unique_topic_name); + return; + } + ldout(s->cct, 20) << "successfully auto-generated notification for unique topic 
'" << unique_topic_name << "'" << dendl; + + if (!push_only) { + // generate the subscription with destination information from the original topic + rgw_pubsub_sub_dest dest = topic_info.dest; + dest.bucket_name = data_bucket_prefix + s->owner.get_id().to_str() + "-" + unique_topic_name; + dest.oid_prefix = data_oid_prefix + notif_name + "/"; + auto sub = ups->get_sub(notif_name); + op_ret = sub->subscribe(unique_topic_name, dest, notif_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to auto-generate subscription '" << notif_name << "', ret=" << op_ret << dendl; + // rollback generated notification (ignore return value) + b->remove_notification(unique_topic_name); + // rollback generated topic (ignore return value) + ups->remove_topic(unique_topic_name); + return; + } + ldout(s->cct, 20) << "successfully auto-generated subscription '" << notif_name << "'" << dendl; + } + } +} + +// command (extension to S3): DELETE /bucket?notification[=] +class RGWPSDeleteNotif_ObjStore_S3 : public RGWPSDeleteNotifOp { +private: + std::string notif_name; + + int get_params() override { + bool exists; + notif_name = s->info.args.get("notification", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'notification'" << dendl; + return -EINVAL; + } + if (s->bucket_name.empty()) { + ldout(s->cct, 1) << "request must be on a bucket" << dendl; + return -EINVAL; + } + bucket_name = s->bucket_name; + return 0; + } + + void remove_notification_by_topic(const std::string& topic_name, const RGWUserPubSub::BucketRef& b) { + op_ret = b->remove_notification(topic_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to remove notification of topic '" << topic_name << "', ret=" << op_ret << dendl; + } + op_ret = ups->remove_topic(topic_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to remove auto-generated topic '" << topic_name << "', ret=" << op_ret << dendl; + } + } + +public: + void execute() override; + const char* name() const override { return 
"pubsub_notification_delete_s3"; } +}; + +void RGWPSDeleteNotif_ObjStore_S3::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + ups.emplace(store, s->owner.get_id()); + auto b = ups->get_bucket(bucket_info.bucket); + ceph_assert(b); + + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + op_ret = b->get_topics(&bucket_topics); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl; + return; + } + + if (!notif_name.empty()) { + // delete a specific notification + const auto unique_topic = find_unique_topic(bucket_topics, notif_name); + if (unique_topic) { + // remove the auto generated subscription according to notification name (if exist) + const auto unique_topic_name = unique_topic->get().topic.name; + auto sub = ups->get_sub(notif_name); + op_ret = sub->unsubscribe(unique_topic_name); + if (op_ret < 0 && op_ret != -ENOENT) { + ldout(s->cct, 1) << "failed to remove auto-generated subscription '" << notif_name << "', ret=" << op_ret << dendl; + return; + } + remove_notification_by_topic(unique_topic_name, b); + return; + } + // notification to be removed is not found - considered success + ldout(s->cct, 20) << "notification '" << notif_name << "' already removed" << dendl; + return; + } + + // delete all notification of on a bucket + for (const auto& topic : bucket_topics.topics) { + // remove the auto generated subscription of the topic (if exist) + rgw_pubsub_topic_subs topic_subs; + op_ret = ups->get_topic(topic.first, &topic_subs); + for (const auto& topic_sub_name : topic_subs.subs) { + auto sub = ups->get_sub(topic_sub_name); + rgw_pubsub_sub_config sub_conf; + op_ret = sub->get_conf(&sub_conf); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get subscription '" << topic_sub_name << "' info, ret=" << op_ret << dendl; + return; + } + if (!sub_conf.s3_id.empty()) { + // S3 notification, has autogenerated subscription + 
const auto& sub_topic_name = sub_conf.topic; + op_ret = sub->unsubscribe(sub_topic_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to remove auto-generated subscription '" << topic_sub_name << "', ret=" << op_ret << dendl; + return; + } + } + } + remove_notification_by_topic(topic.first, b); + } +} + +// command (S3 compliant): GET /bucket?notification[=] +class RGWPSListNotifs_ObjStore_S3 : public RGWPSListNotifsOp { +private: + std::string notif_name; + rgw_pubsub_s3_notifications notifications; + + int get_params() override { + bool exists; + notif_name = s->info.args.get("notification", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'notification'" << dendl; + return -EINVAL; + } + if (s->bucket_name.empty()) { + ldout(s->cct, 1) << "request must be on a bucket" << dendl; + return -EINVAL; + } + bucket_name = s->bucket_name; + return 0; + } + +public: + void execute() override; + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + notifications.dump_xml(s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } + const char* name() const override { return "pubsub_notifications_get_s3"; } +}; + +void RGWPSListNotifs_ObjStore_S3::execute() { + ups.emplace(store, s->owner.get_id()); + auto b = ups->get_bucket(bucket_info.bucket); + ceph_assert(b); + + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + op_ret = b->get_topics(&bucket_topics); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl; + return; + } + if (!notif_name.empty()) { + // get info of a specific notification + const auto unique_topic = find_unique_topic(bucket_topics, notif_name); + if (unique_topic) { + notifications.list.emplace_back(unique_topic->get()); + return; + } + op_ret = -ENOENT; + ldout(s->cct, 1) << 
"failed to get notification info for '" << notif_name << "', ret=" << op_ret << dendl; + return; + } + // loop through all topics of the bucket + for (const auto& topic : bucket_topics.topics) { + if (topic.second.s3_id.empty()) { + // not an s3 notification + continue; + } + notifications.list.emplace_back(topic.second); + } +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::op_get() { + return new RGWPSListNotifs_ObjStore_S3(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::op_put() { + return new RGWPSCreateNotif_ObjStore_S3(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::op_delete() { + return new RGWPSDeleteNotif_ObjStore_S3(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::create_get_op() { + return new RGWPSListNotifs_ObjStore_S3(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::create_put_op() { + return new RGWPSCreateNotif_ObjStore_S3(); +} + +RGWOp* RGWHandler_REST_PSNotifs_S3::create_delete_op() { + return new RGWPSDeleteNotif_ObjStore_S3(); +} + diff --git a/src/rgw/rgw_rest_pubsub.h b/src/rgw/rgw_rest_pubsub.h new file mode 100644 index 00000000..f2f63356 --- /dev/null +++ b/src/rgw/rgw_rest_pubsub.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include "rgw_rest_s3.h" + +// s3 compliant notification handler factory +class RGWHandler_REST_PSNotifs_S3 : public RGWHandler_REST_S3 { +protected: + int init_permissions(RGWOp* op) override {return 0;} + int read_permissions(RGWOp* op) override {return 0;} + bool supports_quota() override {return false;} + RGWOp* op_get() override; + RGWOp* op_put() override; + RGWOp* op_delete() override; +public: + using RGWHandler_REST_S3::RGWHandler_REST_S3; + virtual ~RGWHandler_REST_PSNotifs_S3() = default; + // following are used to generate the operations when invoked by another REST handler + static RGWOp* create_get_op(); + static RGWOp* create_put_op(); + static RGWOp* create_delete_op(); +}; + +// AWS compliant topics handler factory +class 
RGWHandler_REST_PSTopic_AWS : public RGWHandler_REST { + const rgw::auth::StrategyRegistry& auth_registry; + const std::string& post_body; + void rgw_topic_parse_input(); + //static int init_from_header(struct req_state *s, int default_formatter, bool configurable_format); +protected: + RGWOp* op_post() override; +public: + RGWHandler_REST_PSTopic_AWS(const rgw::auth::StrategyRegistry& _auth_registry, const std::string& _post_body) : + auth_registry(_auth_registry), + post_body(_post_body) {} + virtual ~RGWHandler_REST_PSTopic_AWS() = default; + int postauth_init() override { return 0; } + int authorize(const DoutPrefixProvider* dpp) override; +}; + diff --git a/src/rgw/rgw_rest_pubsub_common.cc b/src/rgw/rgw_rest_pubsub_common.cc new file mode 100644 index 00000000..3b5de53f --- /dev/null +++ b/src/rgw/rgw_rest_pubsub_common.cc @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_rest_pubsub_common.h" +#include "common/dout.h" +#include "rgw_url.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +bool validate_and_update_endpoint_secret(rgw_pubsub_sub_dest& dest, CephContext *cct, const RGWEnv& env) { + if (dest.push_endpoint.empty()) { + return true; + } + std::string user; + std::string password; + if (!rgw::parse_url_userinfo(dest.push_endpoint, user, password)) { + ldout(cct, 1) << "endpoint validation error: malformed endpoint URL:" << dest.push_endpoint << dendl; + return false; + } + // this should be verified inside parse_url() + ceph_assert(user.empty() == password.empty()); + if (!user.empty()) { + dest.stored_secret = true; + if (!rgw_transport_is_secure(cct, env)) { + ldout(cct, 1) << "endpoint validation error: sending password over insecure transport" << dendl; + return false; + } + } + return true; +} + +bool subscription_has_endpoint_secret(const rgw_pubsub_sub_config& sub) { + return sub.dest.stored_secret; +} + 
+bool topic_has_endpoint_secret(const rgw_pubsub_topic_subs& topic) { + return topic.topic.dest.stored_secret; +} + +bool topics_has_endpoint_secret(const rgw_pubsub_user_topics& topics) { + for (const auto& topic : topics.topics) { + if (topic_has_endpoint_secret(topic.second)) return true; + } + return false; +} +void RGWPSCreateTopicOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + ups.emplace(store, s->owner.get_id()); + op_ret = ups->create_topic(topic_name, dest, topic_arn, opaque_data); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to create topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully created topic '" << topic_name << "'" << dendl; +} + +void RGWPSListTopicsOp::execute() { + ups.emplace(store, s->owner.get_id()); + op_ret = ups->get_user_topics(&result); + // if there are no topics it is not considered an error + op_ret = op_ret == -ENOENT ? 0 : op_ret; + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get topics, ret=" << op_ret << dendl; + return; + } + if (topics_has_endpoint_secret(result) && !rgw_transport_is_secure(s->cct, *(s->info.env))) { + ldout(s->cct, 1) << "topics contain secret and cannot be sent over insecure transport" << dendl; + op_ret = -EPERM; + return; + } + ldout(s->cct, 20) << "successfully got topics" << dendl; +} + +void RGWPSGetTopicOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + op_ret = ups->get_topic(topic_name, &result); + if (topic_has_endpoint_secret(result) && !rgw_transport_is_secure(s->cct, *(s->info.env))) { + ldout(s->cct, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl; + op_ret = -EPERM; + return; + } + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 1) << "successfully got topic '" << topic_name << "'" << 
dendl; +} + +void RGWPSDeleteTopicOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + op_ret = ups->remove_topic(topic_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to remove topic '" << topic_name << ", ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 1) << "successfully removed topic '" << topic_name << "'" << dendl; +} + +void RGWPSCreateSubOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + auto sub = ups->get_sub(sub_name); + op_ret = sub->subscribe(topic_name, dest); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to create subscription '" << sub_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully created subscription '" << sub_name << "'" << dendl; +} + +void RGWPSGetSubOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + auto sub = ups->get_sub(sub_name); + op_ret = sub->get_conf(&result); + if (subscription_has_endpoint_secret(result) && !rgw_transport_is_secure(s->cct, *(s->info.env))) { + ldout(s->cct, 1) << "subscription '" << sub_name << "' contain secret and cannot be sent over insecure transport" << dendl; + op_ret = -EPERM; + return; + } + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get subscription '" << sub_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully got subscription '" << sub_name << "'" << dendl; +} + +void RGWPSDeleteSubOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + auto sub = ups->get_sub(sub_name); + op_ret = sub->unsubscribe(topic_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to remove subscription '" << sub_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully removed subscription '" << sub_name << "'" << dendl; +} + +void 
RGWPSAckSubEventOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + auto sub = ups->get_sub_with_events(sub_name); + op_ret = sub->remove_event(event_id); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to ack event on subscription '" << sub_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully acked event on subscription '" << sub_name << "'" << dendl; +} + +void RGWPSPullSubEventsOp::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + ups.emplace(store, s->owner.get_id()); + sub = ups->get_sub_with_events(sub_name); + if (!sub) { + op_ret = -ENOENT; + ldout(s->cct, 1) << "failed to get subscription '" << sub_name << "' for events, ret=" << op_ret << dendl; + return; + } + op_ret = sub->list_events(marker, max_entries); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get events from subscription '" << sub_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully got events from subscription '" << sub_name << "'" << dendl; +} + + +int RGWPSCreateNotifOp::verify_permission() { + int ret = get_params(); + if (ret < 0) { + return ret; + } + + const auto& id = s->owner.get_id(); + + ret = store->get_bucket_info(*s->sysobj_ctx, id.tenant, bucket_name, + bucket_info, nullptr, nullptr); + if (ret < 0) { + ldout(s->cct, 1) << "failed to get bucket info, cannot verify ownership" << dendl; + return ret; + } + + if (bucket_info.owner != id) { + ldout(s->cct, 1) << "user doesn't own bucket, not allowed to create notification" << dendl; + return -EPERM; + } + return 0; +} + +int RGWPSDeleteNotifOp::verify_permission() { + int ret = get_params(); + if (ret < 0) { + return ret; + } + + ret = store->get_bucket_info(*s->sysobj_ctx, s->owner.get_id().tenant, bucket_name, + bucket_info, nullptr, nullptr); + if (ret < 0) { + return ret; + } + + if (bucket_info.owner != s->owner.get_id()) { + ldout(s->cct, 1) << "user doesn't 
own bucket, cannot remove notification" << dendl; + return -EPERM; + } + return 0; +} + +int RGWPSListNotifsOp::verify_permission() { + int ret = get_params(); + if (ret < 0) { + return ret; + } + + ret = store->get_bucket_info(*s->sysobj_ctx, s->owner.get_id().tenant, bucket_name, + bucket_info, nullptr, nullptr); + if (ret < 0) { + return ret; + } + + if (bucket_info.owner != s->owner.get_id()) { + ldout(s->cct, 1) << "user doesn't own bucket, cannot get notification list" << dendl; + return -EPERM; + } + + return 0; +} + diff --git a/src/rgw/rgw_rest_pubsub_common.h b/src/rgw/rgw_rest_pubsub_common.h new file mode 100644 index 00000000..d472fa40 --- /dev/null +++ b/src/rgw/rgw_rest_pubsub_common.h @@ -0,0 +1,287 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once +#include +#include +#include "rgw_op.h" +#include "rgw_pubsub.h" + +// make sure that endpoint is a valid URL +// make sure that if user/password are passed inside URL, it is over secure connection +// update rgw_pubsub_sub_dest to indicate that a password is stored in the URL +bool validate_and_update_endpoint_secret(rgw_pubsub_sub_dest& dest, CephContext *cct, const RGWEnv& env); + +// create a topic +class RGWPSCreateTopicOp : public RGWDefaultResponseOp { +protected: + std::optional ups; + std::string topic_name; + rgw_pubsub_sub_dest dest; + std::string topic_arn; + std::string opaque_data; + + virtual int get_params() = 0; + +public: + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_topic_create"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_CREATE; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +// list all topics +class RGWPSListTopicsOp : public RGWOp { +protected: + std::optional ups; + rgw_pubsub_user_topics result; + +public: + int 
verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_topics_list"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPICS_LIST; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +// get topic information +class RGWPSGetTopicOp : public RGWOp { +protected: + std::string topic_name; + std::optional ups; + rgw_pubsub_topic_subs result; + + virtual int get_params() = 0; + +public: + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_topic_get"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +// delete a topic +class RGWPSDeleteTopicOp : public RGWDefaultResponseOp { +protected: + string topic_name; + std::optional ups; + + virtual int get_params() = 0; + +public: + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_topic_delete"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +// create a subscription +class RGWPSCreateSubOp : public RGWDefaultResponseOp { +protected: + std::string sub_name; + std::string topic_name; + std::optional ups; + rgw_pubsub_sub_dest dest; + + virtual int get_params() = 0; + +public: + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_subscription_create"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_SUB_CREATE; } + uint32_t op_mask() override { return 
RGW_OP_TYPE_WRITE; } +}; + +// get subscription information (including push-endpoint if exist) +class RGWPSGetSubOp : public RGWOp { +protected: + std::string sub_name; + std::optional ups; + rgw_pubsub_sub_config result; + + virtual int get_params() = 0; + +public: + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_subscription_get"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_SUB_GET; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +// delete subscription +class RGWPSDeleteSubOp : public RGWDefaultResponseOp { +protected: + std::string sub_name; + std::string topic_name; + std::optional ups; + + virtual int get_params() = 0; + +public: + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_subscription_delete"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_SUB_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +// acking of an event +class RGWPSAckSubEventOp : public RGWDefaultResponseOp { +protected: + std::string sub_name; + std::string event_id; + std::optional ups; + + virtual int get_params() = 0; + +public: + RGWPSAckSubEventOp() {} + + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_subscription_ack"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_SUB_ACK; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +// fetching events from a subscription +// dpending on whether the subscription was created via s3 compliant API or not +// the matching events will be returned +class RGWPSPullSubEventsOp : public RGWOp { +protected: + 
int max_entries{0}; + std::string sub_name; + std::string marker; + std::optional ups; + RGWUserPubSub::SubRef sub; + + virtual int get_params() = 0; + +public: + RGWPSPullSubEventsOp() {} + + int verify_permission() override { + return 0; + } + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + void execute() override; + + const char* name() const override { return "pubsub_subscription_pull"; } + RGWOpType get_type() override { return RGW_OP_PUBSUB_SUB_PULL; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +// notification creation +class RGWPSCreateNotifOp : public RGWDefaultResponseOp { +protected: + std::optional ups; + string bucket_name; + RGWBucketInfo bucket_info; + + virtual int get_params() = 0; + +public: + int verify_permission() override; + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + + RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_CREATE; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + +// delete a notification +class RGWPSDeleteNotifOp : public RGWDefaultResponseOp { +protected: + std::optional ups; + std::string bucket_name; + RGWBucketInfo bucket_info; + + virtual int get_params() = 0; + +public: + int verify_permission() override; + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + + RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_DELETE; } + uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; } +}; + +// get topics/notifications on a bucket +class RGWPSListNotifsOp : public RGWOp { +protected: + std::string bucket_name; + RGWBucketInfo bucket_info; + std::optional ups; + + virtual int get_params() = 0; + +public: + int verify_permission() override; + + void pre_exec() override { + rgw_bucket_object_pre_exec(s); + } + + RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_LIST; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + diff --git a/src/rgw/rgw_rest_realm.cc b/src/rgw/rgw_rest_realm.cc new file 
mode 100644 index 00000000..18e37676 --- /dev/null +++ b/src/rgw/rgw_rest_realm.cc @@ -0,0 +1,367 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" +#include "rgw_rest_realm.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_config.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +// reject 'period push' if we would have to fetch too many intermediate periods +static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64; + +// base period op, shared between Get and Post +class RGWOp_Period_Base : public RGWRESTOp { + protected: + RGWPeriod period; + std::ostringstream error_stream; + public: + int verify_permission() override { return 0; } + void send_response() override; +}; + +// reply with the period object on success +void RGWOp_Period_Base::send_response() +{ + set_req_state_err(s, http_ret, error_stream.str()); + dump_errno(s); + + if (http_ret < 0) { + if (!s->err.message.empty()) { + ldout(s->cct, 4) << "Request failed with " << http_ret + << ": " << s->err.message << dendl; + } + end_header(s); + return; + } + + encode_json("period", period, s->formatter); + end_header(s, NULL, "application/json", s->formatter->get_len()); + flusher.flush(); +} + +// GET /admin/realm/period +class RGWOp_Period_Get : public RGWOp_Period_Base { + public: + void execute() override; + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + const char* name() const override { return "get_period"; } +}; + +void RGWOp_Period_Get::execute() +{ + string realm_id, realm_name, period_id; + epoch_t epoch = 0; + RESTArgs::get_string(s, "realm_id", realm_id, &realm_id); + RESTArgs::get_string(s, "realm_name", realm_name, &realm_name); + RESTArgs::get_string(s, "period_id", period_id, &period_id); + 
RESTArgs::get_uint32(s, "epoch", 0, &epoch); + + period.set_id(period_id); + period.set_epoch(epoch); + + http_ret = period.init(store->ctx(), store->svc.sysobj, realm_id, realm_name); + if (http_ret < 0) + ldout(store->ctx(), 5) << "failed to read period" << dendl; +} + +// POST /admin/realm/period +class RGWOp_Period_Post : public RGWOp_Period_Base { + public: + void execute() override; + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_WRITE); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + const char* name() const override { return "post_period"; } +}; + +void RGWOp_Period_Post::execute() +{ + auto cct = store->ctx(); + + // initialize the period without reading from rados + period.init(cct, store->svc.sysobj, false); + + // decode the period from input + const auto max_size = cct->_conf->rgw_max_put_param_size; + bool empty; + http_ret = rgw_rest_get_json_input(cct, s, period, max_size, &empty); + if (http_ret < 0) { + lderr(cct) << "failed to decode period" << dendl; + return; + } + + // require period.realm_id to match our realm + if (period.get_realm() != store->svc.zone->get_realm().get_id()) { + error_stream << "period with realm id " << period.get_realm() + << " doesn't match current realm " << store->svc.zone->get_realm().get_id() << std::endl; + http_ret = -EINVAL; + return; + } + + // load the realm and current period from rados; there may be a more recent + // period that we haven't restarted with yet. 
we also don't want to modify + // the objects in use by RGWRados + RGWRealm realm(period.get_realm()); + http_ret = realm.init(cct, store->svc.sysobj); + if (http_ret < 0) { + lderr(cct) << "failed to read current realm: " + << cpp_strerror(-http_ret) << dendl; + return; + } + + RGWPeriod current_period; + http_ret = current_period.init(cct, store->svc.sysobj, realm.get_id()); + if (http_ret < 0) { + lderr(cct) << "failed to read current period: " + << cpp_strerror(-http_ret) << dendl; + return; + } + + // if period id is empty, handle as 'period commit' + if (period.get_id().empty()) { + http_ret = period.commit(store, realm, current_period, error_stream); + if (http_ret < 0) { + lderr(cct) << "master zone failed to commit period" << dendl; + } + return; + } + + // if it's not period commit, nobody is allowed to push to the master zone + if (period.get_master_zone() == store->svc.zone->get_zone_params().get_id()) { + ldout(cct, 10) << "master zone rejecting period id=" + << period.get_id() << " epoch=" << period.get_epoch() << dendl; + http_ret = -EINVAL; // XXX: error code + return; + } + + // write the period to rados + http_ret = period.store_info(false); + if (http_ret < 0) { + lderr(cct) << "failed to store period " << period.get_id() << dendl; + return; + } + // set as latest epoch + http_ret = period.update_latest_epoch(period.get_epoch()); + if (http_ret == -EEXIST) { + // already have this epoch (or a more recent one) + ldout(cct, 4) << "already have epoch >= " << period.get_epoch() + << " for period " << period.get_id() << dendl; + http_ret = 0; + return; + } + if (http_ret < 0) { + lderr(cct) << "failed to set latest epoch" << dendl; + return; + } + + // decide whether we can set_current_period() or set_latest_epoch() + if (period.get_id() != current_period.get_id()) { + auto current_epoch = current_period.get_realm_epoch(); + // discard periods in the past + if (period.get_realm_epoch() < current_epoch) { + ldout(cct, 10) << "discarding period " << 
period.get_id() + << " with realm epoch " << period.get_realm_epoch() + << " older than current epoch " << current_epoch << dendl; + // return success to ack that we have this period + return; + } + // discard periods too far in the future + if (period.get_realm_epoch() > current_epoch + PERIOD_HISTORY_FETCH_MAX) { + lderr(cct) << "discarding period " << period.get_id() + << " with realm epoch " << period.get_realm_epoch() << " too far in " + "the future from current epoch " << current_epoch << dendl; + http_ret = -ENOENT; // XXX: error code + return; + } + // attach a copy of the period into the period history + auto cursor = store->period_history->attach(RGWPeriod{period}); + if (!cursor) { + // we're missing some history between the new period and current_period + http_ret = cursor.get_error(); + lderr(cct) << "failed to collect the periods between current period " + << current_period.get_id() << " (realm epoch " << current_epoch + << ") and the new period " << period.get_id() + << " (realm epoch " << period.get_realm_epoch() + << "): " << cpp_strerror(-http_ret) << dendl; + return; + } + if (cursor.has_next()) { + // don't switch if we have a newer period in our history + ldout(cct, 4) << "attached period " << period.get_id() + << " to history, but the history contains newer periods" << dendl; + return; + } + // set as current period + http_ret = realm.set_current_period(period); + if (http_ret < 0) { + lderr(cct) << "failed to update realm's current period" << dendl; + return; + } + ldout(cct, 4) << "period " << period.get_id() + << " is newer than current period " << current_period.get_id() + << ", updating realm's current period and notifying zone" << dendl; + realm.notify_new_period(period); + return; + } + // reflect the period into our local objects + http_ret = period.reflect(); + if (http_ret < 0) { + lderr(cct) << "failed to update local objects: " + << cpp_strerror(-http_ret) << dendl; + return; + } + ldout(cct, 4) << "period epoch " << 
period.get_epoch() + << " is newer than current epoch " << current_period.get_epoch() + << ", updating period's latest epoch and notifying zone" << dendl; + realm.notify_new_period(period); + // update the period history + store->period_history->insert(RGWPeriod{period}); +} + +class RGWHandler_Period : public RGWHandler_Auth_S3 { + protected: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + + RGWOp *op_get() override { return new RGWOp_Period_Get; } + RGWOp *op_post() override { return new RGWOp_Period_Post; } +}; + +class RGWRESTMgr_Period : public RGWRESTMgr { + public: + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Period(auth_registry); + } +}; + + +// GET /admin/realm +class RGWOp_Realm_Get : public RGWRESTOp { + std::unique_ptr realm; +public: + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { return "get_realm"; } +}; + +void RGWOp_Realm_Get::execute() +{ + string id; + RESTArgs::get_string(s, "id", id, &id); + string name; + RESTArgs::get_string(s, "name", name, &name); + + // read realm + realm.reset(new RGWRealm(id, name)); + http_ret = realm->init(g_ceph_context, store->svc.sysobj); + if (http_ret < 0) + lderr(store->ctx()) << "failed to read realm id=" << id + << " name=" << name << dendl; +} + +void RGWOp_Realm_Get::send_response() +{ + set_req_state_err(s, http_ret); + dump_errno(s); + + if (http_ret < 0) { + end_header(s); + return; + } + + encode_json("realm", *realm, s->formatter); + end_header(s, NULL, "application/json", s->formatter->get_len()); + flusher.flush(); +} + +// GET /admin/realm?list +class RGWOp_Realm_List : public RGWRESTOp { + std::string default_id; + std::list realms; +public: + int 
check_caps(RGWUserCaps& caps) override { + return caps.check_cap("zone", RGW_CAP_READ); + } + int verify_permission() override { + return check_caps(s->user->caps); + } + void execute() override; + void send_response() override; + const char* name() const override { return "list_realms"; } +}; + +void RGWOp_Realm_List::execute() +{ + { + // read default realm + RGWRealm realm(store->ctx(), store->svc.sysobj); + [[maybe_unused]] int ret = realm.read_default_id(default_id); + } + http_ret = store->svc.zone->list_realms(realms); + if (http_ret < 0) + lderr(store->ctx()) << "failed to list realms" << dendl; +} + +void RGWOp_Realm_List::send_response() +{ + set_req_state_err(s, http_ret); + dump_errno(s); + + if (http_ret < 0) { + end_header(s); + return; + } + + s->formatter->open_object_section("realms_list"); + encode_json("default_info", default_id, s->formatter); + encode_json("realms", realms, s->formatter); + s->formatter->close_section(); + end_header(s, NULL, "application/json", s->formatter->get_len()); + flusher.flush(); +} + +class RGWHandler_Realm : public RGWHandler_Auth_S3 { +protected: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + RGWOp *op_get() override { + if (s->info.args.sub_resource_exists("list")) + return new RGWOp_Realm_List; + return new RGWOp_Realm_Get; + } +}; + +RGWRESTMgr_Realm::RGWRESTMgr_Realm() +{ + // add the /admin/realm/period resource + register_resource("period", new RGWRESTMgr_Period); +} + +RGWHandler_REST* +RGWRESTMgr_Realm::get_handler(struct req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) +{ + return new RGWHandler_Realm(auth_registry); +} diff --git a/src/rgw/rgw_rest_realm.h b/src/rgw/rgw_rest_realm.h new file mode 100644 index 00000000..68566bcb --- /dev/null +++ b/src/rgw/rgw_rest_realm.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_REALM_H +#define CEPH_RGW_REST_REALM_H + +#include 
"rgw_rest.h" + +class RGWRESTMgr_Realm : public RGWRESTMgr { +public: + RGWRESTMgr_Realm(); + + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override; +}; + +#endif diff --git a/src/rgw/rgw_rest_role.cc b/src/rgw/rgw_rest_role.cc new file mode 100644 index 00000000..dbcb718d --- /dev/null +++ b/src/rgw/rgw_rest_role.cc @@ -0,0 +1,489 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_role.h" +#include "rgw_rest_role.h" + +#define dout_subsys ceph_subsys_rgw + +int RGWRestRole::verify_permission() +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + string role_name = s->info.args.get("RoleName"); + RGWRole role(s->cct, store, role_name, s->user->user_id.tenant); + if (op_ret = role.get(); op_ret < 0) { + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + } + return op_ret; + } + + if (int ret = check_caps(s->user->caps); ret == 0) { + _role = std::move(role); + return ret; + } + + string resource_name = role.get_path() + role_name; + uint64_t op = get_op(); + if (!verify_user_permission(this, + s, + rgw::ARN(resource_name, + "role", + s->user->user_id.tenant, true), + op)) { + return -EACCES; + } + + _role = std::move(role); + + return 0; +} + +void RGWRestRole::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this); +} + +int RGWRoleRead::check_caps(RGWUserCaps& caps) +{ + return caps.check_cap("roles", RGW_CAP_READ); +} + +int RGWRoleWrite::check_caps(RGWUserCaps& caps) +{ + return caps.check_cap("roles", RGW_CAP_WRITE); +} + +int RGWCreateRole::verify_permission() +{ + if (s->auth.identity->is_anonymous()) { + return 
-EACCES; + } + + if (int ret = check_caps(s->user->caps); ret == 0) { + return ret; + } + + string role_name = s->info.args.get("RoleName"); + string role_path = s->info.args.get("Path"); + + string resource_name = role_path + role_name; + if (!verify_user_permission(this, + s, + rgw::ARN(resource_name, + "role", + s->user->user_id.tenant, true), + get_op())) { + return -EACCES; + } + return 0; +} + +int RGWCreateRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + role_path = s->info.args.get("Path"); + trust_policy = s->info.args.get("AssumeRolePolicyDocument"); + max_session_duration = s->info.args.get("MaxSessionDuration"); + + if (role_name.empty() || trust_policy.empty()) { + ldout(s->cct, 20) << "ERROR: one of role name or assume role policy document is empty" + << dendl; + return -EINVAL; + } + + bufferlist bl = bufferlist::static_from_string(trust_policy); + try { + const rgw::IAM::Policy p(s->cct, s->user->user_id.tenant, bl); + } + catch (rgw::IAM::PolicyParseException& e) { + ldout(s->cct, 20) << "failed to parse policy: " << e.what() << dendl; + return -ERR_MALFORMED_DOC; + } + + return 0; +} + +void RGWCreateRole::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + RGWRole role(s->cct, store, role_name, role_path, trust_policy, + s->user->user_id.tenant, max_session_duration); + op_ret = role.create(true); + + if (op_ret == -EEXIST) { + op_ret = -ERR_ROLE_EXISTS; + } + + if (op_ret == 0) { + s->formatter->open_object_section("CreateRoleResponse"); + s->formatter->open_object_section("CreateRoleResult"); + s->formatter->open_object_section("Role"); + role.dump(s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWDeleteRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if 
(role_name.empty()) { + ldout(s->cct, 20) << "ERROR: Role name is empty"<< dendl; + return -EINVAL; + } + + return 0; +} + +void RGWDeleteRole::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + op_ret = _role.delete_obj(); + + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + } + + s->formatter->open_object_section("DeleteRoleResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWGetRole::verify_permission() +{ + return 0; +} + +int RGWGetRole::_verify_permission(const RGWRole& role) +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (int ret = check_caps(s->user->caps); ret == 0) { + return ret; + } + + string resource_name = role.get_path() + role.get_name(); + if (!verify_user_permission(this, + s, + rgw::ARN(resource_name, + "role", + s->user->user_id.tenant, true), + get_op())) { + return -EACCES; + } + return 0; +} + +int RGWGetRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldout(s->cct, 20) << "ERROR: Role name is empty"<< dendl; + return -EINVAL; + } + + return 0; +} + +void RGWGetRole::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + RGWRole role(s->cct, store, role_name, s->user->user_id.tenant); + op_ret = role.get(); + + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + return; + } + + op_ret = _verify_permission(role); + + if (op_ret == 0) { + s->formatter->open_object_section("GetRoleResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("GetRoleResult"); + s->formatter->open_object_section("Role"); + role.dump(s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + 
+int RGWModifyRole::get_params() +{ + role_name = s->info.args.get("RoleName"); + trust_policy = s->info.args.get("PolicyDocument"); + + if (role_name.empty() || trust_policy.empty()) { + ldout(s->cct, 20) << "ERROR: One of role name or trust policy is empty"<< dendl; + return -EINVAL; + } + JSONParser p; + if (!p.parse(trust_policy.c_str(), trust_policy.length())) { + ldout(s->cct, 20) << "ERROR: failed to parse assume role policy doc" << dendl; + return -ERR_MALFORMED_DOC; + } + + return 0; +} + +void RGWModifyRole::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + _role.update_trust_policy(trust_policy); + op_ret = _role.update(); + + s->formatter->open_object_section("UpdateAssumeRolePolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWListRoles::verify_permission() +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if (int ret = check_caps(s->user->caps); ret == 0) { + return ret; + } + + if (!verify_user_permission(this, + s, + rgw::ARN(), + get_op())) { + return -EACCES; + } + + return 0; +} + +int RGWListRoles::get_params() +{ + path_prefix = s->info.args.get("PathPrefix"); + + return 0; +} + +void RGWListRoles::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + vector result; + op_ret = RGWRole::get_roles_by_path_prefix(store, s->cct, path_prefix, s->user->user_id.tenant, result); + + if (op_ret == 0) { + s->formatter->open_array_section("ListRolesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_array_section("ListRolesResult"); + s->formatter->open_object_section("Roles"); + for (const auto& it : result) { + s->formatter->open_object_section("member"); + it.dump(s->formatter); + 
s->formatter->close_section(); + } + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWPutRolePolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + policy_name = s->info.args.get("PolicyName"); + perm_policy = s->info.args.get("PolicyDocument"); + + if (role_name.empty() || policy_name.empty() || perm_policy.empty()) { + ldout(s->cct, 20) << "ERROR: One of role name, policy name or perm policy is empty"<< dendl; + return -EINVAL; + } + bufferlist bl = bufferlist::static_from_string(perm_policy); + try { + const rgw::IAM::Policy p(s->cct, s->user->user_id.tenant, bl); + } + catch (rgw::IAM::PolicyParseException& e) { + ldout(s->cct, 20) << "failed to parse policy: " << e.what() << dendl; + return -ERR_MALFORMED_DOC; + } + return 0; +} + +void RGWPutRolePolicy::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + _role.set_perm_policy(policy_name, perm_policy); + op_ret = _role.update(); + + if (op_ret == 0) { + s->formatter->open_object_section("PutRolePolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWGetRolePolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + policy_name = s->info.args.get("PolicyName"); + + if (role_name.empty() || policy_name.empty()) { + ldout(s->cct, 20) << "ERROR: One of role name or policy name is empty"<< dendl; + return -EINVAL; + } + return 0; +} + +void RGWGetRolePolicy::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + string perm_policy; + op_ret = _role.get_role_policy(policy_name, perm_policy); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_ENTITY; + } + + if (op_ret == 0) { + s->formatter->open_object_section("GetRolePolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + 
s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("GetRolePolicyResult"); + s->formatter->dump_string("PolicyName", policy_name); + s->formatter->dump_string("RoleName", role_name); + s->formatter->dump_string("PolicyDocument", perm_policy); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWListRolePolicies::get_params() +{ + role_name = s->info.args.get("RoleName"); + + if (role_name.empty()) { + ldout(s->cct, 20) << "ERROR: Role name is empty"<< dendl; + return -EINVAL; + } + return 0; +} + +void RGWListRolePolicies::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + std::vector policy_names = _role.get_role_policy_names(); + s->formatter->open_object_section("ListRolePoliciesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("ListRolePoliciesResult"); + s->formatter->open_array_section("PolicyNames"); + for (const auto& it : policy_names) { + s->formatter->dump_string("member", it); + } + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); +} + +int RGWDeleteRolePolicy::get_params() +{ + role_name = s->info.args.get("RoleName"); + policy_name = s->info.args.get("PolicyName"); + + if (role_name.empty() || policy_name.empty()) { + ldout(s->cct, 20) << "ERROR: One of role name or policy name is empty"<< dendl; + return -EINVAL; + } + return 0; +} + +void RGWDeleteRolePolicy::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + op_ret = _role.delete_policy(policy_name); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_ROLE_FOUND; + } + + if (op_ret == 0) { + op_ret = _role.update(); + } + + s->formatter->open_object_section("DeleteRolePoliciesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + 
s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); +} diff --git a/src/rgw/rgw_rest_role.h b/src/rgw/rgw_rest_role.h new file mode 100644 index 00000000..24e6bba6 --- /dev/null +++ b/src/rgw/rgw_rest_role.h @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_ROLE_H +#define CEPH_RGW_REST_ROLE_H + +#include "rgw_role.h" + +class RGWRestRole : public RGWRESTOp { +protected: + string role_name; + string role_path; + string trust_policy; + string policy_name; + string perm_policy; + string path_prefix; + string max_session_duration; + RGWRole _role; +public: + int verify_permission() override; + void send_response() override; + virtual uint64_t get_op() = 0; +}; + +class RGWRoleRead : public RGWRestRole { +public: + RGWRoleRead() = default; + int check_caps(RGWUserCaps& caps) override; +}; + +class RGWRoleWrite : public RGWRestRole { +public: + RGWRoleWrite() = default; + int check_caps(RGWUserCaps& caps) override; +}; + +class RGWCreateRole : public RGWRoleWrite { +public: + RGWCreateRole() = default; + int verify_permission() override; + void execute() override; + int get_params(); + const char* name() const override { return "create_role"; } + RGWOpType get_type() override { return RGW_OP_CREATE_ROLE; } + uint64_t get_op() { return rgw::IAM::iamCreateRole; } +}; + +class RGWDeleteRole : public RGWRoleWrite { +public: + RGWDeleteRole() = default; + void execute() override; + int get_params(); + const char* name() const override { return "delete_role"; } + RGWOpType get_type() override { return RGW_OP_DELETE_ROLE; } + uint64_t get_op() { return rgw::IAM::iamDeleteRole; } +}; + +class RGWGetRole : public RGWRoleRead { + int _verify_permission(const RGWRole& role); +public: + RGWGetRole() = default; + int verify_permission() override; + void execute() override; + int get_params(); + const char* name() const 
override { return "get_role"; } + RGWOpType get_type() override { return RGW_OP_GET_ROLE; } + uint64_t get_op() { return rgw::IAM::iamGetRole; } +}; + +class RGWModifyRole : public RGWRoleWrite { +public: + RGWModifyRole() = default; + void execute() override; + int get_params(); + const char* name() const override { return "modify_role"; } + RGWOpType get_type() override { return RGW_OP_MODIFY_ROLE; } + uint64_t get_op() { return rgw::IAM::iamModifyRole; } +}; + +class RGWListRoles : public RGWRoleRead { +public: + RGWListRoles() = default; + int verify_permission() override; + void execute() override; + int get_params(); + const char* name() const override { return "list_roles"; } + RGWOpType get_type() override { return RGW_OP_LIST_ROLES; } + uint64_t get_op() { return rgw::IAM::iamListRoles; } +}; + +class RGWPutRolePolicy : public RGWRoleWrite { +public: + RGWPutRolePolicy() = default; + void execute() override; + int get_params(); + const char* name() const override { return "put_role_policy"; } + RGWOpType get_type() override { return RGW_OP_PUT_ROLE_POLICY; } + uint64_t get_op() { return rgw::IAM::iamPutRolePolicy; } +}; + +class RGWGetRolePolicy : public RGWRoleRead { +public: + RGWGetRolePolicy() = default; + void execute() override; + int get_params(); + const char* name() const override { return "get_role_policy"; } + RGWOpType get_type() override { return RGW_OP_GET_ROLE_POLICY; } + uint64_t get_op() { return rgw::IAM::iamGetRolePolicy; } +}; + +class RGWListRolePolicies : public RGWRoleRead { +public: + RGWListRolePolicies() = default; + void execute() override; + int get_params(); + const char* name() const override { return "list_role_policies"; } + RGWOpType get_type() override { return RGW_OP_LIST_ROLE_POLICIES; } + uint64_t get_op() { return rgw::IAM::iamListRolePolicies; } +}; + +class RGWDeleteRolePolicy : public RGWRoleWrite { +public: + RGWDeleteRolePolicy() = default; + void execute() override; + int get_params(); + const char* name() const 
override { return "delete_role_policy"; } + RGWOpType get_type() override { return RGW_OP_DELETE_ROLE_POLICY; } + uint64_t get_op() { return rgw::IAM::iamDeleteRolePolicy; } +}; +#endif /* CEPH_RGW_REST_ROLE_H */ + diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc new file mode 100644 index 00000000..f25890f5 --- /dev/null +++ b/src/rgw/rgw_rest_s3.cc @@ -0,0 +1,5133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include "common/ceph_crypto.h" +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" +#include "common/safe_io.h" +#include "auth/Crypto.h" +#include +#include +#include +#include + +#include + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_s3website.h" +#include "rgw_rest_pubsub.h" +#include "rgw_auth_s3.h" +#include "rgw_acl.h" +#include "rgw_policy_s3.h" +#include "rgw_user.h" +#include "rgw_cors.h" +#include "rgw_cors_s3.h" +#include "rgw_tag_s3.h" + +#include "rgw_client_io.h" + +#include "rgw_keystone.h" +#include "rgw_auth_keystone.h" +#include "rgw_auth_registry.h" + +#include "rgw_es_query.h" + +#include // for 'typeid' + +#include "rgw_ldap.h" +#include "rgw_token.h" +#include "rgw_rest_role.h" +#include "rgw_crypt.h" +#include "rgw_crypt_sanitize.h" +#include "rgw_rest_user_policy.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include "include/ceph_assert.h" +#include "rgw_role.h" +#include "rgw_rest_sts.h" +#include "rgw_rest_iam.h" +#include "rgw_sts.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace rgw; +using namespace ceph::crypto; + +using std::get; + +void list_all_buckets_start(struct req_state *s) +{ + s->formatter->open_array_section_in_ns("ListAllMyBucketsResult", XMLNS_AWS_S3); +} + +void list_all_buckets_end(struct req_state *s) +{ + s->formatter->close_section(); +} + +void dump_bucket(struct req_state *s, 
RGWBucketEnt& obj) +{ + s->formatter->open_object_section("Bucket"); + s->formatter->dump_string("Name", obj.bucket.name); + dump_time(s, "CreationDate", &obj.creation_time); + s->formatter->close_section(); +} + +void rgw_get_errno_s3(rgw_http_error *e , int err_no) +{ + rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no); + + if (r != rgw_http_s3_errors.end()) { + e->http_ret = r->second.first; + e->s3_code = r->second.second; + } else { + e->http_ret = 500; + e->s3_code = "UnknownError"; + } +} + +static inline std::string get_s3_expiration_header( + struct req_state* s, + const ceph::real_time& mtime) +{ + return rgw::lc::s3_expiration_header( + s, s->object, s->tagset, mtime, s->bucket_attrs); +} + +struct response_attr_param { + const char *param; + const char *http_attr; +}; + +static struct response_attr_param resp_attr_params[] = { + {"response-content-type", "Content-Type"}, + {"response-content-language", "Content-Language"}, + {"response-expires", "Expires"}, + {"response-cache-control", "Cache-Control"}, + {"response-content-disposition", "Content-Disposition"}, + {"response-content-encoding", "Content-Encoding"}, + {NULL, NULL}, +}; + +int RGWGetObj_ObjStore_S3Website::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + map::iterator iter; + iter = attrs.find(RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION); + if (iter != attrs.end()) { + bufferlist &bl = iter->second; + s->redirect = bl.c_str(); + s->err.http_ret = 301; + ldout(s->cct, 20) << __CEPH_ASSERT_FUNCTION << " redirecting per x-amz-website-redirect-location=" << s->redirect << dendl; + op_ret = -ERR_WEBSITE_REDIRECT; + set_req_state_err(s, op_ret); + dump_errno(s); + dump_content_length(s, 0); + dump_redirect(s, s->redirect); + end_header(s, this); + return op_ret; + } else { + return RGWGetObj_ObjStore_S3::send_response_data(bl, bl_ofs, bl_len); + } +} + +int RGWGetObj_ObjStore_S3Website::send_response_data_error() +{ + return 
RGWGetObj_ObjStore_S3::send_response_data_error(); +} + +int RGWGetObj_ObjStore_S3::get_params() +{ + // for multisite sync requests, only read the slo manifest itself, rather than + // all of the data from its parts. the parts will sync as separate objects + skip_manifest = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-manifest"); + + // multisite sync requests should fetch encrypted data, along with the + // attributes needed to support decryption on the other zone + if (s->system_request) { + skip_decrypt = s->info.args.exists(RGW_SYS_PARAM_PREFIX "skip-decrypt"); + } + + return RGWGetObj_ObjStore::get_params(); +} + +int RGWGetObj_ObjStore_S3::send_response_data_error() +{ + bufferlist bl; + return send_response_data(bl, 0 , 0); +} + +template +int decode_attr_bl_single_value(map& attrs, const char *attr_name, T *result, T def_val) +{ + map::iterator iter = attrs.find(attr_name); + if (iter == attrs.end()) { + *result = def_val; + return 0; + } + bufferlist& bl = iter->second; + if (bl.length() == 0) { + *result = def_val; + return 0; + } + auto bliter = bl.cbegin(); + try { + decode(*result, bliter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +inline bool str_has_cntrl(const std::string s) { + return std::any_of(s.begin(), s.end(), ::iscntrl); +} + +inline bool str_has_cntrl(const char* s) { + std::string _s(s); + return str_has_cntrl(_s); +} + +int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, + off_t bl_len) +{ + const char *content_type = NULL; + string content_type_str; + map response_attrs; + map::iterator riter; + bufferlist metadata_bl; + + string expires = get_s3_expiration_header(s, lastmod); + + if (sent_header) + goto send_data; + + if (custom_http_ret) { + set_req_state_err(s, 0); + dump_errno(s, custom_http_ret); + } else { + set_req_state_err(s, (partial_content && !op_ret) ? 
STATUS_PARTIAL_CONTENT + : op_ret); + dump_errno(s); + } + + if (op_ret) + goto done; + + if (range_str) + dump_range(s, start, end, s->obj_size); + + if (s->system_request && + s->info.args.exists(RGW_SYS_PARAM_PREFIX "prepend-metadata")) { + + dump_header(s, "Rgwx-Object-Size", (long long)total_len); + + if (rgwx_stat) { + /* + * in this case, we're not returning the object's content, only the prepended + * extra metadata + */ + total_len = 0; + } + + /* JSON encode object metadata */ + JSONFormatter jf; + jf.open_object_section("obj_metadata"); + encode_json("attrs", attrs, &jf); + utime_t ut(lastmod); + encode_json("mtime", ut, &jf); + jf.close_section(); + stringstream ss; + jf.flush(ss); + metadata_bl.append(ss.str()); + dump_header(s, "Rgwx-Embedded-Metadata-Len", metadata_bl.length()); + total_len += metadata_bl.length(); + } + + if (s->system_request && !real_clock::is_zero(lastmod)) { + /* we end up dumping mtime in two different methods, a bit redundant */ + dump_epoch_header(s, "Rgwx-Mtime", lastmod); + uint64_t pg_ver = 0; + int r = decode_attr_bl_single_value(attrs, RGW_ATTR_PG_VER, &pg_ver, (uint64_t)0); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl; + } + dump_header(s, "Rgwx-Obj-PG-Ver", pg_ver); + + uint32_t source_zone_short_id = 0; + r = decode_attr_bl_single_value(attrs, RGW_ATTR_SOURCE_ZONE, &source_zone_short_id, (uint32_t)0); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl; + } + if (source_zone_short_id != 0) { + dump_header(s, "Rgwx-Source-Zone-Short-Id", source_zone_short_id); + } + } + + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + dump_header_if_nonempty(s, "x-amz-expiration", expires); + + if (attrs.find(RGW_ATTR_APPEND_PART_NUM) != attrs.end()) { + dump_header(s, 
"x-rgw-object-type", "Appendable"); + dump_header(s, "x-rgw-next-append-position", s->obj_size); + } else { + dump_header(s, "x-rgw-object-type", "Normal"); + } + + if (! op_ret) { + if (! lo_etag.empty()) { + /* Handle etag of Swift API's large objects (DLO/SLO). It's entirerly + * legit to perform GET on them through S3 API. In such situation, + * a client should receive the composited content with corresponding + * etag value. */ + dump_etag(s, lo_etag); + } else { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + dump_etag(s, iter->second.to_str()); + } + } + + for (struct response_attr_param *p = resp_attr_params; p->param; p++) { + bool exists; + string val = s->info.args.get(p->param, &exists); + if (exists) { + /* reject unauthenticated response header manipulation, see + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */ + if (s->auth.identity->is_anonymous()) { + return -ERR_INVALID_REQUEST; + } + /* HTTP specification says no control characters should be present in + * header values: https://tools.ietf.org/html/rfc7230#section-3.2 + * field-vchar = VCHAR / obs-text + * + * Failure to validate this permits a CRLF injection in HTTP headers, + * whereas S3 GetObject only permits specific headers. + */ + if(str_has_cntrl(val)) { + /* TODO: return a more distinct error in future; + * stating what the problem is */ + return -ERR_INVALID_REQUEST; + } + + if (strcmp(p->param, "response-content-type") != 0) { + response_attrs[p->http_attr] = val; + } else { + content_type_str = val; + content_type = content_type_str.c_str(); + } + } + } + + for (auto iter = attrs.begin(); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::iterator aiter = rgw_to_http_attrs.find(name); + if (aiter != rgw_to_http_attrs.end()) { + if (response_attrs.count(aiter->second) == 0) { + /* Was not already overridden by a response param. 
*/ + + size_t len = iter->second.length(); + string s(iter->second.c_str(), len); + while (len && !s[len - 1]) { + --len; + s.resize(len); + } + response_attrs[aiter->second] = s; + } + } else if (iter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) { + /* Special handling for content_type. */ + if (!content_type) { + content_type_str = rgw_bl_str(iter->second); + content_type = content_type_str.c_str(); + } + } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) { + // this attr has an extra length prefix from encode() in prior versions + dump_header(s, "X-Object-Meta-Static-Large-Object", "True"); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, + sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + /* User custom metadata. */ + name += sizeof(RGW_ATTR_PREFIX) - 1; + dump_header(s, name, iter->second); + } else if (iter->first.compare(RGW_ATTR_TAGS) == 0) { + RGWObjTags obj_tags; + try{ + auto it = iter->second.cbegin(); + obj_tags.decode(it); + } catch (buffer::error &err) { + ldout(s->cct,0) << "Error caught buffer::error couldn't decode TagSet " << dendl; + } + dump_header(s, RGW_AMZ_TAG_COUNT, obj_tags.count()); + } else if (iter->first.compare(RGW_ATTR_OBJECT_RETENTION) == 0 && get_retention){ + RGWObjectRetention retention; + try { + decode(retention, iter->second); + dump_header(s, "x-amz-object-lock-mode", retention.get_mode()); + dump_time_header(s, "x-amz-object-lock-retain-until-date", retention.get_retain_until_date()); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl; + } + } else if (iter->first.compare(RGW_ATTR_OBJECT_LEGAL_HOLD) == 0 && get_legal_hold) { + RGWObjectLegalHold legal_hold; + try { + decode(legal_hold, iter->second); + dump_header(s, "x-amz-object-lock-legal-hold",legal_hold.get_status()); + } catch (buffer::error& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl; + } + } + } + } + +done: + for (riter = response_attrs.begin(); riter != 
response_attrs.end(); + ++riter) { + dump_header(s, riter->first, riter->second); + } + + if (op_ret == -ERR_NOT_MODIFIED) { + end_header(s, this); + } else { + if (!content_type) + content_type = "binary/octet-stream"; + + end_header(s, this, content_type); + } + + if (metadata_bl.length()) { + dump_body(s, metadata_bl); + } + sent_header = true; + +send_data: + if (get_data && !op_ret) { + int r = dump_body(s, bl.c_str() + bl_ofs, bl_len); + if (r < 0) + return r; + } + + return 0; +} + +int RGWGetObj_ObjStore_S3::get_decrypt_filter(std::unique_ptr *filter, RGWGetObj_Filter* cb, bufferlist* manifest_bl) +{ + if (skip_decrypt) { // bypass decryption for multisite sync requests + return 0; + } + + int res = 0; + std::unique_ptr block_crypt; + res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses); + if (res == 0) { + if (block_crypt != nullptr) { + auto f = std::make_unique(s->cct, cb, std::move(block_crypt)); + if (manifest_bl != nullptr) { + res = f->read_manifest(*manifest_bl); + if (res == 0) { + *filter = std::move(f); + } + } + } + } + return res; +} + +void RGWGetObjTags_ObjStore_S3::send_response_data(bufferlist& bl) +{ + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + s->formatter->open_object_section_in_ns("Tagging", XMLNS_AWS_S3); + s->formatter->open_object_section("TagSet"); + if (has_tags){ + RGWObjTagSet_S3 tagset; + auto iter = bl.cbegin(); + try { + tagset.decode(iter); + } catch (buffer::error& err) { + ldout(s->cct,0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl; + op_ret= -EIO; + return; + } + tagset.dump_xml(s->formatter); + } + s->formatter->close_section(); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +int RGWPutObjTags_ObjStore_S3::get_params() +{ + RGWXMLParser parser; + + if (!parser.init()){ + return -EINVAL; + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + bufferlist data; + std::tie(r, 
data) = rgw_rest_read_all_input(s, max_size, false); + + if (r < 0) + return r; + + if (!parser.parse(data.c_str(), data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + + RGWObjTagging_S3 tagging; + + try { + RGWXMLDecoder::decode_xml("Tagging", tagging, &parser); + } catch (RGWXMLDecoder::err& err) { + ldout(s->cct, 5) << "Malformed tagging request: " << err << dendl; + return -ERR_MALFORMED_XML; + } + + RGWObjTags obj_tags; + r = tagging.rebuild(obj_tags); + if (r < 0) + return r; + + obj_tags.encode(tags_bl); + ldout(s->cct, 20) << "Read " << obj_tags.count() << "tags" << dendl; + + return 0; +} + +void RGWPutObjTags_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + +} + +void RGWDeleteObjTags_ObjStore_S3::send_response() +{ + int r = op_ret; + if (r == -ENOENT) + r = 0; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} + +void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets) +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + dump_start(s); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, NULL, "application/xml", CHUNKED_TRANSFER_ENCODING); + + if (! 
op_ret) { + list_all_buckets_start(s); + dump_owner(s, s->user->user_id, s->user->display_name); + s->formatter->open_array_section("Buckets"); + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_S3::send_response_data(RGWUserBuckets& buckets) +{ + if (!sent_data) + return; + + map& m = buckets.get_buckets(); + map::iterator iter; + + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt obj = iter->second; + dump_bucket(s, obj); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWListBuckets_ObjStore_S3::send_response_end() +{ + if (sent_data) { + s->formatter->close_section(); + list_all_buckets_end(s); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWGetUsage_ObjStore_S3::get_params() +{ + start_date = s->info.args.get("start-date"); + end_date = s->info.args.get("end-date"); + return 0; +} + +static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log_entry& entry, map *categories) +{ + formatter->open_array_section("categories"); + map::const_iterator uiter; + for (uiter = entry.usage_map.begin(); uiter != entry.usage_map.end(); ++uiter) { + if (categories && !categories->empty() && !categories->count(uiter->first)) + continue; + const rgw_usage_data& usage = uiter->second; + formatter->open_object_section("Entry"); + formatter->dump_string("Category", uiter->first); + formatter->dump_int("BytesSent", usage.bytes_sent); + formatter->dump_int("BytesReceived", usage.bytes_received); + formatter->dump_int("Ops", usage.ops); + formatter->dump_int("SuccessfulOps", usage.successful_ops); + formatter->close_section(); // Entry + } + formatter->close_section(); // Category +} + +static void dump_usage_bucket_info(Formatter *formatter, const std::string& name, const cls_user_bucket_entry& entry) +{ + formatter->open_object_section("Entry"); + formatter->dump_string("Bucket", name); + formatter->dump_int("Bytes", entry.size); + formatter->dump_int("Bytes_Rounded", entry.size_rounded); + 
formatter->close_section(); // entry +} + +void RGWGetUsage_ObjStore_S3::send_response() +{ + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) + return; + + Formatter *formatter = s->formatter; + string last_owner; + bool user_section_open = false; + + formatter->open_object_section("Usage"); + if (show_log_entries) { + formatter->open_array_section("Entries"); + } + map::iterator iter; + for (iter = usage.begin(); iter != usage.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + const rgw_usage_log_entry& entry = iter->second; + + if (show_log_entries) { + if (ub.user.compare(last_owner) != 0) { + if (user_section_open) { + formatter->close_section(); + formatter->close_section(); + } + formatter->open_object_section("User"); + formatter->dump_string("Owner", ub.user); + formatter->open_array_section("Buckets"); + user_section_open = true; + last_owner = ub.user; + } + formatter->open_object_section("Bucket"); + formatter->dump_string("Bucket", ub.bucket); + utime_t ut(entry.epoch, 0); + ut.gmtime(formatter->dump_stream("Time")); + formatter->dump_int("Epoch", entry.epoch); + dump_usage_categories_info(formatter, entry, &categories); + formatter->close_section(); // bucket + } + + summary_map[ub.user].aggregate(entry, &categories); + } + + if (show_log_entries) { + if (user_section_open) { + formatter->close_section(); // buckets + formatter->close_section(); //user + } + formatter->close_section(); // entries + } + + if (show_log_sum) { + formatter->open_array_section("Summary"); + map::iterator siter; + for (siter = summary_map.begin(); siter != summary_map.end(); ++siter) { + const rgw_usage_log_entry& entry = siter->second; + formatter->open_object_section("User"); + 
formatter->dump_string("User", siter->first); + dump_usage_categories_info(formatter, entry, &categories); + rgw_usage_data total_usage; + entry.sum(total_usage, categories); + formatter->open_object_section("Total"); + formatter->dump_int("BytesSent", total_usage.bytes_sent); + formatter->dump_int("BytesReceived", total_usage.bytes_received); + formatter->dump_int("Ops", total_usage.ops); + formatter->dump_int("SuccessfulOps", total_usage.successful_ops); + formatter->close_section(); // total + formatter->close_section(); // user + } + + if (s->cct->_conf->rgw_rest_getusage_op_compat) { + formatter->open_object_section("Stats"); + } + + formatter->dump_int("TotalBytes", header.stats.total_bytes); + formatter->dump_int("TotalBytesRounded", header.stats.total_bytes_rounded); + formatter->dump_int("TotalEntries", header.stats.total_entries); + + if (s->cct->_conf->rgw_rest_getusage_op_compat) { + formatter->close_section(); //Stats + } + + formatter->close_section(); // summary + } + + formatter->open_array_section("CapacityUsed"); + formatter->open_object_section("User"); + formatter->open_array_section("Buckets"); + for (const auto& biter : buckets_usage) { + const cls_user_bucket_entry& entry = biter.second; + dump_usage_bucket_info(formatter, biter.first, entry); + } + formatter->close_section(); // Buckets + formatter->close_section(); // User + formatter->close_section(); // CapacityUsed + + formatter->close_section(); // usage + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWListBucket_ObjStore_S3::get_common_params() +{ + list_versions = s->info.args.exists("versions"); + prefix = s->info.args.get("prefix"); + + // non-standard + s->info.args.get_bool("allow-unordered", &allow_unordered, false); + delimiter = s->info.args.get("delimiter"); + max_keys = s->info.args.get("max-keys"); + op_ret = parse_max_keys(); + if (op_ret < 0) { + return op_ret; + } + encoding_type = s->info.args.get("encoding-type"); + if (s->system_request) { + 
s->info.args.get_bool("objs-container", &objs_container, false); + const char *shard_id_str = s->info.env->get("HTTP_RGWX_SHARD_ID"); + if (shard_id_str) { + string err; + shard_id = strict_strtol(shard_id_str, 10, &err); + if (!err.empty()) { + ldout(s->cct, 5) << "bad shard id specified: " << shard_id_str << dendl; + return -EINVAL; + } + } else { + shard_id = s->bucket_instance_shard_id; + } + } + return 0; +} + +int RGWListBucket_ObjStore_S3::get_params() +{ + int ret = get_common_params(); + if (ret < 0) { + return ret; + } + if (!list_versions) { + marker = s->info.args.get("marker"); + } else { + marker.name = s->info.args.get("key-marker"); + marker.instance = s->info.args.get("version-id-marker"); + } + return 0; +} + +int RGWListBucket_ObjStore_S3v2::get_params() +{ +int ret = get_common_params(); +if (ret < 0) { + return ret; +} +s->info.args.get_bool("fetch-owner", &fetchOwner, false); +startAfter = s->info.args.get("start-after", &start_after_exist); +continuation_token = s->info.args.get("continuation-token", &continuation_token_exist); +if(!continuation_token_exist) { + marker = startAfter; +} else { + marker = continuation_token; +} +return 0; +} + +void RGWListBucket_ObjStore_S3::send_common_versioned_response() +{ + if (!s->bucket_tenant.empty()) { + s->formatter->dump_string("Tenant", s->bucket_tenant); + } + s->formatter->dump_string("Name", s->bucket_name); + s->formatter->dump_string("Prefix", prefix); + s->formatter->dump_int("MaxKeys", max); + if (!delimiter.empty()) { + s->formatter->dump_string("Delimiter", delimiter); + } + s->formatter->dump_string("IsTruncated", (max && is_truncated ? 
"true" + : "false")); + + if (!common_prefixes.empty()) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + if (encode_key) { + s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false)); + } else { + s->formatter->dump_string("Prefix", pref_iter->first); + } + + s->formatter->close_section(); + } + } + } + +void RGWListBucket_ObjStore_S3::send_versioned_response() +{ + s->formatter->open_object_section_in_ns("ListVersionsResult", XMLNS_AWS_S3); + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + RGWListBucket_ObjStore_S3::send_common_versioned_response(); + s->formatter->dump_string("KeyMarker", marker.name); + s->formatter->dump_string("VersionIdMarker", marker.instance); + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextKeyMarker", next_marker.name); + if (next_marker.instance.empty()) { + s->formatter->dump_string("NextVersionIdMarker", "null"); + } + else { + s->formatter->dump_string("NextVersionIdMarker", next_marker.instance); + } + } + + if (op_ret >= 0) { + if (objs_container) { + s->formatter->open_array_section("Entries"); + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + const char *section_name = (iter->is_delete_marker() ? 
"DeleteMarker" + : "Version"); + s->formatter->open_object_section(section_name); + if (objs_container) { + s->formatter->dump_bool("IsDeleteMarker", iter->is_delete_marker()); + } + rgw_obj_key key(iter->key); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } + else { + s->formatter->dump_string("Key", key.name); + } + string version_id = key.instance; + if (version_id.empty()) { + version_id = "null"; + } + if (s->system_request) { + if (iter->versioned_epoch > 0) { + s->formatter->dump_int("VersionedEpoch", iter->versioned_epoch); + } + s->formatter->dump_string("RgwxTag", iter->tag); + utime_t ut(iter->meta.mtime); + ut.gmtime_nsec(s->formatter->dump_stream("RgwxMtime")); + } + s->formatter->dump_string("VersionId", version_id); + s->formatter->dump_bool("IsLatest", iter->is_current()); + dump_time(s, "LastModified", &iter->meta.mtime); + if (!iter->is_delete_marker()) { + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + } + dump_owner(s, iter->meta.owner, iter->meta.owner_display_name); + if (iter->meta.appendable) { + s->formatter->dump_string("Type", "Appendable"); + } else { + s->formatter->dump_string("Type", "Normal"); + } + s->formatter->close_section(); // Version/DeleteMarker + } + if (objs_container) { + s->formatter->close_section(); // Entries + } + s->formatter->close_section(); // ListVersionsResult + } + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +void RGWListBucket_ObjStore_S3::send_common_response() +{ + if (!s->bucket_tenant.empty()) { + s->formatter->dump_string("Tenant", s->bucket_tenant); + } + s->formatter->dump_string("Name", s->bucket_name); + s->formatter->dump_string("Prefix", prefix); + 
s->formatter->dump_int("MaxKeys", max); + if (!delimiter.empty()) { + s->formatter->dump_string("Delimiter", delimiter); + } + s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" + : "false")); + + if (!common_prefixes.empty()) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + if (encode_key) { + s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false)); + } else { + s->formatter->dump_string("Prefix", pref_iter->first); + } + s->formatter->close_section(); + } + } + } + +void RGWListBucket_ObjStore_S3::send_response() +{ + if (op_ret < 0) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) { + return; + } + if (list_versions) { + send_versioned_response(); + return; + } + + s->formatter->open_object_section_in_ns("ListBucketResult", XMLNS_AWS_S3); + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + RGWListBucket_ObjStore_S3::send_common_response(); + if (op_ret >= 0) { + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + rgw_obj_key key(iter->key); + s->formatter->open_array_section("Contents"); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } else { + s->formatter->dump_string("Key", key.name); + } + dump_time(s, "LastModified", &iter->meta.mtime); + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = 
rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + dump_owner(s, iter->meta.owner, iter->meta.owner_display_name); + if (s->system_request) { + s->formatter->dump_string("RgwxTag", iter->tag); + } + if (iter->meta.appendable) { + s->formatter->dump_string("Type", "Appendable"); + } else { + s->formatter->dump_string("Type", "Normal"); + } + s->formatter->close_section(); + } + } + s->formatter->dump_string("Marker", marker.name); + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextMarker", next_marker.name); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWListBucket_ObjStore_S3v2::send_versioned_response() +{ + s->formatter->open_object_section_in_ns("ListVersionsResult", XMLNS_AWS_S3); + RGWListBucket_ObjStore_S3v2::send_common_versioned_response(); + s->formatter->dump_string("KeyContinuationToken", marker.name); + s->formatter->dump_string("VersionIdContinuationToken", marker.instance); + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextKeyContinuationToken", next_marker.name); + s->formatter->dump_string("NextVersionIdContinuationToken", next_marker.instance); + } + + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + + if (op_ret >= 0) { + if (objs_container) { + s->formatter->open_array_section("Entries"); + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + const char *section_name = (iter->is_delete_marker() ? 
"DeleteContinuationToken" + : "Version"); + s->formatter->open_object_section(section_name); + if (objs_container) { + s->formatter->dump_bool("IsDeleteContinuationToken", iter->is_delete_marker()); + } + rgw_obj_key key(iter->key); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } + else { + s->formatter->dump_string("Key", key.name); + } + string version_id = key.instance; + if (version_id.empty()) { + version_id = "null"; + } + if (s->system_request) { + if (iter->versioned_epoch > 0) { + s->formatter->dump_int("VersionedEpoch", iter->versioned_epoch); + } + s->formatter->dump_string("RgwxTag", iter->tag); + utime_t ut(iter->meta.mtime); + ut.gmtime_nsec(s->formatter->dump_stream("RgwxMtime")); + } + s->formatter->dump_string("VersionId", version_id); + s->formatter->dump_bool("IsLatest", iter->is_current()); + dump_time(s, "LastModified", &iter->meta.mtime); + if (!iter->is_delete_marker()) { + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + } + if (fetchOwner == true) { + dump_owner(s, s->user->user_id, s->user->display_name); + } + s->formatter->close_section(); + } + + + if (objs_container) { + s->formatter->close_section(); + } + + if (!common_prefixes.empty()) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + if (encode_key) { + s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false)); + } else { + s->formatter->dump_string("Prefix", pref_iter->first); + } + + s->formatter->dump_int("KeyCount",objs.size()); + if (start_after_exist) { + s->formatter->dump_string("StartAfter", startAfter); + } 
+ s->formatter->close_section(); + } + } + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWListBucket_ObjStore_S3v2::send_response() +{ + if (op_ret < 0) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) { + return; + } + if (list_versions) { + send_versioned_response(); + return; + } + + s->formatter->open_object_section_in_ns("ListBucketResult", XMLNS_AWS_S3); + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); + encode_key = true; + } + + RGWListBucket_ObjStore_S3::send_common_response(); + if (op_ret >= 0) { + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + rgw_obj_key key(iter->key); + s->formatter->open_array_section("Contents"); + if (encode_key) { + string key_name; + url_encode(key.name, key_name); + s->formatter->dump_string("Key", key_name); + } + else { + s->formatter->dump_string("Key", key.name); + } + dump_time(s, "LastModified", &iter->meta.mtime); + s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str()); + s->formatter->dump_int("Size", iter->meta.accounted_size); + auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); + s->formatter->dump_string("StorageClass", storage_class.c_str()); + if (fetchOwner == true) { + dump_owner(s, s->user->user_id, s->user->display_name); + } + if (s->system_request) { + s->formatter->dump_string("RgwxTag", iter->tag); + } + if (iter->meta.appendable) { + s->formatter->dump_string("Type", "Appendable"); + } else { + s->formatter->dump_string("Type", "Normal"); + } + s->formatter->close_section(); + } + } + if (continuation_token_exist) { + 
s->formatter->dump_string("ContinuationToken", continuation_token); + } + if (is_truncated && !next_marker.empty()) { + s->formatter->dump_string("NextContinuationToken", next_marker.name); + } + s->formatter->dump_int("KeyCount", objs.size() + common_prefixes.size()); + if (start_after_exist) { + s->formatter->dump_string("StartAfter", startAfter); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketLogging_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + s->formatter->open_object_section_in_ns("BucketLoggingStatus", XMLNS_AWS_S3); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketLocation_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this); + dump_start(s); + + RGWZoneGroup zonegroup; + string api_name; + + int ret = store->svc.zone->get_zonegroup(s->bucket_info.zonegroup, zonegroup); + if (ret >= 0) { + api_name = zonegroup.api_name; + } else { + if (s->bucket_info.zonegroup != "default") { + api_name = s->bucket_info.zonegroup; + } + } + + s->formatter->dump_format_ns("LocationConstraint", XMLNS_AWS_S3, + "%s", api_name.c_str()); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketVersioning_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + s->formatter->open_object_section_in_ns("VersioningConfiguration", XMLNS_AWS_S3); + if (versioned) { + const char *status = (versioning_enabled ? "Enabled" : "Suspended"); + s->formatter->dump_string("Status", status); + const char *mfa_status = (mfa_enabled ? 
"Enabled" : "Disabled"); + s->formatter->dump_string("MfaDelete", mfa_status); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +struct ver_config_status { + int status{VersioningSuspended}; + + enum MFAStatus { + MFA_UNKNOWN, + MFA_DISABLED, + MFA_ENABLED, + } mfa_status{MFA_UNKNOWN}; + int retcode{0}; + + void decode_xml(XMLObj *obj) { + string status_str; + string mfa_str; + RGWXMLDecoder::decode_xml("Status", status_str, obj); + if (status_str == "Enabled") { + status = VersioningEnabled; + } else if (status_str != "Suspended") { + status = VersioningStatusInvalid; + } + + + if (RGWXMLDecoder::decode_xml("MfaDelete", mfa_str, obj)) { + if (mfa_str == "Enabled") { + mfa_status = MFA_ENABLED; + } else if (mfa_str == "Disabled") { + mfa_status = MFA_DISABLED; + } else { + retcode = -EINVAL; + } + } + } +}; + +int RGWSetBucketVersioning_ObjStore_S3::get_params() +{ + int r = 0; + bufferlist data; + std::tie(r, data) = + rgw_rest_read_all_input(s, s->cct->_conf->rgw_max_put_param_size, false); + if (r < 0) { + return r; + } + + r = do_aws4_auth_completion(); + if (r < 0) { + return r; + } + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = data.c_str(); + if (!parser.parse(buf, data.length(), 1)) { + ldout(s->cct, 10) << "NOTICE: failed to parse data: " << buf << dendl; + r = -EINVAL; + return r; + } + + ver_config_status status_conf; + + if (!RGWXMLDecoder::decode_xml("VersioningConfiguration", status_conf, &parser)) { + ldout(s->cct, 10) << "NOTICE: bad versioning config input" << dendl; + return -EINVAL; + } + + if (!store->svc.zone->is_meta_master()) { + /* only need to keep this data around if we're not meta master */ + in_data.append(data); + } + + versioning_status = status_conf.status; + if (versioning_status == VersioningStatusInvalid) { + r = -EINVAL; + } + + if (status_conf.mfa_status != 
ver_config_status::MFA_UNKNOWN) { + mfa_set_status = true; + switch (status_conf.mfa_status) { + case ver_config_status::MFA_DISABLED: + mfa_status = false; + break; + case ver_config_status::MFA_ENABLED: + mfa_status = true; + break; + default: + ldout(s->cct, 0) << "ERROR: RGWSetBucketVersioning_ObjStore_S3::get_params(): unexpected switch case mfa_status=" << status_conf.mfa_status << dendl; + r = -EIO; + } + } else if (status_conf.retcode < 0) { + r = status_conf.retcode; + } + return r; +} + +void RGWSetBucketVersioning_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); +} + +int RGWSetBucketWebsite_ObjStore_S3::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, max_size, false); + + if (r < 0) { + return r; + } + + r = do_aws4_auth_completion(); + if (r < 0) { + return r; + } + + in_data.append(data); + + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = data.c_str(); + if (!parser.parse(buf, data.length(), 1)) { + ldout(s->cct, 5) << "failed to parse xml: " << buf << dendl; + return -EINVAL; + } + + try { + RGWXMLDecoder::decode_xml("WebsiteConfiguration", website_conf, &parser, true); + } catch (RGWXMLDecoder::err& err) { + ldout(s->cct, 5) << "unexpected xml: " << buf << dendl; + return -EINVAL; + } + + if (website_conf.is_redirect_all && website_conf.redirect_all.hostname.empty()) { + s->err.message = "A host name must be provided to redirect all requests (e.g. 
\"example.com\")."; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } else if (!website_conf.is_redirect_all && !website_conf.is_set_index_doc) { + s->err.message = "A value for IndexDocument Suffix must be provided if RedirectAllRequestsTo is empty"; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } else if (!website_conf.is_redirect_all && website_conf.is_set_index_doc && + website_conf.index_doc_suffix.empty()) { + s->err.message = "The IndexDocument Suffix is not well formed"; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + +#define WEBSITE_ROUTING_RULES_MAX_NUM 50 + int max_num = s->cct->_conf->rgw_website_routing_rules_max_num; + if (max_num < 0) { + max_num = WEBSITE_ROUTING_RULES_MAX_NUM; + } + int routing_rules_num = website_conf.routing_rules.rules.size(); + if (routing_rules_num > max_num) { + ldout(s->cct, 4) << "An website routing config can have up to " + << max_num + << " rules, request website routing rules num: " + << routing_rules_num << dendl; + op_ret = -ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR; + s->err.message = std::to_string(routing_rules_num) +" routing rules provided, the number of routing rules in a website configuration is limited to " + + std::to_string(max_num) + + "."; + return -ERR_INVALID_REQUEST; + } + + return 0; +} + +void RGWSetBucketWebsite_ObjStore_S3::send_response() +{ + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); +} + +void RGWDeleteBucketWebsite_ObjStore_S3::send_response() +{ + if (op_ret == 0) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); +} + +void RGWGetBucketWebsite_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + if (op_ret < 0) { + return; + } + + RGWBucketWebsiteConf& conf = s->bucket_info.website_conf; + + 
s->formatter->open_object_section_in_ns("WebsiteConfiguration", XMLNS_AWS_S3); + conf.dump_xml(s->formatter); + s->formatter->close_section(); // WebsiteConfiguration + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void dump_bucket_metadata(struct req_state *s, RGWBucketEnt& bucket) +{ + dump_header(s, "X-RGW-Object-Count", static_cast(bucket.count)); + dump_header(s, "X-RGW-Bytes-Used", static_cast(bucket.size)); +} + +void RGWStatBucket_ObjStore_S3::send_response() +{ + if (op_ret >= 0) { + dump_bucket_metadata(s, bucket); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, this); + dump_start(s); +} + +static int create_s3_policy(struct req_state *s, RGWRados *store, + RGWAccessControlPolicy_S3& s3policy, + ACLOwner& owner) +{ + if (s->has_acl_header) { + if (!s->canned_acl.empty()) + return -ERR_INVALID_REQUEST; + + return s3policy.create_from_headers(store, s->info.env, owner); + } + + return s3policy.create_canned(owner, s->bucket_owner, s->canned_acl); +} + +class RGWLocationConstraint : public XMLObj +{ +public: + RGWLocationConstraint() {} + ~RGWLocationConstraint() override {} + bool xml_end(const char *el) override { + if (!el) + return false; + + location_constraint = get_data(); + + return true; + } + + string location_constraint; +}; + +class RGWCreateBucketConfig : public XMLObj +{ +public: + RGWCreateBucketConfig() {} + ~RGWCreateBucketConfig() override {} +}; + +class RGWCreateBucketParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override { + return new XMLObj; + } + +public: + RGWCreateBucketParser() {} + ~RGWCreateBucketParser() override {} + + bool get_location_constraint(string& zone_group) { + XMLObj *config = find_first("CreateBucketConfiguration"); + if (!config) + return false; + + XMLObj *constraint = config->find_first("LocationConstraint"); + if (!constraint) + return false; + + zone_group = constraint->get_data(); + + return true; + } +}; + +int 
RGWCreateBucket_ObjStore_S3::get_params() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + int r = create_s3_policy(s, store, s3policy, s->owner); + if (r < 0) + return r; + + policy = s3policy; + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int op_ret = 0; + bufferlist data; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + + if ((op_ret < 0) && (op_ret != -ERR_LENGTH_REQUIRED)) + return op_ret; + + const int auth_ret = do_aws4_auth_completion(); + if (auth_ret < 0) { + return auth_ret; + } + + in_data.append(data); + + if (data.length()) { + RGWCreateBucketParser parser; + + if (!parser.init()) { + ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = data.c_str(); + bool success = parser.parse(buf, data.length(), 1); + ldout(s->cct, 20) << "create bucket input data=" << buf << dendl; + + if (!success) { + ldout(s->cct, 0) << "failed to parse input: " << buf << dendl; + return -EINVAL; + } + + if (!parser.get_location_constraint(location_constraint)) { + ldout(s->cct, 0) << "provided input did not specify location constraint correctly" << dendl; + return -EINVAL; + } + + ldout(s->cct, 10) << "create bucket location constraint: " + << location_constraint << dendl; + } + + size_t pos = location_constraint.find(':'); + if (pos != string::npos) { + placement_rule.init(location_constraint.substr(pos + 1), s->info.storage_class); + location_constraint = location_constraint.substr(0, pos); + } else { + placement_rule.storage_class = s->info.storage_class; + } + auto iter = s->info.x_meta_map.find("x-amz-bucket-object-lock-enabled"); + if (iter != s->info.x_meta_map.end()) { + if (!boost::algorithm::iequals(iter->second, "true") && !boost::algorithm::iequals(iter->second, "false")) { + return -EINVAL; + } + obj_lock_enabled = boost::algorithm::iequals(iter->second, "true"); + } + return 0; +} + +void RGWCreateBucket_ObjStore_S3::send_response() +{ + if (op_ret == 
-ERR_BUCKET_EXISTS) + op_ret = 0; + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + if (s->system_request) { + JSONFormatter f; /* use json formatter for system requests output */ + + f.open_object_section("info"); + encode_json("entry_point_object_ver", ep_objv, &f); + encode_json("object_ver", info.objv_tracker.read_version, &f); + encode_json("bucket_info", info, &f); + f.close_section(); + rgw_flush_formatter_and_reset(s, &f); + } +} + +void RGWDeleteBucket_ObjStore_S3::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} + +static inline void map_qs_metadata(struct req_state* s) +{ + /* merge S3 valid user metadata from the query-string into + * x_meta_map, which maps them to attributes */ + const auto& params = const_cast(s->info.args).get_params(); + for (const auto& elt : params) { + std::string k = boost::algorithm::to_lower_copy(elt.first); + if (k.find("x-amz-meta-") == /* offset */ 0) { + add_amz_meta_header(s->info.x_meta_map, k, elt.second); + } + } +} + +int RGWPutObj_ObjStore_S3::get_params() +{ + if (!s->length) + return -ERR_LENGTH_REQUIRED; + + map src_attrs; + size_t pos; + int ret; + + map_qs_metadata(s); + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ret = create_s3_policy(s, store, s3policy, s->owner); + if (ret < 0) + return ret; + + policy = s3policy; + + if_match = s->info.env->get("HTTP_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_IF_NONE_MATCH"); + copy_source = url_decode(s->info.env->get("HTTP_X_AMZ_COPY_SOURCE", "")); + copy_source_range = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE"); + + /* handle x-amz-copy-source */ + boost::string_view cs_view(copy_source); + if (! 
cs_view.empty()) { + if (cs_view[0] == '/') + cs_view.remove_prefix(1); + copy_source_bucket_name = cs_view.to_string(); + pos = copy_source_bucket_name.find("/"); + if (pos == std::string::npos) { + ret = -EINVAL; + ldout(s->cct, 5) << "x-amz-copy-source bad format" << dendl; + return ret; + } + copy_source_object_name = + copy_source_bucket_name.substr(pos + 1, copy_source_bucket_name.size()); + copy_source_bucket_name = copy_source_bucket_name.substr(0, pos); +#define VERSION_ID_STR "?versionId=" + pos = copy_source_object_name.find(VERSION_ID_STR); + if (pos == std::string::npos) { + copy_source_object_name = url_decode(copy_source_object_name); + } else { + copy_source_version_id = + copy_source_object_name.substr(pos + sizeof(VERSION_ID_STR) - 1); + copy_source_object_name = + url_decode(copy_source_object_name.substr(0, pos)); + } + pos = copy_source_bucket_name.find(":"); + if (pos == std::string::npos) { + copy_source_tenant_name = s->src_tenant_name; + } else { + copy_source_tenant_name = copy_source_bucket_name.substr(0, pos); + copy_source_bucket_name = copy_source_bucket_name.substr(pos + 1, copy_source_bucket_name.size()); + if (copy_source_bucket_name.empty()) { + ret = -EINVAL; + ldout(s->cct, 5) << "source bucket name is empty" << dendl; + return ret; + } + } + ret = store->get_bucket_info(*s->sysobj_ctx, + copy_source_tenant_name, + copy_source_bucket_name, + copy_source_bucket_info, + NULL, &src_attrs); + if (ret < 0) { + ldout(s->cct, 5) << __func__ << "(): get_bucket_info() returned ret=" << ret << dendl; + return ret; + } + + /* handle x-amz-copy-source-range */ + + if (copy_source_range) { + string range = copy_source_range; + pos = range.find("bytes="); + if (pos == std::string::npos || pos != 0) { + ret = -EINVAL; + ldout(s->cct, 5) << "x-amz-copy-source-range bad format" << dendl; + return ret; + } + /* 6 is the length of "bytes=" */ + range = range.substr(pos + 6); + pos = range.find("-"); + if (pos == std::string::npos) { + ret = 
-EINVAL; + ldout(s->cct, 5) << "x-amz-copy-source-range bad format" << dendl; + return ret; + } + string first = range.substr(0, pos); + string last = range.substr(pos + 1); + if (first.find_first_not_of("0123456789") != std::string::npos || last.find_first_not_of("0123456789") != std::string::npos) + { + ldpp_dout(this, 5) << "x-amz-copy-source-range bad format not an integer" << dendl; + ret = -EINVAL; + return ret; + } + copy_source_range_fst = strtoull(first.c_str(), NULL, 10); + copy_source_range_lst = strtoull(last.c_str(), NULL, 10); + if (copy_source_range_fst > copy_source_range_lst) + { + ret = -ERANGE; + ldpp_dout(this, 5) << "x-amz-copy-source-range bad format first number bigger than second" << dendl; + return ret; + } + } + + } /* copy_source */ + + /* handle object tagging */ + auto tag_str = s->info.env->get("HTTP_X_AMZ_TAGGING"); + if (tag_str){ + obj_tags = std::make_unique(); + ret = obj_tags->set_from_string(tag_str); + if (ret < 0){ + ldout(s->cct,0) << "setting obj tags failed with " << ret << dendl; + if (ret == -ERR_INVALID_TAG){ + ret = -EINVAL; //s3 returns only -EINVAL for PUT requests + } + + return ret; + } + } + + //handle object lock + auto obj_lock_mode_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_MODE"); + auto obj_lock_date_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_RETAIN_UNTIL_DATE"); + auto obj_legal_hold_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_LEGAL_HOLD"); + if (obj_lock_mode_str && obj_lock_date_str) { + boost::optional date = ceph::from_iso_8601(obj_lock_date_str); + if (boost::none == date || ceph::real_clock::to_time_t(*date) <= ceph_clock_now()) { + ret = -EINVAL; + ldpp_dout(this,0) << "invalid x-amz-object-lock-retain-until-date value" << dendl; + return ret; + } + if (strcmp(obj_lock_mode_str, "GOVERNANCE") != 0 && strcmp(obj_lock_mode_str, "COMPLIANCE") != 0) { + ret = -EINVAL; + ldpp_dout(this,0) << "invalid x-amz-object-lock-mode value" << dendl; + return ret; + } + obj_retention = new 
RGWObjectRetention(obj_lock_mode_str, *date); + } else if ((obj_lock_mode_str && !obj_lock_date_str) || (!obj_lock_mode_str && obj_lock_date_str)) { + ret = -EINVAL; + ldpp_dout(this,0) << "need both x-amz-object-lock-mode and x-amz-object-lock-retain-until-date " << dendl; + return ret; + } + if (obj_legal_hold_str) { + if (strcmp(obj_legal_hold_str, "ON") != 0 && strcmp(obj_legal_hold_str, "OFF") != 0) { + ret = -EINVAL; + ldpp_dout(this,0) << "invalid x-amz-object-lock-legal-hold value" << dendl; + return ret; + } + obj_legal_hold = new RGWObjectLegalHold(obj_legal_hold_str); + } + if (!s->bucket_info.obj_lock_enabled() && (obj_retention || obj_legal_hold)) { + ldpp_dout(this, 0) << "ERROR: object retention or legal hold can't be set if bucket object lock not configured" << dendl; + ret = -ERR_INVALID_REQUEST; + return ret; + } + multipart_upload_id = s->info.args.get("uploadId"); + multipart_part_str = s->info.args.get("partNumber"); + if (!multipart_part_str.empty()) { + string err; + multipart_part_num = strict_strtol(multipart_part_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 10) << "bad part number: " << multipart_part_str << ": " << err << dendl; + return -EINVAL; + } + } else if (!multipart_upload_id.empty()) { + ldpp_dout(s, 10) << "part number with no multipart upload id" << dendl; + return -EINVAL; + } + + append = s->info.args.exists("append"); + if (append) { + string pos_str = s->info.args.get("position"); + if (pos_str.empty()) { + return -EINVAL; + } else { + position = strtoull(pos_str.c_str(), NULL, 10); + } + } + + return RGWPutObj_ObjStore::get_params(); +} + +int RGWPutObj_ObjStore_S3::get_data(bufferlist& bl) +{ + const int ret = RGWPutObj_ObjStore::get_data(bl); + if (ret == 0) { + const int ret_auth = do_aws4_auth_completion(); + if (ret_auth < 0) { + return ret_auth; + } + } + + return ret; +} + +static int get_success_retcode(int code) +{ + switch (code) { + case 201: + return STATUS_CREATED; + case 204: + return 
STATUS_NO_CONTENT; + } + return 0; +} + +void RGWPutObj_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + dump_errno(s); + } else { + if (s->cct->_conf->rgw_s3_success_create_obj_status) { + op_ret = get_success_retcode( + s->cct->_conf->rgw_s3_success_create_obj_status); + set_req_state_err(s, op_ret); + } + + string expires = get_s3_expiration_header(s, mtime); + + if (copy_source.empty()) { + dump_errno(s); + dump_etag(s, etag); + dump_content_length(s, 0); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + dump_header_if_nonempty(s, "x-amz-expiration", expires); + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + } else { + dump_errno(s); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + dump_header_if_nonempty(s, "x-amz-expiration", expires); + end_header(s, this, "application/xml"); + dump_start(s); + struct tm tmp; + utime_t ut(mtime); + time_t secs = (time_t)ut.sec(); + gmtime_r(&secs, &tmp); + char buf[TIME_BUF_SIZE]; + s->formatter->open_object_section_in_ns("CopyPartResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + if (strftime(buf, sizeof(buf), "%Y-%m-%dT%T.000Z", &tmp) > 0) { + s->formatter->dump_string("LastModified", buf); + } + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + return; + } + } + if (append) { + if (op_ret == 0 || op_ret == -ERR_POSITION_NOT_EQUAL_TO_LENGTH) { + dump_header(s, "x-rgw-next-append-position", cur_accounted_size); + } + } + if (s->system_request && !real_clock::is_zero(mtime)) { + dump_epoch_header(s, "Rgwx-Mtime", mtime); + } + end_header(s, this); +} + +static inline int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map& attrs) +{ + RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrs; + + return read_op.prepare(); +} + 
+static inline void set_attr(map& attrs, const char* key, const std::string& value) +{ + bufferlist bl; + encode(value,bl); + attrs.emplace(key, std::move(bl)); +} + +static inline void set_attr(map& attrs, const char* key, const char* value) +{ + bufferlist bl; + encode(value,bl); + attrs.emplace(key, std::move(bl)); +} + +int RGWPutObj_ObjStore_S3::get_decrypt_filter( + std::unique_ptr* filter, + RGWGetObj_Filter* cb, + map& attrs, + bufferlist* manifest_bl) +{ + std::map crypt_http_responses_unused; + + int res = 0; + std::unique_ptr block_crypt; + res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses_unused); + if (res == 0) { + if (block_crypt != nullptr) { + auto f = std::unique_ptr(new RGWGetObj_BlockDecrypt(s->cct, cb, std::move(block_crypt))); + //RGWGetObj_BlockDecrypt* f = new RGWGetObj_BlockDecrypt(s->cct, cb, std::move(block_crypt)); + if (f != nullptr) { + if (manifest_bl != nullptr) { + res = f->read_manifest(*manifest_bl); + if (res == 0) { + *filter = std::move(f); + } + } + } + } + } + return res; +} + +int RGWPutObj_ObjStore_S3::get_encrypt_filter( + std::unique_ptr *filter, + rgw::putobj::DataProcessor *cb) +{ + int res = 0; + if (!multipart_upload_id.empty()) { + RGWMPObj mp(s->object.name, multipart_upload_id); + rgw_obj obj; + obj.init_ns(s->bucket, mp.get_meta(), RGW_OBJ_NS_MULTIPART); + obj.set_in_extra_data(true); + map xattrs; + res = get_obj_attrs(store, s, obj, xattrs); + if (res == 0) { + std::unique_ptr block_crypt; + /* We are adding to existing object. + * We use crypto mode that configured as if we were decrypting. 
*/ + res = rgw_s3_prepare_decrypt(s, xattrs, &block_crypt, crypt_http_responses); + if (res == 0 && block_crypt != nullptr) + filter->reset(new RGWPutObj_BlockEncrypt(s->cct, cb, std::move(block_crypt))); + } + /* it is ok, to not have encryption at all */ + } + else + { + std::unique_ptr block_crypt; + res = rgw_s3_prepare_encrypt(s, attrs, nullptr, &block_crypt, crypt_http_responses); + if (res == 0 && block_crypt != nullptr) { + filter->reset(new RGWPutObj_BlockEncrypt(s->cct, cb, std::move(block_crypt))); + } + } + return res; +} + +void RGWPostObj_ObjStore_S3::rebuild_key(string& key) +{ + static string var = "${filename}"; + int pos = key.find(var); + if (pos < 0) + return; + + string new_key = key.substr(0, pos); + new_key.append(filename); + new_key.append(key.substr(pos + var.size())); + + key = new_key; +} + +std::string RGWPostObj_ObjStore_S3::get_current_filename() const +{ + return s->object.name; +} + +std::string RGWPostObj_ObjStore_S3::get_current_content_type() const +{ + return content_type; +} + +int RGWPostObj_ObjStore_S3::get_params() +{ + op_ret = RGWPostObj_ObjStore::get_params(); + if (op_ret < 0) { + return op_ret; + } + + map_qs_metadata(s); + + ldout(s->cct, 20) << "adding bucket to policy env: " << s->bucket.name + << dendl; + env.add_var("bucket", s->bucket.name); + + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, done); + if (r < 0) + return r; + + if (s->cct->_conf->subsys.should_gather()) { + ldout(s->cct, 20) << "read part header -- part.name=" + << part.name << dendl; + + for (const auto& pair : part.fields) { + ldout(s->cct, 20) << "field.name=" << pair.first << dendl; + ldout(s->cct, 20) << "field.val=" << pair.second.val << dendl; + ldout(s->cct, 20) << "field.params:" << dendl; + + for (const auto& param_pair : pair.second.params) { + ldout(s->cct, 20) << " " << param_pair.first + << " -> " << param_pair.second << dendl; + } + } + } + + if (done) { /* unexpected here */ + err_msg = 
"Malformed request"; + return -EINVAL; + } + + if (stringcasecmp(part.name, "file") == 0) { /* beginning of data transfer */ + struct post_part_field& field = part.fields["Content-Disposition"]; + map::iterator iter = field.params.find("filename"); + if (iter != field.params.end()) { + filename = iter->second; + } + parts[part.name] = part; + break; + } + + bool boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + r = read_data(part.data, chunk_size, boundary, done); + if (r < 0 || !boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + parts[part.name] = part; + string part_str(part.data.c_str(), part.data.length()); + env.add_var(part.name, part_str); + } while (!done); + + string object_str; + if (!part_str(parts, "key", &object_str)) { + err_msg = "Key not specified"; + return -EINVAL; + } + + s->object = rgw_obj_key(object_str); + + rebuild_key(s->object.name); + + if (s->object.empty()) { + err_msg = "Empty object name"; + return -EINVAL; + } + + env.add_var("key", s->object.name); + + part_str(parts, "Content-Type", &content_type); + + /* AWS permits POST without Content-Type: http://tracker.ceph.com/issues/20201 */ + if (! 
content_type.empty()) { + env.add_var("Content-Type", content_type); + } + + map::iterator piter = + parts.upper_bound(RGW_AMZ_META_PREFIX); + for (; piter != parts.end(); ++piter) { + string n = piter->first; + if (strncasecmp(n.c_str(), RGW_AMZ_META_PREFIX, + sizeof(RGW_AMZ_META_PREFIX) - 1) != 0) + break; + + string attr_name = RGW_ATTR_PREFIX; + attr_name.append(n); + + /* need to null terminate it */ + bufferlist& data = piter->second.data; + string str = string(data.c_str(), data.length()); + + bufferlist attr_bl; + attr_bl.append(str.c_str(), str.size() + 1); + + attrs[attr_name] = attr_bl; + } + // TODO: refactor this and the above loop to share code + piter = parts.find(RGW_AMZ_WEBSITE_REDIRECT_LOCATION); + if (piter != parts.end()) { + string n = piter->first; + string attr_name = RGW_ATTR_PREFIX; + attr_name.append(n); + /* need to null terminate it */ + bufferlist& data = piter->second.data; + string str = string(data.c_str(), data.length()); + + bufferlist attr_bl; + attr_bl.append(str.c_str(), str.size() + 1); + + attrs[attr_name] = attr_bl; + } + + int r = get_policy(); + if (r < 0) + return r; + + r = get_tags(); + if (r < 0) + return r; + + + min_len = post_policy.min_length; + max_len = post_policy.max_length; + + + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_tags() +{ + string tags_str; + if (part_str(parts, "tagging", &tags_str)) { + RGWXMLParser parser; + if (!parser.init()){ + ldout(s->cct, 0) << "Couldn't init RGWObjTags XML parser" << dendl; + err_msg = "Server couldn't process the request"; + return -EINVAL; // TODO: This class of errors in rgw code should be a 5XX error + } + if (!parser.parse(tags_str.c_str(), tags_str.size(), 1)) { + ldout(s->cct,0 ) << "Invalid Tagging XML" << dendl; + err_msg = "Invalid Tagging XML"; + return -EINVAL; + } + + RGWObjTagging_S3 tagging; + + try { + RGWXMLDecoder::decode_xml("Tagging", tagging, &parser); + } catch (RGWXMLDecoder::err& err) { + ldout(s->cct, 5) << "Malformed tagging request: " << err 
<< dendl; + return -EINVAL; + } + + RGWObjTags obj_tags; + int r = tagging.rebuild(obj_tags); + if (r < 0) + return r; + + bufferlist tags_bl; + obj_tags.encode(tags_bl); + ldout(s->cct, 20) << "Read " << obj_tags.count() << "tags" << dendl; + attrs[RGW_ATTR_TAGS] = tags_bl; + } + + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_policy() +{ + if (part_bl(parts, "policy", &s->auth.s3_postobj_creds.encoded_policy)) { + bool aws4_auth = false; + + /* x-amz-algorithm handling */ + using rgw::auth::s3::AWS4_HMAC_SHA256_STR; + if ((part_str(parts, "x-amz-algorithm", &s->auth.s3_postobj_creds.x_amz_algorithm)) && + (s->auth.s3_postobj_creds.x_amz_algorithm == AWS4_HMAC_SHA256_STR)) { + ldout(s->cct, 0) << "Signature verification algorithm AWS v4 (AWS4-HMAC-SHA256)" << dendl; + aws4_auth = true; + } else { + ldout(s->cct, 0) << "Signature verification algorithm AWS v2" << dendl; + } + + // check that the signature matches the encoded policy + if (aws4_auth) { + /* AWS4 */ + + /* x-amz-credential handling */ + if (!part_str(parts, "x-amz-credential", + &s->auth.s3_postobj_creds.x_amz_credential)) { + ldout(s->cct, 0) << "No S3 aws4 credential found!" << dendl; + err_msg = "Missing aws4 credential"; + return -EINVAL; + } + + /* x-amz-signature handling */ + if (!part_str(parts, "x-amz-signature", + &s->auth.s3_postobj_creds.signature)) { + ldout(s->cct, 0) << "No aws4 signature found!" << dendl; + err_msg = "Missing aws4 signature"; + return -EINVAL; + } + + /* x-amz-date handling */ + std::string received_date_str; + if (!part_str(parts, "x-amz-date", &received_date_str)) { + ldout(s->cct, 0) << "No aws4 date found!" << dendl; + err_msg = "Missing aws4 date"; + return -EINVAL; + } + } else { + /* AWS2 */ + + // check that the signature matches the encoded policy + if (!part_str(parts, "AWSAccessKeyId", + &s->auth.s3_postobj_creds.access_key)) { + ldout(s->cct, 0) << "No S3 aws2 access key found!" 
<< dendl; + err_msg = "Missing aws2 access key"; + return -EINVAL; + } + + if (!part_str(parts, "signature", &s->auth.s3_postobj_creds.signature)) { + ldout(s->cct, 0) << "No aws2 signature found!" << dendl; + err_msg = "Missing aws2 signature"; + return -EINVAL; + } + } + + part_str(parts, "x-amz-security-token", &s->auth.s3_postobj_creds.x_amz_security_token); + + /* FIXME: this is a makeshift solution. The browser upload authentication will be + * handled by an instance of rgw::auth::Completer spawned in Handler's authorize() + * method. */ + const int ret = rgw::auth::Strategy::apply(this, auth_registry_ptr->get_s3_post(), s); + if (ret != 0) { + return -EACCES; + } else { + /* Populate the owner info. */ + s->owner.set_id(s->user->user_id); + s->owner.set_name(s->user->display_name); + ldout(s->cct, 20) << "Successful Signature Verification!" << dendl; + } + + ceph::bufferlist decoded_policy; + try { + decoded_policy.decode_base64(s->auth.s3_postobj_creds.encoded_policy); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "failed to decode_base64 policy" << dendl; + err_msg = "Could not decode policy"; + return -EINVAL; + } + + decoded_policy.append('\0'); // NULL terminate + ldout(s->cct, 20) << "POST policy: " << decoded_policy.c_str() << dendl; + + + int r = post_policy.from_json(decoded_policy, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Failed to parse policy"; + } + ldout(s->cct, 0) << "failed to parse policy" << dendl; + return -EINVAL; + } + + if (aws4_auth) { + /* AWS4 */ + post_policy.set_var_checked("x-amz-signature"); + } else { + /* AWS2 */ + post_policy.set_var_checked("AWSAccessKeyId"); + post_policy.set_var_checked("signature"); + } + post_policy.set_var_checked("policy"); + + r = post_policy.check(&env, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Policy check failed"; + } + ldout(s->cct, 0) << "policy check failed" << dendl; + return r; + } + + } else { + ldout(s->cct, 0) << "No attached policy found!" 
<< dendl; + } + + string canned_acl; + part_str(parts, "acl", &canned_acl); + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ldout(s->cct, 20) << "canned_acl=" << canned_acl << dendl; + if (s3policy.create_canned(s->owner, s->bucket_owner, canned_acl) < 0) { + err_msg = "Bad canned ACLs"; + return -EINVAL; + } + + policy = s3policy; + + return 0; +} + +int RGWPostObj_ObjStore_S3::complete_get_params() +{ + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, done); + if (r < 0) { + return r; + } + + ceph::bufferlist part_data; + bool boundary; + uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + r = read_data(part.data, chunk_size, boundary, done); + if (r < 0 || !boundary) { + return -EINVAL; + } + + /* Just reading the data but not storing any results of that. */ + } while (!done); + + return 0; +} + +int RGWPostObj_ObjStore_S3::get_data(ceph::bufferlist& bl, bool& again) +{ + bool boundary; + bool done; + + const uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size; + int r = read_data(bl, chunk_size, boundary, done); + if (r < 0) { + return r; + } + + if (boundary) { + if (!done) { + /* Reached end of data, let's drain the rest of the params */ + r = complete_get_params(); + if (r < 0) { + return r; + } + } + } + + again = !boundary; + return bl.length(); +} + +void RGWPostObj_ObjStore_S3::send_response() +{ + if (op_ret == 0 && parts.count("success_action_redirect")) { + string redirect; + + part_str(parts, "success_action_redirect", &redirect); + + string tenant; + string bucket; + string key; + string etag_str = "\""; + + etag_str.append(etag); + etag_str.append("\""); + + string etag_url; + + url_encode(s->bucket_tenant, tenant); /* surely overkill, but cheap */ + url_encode(s->bucket_name, bucket); + url_encode(s->object.name, key); + url_encode(etag_str, etag_url); + + if (!s->bucket_tenant.empty()) { + /* + * What we really would like is to quaily the bucket name, so + * that the client could simply copy it 
and paste into next request. + * Unfortunately, in S3 we cannot know if the client will decide + * to come through DNS, with "bucket.tenant" sytanx, or through + * URL with "tenant\bucket" syntax. Therefore, we provide the + * tenant separately. + */ + redirect.append("?tenant="); + redirect.append(tenant); + redirect.append("&bucket="); + redirect.append(bucket); + } else { + redirect.append("?bucket="); + redirect.append(bucket); + } + redirect.append("&key="); + redirect.append(key); + redirect.append("&etag="); + redirect.append(etag_url); + + int r = check_utf8(redirect.c_str(), redirect.size()); + if (r < 0) { + op_ret = r; + goto done; + } + dump_redirect(s, redirect); + op_ret = STATUS_REDIRECT; + } else if (op_ret == 0 && parts.count("success_action_status")) { + string status_string; + uint32_t status_int; + + part_str(parts, "success_action_status", &status_string); + + int r = stringtoul(status_string, &status_int); + if (r < 0) { + op_ret = r; + goto done; + } + + switch (status_int) { + case 200: + break; + case 201: + op_ret = STATUS_CREATED; + break; + default: + op_ret = STATUS_NO_CONTENT; + break; + } + } else if (! 
op_ret) { + op_ret = STATUS_NO_CONTENT; + } + +done: + if (op_ret == STATUS_CREATED) { + for (auto &it : crypt_http_responses) + dump_header(s, it.first, it.second); + s->formatter->open_object_section("PostResponse"); + std::string base_uri = compute_domain_uri(s); + if (!s->bucket_tenant.empty()){ + s->formatter->dump_format("Location", "%s/%s:%s/%s", + base_uri.c_str(), + url_encode(s->bucket_tenant).c_str(), + url_encode(s->bucket_name).c_str(), + url_encode(s->object.name).c_str()); + s->formatter->dump_string("Tenant", s->bucket_tenant); + } else { + s->formatter->dump_format("Location", "%s/%s/%s", + base_uri.c_str(), + url_encode(s->bucket_name).c_str(), + url_encode(s->object.name).c_str()); + } + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object.name); + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + } + s->err.message = err_msg; + set_req_state_err(s, op_ret); + dump_errno(s); + if (op_ret >= 0) { + dump_content_length(s, s->formatter->get_len()); + } + end_header(s, this); + if (op_ret != STATUS_CREATED) + return; + + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPostObj_ObjStore_S3::get_encrypt_filter( + std::unique_ptr *filter, + rgw::putobj::DataProcessor *cb) +{ + std::unique_ptr block_crypt; + int res = rgw_s3_prepare_encrypt(s, attrs, &parts, &block_crypt, + crypt_http_responses); + if (res == 0 && block_crypt != nullptr) { + filter->reset(new RGWPutObj_BlockEncrypt(s->cct, cb, std::move(block_crypt))); + } + return res; +} + +int RGWDeleteObj_ObjStore_S3::get_params() +{ + const char *if_unmod = s->info.env->get("HTTP_X_AMZ_DELETE_IF_UNMODIFIED_SINCE"); + + if (s->system_request) { + s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "no-precondition-error", &no_precondition_error, false); + } + + if (if_unmod) { + std::string if_unmod_decoded = url_decode(if_unmod); + uint64_t epoch; + uint64_t nsec; + if (utime_t::parse_date(if_unmod_decoded, &epoch, &nsec) < 0) 
{ + ldout(s->cct, 10) << "failed to parse time: " << if_unmod_decoded << dendl; + return -EINVAL; + } + unmod_since = utime_t(epoch, nsec).to_real_time(); + } + + const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION"); + if (bypass_gov_header) { + std::string bypass_gov_decoded = url_decode(bypass_gov_header); + bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true"); + } + + return 0; +} + +void RGWDeleteObj_ObjStore_S3::send_response() +{ + int r = op_ret; + if (r == -ENOENT) + r = 0; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + if (delete_marker) { + dump_header(s, "x-amz-delete-marker", "true"); + } + end_header(s, this); +} + +int RGWCopyObj_ObjStore_S3::init_dest_policy() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + /* build a policy for the target object */ + int r = create_s3_policy(s, store, s3policy, s->owner); + if (r < 0) + return r; + + dest_policy = s3policy; + + return 0; +} + +int RGWCopyObj_ObjStore_S3::get_params() +{ + if_mod = s->info.env->get("HTTP_X_AMZ_COPY_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_X_AMZ_COPY_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_X_AMZ_COPY_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_X_AMZ_COPY_IF_NONE_MATCH"); + + src_tenant_name = s->src_tenant_name; + src_bucket_name = s->src_bucket_name; + src_object = s->src_object; + dest_tenant_name = s->bucket.tenant; + dest_bucket_name = s->bucket.name; + dest_object = s->object.name; + + if (s->system_request) { + source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone"); + s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "copy-if-newer", ©_if_newer, false); + } + + copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE"); + auto tmp_md_d = s->info.env->get("HTTP_X_AMZ_METADATA_DIRECTIVE"); + if (tmp_md_d) { + if (strcasecmp(tmp_md_d, "COPY") == 0) { + attrs_mod = 
RGWRados::ATTRSMOD_NONE; + } else if (strcasecmp(tmp_md_d, "REPLACE") == 0) { + attrs_mod = RGWRados::ATTRSMOD_REPLACE; + } else if (!source_zone.empty()) { + attrs_mod = RGWRados::ATTRSMOD_NONE; // default for intra-zone_group copy + } else { + s->err.message = "Unknown metadata directive."; + ldout(s->cct, 0) << s->err.message << dendl; + return -EINVAL; + } + md_directive = tmp_md_d; + } + + if (source_zone.empty() && + (dest_tenant_name.compare(src_tenant_name) == 0) && + (dest_bucket_name.compare(src_bucket_name) == 0) && + (dest_object.compare(src_object.name) == 0) && + src_object.instance.empty() && + (attrs_mod != RGWRados::ATTRSMOD_REPLACE)) { + need_to_check_storage_class = true; + } + + return 0; +} + +int RGWCopyObj_ObjStore_S3::check_storage_class(const rgw_placement_rule& src_placement) +{ + if (src_placement == s->dest_placement) { + /* can only copy object into itself if replacing attrs */ + s->err.message = "This copy request is illegal because it is trying to copy " + "an object to itself without changing the object's metadata, " + "storage class, website redirect location or encryption attributes."; + ldout(s->cct, 0) << s->err.message << dendl; + return -ERR_INVALID_REQUEST; + } + return 0; +} + +void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) +{ + if (! sent_header) { + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret == 0) { + s->formatter->open_object_section_in_ns("CopyObjectResult", XMLNS_AWS_S3); + } + sent_header = true; + } else { + /* Send progress field. Note that this diverge from the original S3 + * spec. We do this in order to keep connection alive. 
+ */ + s->formatter->dump_int("Progress", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_S3::send_response() +{ + if (!sent_header) + send_partial_response(0); + + if (op_ret == 0) { + dump_time(s, "LastModified", &mtime); + if (!etag.empty()) { + s->formatter->dump_string("ETag", std::move(etag)); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWGetACLs_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + rgw_flush_formatter(s, s->formatter); + dump_body(s, acls); +} + +int RGWPutACLs_ObjStore_S3::get_params() +{ + int ret = RGWPutACLs_ObjStore::get_params(); + if (ret >= 0) { + const int ret_auth = do_aws4_auth_completion(); + if (ret_auth < 0) { + return ret_auth; + } + } + return ret; +} + +int RGWPutACLs_ObjStore_S3::get_policy_from_state(RGWRados *store, + struct req_state *s, + stringstream& ss) +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + // bucket-* canned acls do not apply to bucket + if (s->object.empty()) { + if (s->canned_acl.find("bucket") != string::npos) + s->canned_acl.clear(); + } + + int r = create_s3_policy(s, store, s3policy, owner); + if (r < 0) + return r; + + s3policy.to_xml(ss); + + return 0; +} + +void RGWPutACLs_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); +} + +void RGWGetLC_ObjStore_S3::execute() +{ + config.set_ctx(s->cct); + + map::iterator aiter = s->bucket_attrs.find(RGW_ATTR_LC); + if (aiter == s->bucket_attrs.end()) { + op_ret = -ENOENT; + return; + } + + bufferlist::const_iterator iter{&aiter->second}; + try { + config.decode(iter); + } catch (const buffer::error& e) { + ldout(s->cct, 0) << __func__ << "decode life cycle config failed" << dendl; + op_ret = -EIO; + return; + } +} + +void 
RGWGetLC_ObjStore_S3::send_response() +{ + if (op_ret) { + if (op_ret == -ENOENT) { + set_req_state_err(s, ERR_NO_SUCH_LC); + } else { + set_req_state_err(s, op_ret); + } + } + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + if (op_ret < 0) + return; + + encode_xml("LifecycleConfiguration", XMLNS_AWS_S3, config, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWPutLC_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); +} + +void RGWDeleteLC_ObjStore_S3::send_response() +{ + if (op_ret == 0) + op_ret = STATUS_NO_CONTENT; + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); +} + +void RGWGetCORS_ObjStore_S3::send_response() +{ + if (op_ret) { + if (op_ret == -ENOENT) + set_req_state_err(s, ERR_NO_SUCH_CORS_CONFIGURATION); + else + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, NULL, "application/xml"); + dump_start(s); + if (! 
op_ret) { + string cors; + RGWCORSConfiguration_S3 *s3cors = + static_cast(&bucket_cors); + stringstream ss; + + s3cors->to_xml(ss); + cors = ss.str(); + dump_body(s, cors); + } +} + +int RGWPutCORS_ObjStore_S3::get_params() +{ + RGWCORSXMLParser_S3 parser(s->cct); + RGWCORSConfiguration_S3 *cors_config; + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, max_size, false); + if (r < 0) { + return r; + } + + r = do_aws4_auth_completion(); + if (r < 0) { + return r; + } + + if (!parser.init()) { + return -EINVAL; + } + + char* buf = data.c_str(); + if (!buf || !parser.parse(buf, data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + cors_config = + static_cast(parser.find_first( + "CORSConfiguration")); + if (!cors_config) { + return -ERR_MALFORMED_XML; + } + +#define CORS_RULES_MAX_NUM 100 + int max_num = s->cct->_conf->rgw_cors_rules_max_num; + if (max_num < 0) { + max_num = CORS_RULES_MAX_NUM; + } + int cors_rules_num = cors_config->get_rules().size(); + if (cors_rules_num > max_num) { + ldout(s->cct, 4) << "An cors config can have up to " + << max_num + << " rules, request cors rules num: " + << cors_rules_num << dendl; + op_ret = -ERR_INVALID_CORS_RULES_ERROR; + s->err.message = "The number of CORS rules should not exceed allowed limit of " + + std::to_string(max_num) + " rules."; + return -ERR_INVALID_REQUEST; + } + + // forward bucket cors requests to meta master zone + if (!store->svc.zone->is_meta_master()) { + /* only need to keep this data around if we're not meta master */ + in_data.append(data); + } + + if (s->cct->_conf->subsys.should_gather()) { + ldout(s->cct, 15) << "CORSConfiguration"; + cors_config->to_xml(*_dout); + *_dout << dendl; + } + + cors_config->encode(cors_bl); + + return 0; +} + +void RGWPutCORS_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL, "application/xml"); + 
dump_start(s); +} + +void RGWDeleteCORS_ObjStore_S3::send_response() +{ + int r = op_ret; + if (!r || r == -ENOENT) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, NULL); +} + +void RGWOptionsCORS_ObjStore_S3::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (op_ret == -ENOENT) + op_ret = -EACCES; + if (op_ret < 0) { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL); + return; + } + get_response_params(hdrs, exp_hdrs, &max_age); + + dump_errno(s); + dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), + max_age); + end_header(s, NULL); +} + +void RGWGetRequestPayment_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + s->formatter->open_object_section_in_ns("RequestPaymentConfiguration", XMLNS_AWS_S3); + const char *payer = requester_pays ? 
"Requester" : "BucketOwner"; + s->formatter->dump_string("Payer", payer); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +class RGWSetRequestPaymentParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) override { + return new XMLObj; + } + +public: + RGWSetRequestPaymentParser() {} + ~RGWSetRequestPaymentParser() override {} + + int get_request_payment_payer(bool *requester_pays) { + XMLObj *config = find_first("RequestPaymentConfiguration"); + if (!config) + return -EINVAL; + + *requester_pays = false; + + XMLObj *field = config->find_first("Payer"); + if (!field) + return 0; + + auto& s = field->get_data(); + + if (stringcasecmp(s, "Requester") == 0) { + *requester_pays = true; + } else if (stringcasecmp(s, "BucketOwner") != 0) { + return -EINVAL; + } + + return 0; + } +}; + +int RGWSetRequestPayment_ObjStore_S3::get_params() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int r = 0; + std::tie(r, in_data) = rgw_rest_read_all_input(s, max_size, false); + + if (r < 0) { + return r; + } + + + RGWSetRequestPaymentParser parser; + + if (!parser.init()) { + ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + char* buf = in_data.c_str(); + if (!parser.parse(buf, in_data.length(), 1)) { + ldout(s->cct, 10) << "failed to parse data: " << buf << dendl; + return -EINVAL; + } + + return parser.get_request_payment_payer(&requester_pays); +} + +void RGWSetRequestPayment_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); +} + +int RGWInitMultipart_ObjStore_S3::get_params() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + op_ret = create_s3_policy(s, store, s3policy, s->owner); + if (op_ret < 0) + return op_ret; + + policy = s3policy; + + return 0; +} + +void RGWInitMultipart_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + for (auto &it : 
crypt_http_responses) + dump_header(s, it.first, it.second); + end_header(s, this, "application/xml"); + if (op_ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult", XMLNS_AWS_S3); + if (!s->bucket_tenant.empty()) + s->formatter->dump_string("Tenant", s->bucket_tenant); + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object.name); + s->formatter->dump_string("UploadId", upload_id); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWInitMultipart_ObjStore_S3::prepare_encryption(map& attrs) +{ + int res = 0; + res = rgw_s3_prepare_encrypt(s, attrs, nullptr, nullptr, crypt_http_responses); + return res; +} + +int RGWCompleteMultipart_ObjStore_S3::get_params() +{ + int ret = RGWCompleteMultipart_ObjStore::get_params(); + if (ret < 0) { + return ret; + } + + map_qs_metadata(s); + + return do_aws4_auth_completion(); +} + +void RGWCompleteMultipart_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + end_header(s, this, "application/xml"); + if (op_ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult", XMLNS_AWS_S3); + std::string base_uri = compute_domain_uri(s); + if (!s->bucket_tenant.empty()) { + s->formatter->dump_format("Location", "%s/%s:%s/%s", + base_uri.c_str(), + s->bucket_tenant.c_str(), + s->bucket_name.c_str(), + s->object.name.c_str() + ); + s->formatter->dump_string("Tenant", s->bucket_tenant); + } else { + s->formatter->dump_format("Location", "%s/%s/%s", + base_uri.c_str(), + s->bucket_name.c_str(), + s->object.name.c_str() + ); + } + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object.name); + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); 
+ } +} + +void RGWAbortMultipart_ObjStore_S3::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} + +void RGWListMultipart_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + + if (op_ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("ListPartsResult", XMLNS_AWS_S3); + map::iterator iter; + map::reverse_iterator test_iter; + int cur_max = 0; + + iter = parts.begin(); + test_iter = parts.rbegin(); + if (test_iter != parts.rend()) { + cur_max = test_iter->first; + } + if (!s->bucket_tenant.empty()) + s->formatter->dump_string("Tenant", s->bucket_tenant); + s->formatter->dump_string("Bucket", s->bucket_name); + s->formatter->dump_string("Key", s->object.name); + s->formatter->dump_string("UploadId", upload_id); + s->formatter->dump_string("StorageClass", "STANDARD"); + s->formatter->dump_int("PartNumberMarker", marker); + s->formatter->dump_int("NextPartNumberMarker", cur_max); + s->formatter->dump_int("MaxParts", max_parts); + s->formatter->dump_string("IsTruncated", (truncated ? 
"true" : "false")); + + ACLOwner& owner = policy.get_owner(); + dump_owner(s, owner.get_id(), owner.get_display_name()); + + for (; iter != parts.end(); ++iter) { + RGWUploadPartInfo& info = iter->second; + + s->formatter->open_object_section("Part"); + + dump_time(s, "LastModified", &info.modified); + + s->formatter->dump_unsigned("PartNumber", info.num); + s->formatter->dump_format("ETag", "\"%s\"", info.etag.c_str()); + s->formatter->dump_unsigned("Size", info.accounted_size); + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +void RGWListBucketMultiparts_ObjStore_S3::send_response() +{ + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. + end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + dump_start(s); + if (op_ret < 0) + return; + + s->formatter->open_object_section_in_ns("ListMultipartUploadsResult", XMLNS_AWS_S3); + if (!s->bucket_tenant.empty()) + s->formatter->dump_string("Tenant", s->bucket_tenant); + s->formatter->dump_string("Bucket", s->bucket_name); + if (!prefix.empty()) + s->formatter->dump_string("ListMultipartUploadsResult.Prefix", prefix); + const string& key_marker = marker.get_key(); + if (!key_marker.empty()) + s->formatter->dump_string("KeyMarker", key_marker); + const string& upload_id_marker = marker.get_upload_id(); + if (!upload_id_marker.empty()) + s->formatter->dump_string("UploadIdMarker", upload_id_marker); + string next_key = next_marker.mp.get_key(); + if (!next_key.empty()) + s->formatter->dump_string("NextKeyMarker", next_key); + string next_upload_id = next_marker.mp.get_upload_id(); + if (!next_upload_id.empty()) + s->formatter->dump_string("NextUploadIdMarker", next_upload_id); + s->formatter->dump_int("MaxUploads", max_uploads); + if (!delimiter.empty()) + 
s->formatter->dump_string("Delimiter", delimiter); + s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false")); + + if (op_ret >= 0) { + vector::iterator iter; + for (iter = uploads.begin(); iter != uploads.end(); ++iter) { + RGWMPObj& mp = iter->mp; + s->formatter->open_array_section("Upload"); + s->formatter->dump_string("Key", mp.get_key()); + s->formatter->dump_string("UploadId", mp.get_upload_id()); + dump_owner(s, s->user->user_id, s->user->display_name, "Initiator"); + dump_owner(s, s->user->user_id, s->user->display_name); + s->formatter->dump_string("StorageClass", "STANDARD"); + dump_time(s, "Initiated", &iter->obj.meta.mtime); + s->formatter->close_section(); + } + if (!common_prefixes.empty()) { + s->formatter->open_array_section("CommonPrefixes"); + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); + pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->dump_string("CommonPrefixes.Prefix", pref_iter->first); + } + s->formatter->close_section(); + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWDeleteMultiObj_ObjStore_S3::get_params() +{ + int ret = RGWDeleteMultiObj_ObjStore::get_params(); + if (ret < 0) { + return ret; + } + + return do_aws4_auth_completion(); +} + +void RGWDeleteMultiObj_ObjStore_S3::send_status() +{ + if (! status_dumped) { + if (op_ret < 0) + set_req_state_err(s, op_ret); + dump_errno(s); + status_dumped = true; + } +} + +void RGWDeleteMultiObj_ObjStore_S3::begin_response() +{ + + if (!status_dumped) { + send_status(); + } + + dump_start(s); + // Explicitly use chunked transfer encoding so that we can stream the result + // to the user without having to wait for the full length of it. 
+ end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING); + s->formatter->open_object_section_in_ns("DeleteResult", XMLNS_AWS_S3); + + rgw_flush_formatter(s, s->formatter); +} + +void RGWDeleteMultiObj_ObjStore_S3::send_partial_response(rgw_obj_key& key, + bool delete_marker, + const string& marker_version_id, int ret) +{ + if (!key.empty()) { + if (ret == 0 && !quiet) { + s->formatter->open_object_section("Deleted"); + s->formatter->dump_string("Key", key.name); + if (!key.instance.empty()) { + s->formatter->dump_string("VersionId", key.instance); + } + if (delete_marker) { + s->formatter->dump_bool("DeleteMarker", true); + s->formatter->dump_string("DeleteMarkerVersionId", marker_version_id); + } + s->formatter->close_section(); + } else if (ret < 0) { + struct rgw_http_error r; + int err_no; + + s->formatter->open_object_section("Error"); + + err_no = -ret; + rgw_get_errno_s3(&r, err_no); + + s->formatter->dump_string("Key", key.name); + s->formatter->dump_string("VersionId", key.instance); + s->formatter->dump_string("Code", r.s3_code); + s->formatter->dump_string("Message", r.s3_code); + s->formatter->close_section(); + } + + rgw_flush_formatter(s, s->formatter); + } +} + +void RGWDeleteMultiObj_ObjStore_S3::end_response() +{ + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetObjLayout_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/json"); + + JSONFormatter f; + + if (op_ret < 0) { + return; + } + + f.open_object_section("result"); + ::encode_json("head", head_obj, &f); + ::encode_json("manifest", *manifest, &f); + f.open_array_section("data_location"); + for (auto miter = manifest->obj_begin(); miter != manifest->obj_end(); ++miter) { + f.open_object_section("obj"); + rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store); + uint64_t ofs = miter.get_ofs(); + uint64_t left = manifest->get_obj_size() - ofs; + 
::encode_json("ofs", miter.get_ofs(), &f); + ::encode_json("loc", raw_loc, &f); + ::encode_json("loc_ofs", miter.location_ofs(), &f); + uint64_t loc_size = miter.get_stripe_size(); + if (loc_size > left) { + loc_size = left; + } + ::encode_json("loc_size", loc_size, &f); + f.close_section(); + rgw_flush_formatter(s, &f); + } + f.close_section(); + f.close_section(); + rgw_flush_formatter(s, &f); +} + +int RGWConfigBucketMetaSearch_ObjStore_S3::get_params() +{ + auto iter = s->info.x_meta_map.find("x-amz-meta-search"); + if (iter == s->info.x_meta_map.end()) { + s->err.message = "X-Rgw-Meta-Search header not provided"; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + + list expressions; + get_str_list(iter->second, ",", expressions); + + for (auto& expression : expressions) { + vector args; + get_str_vec(expression, ";", args); + + if (args.empty()) { + s->err.message = "invalid empty expression"; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + if (args.size() > 2) { + s->err.message = string("invalid expression: ") + expression; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + + string key = boost::algorithm::to_lower_copy(rgw_trim_whitespace(args[0])); + string val; + if (args.size() > 1) { + val = boost::algorithm::to_lower_copy(rgw_trim_whitespace(args[1])); + } + + if (!boost::algorithm::starts_with(key, RGW_AMZ_META_PREFIX)) { + s->err.message = string("invalid expression, key must start with '" RGW_AMZ_META_PREFIX "' : ") + expression; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + + key = key.substr(sizeof(RGW_AMZ_META_PREFIX) - 1); + + ESEntityTypeMap::EntityType entity_type; + + if (val.empty() || val == "str" || val == "string") { + entity_type = ESEntityTypeMap::ES_ENTITY_STR; + } else if (val == "int" || val == "integer") { + entity_type = ESEntityTypeMap::ES_ENTITY_INT; + } else if (val == "date" || val == "datetime") { + entity_type = 
ESEntityTypeMap::ES_ENTITY_DATE; + } else { + s->err.message = string("invalid entity type: ") + val; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + + mdsearch_config[key] = entity_type; + } + + return 0; +} + +void RGWConfigBucketMetaSearch_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); +} + +void RGWGetBucketMetaSearch_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL, "application/xml"); + + Formatter *f = s->formatter; + f->open_array_section("GetBucketMetaSearchResult"); + for (auto& e : s->bucket_info.mdsearch_config) { + f->open_object_section("Entry"); + string k = string("x-amz-meta-") + e.first; + f->dump_string("Key", k.c_str()); + const char *type; + switch (e.second) { + case ESEntityTypeMap::ES_ENTITY_INT: + type = "int"; + break; + case ESEntityTypeMap::ES_ENTITY_DATE: + type = "date"; + break; + default: + type = "str"; + } + f->dump_string("Type", type); + f->close_section(); + } + f->close_section(); + rgw_flush_formatter(s, f); +} + +void RGWDelBucketMetaSearch_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); +} + +void RGWPutBucketObjectLock_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetBucketObjectLock_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + if (op_ret) { + return; + } + encode_xml("ObjectLockConfiguration", s->bucket_info.obj_lock, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +int RGWPutObjRetention_ObjStore_S3::get_params() +{ + const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION"); + if (bypass_gov_header) { + std::string bypass_gov_decoded = 
url_decode(bypass_gov_header); + bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true"); + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + std::tie(op_ret, data) = rgw_rest_read_all_input(s, max_size, false); + return op_ret; +} + +void RGWPutObjRetention_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetObjRetention_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + if (op_ret) { + return; + } + encode_xml("Retention", obj_retention, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWPutObjLegalHold_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +void RGWGetObjLegalHold_ObjStore_S3::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + if (op_ret) { + return; + } + encode_xml("LegalHold", obj_legal_hold, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +RGWOp *RGWHandler_REST_Service_S3::op_get() +{ + if (is_usage_op()) { + return new RGWGetUsage_ObjStore_S3; + } else { + return new RGWListBuckets_ObjStore_S3; + } +} + +RGWOp *RGWHandler_REST_Service_S3::op_head() +{ + return new RGWListBuckets_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Service_S3::op_post() +{ + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + int ret = 0; + bufferlist data; + std::tie(ret, data) = rgw_rest_read_all_input(s, max_size, false); + string post_body = data.to_str(); + + if (this->isSTSenabled) { + RGWHandler_REST_STS sts_handler(auth_registry, post_body); + sts_handler.init(store, s, s->cio); + auto op = sts_handler.get_op(store); + if (op) { + return op; + } + } + + if (this->isIAMenabled) { + 
RGWHandler_REST_IAM iam_handler(auth_registry, post_body); + iam_handler.init(store, s, s->cio); + auto op = iam_handler.get_op(store); + if (op) { + return op; + } + } + + if (isPSenabled) { + RGWHandler_REST_PSTopic_AWS topic_handler(auth_registry, post_body); + topic_handler.init(store, s, s->cio); + auto op = topic_handler.get_op(store); + if (op) { + return op; + } + } + + return NULL; +} + +RGWOp *RGWHandler_REST_Bucket_S3::get_obj_op(bool get_data) +{ + // Non-website mode + if (get_data) { + int list_type = 1; + s->info.args.get_int("list-type", &list_type, 1); + switch (list_type) { + case 1: + return new RGWListBucket_ObjStore_S3; + case 2: + return new RGWListBucket_ObjStore_S3v2; + default: + ldpp_dout(s, 5) << __func__ << ": unsupported list-type " << list_type << dendl; + return new RGWListBucket_ObjStore_S3; + } + } else { + return new RGWStatBucket_ObjStore_S3; + } +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_get() +{ + if (s->info.args.sub_resource_exists("logging")) + return new RGWGetBucketLogging_ObjStore_S3; + + if (s->info.args.sub_resource_exists("location")) + return new RGWGetBucketLocation_ObjStore_S3; + + if (s->info.args.sub_resource_exists("versioning")) + return new RGWGetBucketVersioning_ObjStore_S3; + + if (s->info.args.sub_resource_exists("website")) { + if (!s->cct->_conf->rgw_enable_static_website) { + return NULL; + } + return new RGWGetBucketWebsite_ObjStore_S3; + } + + if (s->info.args.exists("mdsearch")) { + return new RGWGetBucketMetaSearch_ObjStore_S3; + } + + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWGetCORS_ObjStore_S3; + } else if (is_request_payment_op()) { + return new RGWGetRequestPayment_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } else if(is_lc_op()) { + return new RGWGetLC_ObjStore_S3; + } else if(is_policy_op()) { + return new RGWGetBucketPolicy; + } else if (is_object_lock_op()) { + return 
new RGWGetBucketObjectLock_ObjStore_S3; + } else if (is_notification_op()) { + return RGWHandler_REST_PSNotifs_S3::create_get_op(); + } + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_put() +{ + if (s->info.args.sub_resource_exists("logging")) + return NULL; + if (s->info.args.sub_resource_exists("versioning")) + return new RGWSetBucketVersioning_ObjStore_S3; + if (s->info.args.sub_resource_exists("website")) { + if (!s->cct->_conf->rgw_enable_static_website) { + return NULL; + } + return new RGWSetBucketWebsite_ObjStore_S3; + } + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWPutCORS_ObjStore_S3; + } else if (is_request_payment_op()) { + return new RGWSetRequestPayment_ObjStore_S3; + } else if(is_lc_op()) { + return new RGWPutLC_ObjStore_S3; + } else if(is_policy_op()) { + return new RGWPutBucketPolicy; + } else if (is_object_lock_op()) { + return new RGWPutBucketObjectLock_ObjStore_S3; + } else if (is_notification_op()) { + return RGWHandler_REST_PSNotifs_S3::create_put_op(); + } + return new RGWCreateBucket_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_delete() +{ + if (is_cors_op()) { + return new RGWDeleteCORS_ObjStore_S3; + } else if(is_lc_op()) { + return new RGWDeleteLC_ObjStore_S3; + } else if(is_policy_op()) { + return new RGWDeleteBucketPolicy; + } else if (is_notification_op()) { + return RGWHandler_REST_PSNotifs_S3::create_delete_op(); + } + + if (s->info.args.sub_resource_exists("website")) { + if (!s->cct->_conf->rgw_enable_static_website) { + return NULL; + } + return new RGWDeleteBucketWebsite_ObjStore_S3; + } + + if (s->info.args.exists("mdsearch")) { + return new RGWDelBucketMetaSearch_ObjStore_S3; + } + + return new 
RGWDeleteBucket_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_post() +{ + if (s->info.args.exists("delete")) { + return new RGWDeleteMultiObj_ObjStore_S3; + } + + if (s->info.args.exists("mdsearch")) { + return new RGWConfigBucketMetaSearch_ObjStore_S3; + } + + return new RGWPostObj_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Bucket_S3::op_options() +{ + return new RGWOptionsCORS_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::get_obj_op(bool get_data) +{ + RGWGetObj_ObjStore_S3 *get_obj_op = new RGWGetObj_ObjStore_S3; + get_obj_op->set_get_data(get_data); + return get_obj_op; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_get() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploadId")) { + return new RGWListMultipart_ObjStore_S3; + } else if (s->info.args.exists("layout")) { + return new RGWGetObjLayout_ObjStore_S3; + } else if (is_tagging_op()) { + return new RGWGetObjTags_ObjStore_S3; + } else if (is_obj_retention_op()) { + return new RGWGetObjRetention_ObjStore_S3; + } else if (is_obj_legal_hold_op()) { + return new RGWGetObjLegalHold_ObjStore_S3; + } + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Obj_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploadId")) { + return new RGWListMultipart_ObjStore_S3; + } + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Obj_S3::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_S3; + } else if (is_tagging_op()) { + return new RGWPutObjTags_ObjStore_S3; + } else if (is_obj_retention_op()) { + return new RGWPutObjRetention_ObjStore_S3; + } else if (is_obj_legal_hold_op()) { + return new RGWPutObjLegalHold_ObjStore_S3; + } + + if (s->init_state.src_bucket.empty()) + return new RGWPutObj_ObjStore_S3; + else + return new RGWCopyObj_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_delete() +{ + if (is_tagging_op()) { + return new RGWDeleteObjTags_ObjStore_S3; + } + string upload_id = 
s->info.args.get("uploadId"); + + if (upload_id.empty()) + return new RGWDeleteObj_ObjStore_S3; + else + return new RGWAbortMultipart_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_post() +{ + if (s->info.args.exists("uploadId")) + return new RGWCompleteMultipart_ObjStore_S3; + + if (s->info.args.exists("uploads")) + return new RGWInitMultipart_ObjStore_S3; + + return new RGWPostObj_ObjStore_S3; +} + +RGWOp *RGWHandler_REST_Obj_S3::op_options() +{ + return new RGWOptionsCORS_ObjStore_S3; +} + +int RGWHandler_REST_S3::init_from_header(struct req_state* s, + int default_formatter, + bool configurable_format) +{ + string req; + string first; + + const char *req_name = s->relative_uri.c_str(); + const char *p; + + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(); + + /* must be called after the args parsing */ + int ret = allocate_formatter(s, default_formatter, configurable_format); + if (ret < 0) + return ret; + + if (*req_name != '/') + return 0; + + req_name++; + + if (!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } + + /* + * XXX The intent of the check for empty is apparently to let the bucket + * name from DNS to be set ahead. However, we currently take the DNS + * bucket and re-insert it into URL in rgw_rest.cc:RGWREST::preprocess(). + * So, this check is meaningless. + * + * Rather than dropping this, the code needs to be changed into putting + * the bucket (and its tenant) from DNS and Host: header (HTTP_HOST) + * into req_status.bucket_name directly. + */ + if (s->init_state.url_bucket.empty()) { + // Save bucket to tide us over until token is parsed. 
+ s->init_state.url_bucket = first; + if (pos >= 0) { + string encoded_obj_str = req.substr(pos+1); + s->object = rgw_obj_key(encoded_obj_str, s->info.args.get("versionId")); + } + } else { + s->object = rgw_obj_key(req_name, s->info.args.get("versionId")); + } + return 0; +} + +static int verify_mfa(RGWRados *store, RGWUserInfo *user, const string& mfa_str, bool *verified) +{ + vector params; + get_str_vec(mfa_str, " ", params); + + if (params.size() != 2) { + ldout(store->ctx(), 5) << "NOTICE: invalid mfa string provided: " << mfa_str << dendl; + return -EINVAL; + } + + string& serial = params[0]; + string& pin = params[1]; + + auto i = user->mfa_ids.find(serial); + if (i == user->mfa_ids.end()) { + ldout(store->ctx(), 5) << "NOTICE: user does not have mfa device with serial=" << serial << dendl; + return -EACCES; + } + + int ret = store->check_mfa(user->user_id, serial, pin); + if (ret < 0) { + ldout(store->ctx(), 20) << "NOTICE: failed to check MFA, serial=" << serial << dendl; + return -EACCES; + } + + *verified = true; + + return 0; +} + +int RGWHandler_REST_S3::postauth_init() +{ + struct req_init_state *t = &s->init_state; + bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names; + + rgw_parse_url_bucket(t->url_bucket, s->user->user_id.tenant, + s->bucket_tenant, s->bucket_name); + + dout(10) << "s->object=" << (!s->object.empty() ? 
s->object : rgw_obj_key("")) + << " s->bucket=" << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) << dendl; + + int ret; + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + if (!s->bucket_name.empty()) { + ret = valid_s3_bucket_name(s->bucket_name, relaxed_names); + if (ret) + return ret; + ret = validate_object_name(s->object.name); + if (ret) + return ret; + } + + if (!t->src_bucket.empty()) { + rgw_parse_url_bucket(t->src_bucket, s->user->user_id.tenant, + s->src_tenant_name, s->src_bucket_name); + ret = rgw_validate_tenant_name(s->src_tenant_name); + if (ret) + return ret; + ret = valid_s3_bucket_name(s->src_bucket_name, relaxed_names); + if (ret) + return ret; + } + + const char *mfa = s->info.env->get("HTTP_X_AMZ_MFA"); + if (mfa) { + ret = verify_mfa(store, s->user, string(mfa), &s->mfa_verified); + } + + return 0; +} + +int RGWHandler_REST_S3::init(RGWRados *store, struct req_state *s, + rgw::io::BasicClient *cio) +{ + int ret; + + s->dialect = "s3"; + + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names; + if (!s->bucket_name.empty()) { + ret = valid_s3_bucket_name(s->bucket_name, relaxed_names); + if (ret) + return ret; + ret = validate_object_name(s->object.name); + if (ret) + return ret; + } + + const char *cacl = s->info.env->get("HTTP_X_AMZ_ACL"); + if (cacl) + s->canned_acl = cacl; + + s->has_acl_header = s->info.env->exists_prefix("HTTP_X_AMZ_GRANT"); + + const char *copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE"); + if (copy_source && + (! s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE")) && + (! s->info.args.exists("uploadId"))) { + + ret = RGWCopyObj::parse_copy_location(copy_source, + s->init_state.src_bucket, + s->src_object); + if (!ret) { + ldout(s->cct, 0) << "failed to parse copy location" << dendl; + return -EINVAL; // XXX why not -ERR_INVALID_BUCKET_NAME or -ERR_BAD_URL? 
+ } + } + + const char *sc = s->info.env->get("HTTP_X_AMZ_STORAGE_CLASS"); + if (sc) { + s->info.storage_class = sc; + } + + return RGWHandler_REST::init(store, s, cio); +} + +int RGWHandler_REST_S3::authorize(const DoutPrefixProvider *dpp) +{ + if (s->info.args.exists("Action") && s->info.args.get("Action") == "AssumeRoleWithWebIdentity") { + return RGW_Auth_STS::authorize(dpp, store, auth_registry, s); + } + return RGW_Auth_S3::authorize(dpp, store, auth_registry, s); +} + +enum class AwsVersion { + UNKNOWN, + V2, + V4 +}; + +enum class AwsRoute { + UNKNOWN, + QUERY_STRING, + HEADERS +}; + +static inline std::pair +discover_aws_flavour(const req_info& info) +{ + using rgw::auth::s3::AWS4_HMAC_SHA256_STR; + + AwsVersion version = AwsVersion::UNKNOWN; + AwsRoute route = AwsRoute::UNKNOWN; + + const char* http_auth = info.env->get("HTTP_AUTHORIZATION"); + if (http_auth && http_auth[0]) { + /* Authorization in Header */ + route = AwsRoute::HEADERS; + + if (!strncmp(http_auth, AWS4_HMAC_SHA256_STR, + strlen(AWS4_HMAC_SHA256_STR))) { + /* AWS v4 */ + version = AwsVersion::V4; + } else if (!strncmp(http_auth, "AWS ", 4)) { + /* AWS v2 */ + version = AwsVersion::V2; + } + } else { + route = AwsRoute::QUERY_STRING; + + if (info.args.get("X-Amz-Algorithm") == AWS4_HMAC_SHA256_STR) { + /* AWS v4 */ + version = AwsVersion::V4; + } else if (!info.args.get("AWSAccessKeyId").empty()) { + /* AWS v2 */ + version = AwsVersion::V2; + } + } + + return std::make_pair(version, route); +} + +/* + * verify that a signed request comes from the keyholder + * by checking the signature against our locally-computed version + * + * it tries AWS v4 before AWS v2 + */ +int RGW_Auth_S3::authorize(const DoutPrefixProvider *dpp, + RGWRados* const store, + const rgw::auth::StrategyRegistry& auth_registry, + struct req_state* const s) +{ + + /* neither keystone and rados enabled; warn and exit! 
*/ + if (!store->ctx()->_conf->rgw_s3_auth_use_rados && + !store->ctx()->_conf->rgw_s3_auth_use_keystone && + !store->ctx()->_conf->rgw_s3_auth_use_ldap) { + ldpp_dout(dpp, 0) << "WARNING: no authorization backend enabled! Users will never authenticate." << dendl; + return -EPERM; + } + + const auto ret = rgw::auth::Strategy::apply(dpp, auth_registry.get_s3_main(), s); + if (ret == 0) { + /* Populate the owner info. */ + s->owner.set_id(s->user->user_id); + s->owner.set_name(s->user->display_name); + } + return ret; +} + +int RGWHandler_Auth_S3::init(RGWRados *store, struct req_state *state, + rgw::io::BasicClient *cio) +{ + int ret = RGWHandler_REST_S3::init_from_header(state, RGW_FORMAT_JSON, + true); + if (ret < 0) + return ret; + + return RGWHandler_REST::init(store, state, cio); +} + +RGWHandler_REST* RGWRESTMgr_S3::get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + bool is_s3website = enable_s3website && (s->prot_flags & RGW_REST_WEBSITE); + int ret = + RGWHandler_REST_S3::init_from_header(s, + is_s3website ? 
RGW_FORMAT_HTML : + RGW_FORMAT_XML, true); + if (ret < 0) + return NULL; + + RGWHandler_REST* handler; + // TODO: Make this more readable + if (is_s3website) { + if (s->init_state.url_bucket.empty()) { + handler = new RGWHandler_REST_Service_S3Website(auth_registry); + } else if (s->object.empty()) { + handler = new RGWHandler_REST_Bucket_S3Website(auth_registry); + } else { + handler = new RGWHandler_REST_Obj_S3Website(auth_registry); + } + } else { + if (s->init_state.url_bucket.empty()) { + handler = new RGWHandler_REST_Service_S3(auth_registry, enable_sts, enable_iam, enable_pubsub); + } else if (s->object.empty()) { + handler = new RGWHandler_REST_Bucket_S3(auth_registry, enable_pubsub); + } else { + handler = new RGWHandler_REST_Obj_S3(auth_registry); + } + } + + ldout(s->cct, 20) << __func__ << " handler=" << typeid(*handler).name() + << dendl; + return handler; +} + +bool RGWHandler_REST_S3Website::web_dir() const { + std::string subdir_name = url_decode(s->object.name); + + if (subdir_name.empty()) { + return false; + } else if (subdir_name.back() == '/' && subdir_name.size() > 1) { + subdir_name.pop_back(); + } + + rgw_obj obj(s->bucket, subdir_name); + + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + obj_ctx.set_atomic(obj); + obj_ctx.set_prefetch_data(obj); + + RGWObjState* state = nullptr; + if (store->get_obj_state(&obj_ctx, s->bucket_info, obj, &state, false) < 0) { + return false; + } + if (! state->exists) { + return false; + } + return state->exists; +} + +int RGWHandler_REST_S3Website::init(RGWRados *store, req_state *s, + rgw::io::BasicClient* cio) +{ + // save the original object name before retarget() replaces it with the + // result of get_effective_key(). 
the error_handler() needs the original + // object name for redirect handling + original_object_name = s->object.name; + + return RGWHandler_REST_S3::init(store, s, cio); +} + +int RGWHandler_REST_S3Website::retarget(RGWOp* op, RGWOp** new_op) { + *new_op = op; + ldout(s->cct, 10) << __func__ << " Starting retarget" << dendl; + + if (!(s->prot_flags & RGW_REST_WEBSITE)) + return 0; + + int ret = store->get_bucket_info(*s->sysobj_ctx, s->bucket_tenant, + s->bucket_name, s->bucket_info, NULL, + &s->bucket_attrs); + if (ret < 0) { + // TODO-FUTURE: if the bucket does not exist, maybe expose it here? + return -ERR_NO_SUCH_BUCKET; + } + if (!s->bucket_info.has_website) { + // TODO-FUTURE: if the bucket has no WebsiteConfig, expose it here + return -ERR_NO_SUCH_WEBSITE_CONFIGURATION; + } + + rgw_obj_key new_obj; + bool get_res = s->bucket_info.website_conf.get_effective_key(s->object.name, &new_obj.name, web_dir()); + if (!get_res) { + s->err.message = "The IndexDocument Suffix is not configurated or not well formed!"; + ldout(s->cct, 5) << s->err.message << dendl; + return -EINVAL; + } + + ldout(s->cct, 10) << "retarget get_effective_key " << s->object << " -> " + << new_obj << dendl; + + RGWBWRoutingRule rrule; + bool should_redirect = + s->bucket_info.website_conf.should_redirect(new_obj.name, 0, &rrule); + + if (should_redirect) { + const string& hostname = s->info.env->get("HTTP_HOST", ""); + const string& protocol = + (s->info.env->get("SERVER_PORT_SECURE") ? 
"https" : "http"); + int redirect_code = 0; + rrule.apply_rule(protocol, hostname, s->object.name, &s->redirect, + &redirect_code); + // APply a custom HTTP response code + if (redirect_code > 0) + s->err.http_ret = redirect_code; // Apply a custom HTTP response code + ldout(s->cct, 10) << "retarget redirect code=" << redirect_code + << " proto+host:" << protocol << "://" << hostname + << " -> " << s->redirect << dendl; + return -ERR_WEBSITE_REDIRECT; + } + + /* + * FIXME: if s->object != new_obj, drop op and create a new op to handle + * operation. Or remove this comment if it's not applicable anymore + */ + + s->object = new_obj; + + return 0; +} + +RGWOp* RGWHandler_REST_S3Website::op_get() +{ + return get_obj_op(true); +} + +RGWOp* RGWHandler_REST_S3Website::op_head() +{ + return get_obj_op(false); +} + +int RGWHandler_REST_S3Website::serve_errordoc(int http_ret, const string& errordoc_key) { + int ret = 0; + s->formatter->reset(); /* Try to throw it all away */ + + std::shared_ptr getop( static_cast(op_get())); + if (getop.get() == NULL) { + return -1; // Trigger double error handler + } + getop->init(store, s, this); + getop->range_str = NULL; + getop->if_mod = NULL; + getop->if_unmod = NULL; + getop->if_match = NULL; + getop->if_nomatch = NULL; + s->object = errordoc_key; + + ret = init_permissions(getop.get()); + if (ret < 0) { + ldout(s->cct, 20) << "serve_errordoc failed, init_permissions ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = read_permissions(getop.get()); + if (ret < 0) { + ldout(s->cct, 20) << "serve_errordoc failed, read_permissions ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + if (http_ret) { + getop->set_custom_http_response(http_ret); + } + + ret = getop->init_processing(); + if (ret < 0) { + ldout(s->cct, 20) << "serve_errordoc failed, init_processing ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = getop->verify_op_mask(); + if (ret < 0) { + 
ldout(s->cct, 20) << "serve_errordoc failed, verify_op_mask ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = getop->verify_permission(); + if (ret < 0) { + ldout(s->cct, 20) << "serve_errordoc failed, verify_permission ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + ret = getop->verify_params(); + if (ret < 0) { + ldout(s->cct, 20) << "serve_errordoc failed, verify_params ret=" << ret << dendl; + return -1; // Trigger double error handler + } + + // No going back now + getop->pre_exec(); + /* + * FIXME Missing headers: + * With a working errordoc, the s3 error fields are rendered as HTTP headers, + * x-amz-error-code: NoSuchKey + * x-amz-error-message: The specified key does not exist. + * x-amz-error-detail-Key: foo + */ + getop->execute(); + getop->complete(); + return 0; + +} + +int RGWHandler_REST_S3Website::error_handler(int err_no, + string* error_content) { + int new_err_no = -1; + rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no > 0 ? err_no : -err_no); + int http_error_code = -1; + + if (r != rgw_http_s3_errors.end()) { + http_error_code = r->second.first; + } + ldout(s->cct, 10) << "RGWHandler_REST_S3Website::error_handler err_no=" << err_no << " http_ret=" << http_error_code << dendl; + + RGWBWRoutingRule rrule; + bool should_redirect = + s->bucket_info.website_conf.should_redirect(original_object_name, + http_error_code, &rrule); + + if (should_redirect) { + const string& hostname = s->info.env->get("HTTP_HOST", ""); + const string& protocol = + (s->info.env->get("SERVER_PORT_SECURE") ? 
"https" : "http"); + int redirect_code = 0; + rrule.apply_rule(protocol, hostname, original_object_name, + &s->redirect, &redirect_code); + // Apply a custom HTTP response code + if (redirect_code > 0) + s->err.http_ret = redirect_code; // Apply a custom HTTP response code + ldout(s->cct, 10) << "error handler redirect code=" << redirect_code + << " proto+host:" << protocol << "://" << hostname + << " -> " << s->redirect << dendl; + return -ERR_WEBSITE_REDIRECT; + } else if (err_no == -ERR_WEBSITE_REDIRECT) { + // Do nothing here, this redirect will be handled in abort_early's ERR_WEBSITE_REDIRECT block + // Do NOT fire the ErrorDoc handler + } else if (!s->bucket_info.website_conf.error_doc.empty()) { + /* This serves an entire page! + On success, it will return zero, and no further content should be sent to the socket + On failure, we need the double-error handler + */ + new_err_no = RGWHandler_REST_S3Website::serve_errordoc(http_error_code, s->bucket_info.website_conf.error_doc); + if (new_err_no != -1) { + err_no = new_err_no; + } + } else { + ldout(s->cct, 20) << "No special error handling today!" << dendl; + } + + return err_no; +} + +RGWOp* RGWHandler_REST_Obj_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + +RGWOp* RGWHandler_REST_Bucket_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! 
+ */ + RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + +RGWOp* RGWHandler_REST_Service_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + + +namespace rgw { +namespace auth { +namespace s3 { + +static rgw::auth::Completer::cmplptr_t +null_completer_factory(const boost::optional& secret_key) +{ + return nullptr; +} + + +AWSEngine::VersionAbstractor::auth_data_t +AWSGeneralAbstractor::get_auth_data(const req_state* const s) const +{ + AwsVersion version; + AwsRoute route; + std::tie(version, route) = discover_aws_flavour(s->info); + + if (version == AwsVersion::V2) { + return get_auth_data_v2(s); + } else if (version == AwsVersion::V4) { + return get_auth_data_v4(s, route == AwsRoute::QUERY_STRING); + } else { + /* FIXME(rzarzynski): handle anon user. 
*/ + throw -EINVAL; + } +} + +boost::optional +AWSGeneralAbstractor::get_v4_canonical_headers( + const req_info& info, + const boost::string_view& signedheaders, + const bool using_qs) const +{ + return rgw::auth::s3::get_v4_canonical_headers(info, signedheaders, + using_qs, false); +} + +AWSEngine::VersionAbstractor::auth_data_t +AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s, + const bool using_qs) const +{ + boost::string_view access_key_id; + boost::string_view signed_hdrs; + + boost::string_view date; + boost::string_view credential_scope; + boost::string_view client_signature; + boost::string_view session_token; + + int ret = rgw::auth::s3::parse_v4_credentials(s->info, + access_key_id, + credential_scope, + signed_hdrs, + client_signature, + date, + session_token, + using_qs); + if (ret < 0) { + throw ret; + } + + /* craft canonical headers */ + boost::optional canonical_headers = \ + get_v4_canonical_headers(s->info, signed_hdrs, using_qs); + if (canonical_headers) { + using sanitize = rgw::crypt_sanitize::log_content; + ldout(s->cct, 10) << "canonical headers format = " + << sanitize{*canonical_headers} << dendl; + } else { + throw -EPERM; + } + + bool is_non_s3_op = false; + if (s->op_type == RGW_STS_GET_SESSION_TOKEN || + s->op_type == RGW_STS_ASSUME_ROLE || + s->op_type == RGW_STS_ASSUME_ROLE_WEB_IDENTITY || + s->op_type == RGW_OP_CREATE_ROLE || + s->op_type == RGW_OP_DELETE_ROLE || + s->op_type == RGW_OP_GET_ROLE || + s->op_type == RGW_OP_MODIFY_ROLE || + s->op_type == RGW_OP_LIST_ROLES || + s->op_type == RGW_OP_PUT_ROLE_POLICY || + s->op_type == RGW_OP_GET_ROLE_POLICY || + s->op_type == RGW_OP_LIST_ROLE_POLICIES || + s->op_type == RGW_OP_DELETE_ROLE_POLICY || + s->op_type == RGW_OP_PUT_USER_POLICY || + s->op_type == RGW_OP_GET_USER_POLICY || + s->op_type == RGW_OP_LIST_USER_POLICIES || + s->op_type == RGW_OP_DELETE_USER_POLICY) { + is_non_s3_op = true; + } + + const char* exp_payload_hash = nullptr; + string payload_hash; + if 
(is_non_s3_op) { + //For non s3 ops, we need to calculate the payload hash + payload_hash = s->info.args.get("PayloadHash"); + exp_payload_hash = payload_hash.c_str(); + } else { + /* Get the expected hash. */ + exp_payload_hash = rgw::auth::s3::get_v4_exp_payload_hash(s->info); + } + + /* Craft canonical URI. Using std::move later so let it be non-const. */ + auto canonical_uri = rgw::auth::s3::get_v4_canonical_uri(s->info); + + /* Craft canonical query string. std::moving later so non-const here. */ + auto canonical_qs = rgw::auth::s3::get_v4_canonical_qs(s->info, using_qs); + + /* Craft canonical request. */ + auto canonical_req_hash = \ + rgw::auth::s3::get_v4_canon_req_hash(s->cct, + s->info.method, + std::move(canonical_uri), + std::move(canonical_qs), + std::move(*canonical_headers), + signed_hdrs, + exp_payload_hash); + + auto string_to_sign = \ + rgw::auth::s3::get_v4_string_to_sign(s->cct, + AWS4_HMAC_SHA256_STR, + date, + credential_scope, + std::move(canonical_req_hash)); + + const auto sig_factory = std::bind(rgw::auth::s3::get_v4_signature, + credential_scope, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3); + + /* Requests authenticated with the Query Parameters are treated as unsigned. + * From "Authenticating Requests: Using Query Parameters (AWS Signature + * Version 4)": + * + * You don't include a payload hash in the Canonical Request, because + * when you create a presigned URL, you don't know the payload content + * because the URL is used to upload an arbitrary payload. Instead, you + * use a constant string UNSIGNED-PAYLOAD. + * + * This means we have absolutely no business in spawning completer. Both + * aws4_auth_needs_complete and aws4_auth_streaming_mode are set to false + * by default. We don't need to change that. 
*/ + if (is_v4_payload_unsigned(exp_payload_hash) || is_v4_payload_empty(s) || is_non_s3_op) { + return { + access_key_id, + client_signature, + session_token, + std::move(string_to_sign), + sig_factory, + null_completer_factory + }; + } else { + /* We're going to handle a signed payload. Be aware that even empty HTTP + * body (no payload) requires verification: + * + * The x-amz-content-sha256 header is required for all AWS Signature + * Version 4 requests. It provides a hash of the request payload. If + * there is no payload, you must provide the hash of an empty string. */ + if (!is_v4_payload_streamed(exp_payload_hash)) { + ldout(s->cct, 10) << "delaying v4 auth" << dendl; + + /* payload in a single chunk */ + switch (s->op_type) + { + case RGW_OP_CREATE_BUCKET: + case RGW_OP_PUT_OBJ: + case RGW_OP_PUT_ACLS: + case RGW_OP_PUT_CORS: + case RGW_OP_INIT_MULTIPART: // in case that Init Multipart uses CHUNK encoding + case RGW_OP_COMPLETE_MULTIPART: + case RGW_OP_SET_BUCKET_VERSIONING: + case RGW_OP_DELETE_MULTI_OBJ: + case RGW_OP_ADMIN_SET_METADATA: + case RGW_OP_SET_BUCKET_WEBSITE: + case RGW_OP_PUT_BUCKET_POLICY: + case RGW_OP_PUT_OBJ_TAGGING: + case RGW_OP_PUT_LC: + case RGW_OP_SET_REQUEST_PAYMENT: + case RGW_OP_PUBSUB_NOTIF_CREATE: + case RGW_OP_PUT_BUCKET_OBJ_LOCK: + case RGW_OP_PUT_OBJ_RETENTION: + case RGW_OP_PUT_OBJ_LEGAL_HOLD: + case RGW_STS_GET_SESSION_TOKEN: + case RGW_STS_ASSUME_ROLE: + break; + default: + dout(10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED" << dendl; + throw -ERR_NOT_IMPLEMENTED; + } + + const auto cmpl_factory = std::bind(AWSv4ComplSingle::create, + s, + std::placeholders::_1); + return { + access_key_id, + client_signature, + session_token, + std::move(string_to_sign), + sig_factory, + cmpl_factory + }; + } else { + /* IMHO "streamed" doesn't fit too good here. I would prefer to call + * it "chunked" but let's be coherent with Amazon's terminology. 
*/ + + dout(10) << "body content detected in multiple chunks" << dendl; + + /* payload in multiple chunks */ + + switch(s->op_type) + { + case RGW_OP_PUT_OBJ: + break; + default: + dout(10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED (streaming mode)" << dendl; + throw -ERR_NOT_IMPLEMENTED; + } + + dout(10) << "aws4 seed signature ok... delaying v4 auth" << dendl; + + /* In the case of streamed payload client sets the x-amz-content-sha256 + * to "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" but uses "UNSIGNED-PAYLOAD" + * when constructing the Canonical Request. */ + + /* In the case of single-chunk upload client set the header's value is + * coherent with the one used for Canonical Request crafting. */ + + /* In the case of query string-based authentication there should be no + * x-amz-content-sha256 header and the value "UNSIGNED-PAYLOAD" is used + * for CanonReq. */ + const auto cmpl_factory = std::bind(AWSv4ComplMulti::create, + s, + date, + credential_scope, + client_signature, + std::placeholders::_1); + return { + access_key_id, + client_signature, + session_token, + std::move(string_to_sign), + sig_factory, + cmpl_factory + }; + } + } +} + + +boost::optional +AWSGeneralBoto2Abstractor::get_v4_canonical_headers( + const req_info& info, + const boost::string_view& signedheaders, + const bool using_qs) const +{ + return rgw::auth::s3::get_v4_canonical_headers(info, signedheaders, + using_qs, true); +} + + +AWSEngine::VersionAbstractor::auth_data_t +AWSGeneralAbstractor::get_auth_data_v2(const req_state* const s) const +{ + boost::string_view access_key_id; + boost::string_view signature; + boost::string_view session_token; + bool qsr = false; + + const char* http_auth = s->info.env->get("HTTP_AUTHORIZATION"); + if (! http_auth || http_auth[0] == '\0') { + /* Credentials are provided in query string. We also need to verify + * the "Expires" parameter now. 
*/ + access_key_id = s->info.args.get("AWSAccessKeyId"); + signature = s->info.args.get("Signature"); + qsr = true; + + boost::string_view expires = s->info.args.get("Expires"); + if (expires.empty()) { + throw -EPERM; + } + + /* It looks we have the guarantee that expires is a null-terminated, + * and thus string_view::data() can be safely used. */ + const time_t exp = atoll(expires.data()); + time_t now; + time(&now); + + if (now >= exp) { + throw -EPERM; + } + if (s->info.args.exists("X-Amz-Security-Token")) { + session_token = s->info.args.get("X-Amz-Security-Token"); + if (session_token.size() == 0) { + throw -EPERM; + } + } + + } else { + /* The "Authorization" HTTP header is being used. */ + const boost::string_view auth_str(http_auth + strlen("AWS ")); + const size_t pos = auth_str.rfind(':'); + if (pos != boost::string_view::npos) { + access_key_id = auth_str.substr(0, pos); + signature = auth_str.substr(pos + 1); + } + + if (s->info.env->exists("HTTP_X_AMZ_SECURITY_TOKEN")) { + session_token = s->info.env->get("HTTP_X_AMZ_SECURITY_TOKEN"); + if (session_token.size() == 0) { + throw -EPERM; + } + } + } + + /* Let's canonize the HTTP headers that are covered by the AWS auth v2. */ + std::string string_to_sign; + utime_t header_time; + if (! 
rgw_create_s3_canonical_header(s->info, &header_time, string_to_sign, + qsr)) { + ldout(cct, 10) << "failed to create the canonized auth header\n" + << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl; + throw -EPERM; + } + + ldout(cct, 10) << "string_to_sign:\n" + << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl; + + if (!qsr && !is_time_skew_ok(header_time)) { + throw -ERR_REQUEST_TIME_SKEWED; + } + + return { + std::move(access_key_id), + std::move(signature), + std::move(session_token), + std::move(string_to_sign), + rgw::auth::s3::get_v2_signature, + null_completer_factory + }; +} + + +AWSEngine::VersionAbstractor::auth_data_t +AWSBrowserUploadAbstractor::get_auth_data_v2(const req_state* const s) const +{ + return { + s->auth.s3_postobj_creds.access_key, + s->auth.s3_postobj_creds.signature, + s->auth.s3_postobj_creds.x_amz_security_token, + s->auth.s3_postobj_creds.encoded_policy.to_str(), + rgw::auth::s3::get_v2_signature, + null_completer_factory + }; +} + +AWSEngine::VersionAbstractor::auth_data_t +AWSBrowserUploadAbstractor::get_auth_data_v4(const req_state* const s) const +{ + const boost::string_view credential = s->auth.s3_postobj_creds.x_amz_credential; + + /* grab access key id */ + const size_t pos = credential.find("/"); + const boost::string_view access_key_id = credential.substr(0, pos); + dout(10) << "access key id = " << access_key_id << dendl; + + /* grab credential scope */ + const boost::string_view credential_scope = credential.substr(pos + 1); + dout(10) << "credential scope = " << credential_scope << dendl; + + const auto sig_factory = std::bind(rgw::auth::s3::get_v4_signature, + credential_scope, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3); + + return { + access_key_id, + s->auth.s3_postobj_creds.signature, + s->auth.s3_postobj_creds.x_amz_security_token, + s->auth.s3_postobj_creds.encoded_policy.to_str(), + sig_factory, + null_completer_factory + }; +} + 
+AWSEngine::VersionAbstractor::auth_data_t +AWSBrowserUploadAbstractor::get_auth_data(const req_state* const s) const +{ + if (s->auth.s3_postobj_creds.x_amz_algorithm == AWS4_HMAC_SHA256_STR) { + ldout(s->cct, 0) << "Signature verification algorithm AWS v4" + << " (AWS4-HMAC-SHA256)" << dendl; + return get_auth_data_v4(s); + } else { + ldout(s->cct, 0) << "Signature verification algorithm AWS v2" << dendl; + return get_auth_data_v2(s); + } +} + +AWSEngine::result_t +AWSEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const +{ + /* Small reminder: an ver_abstractor is allowed to throw! */ + const auto auth_data = ver_abstractor.get_auth_data(s); + + if (auth_data.access_key_id.empty() || auth_data.client_signature.empty()) { + return result_t::deny(-EINVAL); + } else { + return authenticate(dpp, + auth_data.access_key_id, + auth_data.client_signature, + auth_data.session_token, + auth_data.string_to_sign, + auth_data.signature_factory, + auth_data.completer_factory, + s); + } +} + +} /* namespace s3 */ +} /* namespace auth */ +} /* namespace rgw */ + +rgw::LDAPHelper* rgw::auth::s3::LDAPEngine::ldh = nullptr; +std::mutex rgw::auth::s3::LDAPEngine::mtx; + +void rgw::auth::s3::LDAPEngine::init(CephContext* const cct) +{ + if (! cct->_conf->rgw_s3_auth_use_ldap || + cct->_conf->rgw_ldap_uri.empty()) { + return; + } + + if (! ldh) { + std::lock_guard lck(mtx); + if (! 
ldh) { + const string& ldap_uri = cct->_conf->rgw_ldap_uri; + const string& ldap_binddn = cct->_conf->rgw_ldap_binddn; + const string& ldap_searchdn = cct->_conf->rgw_ldap_searchdn; + const string& ldap_searchfilter = cct->_conf->rgw_ldap_searchfilter; + const string& ldap_dnattr = cct->_conf->rgw_ldap_dnattr; + std::string ldap_bindpw = parse_rgw_ldap_bindpw(cct); + + ldh = new rgw::LDAPHelper(ldap_uri, ldap_binddn, ldap_bindpw, + ldap_searchdn, ldap_searchfilter, ldap_dnattr); + + ldh->init(); + ldh->bind(); + } + } +} + +bool rgw::auth::s3::LDAPEngine::valid() { + std::lock_guard lck(mtx); + return (!!ldh); +} + +rgw::auth::RemoteApplier::acl_strategy_t +rgw::auth::s3::LDAPEngine::get_acl_strategy() const +{ + //This is based on the assumption that the default acl strategy in + // get_perms_from_aclspec, will take care. Extra acl spec is not required. + return nullptr; +} + +rgw::auth::RemoteApplier::AuthInfo +rgw::auth::s3::LDAPEngine::get_creds_info(const rgw::RGWToken& token) const noexcept +{ + /* The short form of "using" can't be used here -- we're aliasing a class' + * member. */ + using acct_privilege_t = \ + rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + return rgw::auth::RemoteApplier::AuthInfo { + rgw_user(token.id), + token.id, + RGW_PERM_FULL_CONTROL, + acct_privilege_t::IS_PLAIN_ACCT, + TYPE_LDAP + }; +} + +rgw::auth::Engine::result_t +rgw::auth::s3::LDAPEngine::authenticate( + const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t&, + const completer_factory_t& completer_factory, + const req_state* const s) const +{ + /* boost filters and/or string_ref may throw on invalid input */ + rgw::RGWToken base64_token; + try { + base64_token = rgw::from_base64(access_key_id); + } catch (...) { + base64_token = std::string(""); + } + + if (! 
base64_token.valid()) { + return result_t::deny(); + } + + //TODO: Uncomment, when we have a migration plan in place. + //Check if a user of type other than 'ldap' is already present, if yes, then + //return error. + /*RGWUserInfo user_info; + user_info.user_id = base64_token.id; + if (rgw_get_user_info_by_uid(store, user_info.user_id, user_info) >= 0) { + if (user_info.type != TYPE_LDAP) { + ldpp_dout(dpp, 10) << "ERROR: User id of type: " << user_info.type << " is already present" << dendl; + return nullptr; + } + }*/ + + if (ldh->auth(base64_token.id, base64_token.key) != 0) { + return result_t::deny(-ERR_INVALID_ACCESS_KEY); + } + + auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(), + get_creds_info(base64_token)); + return result_t::grant(std::move(apl), completer_factory(boost::none)); +} /* rgw::auth::s3::LDAPEngine::authenticate */ + +void rgw::auth::s3::LDAPEngine::shutdown() { + if (ldh) { + delete ldh; + ldh = nullptr; + } +} + +/* LocalEngine */ +rgw::auth::Engine::result_t +rgw::auth::s3::LocalEngine::authenticate( + const DoutPrefixProvider* dpp, + const boost::string_view& _access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* const s) const +{ + /* get the user info */ + RGWUserInfo user_info; + /* TODO(rzarzynski): we need to have string-view taking variant. */ + const std::string access_key_id = _access_key_id.to_string(); + if (rgw_get_user_info_by_access_key(store, access_key_id, user_info) < 0) { + ldpp_dout(dpp, 5) << "error reading user info, uid=" << access_key_id + << " can't authenticate" << dendl; + return result_t::deny(-ERR_INVALID_ACCESS_KEY); + } + //TODO: Uncomment, when we have a migration plan in place. 
+ /*else { + if (s->user->type != TYPE_RGW) { + ldpp_dout(dpp, 10) << "ERROR: User id of type: " << s->user->type + << " is present" << dendl; + throw -EPERM; + } + }*/ + + const auto iter = user_info.access_keys.find(access_key_id); + if (iter == std::end(user_info.access_keys)) { + ldpp_dout(dpp, 0) << "ERROR: access key not encoded in user info" << dendl; + return result_t::deny(-EPERM); + } + const RGWAccessKey& k = iter->second; + + const VersionAbstractor::server_signature_t server_signature = \ + signature_factory(cct, k.key, string_to_sign); + auto compare = signature.compare(server_signature); + + ldpp_dout(dpp, 15) << "string_to_sign=" + << rgw::crypt_sanitize::log_content{string_to_sign} + << dendl; + ldpp_dout(dpp, 15) << "server signature=" << server_signature << dendl; + ldpp_dout(dpp, 15) << "client signature=" << signature << dendl; + ldpp_dout(dpp, 15) << "compare=" << compare << dendl; + + if (compare != 0) { + return result_t::deny(-ERR_SIGNATURE_NO_MATCH); + } + + auto apl = apl_factory->create_apl_local(cct, s, user_info, k.subuser, boost::none); + return result_t::grant(std::move(apl), completer_factory(k.key)); +} + +rgw::auth::RemoteApplier::AuthInfo +rgw::auth::s3::STSEngine::get_creds_info(const STS::SessionToken& token) const noexcept +{ + using acct_privilege_t = \ + rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t; + + return rgw::auth::RemoteApplier::AuthInfo { + token.user, + token.acct_name, + token.perm_mask, + (token.is_admin) ? acct_privilege_t::IS_ADMIN_ACCT: acct_privilege_t::IS_PLAIN_ACCT, + token.acct_type + }; +} + +int +rgw::auth::s3::STSEngine::get_session_token(const boost::string_view& session_token, + STS::SessionToken& token) const +{ + string decodedSessionToken; + try { + decodedSessionToken = rgw::from_base64(session_token); + } catch (...) { + ldout(cct, 0) << "ERROR: Invalid session token, not base64 encoded." 
<< dendl; + return -EINVAL; + } + + auto* cryptohandler = cct->get_crypto_handler(CEPH_CRYPTO_AES); + if (! cryptohandler) { + return -EINVAL; + } + string secret_s = cct->_conf->rgw_sts_key; + buffer::ptr secret(secret_s.c_str(), secret_s.length()); + int ret = 0; + if (ret = cryptohandler->validate_secret(secret); ret < 0) { + ldout(cct, 0) << "ERROR: Invalid secret key" << dendl; + return -EINVAL; + } + string error; + auto* keyhandler = cryptohandler->get_key_handler(secret, error); + if (! keyhandler) { + return -EINVAL; + } + error.clear(); + + string decrypted_str; + buffer::list en_input, dec_output; + en_input = buffer::list::static_from_string(decodedSessionToken); + + ret = keyhandler->decrypt(en_input, dec_output, &error); + if (ret < 0) { + ldout(cct, 0) << "ERROR: Decryption failed: " << error << dendl; + return -EPERM; + } else { + try { + dec_output.append('\0'); + auto iter = dec_output.cbegin(); + decode(token, iter); + } catch (const buffer::error& e) { + ldout(cct, 0) << "ERROR: decode SessionToken failed: " << error << dendl; + return -EINVAL; + } + } + return 0; +} + +rgw::auth::Engine::result_t +rgw::auth::s3::STSEngine::authenticate( + const DoutPrefixProvider* dpp, + const boost::string_view& _access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* const s) const +{ + if (! s->info.args.exists("X-Amz-Security-Token") && + ! 
s->info.env->exists("HTTP_X_AMZ_SECURITY_TOKEN")) { + return result_t::deny(); + } + + STS::SessionToken token; + if (int ret = get_session_token(session_token, token); ret < 0) { + return result_t::reject(ret); + } + //Authentication + //Check if access key is not the same passed in by client + if (token.access_key_id != _access_key_id) { + ldpp_dout(dpp, 0) << "Invalid access key" << dendl; + return result_t::reject(-EPERM); + } + //Check if the token has expired + if (! token.expiration.empty()) { + std::string expiration = token.expiration; + if (! expiration.empty()) { + boost::optional exp = ceph::from_iso_8601(expiration, false); + if (exp) { + real_clock::time_point now = real_clock::now(); + if (now >= *exp) { + ldpp_dout(dpp, 0) << "ERROR: Token expired" << dendl; + return result_t::reject(-EPERM); + } + } else { + ldpp_dout(dpp, 0) << "ERROR: Invalid expiration: " << expiration << dendl; + return result_t::reject(-EPERM); + } + } + } + //Check for signature mismatch + const VersionAbstractor::server_signature_t server_signature = \ + signature_factory(cct, token.secret_access_key, string_to_sign); + auto compare = signature.compare(server_signature); + + ldpp_dout(dpp, 15) << "string_to_sign=" + << rgw::crypt_sanitize::log_content{string_to_sign} + << dendl; + ldpp_dout(dpp, 15) << "server signature=" << server_signature << dendl; + ldpp_dout(dpp, 15) << "client signature=" << signature << dendl; + ldpp_dout(dpp, 15) << "compare=" << compare << dendl; + + if (compare != 0) { + return result_t::reject(-ERR_SIGNATURE_NO_MATCH); + } + + // Get all the authorization info + RGWUserInfo user_info; + rgw_user user_id; + vector role_policies; + string role_name; + if (! 
token.roleId.empty()) { + RGWRole role(s->cct, store, token.roleId); + if (role.get_by_id() < 0) { + return result_t::deny(-EPERM); + } + vector role_policy_names = role.get_role_policy_names(); + for (auto& policy_name : role_policy_names) { + string perm_policy; + if (int ret = role.get_role_policy(policy_name, perm_policy); ret == 0) { + role_policies.push_back(std::move(perm_policy)); + } + } + if (! token.policy.empty()) { + role_policies.push_back(std::move(token.policy)); + } + // This is mostly needed to assign the owner of a bucket during its creation + user_id = token.user; + role_name = role.get_name(); + } + + if (! token.user.empty() && token.acct_type != TYPE_ROLE) { + // get user info + int ret = rgw_get_user_info_by_uid(store, token.user, user_info, NULL); + if (ret < 0) { + ldpp_dout(dpp, 5) << "ERROR: failed reading user info: uid=" << token.user << dendl; + return result_t::reject(-EPERM); + } + } + + if (token.acct_type == TYPE_KEYSTONE || token.acct_type == TYPE_LDAP) { + auto apl = remote_apl_factory->create_apl_remote(cct, s, get_acl_strategy(), + get_creds_info(token)); + return result_t::grant(std::move(apl), completer_factory(boost::none)); + } else if (token.acct_type == TYPE_ROLE) { + auto apl = role_apl_factory->create_apl_role(cct, s, role_name, user_id, role_policies); + return result_t::grant(std::move(apl), completer_factory(token.secret_access_key)); + } else { // This is for all local users of type TYPE_RGW or TYPE_NONE + string subuser; + auto apl = local_apl_factory->create_apl_local(cct, s, user_info, subuser, token.perm_mask); + return result_t::grant(std::move(apl), completer_factory(token.secret_access_key)); + } +} + +bool rgw::auth::s3::S3AnonymousEngine::is_applicable( + const req_state* s +) const noexcept { + if (s->op == OP_OPTIONS) { + return true; + } + + AwsVersion version; + AwsRoute route; + std::tie(version, route) = discover_aws_flavour(s->info); + + return route == AwsRoute::QUERY_STRING && version == 
AwsVersion::UNKNOWN; +} diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h new file mode 100644 index 00000000..5010c3be --- /dev/null +++ b/src/rgw/rgw_rest_s3.h @@ -0,0 +1,1045 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_S3_H + +#define CEPH_RGW_REST_S3_H +#define TIME_BUF_SIZE 128 + +#include + +#include +#include + +#include "common/sstring.hh" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_http_errors.h" +#include "rgw_acl_s3.h" +#include "rgw_policy_s3.h" +#include "rgw_lc_s3.h" +#include "rgw_keystone.h" +#include "rgw_rest_conn.h" +#include "rgw_ldap.h" + +#include "rgw_token.h" +#include "include/ceph_assert.h" + +#include "rgw_auth.h" +#include "rgw_auth_filters.h" +#include "rgw_sts.h" + +struct rgw_http_error { + int http_ret; + const char *s3_code; +}; + +void rgw_get_errno_s3(struct rgw_http_error *e, int err_no); + +class RGWGetObj_ObjStore_S3 : public RGWGetObj_ObjStore +{ +protected: + // Serving a custom error page from an object is really a 200 response with + // just the status line altered. 
+ int custom_http_ret = 0; + std::map crypt_http_responses; +public: + RGWGetObj_ObjStore_S3() {} + ~RGWGetObj_ObjStore_S3() override {} + + int get_params() override; + int send_response_data_error() override; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override; + void set_custom_http_response(int http_ret) { custom_http_ret = http_ret; } + int get_decrypt_filter(std::unique_ptr* filter, + RGWGetObj_Filter* cb, + bufferlist* manifest_bl) override; +}; + +class RGWGetObjTags_ObjStore_S3 : public RGWGetObjTags_ObjStore +{ + bufferlist tags_bl; +public: + RGWGetObjTags_ObjStore_S3() {} + ~RGWGetObjTags_ObjStore_S3() {} + + void send_response_data(bufferlist &bl) override; +}; + +class RGWPutObjTags_ObjStore_S3 : public RGWPutObjTags_ObjStore +{ +public: + RGWPutObjTags_ObjStore_S3() {} + ~RGWPutObjTags_ObjStore_S3() {} + + int get_params() override; + void send_response() override; +}; + +class RGWDeleteObjTags_ObjStore_S3 : public RGWDeleteObjTags +{ +public: + ~RGWDeleteObjTags_ObjStore_S3() override {} + void send_response() override; +}; + +class RGWListBuckets_ObjStore_S3 : public RGWListBuckets_ObjStore { +public: + RGWListBuckets_ObjStore_S3() {} + ~RGWListBuckets_ObjStore_S3() override {} + + int get_params() override { + limit = -1; /* no limit */ + return 0; + } + void send_response_begin(bool has_buckets) override; + void send_response_data(RGWUserBuckets& buckets) override; + void send_response_end() override; +}; + +class RGWGetUsage_ObjStore_S3 : public RGWGetUsage_ObjStore { +public: + RGWGetUsage_ObjStore_S3() {} + ~RGWGetUsage_ObjStore_S3() override {} + + int get_params() override ; + void send_response() override; +}; + +class RGWListBucket_ObjStore_S3 : public RGWListBucket_ObjStore { +protected: + bool objs_container; + bool encode_key {false}; + int get_common_params(); + void send_common_response(); + void send_common_versioned_response(); + public: + RGWListBucket_ObjStore_S3() : objs_container(false) { + default_max = 
1000; + } + ~RGWListBucket_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; + void send_versioned_response(); +}; + +class RGWListBucket_ObjStore_S3v2 : public RGWListBucket_ObjStore_S3 { + bool fetchOwner; + bool start_after_exist {false}; // default-initialised: the constructor below only sets fetchOwner + bool continuation_token_exist {false}; // default-initialised for the same reason + string startAfter; + string continuation_token; +public: + RGWListBucket_ObjStore_S3v2() : fetchOwner(false) { + } + ~RGWListBucket_ObjStore_S3v2() override {} + + int get_params() override; + void send_response() override; + void send_versioned_response(); +}; + +class RGWGetBucketLogging_ObjStore_S3 : public RGWGetBucketLogging { +public: + RGWGetBucketLogging_ObjStore_S3() {} + ~RGWGetBucketLogging_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWGetBucketLocation_ObjStore_S3 : public RGWGetBucketLocation { +public: + RGWGetBucketLocation_ObjStore_S3() {} + ~RGWGetBucketLocation_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWGetBucketVersioning_ObjStore_S3 : public RGWGetBucketVersioning { +public: + RGWGetBucketVersioning_ObjStore_S3() {} + ~RGWGetBucketVersioning_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWSetBucketVersioning_ObjStore_S3 : public RGWSetBucketVersioning { +public: + RGWSetBucketVersioning_ObjStore_S3() {} + ~RGWSetBucketVersioning_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWGetBucketWebsite_ObjStore_S3 : public RGWGetBucketWebsite { +public: + RGWGetBucketWebsite_ObjStore_S3() {} + ~RGWGetBucketWebsite_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWSetBucketWebsite_ObjStore_S3 : public RGWSetBucketWebsite { +public: + RGWSetBucketWebsite_ObjStore_S3() {} + ~RGWSetBucketWebsite_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWDeleteBucketWebsite_ObjStore_S3 : public RGWDeleteBucketWebsite { +public: + 
RGWDeleteBucketWebsite_ObjStore_S3() {} + ~RGWDeleteBucketWebsite_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWStatBucket_ObjStore_S3 : public RGWStatBucket_ObjStore { +public: + RGWStatBucket_ObjStore_S3() {} + ~RGWStatBucket_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWCreateBucket_ObjStore_S3 : public RGWCreateBucket_ObjStore { +public: + RGWCreateBucket_ObjStore_S3() {} + ~RGWCreateBucket_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWDeleteBucket_ObjStore_S3 : public RGWDeleteBucket_ObjStore { +public: + RGWDeleteBucket_ObjStore_S3() {} + ~RGWDeleteBucket_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWPutObj_ObjStore_S3 : public RGWPutObj_ObjStore { +private: + std::map crypt_http_responses; + +public: + RGWPutObj_ObjStore_S3() {} + ~RGWPutObj_ObjStore_S3() override {} + + int get_params() override; + int get_data(bufferlist& bl) override; + void send_response() override; + + int get_encrypt_filter(std::unique_ptr *filter, + rgw::putobj::DataProcessor *cb) override; + int get_decrypt_filter(std::unique_ptr* filter, + RGWGetObj_Filter* cb, + map& attrs, + bufferlist* manifest_bl) override; +}; + +class RGWPostObj_ObjStore_S3 : public RGWPostObj_ObjStore { + parts_collection_t parts; + std::string filename; + std::string content_type; + RGWPolicyEnv env; + RGWPolicy post_policy; + map crypt_http_responses; + + const rgw::auth::StrategyRegistry* auth_registry_ptr = nullptr; + + int get_policy(); + int get_tags(); + void rebuild_key(string& key); + + std::string get_current_filename() const override; + std::string get_current_content_type() const override; + +public: + RGWPostObj_ObjStore_S3() {} + ~RGWPostObj_ObjStore_S3() override {} + + int verify_requester(const rgw::auth::StrategyRegistry& auth_registry) override { + auth_registry_ptr = &auth_registry; + return 
RGWPostObj_ObjStore::verify_requester(auth_registry); + } + + int get_params() override; + int complete_get_params(); + + void send_response() override; + int get_data(ceph::bufferlist& bl, bool& again) override; + int get_encrypt_filter(std::unique_ptr *filter, + rgw::putobj::DataProcessor *cb) override; +}; + +class RGWDeleteObj_ObjStore_S3 : public RGWDeleteObj_ObjStore { +public: + RGWDeleteObj_ObjStore_S3() {} + ~RGWDeleteObj_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWCopyObj_ObjStore_S3 : public RGWCopyObj_ObjStore { + bool sent_header; +public: + RGWCopyObj_ObjStore_S3() : sent_header(false) {} + ~RGWCopyObj_ObjStore_S3() override {} + + int init_dest_policy() override; + int get_params() override; + int check_storage_class(const rgw_placement_rule& src_placement); + void send_partial_response(off_t ofs) override; + void send_response() override; +}; + +class RGWGetACLs_ObjStore_S3 : public RGWGetACLs_ObjStore { +public: + RGWGetACLs_ObjStore_S3() {} + ~RGWGetACLs_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWPutACLs_ObjStore_S3 : public RGWPutACLs_ObjStore { +public: + RGWPutACLs_ObjStore_S3() {} + ~RGWPutACLs_ObjStore_S3() override {} + + int get_policy_from_state(RGWRados *store, struct req_state *s, stringstream& ss) override; + void send_response() override; + int get_params() override; +}; + +class RGWGetLC_ObjStore_S3 : public RGWGetLC_ObjStore { +protected: + RGWLifecycleConfiguration_S3 config; +public: + RGWGetLC_ObjStore_S3() {} + ~RGWGetLC_ObjStore_S3() override {} + void execute() override; + + void send_response() override; +}; + +class RGWPutLC_ObjStore_S3 : public RGWPutLC_ObjStore { +public: + RGWPutLC_ObjStore_S3() {} + ~RGWPutLC_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWDeleteLC_ObjStore_S3 : public RGWDeleteLC_ObjStore { +public: + RGWDeleteLC_ObjStore_S3() {} + ~RGWDeleteLC_ObjStore_S3() override {} + + void 
send_response() override; +}; + +class RGWGetCORS_ObjStore_S3 : public RGWGetCORS_ObjStore { +public: + RGWGetCORS_ObjStore_S3() {} + ~RGWGetCORS_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWPutCORS_ObjStore_S3 : public RGWPutCORS_ObjStore { +public: + RGWPutCORS_ObjStore_S3() {} + ~RGWPutCORS_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWDeleteCORS_ObjStore_S3 : public RGWDeleteCORS_ObjStore { +public: + RGWDeleteCORS_ObjStore_S3() {} + ~RGWDeleteCORS_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWOptionsCORS_ObjStore_S3 : public RGWOptionsCORS_ObjStore { +public: + RGWOptionsCORS_ObjStore_S3() {} + ~RGWOptionsCORS_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWGetRequestPayment_ObjStore_S3 : public RGWGetRequestPayment { +public: + RGWGetRequestPayment_ObjStore_S3() {} + ~RGWGetRequestPayment_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWSetRequestPayment_ObjStore_S3 : public RGWSetRequestPayment { +public: + RGWSetRequestPayment_ObjStore_S3() {} + ~RGWSetRequestPayment_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWInitMultipart_ObjStore_S3 : public RGWInitMultipart_ObjStore { +private: + std::map crypt_http_responses; +public: + RGWInitMultipart_ObjStore_S3() {} + ~RGWInitMultipart_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; + int prepare_encryption(map& attrs) override; +}; + +class RGWCompleteMultipart_ObjStore_S3 : public RGWCompleteMultipart_ObjStore { +public: + RGWCompleteMultipart_ObjStore_S3() {} + ~RGWCompleteMultipart_ObjStore_S3() override {} + + int get_params() override; + void send_response() override; +}; + +class RGWAbortMultipart_ObjStore_S3 : public RGWAbortMultipart_ObjStore { +public: + RGWAbortMultipart_ObjStore_S3() {} + ~RGWAbortMultipart_ObjStore_S3() 
override {} + + void send_response() override; +}; + +class RGWListMultipart_ObjStore_S3 : public RGWListMultipart_ObjStore { +public: + RGWListMultipart_ObjStore_S3() {} + ~RGWListMultipart_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWListBucketMultiparts_ObjStore_S3 : public RGWListBucketMultiparts_ObjStore { +public: + RGWListBucketMultiparts_ObjStore_S3() { + default_max = 1000; + } + ~RGWListBucketMultiparts_ObjStore_S3() override {} + + void send_response() override; +}; + +class RGWDeleteMultiObj_ObjStore_S3 : public RGWDeleteMultiObj_ObjStore { +public: + RGWDeleteMultiObj_ObjStore_S3() {} + ~RGWDeleteMultiObj_ObjStore_S3() override {} + + int get_params() override; + void send_status() override; + void begin_response() override; + void send_partial_response(rgw_obj_key& key, bool delete_marker, + const string& marker_version_id, int ret) override; + void end_response() override; +}; + +class RGWPutBucketObjectLock_ObjStore_S3 : public RGWPutBucketObjectLock_ObjStore { +public: + RGWPutBucketObjectLock_ObjStore_S3() {} + ~RGWPutBucketObjectLock_ObjStore_S3() override {} + void send_response() override; +}; + +class RGWGetBucketObjectLock_ObjStore_S3 : public RGWGetBucketObjectLock_ObjStore { +public: + RGWGetBucketObjectLock_ObjStore_S3() {} + ~RGWGetBucketObjectLock_ObjStore_S3() {} + void send_response() override; +}; + +class RGWPutObjRetention_ObjStore_S3 : public RGWPutObjRetention_ObjStore { +public: + RGWPutObjRetention_ObjStore_S3() {} + ~RGWPutObjRetention_ObjStore_S3() {} + int get_params() override; + void send_response() override; +}; + +class RGWGetObjRetention_ObjStore_S3 : public RGWGetObjRetention_ObjStore { +public: + RGWGetObjRetention_ObjStore_S3() {} + ~RGWGetObjRetention_ObjStore_S3() {} + void send_response() override; +}; + +class RGWPutObjLegalHold_ObjStore_S3 : public RGWPutObjLegalHold_ObjStore { +public: + RGWPutObjLegalHold_ObjStore_S3() {} + ~RGWPutObjLegalHold_ObjStore_S3() {} + void 
send_response() override; +}; + +class RGWGetObjLegalHold_ObjStore_S3 : public RGWGetObjLegalHold_ObjStore { +public: + RGWGetObjLegalHold_ObjStore_S3() {} + ~RGWGetObjLegalHold_ObjStore_S3() {} + void send_response() override; +}; + +class RGWGetObjLayout_ObjStore_S3 : public RGWGetObjLayout { +public: + RGWGetObjLayout_ObjStore_S3() {} + ~RGWGetObjLayout_ObjStore_S3() {} + + void send_response() override; +}; + +class RGWConfigBucketMetaSearch_ObjStore_S3 : public RGWConfigBucketMetaSearch { +public: + RGWConfigBucketMetaSearch_ObjStore_S3() {} + ~RGWConfigBucketMetaSearch_ObjStore_S3() {} + + int get_params() override; + void send_response() override; +}; + +class RGWGetBucketMetaSearch_ObjStore_S3 : public RGWGetBucketMetaSearch { +public: + RGWGetBucketMetaSearch_ObjStore_S3() {} + ~RGWGetBucketMetaSearch_ObjStore_S3() {} + + void send_response() override; +}; + +class RGWDelBucketMetaSearch_ObjStore_S3 : public RGWDelBucketMetaSearch { +public: + RGWDelBucketMetaSearch_ObjStore_S3() {} + ~RGWDelBucketMetaSearch_ObjStore_S3() {} + + void send_response() override; +}; + +class RGW_Auth_S3 { +public: + static int authorize(const DoutPrefixProvider *dpp, + RGWRados *store, + const rgw::auth::StrategyRegistry& auth_registry, + struct req_state *s); +}; + +class RGWHandler_Auth_S3 : public RGWHandler_REST { + friend class RGWRESTMgr_S3; + + const rgw::auth::StrategyRegistry& auth_registry; + +public: + explicit RGWHandler_Auth_S3(const rgw::auth::StrategyRegistry& auth_registry) + : RGWHandler_REST(), + auth_registry(auth_registry) { + } + ~RGWHandler_Auth_S3() override = default; + + static int validate_bucket_name(const string& bucket); + static int validate_object_name(const string& bucket); + + int init(RGWRados *store, + struct req_state *s, + rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider *dpp) override { + return RGW_Auth_S3::authorize(dpp, store, auth_registry, s); + } + int postauth_init() override { return 0; } +}; + +class 
RGWHandler_REST_S3 : public RGWHandler_REST { + friend class RGWRESTMgr_S3; +protected: + const rgw::auth::StrategyRegistry& auth_registry; +public: + static int init_from_header(struct req_state *s, int default_formatter, bool configurable_format); + + explicit RGWHandler_REST_S3(const rgw::auth::StrategyRegistry& auth_registry) + : RGWHandler_REST(), + auth_registry(auth_registry) { + } + ~RGWHandler_REST_S3() override = default; + + int init(RGWRados *store, + struct req_state *s, + rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider *dpp) override; + int postauth_init() override; +}; + +class RGWHandler_REST_Service_S3 : public RGWHandler_REST_S3 { +protected: + const bool isSTSenabled; + bool isIAMenabled; + const bool isPSenabled; + bool is_usage_op() { + return s->info.args.exists("usage"); + } + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_post() override; +public: + RGWHandler_REST_Service_S3(const rgw::auth::StrategyRegistry& auth_registry, + bool _isSTSenabled, bool _isIAMenabled, bool _isPSenabled) : + RGWHandler_REST_S3(auth_registry), isSTSenabled(_isSTSenabled), isIAMenabled(_isIAMenabled), isPSenabled(_isPSenabled) {} + ~RGWHandler_REST_Service_S3() override = default; +}; + +class RGWHandler_REST_Bucket_S3 : public RGWHandler_REST_S3 { + const bool enable_pubsub; +protected: + bool is_acl_op() { + return s->info.args.exists("acl"); + } + bool is_cors_op() { + return s->info.args.exists("cors"); + } + bool is_lc_op() { + return s->info.args.exists("lifecycle"); + } + bool is_obj_update_op() override { + return is_acl_op() || is_cors_op(); + } + bool is_request_payment_op() { + return s->info.args.exists("requestPayment"); + } + bool is_policy_op() { + return s->info.args.exists("policy"); + } + bool is_object_lock_op() { + return s->info.args.exists("object-lock"); + } + bool is_notification_op() const { + if (enable_pubsub) { + return s->info.args.exists("notification"); + } + return false; + } + 
RGWOp *get_obj_op(bool get_data); + + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + RGWOp *op_options() override; +public: + RGWHandler_REST_Bucket_S3(const rgw::auth::StrategyRegistry& auth_registry, bool _enable_pubsub) : + RGWHandler_REST_S3(auth_registry), enable_pubsub(_enable_pubsub) {} + ~RGWHandler_REST_Bucket_S3() override = default; +}; + +class RGWHandler_REST_Obj_S3 : public RGWHandler_REST_S3 { +protected: + bool is_acl_op() { + return s->info.args.exists("acl"); + } + bool is_tagging_op() { + return s->info.args.exists("tagging"); + } + bool is_obj_retention_op() { + return s->info.args.exists("retention"); + } + bool is_obj_legal_hold_op() { + return s->info.args.exists("legal-hold"); + } + + bool is_obj_update_op() override { + return is_acl_op() || is_tagging_op() || is_obj_retention_op() || is_obj_legal_hold_op(); + } + RGWOp *get_obj_op(bool get_data); + + RGWOp *op_get() override; + RGWOp *op_head() override; + RGWOp *op_put() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + RGWOp *op_options() override; +public: + using RGWHandler_REST_S3::RGWHandler_REST_S3; + ~RGWHandler_REST_Obj_S3() override = default; +}; + +class RGWRESTMgr_S3 : public RGWRESTMgr { +private: + bool enable_s3website; + bool enable_sts; + bool enable_iam; + const bool enable_pubsub; +public: + explicit RGWRESTMgr_S3(bool enable_s3website = false, bool enable_sts = false, bool enable_iam = false, bool _enable_pubsub = false) + : enable_s3website(enable_s3website), + enable_sts(enable_sts), + enable_iam(enable_iam), + enable_pubsub(_enable_pubsub) { + } + + ~RGWRESTMgr_S3() override = default; + + RGWHandler_REST *get_handler(struct req_state* s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override; +}; + +class RGWHandler_REST_Obj_S3Website; + +static inline bool looks_like_ip_address(const char *bucket) +{ + 
struct in6_addr a; + if (inet_pton(AF_INET6, bucket, static_cast(&a)) == 1) { + return true; + } + int num_periods = 0; + bool expect_period = false; + for (const char *b = bucket; *b; ++b) { + if (*b == '.') { + if (!expect_period) + return false; + ++num_periods; + if (num_periods > 3) + return false; + expect_period = false; + } + else if (isdigit(static_cast<unsigned char>(*b))) { + expect_period = true; + } + else { + return false; + } + } + return (num_periods == 3); +} + +static inline int valid_s3_object_name(const string& name) { + if (name.size() > 1024) { + return -ERR_INVALID_OBJECT_NAME; + } + if (check_utf8(name.c_str(), name.size())) { + return -ERR_INVALID_OBJECT_NAME; + } + return 0; +} + +static inline int valid_s3_bucket_name(const string& name, bool relaxed=false) +{ + // This function enforces Amazon's spec for bucket names. + // (The requirements, not the recommendations.) Returns 0 or -ERR_INVALID_BUCKET_NAME. + int len = name.size(); + if (len < 3) { + // Name too short + return -ERR_INVALID_BUCKET_NAME; + } else if (len > 255) { + // Name too long + return -ERR_INVALID_BUCKET_NAME; + } + + // bucket names must start with a letter or digit; relaxed mode also accepts a leading '_', '.' or '-' + if (!(isalpha(static_cast<unsigned char>(name[0])) || isdigit(static_cast<unsigned char>(name[0])))) { + if (!relaxed) + return -ERR_INVALID_BUCKET_NAME; + else if (!(name[0] == '_' || name[0] == '.' 
|| name[0] == '-')) + return -ERR_INVALID_BUCKET_NAME; + } + + for (const char ch : name) { // walk the whole string so bytes after an embedded NUL are validated too + const unsigned char c = static_cast<unsigned char>(ch); // <cctype> classifiers require an unsigned-char value + if (isdigit(c) || (c == '.')) + continue; + if (isalpha(c)) + continue; + if ((c == '-') || (c == '_')) + continue; + // Invalid character + return -ERR_INVALID_BUCKET_NAME; + } + + if (looks_like_ip_address(name.c_str())) + return -ERR_INVALID_BUCKET_NAME; + + return 0; +} + + +namespace rgw { +namespace auth { +namespace s3 { + +class AWSEngine : public rgw::auth::Engine { +public: + class VersionAbstractor { + static constexpr size_t DIGEST_SIZE_V2 = CEPH_CRYPTO_HMACSHA1_DIGESTSIZE; + static constexpr size_t DIGEST_SIZE_V4 = CEPH_CRYPTO_HMACSHA256_DIGESTSIZE; + + /* Knowing the signature max size allows us to employ the sstring, and thus + * avoid dynamic allocations. The multiplier comes from representing digest + * in the base64-encoded form. */ + static constexpr size_t SIGNATURE_MAX_SIZE = \ + std::max(DIGEST_SIZE_V2, DIGEST_SIZE_V4) * 2 + sizeof('\0'); + + public: + virtual ~VersionAbstractor() {}; + + using access_key_id_t = boost::string_view; + using client_signature_t = boost::string_view; + using session_token_t = boost::string_view; + using server_signature_t = basic_sstring; + using string_to_sign_t = std::string; + + /* Transformation for crafting the AWS signature at server side which is + * used later to compare with the user-provided one. The methodology for + * doing that depends on AWS auth version. */ + using signature_factory_t = \ + std::function; + + /* Return an instance of Completer for verifying the payload's fingerprint + * if necessary. Otherwise caller gets nullptr. 
Caller may provide secret + * key */ + using completer_factory_t = \ + std::function& secret_key)>; + + struct auth_data_t { + access_key_id_t access_key_id; + client_signature_t client_signature; + session_token_t session_token; + string_to_sign_t string_to_sign; + signature_factory_t signature_factory; + completer_factory_t completer_factory; + }; + + virtual auth_data_t get_auth_data(const req_state* s) const = 0; + }; + +protected: + CephContext* cct; + const VersionAbstractor& ver_abstractor; + + AWSEngine(CephContext* const cct, const VersionAbstractor& ver_abstractor) + : cct(cct), + ver_abstractor(ver_abstractor) { + } + + using result_t = rgw::auth::Engine::result_t; + using string_to_sign_t = VersionAbstractor::string_to_sign_t; + using signature_factory_t = VersionAbstractor::signature_factory_t; + using completer_factory_t = VersionAbstractor::completer_factory_t; + + /* TODO(rzarzynski): clean up. We have too many input parameters here. Also + * the signature of VersionAbstractor's get_auth_data() is too complicated. + * Replace these things with a simple, dedicated structure. 
*/ + virtual result_t authenticate(const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* s) const = 0; + +public: + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const final; +}; + + +class AWSGeneralAbstractor : public AWSEngine::VersionAbstractor { + CephContext* const cct; + + virtual boost::optional + get_v4_canonical_headers(const req_info& info, + const boost::string_view& signedheaders, + const bool using_qs) const; + + auth_data_t get_auth_data_v2(const req_state* s) const; + auth_data_t get_auth_data_v4(const req_state* s, const bool using_qs) const; + +public: + explicit AWSGeneralAbstractor(CephContext* const cct) + : cct(cct) { + } + + auth_data_t get_auth_data(const req_state* s) const override; +}; + +class AWSGeneralBoto2Abstractor : public AWSGeneralAbstractor { + boost::optional + get_v4_canonical_headers(const req_info& info, + const boost::string_view& signedheaders, + const bool using_qs) const override; + +public: + using AWSGeneralAbstractor::AWSGeneralAbstractor; +}; + +class AWSBrowserUploadAbstractor : public AWSEngine::VersionAbstractor { + static std::string to_string(ceph::bufferlist bl) { + return std::string(bl.c_str(), + static_cast(bl.length())); + } + + auth_data_t get_auth_data_v2(const req_state* s) const; + auth_data_t get_auth_data_v4(const req_state* s) const; + +public: + explicit AWSBrowserUploadAbstractor(CephContext*) { + } + + auth_data_t get_auth_data(const req_state* s) const override; +}; + + +class LDAPEngine : public AWSEngine { + static rgw::LDAPHelper* ldh; + static std::mutex mtx; + + static void init(CephContext* const cct); + + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + using auth_info_t = 
rgw::auth::RemoteApplier::AuthInfo; + using result_t = rgw::auth::Engine::result_t; + +protected: + RGWRados* const store; + const rgw::auth::RemoteApplier::Factory* const apl_factory; + + acl_strategy_t get_acl_strategy() const; + auth_info_t get_creds_info(const rgw::RGWToken& token) const noexcept; + + result_t authenticate(const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t&, + const completer_factory_t& completer_factory, + const req_state* s) const override; +public: + LDAPEngine(CephContext* const cct, + RGWRados* const store, + const VersionAbstractor& ver_abstractor, + const rgw::auth::RemoteApplier::Factory* const apl_factory) + : AWSEngine(cct, ver_abstractor), + store(store), + apl_factory(apl_factory) { + init(cct); + } + + using AWSEngine::authenticate; + + const char* get_name() const noexcept override { + return "rgw::auth::s3::LDAPEngine"; + } + + static bool valid(); + static void shutdown(); +}; + +class LocalEngine : public AWSEngine { + RGWRados* const store; + const rgw::auth::LocalApplier::Factory* const apl_factory; + + result_t authenticate(const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* s) const override; +public: + LocalEngine(CephContext* const cct, + RGWRados* const store, + const VersionAbstractor& ver_abstractor, + const rgw::auth::LocalApplier::Factory* const apl_factory) + : AWSEngine(cct, ver_abstractor), + store(store), + apl_factory(apl_factory) { + } + + using AWSEngine::authenticate; + + const char* get_name() const noexcept override { + return "rgw::auth::s3::LocalEngine"; + } +}; + +class 
STSEngine : public AWSEngine { + RGWRados* const store; + const rgw::auth::LocalApplier::Factory* const local_apl_factory; + const rgw::auth::RemoteApplier::Factory* const remote_apl_factory; + const rgw::auth::RoleApplier::Factory* const role_apl_factory; + + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + using auth_info_t = rgw::auth::RemoteApplier::AuthInfo; + + acl_strategy_t get_acl_strategy() const { return nullptr; }; + auth_info_t get_creds_info(const STS::SessionToken& token) const noexcept; + + int get_session_token(const boost::string_view& session_token, + STS::SessionToken& token) const; + + result_t authenticate(const DoutPrefixProvider* dpp, + const boost::string_view& access_key_id, + const boost::string_view& signature, + const boost::string_view& session_token, + const string_to_sign_t& string_to_sign, + const signature_factory_t& signature_factory, + const completer_factory_t& completer_factory, + const req_state* s) const override; +public: + STSEngine(CephContext* const cct, + RGWRados* const store, + const VersionAbstractor& ver_abstractor, + const rgw::auth::LocalApplier::Factory* const local_apl_factory, + const rgw::auth::RemoteApplier::Factory* const remote_apl_factory, + const rgw::auth::RoleApplier::Factory* const role_apl_factory) + : AWSEngine(cct, ver_abstractor), + store(store), + local_apl_factory(local_apl_factory), + remote_apl_factory(remote_apl_factory), + role_apl_factory(role_apl_factory) { + } + + using AWSEngine::authenticate; + + const char* get_name() const noexcept override { + return "rgw::auth::s3::STSEngine"; + } +}; + +class S3AnonymousEngine : public rgw::auth::AnonymousEngine { + bool is_applicable(const req_state* s) const noexcept override; + +public: + /* Let's reuse the parent class' constructor. 
*/ + using rgw::auth::AnonymousEngine::AnonymousEngine; + + const char* get_name() const noexcept override { + return "rgw::auth::s3::S3AnonymousEngine"; + } +}; + + +} /* namespace s3 */ +} /* namespace auth */ +} /* namespace rgw */ + + +#endif /* CEPH_RGW_REST_S3_H */ diff --git a/src/rgw/rgw_rest_s3website.h b/src/rgw/rgw_rest_s3website.h new file mode 100644 index 00000000..209ef964 --- /dev/null +++ b/src/rgw/rgw_rest_s3website.h @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Robin H. Johnson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RGW_REST_S3WEBSITE_H +#define CEPH_RGW_REST_S3WEBSITE_H + +#include "rgw_rest_s3.h" + +class RGWHandler_REST_S3Website : public RGWHandler_REST_S3 { + std::string original_object_name; // object name before retarget() + bool web_dir() const; +protected: + int retarget(RGWOp *op, RGWOp **new_op) override; + // TODO: this should be virtual I think, and ensure that it's always + // overridden, but that conflates that op_get/op_head are defined in this + // class and call this; and don't need to be overridden later. 
+ virtual RGWOp *get_obj_op(bool get_data) { return NULL; } + RGWOp *op_get() override; + RGWOp *op_head() override; + // Only allowed to use GET+HEAD + RGWOp *op_put() override { return NULL; } + RGWOp *op_delete() override { return NULL; } + RGWOp *op_post() override { return NULL; } + RGWOp *op_copy() override { return NULL; } + RGWOp *op_options() override { return NULL; } + + int serve_errordoc(int http_ret, const string &errordoc_key); +public: + using RGWHandler_REST_S3::RGWHandler_REST_S3; + ~RGWHandler_REST_S3Website() override = default; + + int init(RGWRados *store, req_state *s, rgw::io::BasicClient* cio) override; + int error_handler(int err_no, string *error_content) override; +}; + +class RGWHandler_REST_Service_S3Website : public RGWHandler_REST_S3Website { +protected: + RGWOp *get_obj_op(bool get_data) override; +public: + using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website; + ~RGWHandler_REST_Service_S3Website() override = default; +}; + +class RGWHandler_REST_Obj_S3Website : public RGWHandler_REST_S3Website { +protected: + RGWOp *get_obj_op(bool get_data) override; +public: + using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website; + ~RGWHandler_REST_Obj_S3Website() override = default; +}; + +/* The cross-inheritance from Obj to Bucket is deliberate! + * S3Websites do NOT support any bucket operations + */ +class RGWHandler_REST_Bucket_S3Website : public RGWHandler_REST_S3Website { +protected: + RGWOp *get_obj_op(bool get_data) override; +public: + using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website; + ~RGWHandler_REST_Bucket_S3Website() override = default; +}; + +// TODO: do we actually need this? 
+class RGWGetObj_ObjStore_S3Website : public RGWGetObj_ObjStore_S3 +{ + friend class RGWHandler_REST_S3Website; +private: + bool is_errordoc_request; +public: + RGWGetObj_ObjStore_S3Website() : is_errordoc_request(false) {} + explicit RGWGetObj_ObjStore_S3Website(bool is_errordoc_request) : is_errordoc_request(false) { this->is_errordoc_request = is_errordoc_request; } + ~RGWGetObj_ObjStore_S3Website() override {} + int send_response_data_error() override; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override; + // We override RGWGetObj_ObjStore::get_params here, to allow ignoring all + // conditional params for error pages. + int get_params() override { + if (is_errordoc_request) { + range_str = NULL; + if_mod = NULL; + if_unmod = NULL; + if_match = NULL; + if_nomatch = NULL; + return 0; + } else { + return RGWGetObj_ObjStore_S3::get_params(); + } + } +}; + +#endif diff --git a/src/rgw/rgw_rest_sts.cc b/src/rgw/rgw_rest_sts.cc new file mode 100644 index 00000000..f7424be9 --- /dev/null +++ b/src/rgw/rgw_rest_sts.cc @@ -0,0 +1,459 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include + +#include "ceph_ver.h" + +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rest.h" +#include "rgw_auth.h" +#include "rgw_auth_registry.h" +#include "rgw_rest_sts.h" + +#include "rgw_formats.h" +#include "rgw_client_io.h" + +#include "rgw_request.h" +#include "rgw_process.h" +#include "rgw_iam_policy.h" +#include "rgw_iam_policy_keywords.h" + +#include "rgw_sts.h" + +#include +#include +#include + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +namespace rgw { +namespace auth { +namespace sts { + +bool +WebTokenEngine::is_applicable(const std::string& token) const noexcept +{ + return ! 
token.empty(); +} + +boost::optional<WebTokenEngine::token_t> +WebTokenEngine::get_from_idp(const DoutPrefixProvider* dpp, const std::string& token) const +{ + //Access token conforming to OAuth2.0 + if (! cct->_conf->rgw_sts_token_introspection_url.empty()) { + bufferlist introspect_resp; + RGWHTTPTransceiver introspect_req(cct, "POST", cct->_conf->rgw_sts_token_introspection_url, &introspect_resp); + //Headers + introspect_req.append_header("Content-Type", "application/x-www-form-urlencoded"); + string base64_creds = "Basic " + rgw::to_base64(cct->_conf->rgw_sts_client_id + ":" + cct->_conf->rgw_sts_client_secret); + introspect_req.append_header("Authorization", base64_creds); + // POST data + string post_data = "token=" + token; + introspect_req.set_post_data(post_data); + introspect_req.set_send_length(post_data.length()); + + int res = introspect_req.process(); + if (res < 0) { + ldpp_dout(dpp, 10) << "HTTP request res: " << res << dendl; + throw -EINVAL; + } + //Debug only + ldpp_dout(dpp, 20) << "HTTP status: " << introspect_req.get_http_status() << dendl; + ldpp_dout(dpp, 20) << "JSON Response is: " << introspect_resp.c_str() << dendl; + + JSONParser parser; + WebTokenEngine::token_t token; /* claims decoded from the response; shadows the string parameter from here on */ + if (!parser.parse(introspect_resp.c_str(), introspect_resp.length())) { + ldpp_dout(dpp, 2) << "Malformed json" << dendl; + throw -EINVAL; + } else { + bool is_active; + JSONDecoder::decode_json("active", is_active, &parser); + if (! 
is_active) { + ldpp_dout(dpp, 0) << "Active state is false" << dendl; + throw -ERR_INVALID_IDENTITY_TOKEN; + } + JSONDecoder::decode_json("iss", token.iss, &parser); + JSONDecoder::decode_json("aud", token.aud, &parser); + JSONDecoder::decode_json("sub", token.sub, &parser); + JSONDecoder::decode_json("user_name", token.user_name, &parser); + } + return token; + } + return boost::none; +} + +WebTokenEngine::result_t +WebTokenEngine::authenticate( const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s) const +{ + boost::optional<WebTokenEngine::token_t> t; + + if (! is_applicable(token)) { + return result_t::deny(); + } + + try { + t = get_from_idp(dpp, token); + } catch(...) { /* get_from_idp reports failures by throwing; any throw denies the request */ + return result_t::deny(-EACCES); + } + + if (t) { + auto apl = apl_factory->create_apl_web_identity(cct, s, *t); + return result_t::grant(std::move(apl)); + } + return result_t::deny(-EACCES); +} + +}; /* namespace sts */ +}; /* namespace auth */ +}; /* namespace rgw */ + +int RGWREST_STS::verify_permission() +{ + STS::STSService _sts(s->cct, store, s->user->user_id, s->auth.identity.get()); + sts = std::move(_sts); + + string rArn = s->info.args.get("RoleArn"); + const auto& [ret, role] = sts.getRoleInfo(rArn); + if (ret < 0) { + return ret; + } + string policy = role.get_assume_role_policy(); + buffer::list bl = buffer::list::static_from_string(policy); + + //Parse the policy + //TODO - This step should be part of Role Creation + try { + const rgw::IAM::Policy p(s->cct, s->user->user_id.tenant, bl); + //Check if the input role arn is there as one of the Principals in the policy, + // If yes, then return 0, else -EPERM + auto p_res = p.eval_principal(s->env, *s->auth.identity); + if (p_res == rgw::IAM::Effect::Deny) { + return -EPERM; + } + auto c_res = p.eval_conditions(s->env); + if (c_res == rgw::IAM::Effect::Deny) { + return -EPERM; + } + } catch (rgw::IAM::PolicyParseException& e) { + ldout(s->cct, 20) << "failed to parse policy: " << e.what() << dendl; + return -EPERM; + } + + 
return 0; +} + +void RGWREST_STS::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWSTSGetSessionToken::verify_permission() +{ + rgw::Partition partition = rgw::Partition::aws; + rgw::Service service = rgw::Service::s3; + if (!verify_user_permission(this, + s, + rgw::ARN(partition, service, "", s->user->user_id.tenant, ""), + rgw::IAM::stsGetSessionToken)) { + return -EACCES; + } + + return 0; +} + +int RGWSTSGetSessionToken::get_params() +{ + duration = s->info.args.get("DurationSeconds"); + serialNumber = s->info.args.get("SerialNumber"); + tokenCode = s->info.args.get("TokenCode"); + + if (! duration.empty()) { + string err; + uint64_t duration_in_secs = strict_strtoll(duration.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + + if (duration_in_secs < STS::GetSessionTokenRequest::getMinDuration() || + duration_in_secs > s->cct->_conf->rgw_sts_max_session_duration) + return -EINVAL; + } + + return 0; +} + +void RGWSTSGetSessionToken::execute() +{ + if (op_ret = get_params(); op_ret < 0) { + return; + } + + STS::STSService sts(s->cct, store, s->user->user_id, s->auth.identity.get()); + + STS::GetSessionTokenRequest req(duration, serialNumber, tokenCode); + const auto& [ret, creds] = sts.getSessionToken(req); + op_ret = std::move(ret); + //Dump the output + if (op_ret == 0) { + s->formatter->open_object_section("GetSessionTokenResponse"); + s->formatter->open_object_section("GetSessionTokenResult"); + s->formatter->open_object_section("Credentials"); + creds.dump(s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWSTSAssumeRoleWithWebIdentity::get_params() +{ + duration = s->info.args.get("DurationSeconds"); + providerId = s->info.args.get("ProviderId"); + policy = s->info.args.get("Policy"); + roleArn = s->info.args.get("RoleArn"); + roleSessionName = s->info.args.get("RoleSessionName"); + iss = 
s->info.args.get("provider_id"); + sub = s->info.args.get("sub"); + aud = s->info.args.get("aud"); + + if (roleArn.empty() || roleSessionName.empty() || sub.empty() || aud.empty()) { + ldout(s->cct, 20) << "ERROR: one of role arn or role session name or token is empty" << dendl; + return -EINVAL; + } + + if (! policy.empty()) { + bufferlist bl = bufferlist::static_from_string(policy); + try { + const rgw::IAM::Policy p(s->cct, s->user->user_id.tenant, bl); + } + catch (rgw::IAM::PolicyParseException& e) { + ldout(s->cct, 20) << "failed to parse policy: " << e.what() << "policy" << policy << dendl; + return -ERR_MALFORMED_DOC; + } + } + + return 0; +} + +void RGWSTSAssumeRoleWithWebIdentity::execute() +{ + if (op_ret = get_params(); op_ret < 0) { + return; + } + + STS::AssumeRoleWithWebIdentityRequest req(duration, providerId, policy, roleArn, + roleSessionName, iss, sub, aud); + STS::AssumeRoleWithWebIdentityResponse response = sts.assumeRoleWithWebIdentity(req); + op_ret = std::move(response.assumeRoleResp.retCode); + + //Dump the output + if (op_ret == 0) { + s->formatter->open_object_section("AssumeRoleWithWebIdentityResponse"); + s->formatter->open_object_section("AssumeRoleWithWebIdentityResult"); + encode_json("SubjectFromWebIdentityToken", response.sub , s->formatter); + encode_json("Audience", response.aud , s->formatter); + s->formatter->open_object_section("AssumedRoleUser"); + response.assumeRoleResp.user.dump(s->formatter); + s->formatter->close_section(); + s->formatter->open_object_section("Credentials"); + response.assumeRoleResp.creds.dump(s->formatter); + s->formatter->close_section(); + encode_json("Provider", response.providerId , s->formatter); + encode_json("PackedPolicySize", response.assumeRoleResp.packedPolicySize , s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGWSTSAssumeRole::get_params() +{ + duration = s->info.args.get("DurationSeconds"); + externalId = s->info.args.get("ExternalId"); + 
policy = s->info.args.get("Policy"); + roleArn = s->info.args.get("RoleArn"); + roleSessionName = s->info.args.get("RoleSessionName"); + serialNumber = s->info.args.get("SerialNumber"); + tokenCode = s->info.args.get("TokenCode"); + + if (roleArn.empty() || roleSessionName.empty()) { + ldout(s->cct, 20) << "ERROR: one of role arn or role session name is empty" << dendl; + return -EINVAL; + } + + if (! policy.empty()) { + bufferlist bl = bufferlist::static_from_string(policy); + try { + const rgw::IAM::Policy p(s->cct, s->user->user_id.tenant, bl); + } + catch (rgw::IAM::PolicyParseException& e) { + ldout(s->cct, 20) << "failed to parse policy: " << e.what() << "policy" << policy << dendl; + return -ERR_MALFORMED_DOC; + } + } + + return 0; +} + +void RGWSTSAssumeRole::execute() +{ + if (op_ret = get_params(); op_ret < 0) { + return; + } + + STS::AssumeRoleRequest req(duration, externalId, policy, roleArn, + roleSessionName, serialNumber, tokenCode); + STS::AssumeRoleResponse response = sts.assumeRole(req); + op_ret = std::move(response.retCode); + //Dump the output + if (op_ret == 0) { + s->formatter->open_object_section("AssumeRoleResponse"); + s->formatter->open_object_section("AssumeRoleResult"); + s->formatter->open_object_section("Credentials"); + response.creds.dump(s->formatter); + s->formatter->close_section(); + s->formatter->open_object_section("AssumedRoleUser"); + response.user.dump(s->formatter); + s->formatter->close_section(); + encode_json("PackedPolicySize", response.packedPolicySize , s->formatter); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +int RGW_Auth_STS::authorize(const DoutPrefixProvider *dpp, + RGWRados *store, + const rgw::auth::StrategyRegistry& auth_registry, + struct req_state *s) +{ + return rgw::auth::Strategy::apply(dpp, auth_registry.get_sts(), s); +} + +void RGWHandler_REST_STS::rgw_sts_parse_input() +{ + if (post_body.size() > 0) { + ldout(s->cct, 10) << "Content of POST: " << post_body << dendl; + 
+ if (post_body.find("Action") != string::npos) { + /* Split the form-encoded body on '&' and store each key=value pair as a request argument. */ + boost::char_separator<char> sep("&"); + boost::tokenizer<boost::char_separator<char>> tokens(post_body, sep); + for (const auto& t : tokens) { + auto pos = t.find("="); + if (pos != string::npos) { + s->info.args.append(t.substr(0,pos), + url_decode(t.substr(pos+1, t.size() -1))); + } + } + } + } + auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body); + s->info.args.append("PayloadHash", payload_hash); +} + +RGWOp *RGWHandler_REST_STS::op_post() +{ + rgw_sts_parse_input(); + + if (s->info.args.exists("Action")) { + string action = s->info.args.get("Action"); + if (action == "AssumeRole") { + return new RGWSTSAssumeRole; + } else if (action == "GetSessionToken") { + return new RGWSTSGetSessionToken; + } else if (action == "AssumeRoleWithWebIdentity") { + return new RGWSTSAssumeRoleWithWebIdentity; + } + } + + return nullptr; +} + +int RGWHandler_REST_STS::init(RGWRados *store, + struct req_state *s, + rgw::io::BasicClient *cio) +{ + s->dialect = "sts"; + + if (int ret = RGWHandler_REST_STS::init_from_header(s, RGW_FORMAT_XML, true); ret < 0) { + ldout(s->cct, 10) << "init_from_header returned err=" << ret << dendl; + return ret; + } + + return RGWHandler_REST::init(store, s, cio); +} + +int RGWHandler_REST_STS::authorize(const DoutPrefixProvider* dpp) +{ + if (s->info.args.exists("Action") && s->info.args.get("Action") == "AssumeRoleWithWebIdentity") { + return RGW_Auth_STS::authorize(dpp, store, auth_registry, s); + } + return RGW_Auth_S3::authorize(dpp, store, auth_registry, s); +} + +int RGWHandler_REST_STS::init_from_header(struct req_state* s, + int default_formatter, + bool configurable_format) +{ + string req; + string first; + + s->prot_flags = RGW_REST_STS; + + const char *p, *req_name; + if (req_name = s->relative_uri.c_str(); *req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(); + + /* must be called after the args parsing */ + if (int ret = 
allocate_formatter(s, default_formatter, configurable_format); ret < 0) + return ret; + + if (*req_name != '/') + return 0; + + req_name++; + + if (!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } + + return 0; +} + +RGWHandler_REST* +RGWRESTMgr_STS::get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + return new RGWHandler_REST_STS(auth_registry); +} diff --git a/src/rgw/rgw_rest_sts.h b/src/rgw/rgw_rest_sts.h new file mode 100644 index 00000000..d9baa2c3 --- /dev/null +++ b/src/rgw/rgw_rest_sts.h @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_STS_H +#define CEPH_RGW_REST_STS_H + +#include "rgw_auth.h" +#include "rgw_auth_filters.h" +#include "rgw_sts.h" +#include "rgw_web_idp.h" + +namespace rgw { +namespace auth { +namespace sts { + +class WebTokenEngine : public rgw::auth::Engine { + CephContext* const cct; + + using result_t = rgw::auth::Engine::result_t; + using token_t = rgw::web_idp::WebTokenClaims; + + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::WebIdentityApplier::Factory* const apl_factory; + + bool is_applicable(const std::string& token) const noexcept; + + boost::optional<token_t> + get_from_idp(const DoutPrefixProvider* dpp, const std::string& token) const; + + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s) const; + +public: + WebTokenEngine(CephContext* const cct, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::WebIdentityApplier::Factory* const apl_factory) + : cct(cct), + extractor(extractor), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::sts::WebTokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, 
const req_state* const s) const override { + return authenticate(dpp, extractor->get_token(s), s); + } +}; /* class WebTokenEngine */ + +class DefaultStrategy : public rgw::auth::Strategy, + public rgw::auth::TokenExtractor, + public rgw::auth::WebIdentityApplier::Factory { + RGWRados* const store; + + /* The engine. */ + const WebTokenEngine web_token_engine; + + using aplptr_t = rgw::auth::IdentityApplier::aplptr_t; + + /* The method implements TokenExtractor for Web Token in req_state. */ + std::string get_token(const req_state* const s) const override { + return s->info.args.get("WebIdentityToken"); + } + + aplptr_t create_apl_web_identity( CephContext* cct, + const req_state* s, + const rgw::web_idp::WebTokenClaims& token) const override { + auto apl = rgw::auth::add_sysreq(cct, store, s, + rgw::auth::WebIdentityApplier(cct, store, token)); + return aplptr_t(new decltype(apl)(std::move(apl))); + } + +public: + DefaultStrategy(CephContext* const cct, + RGWRados* const store) + : store(store), + web_token_engine(cct, /* the strategy itself is passed as both the token extractor and the applier factory */ + static_cast<rgw::auth::TokenExtractor*>(this), + static_cast<rgw::auth::WebIdentityApplier::Factory*>(this)) { + /* When the constructor's body is being executed, all member engines + * should be initialized. Thus, we can safely add them. 
*/ + using Control = rgw::auth::Strategy::Control; + add_engine(Control::SUFFICIENT, web_token_engine); + } + + const char* get_name() const noexcept override { + return "rgw::auth::sts::DefaultStrategy"; + } +}; + +}; /* namespace sts */ +}; /* namespace auth */ +}; + +class RGWREST_STS : public RGWRESTOp { +protected: + STS::STSService sts; +public: + RGWREST_STS() = default; + int verify_permission() override; + void send_response() override; +}; + +class RGWSTSAssumeRoleWithWebIdentity : public RGWREST_STS { +protected: + string duration; + string providerId; + string policy; + string roleArn; + string roleSessionName; + string sub; + string aud; + string iss; +public: + RGWSTSAssumeRoleWithWebIdentity() = default; + void execute() override; + int get_params(); + const char* name() const override { return "assume_role_web_identity"; } + RGWOpType get_type() override { return RGW_STS_ASSUME_ROLE_WEB_IDENTITY; } +}; + +class RGWSTSAssumeRole : public RGWREST_STS { +protected: + string duration; + string externalId; + string policy; + string roleArn; + string roleSessionName; + string serialNumber; + string tokenCode; +public: + RGWSTSAssumeRole() = default; + void execute() override; + int get_params(); + const char* name() const override { return "assume_role"; } + RGWOpType get_type() override { return RGW_STS_ASSUME_ROLE; } +}; + +class RGWSTSGetSessionToken : public RGWREST_STS { +protected: + string duration; + string serialNumber; + string tokenCode; +public: + RGWSTSGetSessionToken() = default; + void execute() override; + int verify_permission() override; + int get_params(); + const char* name() const override { return "get_session_token"; } + RGWOpType get_type() override { return RGW_STS_GET_SESSION_TOKEN; } +}; + +class RGW_Auth_STS { +public: + static int authorize(const DoutPrefixProvider *dpp, + RGWRados *store, + const rgw::auth::StrategyRegistry& auth_registry, + struct req_state *s); +}; + +class RGWHandler_REST_STS : public RGWHandler_REST { + 
const rgw::auth::StrategyRegistry& auth_registry; + const string post_body; /* held by value: the constructor's default argument "" is a temporary, so a reference member would dangle once construction finishes */ + RGWOp *op_post() override; + void rgw_sts_parse_input(); +public: + + static int init_from_header(struct req_state *s, int default_formatter, bool configurable_format); + + RGWHandler_REST_STS(const rgw::auth::StrategyRegistry& auth_registry, const string& post_body="") + : RGWHandler_REST(), + auth_registry(auth_registry), + post_body(post_body) {} + ~RGWHandler_REST_STS() override = default; + + int init(RGWRados *store, + struct req_state *s, + rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider* dpp) override; + int postauth_init() override { return 0; } +}; + +class RGWRESTMgr_STS : public RGWRESTMgr { +public: + RGWRESTMgr_STS() = default; + ~RGWRESTMgr_STS() override = default; + + RGWRESTMgr *get_resource_mgr(struct req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry&, + const std::string&) override; +}; + +#endif /* CEPH_RGW_REST_STS_H */ + diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc new file mode 100644 index 00000000..e1d8095e --- /dev/null +++ b/src/rgw/rgw_rest_swift.cc @@ -0,0 +1,3093 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +#include "include/ceph_assert.h" +#include "ceph_ver.h" + +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rest_swift.h" +#include "rgw_acl_swift.h" +#include "rgw_cors_swift.h" +#include "rgw_formats.h" +#include "rgw_client_io.h" + +#include "rgw_auth.h" +#include "rgw_swift_auth.h" + +#include "rgw_request.h" +#include "rgw_process.h" + +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include +#include +#include + +#include + +#define dout_context g_ceph_context +#define dout_subsys 
ceph_subsys_rgw + +int RGWListBuckets_ObjStore_SWIFT::get_params() +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + end_marker = s->info.args.get("end_marker"); + wants_reversed = s->info.args.exists("reverse"); + + if (wants_reversed) { + std::swap(marker, end_marker); + } + + std::string limit_str = s->info.args.get("limit"); + if (!limit_str.empty()) { + std::string err; + long l = strict_strtol(limit_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + + if (l > (long)limit_max || l < 0) { + return -ERR_PRECONDITION_FAILED; + } + + limit = (uint64_t)l; + } + + if (s->cct->_conf->rgw_swift_need_stats) { + bool stats, exists; + int r = s->info.args.get_bool("stats", &stats, &exists); + + if (r < 0) { + return r; + } + + if (exists) { + need_stats = stats; + } + } else { + need_stats = false; + } + + return 0; +} + +static void dump_account_metadata(struct req_state * const s, + const RGWUsageStats& global_stats, + const std::map<std::string, RGWUsageStats> &policies_stats, + /* const */map<string, bufferlist>& attrs, + const RGWQuotaInfo& quota, + const RGWAccessControlPolicy_SWIFTAcct &policy) +{ + /* Adding X-Timestamp to keep align with Swift API */ + dump_header(s, "X-Timestamp", ceph_clock_now()); + + dump_header(s, "X-Account-Container-Count", global_stats.buckets_count); + dump_header(s, "X-Account-Object-Count", global_stats.objects_count); + dump_header(s, "X-Account-Bytes-Used", global_stats.bytes_used); + dump_header(s, "X-Account-Bytes-Used-Actual", global_stats.bytes_used_rounded); + + for (const auto& kv : policies_stats) { + const auto& policy_name = camelcase_dash_http_attr(kv.first); + const auto& policy_stats = kv.second; + + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Container-Count", policy_stats.buckets_count); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Object-Count", policy_stats.objects_count); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Bytes-Used", 
policy_stats.bytes_used); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Bytes-Used-Actual", policy_stats.bytes_used_rounded); + } + + /* Dump TempURL-related stuff */ + if (s->perm_mask == RGW_PERM_FULL_CONTROL) { + auto iter = s->user->temp_url_keys.find(0); + if (iter != std::end(s->user->temp_url_keys) && ! iter->second.empty()) { + dump_header(s, "X-Account-Meta-Temp-Url-Key", iter->second); + } + + iter = s->user->temp_url_keys.find(1); + if (iter != std::end(s->user->temp_url_keys) && ! iter->second.empty()) { + dump_header(s, "X-Account-Meta-Temp-Url-Key-2", iter->second); + } + } + + /* Dump quota headers. */ + if (quota.enabled) { + if (quota.max_size >= 0) { + dump_header(s, "X-Account-Meta-Quota-Bytes", quota.max_size); + } + + /* Limit on the number of objects in a given account is a RadosGW's + * extension. Swift's account quota WSGI filter doesn't support it. */ + if (quota.max_objects >= 0) { + dump_header(s, "X-Account-Meta-Quota-Count", quota.max_objects); + } + } + + /* Dump user-defined metadata items and generic attrs. 
*/ + const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1; + map::iterator iter; + for (iter = attrs.lower_bound(RGW_ATTR_PREFIX); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::const_iterator geniter = rgw_to_http_attrs.find(name); + + if (geniter != rgw_to_http_attrs.end()) { + dump_header(s, geniter->second, iter->second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) { + dump_header_prefixed(s, "X-Account-Meta-", + camelcase_dash_http_attr(name + PREFIX_LEN), + iter->second); + } + } + + /* Dump account ACLs */ + auto account_acls = policy.to_str(); + if (account_acls) { + dump_header(s, "X-Account-Access-Control", std::move(*account_acls)); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets) +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } else if (!has_buckets && s->format == RGW_FORMAT_PLAIN) { + op_ret = STATUS_NO_CONTENT; + set_req_state_err(s, op_ret); + } + + if (! s->cct->_conf->rgw_swift_enforce_content_length) { + /* Adding account stats in the header to keep align with Swift API */ + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + user_quota, + static_cast(*s->user_acl)); + dump_errno(s); + dump_header(s, "Accept-Ranges", "bytes"); + end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true); + } + + if (! op_ret) { + dump_start(s); + s->formatter->open_array_section_with_attrs("account", + FormatterAttrs("name", s->user->display_name.c_str(), NULL)); + + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_SWIFT::handle_listing_chunk(RGWUserBuckets&& buckets) +{ + if (wants_reversed) { + /* Just store in the reversal buffer. Its content will be handled later, + * in send_response_end(). */ + reverse_buffer.emplace(std::begin(reverse_buffer), std::move(buckets)); + } else { + return send_response_data(buckets); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets) +{ + if (! 
sent_data) { + return; + } + + /* Take care of the prefix parameter of Swift API. There is no business + * in applying the filter earlier as we really need to go through all + * entries regardless of it (the headers like X-Account-Container-Count + * aren't affected by specifying prefix). */ + const std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets(); + for (auto iter = m.lower_bound(prefix); + iter != m.end() && boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + dump_bucket_entry(iter->second); + } +} + +void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& obj) +{ + s->formatter->open_object_section("container"); + s->formatter->dump_string("name", obj.bucket.name); + + if (need_stats) { + s->formatter->dump_int("count", obj.count); + s->formatter->dump_int("bytes", obj.size); + } + + s->formatter->close_section(); + + if (! s->cct->_conf->rgw_swift_enforce_content_length) { + rgw_flush_formatter(s, s->formatter); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_data_reversed(RGWUserBuckets& buckets) +{ + if (! sent_data) { + return; + } + + /* Take care of the prefix parameter of Swift API. There is no business + * in applying the filter earlier as we really need to go through all + * entries regardless of it (the headers like X-Account-Container-Count + * aren't affected by specifying prefix). 
*/ + std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets(); + + auto iter = m.rbegin(); + for (/* initialized above */; + iter != m.rend() && !boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + /* NOP */; + } + + for (/* iter carried */; + iter != m.rend() && boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + dump_bucket_entry(iter->second); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_end() +{ + if (wants_reversed) { + for (auto& buckets : reverse_buffer) { + send_response_data_reversed(buckets); + } + } + + if (sent_data) { + s->formatter->close_section(); + } + + if (s->cct->_conf->rgw_swift_enforce_content_length) { + /* Adding account stats in the header to keep align with Swift API */ + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + user_quota, + static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl)); + dump_errno(s); + end_header(s, nullptr, nullptr, s->formatter->get_len(), true); + } + + if (sent_data || s->cct->_conf->rgw_swift_enforce_content_length) { + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWListBucket_ObjStore_SWIFT::get_params() +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + end_marker = s->info.args.get("end_marker"); + max_keys = s->info.args.get("limit"); + + // non-standard + s->info.args.get_bool("allow_unordered", &allow_unordered, false); + + delimiter = s->info.args.get("delimiter"); + + op_ret = parse_max_keys(); + if (op_ret < 0) { + return op_ret; + } + // S3 behavior is to silently cap the max-keys. + // Swift behavior is to abort. 
+ if (max > default_max) + return -ERR_PRECONDITION_FAILED; + + string path_args; + if (s->info.args.exists("path")) { // should handle empty path + path_args = s->info.args.get("path"); + if (!delimiter.empty() || !prefix.empty()) { + return -EINVAL; + } + prefix = path_args; + delimiter="/"; + + path = prefix; + if (path.size() && path[path.size() - 1] != '/') + path.append("/"); + + int len = prefix.size(); + int delim_size = delimiter.size(); + + if (len >= delim_size) { + if (prefix.substr(len - delim_size).compare(delimiter) != 0) + prefix.append(delimiter); + } + } + + return 0; +} + +static void dump_container_metadata(struct req_state *, + const RGWBucketEnt&, + const RGWQuotaInfo&, + const RGWBucketWebsiteConf&); + +void RGWListBucket_ObjStore_SWIFT::send_response() +{ + auto iter = objs.begin(); /* cursor over the listed objects */ + auto pref_iter = common_prefixes.begin(); /* cursor over the common prefixes (subdirs) */ + + dump_start(s); + dump_container_metadata(s, bucket, bucket_quota, + s->bucket_info.website_conf); + + s->formatter->open_array_section_with_attrs("container", + FormatterAttrs("name", + s->bucket.name.c_str(), + NULL)); + + while (iter != objs.end() || pref_iter != common_prefixes.end()) { + bool do_pref = false; + bool do_objs = false; + rgw_obj_key key; + if (iter != objs.end()) { + key = iter->key; + } + if (pref_iter == common_prefixes.end()) + do_objs = true; + else if (iter == objs.end()) + do_pref = true; + else if (!key.empty() && key.name.compare(pref_iter->first) == 0) { + do_objs = true; + ++pref_iter; + } else if (!key.empty() && key.name.compare(pref_iter->first) <= 0) + do_objs = true; + else + do_pref = true; + + if (do_objs && (allow_unordered || marker.empty() || marker < key)) { + if (key.name.compare(path) == 0) + goto next; + + s->formatter->open_object_section("object"); + s->formatter->dump_string("name", key.name); + s->formatter->dump_string("hash", iter->meta.etag); + s->formatter->dump_int("bytes", iter->meta.accounted_size); + if (!iter->meta.user_data.empty()) + 
s->formatter->dump_string("user_custom_data", iter->meta.user_data); + string single_content_type = iter->meta.content_type; + if (iter->meta.content_type.size()) { + // content type might hold multiple values, just dump the last one + ssize_t pos = iter->meta.content_type.rfind(','); + if (pos > 0) { + ++pos; + while (single_content_type[pos] == ' ') + ++pos; + single_content_type = single_content_type.substr(pos); + } + s->formatter->dump_string("content_type", single_content_type); + } + dump_time(s, "last_modified", &iter->meta.mtime); + s->formatter->close_section(); + } + + if (do_pref && (marker.empty() || pref_iter->first.compare(marker.name) > 0)) { + const string& name = pref_iter->first; + if (name.compare(delimiter) == 0) + goto next; + + s->formatter->open_object_section_with_attrs("subdir", FormatterAttrs("name", name.c_str(), NULL)); + + /* swift is a bit inconsistent here */ + switch (s->format) { + case RGW_FORMAT_XML: + s->formatter->dump_string("name", name); + break; + default: + s->formatter->dump_string("subdir", name); + } + s->formatter->close_section(); + } +next: + if (do_objs) + ++iter; + else + ++pref_iter; + } + + s->formatter->close_section(); + + int64_t content_len = 0; + if (! 
op_ret) { + content_len = s->formatter->get_len(); + if (content_len == 0) { + op_ret = STATUS_NO_CONTENT; + } + } else if (op_ret > 0) { + op_ret = 0; + } + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, NULL, content_len); + if (op_ret < 0) { + return; + } + + rgw_flush_formatter_and_reset(s, s->formatter); +} // RGWListBucket_ObjStore_SWIFT::send_response + +static void dump_container_metadata(struct req_state *s, + const RGWBucketEnt& bucket, + const RGWQuotaInfo& quota, + const RGWBucketWebsiteConf& ws_conf) +{ + /* Adding X-Timestamp to keep align with Swift API */ + dump_header(s, "X-Timestamp", utime_t(s->bucket_info.creation_time)); + + dump_header(s, "X-Container-Object-Count", bucket.count); + dump_header(s, "X-Container-Bytes-Used", bucket.size); + dump_header(s, "X-Container-Bytes-Used-Actual", bucket.size_rounded); + + if (s->object.empty()) { + auto swift_policy = \ + static_cast(s->bucket_acl.get()); + std::string read_acl, write_acl; + swift_policy->to_str(read_acl, write_acl); + + if (read_acl.size()) { + dump_header(s, "X-Container-Read", read_acl); + } + if (write_acl.size()) { + dump_header(s, "X-Container-Write", write_acl); + } + if (!s->bucket_info.placement_rule.name.empty()) { + dump_header(s, "X-Storage-Policy", s->bucket_info.placement_rule.name); + } + dump_header(s, "X-Storage-Class", s->bucket_info.placement_rule.get_storage_class()); + + /* Dump user-defined metadata items and generic attrs. 
*/ + const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1; + map::iterator iter; + for (iter = s->bucket_attrs.lower_bound(RGW_ATTR_PREFIX); + iter != s->bucket_attrs.end(); + ++iter) { + const char *name = iter->first.c_str(); + map::const_iterator geniter = rgw_to_http_attrs.find(name); + + if (geniter != rgw_to_http_attrs.end()) { + dump_header(s, geniter->second, iter->second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) { + dump_header_prefixed(s, "X-Container-Meta-", + camelcase_dash_http_attr(name + PREFIX_LEN), + iter->second); + } + } + } + + /* Dump container versioning info. */ + if (! s->bucket_info.swift_ver_location.empty()) { + dump_header(s, "X-Versions-Location", + url_encode(s->bucket_info.swift_ver_location)); + } + + /* Dump quota headers. */ + if (quota.enabled) { + if (quota.max_size >= 0) { + dump_header(s, "X-Container-Meta-Quota-Bytes", quota.max_size); + } + + if (quota.max_objects >= 0) { + dump_header(s, "X-Container-Meta-Quota-Count", quota.max_objects); + } + } + + /* Dump Static Website headers. */ + if (! ws_conf.index_doc_suffix.empty()) { + dump_header(s, "X-Container-Meta-Web-Index", ws_conf.index_doc_suffix); + } + + if (! ws_conf.error_doc.empty()) { + dump_header(s, "X-Container-Meta-Web-Error", ws_conf.error_doc); + } + + if (! ws_conf.subdir_marker.empty()) { + dump_header(s, "X-Container-Meta-Web-Directory-Type", + ws_conf.subdir_marker); + } + + if (! ws_conf.listing_css_doc.empty()) { + dump_header(s, "X-Container-Meta-Web-Listings-CSS", + ws_conf.listing_css_doc); + } + + if (ws_conf.listing_enabled) { + dump_header(s, "X-Container-Meta-Web-Listings", "true"); + } + + /* Dump bucket's modification time. Compliance with the Swift API really + * needs that. 
*/ + dump_last_modified(s, s->bucket_mtime); +} + +void RGWStatAccount_ObjStore_SWIFT::execute() +{ + RGWStatAccount_ObjStore::execute(); + op_ret = rgw_get_user_attrs_by_uid(store, s->user->user_id, attrs); +} + +void RGWStatAccount_ObjStore_SWIFT::send_response() +{ + if (op_ret >= 0) { + op_ret = STATUS_NO_CONTENT; + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + user_quota, + static_cast(*s->user_acl)); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, NULL, NULL, 0, true); + + dump_start(s); +} + +void RGWStatBucket_ObjStore_SWIFT::send_response() +{ + if (op_ret >= 0) { + op_ret = STATUS_NO_CONTENT; + dump_container_metadata(s, bucket, bucket_quota, + s->bucket_info.website_conf); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, this, NULL, 0, true); + dump_start(s); +} + +static int get_swift_container_settings(req_state * const s, + RGWRados * const store, + RGWAccessControlPolicy * const policy, + bool * const has_policy, + uint32_t * rw_mask, + RGWCORSConfiguration * const cors_config, + bool * const has_cors) +{ + const char * const read_list = s->info.env->get("HTTP_X_CONTAINER_READ"); + const char * const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE"); + + *has_policy = false; + + if (read_list || write_list) { + RGWAccessControlPolicy_SWIFT swift_policy(s->cct); + const auto r = swift_policy.create(store, + s->user->user_id, + s->user->display_name, + read_list, + write_list, + *rw_mask); + if (r < 0) { + return r; + } + + *policy = swift_policy; + *has_policy = true; + } + + *has_cors = false; + + /*Check and update CORS configuration*/ + const char *allow_origins = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_ORIGIN"); + const char *allow_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_HEADERS"); + const char *expose_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_EXPOSE_HEADERS"); + const char *max_age = 
s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_MAX_AGE"); + if (allow_origins) { + RGWCORSConfiguration_SWIFT *swift_cors = new RGWCORSConfiguration_SWIFT; + int r = swift_cors->create_update(allow_origins, allow_headers, expose_headers, max_age); + if (r < 0) { + dout(0) << "Error creating/updating the cors configuration" << dendl; + delete swift_cors; + return r; + } + *has_cors = true; + *cors_config = *swift_cors; + cors_config->dump(); + delete swift_cors; + } + + return 0; +} + +#define ACCT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_ACCOUNT_META_" +#define ACCT_PUT_ATTR_PREFIX "HTTP_X_ACCOUNT_META_" +#define CONT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_CONTAINER_META_" +#define CONT_PUT_ATTR_PREFIX "HTTP_X_CONTAINER_META_" + +static void get_rmattrs_from_headers(const req_state * const s, + const char * const put_prefix, + const char * const del_prefix, + set& rmattr_names) +{ + const size_t put_prefix_len = strlen(put_prefix); + const size_t del_prefix_len = strlen(del_prefix); + + for (const auto& kv : s->info.env->get_map()) { + size_t prefix_len = 0; + const char * const p = kv.first.c_str(); + + if (strncasecmp(p, del_prefix, del_prefix_len) == 0) { + /* Explicitly requested removal. */ + prefix_len = del_prefix_len; + } else if ((strncasecmp(p, put_prefix, put_prefix_len) == 0) + && kv.second.empty()) { + /* Removal requested by putting an empty value. */ + prefix_len = put_prefix_len; + } + + if (prefix_len > 0) { + string name(RGW_ATTR_META_PREFIX); + name.append(lowercase_dash_http_attr(p + prefix_len)); + rmattr_names.insert(name); + } + } +} + +static int get_swift_versioning_settings( + req_state * const s, + boost::optional& swift_ver_location) +{ + /* Removing the Swift's versions location has lower priority than setting + * a new one. That's the reason why we're handling it first. 
*/ + const std::string vlocdel = + s->info.env->get("HTTP_X_REMOVE_VERSIONS_LOCATION", ""); + if (vlocdel.size()) { + swift_ver_location = boost::in_place(std::string()); + } + + if (s->info.env->exists("HTTP_X_VERSIONS_LOCATION")) { + /* If the Swift's versioning is globally disabled but someone wants to + * enable it for a given container, new version of Swift will generate + * the precondition failed error. */ + if (! s->cct->_conf->rgw_swift_versioning_enabled) { + return -ERR_PRECONDITION_FAILED; + } + + swift_ver_location = s->info.env->get("HTTP_X_VERSIONS_LOCATION", ""); + } + + return 0; +} + +int RGWCreateBucket_ObjStore_SWIFT::get_params() +{ + bool has_policy; + uint32_t policy_rw_mask = 0; + + int r = get_swift_container_settings(s, store, &policy, &has_policy, + &policy_rw_mask, &cors_config, &has_cors); + if (r < 0) { + return r; + } + + if (!has_policy) { + policy.create_default(s->user->user_id, s->user->display_name); + } + + location_constraint = store->svc.zone->get_zonegroup().api_name; + get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, + CONT_REMOVE_ATTR_PREFIX, rmattr_names); + placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class); + + return get_swift_versioning_settings(s, swift_ver_location); +} + +static inline int handle_metadata_errors(req_state* const s, const int op_ret) +{ + if (op_ret == -EFBIG) { + /* Handle the custom error message of exceeding maximum custom attribute + * (stored as xattr) size. 
*/ + const auto error_message = boost::str( + boost::format("Metadata value longer than %lld") + % s->cct->_conf.get_val("rgw_max_attr_size")); + set_req_state_err(s, EINVAL, error_message); + return -EINVAL; + } else if (op_ret == -E2BIG) { + const auto error_message = boost::str( + boost::format("Too many metadata items; max %lld") + % s->cct->_conf.get_val("rgw_max_attrs_num_in_req")); + set_req_state_err(s, EINVAL, error_message); + return -EINVAL; + } + + return op_ret; +} + +void RGWCreateBucket_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_CREATED; + } else if (op_ret == -ERR_BUCKET_EXISTS) { + op_ret = STATUS_ACCEPTED; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + /* Propose ending HTTP header with 0 Content-Length header. */ + end_header(s, NULL, NULL, 0); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWDeleteBucket_ObjStore_SWIFT::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this, NULL, 0); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static int get_delete_at_param(req_state *s, boost::optional &delete_at) +{ + /* Handle Swift object expiration. */ + real_time delat_proposal; + string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", ""); + + if (x_delete.empty()) { + x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + } else { + /* X-Delete-After HTTP is present. It means we need add its value + * to the current time. 
*/ + delat_proposal = real_clock::now(); + } + + if (x_delete.empty()) { + delete_at = boost::none; + if (s->info.env->exists("HTTP_X_REMOVE_DELETE_AT")) { + delete_at = boost::in_place(real_time()); + } + return 0; + } + string err; + long ts = strict_strtoll(x_delete.c_str(), 10, &err); + + if (!err.empty()) { + return -EINVAL; + } + + delat_proposal += make_timespan(ts); + if (delat_proposal < real_clock::now()) { + return -EINVAL; + } + + delete_at = delat_proposal; + + return 0; +} + +int RGWPutObj_ObjStore_SWIFT::verify_permission() +{ + op_ret = RGWPutObj_ObjStore::verify_permission(); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWPutObj_ObjStore_SWIFT::update_slo_segment_size(rgw_slo_entry& entry) { + + int r = 0; + const string& path = entry.path; + + /* If the path starts with slashes, strip them all. 
*/ + const size_t pos_init = path.find_first_not_of('/'); + + if (pos_init == string::npos) { + return -EINVAL; + } + + const size_t pos_sep = path.find('/', pos_init); + if (pos_sep == string::npos) { + return -EINVAL; + } + + string bucket_name = path.substr(pos_init, pos_sep - pos_init); + string obj_name = path.substr(pos_sep + 1); + + rgw_bucket bucket; + + if (bucket_name.compare(s->bucket.name) != 0) { + RGWBucketInfo bucket_info; + map bucket_attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + r = store->get_bucket_info(obj_ctx, s->user->user_id.tenant, + bucket_name, bucket_info, nullptr, + &bucket_attrs); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + bucket = bucket_info.bucket; + } else { + bucket = s->bucket; + } + + /* fetch the stored size of the seg (or error if not valid) */ + rgw_obj_key slo_key(obj_name); + rgw_obj slo_seg(bucket, slo_key); + + /* no prefetch */ + RGWObjectCtx obj_ctx(store); + obj_ctx.set_atomic(slo_seg); + + RGWRados::Object op_target(store, s->bucket_info, obj_ctx, slo_seg); + RGWRados::Object::Read read_op(&op_target); + + bool compressed; + RGWCompressionInfo cs_info; + map attrs; + uint64_t size_bytes{0}; + + read_op.params.attrs = &attrs; + read_op.params.obj_size = &size_bytes; + + r = read_op.prepare(); + if (r < 0) { + return r; + } + + r = rgw_compression_info_from_attrset(attrs, compressed, cs_info); + if (r < 0) { + return -EIO; + } + + if (compressed) { + size_bytes = cs_info.orig_size; + } + + /* "When the PUT operation sees the multipart-manifest=put query + * parameter, it reads the request body and verifies that each + * segment object exists and that the sizes and ETags match. If + * there is a mismatch, the PUT operation fails." 
+ */ + if (entry.size_bytes && + (entry.size_bytes != size_bytes)) { + return -EINVAL; + } + + entry.size_bytes = size_bytes; + + return 0; +} /* RGWPutObj_ObjStore_SWIFT::update_slo_segment_sizes */ + +int RGWPutObj_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + if (!s->length) { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + if (!encoding || strcmp(encoding, "chunked") != 0) { + ldout(s->cct, 20) << "neither length nor chunked encoding" << dendl; + return -ERR_LENGTH_REQUIRED; + } + + chunked_upload = true; + } + + supplied_etag = s->info.env->get("HTTP_ETAG"); + + if (!s->generic_attrs.count(RGW_ATTR_CONTENT_TYPE)) { + ldout(s->cct, 5) << "content type wasn't provided, trying to guess" << dendl; + const char *suffix = strrchr(s->object.name.c_str(), '.'); + if (suffix) { + suffix++; + if (*suffix) { + string suffix_str(suffix); + const char *mime = rgw_find_mime_by_ext(suffix_str); + if (mime) { + s->generic_attrs[RGW_ATTR_CONTENT_TYPE] = mime; + } + } + } + } + + policy.create_default(s->user->user_id, s->user->display_name); + + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + if (!s->cct->_conf->rgw_swift_custom_header.empty()) { + string custom_header = s->cct->_conf->rgw_swift_custom_header; + if (s->info.env->exists(custom_header.c_str())) { + user_data = s->info.env->get(custom_header.c_str()); + } + } + + dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + bool exists; + string multipart_manifest = s->info.args.get("multipart-manifest", &exists); + if (exists) { + if (multipart_manifest != "put") { + ldout(s->cct, 5) << "invalid multipart-manifest http param: " << multipart_manifest << dendl; + return -EINVAL; + } + +#define MAX_SLO_ENTRY_SIZE (1024 + 128) // 1024 - max obj name, 128 - enough extra for other info + uint64_t max_len = s->cct->_conf->rgw_max_slo_entries * 
MAX_SLO_ENTRY_SIZE; + + slo_info = new RGWSLOInfo; + + int r = 0; + std::tie(r, slo_info->raw_data) = rgw_rest_get_json_input_keep_data(s->cct, s, slo_info->entries, max_len); + if (r < 0) { + ldout(s->cct, 5) << "failed to read input for slo r=" << r << dendl; + return r; + } + + if ((int64_t)slo_info->entries.size() > s->cct->_conf->rgw_max_slo_entries) { + ldout(s->cct, 5) << "too many entries in slo request: " << slo_info->entries.size() << dendl; + return -EINVAL; + } + + MD5 etag_sum; + uint64_t total_size = 0; + for (auto& entry : slo_info->entries) { + etag_sum.Update((const unsigned char *)entry.etag.c_str(), + entry.etag.length()); + + /* if size_bytes == 0, it should be replaced with the + * real segment size (which could be 0); this follows from the + * fact that Swift requires all segments to exist, but permits + * the size_bytes element to be omitted from the SLO manifest, see + * https://docs.openstack.org/swift/latest/api/large_objects.html + */ + r = update_slo_segment_size(entry); + if (r < 0) { + return r; + } + + total_size += entry.size_bytes; + + ldout(s->cct, 20) << "slo_part: " << entry.path + << " size=" << entry.size_bytes + << " etag=" << entry.etag + << dendl; + } + complete_etag(etag_sum, &lo_etag); + slo_info->total_size = total_size; + + ofs = slo_info->raw_data.length(); + } + + return RGWPutObj_ObjStore::get_params(); +} + +void RGWPutObj_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_CREATED; + } + set_req_state_err(s, op_ret); + } + + if (! lo_etag.empty()) { + /* Static Large Object of Swift API has two etags represented by + * following members: + * - etag - for the manifest itself (it will be stored in xattrs), + * - lo_etag - for the content composited from SLO's segments. + * The value is calculated basing on segments' etags. + * In response for PUT request we have to expose the second one. 
+ * The first one may be obtained by GET with "multipart-manifest=get" + * in query string on a given SLO. */ + dump_etag(s, lo_etag, true /* quoted */); + } else { + dump_etag(s, etag); + } + + dump_last_modified(s, mtime); + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static int get_swift_account_settings(req_state * const s, + RGWRados * const store, + RGWAccessControlPolicy_SWIFTAcct * const policy, + bool * const has_policy) +{ + *has_policy = false; + + const char * const acl_attr = s->info.env->get("HTTP_X_ACCOUNT_ACCESS_CONTROL"); + if (acl_attr) { + RGWAccessControlPolicy_SWIFTAcct swift_acct_policy(s->cct); + const bool r = swift_acct_policy.create(store, + s->user->user_id, + s->user->display_name, + string(acl_attr)); + if (r != true) { + return -EINVAL; + } + + *policy = swift_acct_policy; + *has_policy = true; + } + + return 0; +} + +int RGWPutMetadataAccount_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + int ret = get_swift_account_settings(s, + store, + // FIXME: we need to carry unique_ptr in generic class + // and allocate appropriate ACL class in the ctor + static_cast(&policy), + &has_policy); + if (ret < 0) { + return ret; + } + + get_rmattrs_from_headers(s, ACCT_PUT_ATTR_PREFIX, ACCT_REMOVE_ATTR_PREFIX, + rmattr_names); + return 0; +} + +void RGWPutMetadataAccount_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPutMetadataBucket_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + int r = get_swift_container_settings(s, store, &policy, &has_policy, + &policy_rw_mask, &cors_config, &has_cors); + if (r < 0) 
{ + return r; + } + + get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, CONT_REMOVE_ATTR_PREFIX, + rmattr_names); + placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class); + + return get_swift_versioning_settings(s, swift_ver_location); +} + +void RGWPutMetadataBucket_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret && (op_ret != -EINVAL)) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPutMetadataObject_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + /* Handle Swift object expiration. */ + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + + return 0; +} + +void RGWPutMetadataObject_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_ACCEPTED; + } + set_req_state_err(s, op_ret); + } + + if (!s->is_err()) { + dump_content_length(s, 0); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void bulkdelete_respond(const unsigned num_deleted, + const unsigned int num_unfound, + const std::list& failures, + const int prot_flags, /* in */ + ceph::Formatter& formatter) /* out */ +{ + formatter.open_object_section("delete"); + + string resp_status; + string resp_body; + + if (!failures.empty()) { + int reason = ERR_INVALID_REQUEST; + for (const auto fail_desc : failures) { + if (-ENOENT != fail_desc.err && -EACCES != fail_desc.err) { + reason = fail_desc.err; + } + } + rgw_err err; + set_req_state_err(err, reason, 
prot_flags); + dump_errno(err, resp_status); + } else if (0 == num_deleted && 0 == num_unfound) { + /* 400 Bad Request */ + dump_errno(400, resp_status); + resp_body = "Invalid bulk delete."; + } else { + /* 200 OK */ + dump_errno(200, resp_status); + } + + encode_json("Number Deleted", num_deleted, &formatter); + encode_json("Number Not Found", num_unfound, &formatter); + encode_json("Response Body", resp_body, &formatter); + encode_json("Response Status", resp_status, &formatter); + + formatter.open_array_section("Errors"); + for (const auto fail_desc : failures) { + formatter.open_array_section("object"); + + stringstream ss_name; + ss_name << fail_desc.path; + encode_json("Name", ss_name.str(), &formatter); + + rgw_err err; + set_req_state_err(err, fail_desc.err, prot_flags); + string status; + dump_errno(err, status); + encode_json("Status", status, &formatter); + formatter.close_section(); + } + formatter.close_section(); + + formatter.close_section(); +} + +int RGWDeleteObj_ObjStore_SWIFT::verify_permission() +{ + op_ret = RGWDeleteObj_ObjStore::verify_permission(); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). 
*/ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWDeleteObj_ObjStore_SWIFT::get_params() +{ + const string& mm = s->info.args.get("multipart-manifest"); + multipart_delete = (mm.compare("delete") == 0); + + return RGWDeleteObj_ObjStore::get_params(); +} + +void RGWDeleteObj_ObjStore_SWIFT::send_response() +{ + int r = op_ret; + + if (multipart_delete) { + r = 0; + } else if(!r) { + r = STATUS_NO_CONTENT; + } + + set_req_state_err(s, r); + dump_errno(s); + + if (multipart_delete) { + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + + if (deleter) { + bulkdelete_respond(deleter->get_num_deleted(), + deleter->get_num_unfound(), + deleter->get_failures(), + s->prot_flags, + *s->formatter); + } else if (-ENOENT == op_ret) { + bulkdelete_respond(0, 1, {}, s->prot_flags, *s->formatter); + } else { + RGWBulkDelete::acct_path_t path; + path.bucket_name = s->bucket_name; + path.obj_key = s->object; + + RGWBulkDelete::fail_desc_t fail_desc; + fail_desc.err = op_ret; + fail_desc.path = path; + + bulkdelete_respond(0, 0, { fail_desc }, s->prot_flags, *s->formatter); + } + } else { + end_header(s, this); + } + + rgw_flush_formatter_and_reset(s, s->formatter); + +} + +static void get_contype_from_attrs(map& attrs, + string& content_type) +{ + map::iterator iter = attrs.find(RGW_ATTR_CONTENT_TYPE); + if (iter != attrs.end()) { + content_type = rgw_bl_str(iter->second); + } +} + +static void dump_object_metadata(struct req_state * const s, + const map& attrs) +{ + map response_attrs; + + for (auto kv : attrs) { + const char * name = kv.first.c_str(); + const auto aiter = rgw_to_http_attrs.find(name); + + if (aiter != std::end(rgw_to_http_attrs)) { + response_attrs[aiter->second] = rgw_bl_str(kv.second); + } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) { + // this attr has an extra length prefix from encode() in prior versions + dump_header(s, 
"X-Object-Meta-Static-Large-Object", "True"); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, + sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + name += sizeof(RGW_ATTR_META_PREFIX) - 1; + dump_header_prefixed(s, "X-Object-Meta-", + camelcase_dash_http_attr(name), kv.second); + } + } + + /* Handle override and fallback for Content-Disposition HTTP header. + * At the moment this will be used only by TempURL of the Swift API. */ + const auto cditer = rgw_to_http_attrs.find(RGW_ATTR_CONTENT_DISP); + if (cditer != std::end(rgw_to_http_attrs)) { + const auto& name = cditer->second; + + if (!s->content_disp.override.empty()) { + response_attrs[name] = s->content_disp.override; + } else if (!s->content_disp.fallback.empty() + && response_attrs.find(name) == std::end(response_attrs)) { + response_attrs[name] = s->content_disp.fallback; + } + } + + for (const auto kv : response_attrs) { + dump_header(s, kv.first, kv.second); + } + + const auto iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != std::end(attrs)) { + utime_t delete_at; + try { + decode(delete_at, iter->second); + if (!delete_at.is_zero()) { + dump_header(s, "X-Delete-At", delete_at.sec()); + } + } catch (buffer::error& err) { + ldout(s->cct, 0) << "ERROR: cannot decode object's " RGW_ATTR_DELETE_AT + " attr, ignoring" + << dendl; + } + } +} + +int RGWCopyObj_ObjStore_SWIFT::init_dest_policy() +{ + dest_policy.create_default(s->user->user_id, s->user->display_name); + + return 0; +} + +int RGWCopyObj_ObjStore_SWIFT::get_params() +{ + if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_COPY_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_COPY_IF_NONE_MATCH"); + + src_tenant_name = s->src_tenant_name; + src_bucket_name = s->src_bucket_name; + src_object = s->src_object; + dest_tenant_name = s->bucket_tenant; + dest_bucket_name = s->bucket_name; + dest_object = s->object.name; + + const char * const fresh_meta = 
s->info.env->get("HTTP_X_FRESH_METADATA"); + if (fresh_meta && strcasecmp(fresh_meta, "TRUE") == 0) { + attrs_mod = RGWRados::ATTRSMOD_REPLACE; + } else { + attrs_mod = RGWRados::ATTRSMOD_MERGE; + } + + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + return 0; +} + +void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs) +{ + if (! sent_header) { + if (! op_ret) + op_ret = STATUS_CREATED; + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + + /* Send progress information. Note that this diverge from the original swift + * spec. We do this in order to keep connection alive. + */ + if (op_ret == 0) { + s->formatter->open_array_section("progress"); + } + sent_header = true; + } else { + s->formatter->dump_int("ofs", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_SWIFT::dump_copy_info() +{ + /* Dump X-Copied-From. */ + dump_header(s, "X-Copied-From", url_encode(src_bucket.name) + + "/" + url_encode(src_object.name)); + + /* Dump X-Copied-From-Account. */ + /* XXX tenant */ + dump_header(s, "X-Copied-From-Account", url_encode(s->user->user_id.id)); + + /* Dump X-Copied-From-Last-Modified. */ + dump_time_header(s, "X-Copied-From-Last-Modified", src_mtime); +} + +void RGWCopyObj_ObjStore_SWIFT::send_response() +{ + if (! sent_header) { + string content_type; + if (! op_ret) + op_ret = STATUS_CREATED; + set_req_state_err(s, op_ret); + dump_errno(s); + dump_etag(s, etag); + dump_last_modified(s, mtime); + dump_copy_info(); + get_contype_from_attrs(attrs, content_type); + dump_object_metadata(s, attrs); + end_header(s, this, !content_type.empty() ? 
content_type.c_str() + : "binary/octet-stream"); + } else { + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + } +} + +int RGWGetObj_ObjStore_SWIFT::verify_permission() +{ + op_ret = RGWGetObj_ObjStore::verify_permission(); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWGetObj_ObjStore_SWIFT::get_params() +{ + const string& mm = s->info.args.get("multipart-manifest"); + skip_manifest = (mm.compare("get") == 0); + + return RGWGetObj_ObjStore::get_params(); +} + +int RGWGetObj_ObjStore_SWIFT::send_response_data_error() +{ + std::string error_content; + op_ret = error_handler(op_ret, &error_content); + if (! op_ret) { + /* The error handler has taken care of the error. */ + return 0; + } + + bufferlist error_bl; + error_bl.append(error_content); + return send_response_data(error_bl, 0, error_bl.length()); +} + +int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, + const off_t bl_ofs, + const off_t bl_len) +{ + string content_type; + + if (sent_header) { + goto send_data; + } + + if (custom_http_ret) { + set_req_state_err(s, 0); + dump_errno(s, custom_http_ret); + } else { + set_req_state_err(s, (partial_content && !op_ret) ? STATUS_PARTIAL_CONTENT + : op_ret); + dump_errno(s); + + if (s->is_err()) { + end_header(s, NULL); + return 0; + } + } + + if (range_str) { + dump_range(s, ofs, end, s->obj_size); + } + + if (s->is_err()) { + end_header(s, NULL); + return 0; + } + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + dump_header(s, "X-Timestamp", utime_t(lastmod)); + if (is_slo) { + dump_header(s, "X-Static-Large-Object", "True"); + } + + if (! op_ret) { + if (! 
lo_etag.empty()) { + dump_etag(s, lo_etag, true /* quoted */); + } else { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + dump_etag(s, iter->second.to_str()); + } + } + + get_contype_from_attrs(attrs, content_type); + dump_object_metadata(s, attrs); + } + + end_header(s, this, !content_type.empty() ? content_type.c_str() + : "binary/octet-stream"); + + sent_header = true; + +send_data: + if (get_data && !op_ret) { + const auto r = dump_body(s, bl.c_str() + bl_ofs, bl_len); + if (r < 0) { + return r; + } + } + rgw_flush_formatter_and_reset(s, s->formatter); + + return 0; +} + +void RGWOptionsCORS_ObjStore_SWIFT::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (op_ret == -ENOENT) + op_ret = -EACCES; + if (op_ret < 0) { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL); + return; + } + get_response_params(hdrs, exp_hdrs, &max_age); + dump_errno(s); + dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), + max_age); + end_header(s, NULL); +} + +int RGWBulkDelete_ObjStore_SWIFT::get_data( + list& items, bool * const is_truncated) +{ + constexpr size_t MAX_LINE_SIZE = 2048; + + RGWClientIOStreamBuf ciosb(static_cast(*(s->cio)), + size_t(s->cct->_conf->rgw_max_chunk_size)); + istream cioin(&ciosb); + + char buf[MAX_LINE_SIZE]; + while (cioin.getline(buf, sizeof(buf))) { + string path_str(buf); + + ldout(s->cct, 20) << "extracted Bulk Delete entry: " << path_str << dendl; + + RGWBulkDelete::acct_path_t path; + + /* We need to skip all slashes at the beginning in order to preserve + * compliance with Swift. */ + const size_t start_pos = path_str.find_first_not_of('/'); + + if (string::npos != start_pos) { + /* Seperator is the first slash after the leading ones. 
*/ + const size_t sep_pos = path_str.find('/', start_pos); + + if (string::npos != sep_pos) { + path.bucket_name = url_decode(path_str.substr(start_pos, + sep_pos - start_pos)); + path.obj_key = url_decode(path_str.substr(sep_pos + 1)); + } else { + /* It's guaranteed here that bucket name is at least one character + * long and is different than slash. */ + path.bucket_name = url_decode(path_str.substr(start_pos)); + } + + items.push_back(path); + } + + if (items.size() == MAX_CHUNK_ENTRIES) { + *is_truncated = true; + return 0; + } + } + + *is_truncated = false; + return 0; +} + +void RGWBulkDelete_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + + bulkdelete_respond(deleter->get_num_deleted(), + deleter->get_num_unfound(), + deleter->get_failures(), + s->prot_flags, + *s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +std::unique_ptr +RGWBulkUploadOp_ObjStore_SWIFT::create_stream() +{ + class SwiftStreamGetter : public StreamGetter { + const size_t conlen; + size_t curpos; + req_state* const s; + + public: + SwiftStreamGetter(req_state* const s, const size_t conlen) + : conlen(conlen), + curpos(0), + s(s) { + } + + ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override { + /* maximum requested by a caller */ + /* data provided by client */ + /* RadosGW's limit. 
*/ + const size_t max_chunk_size = \ + static_cast(s->cct->_conf->rgw_max_chunk_size); + const size_t max_to_read = std::min({ want, conlen - curpos, max_chunk_size }); + + ldout(s->cct, 20) << "bulk_upload: get_at_most max_to_read=" + << max_to_read + << ", dst.c_str()=" << reinterpret_cast(dst.c_str()) << dendl; + + bufferptr bp(max_to_read); + const auto read_len = recv_body(s, bp.c_str(), max_to_read); + dst.append(bp, 0, read_len); + //const auto read_len = recv_body(s, dst.c_str(), max_to_read); + if (read_len < 0) { + return read_len; + } + + curpos += read_len; + return curpos > s->cct->_conf->rgw_max_put_size ? -ERR_TOO_LARGE + : read_len; + } + + ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override { + ldout(s->cct, 20) << "bulk_upload: get_exactly want=" << want << dendl; + + /* FIXME: do this in a loop. */ + const auto ret = get_at_most(want, dst); + ldout(s->cct, 20) << "bulk_upload: get_exactly ret=" << ret << dendl; + if (ret < 0) { + return ret; + } else if (static_cast(ret) != want) { + return -EINVAL; + } else { + return want; + } + } + }; + + if (! s->length) { + op_ret = -EINVAL; + return nullptr; + } else { + ldout(s->cct, 20) << "bulk upload: create_stream for length=" + << s->length << dendl; + + const size_t conlen = atoll(s->length); + return std::unique_ptr(new SwiftStreamGetter(s, conlen)); + } +} + +void RGWBulkUploadOp_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + rgw_flush_formatter_and_reset(s, s->formatter); + + s->formatter->open_object_section("delete"); + + std::string resp_status; + std::string resp_body; + + if (! failures.empty()) { + rgw_err err; + + const auto last_err = { failures.back().err }; + if (boost::algorithm::contains(last_err, terminal_errors)) { + /* The terminal errors are affecting the status of the whole upload. 
*/ + set_req_state_err(err, failures.back().err, s->prot_flags); + } else { + set_req_state_err(err, ERR_INVALID_REQUEST, s->prot_flags); + } + + dump_errno(err, resp_status); + } else if (0 == num_created && failures.empty()) { + /* Nothing created, nothing failed. This means the archive contained no + * entity we could understand (regular file or directory). We need to + * send 400 Bad Request to an HTTP client in the internal status field. */ + dump_errno(400, resp_status); + resp_body = "Invalid Tar File: No Valid Files"; + } else { + /* 200 OK */ + dump_errno(201, resp_status); + } + + encode_json("Number Files Created", num_created, s->formatter); + encode_json("Response Body", resp_body, s->formatter); + encode_json("Response Status", resp_status, s->formatter); + + s->formatter->open_array_section("Errors"); + for (const auto& fail_desc : failures) { + s->formatter->open_array_section("object"); + + encode_json("Name", fail_desc.path, s->formatter); + + rgw_err err; + set_req_state_err(err, fail_desc.err, s->prot_flags); + std::string status; + dump_errno(err, status); + encode_json("Status", status, s->formatter); + + s->formatter->close_section(); + } + s->formatter->close_section(); + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +void RGWGetCrossDomainPolicy_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + std::stringstream ss; + + ss << R"()" << "\n" + << R"()" << "\n" + << R"()" << "\n" + << g_conf()->rgw_cross_domain_policy << "\n" + << R"()"; + + dump_body(s, ss.str()); +} + +void RGWGetHealthCheck_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret) { + static constexpr char DISABLED[] = "DISABLED BY FILE"; + dump_body(s, DISABLED, strlen(DISABLED)); + } +} + +const vector> RGWInfo_ObjStore_SWIFT::swift_info = +{ + {"bulk_delete", {false, 
nullptr}}, + {"container_quotas", {false, nullptr}}, + {"swift", {false, RGWInfo_ObjStore_SWIFT::list_swift_data}}, + {"tempurl", { false, RGWInfo_ObjStore_SWIFT::list_tempurl_data}}, + {"slo", {false, RGWInfo_ObjStore_SWIFT::list_slo_data}}, + {"account_quotas", {false, nullptr}}, + {"staticweb", {false, nullptr}}, + {"tempauth", {false, RGWInfo_ObjStore_SWIFT::list_tempauth_data}}, +}; + +void RGWInfo_ObjStore_SWIFT::execute() +{ + bool is_admin_info_enabled = false; + + const string& swiftinfo_sig = s->info.args.get("swiftinfo_sig"); + const string& swiftinfo_expires = s->info.args.get("swiftinfo_expires"); + + if (!swiftinfo_sig.empty() && + !swiftinfo_expires.empty() && + !is_expired(swiftinfo_expires, s->cct)) { + is_admin_info_enabled = true; + } + + s->formatter->open_object_section("info"); + + for (const auto& pair : swift_info) { + if(!is_admin_info_enabled && pair.second.is_admin_info) + continue; + + if (!pair.second.list_data) { + s->formatter->open_object_section((pair.first).c_str()); + s->formatter->close_section(); + } + else { + pair.second.list_data(*(s->formatter), s->cct->_conf, *store); + } + } + + s->formatter->close_section(); +} + +void RGWInfo_ObjStore_SWIFT::send_response() +{ + if (op_ret < 0) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWInfo_ObjStore_SWIFT::list_swift_data(Formatter& formatter, + const ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("swift"); + formatter.dump_int("max_file_size", config->rgw_max_put_size); + formatter.dump_int("container_listing_limit", RGW_LIST_BUCKETS_LIMIT_MAX); + + string ceph_version(CEPH_GIT_NICE_VER); + formatter.dump_string("version", ceph_version); + + const size_t max_attr_name_len = \ + g_conf().get_val("rgw_max_attr_name_len"); + if (max_attr_name_len) { + const size_t meta_name_limit = \ + max_attr_name_len - strlen(RGW_ATTR_PREFIX 
RGW_AMZ_META_PREFIX); + formatter.dump_int("max_meta_name_length", meta_name_limit); + } + + const size_t meta_value_limit = g_conf().get_val("rgw_max_attr_size"); + if (meta_value_limit) { + formatter.dump_int("max_meta_value_length", meta_value_limit); + } + + const size_t meta_num_limit = \ + g_conf().get_val("rgw_max_attrs_num_in_req"); + if (meta_num_limit) { + formatter.dump_int("max_meta_count", meta_num_limit); + } + + formatter.open_array_section("policies"); + const RGWZoneGroup& zonegroup = store.svc.zone->get_zonegroup(); + + for (const auto& placement_targets : zonegroup.placement_targets) { + formatter.open_object_section("policy"); + if (placement_targets.second.name.compare(zonegroup.default_placement.name) == 0) + formatter.dump_bool("default", true); + formatter.dump_string("name", placement_targets.second.name.c_str()); + formatter.close_section(); + } + formatter.close_section(); + + formatter.dump_int("max_object_name_size", RGWHandler_REST::MAX_OBJ_NAME_LEN); + formatter.dump_bool("strict_cors_mode", true); + formatter.dump_int("max_container_name_length", RGWHandler_REST::MAX_BUCKET_NAME_LEN); + formatter.close_section(); +} + +void RGWInfo_ObjStore_SWIFT::list_tempauth_data(Formatter& formatter, + const ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("tempauth"); + formatter.dump_bool("account_acls", true); + formatter.close_section(); +} +void RGWInfo_ObjStore_SWIFT::list_tempurl_data(Formatter& formatter, + const ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("tempurl"); + formatter.open_array_section("methods"); + formatter.dump_string("methodname", "GET"); + formatter.dump_string("methodname", "HEAD"); + formatter.dump_string("methodname", "PUT"); + formatter.dump_string("methodname", "POST"); + formatter.dump_string("methodname", "DELETE"); + formatter.close_section(); + formatter.close_section(); +} + +void RGWInfo_ObjStore_SWIFT::list_slo_data(Formatter& formatter, + const 
ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("slo"); + formatter.dump_int("max_manifest_segments", config->rgw_max_slo_entries); + formatter.close_section(); +} + +bool RGWInfo_ObjStore_SWIFT::is_expired(const std::string& expires, CephContext* cct) +{ + string err; + const utime_t now = ceph_clock_now(); + const uint64_t expiration = (uint64_t)strict_strtoll(expires.c_str(), + 10, &err); + if (!err.empty()) { + ldout(cct, 5) << "failed to parse siginfo_expires: " << err << dendl; + return true; + } + + if (expiration <= (uint64_t)now.sec()) { + ldout(cct, 5) << "siginfo expired: " << expiration << " <= " << now.sec() << dendl; + return true; + } + + return false; +} + + +void RGWFormPost::init(RGWRados* const store, + req_state* const s, + RGWHandler* const dialect_handler) +{ + prefix = std::move(s->object.name); + s->object = rgw_obj_key(); + + return RGWPostObj_ObjStore::init(store, s, dialect_handler); +} + +std::size_t RGWFormPost::get_max_file_size() /*const*/ +{ + std::string max_str = get_part_str(ctrl_parts, "max_file_size", "0"); + + std::string err; + const std::size_t max_file_size = + static_cast(strict_strtoll(max_str.c_str(), 10, &err)); + + if (! err.empty()) { + ldout(s->cct, 5) << "failed to parse FormPost's max_file_size: " << err + << dendl; + return 0; + } + + return max_file_size; +} + +bool RGWFormPost::is_non_expired() +{ + std::string expires = get_part_str(ctrl_parts, "expires", "0"); + + std::string err; + const uint64_t expires_timestamp = + static_cast(strict_strtoll(expires.c_str(), 10, &err)); + + if (! 
err.empty()) { + dout(5) << "failed to parse FormPost's expires: " << err << dendl; + return false; + } + + const utime_t now = ceph_clock_now(); + if (expires_timestamp <= static_cast(now.sec())) { + dout(5) << "FormPost form expired: " + << expires_timestamp << " <= " << now.sec() << dendl; + return false; + } + + return true; +} + +bool RGWFormPost::is_integral() +{ + const std::string form_signature = get_part_str(ctrl_parts, "signature"); + + try { + get_owner_info(s, *s->user); + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } catch (...) { + ldout(s->cct, 5) << "cannot get user_info of account's owner" << dendl; + return false; + } + + for (const auto& kv : s->user->temp_url_keys) { + const int temp_url_key_num = kv.first; + const string& temp_url_key = kv.second; + + if (temp_url_key.empty()) { + continue; + } + + SignatureHelper sig_helper; + sig_helper.calc(temp_url_key, + s->info.request_uri, + get_part_str(ctrl_parts, "redirect"), + get_part_str(ctrl_parts, "max_file_size", "0"), + get_part_str(ctrl_parts, "max_file_count", "0"), + get_part_str(ctrl_parts, "expires", "0")); + + const auto local_sig = sig_helper.get_signature(); + + ldout(s->cct, 20) << "FormPost signature [" << temp_url_key_num << "]" + << " (calculated): " << local_sig << dendl; + + if (sig_helper.is_equal_to(form_signature)) { + return true; + } else { + ldout(s->cct, 5) << "FormPost's signature mismatch: " + << local_sig << " != " << form_signature << dendl; + } + } + + return false; +} + +void RGWFormPost::get_owner_info(const req_state* const s, + RGWUserInfo& owner_info) const +{ + /* We cannot use req_state::bucket_name because it isn't available + * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). */ + const string& bucket_name = s->init_state.url_bucket; + + /* TempURL in Formpost only requires that bucket name is specified. 
*/ + if (bucket_name.empty()) { + throw -EPERM; + } + + string bucket_tenant; + if (!s->account_name.empty()) { + RGWUserInfo uinfo; + bool found = false; + + const rgw_user uid(s->account_name); + if (uid.tenant.empty()) { + const rgw_user tenanted_uid(uid.id, uid.id); + + if (rgw_get_user_info_by_uid(store, tenanted_uid, uinfo) >= 0) { + /* Succeeded. */ + bucket_tenant = uinfo.user_id.tenant; + found = true; + } + } + + if (!found && rgw_get_user_info_by_uid(store, uid, uinfo) < 0) { + throw -EPERM; + } else { + bucket_tenant = uinfo.user_id.tenant; + } + } + + /* Need to get user info of bucket owner. */ + RGWBucketInfo bucket_info; + int ret = store->get_bucket_info(*s->sysobj_ctx, + bucket_tenant, bucket_name, + bucket_info, nullptr); + if (ret < 0) { + throw ret; + } + + ldout(s->cct, 20) << "temp url user (bucket owner): " << bucket_info.owner + << dendl; + + if (rgw_get_user_info_by_uid(store, bucket_info.owner, owner_info) < 0) { + throw -EPERM; + } +} + +int RGWFormPost::get_params() +{ + /* The parentt class extracts boundary info from the Content-Type. */ + int ret = RGWPostObj_ObjStore::get_params(); + if (ret < 0) { + return ret; + } + + policy.create_default(s->user->user_id, s->user->display_name); + + /* Let's start parsing the HTTP body by parsing each form part step- + * by-step till encountering the first part with file data. 
*/ + do { + struct post_form_part part; + ret = read_form_part_header(&part, stream_done); + if (ret < 0) { + return ret; + } + + if (s->cct->_conf->subsys.should_gather()) { + ldout(s->cct, 20) << "read part header -- part.name=" + << part.name << dendl; + + for (const auto& pair : part.fields) { + ldout(s->cct, 20) << "field.name=" << pair.first << dendl; + ldout(s->cct, 20) << "field.val=" << pair.second.val << dendl; + ldout(s->cct, 20) << "field.params:" << dendl; + + for (const auto& param_pair : pair.second.params) { + ldout(s->cct, 20) << " " << param_pair.first + << " -> " << param_pair.second << dendl; + } + } + } + + if (stream_done) { + /* Unexpected here. */ + err_msg = "Malformed request"; + return -EINVAL; + } + + const auto field_iter = part.fields.find("Content-Disposition"); + if (std::end(part.fields) != field_iter && + std::end(field_iter->second.params) != field_iter->second.params.find("filename")) { + /* First data part ahead. */ + current_data_part = std::move(part); + + /* Stop the iteration. We can assume that all control parts have been + * already parsed. The rest of HTTP body should contain data parts + * only. They will be picked up by ::get_data(). */ + break; + } else { + /* Control part ahead. Receive, parse and store for later usage. */ + bool boundary; + ret = read_data(part.data, s->cct->_conf->rgw_max_chunk_size, + boundary, stream_done); + if (ret < 0) { + return ret; + } else if (! boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + + ctrl_parts[part.name] = std::move(part); + } + } while (! stream_done); + + min_len = 0; + max_len = get_max_file_size(); + + if (! current_data_part) { + err_msg = "FormPost: no files to process"; + return -EINVAL; + } + + if (! is_non_expired()) { + err_msg = "FormPost: Form Expired"; + return -EPERM; + } + + if (! 
is_integral()) { + err_msg = "FormPost: Invalid Signature"; + return -EPERM; + } + + return 0; +} + +std::string RGWFormPost::get_current_filename() const +{ + try { + const auto& field = current_data_part->fields.at("Content-Disposition"); + const auto iter = field.params.find("filename"); + + if (std::end(field.params) != iter) { + return prefix + iter->second; + } + } catch (std::out_of_range&) { + /* NOP */; + } + + return prefix; +} + +std::string RGWFormPost::get_current_content_type() const +{ + try { + const auto& field = current_data_part->fields.at("Content-Type"); + return field.val; + } catch (std::out_of_range&) { + /* NOP */; + } + + return std::string(); +} + +bool RGWFormPost::is_next_file_to_upload() +{ + if (! stream_done) { + /* We have at least one additional part in the body. */ + struct post_form_part part; + int r = read_form_part_header(&part, stream_done); + if (r < 0) { + return false; + } + + const auto field_iter = part.fields.find("Content-Disposition"); + if (std::end(part.fields) != field_iter) { + const auto& params = field_iter->second.params; + const auto& filename_iter = params.find("filename"); + + if (std::end(params) != filename_iter && ! filename_iter->second.empty()) { + current_data_part = std::move(part); + return true; + } + } + } + + return false; +} + +int RGWFormPost::get_data(ceph::bufferlist& bl, bool& again) +{ + bool boundary; + + int r = read_data(bl, s->cct->_conf->rgw_max_chunk_size, + boundary, stream_done); + if (r < 0) { + return r; + } + + /* Tell RGWPostObj::execute() that it has some data to put. */ + again = !boundary; + + return bl.length(); +} + +void RGWFormPost::send_response() +{ + std::string redirect = get_part_str(ctrl_parts, "redirect"); + if (! redirect.empty()) { + op_ret = STATUS_REDIRECT; + } + + set_req_state_err(s, op_ret); + s->err.err_code = err_msg; + dump_errno(s); + if (! 
redirect.empty()) { + dump_redirect(s, redirect); + } + end_header(s, this); +} + +bool RGWFormPost::is_formpost_req(req_state* const s) +{ + std::string content_type; + std::map params; + + parse_boundary_params(s->info.env->get("CONTENT_TYPE", ""), + content_type, params); + + return boost::algorithm::iequals(content_type, "multipart/form-data") && + params.count("boundary") > 0; +} + + +RGWOp *RGWHandler_REST_Service_SWIFT::op_get() +{ + return new RGWListBuckets_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_head() +{ + return new RGWStatAccount_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_put() +{ + if (s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + return nullptr; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_post() +{ + if (s->info.args.exists("bulk-delete")) { + return new RGWBulkDelete_ObjStore_SWIFT; + } + return new RGWPutMetadataAccount_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_delete() +{ + if (s->info.args.exists("bulk-delete")) { + return new RGWBulkDelete_ObjStore_SWIFT; + } + return NULL; +} + +int RGWSwiftWebsiteHandler::serve_errordoc(const int http_ret, + const std::string error_doc) +{ + /* Try to throw it all away. */ + s->formatter->reset(); + + class RGWGetErrorPage : public RGWGetObj_ObjStore_SWIFT { + public: + RGWGetErrorPage(RGWRados* const store, + RGWHandler_REST* const handler, + req_state* const s, + const int http_ret) { + /* Calling a virtual from the base class is safe as the subobject should + * be properly initialized and we haven't overridden the init method. */ + init(store, s, handler); + set_get_data(true); + set_custom_http_response(http_ret); + } + + int error_handler(const int err_no, + std::string* const error_content) override { + /* Enforce that any error generated while getting the error page will + * not be send to a client. 
This allows us to recover from the double + * fault situation by sending the original message. */ + return 0; + } + } get_errpage_op(store, handler, s, http_ret); + + s->object = std::to_string(http_ret) + error_doc; + + RGWOp* newop = &get_errpage_op; + RGWRequest req(0); + return rgw_process_authenticated(handler, newop, &req, s, true); +} + +int RGWSwiftWebsiteHandler::error_handler(const int err_no, + std::string* const error_content) +{ + const auto& ws_conf = s->bucket_info.website_conf; + + if (can_be_website_req() && ! ws_conf.error_doc.empty()) { + set_req_state_err(s, err_no); + return serve_errordoc(s->err.http_ret, ws_conf.error_doc); + } + + /* Let's go to the default, no-op handler. */ + return err_no; +} + +bool RGWSwiftWebsiteHandler::is_web_mode() const +{ + const boost::string_ref webmode = s->info.env->get("HTTP_X_WEB_MODE", ""); + return boost::algorithm::iequals(webmode, "true"); +} + +bool RGWSwiftWebsiteHandler::can_be_website_req() const +{ + /* Static website works only with the GET or HEAD method. Nothing more. */ + static const std::set ws_methods = { "GET", "HEAD" }; + if (ws_methods.count(s->info.method) == 0) { + return false; + } + + /* We also need to handle early failures from the auth system. In such cases + * req_state::auth.identity may be empty. Let's treat that the same way as + * the anonymous access. */ + if (! s->auth.identity) { + return true; + } + + /* Swift serves websites only for anonymous requests unless client explicitly + * requested this behaviour by supplying X-Web-Mode HTTP header set to true. 
*/ + if (s->auth.identity->is_anonymous() || is_web_mode()) { + return true; + } + + return false; +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_redirect_op() +{ + class RGWMovedPermanently: public RGWOp { + const std::string location; + public: + explicit RGWMovedPermanently(const std::string& location) + : location(location) { + } + + int verify_permission() override { + return 0; + } + + void execute() override { + op_ret = -ERR_PERMANENT_REDIRECT; + return; + } + + void send_response() override { + set_req_state_err(s, op_ret); + dump_errno(s); + dump_content_length(s, 0); + dump_redirect(s, location); + end_header(s, this); + } + + const char* name() const override { + return "RGWMovedPermanently"; + } + }; + + return new RGWMovedPermanently(s->info.request_uri + '/'); +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_index_op() +{ + /* Retarget to get obj on requested index file. */ + if (! s->object.empty()) { + s->object = s->object.name + + s->bucket_info.website_conf.get_index_doc(); + } else { + s->object = s->bucket_info.website_conf.get_index_doc(); + } + + auto getop = new RGWGetObj_ObjStore_SWIFT; + getop->set_get_data(boost::algorithm::equals("GET", s->info.method)); + + return getop; +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_listing_op() +{ + class RGWWebsiteListing : public RGWListBucket_ObjStore_SWIFT { + const std::string prefix_override; + + int get_params() override { + prefix = prefix_override; + max = default_max; + delimiter = "/"; + return 0; + } + + void send_response() override { + /* Generate the header now. */ + set_req_state_err(s, op_ret); + dump_errno(s); + dump_container_metadata(s, bucket, bucket_quota, + s->bucket_info.website_conf); + end_header(s, this, "text/html"); + if (op_ret < 0) { + return; + } + + /* Now it's the time to start generating HTML bucket listing. + * All the crazy stuff with crafting tags will be delegated to + * RGWSwiftWebsiteListingFormatter. 
*/ + std::stringstream ss; + RGWSwiftWebsiteListingFormatter htmler(ss, prefix); + + const auto& ws_conf = s->bucket_info.website_conf; + htmler.generate_header(s->decoded_uri, + ws_conf.listing_css_doc); + + for (const auto& pair : common_prefixes) { + std::string subdir_name = pair.first; + if (! subdir_name.empty()) { + /* To be compliant with Swift we need to remove the trailing + * slash. */ + subdir_name.pop_back(); + } + + htmler.dump_subdir(subdir_name); + } + + for (const rgw_bucket_dir_entry& obj : objs) { + if (! common_prefixes.count(obj.key.name + '/')) { + htmler.dump_object(obj); + } + } + + htmler.generate_footer(); + dump_body(s, ss.str()); + } + public: + /* Taking prefix_override by value to leverage std::string r-value ref + * ctor and thus avoid extra memory copying/increasing ref counter. */ + explicit RGWWebsiteListing(std::string prefix_override) + : prefix_override(std::move(prefix_override)) { + } + }; + + std::string prefix = std::move(s->object.name); + s->object = rgw_obj_key(); + + return new RGWWebsiteListing(std::move(prefix)); +} + +bool RGWSwiftWebsiteHandler::is_web_dir() const +{ + std::string subdir_name = url_decode(s->object.name); + + /* Remove character from the subdir name if it is "/". */ + if (subdir_name.empty()) { + return false; + } else if (subdir_name.back() == '/') { + subdir_name.pop_back(); + if (subdir_name.empty()) { + return false; + } + } + + rgw_obj obj(s->bucket, std::move(subdir_name)); + + /* First, get attrset of the object we'll try to retrieve. */ + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + obj_ctx.set_atomic(obj); + obj_ctx.set_prefetch_data(obj); + + RGWObjState* state = nullptr; + if (store->get_obj_state(&obj_ctx, s->bucket_info, obj, &state, false) < 0) { + return false; + } + + /* A nonexistent object cannot be a considered as a marker representing + * the emulation of catalog in FS hierarchy. */ + if (! state->exists) { + return false; + } + + /* Decode the content type. 
*/ + std::string content_type; + get_contype_from_attrs(state->attrset, content_type); + + const auto& ws_conf = s->bucket_info.website_conf; + const std::string subdir_marker = ws_conf.subdir_marker.empty() + ? "application/directory" + : ws_conf.subdir_marker; + return subdir_marker == content_type && state->size <= 1; +} + +bool RGWSwiftWebsiteHandler::is_index_present(const std::string& index) +{ + rgw_obj obj(s->bucket, index); + + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + obj_ctx.set_atomic(obj); + obj_ctx.set_prefetch_data(obj); + + RGWObjState* state = nullptr; + if (store->get_obj_state(&obj_ctx, s->bucket_info, obj, &state, false) < 0) { + return false; + } + + /* A nonexistent object cannot be a considered as a viable index. We will + * try to list the bucket or - if this is impossible - return an error. */ + return state->exists; +} + +int RGWSwiftWebsiteHandler::retarget_bucket(RGWOp* op, RGWOp** new_op) +{ + ldout(s->cct, 10) << "Starting retarget" << dendl; + RGWOp* op_override = nullptr; + + /* In Swift static web content is served if the request is anonymous or + * has X-Web-Mode HTTP header specified to true. */ + if (can_be_website_req()) { + const auto& ws_conf = s->bucket_info.website_conf; + const auto& index = s->bucket_info.website_conf.get_index_doc(); + + if (s->decoded_uri.back() != '/') { + op_override = get_ws_redirect_op(); + } else if (! index.empty() && is_index_present(index)) { + op_override = get_ws_index_op(); + } else if (ws_conf.listing_enabled) { + op_override = get_ws_listing_op(); + } + } + + if (op_override) { + handler->put_op(op); + op_override->init(store, s, handler); + + *new_op = op_override; + } else { + *new_op = op; + } + + /* Return 404 Not Found is the request has web mode enforced but we static web + * wasn't able to serve it accordingly. */ + return ! op_override && is_web_mode() ? 
-ENOENT : 0; +} + +int RGWSwiftWebsiteHandler::retarget_object(RGWOp* op, RGWOp** new_op) +{ + ldout(s->cct, 10) << "Starting object retarget" << dendl; + RGWOp* op_override = nullptr; + + /* In Swift static web content is served if the request is anonymous or + * has X-Web-Mode HTTP header specified to true. */ + if (can_be_website_req() && is_web_dir()) { + const auto& ws_conf = s->bucket_info.website_conf; + const auto& index = s->bucket_info.website_conf.get_index_doc(); + + if (s->decoded_uri.back() != '/') { + op_override = get_ws_redirect_op(); + } else if (! index.empty() && is_index_present(index)) { + op_override = get_ws_index_op(); + } else if (ws_conf.listing_enabled) { + op_override = get_ws_listing_op(); + } + } else { + /* A regular request or the specified object isn't a subdirectory marker. + * We don't need any re-targeting. Error handling (like sending a custom + * error page) will be performed by error_handler of the actual RGWOp. */ + return 0; + } + + if (op_override) { + handler->put_op(op); + op_override->init(store, s, handler); + + *new_op = op_override; + } else { + *new_op = op; + } + + /* Return 404 Not Found if we aren't able to re-target for subdir marker. */ + return ! op_override ? 
-ENOENT : 0; +} + + +RGWOp *RGWHandler_REST_Bucket_SWIFT::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_SWIFT; + } + + if (get_data) + return new RGWListBucket_ObjStore_SWIFT; + else + return new RGWStatBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_head() +{ + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_SWIFT; + } + if(s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + return new RGWCreateBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_delete() +{ + return new RGWDeleteBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_post() +{ + if (RGWFormPost::is_formpost_req(s)) { + return new RGWFormPost; + } else { + return new RGWPutMetadataBucket_ObjStore_SWIFT; + } +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_options() +{ + return new RGWOptionsCORS_ObjStore_SWIFT; +} + + +RGWOp *RGWHandler_REST_Obj_SWIFT::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_SWIFT; + } + + RGWGetObj_ObjStore_SWIFT *get_obj_op = new RGWGetObj_ObjStore_SWIFT; + get_obj_op->set_get_data(get_data); + return get_obj_op; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_head() +{ + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_SWIFT; + } + if(s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + if (s->init_state.src_bucket.empty()) + return new RGWPutObj_ObjStore_SWIFT; + else + return new RGWCopyObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_delete() +{ + return new RGWDeleteObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_post() 
+{ + if (RGWFormPost::is_formpost_req(s)) { + return new RGWFormPost; + } else { + return new RGWPutMetadataObject_ObjStore_SWIFT; + } +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_copy() +{ + return new RGWCopyObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_options() +{ + return new RGWOptionsCORS_ObjStore_SWIFT; +} + + +int RGWHandler_REST_SWIFT::authorize(const DoutPrefixProvider *dpp) +{ + return rgw::auth::Strategy::apply(dpp, auth_strategy, s); +} + +int RGWHandler_REST_SWIFT::postauth_init() +{ + struct req_init_state* t = &s->init_state; + + /* XXX Stub this until Swift Auth sets account into URL. */ + s->bucket_tenant = s->user->user_id.tenant; + s->bucket_name = t->url_bucket; + + dout(10) << "s->object=" << + (!s->object.empty() ? s->object : rgw_obj_key("")) + << " s->bucket=" + << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) + << dendl; + + int ret; + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + ret = validate_bucket_name(s->bucket_name); + if (ret) + return ret; + ret = validate_object_name(s->object.name); + if (ret) + return ret; + + if (!t->src_bucket.empty()) { + /* + * We don't allow cross-tenant copy at present. It requires account + * names in the URL for Swift. + */ + s->src_tenant_name = s->user->user_id.tenant; + s->src_bucket_name = t->src_bucket; + + ret = validate_bucket_name(s->src_bucket_name); + if (ret < 0) { + return ret; + } + ret = validate_object_name(s->src_object.name); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket) +{ + const size_t len = bucket.size(); + + if (len > MAX_BUCKET_NAME_LEN) { + /* Bucket Name too long. Generate custom error message and bind it + * to an R-value reference. 
*/
+    const auto msg = boost::str(
+      boost::format("Container name length of %lld longer than %lld")
+        % len % int(MAX_BUCKET_NAME_LEN));
+    set_req_state_err(s, ERR_INVALID_BUCKET_NAME, msg);
+    return -ERR_INVALID_BUCKET_NAME;
+  }
+
+  const auto ret = RGWHandler_REST::validate_bucket_name(bucket);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (len == 0)
+    return 0;
+
+  if (bucket[0] == '.')
+    return -ERR_INVALID_BUCKET_NAME;
+
+  if (check_utf8(bucket.c_str(), len))
+    return -ERR_INVALID_UTF8;
+
+  const char *s = bucket.c_str();
+
+  /* Reject any 0xff byte or '/' anywhere in the name. */
+  for (size_t i = 0; i < len; ++i, ++s) {
+    if (*(unsigned char *)s == 0xff)
+      return -ERR_INVALID_BUCKET_NAME;
+    if (*(unsigned char *)s == '/')
+      return -ERR_INVALID_BUCKET_NAME;
+  }
+
+  return 0;
+}
+
+/* Splits `str` at the first `delim`: `tok` receives the text before it and
+ * `str` keeps the text after it. When `delim` is absent, or is the very
+ * first character, `tok` receives the whole string and `str` becomes empty.
+ * An empty `str` yields an empty `tok`. */
+static void next_tok(string& str, string& tok, char delim)
+{
+  if (str.size() == 0) {
+    tok = "";
+    return;
+  }
+  tok = str;
+  /* Keep find()'s own type; "not found" is tested against npos instead of
+   * relying on npos turning negative when narrowed to int. */
+  const size_t pos = str.find(delim);
+  if (pos != string::npos && pos > 0) {
+    tok = str.substr(0, pos);
+    str = str.substr(pos + 1);
+  } else {
+    str = "";
+  }
+}
+
+/* Parses the Swift request path (frontend prefix + decoded URI) into
+ * request state: query arguments, optional account name, the bucket
+ * (init_state.url_bucket), the object key and info.effective_uri.
+ *
+ * Returns 0 on success (including paths that do not start with '/' or name
+ * no bucket), -ERR_BAD_URL for a bare prefix, -ENOENT when the path does not
+ * start with the expected "[/<prefix>]/v1[/AUTH_<tenant>]" string,
+ * -ERR_PRECONDITION_FAILED for an empty account name, or the error from
+ * allocate_formatter(). */
+int RGWHandler_REST_SWIFT::init_from_header(struct req_state* const s,
+                                            const std::string& frontend_prefix)
+{
+  string req;
+  string first;
+
+  s->prot_flags |= RGW_REST_SWIFT;
+
+  /* Heap-backed buffer instead of a variable-length array sized by the
+   * request URI. Built from the C strings so the contents match what
+   * sprintf("%s%s") produced. */
+  std::string reqbuf(frontend_prefix.c_str());
+  reqbuf.append(s->decoded_uri.c_str());
+  const char *req_name = reqbuf.c_str();
+
+  const char *p;
+
+  if (*req_name == '?') {
+    p = req_name;
+  } else {
+    p = s->info.request_params.c_str();
+  }
+
+  s->info.args.set(p);
+  s->info.args.parse();
+
+  /* Skip the leading slash of URL hierarchy. */
+  if (req_name[0] != '/') {
+    return 0;
+  } else {
+    req_name++;
+  }
+
+  if ('\0' == req_name[0]) {
+    return g_conf()->rgw_swift_url_prefix == "/" ? -ERR_BAD_URL : 0;
+  }
+
+  req = req_name;
+
+  size_t pos = req.find('/');
+  if (std::string::npos != pos && g_conf()->rgw_swift_url_prefix != "/") {
+    bool cut_url = g_conf()->rgw_swift_url_prefix.length();
+    first = req.substr(0, pos);
+
+    if (first.compare(g_conf()->rgw_swift_url_prefix) == 0) {
+      if (cut_url) {
+        /* Rewind to the "v1/..." part. */
+        next_tok(req, first, '/');
+      }
+    }
+  } else if (req.compare(g_conf()->rgw_swift_url_prefix) == 0) {
+    s->formatter = new RGWFormatter_Plain;
+    return -ERR_BAD_URL;
+  } else {
+    first = req;
+  }
+
+  std::string tenant_path;
+  if (! g_conf()->rgw_swift_tenant_name.empty()) {
+    tenant_path = "/AUTH_";
+    tenant_path.append(g_conf()->rgw_swift_tenant_name);
+  }
+
+  /* verify that the request_uri conforms with what's expected */
+  std::string expected_prefix;
+  if (g_conf()->rgw_swift_url_prefix == "/") {
+    expected_prefix = "/v1";
+  } else {
+    expected_prefix = "/";
+    expected_prefix.append(g_conf()->rgw_swift_url_prefix.c_str());
+    expected_prefix.append("/v1");
+  }
+  expected_prefix.append(tenant_path.c_str());
+
+  if (strncmp(reqbuf.c_str(), expected_prefix.c_str(),
+              expected_prefix.length()) != 0) {
+    return -ENOENT;
+  }
+
+  int ret = allocate_formatter(s, RGW_FORMAT_PLAIN, true);
+  if (ret < 0)
+    return ret;
+
+  string ver;
+
+  next_tok(req, ver, '/');
+
+  if (!tenant_path.empty() || g_conf()->rgw_swift_account_in_url) {
+    string account_name;
+    next_tok(req, account_name, '/');
+
+    /* Erase all pre-defined prefixes like "AUTH_" or "KEY_". */
+    const std::vector<std::string> skipped_prefixes = { "AUTH_", "KEY_" };
+
+    for (const auto& pfx : skipped_prefixes) {
+      const size_t comp_len = min(account_name.length(), pfx.length());
+      if (account_name.compare(0, comp_len, pfx) == 0) {
+        /* Prefix is present. Drop it. */
+        account_name = account_name.substr(comp_len);
+        break;
+      }
+    }
+
+    if (account_name.empty()) {
+      return -ERR_PRECONDITION_FAILED;
+    } else {
+      s->account_name = account_name;
+    }
+  }
+
+  next_tok(req, first, '/');
+
+  dout(10) << "ver=" << ver << " first=" << first << " req=" << req << dendl;
+  if (first.size() == 0)
+    return 0;
+
+  s->info.effective_uri = "/" + first;
+
+  // Save bucket to tide us over until token is parsed.
+  s->init_state.url_bucket = first;
+
+  if (req.size()) {
+    s->object =
+      rgw_obj_key(req, s->info.env->get("HTTP_X_OBJECT_VERSION_ID", "")); /* rgw swift extension */
+    s->info.effective_uri.append("/" + s->object.name);
+  }
+
+  return 0;
+}
+
+/* Per-request handler setup: sets the dialect to "swift", parses the
+ * X-Copy-From header into the copy source, rewrites a COPY request into a
+ * PUT whose target comes from the Destination header, records the storage
+ * class header, then defers to RGWHandler_REST::init(). Returns
+ * -ERR_BAD_URL when a copy location is missing or cannot be parsed. */
+int RGWHandler_REST_SWIFT::init(RGWRados* store, struct req_state* s,
+                                rgw::io::BasicClient *cio)
+{
+  struct req_init_state *t = &s->init_state;
+
+  s->dialect = "swift";
+
+  std::string copy_source = s->info.env->get("HTTP_X_COPY_FROM", "");
+  if (! copy_source.empty()) {
+    bool result = RGWCopyObj::parse_copy_location(copy_source, t->src_bucket,
+                                                  s->src_object);
+    if (!result)
+      return -ERR_BAD_URL;
+  }
+
+  if (s->op == OP_COPY) {
+    std::string req_dest = s->info.env->get("HTTP_DESTINATION", "");
+    if (req_dest.empty())
+      return -ERR_BAD_URL;
+
+    std::string dest_bucket_name;
+    rgw_obj_key dest_obj_key;
+    bool result =
+      RGWCopyObj::parse_copy_location(req_dest, dest_bucket_name,
+                                      dest_obj_key);
+    if (!result)
+      return -ERR_BAD_URL;
+
+    std::string dest_object = dest_obj_key.name;
+
+    /* convert COPY operation into PUT */
+    t->src_bucket = t->url_bucket;
+    s->src_object = s->object;
+    t->url_bucket = dest_bucket_name;
+    s->object = rgw_obj_key(dest_object);
+    s->op = OP_PUT;
+  }
+
+  s->info.storage_class = s->info.env->get("HTTP_X_OBJECT_STORAGE_CLASS", "");
+
+  return RGWHandler_REST::init(store, s, cio);
+}
+
+RGWHandler_REST*
+RGWRESTMgr_SWIFT::get_handler(struct req_state* const s,
+                              const rgw::auth::StrategyRegistry& auth_registry,
+                              const std::string& frontend_prefix)
+{
+  int ret =
RGWHandler_REST_SWIFT::init_from_header(s, frontend_prefix);
+  if (ret < 0) {
+    ldout(s->cct, 10) << "init_from_header returned err=" << ret << dendl;
+    return nullptr;
+  }
+
+  const auto& auth_strategy = auth_registry.get_swift();
+
+  /* Pick the handler by how much of the path was parsed: no bucket ->
+   * service handler, bucket without object -> bucket handler, otherwise
+   * the object handler. */
+  if (s->init_state.url_bucket.empty()) {
+    return new RGWHandler_REST_Service_SWIFT(auth_strategy);
+  }
+
+  if (s->object.empty()) {
+    return new RGWHandler_REST_Bucket_SWIFT(auth_strategy);
+  }
+
+  return new RGWHandler_REST_Obj_SWIFT(auth_strategy);
+}
+
+/* Marks the request as Swift and returns the info handler, wired to the
+ * Swift auth strategy; frontend_prefix is not used here. */
+RGWHandler_REST* RGWRESTMgr_SWIFT_Info::get_handler(
+  struct req_state* const s,
+  const rgw::auth::StrategyRegistry& auth_registry,
+  const std::string& frontend_prefix)
+{
+  s->prot_flags |= RGW_REST_SWIFT;
+  const auto& auth_strategy = auth_registry.get_swift();
+  return new RGWHandler_REST_SWIFT_Info(auth_strategy);
+}
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
new file mode 100644
index 00000000..2f902c46
--- /dev/null
+++ b/src/rgw/rgw_rest_swift.h
@@ -0,0 +1,681 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_REST_SWIFT_H
+#define CEPH_RGW_REST_SWIFT_H
+#define TIME_BUF_SIZE 128
+
+/* NOTE(review): the bare #include lines in this header carry no target;
+ * the <...> operands look stripped by text extraction - restore them from
+ * the upstream file. */
+#include
+#include
+
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_swift_auth.h"
+#include "rgw_http_errors.h"
+
+#include
+
+/* Swift flavour of the object GET op. */
+class RGWGetObj_ObjStore_SWIFT : public RGWGetObj_ObjStore {
+  /* Stored by set_custom_http_response(); defaults to 0. */
+  int custom_http_ret = 0;
+public:
+  RGWGetObj_ObjStore_SWIFT() {}
+  ~RGWGetObj_ObjStore_SWIFT() override {}
+
+  int verify_permission() override;
+  int get_params() override;
+  int send_response_data_error() override;
+  int send_response_data(bufferlist& bl, off_t ofs, off_t len) override;
+
+  void set_custom_http_response(const int http_ret) {
+    custom_http_ret = http_ret;
+  }
+
+  bool need_object_expiration() override {
+    return true;
+  }
+};
+
+/* Swift flavour of the bucket (container) listing op for an account. */
+class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore {
+  bool need_stats;
+  bool wants_reversed;
+  std::string prefix;
+  std::vector
reverse_buffer;
+  /* NOTE(review): the element type of reverse_buffer above is missing
+   * (template argument stripped by extraction) - restore from upstream. */
+
+  uint64_t get_default_max() const override {
+    return 0;
+  }
+
+public:
+  /* Stats are requested and reversed output is off by default. */
+  RGWListBuckets_ObjStore_SWIFT()
+    : need_stats(true),
+      wants_reversed(false) {
+  }
+  ~RGWListBuckets_ObjStore_SWIFT() override {}
+
+  int get_params() override;
+  void handle_listing_chunk(RGWUserBuckets&& buckets) override;
+  void send_response_begin(bool has_buckets) override;
+  void send_response_data(RGWUserBuckets& buckets) override;
+  void send_response_data_reversed(RGWUserBuckets& buckets);
+  void dump_bucket_entry(const RGWBucketEnt& obj);
+  void send_response_end() override;
+
+  bool should_get_stats() override { return need_stats; }
+  bool supports_account_metadata() override { return true; }
+};
+
+/* Swift flavour of the object listing op for one container. */
+class RGWListBucket_ObjStore_SWIFT : public RGWListBucket_ObjStore {
+  string path;
+public:
+  /* Sets the inherited default_max to 10000. */
+  RGWListBucket_ObjStore_SWIFT() {
+    default_max = 10000;
+  }
+  ~RGWListBucket_ObjStore_SWIFT() override {}
+
+  int get_params() override;
+  void send_response() override;
+  bool need_container_stats() override { return true; }
+};
+
+/* Swift flavour of the account stat op. */
+class RGWStatAccount_ObjStore_SWIFT : public RGWStatAccount_ObjStore {
+  /* NOTE(review): key/value types of this map are missing (template
+   * arguments stripped by extraction) - restore from upstream. */
+  map attrs;
+public:
+  RGWStatAccount_ObjStore_SWIFT() {
+  }
+  ~RGWStatAccount_ObjStore_SWIFT() override {}
+
+  void execute() override;
+  void send_response() override;
+};
+
+/* Swift flavour of the container stat op. */
+class RGWStatBucket_ObjStore_SWIFT : public RGWStatBucket_ObjStore {
+public:
+  RGWStatBucket_ObjStore_SWIFT() {}
+  ~RGWStatBucket_ObjStore_SWIFT() override {}
+
+  void send_response() override;
+};
+
+/* Swift flavour of the container creation op. */
+class RGWCreateBucket_ObjStore_SWIFT : public RGWCreateBucket_ObjStore {
+protected:
+  bool need_metadata_upload() const override { return true; }
+public:
+  RGWCreateBucket_ObjStore_SWIFT() {}
+  ~RGWCreateBucket_ObjStore_SWIFT() override {}
+
+  int get_params() override;
+  void send_response() override;
+};
+
+/* Swift flavour of the container deletion op. */
+class RGWDeleteBucket_ObjStore_SWIFT : public RGWDeleteBucket_ObjStore {
+public:
+  RGWDeleteBucket_ObjStore_SWIFT() {}
+  ~RGWDeleteBucket_ObjStore_SWIFT() override {}
+
+  void send_response()
override;
+};
+
+/* Swift flavour of the object upload op. */
+class RGWPutObj_ObjStore_SWIFT : public RGWPutObj_ObjStore {
+  string lo_etag;
+public:
+  RGWPutObj_ObjStore_SWIFT() {}
+  ~RGWPutObj_ObjStore_SWIFT() override {}
+
+  int update_slo_segment_size(rgw_slo_entry& entry);
+
+  int verify_permission() override;
+  int get_params() override;
+  void send_response() override;
+};
+
+/* Swift flavour of the account metadata update op. */
+class RGWPutMetadataAccount_ObjStore_SWIFT : public RGWPutMetadataAccount_ObjStore {
+public:
+  RGWPutMetadataAccount_ObjStore_SWIFT() {}
+  ~RGWPutMetadataAccount_ObjStore_SWIFT() override {}
+
+  int get_params() override;
+  void send_response() override;
+};
+
+/* Swift flavour of the container metadata update op. */
+class RGWPutMetadataBucket_ObjStore_SWIFT : public RGWPutMetadataBucket_ObjStore {
+public:
+  RGWPutMetadataBucket_ObjStore_SWIFT() {}
+  ~RGWPutMetadataBucket_ObjStore_SWIFT() override {}
+
+  int get_params() override;
+  void send_response() override;
+};
+
+/* Swift flavour of the object metadata update op. */
+class RGWPutMetadataObject_ObjStore_SWIFT : public RGWPutMetadataObject_ObjStore {
+public:
+  RGWPutMetadataObject_ObjStore_SWIFT() {}
+  ~RGWPutMetadataObject_ObjStore_SWIFT() override {}
+
+  int get_params() override;
+  void send_response() override;
+  bool need_object_expiration() override { return true; }
+};
+
+/* Swift flavour of the object deletion op. */
+class RGWDeleteObj_ObjStore_SWIFT : public RGWDeleteObj_ObjStore {
+public:
+  RGWDeleteObj_ObjStore_SWIFT() {}
+  ~RGWDeleteObj_ObjStore_SWIFT() override {}
+
+  int verify_permission() override;
+  int get_params() override;
+  bool need_object_expiration() override { return true; }
+  void send_response() override;
+};
+
+/* Swift flavour of the object copy op. */
+class RGWCopyObj_ObjStore_SWIFT : public RGWCopyObj_ObjStore {
+  /* Starts out false; not modified anywhere in this header. */
+  bool sent_header;
+protected:
+  void dump_copy_info();
+public:
+  RGWCopyObj_ObjStore_SWIFT() : sent_header(false) {}
+  ~RGWCopyObj_ObjStore_SWIFT() override {}
+
+  int init_dest_policy() override;
+  int get_params() override;
+  void send_response() override;
+  void send_partial_response(off_t ofs) override;
+};
+
+/* Swift flavour of the ACL read op. */
+class RGWGetACLs_ObjStore_SWIFT : public RGWGetACLs_ObjStore {
+public:
+  RGWGetACLs_ObjStore_SWIFT() {}
+
~RGWGetACLs_ObjStore_SWIFT() override {}
+
+  /* Intentionally empty: nothing is sent from here. */
+  void send_response() override {}
+};
+
+/* Swift flavour of the ACL write op. */
+class RGWPutACLs_ObjStore_SWIFT : public RGWPutACLs_ObjStore {
+public:
+  RGWPutACLs_ObjStore_SWIFT() : RGWPutACLs_ObjStore() {}
+  ~RGWPutACLs_ObjStore_SWIFT() override {}
+
+  /* Intentionally empty: nothing is sent from here. */
+  void send_response() override {}
+};
+
+/* Swift flavour of the CORS OPTIONS op. */
+class RGWOptionsCORS_ObjStore_SWIFT : public RGWOptionsCORS_ObjStore {
+public:
+  RGWOptionsCORS_ObjStore_SWIFT() {}
+  ~RGWOptionsCORS_ObjStore_SWIFT() override {}
+
+  void send_response() override;
+};
+
+/* Swift flavour of the bulk delete op. */
+class RGWBulkDelete_ObjStore_SWIFT : public RGWBulkDelete_ObjStore {
+public:
+  RGWBulkDelete_ObjStore_SWIFT() {}
+  ~RGWBulkDelete_ObjStore_SWIFT() override {}
+
+  /* NOTE(review): the element type of the std::list parameter is missing
+   * (template argument stripped by extraction) - restore from upstream. */
+  int get_data(std::list& items,
+               bool * is_truncated) override;
+  void send_response() override;
+};
+
+/* Swift flavour of the bulk upload (archive extraction) op. */
+class RGWBulkUploadOp_ObjStore_SWIFT : public RGWBulkUploadOp_ObjStore {
+  /* Both counters start at 0. */
+  size_t conlen;
+  size_t curpos;
+
+public:
+  RGWBulkUploadOp_ObjStore_SWIFT()
+    : conlen(0),
+      curpos(0) {
+  }
+  ~RGWBulkUploadOp_ObjStore_SWIFT() = default;
+
+  /* NOTE(review): the pointee type of the returned std::unique_ptr is
+   * missing (template argument stripped) - restore from upstream. */
+  std::unique_ptr create_stream() override;
+  void send_response() override;
+};
+
+/* Swift flavour of the info op. */
+class RGWInfo_ObjStore_SWIFT : public RGWInfo_ObjStore {
+protected:
+  /* One info section: an admin-only flag plus the callback that emits it.
+   * NOTE(review): the signature of `function` and the element type of
+   * swift_info below are missing (template arguments stripped); the static
+   * list_*_data() members suggest void(Formatter&, const ConfigProxy&,
+   * RGWRados&) for the callback - confirm against upstream. */
+  struct info
+  {
+    bool is_admin_info;
+    function list_data;
+  };
+
+  static const vector> swift_info;
+public:
+  RGWInfo_ObjStore_SWIFT() {}
+  ~RGWInfo_ObjStore_SWIFT() override {}
+
+  void execute() override;
+  void send_response() override;
+  static void list_swift_data(Formatter& formatter, const ConfigProxy& config, RGWRados& store);
+  static void list_tempauth_data(Formatter& formatter, const ConfigProxy& config, RGWRados& store);
+  static void list_tempurl_data(Formatter& formatter, const ConfigProxy& config, RGWRados& store);
+  static void list_slo_data(Formatter& formatter, const ConfigProxy& config, RGWRados& store);
+  static bool is_expired(const std::string& expires, CephContext* cct);
+};
+
+
+/* Form-post upload op, built on the generic POST-object op. */
+class RGWFormPost : public RGWPostObj_ObjStore {
+  std::string get_current_filename() const override;
+  std::string
get_current_content_type() const override;
+  std::size_t get_max_file_size() /*const*/;
+  bool is_next_file_to_upload() override;
+  bool is_integral();
+  bool is_non_expired();
+  void get_owner_info(const req_state* s,
+                      RGWUserInfo& owner_info) const;
+
+  parts_collection_t ctrl_parts;
+  /* NOTE(review): the value type of this boost::optional is missing
+   * (template argument stripped by extraction) - restore from upstream. */
+  boost::optional current_data_part;
+  std::string prefix;
+  bool stream_done = false;
+
+  class SignatureHelper;
+public:
+  RGWFormPost() = default;
+  ~RGWFormPost() = default;
+
+  void init(RGWRados* store,
+            req_state* s,
+            RGWHandler* dialect_handler) override;
+
+  int get_params() override;
+  int get_data(ceph::bufferlist& bl, bool& again) override;
+  void send_response() override;
+
+  static bool is_formpost_req(req_state* const s);
+};
+
+/* Computes the HMAC-SHA1 form-post signature and keeps it as a hex string
+ * for later comparison. */
+class RGWFormPost::SignatureHelper
+{
+private:
+  /* Two hex characters per digest byte plus one extra byte. */
+  static constexpr uint32_t output_size =
+    CEPH_CRYPTO_HMACSHA1_DIGESTSIZE * 2 + 1;
+
+  unsigned char dest[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; // 20
+  char dest_str[output_size];
+
+public:
+  SignatureHelper() = default;
+
+  /* Feeds path_info, redirect, max_file_size, max_file_count and expires
+   * into an HMAC-SHA1 keyed with `key`, separating consecutive fields with
+   * "\n" (none after `expires`), then hex-encodes the digest into dest_str
+   * and returns it. */
+  const char* calc(const std::string& key,
+                   const boost::string_ref& path_info,
+                   const boost::string_ref& redirect,
+                   const boost::string_ref& max_file_size,
+                   const boost::string_ref& max_file_count,
+                   const boost::string_ref& expires) {
+    using ceph::crypto::HMACSHA1;
+    using UCHARPTR = const unsigned char*;
+
+    HMACSHA1 hmac((UCHARPTR) key.data(), key.size());
+
+    hmac.Update((UCHARPTR) path_info.data(), path_info.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) redirect.data(), redirect.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) max_file_size.data(), max_file_size.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) max_file_count.data(), max_file_count.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) expires.data(), expires.size());
+
+    hmac.Final(dest);
+
+    buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str);
+
+    return dest_str;
+  }
+
+  /* Returns the hex string written by the last calc(). */
+  const char* get_signature() const {
+
return dest_str;
+  }
+
+  /* True when the first output_size characters of `rhs` equal dest_str;
+   * inputs shorter than output_size - 1 characters are rejected up front.
+   * NOTE(review): std::string::compare is presumably not constant-time -
+   * confirm whether that matters for signature checks. */
+  bool is_equal_to(const std::string& rhs) const {
+    /* never allow out-of-range exception */
+    if (rhs.size() < (output_size - 1)) {
+      return false;
+    }
+    return rhs.compare(0 /* pos */, output_size, dest_str) == 0;
+  }
+
+}; /* RGWFormPost::SignatureHelper */
+
+
+/* Helper holding the store, request state and owning handler; exposes the
+ * error and retarget hooks that the bucket/object handlers forward to. */
+class RGWSwiftWebsiteHandler {
+  RGWRados* const store;
+  req_state* const s;
+  RGWHandler_REST* const handler;
+
+  bool is_web_mode() const;
+  bool can_be_website_req() const;
+  bool is_web_dir() const;
+  bool is_index_present(const std::string& index);
+
+  int serve_errordoc(int http_ret, std::string error_doc);
+
+  RGWOp* get_ws_redirect_op();
+  RGWOp* get_ws_index_op();
+  RGWOp* get_ws_listing_op();
+public:
+  RGWSwiftWebsiteHandler(RGWRados* const store,
+                         req_state* const s,
+                         RGWHandler_REST* const handler)
+    : store(store),
+      s(s),
+      handler(handler) {
+  }
+
+  int error_handler(const int err_no,
+                    std::string* const error_content);
+  int retarget_bucket(RGWOp* op, RGWOp** new_op);
+  int retarget_object(RGWOp* op, RGWOp** new_op);
+};
+
+
+/* Common base of the Swift REST handlers; keeps a reference to the auth
+ * strategy passed to the constructor. */
+class RGWHandler_REST_SWIFT : public RGWHandler_REST {
+  friend class RGWRESTMgr_SWIFT;
+  friend class RGWRESTMgr_SWIFT_Info;
+protected:
+  const rgw::auth::Strategy& auth_strategy;
+
+  /* Base implementation: never an ACL operation. */
+  virtual bool is_acl_op() {
+    return false;
+  }
+
+  static int init_from_header(struct req_state* s,
+                              const std::string& frontend_prefix);
+public:
+  explicit RGWHandler_REST_SWIFT(const rgw::auth::Strategy& auth_strategy)
+    : auth_strategy(auth_strategy) {
+  }
+  ~RGWHandler_REST_SWIFT() override = default;
+
+  int validate_bucket_name(const string& bucket);
+
+  int init(RGWRados *store, struct req_state *s, rgw::io::BasicClient *cio) override;
+  int authorize(const DoutPrefixProvider *dpp) override;
+  int postauth_init() override;
+
+  /* alloc_policy() always returns nullptr; free_policy() deletes whatever
+   * it is handed. */
+  RGWAccessControlPolicy *alloc_policy() { return nullptr; /* return new RGWAccessControlPolicy_SWIFT; */ }
+  void free_policy(RGWAccessControlPolicy *policy) { delete policy; }
+};
+
+class RGWHandler_REST_Service_SWIFT :
public RGWHandler_REST_SWIFT {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_head() override;
+  RGWOp *op_put() override;
+  RGWOp *op_post() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+  ~RGWHandler_REST_Service_SWIFT() override = default;
+};
+
+/* Handler for container-level Swift requests. Error handling and
+ * retargeting are forwarded to a RGWSwiftWebsiteHandler that is created in
+ * init(), so neither hook may be used before init() has run. */
+class RGWHandler_REST_Bucket_SWIFT : public RGWHandler_REST_SWIFT {
+  /* We need the boost::optional here only because of handler's late
+   * initialization (see the init() method). */
+  boost::optional<RGWSwiftWebsiteHandler> website_handler;
+protected:
+  /* POST is the only method treated as an object update. */
+  bool is_obj_update_op() override {
+    return s->op == OP_POST;
+  }
+
+  RGWOp *get_obj_op(bool get_data);
+  RGWOp *op_get() override;
+  RGWOp *op_head() override;
+  RGWOp *op_put() override;
+  RGWOp *op_delete() override;
+  RGWOp *op_post() override;
+  RGWOp *op_options() override;
+public:
+  using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+  ~RGWHandler_REST_Bucket_SWIFT() override = default;
+
+  int error_handler(int err_no, std::string *error_content) override {
+    return website_handler->error_handler(err_no, error_content);
+  }
+
+  int retarget(RGWOp* op, RGWOp** new_op) override {
+    return website_handler->retarget_bucket(op, new_op);
+  }
+
+  /* Builds the website handler in place, then runs the common Swift
+   * handler initialization. */
+  int init(RGWRados* const store,
+           struct req_state* const s,
+           rgw::io::BasicClient* const cio) override {
+    website_handler = boost::in_place(store, s, this);
+    return RGWHandler_REST_SWIFT::init(store, s, cio);
+  }
+};
+
+class RGWHandler_REST_Obj_SWIFT : public RGWHandler_REST_SWIFT {
+  /* We need the boost::optional here only because of handler's late
+   * initialization (see the init() method).
*/
+  boost::optional<RGWSwiftWebsiteHandler> website_handler;
+protected:
+  /* POST is the only method treated as an object update. */
+  bool is_obj_update_op() override {
+    return s->op == OP_POST;
+  }
+
+  RGWOp *get_obj_op(bool get_data);
+  RGWOp *op_get() override;
+  RGWOp *op_head() override;
+  RGWOp *op_put() override;
+  RGWOp *op_delete() override;
+  RGWOp *op_post() override;
+  RGWOp *op_copy() override;
+  RGWOp *op_options() override;
+
+public:
+  using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+  ~RGWHandler_REST_Obj_SWIFT() override = default;
+
+  /* Both hooks forward to the website handler created in init(); they must
+   * not be used before init() has run. */
+  int error_handler(int err_no, std::string *error_content) override {
+    return website_handler->error_handler(err_no, error_content);
+  }
+
+  int retarget(RGWOp* op, RGWOp** new_op) override {
+    return website_handler->retarget_object(op, new_op);
+  }
+
+  /* Builds the website handler in place, then runs the common Swift
+   * handler initialization. */
+  int init(RGWRados* const store,
+           struct req_state* const s,
+           rgw::io::BasicClient* const cio) override {
+    website_handler = boost::in_place(store, s, this);
+    return RGWHandler_REST_SWIFT::init(store, s, cio);
+  }
+};
+
+/* REST manager for the Swift API. */
+class RGWRESTMgr_SWIFT : public RGWRESTMgr {
+protected:
+  /* The default lookup is the regular resource-manager lookup. */
+  RGWRESTMgr* get_resource_mgr_as_default(struct req_state* const s,
+                                          const std::string& uri,
+                                          std::string* const out_uri) override {
+    return this->get_resource_mgr(s, uri, out_uri);
+  }
+
+public:
+  RGWRESTMgr_SWIFT() = default;
+  ~RGWRESTMgr_SWIFT() override = default;
+
+  RGWHandler_REST *get_handler(struct req_state *s,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override;
+};
+
+
+/* Swift flavour of the cross-domain policy op. */
+class RGWGetCrossDomainPolicy_ObjStore_SWIFT
+  : public RGWGetCrossDomainPolicy_ObjStore {
+public:
+  RGWGetCrossDomainPolicy_ObjStore_SWIFT() = default;
+  ~RGWGetCrossDomainPolicy_ObjStore_SWIFT() override = default;
+
+  void send_response() override;
+};
+
+/* Swift flavour of the health-check op. */
+class RGWGetHealthCheck_ObjStore_SWIFT
+  : public RGWGetHealthCheck_ObjStore {
+public:
+  RGWGetHealthCheck_ObjStore_SWIFT() = default;
+  ~RGWGetHealthCheck_ObjStore_SWIFT() override = default;
+
+  void send_response() override;
+};
+
+class RGWHandler_SWIFT_CrossDomain : public
RGWHandler_REST {
+public:
+  RGWHandler_SWIFT_CrossDomain() = default;
+  ~RGWHandler_SWIFT_CrossDomain() override = default;
+
+  RGWOp *op_get() override {
+    return new RGWGetCrossDomainPolicy_ObjStore_SWIFT();
+  }
+
+  /* Sets the dialect to "swift", installs a JSON formatter, then calls
+   * RGWHandler::init() directly. */
+  int init(RGWRados* const store,
+           struct req_state* const state,
+           rgw::io::BasicClient* const cio) override {
+    state->dialect = "swift";
+    state->formatter = new JSONFormatter;
+    state->format = RGW_FORMAT_JSON;
+
+    return RGWHandler::init(store, state, cio);
+  }
+
+  /* Authorization, post-auth setup and permission loading all succeed
+   * unconditionally for this handler. */
+  int authorize(const DoutPrefixProvider *dpp) override {
+    return 0;
+  }
+
+  int postauth_init() override {
+    return 0;
+  }
+
+  int read_permissions(RGWOp *) override {
+    return 0;
+  }
+
+  virtual RGWAccessControlPolicy *alloc_policy() { return nullptr; }
+  virtual void free_policy(RGWAccessControlPolicy *policy) {}
+};
+
+/* REST manager that answers every URI itself and always hands out the
+ * cross-domain handler. */
+class RGWRESTMgr_SWIFT_CrossDomain : public RGWRESTMgr {
+protected:
+  RGWRESTMgr *get_resource_mgr(struct req_state* const s,
+                               const std::string& uri,
+                               std::string* const out_uri) override {
+    return this;
+  }
+
+public:
+  RGWRESTMgr_SWIFT_CrossDomain() = default;
+  ~RGWRESTMgr_SWIFT_CrossDomain() override = default;
+
+  RGWHandler_REST* get_handler(struct req_state* const s,
+                               const rgw::auth::StrategyRegistry&,
+                               const std::string&) override {
+    s->prot_flags |= RGW_REST_SWIFT;
+    return new RGWHandler_SWIFT_CrossDomain;
+  }
+};
+
+
+/* Handler for the health-check endpoint; mirrors the cross-domain handler
+ * with a different GET op. */
+class RGWHandler_SWIFT_HealthCheck : public RGWHandler_REST {
+public:
+  RGWHandler_SWIFT_HealthCheck() = default;
+  ~RGWHandler_SWIFT_HealthCheck() override = default;
+
+  RGWOp *op_get() override {
+    return new RGWGetHealthCheck_ObjStore_SWIFT();
+  }
+
+  /* Sets the dialect to "swift", installs a JSON formatter, then calls
+   * RGWHandler::init() directly. */
+  int init(RGWRados* const store,
+           struct req_state* const state,
+           rgw::io::BasicClient* const cio) override {
+    state->dialect = "swift";
+    state->formatter = new JSONFormatter;
+    state->format = RGW_FORMAT_JSON;
+
+    return RGWHandler::init(store, state, cio);
+  }
+
+  int authorize(const DoutPrefixProvider *dpp) override {
+    return 0;
+  }
+
+  int postauth_init() override {
+    return
0;
+  }
+
+  int read_permissions(RGWOp *) override {
+    return 0;
+  }
+
+  virtual RGWAccessControlPolicy *alloc_policy() { return nullptr; }
+  virtual void free_policy(RGWAccessControlPolicy *policy) {}
+};
+
+/* REST manager that answers every URI itself and always hands out the
+ * health-check handler. */
+class RGWRESTMgr_SWIFT_HealthCheck : public RGWRESTMgr {
+protected:
+  RGWRESTMgr *get_resource_mgr(struct req_state* const s,
+                               const std::string& uri,
+                               std::string* const out_uri) override {
+    return this;
+  }
+
+public:
+  RGWRESTMgr_SWIFT_HealthCheck() = default;
+  ~RGWRESTMgr_SWIFT_HealthCheck() override = default;
+
+  RGWHandler_REST* get_handler(struct req_state* const s,
+                               const rgw::auth::StrategyRegistry&,
+                               const std::string&) override {
+    s->prot_flags |= RGW_REST_SWIFT;
+    return new RGWHandler_SWIFT_HealthCheck;
+  }
+};
+
+
+/* Handler for the info endpoint: derives from the Swift handler base but
+ * overrides authorize/postauth_init/read_permissions to succeed
+ * unconditionally. */
+class RGWHandler_REST_SWIFT_Info : public RGWHandler_REST_SWIFT {
+public:
+  using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+  ~RGWHandler_REST_SWIFT_Info() override = default;
+
+  RGWOp *op_get() override {
+    return new RGWInfo_ObjStore_SWIFT();
+  }
+
+  /* Sets the dialect to "swift", installs a JSON formatter, then calls
+   * RGWHandler::init() directly. */
+  int init(RGWRados* const store,
+           struct req_state* const state,
+           rgw::io::BasicClient* const cio) override {
+    state->dialect = "swift";
+    state->formatter = new JSONFormatter;
+    state->format = RGW_FORMAT_JSON;
+
+    return RGWHandler::init(store, state, cio);
+  }
+
+  int authorize(const DoutPrefixProvider *dpp) override {
+    return 0;
+  }
+
+  int postauth_init() override {
+    return 0;
+  }
+
+  int read_permissions(RGWOp *) override {
+    return 0;
+  }
+};
+
+/* REST manager for the info endpoint. */
+class RGWRESTMgr_SWIFT_Info : public RGWRESTMgr {
+public:
+  RGWRESTMgr_SWIFT_Info() = default;
+  ~RGWRESTMgr_SWIFT_Info() override = default;
+
+  RGWHandler_REST *get_handler(struct req_state* s,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override;
+};
+
+#endif
diff --git a/src/rgw/rgw_rest_usage.cc b/src/rgw/rgw_rest_usage.cc
new file mode 100644
index 00000000..23b7a971
--- /dev/null
+++ b/src/rgw/rgw_rest_usage.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8;
c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_op.h"
+#include "rgw_usage.h"
+#include "rgw_rest_usage.h"
+
+#include "include/str_list.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// REST op that reports usage; requires the "usage" read capability.
+class RGWOp_Usage_Get : public RGWRESTOp {
+
+public:
+  RGWOp_Usage_Get() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("usage", RGW_CAP_READ);
+  }
+  void execute() override;
+
+  const char* name() const override { return "get_usage"; }
+};
+
+// Reads uid, bucket, start/end epochs, the show-entries/show-summary flags
+// (both default to true) and an optional category list from the request,
+// then stores the result of RGWUsage::show() in http_ret.
+void RGWOp_Usage_Get::execute() {
+  // Category name -> selected; stays empty when no "categories" argument
+  // was supplied.
+  map<string, bool> categories;
+
+  string uid_str;
+  string bucket_name;
+  uint64_t start, end;
+  bool show_entries;
+  bool show_summary;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_epoch(s, "start", 0, &start);
+  RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end);
+  RESTArgs::get_bool(s, "show-entries", true, &show_entries);
+  RESTArgs::get_bool(s, "show-summary", true, &show_summary);
+
+  string cat_str;
+  RESTArgs::get_string(s, "categories", cat_str, &cat_str);
+
+  if (!cat_str.empty()) {
+    list<string> cat_list;
+    get_str_list(cat_str, cat_list);
+    for (const auto& cat : cat_list) {
+      categories[cat] = true;
+    }
+  }
+
+  http_ret = RGWUsage::show(store, uid, bucket_name, start, end, show_entries, show_summary, &categories, flusher);
+}
+
+// REST op that trims usage records; requires the "usage" write capability.
+class RGWOp_Usage_Delete : public RGWRESTOp {
+
+public:
+  RGWOp_Usage_Delete() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("usage", RGW_CAP_WRITE);
+  }
+  void execute() override;
+
+  const char* name() const override { return "trim_usage"; }
+};
+
+void RGWOp_Usage_Delete::execute() {
+  string uid_str;
+  string bucket_name;
+  uint64_t start, end;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_epoch(s, "start", 0,
&start);
+  RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end);
+
+  // With no uid, a bucket given and the default time range, the caller
+  // must pass remove-all=true or the request fails with -EINVAL.
+  // NOTE(review): as written the guard is skipped when the bucket is ALSO
+  // empty, i.e. for the broadest possible trim; `!bucket_name.empty()`
+  // looks inverted - confirm the intended condition before relying on it.
+  if (uid.empty() &&
+      !bucket_name.empty() &&
+      !start &&
+      end == (uint64_t)-1) {
+    bool remove_all;
+    RESTArgs::get_bool(s, "remove-all", false, &remove_all);
+    if (!remove_all) {
+      http_ret = -EINVAL;
+      return;
+    }
+  }
+
+  http_ret = RGWUsage::trim(store, uid, bucket_name, start, end);
+}
+
+// GET on the usage resource.
+RGWOp *RGWHandler_Usage::op_get()
+{
+  return new RGWOp_Usage_Get;
+}
+
+// DELETE on the usage resource.
+RGWOp *RGWHandler_Usage::op_delete()
+{
+  return new RGWOp_Usage_Delete;
+}
+
+
diff --git a/src/rgw/rgw_rest_usage.h b/src/rgw/rgw_rest_usage.h
new file mode 100644
index 00000000..a09f32d0
--- /dev/null
+++ b/src/rgw/rgw_rest_usage.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_REST_USAGE_H
+#define CEPH_RGW_REST_USAGE_H
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+// Handler for the usage resource; permission loading is a no-op here.
+class RGWHandler_Usage : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Usage() override = default;
+
+  int read_permissions(RGWOp*) override {
+    return 0;
+  }
+};
+
+// REST manager that always hands out a usage handler.
+class RGWRESTMgr_Usage : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Usage() = default;
+  ~RGWRESTMgr_Usage() override = default;
+
+  RGWHandler_REST* get_handler(struct req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Usage(auth_registry);
+  }
+};
+
+#endif
diff --git a/src/rgw/rgw_rest_user.cc b/src/rgw/rgw_rest_user.cc
new file mode 100644
index 00000000..d27105e0
--- /dev/null
+++ b/src/rgw/rgw_rest_user.cc
@@ -0,0 +1,999 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_json.h"
+
+#include "rgw_op.h"
+#include "rgw_user.h"
+#include "rgw_rest_user.h"
+
+#include "include/str_list.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys
ceph_subsys_rgw
+
+// REST op that lists users; requires the "users" read capability.
+class RGWOp_User_List : public RGWRESTOp {
+
+public:
+  RGWOp_User_List() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "list_user"; }
+};
+
+// Reads "max-entries" (default 1000) and "marker" from the request and
+// stores the result of RGWUserAdminOp_User::list() in http_ret.
+void RGWOp_User_List::execute()
+{
+  RGWUserAdminOpState op_state;
+
+  uint32_t max_entries;
+  std::string marker;
+  RESTArgs::get_uint32(s, "max-entries", 1000, &max_entries);
+  RESTArgs::get_string(s, "marker", marker, &marker);
+
+  op_state.max_entries = max_entries;
+  op_state.marker = marker;
+  http_ret = RGWUserAdminOp_User::list(store, op_state, flusher);
+}
+
+// REST op that returns one user's info; requires the "users" read
+// capability.
+class RGWOp_User_Info : public RGWRESTOp {
+
+public:
+  RGWOp_User_Info() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "get_user_info"; }
+};
+
+// Looks a user up by "uid" and/or "access-key"; fails with -EINVAL when
+// neither is given. "stats" and "sync" (both default false) are forwarded
+// to the admin op, whose result is stored in http_ret.
+void RGWOp_User_Info::execute()
+{
+  RGWUserAdminOpState op_state;
+
+  std::string uid_str, access_key_str;
+  bool fetch_stats;
+  bool sync_stats;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "access-key", access_key_str, &access_key_str);
+
+  // if uid was not supplied in rest argument, error out now, otherwise we'll
+  // end up initializing anonymous user, for which keys.init will eventually
+  // return -EACCES
+  if (uid_str.empty() && access_key_str.empty()){
+    http_ret=-EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+  RESTArgs::get_bool(s, "sync", false, &sync_stats);
+
+  op_state.set_user_id(uid);
+  op_state.set_access_key(access_key_str);
+  op_state.set_fetch_stats(fetch_stats);
+  op_state.set_sync_stats(sync_stats);
+
+  http_ret = RGWUserAdminOp_User::info(store, op_state, flusher);
+}
+
+// REST op that creates a user; requires the "users" write capability.
+class RGWOp_User_Create : public RGWRESTOp {
+
+public:
+  RGWOp_User_Create() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return
caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "create_user"; }
+};
+
+// Collects the user attributes from the request arguments and hands them
+// to RGWUserAdminOp_User::create(). Only a system user may set the
+// "system" flag; otherwise the request fails with -EINVAL.
+void RGWOp_User_Create::execute()
+{
+  std::string uid_str;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string caps;
+  std::string tenant_name;
+  std::string op_mask_str;
+
+  bool gen_key;
+  bool suspended;
+  bool system;
+  bool exclusive;
+
+  int32_t max_buckets;
+  // NOTE(review): get_val below has no template argument (it looks
+  // stripped by extraction) - restore the value type from upstream.
+  const int32_t default_max_buckets =
+    s->cct->_conf.get_val("rgw_user_max_buckets");
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "display-name", display_name, &display_name);
+  RESTArgs::get_string(s, "email", email, &email);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+  RESTArgs::get_bool(s, "suspended", false, &suspended);
+  RESTArgs::get_int32(s, "max-buckets", default_max_buckets, &max_buckets);
+  RESTArgs::get_bool(s, "system", false, &system);
+  RESTArgs::get_bool(s, "exclusive", false, &exclusive);
+  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+
+  if (!s->user->system && system) {
+    ldout(s->cct, 0) << "cannot set system flag by non-system user" << dendl;
+    http_ret = -EINVAL;
+    return;
+  }
+
+  // An explicit "tenant" argument overrides any tenant parsed from "uid".
+  if (!tenant_name.empty()) {
+    uid.tenant = tenant_name;
+  }
+
+  // TODO: validate required args are passed in. (for eg.
uid and display_name here)
+  op_state.set_user_id(uid);
+  op_state.set_display_name(display_name);
+  op_state.set_user_email(email);
+  op_state.set_caps(caps);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (!op_mask_str.empty()) {
+    uint32_t op_mask;
+    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+    if (ret < 0) {
+      ldout(s->cct, 0) << "failed to parse op_mask: " << ret << dendl;
+      http_ret = -EINVAL;
+      return;
+    }
+    op_state.set_op_mask(op_mask);
+  }
+
+  // any key-type other than "swift" or "s3" is passed on as KEY_TYPE_UNDEFINED
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  // only forward max-buckets when it differs from the configured default
+  if (max_buckets != default_max_buckets)
+    op_state.set_max_buckets(max_buckets);
+
+  // these flags are forwarded only when the argument is present in the request
+  if (s->info.args.exists("suspended"))
+    op_state.set_suspension(suspended);
+
+  if (s->info.args.exists("system"))
+    op_state.set_system(system);
+
+  if (s->info.args.exists("exclusive"))
+    op_state.set_exclusive(exclusive);
+
+  if (gen_key)
+    op_state.set_generate_key();
+
+  http_ret = RGWUserAdminOp_User::create(store, op_state, flusher);
+}
+
+// Admin REST op "modify_user": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_User_Modify : public RGWRESTOp {
+
+public:
+  RGWOp_User_Modify() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "modify_user"; }
+};
+
+// Applies the supplied REST arguments to an existing user via
+// RGWUserAdminOp_User::modify.
+void RGWOp_User_Modify::execute()
+{
+  std::string uid_str;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string caps;
+  std::string op_mask_str;
+
+  bool gen_key;
+  bool suspended;
+  bool system;
+  bool email_set;
+  bool quota_set;
+  int32_t max_buckets;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "display-name",
display_name, &display_name); + RESTArgs::get_string(s, "email", email, &email, &email_set); + RESTArgs::get_string(s, "access-key", access_key, &access_key); + RESTArgs::get_string(s, "secret-key", secret_key, &secret_key); + RESTArgs::get_string(s, "user-caps", caps, &caps); + RESTArgs::get_bool(s, "generate-key", false, &gen_key); + RESTArgs::get_bool(s, "suspended", false, &suspended); + RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, "a_set); + RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str); + + RESTArgs::get_bool(s, "system", false, &system); + RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str); + + if (!s->user->system && system) { + ldout(s->cct, 0) << "cannot set system flag by non-system user" << dendl; + http_ret = -EINVAL; + return; + } + + op_state.set_user_id(uid); + op_state.set_display_name(display_name); + + if (email_set) + op_state.set_user_email(email); + + op_state.set_caps(caps); + op_state.set_access_key(access_key); + op_state.set_secret_key(secret_key); + + if (quota_set) + op_state.set_max_buckets(max_buckets); + + if (gen_key) + op_state.set_generate_key(); + + if (!key_type_str.empty()) { + int32_t key_type = KEY_TYPE_UNDEFINED; + if (key_type_str.compare("swift") == 0) + key_type = KEY_TYPE_SWIFT; + else if (key_type_str.compare("s3") == 0) + key_type = KEY_TYPE_S3; + + op_state.set_key_type(key_type); + } + + if (s->info.args.exists("suspended")) + op_state.set_suspension(suspended); + + if (s->info.args.exists("system")) + op_state.set_system(system); + + if (!op_mask_str.empty()) { + uint32_t op_mask; + int ret = rgw_parse_op_type_list(op_mask_str, &op_mask); + if (ret < 0) { + ldout(s->cct, 0) << "failed to parse op_mask: " << ret << dendl; + http_ret = -EINVAL; + return; + } + op_state.set_op_mask(op_mask); + } + + http_ret = RGWUserAdminOp_User::modify(store, op_state, flusher); +} + +class RGWOp_User_Remove : public RGWRESTOp { + +public: + RGWOp_User_Remove() {} + + int 
check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "remove_user"; }
+};
+
+// Removes the user named by "uid"; "purge-data" is forwarded to the op state.
+void RGWOp_User_Remove::execute()
+{
+  std::string uid_str;
+  bool purge_data;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_bool(s, "purge-data", false, &purge_data);
+
+  // FIXME: no double checking
+  if (!uid.empty())
+    op_state.set_user_id(uid);
+
+  op_state.set_purge_data(purge_data);
+
+  http_ret = RGWUserAdminOp_User::remove(store, op_state, flusher);
+}
+
+// Admin REST op "create_subuser": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_Subuser_Create : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Create() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "create_subuser"; }
+};
+
+// Creates a subuser; key type starts as KEY_TYPE_SWIFT unless "key-type" says otherwise.
+void RGWOp_Subuser_Create::execute()
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string secret_key;
+  std::string access_key;
+  std::string perm_str;
+  std::string key_type_str;
+
+  bool gen_subuser = false; // FIXME placeholder
+  bool gen_secret;
+  bool gen_access;
+
+  uint32_t perm_mask = 0;
+  int32_t key_type = KEY_TYPE_SWIFT;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "access", perm_str, &perm_str);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  //RESTArgs::get_bool(s, "generate-subuser", false, &gen_subuser);
+  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+  RESTArgs::get_bool(s, "gen-access-key", false, &gen_access);
+
+  perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
+
+
op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+  op_state.set_generate_subuser(gen_subuser);
+
+  if (gen_access)
+    op_state.set_gen_access();
+
+  if (gen_secret)
+    op_state.set_gen_secret();
+
+  // an unrecognised key-type leaves key_type at its initial value
+  if (!key_type_str.empty()) {
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+  }
+  op_state.set_key_type(key_type);
+
+  http_ret = RGWUserAdminOp_Subuser::create(store, op_state, flusher);
+}
+
+// Admin REST op "modify_subuser": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_Subuser_Modify : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Modify() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "modify_subuser"; }
+};
+
+// Updates a subuser's permission mask, secret key and key type.
+void RGWOp_Subuser_Modify::execute()
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string perm_str;
+
+  RGWUserAdminOpState op_state;
+
+  uint32_t perm_mask;
+  int32_t key_type = KEY_TYPE_SWIFT;
+
+  bool gen_secret;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "access", perm_str, &perm_str);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+
+  perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+
+  // unlike subuser creation, an empty secret key is not forwarded here
+  if (!secret_key.empty())
+    op_state.set_secret_key(secret_key);
+
+  if (gen_secret)
+    op_state.set_gen_secret();
+
+  if (!key_type_str.empty()) {
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+  }
+
op_state.set_key_type(key_type);
+
+  http_ret = RGWUserAdminOp_Subuser::modify(store, op_state, flusher);
+}
+
+// Admin REST op "remove_subuser": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_Subuser_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Remove() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "remove_subuser"; }
+};
+
+// Removes a subuser; "purge-keys" is read with true as its fallback value.
+void RGWOp_Subuser_Remove::execute()
+{
+  std::string uid_str;
+  std::string subuser;
+  bool purge_keys;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_bool(s, "purge-keys", true, &purge_keys);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+
+  if (purge_keys)
+    op_state.set_purge_keys();
+
+  http_ret = RGWUserAdminOp_Subuser::remove(store, op_state, flusher);
+}
+
+// Admin REST op "create_access_key": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_Key_Create : public RGWRESTOp {
+
+public:
+  RGWOp_Key_Create() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "create_access_key"; }
+};
+
+// Creates a key for a user or subuser; "generate-key" is read with true as
+// its fallback value.
+void RGWOp_Key_Create::execute()
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+
+  bool gen_key;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (gen_key)
+
op_state.set_generate_key();
+
+  // any key-type other than "swift" or "s3" is passed on as KEY_TYPE_UNDEFINED
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  http_ret = RGWUserAdminOp_Key::create(store, op_state, flusher);
+}
+
+// Admin REST op "remove_access_key": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_Key_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Key_Remove() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "remove_access_key"; }
+};
+
+// Removes the key identified by "access-key" for the given user or subuser.
+void RGWOp_Key_Remove::execute()
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string access_key;
+  std::string key_type_str;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+
+  // any key-type other than "swift" or "s3" is passed on as KEY_TYPE_UNDEFINED
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  http_ret = RGWUserAdminOp_Key::remove(store, op_state, flusher);
+}
+
+// Admin REST op "add_user_caps": requires the "users" capability with RGW_CAP_WRITE.
+class RGWOp_Caps_Add : public RGWRESTOp {
+
+public:
+  RGWOp_Caps_Add() {}
+
+  int check_caps(RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute() override;
+
+  const char* name() const override { return "add_user_caps"; }
+};
+
+// Adds the capabilities given in "user-caps" to the user named by "uid".
+void RGWOp_Caps_Add::execute()
+{
+  std::string uid_str;
+  std::string caps;
+
+  RGWUserAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+
RESTArgs::get_string(s, "user-caps", caps, &caps); + + op_state.set_user_id(uid); + op_state.set_caps(caps); + + http_ret = RGWUserAdminOp_Caps::add(store, op_state, flusher); +} + +class RGWOp_Caps_Remove : public RGWRESTOp { + +public: + RGWOp_Caps_Remove() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "remove_user_caps"; } +}; + +void RGWOp_Caps_Remove::execute() +{ + std::string uid_str; + std::string caps; + + RGWUserAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "user-caps", caps, &caps); + + op_state.set_user_id(uid); + op_state.set_caps(caps); + + http_ret = RGWUserAdminOp_Caps::remove(store, op_state, flusher); +} + +struct UserQuotas { + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + + UserQuotas() {} + + explicit UserQuotas(RGWUserInfo& info) : bucket_quota(info.bucket_quota), + user_quota(info.user_quota) {} + + void dump(Formatter *f) const { + encode_json("bucket_quota", bucket_quota, f); + encode_json("user_quota", user_quota, f); + } + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); + JSONDecoder::decode_json("user_quota", user_quota, obj); + } +}; + +class RGWOp_Quota_Info : public RGWRESTOp { + +public: + RGWOp_Quota_Info() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_READ); + } + + void execute() override; + + const char* name() const override { return "get_quota_info"; } +}; + + +void RGWOp_Quota_Info::execute() +{ + RGWUserAdminOpState op_state; + + std::string uid_str; + std::string quota_type; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "quota-type", quota_type, "a_type); + + if (uid_str.empty()) { + http_ret = -EINVAL; + return; + } + + rgw_user uid(uid_str); + + bool show_all = 
quota_type.empty(); + bool show_bucket = show_all || (quota_type == "bucket"); + bool show_user = show_all || (quota_type == "user"); + + if (!(show_all || show_bucket || show_user)) { + http_ret = -EINVAL; + return; + } + + op_state.set_user_id(uid); + + RGWUser user; + http_ret = user.init(store, op_state); + if (http_ret < 0) + return; + + if (!op_state.has_existing_user()) { + http_ret = -ERR_NO_SUCH_USER; + return; + } + + RGWUserInfo info; + string err_msg; + http_ret = user.info(info, &err_msg); + if (http_ret < 0) + return; + + flusher.start(0); + if (show_all) { + UserQuotas quotas(info); + encode_json("quota", quotas, s->formatter); + } else if (show_user) { + encode_json("user_quota", info.user_quota, s->formatter); + } else { + encode_json("bucket_quota", info.bucket_quota, s->formatter); + } + + flusher.flush(); +} + +class RGWOp_Quota_Set : public RGWRESTOp { + +public: + RGWOp_Quota_Set() {} + + int check_caps(RGWUserCaps& caps) override { + return caps.check_cap("users", RGW_CAP_WRITE); + } + + void execute() override; + + const char* name() const override { return "set_quota_info"; } +}; + +/** + * set quota + * + * two different ways to set the quota info: as json struct in the message body or via http params. + * + * as json: + * + * PUT /admin/user?uid=["a-type=] + * + * whereas quota-type is optional and is either user, or bucket + * + * if quota-type is not specified then we expect to get a structure that contains both quotas, + * otherwise we'll only get the relevant configuration. 
+ * + * E.g., if quota type not specified: + * { + * "user_quota" : { + * "max_size_kb" : 4096, + * "max_objects" : -1, + * "enabled" : false + * }, + * "bucket_quota" : { + * "max_size_kb" : 1024, + * "max_objects" : -1, + * "enabled" : true + * } + * } + * + * + * or if quota type is specified: + * { + * "max_size_kb" : 4096, + * "max_objects" : -1, + * "enabled" : false + * } + * + * Another option is not to pass any body and set the following http params: + * + * + * max-size-kb= + * max-objects= + * enabled[={true,false}] + * + * all params are optionals and default to the current settings. With this type of configuration the + * quota-type param is mandatory. + * + */ + +void RGWOp_Quota_Set::execute() +{ + RGWUserAdminOpState op_state; + + std::string uid_str; + std::string quota_type; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "quota-type", quota_type, "a_type); + + if (uid_str.empty()) { + http_ret = -EINVAL; + return; + } + + rgw_user uid(uid_str); + + bool set_all = quota_type.empty(); + bool set_bucket = set_all || (quota_type == "bucket"); + bool set_user = set_all || (quota_type == "user"); + + if (!(set_all || set_bucket || set_user)) { + ldout(store->ctx(), 20) << "invalid quota type" << dendl; + http_ret = -EINVAL; + return; + } + + bool use_http_params; + + if (s->content_length > 0) { + use_http_params = false; + } else { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + use_http_params = (!encoding || strcmp(encoding, "chunked") != 0); + } + + if (use_http_params && set_all) { + ldout(store->ctx(), 20) << "quota type was not specified, can't set all quotas via http headers" << dendl; + http_ret = -EINVAL; + return; + } + + op_state.set_user_id(uid); + + RGWUser user; + http_ret = user.init(store, op_state); + if (http_ret < 0) { + ldout(store->ctx(), 20) << "failed initializing user info: " << http_ret << dendl; + return; + } + + if (!op_state.has_existing_user()) { + http_ret = 
-ERR_NO_SUCH_USER; + return; + } + +#define QUOTA_INPUT_MAX_LEN 1024 + if (set_all) { + UserQuotas quotas; + + if ((http_ret = rgw_rest_get_json_input(store->ctx(), s, quotas, QUOTA_INPUT_MAX_LEN, NULL)) < 0) { + ldout(store->ctx(), 20) << "failed to retrieve input" << dendl; + return; + } + + op_state.set_user_quota(quotas.user_quota); + op_state.set_bucket_quota(quotas.bucket_quota); + } else { + RGWQuotaInfo quota; + + if (!use_http_params) { + bool empty; + http_ret = rgw_rest_get_json_input(store->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty); + if (http_ret < 0) { + ldout(store->ctx(), 20) << "failed to retrieve input" << dendl; + if (!empty) + return; + + /* was probably chunked input, but no content provided, configure via http params */ + use_http_params = true; + } + } + + if (use_http_params) { + RGWUserInfo info; + string err_msg; + http_ret = user.info(info, &err_msg); + if (http_ret < 0) { + ldout(store->ctx(), 20) << "failed to get user info: " << http_ret << dendl; + return; + } + RGWQuotaInfo *old_quota; + if (set_user) { + old_quota = &info.user_quota; + } else { + old_quota = &info.bucket_quota; + } + + RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects); + RESTArgs::get_int64(s, "max-size", old_quota->max_size, "a.max_size); + int64_t max_size_kb; + bool has_max_size_kb = false; + RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb); + if (has_max_size_kb) { + quota.max_size = max_size_kb * 1024; + } + RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled); + } + + if (set_user) { + op_state.set_user_quota(quota); + } else { + op_state.set_bucket_quota(quota); + } + } + + string err; + http_ret = user.modify(op_state, &err); + if (http_ret < 0) { + ldout(store->ctx(), 20) << "failed updating user info: " << http_ret << ": " << err << dendl; + return; + } +} + +RGWOp *RGWHandler_User::op_get() +{ + if (s->info.args.sub_resource_exists("quota")) + return new RGWOp_Quota_Info; + + if 
(s->info.args.sub_resource_exists("list"))
+    return new RGWOp_User_List;
+
+  return new RGWOp_User_Info;
+}
+
+// PUT dispatch: "subuser", "key", "caps" and "quota" sub-resources select
+// their create/add/set ops; anything else creates a user.
+RGWOp *RGWHandler_User::op_put()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Create;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Create;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Add;
+
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Set;
+
+  return new RGWOp_User_Create;
+}
+
+// POST dispatch: "subuser" modifies a subuser; anything else modifies a user.
+RGWOp *RGWHandler_User::op_post()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Modify;
+
+  return new RGWOp_User_Modify;
+}
+
+// DELETE dispatch: "subuser", "key" and "caps" sub-resources select their
+// remove ops; anything else removes a user.
+RGWOp *RGWHandler_User::op_delete()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Remove;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Remove;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Remove;
+
+  return new RGWOp_User_Remove;
+}
+
diff --git a/src/rgw/rgw_rest_user.h b/src/rgw/rgw_rest_user.h
new file mode 100644
index 00000000..047fe5ff
--- /dev/null
+++ b/src/rgw/rgw_rest_user.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_REST_USER_H
+#define CEPH_RGW_REST_USER_H
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+// REST handler for the user admin resource; maps each HTTP verb to an op.
+class RGWHandler_User : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_post() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_User() override = default;
+
+  // always returns 0; nothing is read here
+  int read_permissions(RGWOp*) override {
+    return 0;
+  }
+};
+
+// REST manager whose get_handler returns a new RGWHandler_User.
+class RGWRESTMgr_User : public RGWRESTMgr {
+public:
+  RGWRESTMgr_User() = default;
+  ~RGWRESTMgr_User() override = default;
+
+  RGWHandler_REST *get_handler(struct req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+
return new RGWHandler_User(auth_registry); + } +}; + +#endif diff --git a/src/rgw/rgw_rest_user_policy.cc b/src/rgw/rgw_rest_user_policy.cc new file mode 100644 index 00000000..d93f69ae --- /dev/null +++ b/src/rgw/rgw_rest_user_policy.cc @@ -0,0 +1,363 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_user_policy.h" + +#define dout_subsys ceph_subsys_rgw + +using rgw::IAM::Policy; + +void RGWRestUserPolicy::dump(Formatter *f) const +{ + encode_json("Policyname", policy_name , f); + encode_json("Username", user_name , f); + encode_json("Policydocument", policy, f); +} + +void RGWRestUserPolicy::send_response() +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s); +} + +int RGWRestUserPolicy::verify_permission() +{ + if (s->auth.identity->is_anonymous()) { + return -EACCES; + } + + if(int ret = check_caps(s->user->caps); ret == 0) { + return ret; + } + + uint64_t op = get_op(); + string user_name = s->info.args.get("UserName"); + rgw_user user_id(user_name); + if (! verify_user_permission(this, s, rgw::ARN(rgw::ARN(user_id.id, + "user", + user_id.tenant)), op)) { + return -EACCES; + } + return 0; +} + +bool RGWRestUserPolicy::validate_input() +{ + if (policy_name.length() > MAX_POLICY_NAME_LEN) { + ldout(s->cct, 0) << "ERROR: Invalid policy name length " << dendl; + return false; + } + + std::regex regex_policy_name("[A-Za-z0-9:=,.@-]+"); + if (! 
std::regex_match(policy_name, regex_policy_name)) {
+    ldout(s->cct, 0) << "ERROR: Invalid chars in policy name " << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+// Read ops require the "user-policy" capability with RGW_CAP_READ.
+int RGWUserPolicyRead::check_caps(RGWUserCaps& caps)
+{
+    return caps.check_cap("user-policy", RGW_CAP_READ);
+}
+
+// Write ops require the "user-policy" capability with RGW_CAP_WRITE.
+int RGWUserPolicyWrite::check_caps(RGWUserCaps& caps)
+{
+    return caps.check_cap("user-policy", RGW_CAP_WRITE);
+}
+
+uint64_t RGWPutUserPolicy::get_op()
+{
+  return rgw::IAM::iamPutUserPolicy;
+}
+
+// URL-decodes PolicyName, UserName and PolicyDocument; returns -EINVAL when
+// any is empty or the policy name fails validate_input().
+int RGWPutUserPolicy::get_params()
+{
+  policy_name = url_decode(s->info.args.get("PolicyName"), true);
+  user_name = url_decode(s->info.args.get("UserName"), true);
+  policy = url_decode(s->info.args.get("PolicyDocument"), true);
+
+  if (policy_name.empty() || user_name.empty() || policy.empty()) {
+    ldout(s->cct, 20) << "ERROR: one of policy name, user name or policy document is empty"
+    << dendl;
+    return -EINVAL;
+  }
+
+  if (! validate_input()) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Parses the policy document, merges it into the user's
+// RGW_ATTR_USER_POLICY attribute under policy_name and stores the user.
+void RGWPutUserPolicy::execute()
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  bufferlist bl = bufferlist::static_from_string(policy);
+
+  RGWUserInfo info;
+  rgw_user user_id(user_name);
+  op_ret = rgw_get_user_info_by_uid(store, user_id, info);
+  if (op_ret < 0) {
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+
+  map<string, bufferlist> uattrs;
+  op_ret = rgw_get_user_attrs_by_uid(store, user_id, uattrs);
+  if (op_ret == -ENOENT) {
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+
+  try {
+    // constructing the Policy validates the document; a parse failure throws
+    const Policy p(s->cct, s->user->user_id.tenant, bl);
+    map<string, string> policies;
+    if (auto it = uattrs.find(RGW_ATTR_USER_POLICY); it != uattrs.end()) {
+      bufferlist out_bl = uattrs[RGW_ATTR_USER_POLICY];
+      decode(policies, out_bl);
+    }
+    bufferlist in_bl;
+    policies[policy_name] = policy;
+    encode(policies, in_bl);
+    uattrs[RGW_ATTR_USER_POLICY] = in_bl;
+
+    RGWObjVersionTracker objv_tracker;
+    op_ret = rgw_store_user_info(store, info, &info, &objv_tracker, real_time(), false, &uattrs);
+    if (op_ret < 0) {
+      op_ret =
-ERR_INTERNAL_ERROR; + } + } catch (rgw::IAM::PolicyParseException& e) { + ldout(s->cct, 20) << "failed to parse policy: " << e.what() << dendl; + op_ret = -ERR_MALFORMED_DOC; + } + + if (op_ret == 0) { + s->formatter->open_object_section("PutUserPolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } +} + +uint64_t RGWGetUserPolicy::get_op() +{ + return rgw::IAM::iamGetUserPolicy; +} + +int RGWGetUserPolicy::get_params() +{ + policy_name = s->info.args.get("PolicyName"); + user_name = s->info.args.get("UserName"); + + if (policy_name.empty() || user_name.empty()) { + ldout(s->cct, 20) << "ERROR: one of policy name or user name is empty" + << dendl; + return -EINVAL; + } + + return 0; +} + +void RGWGetUserPolicy::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + rgw_user user_id(user_name); + map uattrs; + op_ret = rgw_get_user_attrs_by_uid(store, user_id, uattrs); + if (op_ret == -ENOENT) { + ldout(s->cct, 0) << "ERROR: attrs not found for user" << user_name << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + if (op_ret == 0) { + s->formatter->open_object_section("GetUserPolicyResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->open_object_section("GetUserPolicyResult"); + map policies; + if (auto it = uattrs.find(RGW_ATTR_USER_POLICY); it != uattrs.end()) { + bufferlist bl = uattrs[RGW_ATTR_USER_POLICY]; + decode(policies, bl); + if (auto it = policies.find(policy_name); it != policies.end()) { + policy = policies[policy_name]; + dump(s->formatter); + } else { + ldout(s->cct, 0) << "ERROR: policy not found" << policy << dendl; + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + } else { + ldout(s->cct, 0) << "ERROR: RGW_ATTR_USER_POLICY not found" << dendl; + op_ret = 
-ERR_NO_SUCH_ENTITY;
+      return;
+    }
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+  if (op_ret < 0) {
+    op_ret = -ERR_INTERNAL_ERROR;
+  }
+}
+
+uint64_t RGWListUserPolicies::get_op()
+{
+  return rgw::IAM::iamListUserPolicies;
+}
+
+// Reads UserName; returns -EINVAL when it is empty.
+int RGWListUserPolicies::get_params()
+{
+  user_name = s->info.args.get("UserName");
+
+  if (user_name.empty()) {
+    ldout(s->cct, 20) << "ERROR: user name is empty" << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Emits the names of all policies stored in the user's
+// RGW_ATTR_USER_POLICY attribute; a missing user or attribute gives
+// -ERR_NO_SUCH_ENTITY.
+void RGWListUserPolicies::execute()
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  rgw_user user_id(user_name);
+  map<string, bufferlist> uattrs;
+  op_ret = rgw_get_user_attrs_by_uid(store, user_id, uattrs);
+  if (op_ret == -ENOENT) {
+    ldout(s->cct, 0) << "ERROR: attrs not found for user" << user_name << dendl;
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+
+  if (op_ret == 0) {
+    map<string, string> policies;
+    if (auto it = uattrs.find(RGW_ATTR_USER_POLICY); it != uattrs.end()) {
+      s->formatter->open_object_section("ListUserPoliciesResponse");
+      s->formatter->open_object_section("ResponseMetadata");
+      s->formatter->dump_string("RequestId", s->trans_id);
+      s->formatter->close_section();
+      s->formatter->open_object_section("ListUserPoliciesResult");
+      bufferlist bl = uattrs[RGW_ATTR_USER_POLICY];
+      try {
+        decode(policies, bl);
+      } catch (buffer::error& err) {
+        // corrupt attribute: report an error rather than let the exception escape
+        ldout(s->cct, 0) << "ERROR: failed to decode user policies" << dendl;
+        op_ret = -EIO;
+        return;
+      }
+      for (const auto& p : policies) {
+        s->formatter->open_object_section("PolicyNames");
+        s->formatter->dump_string("member", p.first);
+        s->formatter->close_section();
+      }
+      s->formatter->close_section();
+      s->formatter->close_section();
+    } else {
+      ldout(s->cct, 0) << "ERROR: RGW_ATTR_USER_POLICY not found" << dendl;
+      op_ret = -ERR_NO_SUCH_ENTITY;
+      return;
+    }
+  }
+  if (op_ret < 0) {
+    op_ret = -ERR_INTERNAL_ERROR;
+  }
+}
+
+uint64_t RGWDeleteUserPolicy::get_op()
+{
+  return rgw::IAM::iamDeleteUserPolicy;
+}
+
+// Reads PolicyName and UserName; returns -EINVAL when either is empty.
+int RGWDeleteUserPolicy::get_params()
+{
+  policy_name = s->info.args.get("PolicyName");
+  user_name = s->info.args.get("UserName");
+
+  if (policy_name.empty() || user_name.empty()) {
+
ldout(s->cct, 20) << "ERROR: One of policy name or user name is empty"<< dendl; + return -EINVAL; + } + + return 0; +} + +void RGWDeleteUserPolicy::execute() +{ + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + RGWUserInfo info; + rgw_user user_id(user_name); + op_ret = rgw_get_user_info_by_uid(store, user_id, info); + if (op_ret < 0) { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + map uattrs; + op_ret = rgw_get_user_attrs_by_uid(store, user_id, uattrs); + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + + map policies; + if (auto it = uattrs.find(RGW_ATTR_USER_POLICY); it != uattrs.end()) { + bufferlist out_bl = uattrs[RGW_ATTR_USER_POLICY]; + decode(policies, out_bl); + + if (auto p = policies.find(policy_name); p != policies.end()) { + bufferlist in_bl; + policies.erase(p); + encode(policies, in_bl); + uattrs[RGW_ATTR_USER_POLICY] = in_bl; + + RGWObjVersionTracker objv_tracker; + op_ret = rgw_store_user_info(store, info, &info, &objv_tracker, real_time(), false, &uattrs); + if (op_ret < 0) { + op_ret = -ERR_INTERNAL_ERROR; + } + if (op_ret == 0) { + s->formatter->open_object_section("DeleteUserPoliciesResponse"); + s->formatter->open_object_section("ResponseMetadata"); + s->formatter->dump_string("RequestId", s->trans_id); + s->formatter->close_section(); + s->formatter->close_section(); + } + } else { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } + } else { + op_ret = -ERR_NO_SUCH_ENTITY; + return; + } +} diff --git a/src/rgw/rgw_rest_user_policy.h b/src/rgw/rgw_rest_user_policy.h new file mode 100644 index 00000000..895f4e61 --- /dev/null +++ b/src/rgw/rgw_rest_user_policy.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_REST_USER_POLICY_H +#define CEPH_RGW_REST_USER_POLICY_H + +class RGWRestUserPolicy : public RGWRESTOp { +protected: + static constexpr int MAX_POLICY_NAME_LEN = 128; + string policy_name; + string user_name; + 
string policy;
+
+  bool validate_input();
+
+public:
+  int verify_permission() override;
+  // IAM action identifier checked by verify_permission()
+  virtual uint64_t get_op() = 0;
+  void send_response() override;
+  void dump(Formatter *f) const;
+};
+
+// Base for read-only user-policy ops; check_caps is defined in the .cc file.
+class RGWUserPolicyRead : public RGWRestUserPolicy {
+public:
+  RGWUserPolicyRead() = default;
+  int check_caps(RGWUserCaps& caps) override;
+};
+
+// Base for mutating user-policy ops; check_caps is defined in the .cc file.
+class RGWUserPolicyWrite : public RGWRestUserPolicy {
+public:
+  RGWUserPolicyWrite() = default;
+  int check_caps(RGWUserCaps& caps) override;
+};
+
+// PutUserPolicy op.
+// NOTE(review): the op name uses a hyphen ("put_user-policy") while the
+// sibling ops use underscores only; confirm whether that is intentional.
+class RGWPutUserPolicy : public RGWUserPolicyWrite {
+public:
+  RGWPutUserPolicy() = default;
+  void execute() override;
+  int get_params();
+  const char* name() const override { return "put_user-policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_PUT_USER_POLICY; }
+};
+
+// GetUserPolicy op.
+class RGWGetUserPolicy : public RGWUserPolicyRead {
+public:
+  RGWGetUserPolicy() = default;
+  void execute() override;
+  int get_params();
+  const char* name() const override { return "get_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_GET_USER_POLICY; }
+};
+
+// ListUserPolicies op.
+class RGWListUserPolicies : public RGWUserPolicyRead {
+public:
+  RGWListUserPolicies() = default;
+  void execute() override;
+  int get_params();
+  const char* name() const override { return "list_user_policies"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_LIST_USER_POLICIES; }
+};
+
+// DeleteUserPolicy op.
+class RGWDeleteUserPolicy : public RGWUserPolicyWrite {
+public:
+  RGWDeleteUserPolicy() = default;
+  void execute() override;
+  int get_params();
+  const char* name() const override { return "delete_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_DELETE_USER_POLICY; }
+};
+
+#endif /* CEPH_RGW_REST_USER_POLICY_H */
+
diff --git a/src/rgw/rgw_role.cc b/src/rgw/rgw_role.cc
new file mode 100644
index 00000000..6e6b137a
--- /dev/null
+++ b/src/rgw/rgw_role.cc
@@ -0,0 +1,502 @@
+// -*- mode:C++; tab-width:8;
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" +#include "rgw_rados.h" +#include "rgw_zone.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_role.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + + +const string RGWRole::role_name_oid_prefix = "role_names."; +const string RGWRole::role_oid_prefix = "roles."; +const string RGWRole::role_path_oid_prefix = "role_paths."; +const string RGWRole::role_arn_prefix = "arn:aws:iam::"; + +int RGWRole::store_info(bool exclusive) +{ + using ceph::encode; + string oid = get_info_oid_prefix() + id; + + bufferlist bl; + encode(*this, bl); + return rgw_put_system_obj(store, store->svc.zone->get_zone_params().roles_pool, oid, + bl, exclusive, NULL, real_time(), NULL); +} + +int RGWRole::store_name(bool exclusive) +{ + RGWNameToId nameToId; + nameToId.obj_id = id; + + string oid = tenant + get_names_oid_prefix() + name; + + bufferlist bl; + using ceph::encode; + encode(nameToId, bl); + return rgw_put_system_obj(store, store->svc.zone->get_zone_params().roles_pool, oid, + bl, exclusive, NULL, real_time(), NULL); +} + +int RGWRole::store_path(bool exclusive) +{ + string oid = tenant + get_path_oid_prefix() + path + get_info_oid_prefix() + id; + + bufferlist bl; + return rgw_put_system_obj(store, store->svc.zone->get_zone_params().roles_pool, oid, + bl, exclusive, NULL, real_time(), NULL); +} + +int RGWRole::create(bool exclusive) +{ + int ret; + + if (! 
validate_input()) { + return -EINVAL; + } + + /* check to see the name is not used */ + ret = read_id(name, tenant, id); + if (exclusive && ret == 0) { + ldout(cct, 0) << "ERROR: name " << name << " already in use for role id " + << id << dendl; + return -EEXIST; + } else if ( ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "failed reading role id " << id << ": " + << cpp_strerror(-ret) << dendl; + return ret; + } + + /* create unique id */ + uuid_d new_uuid; + char uuid_str[37]; + new_uuid.generate_random(); + new_uuid.print(uuid_str); + id = uuid_str; + + //arn + arn = role_arn_prefix + tenant + ":role" + path + name; + + // Creation time + real_clock::time_point t = real_clock::now(); + + struct timeval tv; + real_clock::to_timeval(t, tv); + + char buf[30]; + struct tm result; + gmtime_r(&tv.tv_sec, &result); + strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result); + sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000); + creation_date.assign(buf, strlen(buf)); + + auto& pool = store->svc.zone->get_zone_params().roles_pool; + ret = store_info(exclusive); + if (ret < 0) { + ldout(cct, 0) << "ERROR: storing role info in pool: " << pool.name << ": " + << id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = store_name(exclusive); + if (ret < 0) { + ldout(cct, 0) << "ERROR: storing role name in pool: " << pool.name << ": " + << name << ": " << cpp_strerror(-ret) << dendl; + + //Delete the role info that was stored in the previous call + string oid = get_info_oid_prefix() + id; + int info_ret = rgw_delete_system_obj(store, pool, oid, NULL); + if (info_ret < 0) { + ldout(cct, 0) << "ERROR: cleanup of role id from pool: " << pool.name << ": " + << id << ": " << cpp_strerror(-info_ret) << dendl; + } + return ret; + } + + ret = store_path(exclusive); + if (ret < 0) { + ldout(cct, 0) << "ERROR: storing role path in pool: " << pool.name << ": " + << path << ": " << cpp_strerror(-ret) << dendl; + //Delete the role info that was stored in the previous call + 
string oid = get_info_oid_prefix() + id; + int info_ret = rgw_delete_system_obj(store, pool, oid, NULL); + if (info_ret < 0) { + ldout(cct, 0) << "ERROR: cleanup of role id from pool: " << pool.name << ": " + << id << ": " << cpp_strerror(-info_ret) << dendl; + } + //Delete role name that was stored in previous call + oid = tenant + get_names_oid_prefix() + name; + int name_ret = rgw_delete_system_obj(store, pool, oid, NULL); + if (name_ret < 0) { + ldout(cct, 0) << "ERROR: cleanup of role name from pool: " << pool.name << ": " + << name << ": " << cpp_strerror(-name_ret) << dendl; + } + return ret; + } + return 0; +} + +int RGWRole::delete_obj() +{ + auto& pool = store->svc.zone->get_zone_params().roles_pool; + + int ret = read_name(); + if (ret < 0) { + return ret; + } + + ret = read_info(); + if (ret < 0) { + return ret; + } + + if (! perm_policy_map.empty()) { + return -ERR_DELETE_CONFLICT; + } + + // Delete id + string oid = get_info_oid_prefix() + id; + ret = rgw_delete_system_obj(store, pool, oid, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: deleting role id from pool: " << pool.name << ": " + << id << ": " << cpp_strerror(-ret) << dendl; + } + + // Delete name + oid = tenant + get_names_oid_prefix() + name; + ret = rgw_delete_system_obj(store, pool, oid, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: deleting role name from pool: " << pool.name << ": " + << name << ": " << cpp_strerror(-ret) << dendl; + } + + // Delete path + oid = tenant + get_path_oid_prefix() + path + get_info_oid_prefix() + id; + ret = rgw_delete_system_obj(store, pool, oid, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: deleting role path from pool: " << pool.name << ": " + << path << ": " << cpp_strerror(-ret) << dendl; + } + return ret; +} + +int RGWRole::get() +{ + int ret = read_name(); + if (ret < 0) { + return ret; + } + + ret = read_info(); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWRole::get_by_id() +{ + int ret = read_info(); + if (ret < 0) { + 
return ret; + } + + return 0; +} + +int RGWRole::update() +{ + auto& pool = store->svc.zone->get_zone_params().roles_pool; + + int ret = store_info(false); + if (ret < 0) { + ldout(cct, 0) << "ERROR: storing info in pool: " << pool.name << ": " + << id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +void RGWRole::set_perm_policy(const string& policy_name, const string& perm_policy) +{ + perm_policy_map[policy_name] = perm_policy; +} + +vector RGWRole::get_role_policy_names() +{ + vector policy_names; + for (const auto& it : perm_policy_map) + { + policy_names.push_back(std::move(it.first)); + } + + return policy_names; +} + +int RGWRole::get_role_policy(const string& policy_name, string& perm_policy) +{ + const auto it = perm_policy_map.find(policy_name); + if (it == perm_policy_map.end()) { + ldout(cct, 0) << "ERROR: Policy name: " << policy_name << " not found" << dendl; + return -ENOENT; + } else { + perm_policy = it->second; + } + return 0; +} + +int RGWRole::delete_policy(const string& policy_name) +{ + const auto& it = perm_policy_map.find(policy_name); + if (it == perm_policy_map.end()) { + ldout(cct, 0) << "ERROR: Policy name: " << policy_name << " not found" << dendl; + return -ENOENT; + } else { + perm_policy_map.erase(it); + } + return 0; +} + +void RGWRole::dump(Formatter *f) const +{ + encode_json("RoleId", id , f); + encode_json("RoleName", name , f); + encode_json("Path", path, f); + encode_json("Arn", arn, f); + encode_json("CreateDate", creation_date, f); + encode_json("MaxSessionDuration", max_session_duration, f); + encode_json("AssumeRolePolicyDocument", trust_policy, f); +} + +void RGWRole::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("path", path, obj); + JSONDecoder::decode_json("arn", arn, obj); + JSONDecoder::decode_json("create_date", creation_date, obj); + JSONDecoder::decode_json("max_session_duration", 
max_session_duration, obj); + JSONDecoder::decode_json("assume_role_policy_document", trust_policy, obj); +} + +int RGWRole::read_id(const string& role_name, const string& tenant, string& role_id) +{ + auto& pool = store->svc.zone->get_zone_params().roles_pool; + string oid = tenant + get_names_oid_prefix() + role_name; + bufferlist bl; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL); + if (ret < 0) { + return ret; + } + + RGWNameToId nameToId; + try { + auto iter = bl.cbegin(); + using ceph::decode; + decode(nameToId, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode role from pool: " << pool.name << ": " + << role_name << dendl; + return -EIO; + } + role_id = nameToId.obj_id; + return 0; +} + +int RGWRole::read_info() +{ + auto& pool = store->svc.zone->get_zone_params().roles_pool; + string oid = get_info_oid_prefix() + id; + bufferlist bl; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed reading role info from pool: " << pool.name << + ": " << id << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(*this, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode role info from pool: " << pool.name << + ": " << id << dendl; + return -EIO; + } + + return 0; +} + +int RGWRole::read_name() +{ + auto& pool = store->svc.zone->get_zone_params().roles_pool; + string oid = tenant + get_names_oid_prefix() + name; + bufferlist bl; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: failed reading role name from pool: " << pool.name << ": " + << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + 
RGWNameToId nameToId; + try { + using ceph::decode; + auto iter = bl.cbegin(); + decode(nameToId, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode role name from pool: " << pool.name << ": " + << name << dendl; + return -EIO; + } + id = nameToId.obj_id; + return 0; +} + +bool RGWRole::validate_input() +{ + if (name.length() > MAX_ROLE_NAME_LEN) { + ldout(cct, 0) << "ERROR: Invalid name length " << dendl; + return false; + } + + if (path.length() > MAX_PATH_NAME_LEN) { + ldout(cct, 0) << "ERROR: Invalid path length " << dendl; + return false; + } + + std::regex regex_name("[A-Za-z0-9:=,.@-]+"); + if (! std::regex_match(name, regex_name)) { + ldout(cct, 0) << "ERROR: Invalid chars in name " << dendl; + return false; + } + + std::regex regex_path("(/[!-~]+/)|(/)"); + if (! std::regex_match(path,regex_path)) { + ldout(cct, 0) << "ERROR: Invalid chars in path " << dendl; + return false; + } + + if (max_session_duration < SESSION_DURATION_MIN || + max_session_duration > SESSION_DURATION_MAX) { + ldout(cct, 0) << "ERROR: Invalid session duration, should be between 3600 and 43200 seconds " << dendl; + return false; + } + return true; +} + +void RGWRole::extract_name_tenant(const std::string& str) +{ + size_t pos = str.find('$'); + if (pos != std::string::npos) { + tenant = str.substr(0, pos); + name = str.substr(pos + 1); + } +} + +void RGWRole::update_trust_policy(string& trust_policy) +{ + this->trust_policy = trust_policy; +} + +int RGWRole::get_roles_by_path_prefix(RGWRados *store, + CephContext *cct, + const string& path_prefix, + const string& tenant, + vector& roles) +{ + auto pool = store->svc.zone->get_zone_params().roles_pool; + string prefix; + + // List all roles if path prefix is empty + if (! 
path_prefix.empty()) { + prefix = tenant + role_path_oid_prefix + path_prefix; + } else { + prefix = tenant + role_path_oid_prefix; + } + + //Get the filtered objects + list result; + bool is_truncated; + RGWListRawObjsCtx ctx; + do { + list oids; + int r = store->list_raw_objects(pool, prefix, 1000, ctx, oids, &is_truncated); + if (r < 0) { + ldout(cct, 0) << "ERROR: listing filtered objects failed: " << pool.name << ": " + << prefix << ": " << cpp_strerror(-r) << dendl; + return r; + } + for (const auto& iter : oids) { + result.push_back(iter.substr(role_path_oid_prefix.size())); + } + } while (is_truncated); + + for (const auto& it : result) { + //Find the role oid prefix from the end + size_t pos = it.rfind(role_oid_prefix); + if (pos == string::npos) { + continue; + } + // Split the result into path and info_oid + id + string path = it.substr(0, pos); + + /*Make sure that prefix is part of path (False results could've been returned) + because of the role info oid + id appended to the path)*/ + if(path_prefix.empty() || path.find(path_prefix) != string::npos) { + //Get id from info oid prefix + id + string id = it.substr(pos + role_oid_prefix.length()); + + RGWRole role(cct, store); + role.set_id(id); + int ret = role.read_info(); + if (ret < 0) { + return ret; + } + roles.push_back(std::move(role)); + } + } + + return 0; +} + +const string& RGWRole::get_names_oid_prefix() +{ + return role_name_oid_prefix; +} + +const string& RGWRole::get_info_oid_prefix() +{ + return role_oid_prefix; +} + +const string& RGWRole::get_path_oid_prefix() +{ + return role_path_oid_prefix; +} diff --git a/src/rgw/rgw_role.h b/src/rgw/rgw_role.h new file mode 100644 index 00000000..90976099 --- /dev/null +++ b/src/rgw/rgw_role.h @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_ROLE_H +#define CEPH_RGW_ROLE_H + +#include + +#include "common/ceph_context.h" + +#include "rgw/rgw_rados.h" + +class 
RGWRole +{ + using string = std::string; + static const string role_name_oid_prefix; + static const string role_oid_prefix; + static const string role_path_oid_prefix; + static const string role_arn_prefix; + static constexpr int MAX_ROLE_NAME_LEN = 64; + static constexpr int MAX_PATH_NAME_LEN = 512; + static constexpr uint64_t SESSION_DURATION_MIN = 3600; // in seconds + static constexpr uint64_t SESSION_DURATION_MAX = 43200; // in seconds + + CephContext *cct; + RGWRados *store; + string id; + string name; + string path; + string arn; + string creation_date; + string trust_policy; + map perm_policy_map; + string tenant; + uint64_t max_session_duration; + + int store_info(bool exclusive); + int store_name(bool exclusive); + int store_path(bool exclusive); + int read_id(const string& role_name, const string& tenant, string& role_id); + int read_name(); + int read_info(); + void set_id(const string& id) { this->id = id; } + bool validate_input(); + void extract_name_tenant(const std::string& str); + +public: + RGWRole(CephContext *cct, + RGWRados *store, + string name, + string path, + string trust_policy, + string tenant, + string max_session_duration_str="") + : cct(cct), + store(store), + name(std::move(name)), + path(std::move(path)), + trust_policy(std::move(trust_policy)), + tenant(std::move(tenant)) { + if (this->path.empty()) + this->path = "/"; + extract_name_tenant(this->name); + if (max_session_duration_str.empty()) { + max_session_duration = SESSION_DURATION_MIN; + } else { + max_session_duration = std::stoull(max_session_duration_str); + } + } + + RGWRole(CephContext *cct, + RGWRados *store, + string name, + string tenant) + : cct(cct), + store(store), + name(std::move(name)), + tenant(std::move(tenant)) { + extract_name_tenant(this->name); + } + + RGWRole(CephContext *cct, + RGWRados *store, + string id) + : cct(cct), + store(store), + id(std::move(id)) {} + + RGWRole(CephContext *cct, + RGWRados *store) + : cct(cct), + store(store) {} + + RGWRole() {} 
+ + ~RGWRole() = default; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(id, bl); + encode(name, bl); + encode(path, bl); + encode(arn, bl); + encode(creation_date, bl); + encode(trust_policy, bl); + encode(perm_policy_map, bl); + encode(tenant, bl); + encode(max_session_duration, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(id, bl); + decode(name, bl); + decode(path, bl); + decode(arn, bl); + decode(creation_date, bl); + decode(trust_policy, bl); + decode(perm_policy_map, bl); + if (struct_v >= 2) { + decode(tenant, bl); + } + if (struct_v >= 3) { + decode(max_session_duration, bl); + } + DECODE_FINISH(bl); + } + + const string& get_id() const { return id; } + const string& get_name() const { return name; } + const string& get_path() const { return path; } + const string& get_create_date() const { return creation_date; } + const string& get_assume_role_policy() const { return trust_policy;} + const uint64_t& get_max_session_duration() const { return max_session_duration; } + + int create(bool exclusive); + int delete_obj(); + int get(); + int get_by_id(); + int update(); + void update_trust_policy(string& trust_policy); + void set_perm_policy(const string& policy_name, const string& perm_policy); + vector get_role_policy_names(); + int get_role_policy(const string& policy_name, string& perm_policy); + int delete_policy(const string& policy_name); + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + static const string& get_names_oid_prefix(); + static const string& get_info_oid_prefix(); + static const string& get_path_oid_prefix(); + static int get_roles_by_path_prefix(RGWRados *store, + CephContext *cct, + const string& path_prefix, + const string& tenant, + vector& roles); +}; +WRITE_CLASS_ENCODER(RGWRole) +#endif /* CEPH_RGW_ROLE_H */ + diff --git a/src/rgw/rgw_service.cc b/src/rgw/rgw_service.cc new file mode 100644 index 00000000..0369806c --- 
/dev/null +++ b/src/rgw/rgw_service.cc @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_service.h" + +#include "services/svc_finisher.h" +#include "services/svc_notify.h" +#include "services/svc_rados.h" +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_quota.h" +#include "services/svc_sync_modules.h" +#include "services/svc_sys_obj.h" +#include "services/svc_sys_obj_cache.h" +#include "services/svc_sys_obj_core.h" + +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rgw + + +RGWServices_Def::RGWServices_Def() = default; +RGWServices_Def::~RGWServices_Def() +{ + shutdown(); +} + +int RGWServices_Def::init(CephContext *cct, + bool have_cache, + bool raw) +{ + finisher = std::make_unique(cct); + notify = std::make_unique(cct); + rados = std::make_unique(cct); + zone = std::make_unique(cct); + zone_utils = std::make_unique(cct); + quota = std::make_unique(cct); + sync_modules = std::make_unique(cct); + sysobj = std::make_unique(cct); + sysobj_core = std::make_unique(cct); + + if (have_cache) { + sysobj_cache = std::make_unique(cct); + } + finisher->init(); + notify->init(zone.get(), rados.get(), finisher.get()); + rados->init(); + zone->init(sysobj.get(), rados.get(), sync_modules.get()); + zone_utils->init(rados.get(), zone.get()); + quota->init(zone.get()); + sync_modules->init(); + sysobj_core->core_init(rados.get(), zone.get()); + if (have_cache) { + sysobj_cache->init(rados.get(), zone.get(), notify.get()); + sysobj->init(rados.get(), sysobj_cache.get()); + } else { + sysobj->init(rados.get(), sysobj_core.get()); + } + + can_shutdown = true; + + int r = finisher->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start finisher service (" << cpp_strerror(-r) << dendl; + return r; + } + + if (!raw) { + r = notify->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start notify service (" << 
cpp_strerror(-r) << dendl; + return r; + } + } + + r = rados->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start rados service (" << cpp_strerror(-r) << dendl; + return r; + } + + if (!raw) { + r = zone->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start zone service (" << cpp_strerror(-r) << dendl; + return r; + } + } + + r = zone_utils->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start zone_utils service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = quota->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start quota service (" << cpp_strerror(-r) << dendl; + return r; + } + + r = sysobj_core->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start sysobj_core service (" << cpp_strerror(-r) << dendl; + return r; + } + + if (have_cache) { + r = sysobj_cache->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start sysobj_cache service (" << cpp_strerror(-r) << dendl; + return r; + } + } + + r = sysobj->start(); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to start sysobj service (" << cpp_strerror(-r) << dendl; + return r; + } + + /* cache or core services will be started by sysobj */ + + return 0; +} + +void RGWServices_Def::shutdown() +{ + if (!can_shutdown) { + return; + } + + if (has_shutdown) { + return; + } + + sysobj->shutdown(); + sysobj_core->shutdown(); + notify->shutdown(); + if (sysobj_cache) { + sysobj_cache->shutdown(); + } + quota->shutdown(); + zone_utils->shutdown(); + zone->shutdown(); + rados->shutdown(); + + has_shutdown = true; + +} + + +int RGWServices::do_init(CephContext *cct, bool have_cache, bool raw) +{ + int r = _svc.init(cct, have_cache, raw); + if (r < 0) { + return r; + } + + finisher = _svc.finisher.get(); + notify = _svc.notify.get(); + rados = _svc.rados.get(); + zone = _svc.zone.get(); + zone_utils = _svc.zone_utils.get(); + quota = _svc.quota.get(); + sync_modules = _svc.sync_modules.get(); + sysobj = _svc.sysobj.get(); + cache = 
_svc.sysobj_cache.get(); + core = _svc.sysobj_core.get(); + + return 0; +} + +int RGWServiceInstance::start() +{ + if (start_state != StateInit) { + return 0; + } + + start_state = StateStarting;; /* setting started prior to do_start() on purpose so that circular + references can call start() on each other */ + + int r = do_start(); + if (r < 0) { + return r; + } + + start_state = StateStarted; + + return 0; +} diff --git a/src/rgw/rgw_service.h b/src/rgw/rgw_service.h new file mode 100644 index 00000000..316bacdb --- /dev/null +++ b/src/rgw/rgw_service.h @@ -0,0 +1,112 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SERVICE_H +#define CEPH_RGW_SERVICE_H + + +#include +#include +#include + +#include "rgw/rgw_common.h" + +struct RGWServices_Def; + +class RGWServiceInstance +{ + friend struct RGWServices_Def; + +protected: + CephContext *cct; + + enum StartState { + StateInit = 0, + StateStarting = 1, + StateStarted = 2, + } start_state{StateInit}; + + virtual void shutdown() {} + virtual int do_start() { + return 0; + } +public: + RGWServiceInstance(CephContext *_cct) : cct(_cct) {} + virtual ~RGWServiceInstance() {} + + int start(); + bool is_started() { + return (start_state == StateStarted); + } + + CephContext *ctx() { + return cct; + } +}; + +class RGWSI_Finisher; +class RGWSI_Notify; +class RGWSI_RADOS; +class RGWSI_Zone; +class RGWSI_ZoneUtils; +class RGWSI_Quota; +class RGWSI_SyncModules; +class RGWSI_SysObj; +class RGWSI_SysObj_Core; +class RGWSI_SysObj_Cache; + +struct RGWServices_Def +{ + bool can_shutdown{false}; + bool has_shutdown{false}; + + std::unique_ptr finisher; + std::unique_ptr notify; + std::unique_ptr rados; + std::unique_ptr zone; + std::unique_ptr zone_utils; + std::unique_ptr quota; + std::unique_ptr sync_modules; + std::unique_ptr sysobj; + std::unique_ptr sysobj_core; + std::unique_ptr sysobj_cache; + + RGWServices_Def(); + ~RGWServices_Def(); + + int 
init(CephContext *cct, bool have_cache, bool raw_storage); + void shutdown(); +}; + + +struct RGWServices +{ + RGWServices_Def _svc; + + RGWSI_Finisher *finisher{nullptr}; + RGWSI_Notify *notify{nullptr}; + RGWSI_RADOS *rados{nullptr}; + RGWSI_Zone *zone{nullptr}; + RGWSI_ZoneUtils *zone_utils{nullptr}; + RGWSI_Quota *quota{nullptr}; + RGWSI_SyncModules *sync_modules{nullptr}; + RGWSI_SysObj *sysobj{nullptr}; + RGWSI_SysObj_Cache *cache{nullptr}; + RGWSI_SysObj_Core *core{nullptr}; + + int do_init(CephContext *cct, bool have_cache, bool raw_storage); + + int init(CephContext *cct, bool have_cache) { + return do_init(cct, have_cache, false); + } + + int init_raw(CephContext *cct, bool have_cache) { + return do_init(cct, have_cache, true); + } + void shutdown() { + _svc.shutdown(); + } +}; + + +#endif diff --git a/src/rgw/rgw_string.cc b/src/rgw/rgw_string.cc new file mode 100644 index 00000000..d49bba71 --- /dev/null +++ b/src/rgw/rgw_string.cc @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_string.h" + +static bool char_eq(char c1, char c2) +{ + return c1 == c2; +} + +static bool ci_char_eq(char c1, char c2) +{ + return tolower(c1) == tolower(c2); +} + +bool match_wildcards(boost::string_view pattern, boost::string_view input, + uint32_t flags) +{ + const auto eq = (flags & MATCH_CASE_INSENSITIVE) ? &ci_char_eq : &char_eq; + + auto it1 = pattern.begin(); + auto it2 = input.begin(); + while (true) { + if (it1 == pattern.end()) + return it2 == input.end(); + if (*it1 == '*') { + if (it1 + 1 == pattern.end()) + return true; + if (it2 == input.end() || eq(*(it1 + 1), *it2)) + ++it1; + else + ++it2; + continue; + } + if (it2 == input.end()) + return false; + if (*it1 == '?' 
|| eq(*it1, *it2)) { + ++it1; + ++it2; + continue; + } + return false; + } + return false; +} diff --git a/src/rgw/rgw_string.h b/src/rgw/rgw_string.h new file mode 100644 index 00000000..c5666753 --- /dev/null +++ b/src/rgw/rgw_string.h @@ -0,0 +1,236 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_STRING_H +#define CEPH_RGW_STRING_H + +#include +#include +#include + +#include +#include + +struct ltstr_nocase +{ + bool operator()(const std::string& s1, const std::string& s2) const + { + return strcasecmp(s1.c_str(), s2.c_str()) < 0; + } +}; + +static inline int stringcasecmp(const std::string& s1, const std::string& s2) +{ + return strcasecmp(s1.c_str(), s2.c_str()); +} + +static inline int stringcasecmp(const std::string& s1, const char *s2) +{ + return strcasecmp(s1.c_str(), s2); +} + +static inline int stringcasecmp(const std::string& s1, int ofs, int size, const std::string& s2) +{ + return strncasecmp(s1.c_str() + ofs, s2.c_str(), size); +} + +static inline int stringtoll(const std::string& s, int64_t *val) +{ + char *end; + + long long result = strtoll(s.c_str(), &end, 10); + if (result == LLONG_MAX) + return -EINVAL; + + if (*end) + return -EINVAL; + + *val = (int64_t)result; + + return 0; +} + +static inline int stringtoull(const std::string& s, uint64_t *val) +{ + char *end; + + unsigned long long result = strtoull(s.c_str(), &end, 10); + if (result == ULLONG_MAX) + return -EINVAL; + + if (*end) + return -EINVAL; + + *val = (uint64_t)result; + + return 0; +} + +static inline int stringtol(const std::string& s, int32_t *val) +{ + char *end; + + long result = strtol(s.c_str(), &end, 10); + if (result == LONG_MAX) + return -EINVAL; + + if (*end) + return -EINVAL; + + *val = (int32_t)result; + + return 0; +} + +static inline int stringtoul(const std::string& s, uint32_t *val) +{ + char *end; + + unsigned long result = strtoul(s.c_str(), &end, 10); + if (result == ULONG_MAX) + return 
// sum() adds up any number of sizes (string lengths for reserve());
// with no arguments the result is 0
static inline constexpr size_t sum() { return 0; }
template <typename... Args>
constexpr size_t sum(size_t v, Args... args)
{
  // fold over the remaining arguments, converting each one to size_t
  return (v + ... + static_cast<size_t>(args));
}
i : size_(s, i + 1)) + : throw std::invalid_argument("Unterminated string constant."); + } + static constexpr size_t size(const char(&s)[N]) { return size_(s, 0); } +}; +template +struct string_traits : string_traits {}; + +// helpers for string_cat_reserve() +static inline void append_to(std::string& s) {} +template +void append_to(std::string& s, const boost::string_view& v, const Args&... args) +{ + s.append(v.begin(), v.end()); + append_to(s, args...); +} + +// helpers for string_join_reserve() +static inline void join_next(std::string& s, const boost::string_view& d) {} +template +void join_next(std::string& s, const boost::string_view& d, + const boost::string_view& v, const Args&... args) +{ + s.append(d.begin(), d.end()); + s.append(v.begin(), v.end()); + join_next(s, d, args...); +} + +static inline void join(std::string& s, const boost::string_view& d) {} +template +void join(std::string& s, const boost::string_view& d, + const boost::string_view& v, const Args&... args) +{ + s.append(v.begin(), v.end()); + join_next(s, d, args...); +} + +} // namespace detail + +/// return the length of a c string, string literal, or string type +template +constexpr size_t string_size(const T& s) +{ + return detail::string_traits::size(s); +} + +/// concatenates the given string arguments, returning as a std::string that +/// gets preallocated with reserve() +template +std::string string_cat_reserve(const Args&... args) +{ + size_t total_size = detail::sum(string_size(args)...); + std::string result; + result.reserve(total_size); + detail::append_to(result, args...); + return result; +} + +/// joins the given string arguments with a delimiter, returning as a +/// std::string that gets preallocated with reserve() +template +std::string string_join_reserve(const boost::string_view& delim, + const Args&... args) +{ + size_t delim_size = delim.size() * std::max(0, sizeof...(args) - 1); + size_t total_size = detail::sum(string_size(args)...) 
+ delim_size; + std::string result; + result.reserve(total_size); + detail::join(result, delim, args...); + return result; +} +template +std::string string_join_reserve(char delim, const Args&... args) +{ + return string_join_reserve(boost::string_view{&delim, 1}, args...); +} + + +/// use case-insensitive comparison in match_wildcards() +static constexpr uint32_t MATCH_CASE_INSENSITIVE = 0x01; + +/// attempt to match the given input string with the pattern, which may contain +/// the wildcard characters * and ? +extern bool match_wildcards(boost::string_view pattern, + boost::string_view input, + uint32_t flags = 0); + +#endif diff --git a/src/rgw/rgw_sts.cc b/src/rgw/rgw_sts.cc new file mode 100644 index 00000000..0cef12ac --- /dev/null +++ b/src/rgw/rgw_sts.cc @@ -0,0 +1,427 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "common/ceph_time.h" +#include "rgw_rados.h" +#include "auth/Crypto.h" +#include "include/ceph_fs.h" +#include "common/iso_8601.h" + +#include "include/types.h" +#include "rgw_string.h" + +#include "rgw_b64.h" +#include "rgw_common.h" +#include "rgw_tools.h" +#include "rgw_role.h" +#include "rgw_user.h" +#include "rgw_iam_policy.h" +#include "rgw_sts.h" + +#define dout_subsys ceph_subsys_rgw + +namespace STS { + +void Credentials::dump(Formatter *f) const +{ + encode_json("AccessKeyId", accessKeyId , f); + encode_json("Expiration", expiration , f); + encode_json("SecretAccessKey", secretAccessKey , f); + encode_json("SessionToken", sessionToken , f); +} + +int Credentials::generateCredentials(CephContext* cct, + const uint64_t& duration, + const boost::optional& policy, + const boost::optional& roleId, + boost::optional user, + rgw::auth::Identity* identity) +{ + uuid_d accessKey, secretKey; + char accessKeyId_str[MAX_ACCESS_KEY_LEN], 
secretAccessKey_str[MAX_SECRET_KEY_LEN]; + + //AccessKeyId + gen_rand_alphanumeric_plain(cct, accessKeyId_str, sizeof(accessKeyId_str)); + accessKeyId = accessKeyId_str; + + //SecretAccessKey + gen_rand_alphanumeric_upper(cct, secretAccessKey_str, sizeof(secretAccessKey_str)); + secretAccessKey = secretAccessKey_str; + + //Expiration + real_clock::time_point t = real_clock::now(); + real_clock::time_point exp = t + std::chrono::seconds(duration); + expiration = ceph::to_iso_8601(exp); + + //Session Token - Encrypt using AES + auto* cryptohandler = cct->get_crypto_handler(CEPH_CRYPTO_AES); + if (! cryptohandler) { + return -EINVAL; + } + string secret_s = cct->_conf->rgw_sts_key; + buffer::ptr secret(secret_s.c_str(), secret_s.length()); + int ret = 0; + if (ret = cryptohandler->validate_secret(secret); ret < 0) { + ldout(cct, 0) << "ERROR: Invalid secret key" << dendl; + return ret; + } + string error; + auto* keyhandler = cryptohandler->get_key_handler(secret, error); + if (! keyhandler) { + return -EINVAL; + } + error.clear(); + //Storing policy and roleId as part of token, so that they can be extracted + // from the token itself for policy evaluation. 
+ SessionToken token; + //authentication info + token.access_key_id = accessKeyId; + token.secret_access_key = secretAccessKey; + token.expiration = expiration; + + //Authorization info + if (policy) + token.policy = *policy; + else + token.policy = {}; + + if (roleId) + token.roleId = *roleId; + else + token.roleId = {}; + + if (user) + token.user = *user; + else { + rgw_user u({}, {}); + token.user = u; + } + + if (identity) { + token.acct_name = identity->get_acct_name(); + token.perm_mask = identity->get_perm_mask(); + token.is_admin = identity->is_admin_of(token.user); + token.acct_type = identity->get_identity_type(); + } else { + token.acct_name = {}; + token.perm_mask = 0; + token.is_admin = 0; + token.acct_type = TYPE_ROLE; + } + + buffer::list input, enc_output; + encode(token, input); + + if (ret = keyhandler->encrypt(input, enc_output, &error); ret < 0) { + return ret; + } + + bufferlist encoded_op; + enc_output.encode_base64(encoded_op); + encoded_op.append('\0'); + sessionToken = encoded_op.c_str(); + + return ret; +} + +void AssumedRoleUser::dump(Formatter *f) const +{ + encode_json("Arn", arn , f); + encode_json("AssumeRoleId", assumeRoleId , f); +} + +int AssumedRoleUser::generateAssumedRoleUser(CephContext* cct, + RGWRados *store, + const string& roleId, + const rgw::ARN& roleArn, + const string& roleSessionName) +{ + string resource = std::move(roleArn.resource); + boost::replace_first(resource, "role", "assumed-role"); + resource.append("/"); + resource.append(roleSessionName); + + rgw::ARN assumed_role_arn(rgw::Partition::aws, + rgw::Service::sts, + "", roleArn.account, resource); + arn = assumed_role_arn.to_string(); + + //Assumeroleid = roleid:rolesessionname + assumeRoleId = roleId + ":" + roleSessionName; + + return 0; +} + +AssumeRoleRequestBase::AssumeRoleRequestBase( const string& duration, + const string& iamPolicy, + const string& roleArn, + const string& roleSessionName) + : iamPolicy(iamPolicy), roleArn(roleArn), 
roleSessionName(roleSessionName) +{ + if (duration.empty()) { + this->duration = DEFAULT_DURATION_IN_SECS; + } else { + this->duration = strict_strtoll(duration.c_str(), 10, &this->err_msg); + } +} + +int AssumeRoleRequestBase::validate_input() const +{ + if (!err_msg.empty()) { + return -EINVAL; + } + + if (duration < MIN_DURATION_IN_SECS || + duration > MAX_DURATION_IN_SECS) { + return -EINVAL; + } + + if (! iamPolicy.empty() && + (iamPolicy.size() < MIN_POLICY_SIZE || iamPolicy.size() > MAX_POLICY_SIZE)) { + return -ERR_PACKED_POLICY_TOO_LARGE; + } + + if (! roleArn.empty() && + (roleArn.size() < MIN_ROLE_ARN_SIZE || roleArn.size() > MAX_ROLE_ARN_SIZE)) { + return -EINVAL; + } + + if (! roleSessionName.empty()) { + if (roleSessionName.size() < MIN_ROLE_SESSION_SIZE || roleSessionName.size() > MAX_ROLE_SESSION_SIZE) { + return -EINVAL; + } + + std::regex regex_roleSession("[A-Za-z0-9_=,.@-]+"); + if (! std::regex_match(roleSessionName, regex_roleSession)) { + return -EINVAL; + } + } + + return 0; +} + +int AssumeRoleWithWebIdentityRequest::validate_input() const +{ + if (! providerId.empty()) { + if (providerId.length() < MIN_PROVIDER_ID_LEN || + providerId.length() > MAX_PROVIDER_ID_LEN) { + return -EINVAL; + } + } + return AssumeRoleRequestBase::validate_input(); +} + +int AssumeRoleRequest::validate_input() const +{ + if (! externalId.empty()) { + if (externalId.length() < MIN_EXTERNAL_ID_LEN || + externalId.length() > MAX_EXTERNAL_ID_LEN) { + return -EINVAL; + } + + std::regex regex_externalId("[A-Za-z0-9_=,.@:/-]+"); + if (! std::regex_match(externalId, regex_externalId)) { + return -EINVAL; + } + } + if (! serialNumber.empty()){ + if (serialNumber.size() < MIN_SERIAL_NUMBER_SIZE || serialNumber.size() > MAX_SERIAL_NUMBER_SIZE) { + return -EINVAL; + } + + std::regex regex_serialNumber("[A-Za-z0-9_=/:,.@-]+"); + if (! std::regex_match(serialNumber, regex_serialNumber)) { + return -EINVAL; + } + } + if (! 
tokenCode.empty() && tokenCode.size() == TOKEN_CODE_SIZE) { + return -EINVAL; + } + + return AssumeRoleRequestBase::validate_input(); +} + +std::tuple STSService::getRoleInfo(const string& arn) +{ + if (auto r_arn = rgw::ARN::parse(arn); r_arn) { + auto pos = r_arn->resource.find_last_of('/'); + string roleName = r_arn->resource.substr(pos + 1); + RGWRole role(cct, store, roleName, r_arn->account); + if (int ret = role.get(); ret < 0) { + if (ret == -ENOENT) { + ret = -ERR_NO_ROLE_FOUND; + } + return make_tuple(ret, this->role); + } else { + this->role = std::move(role); + return make_tuple(0, this->role); + } + } else { + return make_tuple(-EINVAL, this->role); + } +} + +int STSService::storeARN(string& arn) +{ + int ret = 0; + RGWUserInfo info; + if (ret = rgw_get_user_info_by_uid(store, user_id, info); ret < 0) { + return -ERR_NO_SUCH_ENTITY; + } + + info.assumed_role_arn = arn; + + RGWObjVersionTracker objv_tracker; + if (ret = rgw_store_user_info(store, info, &info, &objv_tracker, real_time(), + false); ret < 0) { + return -ERR_INTERNAL_ERROR; + } + return ret; +} + +AssumeRoleWithWebIdentityResponse STSService::assumeRoleWithWebIdentity(AssumeRoleWithWebIdentityRequest& req) +{ + AssumeRoleWithWebIdentityResponse response; + response.assumeRoleResp.packedPolicySize = 0; + + if (req.getProviderId().empty()) { + response.providerId = req.getIss(); + } + response.aud = req.getAud(); + response.sub = req.getSub(); + + //Get the role info which is being assumed + boost::optional r_arn = rgw::ARN::parse(req.getRoleARN()); + if (r_arn == boost::none) { + response.assumeRoleResp.retCode = -EINVAL; + return response; + } + + string roleId = role.get_id(); + uint64_t roleMaxSessionDuration = role.get_max_session_duration(); + req.setMaxDuration(roleMaxSessionDuration); + + //Validate input + response.assumeRoleResp.retCode = req.validate_input(); + if (response.assumeRoleResp.retCode < 0) { + return response; + } + + //Calculate PackedPolicySize + string policy = 
req.getPolicy(); + response.assumeRoleResp.packedPolicySize = (policy.size() / req.getMaxPolicySize()) * 100; + + //Generate Assumed Role User + response.assumeRoleResp.retCode = response.assumeRoleResp.user.generateAssumedRoleUser(cct, + store, + roleId, + r_arn.get(), + req.getRoleSessionName()); + if (response.assumeRoleResp.retCode < 0) { + return response; + } + + //Generate Credentials + //Role and Policy provide the authorization info, user id and applier info are not needed + response.assumeRoleResp.retCode = response.assumeRoleResp.creds.generateCredentials(cct, req.getDuration(), + req.getPolicy(), roleId, + user_id, nullptr); + if (response.assumeRoleResp.retCode < 0) { + return response; + } + + response.assumeRoleResp.retCode = 0; + return response; +} + +AssumeRoleResponse STSService::assumeRole(AssumeRoleRequest& req) +{ + AssumeRoleResponse response; + response.packedPolicySize = 0; + + //Get the role info which is being assumed + boost::optional r_arn = rgw::ARN::parse(req.getRoleARN()); + if (r_arn == boost::none) { + response.retCode = -EINVAL; + return response; + } + + string roleId = role.get_id(); + uint64_t roleMaxSessionDuration = role.get_max_session_duration(); + req.setMaxDuration(roleMaxSessionDuration); + + //Validate input + response.retCode = req.validate_input(); + if (response.retCode < 0) { + return response; + } + + //Calculate PackedPolicySize + string policy = req.getPolicy(); + response.packedPolicySize = (policy.size() / req.getMaxPolicySize()) * 100; + + //Generate Assumed Role User + response.retCode = response.user.generateAssumedRoleUser(cct, store, roleId, r_arn.get(), req.getRoleSessionName()); + if (response.retCode < 0) { + return response; + } + + //Generate Credentials + //Role and Policy provide the authorization info, user id and applier info are not needed + response.retCode = response.creds.generateCredentials(cct, req.getDuration(), + req.getPolicy(), roleId, + user_id, nullptr); + if (response.retCode < 0) { + 
return response; + } + + //Save ARN with the user + string arn = response.user.getARN(); + response.retCode = storeARN(arn); + if (response.retCode < 0) { + return response; + } + + response.retCode = 0; + return response; +} + +GetSessionTokenRequest::GetSessionTokenRequest(const string& duration, const string& serialNumber, const string& tokenCode) +{ + if (duration.empty()) { + this->duration = DEFAULT_DURATION_IN_SECS; + } else { + this->duration = stoull(duration); + } + this->serialNumber = serialNumber; + this->tokenCode = tokenCode; +} + +GetSessionTokenResponse STSService::getSessionToken(GetSessionTokenRequest& req) +{ + int ret; + Credentials cred; + + //Generate Credentials + if (ret = cred.generateCredentials(cct, + req.getDuration(), + boost::none, + boost::none, + user_id, + identity); ret < 0) { + return make_tuple(ret, cred); + } + + return make_tuple(0, cred); +} + +} diff --git a/src/rgw/rgw_sts.h b/src/rgw/rgw_sts.h new file mode 100644 index 00000000..1ad48504 --- /dev/null +++ b/src/rgw/rgw_sts.h @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_STS_H +#define CEPH_RGW_STS_H + +#include "rgw_role.h" +#include "rgw_auth.h" +#include "rgw_web_idp.h" + +namespace STS { + +class AssumeRoleRequestBase { +protected: + static constexpr uint64_t MIN_POLICY_SIZE = 1; + static constexpr uint64_t MAX_POLICY_SIZE = 2048; + static constexpr uint64_t DEFAULT_DURATION_IN_SECS = 3600; + static constexpr uint64_t MIN_DURATION_IN_SECS = 900; + static constexpr uint64_t MIN_ROLE_ARN_SIZE = 2; + static constexpr uint64_t MAX_ROLE_ARN_SIZE = 2048; + static constexpr uint64_t MIN_ROLE_SESSION_SIZE = 2; + static constexpr uint64_t MAX_ROLE_SESSION_SIZE = 64; + uint64_t MAX_DURATION_IN_SECS; + uint64_t duration; + string err_msg; + string iamPolicy; + string roleArn; + string roleSessionName; +public: + AssumeRoleRequestBase(const string& duration, + const string& iamPolicy, + 
const string& roleArn, + const string& roleSessionName); + const string& getRoleARN() const { return roleArn; } + const string& getRoleSessionName() const { return roleSessionName; } + const string& getPolicy() const {return iamPolicy; } + static const uint64_t& getMaxPolicySize() { return MAX_POLICY_SIZE; } + void setMaxDuration(const uint64_t& maxDuration) { MAX_DURATION_IN_SECS = maxDuration; } + const uint64_t& getDuration() const { return duration; } + int validate_input() const; +}; + +class AssumeRoleWithWebIdentityRequest : public AssumeRoleRequestBase { + static constexpr uint64_t MIN_PROVIDER_ID_LEN = 4; + static constexpr uint64_t MAX_PROVIDER_ID_LEN = 2048; + string providerId; + string iamPolicy; + string iss; + string sub; + string aud; +public: + AssumeRoleWithWebIdentityRequest( const string& duration, + const string& providerId, + const string& iamPolicy, + const string& roleArn, + const string& roleSessionName, + const string& iss, + const string& sub, + const string& aud) + : AssumeRoleRequestBase(duration, iamPolicy, roleArn, roleSessionName), + providerId(providerId), iss(iss), sub(sub), aud(aud) {} + const string& getProviderId() const { return providerId; } + const string& getIss() const { return iss; } + const string& getAud() const { return aud; } + const string& getSub() const { return sub; } + int validate_input() const; +}; + +class AssumeRoleRequest : public AssumeRoleRequestBase { + static constexpr uint64_t MIN_EXTERNAL_ID_LEN = 2; + static constexpr uint64_t MAX_EXTERNAL_ID_LEN = 1224; + static constexpr uint64_t MIN_SERIAL_NUMBER_SIZE = 9; + static constexpr uint64_t MAX_SERIAL_NUMBER_SIZE = 256; + static constexpr uint64_t TOKEN_CODE_SIZE = 6; + string externalId; + string serialNumber; + string tokenCode; +public: + AssumeRoleRequest(const string& duration, + const string& externalId, + const string& iamPolicy, + const string& roleArn, + const string& roleSessionName, + const string& serialNumber, + const string& tokenCode) + : 
AssumeRoleRequestBase(duration, iamPolicy, roleArn, roleSessionName), + externalId(externalId), serialNumber(serialNumber), tokenCode(tokenCode){} + int validate_input() const; +}; + +class GetSessionTokenRequest { +protected: + static constexpr uint64_t MIN_DURATION_IN_SECS = 900; + static constexpr uint64_t DEFAULT_DURATION_IN_SECS = 3600; + uint64_t duration; + string serialNumber; + string tokenCode; + +public: + GetSessionTokenRequest(const string& duration, const string& serialNumber, const string& tokenCode); + + const uint64_t& getDuration() const { return duration; } + static const uint64_t& getMinDuration() { return MIN_DURATION_IN_SECS; } +}; + +class AssumedRoleUser { + string arn; + string assumeRoleId; +public: + int generateAssumedRoleUser( CephContext* cct, + RGWRados *store, + const string& roleId, + const rgw::ARN& roleArn, + const string& roleSessionName); + const string& getARN() const { return arn; } + const string& getAssumeRoleId() const { return assumeRoleId; } + void dump(Formatter *f) const; +}; + +struct SessionToken { + string access_key_id; + string secret_access_key; + string expiration; + string policy; + string roleId; + rgw_user user; + string acct_name; + uint32_t perm_mask; + bool is_admin; + uint32_t acct_type; + + SessionToken() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(access_key_id, bl); + encode(secret_access_key, bl); + encode(expiration, bl); + encode(policy, bl); + encode(roleId, bl); + encode(user, bl); + encode(acct_name, bl); + encode(perm_mask, bl); + encode(is_admin, bl); + encode(acct_type, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(access_key_id, bl); + decode(secret_access_key, bl); + decode(expiration, bl); + decode(policy, bl); + decode(roleId, bl); + decode(user, bl); + decode(acct_name, bl); + decode(perm_mask, bl); + decode(is_admin, bl); + decode(acct_type, bl); + DECODE_FINISH(bl); + } +}; 
+WRITE_CLASS_ENCODER(SessionToken) + +class Credentials { + static constexpr int MAX_ACCESS_KEY_LEN = 20; + static constexpr int MAX_SECRET_KEY_LEN = 40; + string accessKeyId; + string expiration; + string secretAccessKey; + string sessionToken; +public: + int generateCredentials(CephContext* cct, + const uint64_t& duration, + const boost::optional& policy, + const boost::optional& roleId, + boost::optional user, + rgw::auth::Identity* identity); + const string& getAccessKeyId() const { return accessKeyId; } + const string& getExpiration() const { return expiration; } + const string& getSecretAccessKey() const { return secretAccessKey; } + const string& getSessionToken() const { return sessionToken; } + void dump(Formatter *f) const; +}; + +struct AssumeRoleResponse { + int retCode; + AssumedRoleUser user; + Credentials creds; + uint64_t packedPolicySize; +}; + +struct AssumeRoleWithWebIdentityResponse { + AssumeRoleResponse assumeRoleResp; + string aud; + string providerId; + string sub; +}; + +using AssumeRoleResponse = struct AssumeRoleResponse ; +using GetSessionTokenResponse = std::tuple; +using AssumeRoleWithWebIdentityResponse = struct AssumeRoleWithWebIdentityResponse; + +class STSService { + CephContext* cct; + RGWRados *store; + rgw_user user_id; + RGWRole role; + rgw::auth::Identity* identity; + int storeARN(string& arn); +public: + STSService() = default; + STSService(CephContext* cct, RGWRados *store, rgw_user user_id, rgw::auth::Identity* identity) : cct(cct), store(store), user_id(user_id), identity(identity) {} + std::tuple getRoleInfo(const string& arn); + AssumeRoleResponse assumeRole(AssumeRoleRequest& req); + GetSessionTokenResponse getSessionToken(GetSessionTokenRequest& req); + AssumeRoleWithWebIdentityResponse assumeRoleWithWebIdentity(AssumeRoleWithWebIdentityRequest& req); +}; +} +#endif /* CEPH_RGW_STS_H */ + diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc new file mode 100644 index 00000000..eb0264a3 --- /dev/null +++ 
b/src/rgw/rgw_swift_auth.cc @@ -0,0 +1,759 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include +#include +#include +#include + +#include "rgw_swift_auth.h" +#include "rgw_rest.h" + +#include "common/ceph_crypto.h" +#include "common/Clock.h" + +#include "include/random.h" + +#include "rgw_client_io.h" +#include "rgw_http_client.h" +#include "include/str_list.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +#define DEFAULT_SWIFT_PREFIX "/swift" + +using namespace ceph::crypto; + + +namespace rgw { +namespace auth { +namespace swift { + +/* TempURL: applier */ +void TempURLApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const /* in/out */ +{ + bool inline_exists = false; + const std::string& filename = s->info.args.get("filename"); + + s->info.args.get("inline", &inline_exists); + if (inline_exists) { + s->content_disp.override = "inline"; + } else if (!filename.empty()) { + std::string fenc; + url_encode(filename, fenc); + s->content_disp.override = "attachment; filename=\"" + fenc + "\""; + } else { + std::string fenc; + url_encode(s->object.name, fenc); + s->content_disp.fallback = "attachment; filename=\"" + fenc + "\""; + } + + ldpp_dout(dpp, 20) << "finished applying changes to req_state for TempURL: " + << " content_disp override " << s->content_disp.override + << " content_disp fallback " << s->content_disp.fallback + << dendl; + +} + +/* TempURL: engine */ +bool TempURLEngine::is_applicable(const req_state* const s) const noexcept +{ + return s->info.args.exists("temp_url_sig") || + s->info.args.exists("temp_url_expires"); +} + +void TempURLEngine::get_owner_info(const DoutPrefixProvider* dpp, const req_state* const s, + RGWUserInfo& owner_info) const +{ + /* We cannot use req_state::bucket_name because it isn't available + * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). 
*/ + const string& bucket_name = s->init_state.url_bucket; + + /* TempURL requires that bucket and object names are specified. */ + if (bucket_name.empty() || s->object.empty()) { + throw -EPERM; + } + + /* TempURL case is completely different than the Keystone auth - you may + * get account name only through extraction from URL. In turn, knowledge + * about account is neccessary to obtain its bucket tenant. Without that, + * the access would be limited to accounts with empty tenant. */ + string bucket_tenant; + if (!s->account_name.empty()) { + RGWUserInfo uinfo; + bool found = false; + + const rgw_user uid(s->account_name); + if (uid.tenant.empty()) { + const rgw_user tenanted_uid(uid.id, uid.id); + + if (rgw_get_user_info_by_uid(store, tenanted_uid, uinfo) >= 0) { + /* Succeeded. */ + bucket_tenant = uinfo.user_id.tenant; + found = true; + } + } + + if (!found && rgw_get_user_info_by_uid(store, uid, uinfo) < 0) { + throw -EPERM; + } else { + bucket_tenant = uinfo.user_id.tenant; + } + } + + /* Need to get user info of bucket owner. */ + RGWBucketInfo bucket_info; + int ret = store->get_bucket_info(*s->sysobj_ctx, + bucket_tenant, bucket_name, + bucket_info, nullptr); + if (ret < 0) { + throw ret; + } + + ldpp_dout(dpp, 20) << "temp url user (bucket owner): " << bucket_info.owner + << dendl; + + if (rgw_get_user_info_by_uid(store, bucket_info.owner, owner_info) < 0) { + throw -EPERM; + } +} + +std::string TempURLEngine::convert_from_iso8601(std::string expires) const +{ + /* Swift's TempURL allows clients to send the expiration as ISO8601- + * compatible strings. Though, only plain UNIX timestamp are taken + * for the HMAC calculations. We need to make the conversion. 
/* Return the part of a Swift user name that follows the first ':'.
 * A name without any ':' is returned unchanged. */
std::string extract_swift_subuser(const std::string& swift_user_name)
{
  const auto colon = swift_user_name.find(':');
  return colon == std::string::npos ? swift_user_name
                                    : swift_user_name.substr(colon + 1);
}
hmac.Final(dest); + + buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str); + + return dest_str; + } + + bool is_equal_to(const std::string& rhs) const { + /* never allow out-of-range exception */ + if (rhs.size() < (output_size - 1)) { + return false; + } + return rhs.compare(0 /* pos */, output_size, dest_str) == 0; + } + +}; /* TempURLEngine::SignatureHelper */ + +class TempURLEngine::PrefixableSignatureHelper + : private TempURLEngine::SignatureHelper { + using base_t = SignatureHelper; + + const boost::string_view decoded_uri; + const boost::string_view object_name; + boost::string_view no_obj_uri; + + const boost::optional prefix; + +public: + PrefixableSignatureHelper(const std::string& _decoded_uri, + const std::string& object_name, + const boost::optional prefix) + : decoded_uri(_decoded_uri), + object_name(object_name), + prefix(prefix) { + /* Transform: v1/acct/cont/obj - > v1/acct/cont/ + * + * NOTE(rzarzynski): we really want to substr() on boost::string_view, + * not std::string. Otherwise we would end with no_obj_uri referencing + * a temporary. 
*/ + no_obj_uri = \ + decoded_uri.substr(0, decoded_uri.length() - object_name.length()); + } + + const char* calc(const std::string& key, + const boost::string_view& method, + const boost::string_view& path, + const std::string& expires) { + if (!prefix) { + return base_t::calc(key, method, path, expires); + } else { + const auto prefixed_path = \ + string_cat_reserve("prefix:", no_obj_uri, *prefix); + return base_t::calc(key, method, prefixed_path, expires); + } + } + + bool is_equal_to(const std::string& rhs) const { + bool is_auth_ok = base_t::is_equal_to(rhs); + + if (prefix && is_auth_ok) { + const auto prefix_uri = string_cat_reserve(no_obj_uri, *prefix); + is_auth_ok = boost::algorithm::starts_with(decoded_uri, prefix_uri); + } + + return is_auth_ok; + } +}; /* TempURLEngine::PrefixableSignatureHelper */ + +TempURLEngine::result_t +TempURLEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const +{ + if (! is_applicable(s)) { + return result_t::deny(); + } + + /* NOTE(rzarzynski): RGWHTTPArgs::get(), in contrast to RGWEnv::get(), + * never returns nullptr. If the requested parameter is absent, we will + * get the empty string. */ + const std::string& temp_url_sig = s->info.args.get("temp_url_sig"); + const std::string& temp_url_expires = \ + convert_from_iso8601(s->info.args.get("temp_url_expires")); + + if (temp_url_sig.empty() || temp_url_expires.empty()) { + return result_t::deny(); + } + + /* Though, for prefixed tempurls we need to differentiate between empty + * prefix and lack of prefix. Empty prefix means allowance for whole + * container. */ + const boost::optional temp_url_prefix = \ + s->info.args.get_optional("temp_url_prefix"); + + RGWUserInfo owner_info; + try { + get_owner_info(dpp, s, owner_info); + } catch (...) 
{ + ldpp_dout(dpp, 5) << "cannot get user_info of account's owner" << dendl; + return result_t::reject(); + } + + if (owner_info.temp_url_keys.empty()) { + ldpp_dout(dpp, 5) << "user does not have temp url key set, aborting" << dendl; + return result_t::reject(); + } + + if (is_expired(temp_url_expires)) { + ldpp_dout(dpp, 5) << "temp url link expired" << dendl; + return result_t::reject(-EPERM); + } + + if (is_disallowed_header_present(s->info)) { + ldout(cct, 5) << "temp url rejected due to disallowed header" << dendl; + return result_t::reject(-EINVAL); + } + + /* We need to verify two paths because of compliance with Swift, Tempest + * and old versions of RadosGW. The second item will have the prefix + * of Swift API entry point removed. */ + + /* XXX can we search this ONCE? */ + const size_t pos = g_conf()->rgw_swift_url_prefix.find_last_not_of('/') + 1; + const boost::string_view ref_uri = s->decoded_uri; + const std::array allowed_paths = { + ref_uri, + ref_uri.substr(pos + 1) + }; + + /* Account owner calculates the signature also against a HTTP method. */ + boost::container::static_vector allowed_methods; + if (strcmp("HEAD", s->info.method) == 0) { + /* HEAD requests are specially handled. */ + /* TODO: after getting a newer boost (with static_vector supporting + * initializers lists), get back to the good notation: + * allowed_methods = {"HEAD", "GET", "PUT" }; + * Just for now let's use emplace_back to construct the vector. */ + allowed_methods.emplace_back("HEAD"); + allowed_methods.emplace_back("GET"); + allowed_methods.emplace_back("PUT"); + } else if (strlen(s->info.method) > 0) { + allowed_methods.emplace_back(s->info.method); + } + + /* Need to try each combination of keys, allowed path and methods. 
*/ + PrefixableSignatureHelper sig_helper { + s->decoded_uri, + s->object.name, + temp_url_prefix + }; + + for (const auto& kv : owner_info.temp_url_keys) { + const int temp_url_key_num = kv.first; + const string& temp_url_key = kv.second; + + if (temp_url_key.empty()) { + continue; + } + + for (const auto& path : allowed_paths) { + for (const auto& method : allowed_methods) { + const char* const local_sig = sig_helper.calc(temp_url_key, method, + path, temp_url_expires); + + ldpp_dout(dpp, 20) << "temp url signature [" << temp_url_key_num + << "] (calculated): " << local_sig + << dendl; + + if (sig_helper.is_equal_to(temp_url_sig)) { + auto apl = apl_factory->create_apl_turl(cct, s, owner_info); + return result_t::grant(std::move(apl)); + } else { + ldpp_dout(dpp, 5) << "temp url signature mismatch: " << local_sig + << " != " << temp_url_sig << dendl; + } + } + } + } + + return result_t::reject(); +} + + +/* External token */ +bool ExternalTokenEngine::is_applicable(const std::string& token) const noexcept +{ + if (token.empty()) { + return false; + } else if (g_conf()->rgw_swift_auth_url.empty()) { + return false; + } else { + return true; + } +} + +ExternalTokenEngine::result_t +ExternalTokenEngine::authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s) const +{ + if (! 
is_applicable(token)) { + return result_t::deny(); + } + + std::string auth_url = g_conf()->rgw_swift_auth_url; + if (auth_url.back() != '/') { + auth_url.append("/"); + } + + auth_url.append("token"); + char url_buf[auth_url.size() + 1 + token.length() + 1]; + sprintf(url_buf, "%s/%s", auth_url.c_str(), token.c_str()); + + RGWHTTPHeadersCollector validator(cct, "GET", url_buf, { "X-Auth-Groups", "X-Auth-Ttl" }); + + ldpp_dout(dpp, 10) << "rgw_swift_validate_token url=" << url_buf << dendl; + + int ret = validator.process(); + if (ret < 0) { + throw ret; + } + + std::string swift_user; + try { + std::vector swift_groups; + get_str_vec(validator.get_header_value("X-Auth-Groups"), + ",", swift_groups); + + if (0 == swift_groups.size()) { + return result_t::deny(-EPERM); + } else { + swift_user = std::move(swift_groups[0]); + } + } catch (const std::out_of_range&) { + /* The X-Auth-Groups header isn't present in the response. */ + return result_t::deny(-EPERM); + } + + if (swift_user.empty()) { + return result_t::deny(-EPERM); + } + + ldpp_dout(dpp, 10) << "swift user=" << swift_user << dendl; + + RGWUserInfo tmp_uinfo; + ret = rgw_get_user_info_by_swift(store, swift_user, tmp_uinfo); + if (ret < 0) { + ldpp_dout(dpp, 0) << "NOTICE: couldn't map swift user" << dendl; + throw ret; + } + + auto apl = apl_factory->create_apl_local(cct, s, tmp_uinfo, + extract_swift_subuser(swift_user), + boost::none); + return result_t::grant(std::move(apl)); +} + +static int build_token(const string& swift_user, + const string& key, + const uint64_t nonce, + const utime_t& expiration, + bufferlist& bl) +{ + using ceph::encode; + encode(swift_user, bl); + encode(nonce, bl); + encode(expiration, bl); + + bufferptr p(CEPH_CRYPTO_HMACSHA1_DIGESTSIZE); + + char buf[bl.length() * 2 + 1]; + buf_to_hex((const unsigned char *)bl.c_str(), bl.length(), buf); + dout(20) << "build_token token=" << buf << dendl; + + char k[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; + // FIPS zeroization audit 20191116: this 
memset is not intended to + // wipe out a secret after use. + memset(k, 0, sizeof(k)); + const char *s = key.c_str(); + for (int i = 0; i < (int)key.length(); i++, s++) { + k[i % CEPH_CRYPTO_HMACSHA1_DIGESTSIZE] |= *s; + } + calc_hmac_sha1(k, sizeof(k), bl.c_str(), bl.length(), p.c_str()); + ::ceph::crypto::zeroize_for_security(k, sizeof(k)); + + bl.append(p); + + return 0; + +} + +static int encode_token(CephContext *cct, string& swift_user, string& key, + bufferlist& bl) +{ + const auto nonce = ceph::util::generate_random_number(); + + utime_t expiration = ceph_clock_now(); + expiration += cct->_conf->rgw_swift_token_expiration; + + return build_token(swift_user, key, nonce, expiration, bl); +} + + +/* AUTH_rgwtk (signed token): engine */ +bool SignedTokenEngine::is_applicable(const std::string& token) const noexcept +{ + if (token.empty()) { + return false; + } else { + return token.compare(0, 10, "AUTH_rgwtk") == 0; + } +} + +SignedTokenEngine::result_t +SignedTokenEngine::authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* const s) const +{ + if (! is_applicable(token)) { + return result_t::deny(-EPERM); + } + + /* Effective token string is the part after the prefix. 
*/ + const std::string etoken = token.substr(strlen("AUTH_rgwtk")); + const size_t etoken_len = etoken.length(); + + if (etoken_len & 1) { + ldpp_dout(dpp, 0) << "NOTICE: failed to verify token: odd token length=" + << etoken_len << dendl; + throw -EINVAL; + } + + ceph::bufferptr p(etoken_len/2); + int ret = hex_to_buf(etoken.c_str(), p.c_str(), etoken_len); + if (ret < 0) { + throw ret; + } + + ceph::bufferlist tok_bl; + tok_bl.append(p); + + uint64_t nonce; + utime_t expiration; + std::string swift_user; + + try { + auto iter = tok_bl.cbegin(); + + using ceph::decode; + decode(swift_user, iter); + decode(nonce, iter); + decode(expiration, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "NOTICE: failed to decode token" << dendl; + throw -EINVAL; + } + + const utime_t now = ceph_clock_now(); + if (expiration < now) { + ldpp_dout(dpp, 0) << "NOTICE: old timed out token was used now=" << now + << " token.expiration=" << expiration + << dendl; + return result_t::deny(-EPERM); + } + + RGWUserInfo user_info; + ret = rgw_get_user_info_by_swift(store, swift_user, user_info); + if (ret < 0) { + throw ret; + } + + ldpp_dout(dpp, 10) << "swift_user=" << swift_user << dendl; + + const auto siter = user_info.swift_keys.find(swift_user); + if (siter == std::end(user_info.swift_keys)) { + return result_t::deny(-EPERM); + } + + const auto swift_key = siter->second; + + bufferlist local_tok_bl; + ret = build_token(swift_user, swift_key.key, nonce, expiration, local_tok_bl); + if (ret < 0) { + throw ret; + } + + if (local_tok_bl.length() != tok_bl.length()) { + ldpp_dout(dpp, 0) << "NOTICE: tokens length mismatch:" + << " tok_bl.length()=" << tok_bl.length() + << " local_tok_bl.length()=" << local_tok_bl.length() + << dendl; + return result_t::deny(-EPERM); + } + + if (memcmp(local_tok_bl.c_str(), tok_bl.c_str(), + local_tok_bl.length()) != 0) { + char buf[local_tok_bl.length() * 2 + 1]; + + buf_to_hex(reinterpret_cast(local_tok_bl.c_str()), + local_tok_bl.length(), 
buf); + + ldpp_dout(dpp, 0) << "NOTICE: tokens mismatch tok=" << buf << dendl; + return result_t::deny(-EPERM); + } + + auto apl = apl_factory->create_apl_local(cct, s, user_info, + extract_swift_subuser(swift_user), + boost::none); + return result_t::grant(std::move(apl)); +} + +} /* namespace swift */ +} /* namespace auth */ +} /* namespace rgw */ + + +void RGW_SWIFT_Auth_Get::execute() +{ + int ret = -EPERM; + + const char *key = s->info.env->get("HTTP_X_AUTH_KEY"); + const char *user = s->info.env->get("HTTP_X_AUTH_USER"); + + s->prot_flags |= RGW_REST_SWIFT; + + string user_str; + RGWUserInfo info; + bufferlist bl; + RGWAccessKey *swift_key; + map::iterator siter; + + string swift_url = g_conf()->rgw_swift_url; + string swift_prefix = g_conf()->rgw_swift_url_prefix; + string tenant_path; + + /* + * We did not allow an empty Swift prefix before, but we want it now. + * So, we take rgw_swift_url_prefix = "/" to yield the empty prefix. + * The rgw_swift_url_prefix = "" is the default and yields "/swift" + * in a backwards-compatible way. 
+ */ + if (swift_prefix.size() == 0) { + swift_prefix = DEFAULT_SWIFT_PREFIX; + } else if (swift_prefix == "/") { + swift_prefix.clear(); + } else { + if (swift_prefix[0] != '/') { + swift_prefix.insert(0, "/"); + } + } + + if (swift_url.size() == 0) { + bool add_port = false; + const char *server_port = s->info.env->get("SERVER_PORT_SECURE"); + const char *protocol; + if (server_port) { + add_port = (strcmp(server_port, "443") != 0); + protocol = "https"; + } else { + server_port = s->info.env->get("SERVER_PORT"); + add_port = (strcmp(server_port, "80") != 0); + protocol = "http"; + } + const char *host = s->info.env->get("HTTP_HOST"); + if (!host) { + dout(0) << "NOTICE: server is misconfigured, missing rgw_swift_url_prefix or rgw_swift_url, HTTP_HOST is not set" << dendl; + ret = -EINVAL; + goto done; + } + swift_url = protocol; + swift_url.append("://"); + swift_url.append(host); + if (add_port && !strchr(host, ':')) { + swift_url.append(":"); + swift_url.append(server_port); + } + } + + if (!key || !user) + goto done; + + user_str = user; + + if ((ret = rgw_get_user_info_by_swift(store, user_str, info)) < 0) + { + ret = -EACCES; + goto done; + } + + siter = info.swift_keys.find(user_str); + if (siter == info.swift_keys.end()) { + ret = -EPERM; + goto done; + } + swift_key = &siter->second; + + if (swift_key->key.compare(key) != 0) { + dout(0) << "NOTICE: RGW_SWIFT_Auth_Get::execute(): bad swift key" << dendl; + ret = -EPERM; + goto done; + } + + if (!g_conf()->rgw_swift_tenant_name.empty()) { + tenant_path = "/AUTH_"; + tenant_path.append(g_conf()->rgw_swift_tenant_name); + } else if (g_conf()->rgw_swift_account_in_url) { + tenant_path = "/AUTH_"; + tenant_path.append(info.user_id.to_str()); + } + + dump_header(s, "X-Storage-Url", swift_url + swift_prefix + "/v1" + + tenant_path); + + using rgw::auth::swift::encode_token; + if ((ret = encode_token(s->cct, swift_key->id, swift_key->key, bl)) < 0) + goto done; + + { + static constexpr size_t PREFIX_LEN = 
sizeof("AUTH_rgwtk") - 1; + char token_val[PREFIX_LEN + bl.length() * 2 + 1]; + + snprintf(token_val, PREFIX_LEN + 1, "AUTH_rgwtk"); + buf_to_hex((const unsigned char *)bl.c_str(), bl.length(), + token_val + PREFIX_LEN); + + dump_header(s, "X-Storage-Token", token_val); + dump_header(s, "X-Auth-Token", token_val); + } + + ret = STATUS_NO_CONTENT; + +done: + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); +} + +int RGWHandler_SWIFT_Auth::init(RGWRados *store, struct req_state *state, + rgw::io::BasicClient *cio) +{ + state->dialect = "swift-auth"; + state->formatter = new JSONFormatter; + state->format = RGW_FORMAT_JSON; + + return RGWHandler::init(store, state, cio); +} + +int RGWHandler_SWIFT_Auth::authorize(const DoutPrefixProvider *dpp) +{ + return 0; +} + +RGWOp *RGWHandler_SWIFT_Auth::op_get() +{ + return new RGW_SWIFT_Auth_Get; +} + diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h new file mode 100644 index 00000000..33d1cb22 --- /dev/null +++ b/src/rgw/rgw_swift_auth.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SWIFT_AUTH_H +#define CEPH_RGW_SWIFT_AUTH_H + +#include "rgw_common.h" +#include "rgw_user.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_auth.h" +#include "rgw_auth_keystone.h" +#include "rgw_auth_filters.h" + +#define RGW_SWIFT_TOKEN_EXPIRATION (15 * 60) + +namespace rgw { +namespace auth { +namespace swift { + +/* TempURL: applier. 
*/ +class TempURLApplier : public rgw::auth::LocalApplier { +public: + TempURLApplier(CephContext* const cct, + const RGWUserInfo& user_info) + : LocalApplier(cct, user_info, LocalApplier::NO_SUBUSER, boost::none) { + }; + + void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override; /* in/out */ + + struct Factory { + virtual ~Factory() {} + virtual aplptr_t create_apl_turl(CephContext* cct, + const req_state* s, + const RGWUserInfo& user_info) const = 0; + }; +}; + +/* TempURL: engine */ +class TempURLEngine : public rgw::auth::Engine { + using result_t = rgw::auth::Engine::result_t; + + CephContext* const cct; + /* const */ RGWRados* const store; + const TempURLApplier::Factory* const apl_factory; + + /* Helper methods. */ + void get_owner_info(const DoutPrefixProvider* dpp, + const req_state* s, + RGWUserInfo& owner_info) const; + std::string convert_from_iso8601(std::string expires) const; + bool is_applicable(const req_state* s) const noexcept; + bool is_expired(const std::string& expires) const; + bool is_disallowed_header_present(const req_info& info) const; + + class SignatureHelper; + class PrefixableSignatureHelper; + +public: + TempURLEngine(CephContext* const cct, + /*const*/ RGWRados* const store, + const TempURLApplier::Factory* const apl_factory) + : cct(cct), + store(store), + apl_factory(apl_factory) { + } + + /* Interface implementations. 
*/ + const char* get_name() const noexcept override { + return "rgw::auth::swift::TempURLEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const override; +}; + + +/* AUTH_rgwtk */ +class SignedTokenEngine : public rgw::auth::Engine { + using result_t = rgw::auth::Engine::result_t; + + CephContext* const cct; + RGWRados* const store; + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::LocalApplier::Factory* const apl_factory; + + bool is_applicable(const std::string& token) const noexcept; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s) const; + +public: + SignedTokenEngine(CephContext* const cct, + /* const */RGWRados* const store, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::LocalApplier::Factory* const apl_factory) + : cct(cct), + store(store), + extractor(extractor), + apl_factory(apl_factory) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::SignedTokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const override { + return authenticate(dpp, extractor->get_token(s), s); + } +}; + + +/* External token */ +class ExternalTokenEngine : public rgw::auth::Engine { + using result_t = rgw::auth::Engine::result_t; + + CephContext* const cct; + RGWRados* const store; + const rgw::auth::TokenExtractor* const extractor; + const rgw::auth::LocalApplier::Factory* const apl_factory; + + bool is_applicable(const std::string& token) const noexcept; + result_t authenticate(const DoutPrefixProvider* dpp, + const std::string& token, + const req_state* s) const; + +public: + ExternalTokenEngine(CephContext* const cct, + /* const */RGWRados* const store, + const rgw::auth::TokenExtractor* const extractor, + const rgw::auth::LocalApplier::Factory* const apl_factory) + : cct(cct), + store(store), + extractor(extractor), + apl_factory(apl_factory) { + } + 
+ const char* get_name() const noexcept override { + return "rgw::auth::swift::ExternalTokenEngine"; + } + + result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s) const override { + return authenticate(dpp, extractor->get_token(s), s); + } +}; + +/* SwiftAnonymous: applier. */ +class SwiftAnonymousApplier : public rgw::auth::LocalApplier { + public: + SwiftAnonymousApplier(CephContext* const cct, + const RGWUserInfo& user_info) + : LocalApplier(cct, user_info, LocalApplier::NO_SUBUSER, boost::none) { + }; + bool is_admin_of(const rgw_user& uid) const {return false;} + bool is_owner_of(const rgw_user& uid) const {return uid.id.compare(RGW_USER_ANON_ID) == 0;} +}; + +class SwiftAnonymousEngine : public rgw::auth::AnonymousEngine { + const rgw::auth::TokenExtractor* const extractor; + + bool is_applicable(const req_state* s) const noexcept override { + return extractor->get_token(s).empty(); + } + +public: + SwiftAnonymousEngine(CephContext* const cct, + const SwiftAnonymousApplier::Factory* const apl_factory, + const rgw::auth::TokenExtractor* const extractor) + : AnonymousEngine(cct, apl_factory), + extractor(extractor) { + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::SwiftAnonymousEngine"; + } +}; + + +class DefaultStrategy : public rgw::auth::Strategy, + public rgw::auth::TokenExtractor, + public rgw::auth::RemoteApplier::Factory, + public rgw::auth::LocalApplier::Factory, + public rgw::auth::swift::TempURLApplier::Factory { + RGWRados* const store; + ImplicitTenants& implicit_tenant_context; + + /* The engines. 
*/ + const rgw::auth::swift::TempURLEngine tempurl_engine; + const rgw::auth::swift::SignedTokenEngine signed_engine; + boost::optional keystone_engine; + const rgw::auth::swift::ExternalTokenEngine external_engine; + const rgw::auth::swift::SwiftAnonymousEngine anon_engine; + + using keystone_config_t = rgw::keystone::CephCtxConfig; + using keystone_cache_t = rgw::keystone::TokenCache; + using aplptr_t = rgw::auth::IdentityApplier::aplptr_t; + using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t; + + /* The method implements TokenExtractor for X-Auth-Token present in req_state. */ + std::string get_token(const req_state* const s) const override { + /* Returning a reference here would end in GCC complaining about a reference + * to temporary. */ + return s->info.env->get("HTTP_X_AUTH_TOKEN", ""); + } + + aplptr_t create_apl_remote(CephContext* const cct, + const req_state* const s, + acl_strategy_t&& extra_acl_strategy, + const rgw::auth::RemoteApplier::AuthInfo &info) const override { + auto apl = \ + rgw::auth::add_3rdparty(store, rgw_user(s->account_name), + rgw::auth::add_sysreq(cct, store, s, + rgw::auth::RemoteApplier(cct, store, std::move(extra_acl_strategy), info, + implicit_tenant_context, + rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_SWIFT))); + /* TODO(rzarzynski): replace with static_ptr. */ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_local(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info, + const std::string& subuser, + const boost::optional& perm_mask) const override { + auto apl = \ + rgw::auth::add_3rdparty(store, rgw_user(s->account_name), + rgw::auth::add_sysreq(cct, store, s, + rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask))); + /* TODO(rzarzynski): replace with static_ptr. 
*/ + return aplptr_t(new decltype(apl)(std::move(apl))); + } + + aplptr_t create_apl_turl(CephContext* const cct, + const req_state* const s, + const RGWUserInfo& user_info) const override { + /* TempURL doesn't need any user account override. It's a Swift-specific + * mechanism that requires account name internally, so there is no + * business with delegating the responsibility outside. */ + return aplptr_t(new rgw::auth::swift::TempURLApplier(cct, user_info)); + } + +public: + DefaultStrategy(CephContext* const cct, + ImplicitTenants& implicit_tenant_context, + RGWRados* const store) + : store(store), + implicit_tenant_context(implicit_tenant_context), + tempurl_engine(cct, + store, + static_cast(this)), + signed_engine(cct, + store, + static_cast(this), + static_cast(this)), + external_engine(cct, + store, + static_cast(this), + static_cast(this)), + anon_engine(cct, + static_cast(this), + static_cast(this)) { + /* When the constructor's body is being executed, all member engines + * should be initialized. Thus, we can safely add them. */ + using Control = rgw::auth::Strategy::Control; + + add_engine(Control::SUFFICIENT, tempurl_engine); + add_engine(Control::SUFFICIENT, signed_engine); + + /* The auth strategy is responsible for deciding whether a parcular + * engine is disabled or not. */ + if (! cct->_conf->rgw_keystone_url.empty()) { + keystone_engine.emplace(cct, + static_cast(this), + static_cast(this), + keystone_config_t::get_instance(), + keystone_cache_t::get_instance()); + + add_engine(Control::SUFFICIENT, *keystone_engine); + } + if (! 
cct->_conf->rgw_swift_auth_url.empty()) { + add_engine(Control::SUFFICIENT, external_engine); + } + + add_engine(Control::SUFFICIENT, anon_engine); + } + + const char* get_name() const noexcept override { + return "rgw::auth::swift::DefaultStrategy"; + } +}; + +} /* namespace swift */ +} /* namespace auth */ +} /* namespace rgw */ + + +class RGW_SWIFT_Auth_Get : public RGWOp { +public: + RGW_SWIFT_Auth_Get() {} + ~RGW_SWIFT_Auth_Get() override {} + + int verify_permission() override { return 0; } + void execute() override; + const char* name() const override { return "swift_auth_get"; } + dmc::client_id dmclock_client() override { return dmc::client_id::auth; } +}; + +class RGWHandler_SWIFT_Auth : public RGWHandler_REST { +public: + RGWHandler_SWIFT_Auth() {} + ~RGWHandler_SWIFT_Auth() override {} + RGWOp *op_get() override; + + int init(RGWRados *store, struct req_state *state, rgw::io::BasicClient *cio) override; + int authorize(const DoutPrefixProvider *dpp) override; + int postauth_init() override { return 0; } + int read_permissions(RGWOp *op) override { return 0; } + + virtual RGWAccessControlPolicy *alloc_policy() { return NULL; } + virtual void free_policy(RGWAccessControlPolicy *policy) {} +}; + +class RGWRESTMgr_SWIFT_Auth : public RGWRESTMgr { +public: + RGWRESTMgr_SWIFT_Auth() = default; + ~RGWRESTMgr_SWIFT_Auth() override = default; + + RGWRESTMgr *get_resource_mgr(struct req_state* const s, + const std::string& uri, + std::string* const out_uri) override { + return this; + } + + RGWHandler_REST* get_handler(struct req_state*, + const rgw::auth::StrategyRegistry&, + const std::string&) override { + return new RGWHandler_SWIFT_Auth; + } +}; + + +#endif diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc new file mode 100644 index 00000000..b0e95959 --- /dev/null +++ b/src/rgw/rgw_sync.cc @@ -0,0 +1,3136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "common/ceph_json.h" 
+#include "common/RWLock.h" +#include "common/RefCountedObj.h" +#include "common/WorkQueue.h" +#include "common/Throttle.h" +#include "common/admin_socket.h" +#include "common/errno.h" + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_sync.h" +#include "rgw_metadata.h" +#include "rgw_rest_conn.h" +#include "rgw_tools.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_http_client.h" +#include "rgw_sync_trace.h" + +#include "cls/lock/cls_lock_client.h" + +#include "services/svc_zone.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "meta sync: ") + +static string mdlog_sync_status_oid = "mdlog.sync-status"; +static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard"; +static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index"; + +RGWSyncErrorLogger::RGWSyncErrorLogger(RGWRados *_store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) { + for (int i = 0; i < num_shards; i++) { + oids.push_back(get_shard_oid(oid_prefix, i)); + } +} +string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) { + char buf[oid_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id); + return string(buf); +} + +RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) { + cls_log_entry entry; + + rgw_sync_error_info info(source_zone, error_code, message); + bufferlist bl; + encode(info, bl); + store->time_log_prepare_entry(entry, real_clock::now(), section, name, bl); + + uint32_t shard_id = ++counter % num_shards; + + + return new RGWRadosTimelogAddCR(store, oids[shard_id], entry); +} + +void RGWSyncBackoff::update_wait_time() +{ + if (cur_wait == 0) { + cur_wait = 1; + } else { + cur_wait = (cur_wait << 1); + } + if (cur_wait >= max_secs) { + cur_wait = 
max_secs; + } +} + +void RGWSyncBackoff::backoff_sleep() +{ + update_wait_time(); + sleep(cur_wait); +} + +void RGWSyncBackoff::backoff(RGWCoroutine *op) +{ + update_wait_time(); + op->wait(utime_t(cur_wait, 0)); +} + +int RGWBackoffControlCR::operate() { + reenter(this) { + // retry the operation until it succeeds + while (true) { + yield { + Mutex::Locker l(lock); + cr = alloc_cr(); + cr->get(); + call(cr); + } + { + Mutex::Locker l(lock); + cr->put(); + cr = NULL; + } + if (retcode >= 0) { + break; + } + if (retcode != -EBUSY && retcode != -EAGAIN) { + ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl; + if (exit_on_error) { + return set_cr_error(retcode); + } + } + if (reset_backoff) { + backoff.reset(); + } + yield backoff.backoff(this); + } + + // run an optional finisher + yield call(alloc_finisher_cr()); + if (retcode < 0) { + ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +void rgw_mdlog_info::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("num_objects", num_shards, obj); + JSONDecoder::decode_json("period", period, obj); + JSONDecoder::decode_json("realm_epoch", realm_epoch, obj); +} + +void rgw_mdlog_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("section", section, obj); + JSONDecoder::decode_json("name", name, obj); + utime_t ut; + JSONDecoder::decode_json("timestamp", ut, obj); + timestamp = ut.to_real_time(); + JSONDecoder::decode_json("data", log_data, obj); +} + +void rgw_mdlog_shard_data::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + JSONDecoder::decode_json("entries", entries, obj); +}; + +int RGWShardCollectCR::operate() { + reenter(this) { + while (spawn_next()) { + current_running++; + + while (current_running >= max_concurrent) 
{ + int child_ret; + yield wait_for_child(); + if (collect_next(&child_ret)) { + current_running--; + if (child_ret < 0 && child_ret != -ENOENT) { + ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl; + status = child_ret; + } + } + } + } + while (current_running > 0) { + int child_ret; + yield wait_for_child(); + if (collect_next(&child_ret)) { + current_running--; + if (child_ret < 0 && child_ret != -ENOENT) { + ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl; + status = child_ret; + } + } + } + if (status < 0) { + return set_cr_error(status); + } + return set_cr_done(); + } + return 0; +} + +class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR { + RGWMetaSyncEnv *sync_env; + + const std::string& period; + int num_shards; + map *mdlog_info; + + int shard_id; +#define READ_MDLOG_MAX_CONCURRENT 10 + +public: + RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env, + const std::string& period, int _num_shards, + map *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT), + sync_env(_sync_env), + period(period), num_shards(_num_shards), + mdlog_info(_mdlog_info), shard_id(0) {} + bool spawn_next() override; +}; + +class RGWListRemoteMDLogCR : public RGWShardCollectCR { + RGWMetaSyncEnv *sync_env; + + const std::string& period; + map shards; + int max_entries_per_shard; + map *result; + + map::iterator iter; +#define READ_MDLOG_MAX_CONCURRENT 10 + +public: + RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env, + const std::string& period, map& _shards, + int _max_entries_per_shard, + map *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT), + sync_env(_sync_env), period(period), + max_entries_per_shard(_max_entries_per_shard), + result(_result) { + shards.swap(_shards); + iter = shards.begin(); + } + bool spawn_next() override; +}; + +RGWRemoteMetaLog::~RGWRemoteMetaLog() +{ + delete error_logger; +} + +int 
RGWRemoteMetaLog::read_log_info(rgw_mdlog_info *log_info)
+{
+  /* Fetch the metadata log info from the connection's /admin/log. */
+  rgw_http_param_pair pairs[] = { { "type", "metadata" },
+                                  { NULL, NULL } };
+
+  int ret = conn->get_json_resource("/admin/log", pairs, *log_info);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
+
+  return 0;
+}
+
+/* Read the per-shard mdlog info of the master for `master_period`.
+ * A no-op returning 0 when this zone is the metadata master itself.
+ * NOTE(review): the map template arguments were lost in extraction; they are
+ * restored from the RGWReadRemoteMDLogInfoCR usage -- confirm upstream. */
+int RGWRemoteMetaLog::read_master_log_shards_info(const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
+{
+  if (store->svc.zone->is_meta_master()) {
+    return 0;
+  }
+
+  rgw_mdlog_info log_info;
+  int ret = read_log_info(&log_info);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return run(new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
+}
+
+/* List one entry per shard starting at the given markers.
+ * A no-op returning 0 when this zone is the metadata master itself.
+ * NOTE(review): the map template arguments were lost in extraction; they are
+ * restored from the RGWListRemoteMDLogCR usage -- confirm upstream. */
+int RGWRemoteMetaLog::read_master_log_shards_next(const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
+{
+  if (store->svc.zone->is_meta_master()) {
+    return 0;
+  }
+
+  return run(new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
+}
+
+/* Pick up the master connection, start the HTTP manager, and create the
+ * error logger, sync environment and trace node. */
+int RGWRemoteMetaLog::init()
+{
+  conn = store->svc.zone->get_master_conn();
+
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+
+  error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+  init_sync_env(&sync_env);
+
+  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta");
+
+  return 0;
+}
+
+void RGWRemoteMetaLog::finish()
+{
+  going_down = true;
+  stop();
+}
+
+#define CLONE_MAX_ENTRIES 100
+
+/* Prepare the status manager: open the log pool, initialize the remote log,
+ * read the stored sync status and set up the per-shard bookkeeping.
+ * Returns 0 right away on the metadata master; -EIO when there is no REST
+ * connection to the master zone; otherwise the first error encountered. */
+int RGWMetaSyncStatusManager::init()
+{
+  if (store->svc.zone->is_meta_master()) {
+    return 0;
+  }
+
+  if (!store->svc.zone->get_master_conn()) {
+    lderr(store->ctx()) << "no REST connection to master zone" << dendl;
+    return -EIO;
+  }
+
+  int r = rgw_init_ioctx(store->get_rados_handle(), store->svc.zone->get_zone_params().log_pool, ioctx, true);
+  if (r < 0) {
+    /* The message used to leave its opening parenthesis unbalanced. */
+    lderr(store->ctx()) << "ERROR: failed to open log pool (" << store->svc.zone->get_zone_params().log_pool << ") ret=" << r << dendl;
+    return r;
+  }
+
+  r = master_log.init();
+  if (r < 0) {
+    lderr(store->ctx()) << "ERROR: failed to init remote log, r=" << r << dendl;
+    return r;
+  }
+
+  RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
+
+  /* A missing status object (-ENOENT) is not treated as an error. */
+  rgw_meta_sync_status sync_status;
+  r = read_sync_status(&sync_status);
+  if (r < 0 && r != -ENOENT) {
+    lderr(store->ctx()) << "ERROR: failed to read sync status, r=" << r << dendl;
+    return r;
+  }
+
+  int num_shards = sync_status.sync_info.num_shards;
+
+  for (int i = 0; i < num_shards; i++) {
+    shard_objs[i] = rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env.shard_obj_name(i));
+  }
+
+  RWLock::WLocker wl(ts_to_shard_lock);
+  for (int i = 0; i < num_shards; i++) {
+    clone_markers.push_back(string());
+    utime_shard ut;
+    ut.shard_id = i;
+    ts_to_shard[ut] = i;
+  }
+
+  return 0;
+}
+
+unsigned RGWMetaSyncStatusManager::get_subsys() const
+{
+  return dout_subsys;
+}
+
+std::ostream& RGWMetaSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+  return out << "meta sync: ";
+}
+
+/* Plain setter: store every collaborator pointer in the environment. */
+void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWRados *_store, RGWRESTConn *_conn,
+                          RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+                          RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) {
+  dpp = _dpp;
+  cct = _cct;
+  store = _store;
+  conn = _conn;
+  async_rados = _async_rados;
+  http_manager = _http_manager;
+  error_logger = _error_logger;
+  sync_tracer = _sync_tracer;
+}
+
+string RGWMetaSyncEnv::status_oid()
+{
+  return mdlog_sync_status_oid;
+}
+
+/* Return "<mdlog_sync_status_shard_prefix>.<shard_id>".
+ * Built with std::string instead of a variable-length char array, which is
+ * a compiler extension rather than standard C++. */
+string RGWMetaSyncEnv::shard_obj_name(int shard_id)
+{
+  return mdlog_sync_status_shard_prefix + "." + std::to_string(shard_id);
+}
+
+class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
+  RGWRados *store;
RGWMetadataLog *mdlog; + int shard_id; + int max_entries; + +protected: + int _send_request() override { + real_time from_time; + real_time end_time; + + void *handle; + + mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle); + + int ret = mdlog->list_entries(handle, max_entries, entries, &marker, &truncated); + + mdlog->complete_list_entries(handle); + + return ret; + } +public: + string marker; + list entries; + bool truncated; + + RGWAsyncReadMDLogEntries(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + RGWMetadataLog* mdlog, int _shard_id, + std::string _marker, int _max_entries) + : RGWAsyncRadosRequest(caller, cn), store(_store), mdlog(mdlog), + shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {} +}; + +class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + RGWMetadataLog *const mdlog; + int shard_id; + string marker; + string *pmarker; + int max_entries; + list *entries; + bool *truncated; + + RGWAsyncReadMDLogEntries *req{nullptr}; + +public: + RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog, + int _shard_id, string*_marker, int _max_entries, + list *_entries, bool *_truncated) + : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog), + shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries), + entries(_entries), truncated(_truncated) {} + + ~RGWReadMDLogEntriesCR() override { + if (req) { + req->finish(); + } + } + + int send_request() override { + marker = *pmarker; + req = new RGWAsyncReadMDLogEntries(this, stack->create_completion_notifier(), + sync_env->store, mdlog, shard_id, marker, + max_entries); + sync_env->async_rados->queue(req); + return 0; + } + + int request_complete() override { + *pmarker = std::move(req->marker); + *entries = std::move(req->entries); + *truncated = req->truncated; + return req->get_ret_status(); + } +}; + + +class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine { + 
RGWMetaSyncEnv *env; + RGWRESTReadResource *http_op; + + const std::string& period; + int shard_id; + RGWMetadataLogInfo *shard_info; + +public: + RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period, + int _shard_id, RGWMetadataLogInfo *_shard_info) + : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL), + period(period), shard_id(_shard_id), shard_info(_shard_info) {} + + int operate() override { + auto store = env->store; + RGWRESTConn *conn = store->svc.zone->get_master_conn(); + reenter(this) { + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "metadata" }, + { "id", buf }, + { "period", period.c_str() }, + { "info" , NULL }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, + env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(); + if (ret < 0) { + ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + int ret = http_op->wait(shard_info); + http_op->put(); + if (ret < 0) { + return set_cr_error(ret); + } + return set_cr_done(); + } + } + return 0; + } +}; + +class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + RGWRESTReadResource *http_op; + + const std::string& period; + int shard_id; + string marker; + uint32_t max_entries; + rgw_mdlog_shard_data *result; + +public: + RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period, + int _shard_id, const string& _marker, uint32_t _max_entries, + rgw_mdlog_shard_data *_result) + : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL), + period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {} + + int send_request() 
override { + RGWRESTConn *conn = sync_env->conn; + + char buf[32]; + snprintf(buf, sizeof(buf), "%d", shard_id); + + char max_entries_buf[32]; + snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries); + + const char *marker_key = (marker.empty() ? "" : "marker"); + + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "id", buf }, + { "period", period.c_str() }, + { "max-entries", max_entries_buf }, + { marker_key, marker.c_str() }, + { NULL, NULL } }; + + string p = "/admin/log/"; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager); + init_new_io(http_op); + + int ret = http_op->aio_read(); + if (ret < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return ret; + } + + return 0; + } + + int request_complete() override { + int ret = http_op->wait(result); + http_op->put(); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl; + return ret; + } + return 0; + } +}; + +bool RGWReadRemoteMDLogInfoCR::spawn_next() { + if (shard_id >= num_shards) { + return false; + } + spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false); + shard_id++; + return true; +} + +bool RGWListRemoteMDLogCR::spawn_next() { + if (iter == shards.end()) { + return false; + } + + spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false); + ++iter; + return true; +} + +class RGWInitSyncStatusCoroutine : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + rgw_meta_sync_info status; + vector shards_info; + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; +public: + RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env, + const rgw_meta_sync_info &status) + : 
RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env), + status(status), shards_info(status.num_shards), + lease_cr(nullptr), lease_stack(nullptr) {} + + ~RGWInitSyncStatusCoroutine() override { + if (lease_cr) { + lease_cr->abort(); + } + } + + int operate() override { + int ret; + reenter(this) { + yield { + set_status("acquiring sync lock"); + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + RGWRados *store = sync_env->store; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()), + lock_name, lock_duration, this)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + ldpp_dout(sync_env->dpp, 5) << "lease cr failed, done early " << dendl; + set_status("lease lock failed, early abort"); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + yield { + set_status("writing sync status"); + RGWRados *store = sync_env->store; + call(new RGWSimpleRadosWriteCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()), + status)); + } + + if (retcode < 0) { + set_status("failed to write sync status"); + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl; + yield lease_cr->go_down(); + return set_cr_error(retcode); + } + /* fetch current position in logs */ + set_status("fetching remote log position"); + yield { + for (int i = 0; i < (int)status.num_shards; i++) { + spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i, + &shards_info[i]), false); + } + } + + drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */ + + yield { + set_status("updating sync status"); + for (int i = 0; i < (int)status.num_shards; i++) { + rgw_meta_sync_marker marker; + RGWMetadataLogInfo& info = 
shards_info[i]; + marker.next_step_marker = info.marker; + marker.timestamp = info.last_update; + RGWRados *store = sync_env->store; + spawn(new RGWSimpleRadosWriteCR(sync_env->async_rados, + store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)), + marker), true); + } + } + yield { + set_status("changing sync state: build full sync maps"); + status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps; + RGWRados *store = sync_env->store; + call(new RGWSimpleRadosWriteCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()), + status)); + } + set_status("drop lock lease"); + yield lease_cr->go_down(); + while (collect(&ret, NULL)) { + if (ret < 0) { + return set_cr_error(ret); + } + yield; + } + drain_all(); + return set_cr_done(); + } + return 0; + } +}; + +class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + RGWMetaSyncEnv *env; + const int num_shards; + int shard_id{0}; + map& markers; + + public: + RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards, + map& markers) + : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS), + env(env), num_shards(num_shards), markers(markers) + {} + bool spawn_next() override; +}; + +bool RGWReadSyncStatusMarkersCR::spawn_next() +{ + if (shard_id >= num_shards) { + return false; + } + using CR = RGWSimpleRadosReadCR; + rgw_raw_obj obj{env->store->svc.zone->get_zone_params().log_pool, + env->shard_obj_name(shard_id)}; + spawn(new CR(env->async_rados, env->store->svc.sysobj, obj, &markers[shard_id]), false); + shard_id++; + return true; +} + +class RGWReadSyncStatusCoroutine : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + rgw_meta_sync_status *sync_status; + +public: + RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env, + rgw_meta_sync_status *_status) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status) + 
{} + int operate() override; +}; + +int RGWReadSyncStatusCoroutine::operate() +{ + reenter(this) { + // read sync info + using ReadInfoCR = RGWSimpleRadosReadCR; + yield { + bool empty_on_enoent = false; // fail on ENOENT + rgw_raw_obj obj{sync_env->store->svc.zone->get_zone_params().log_pool, + sync_env->status_oid()}; + call(new ReadInfoCR(sync_env->async_rados, sync_env->store->svc.sysobj, obj, + &sync_status->sync_info, empty_on_enoent)); + } + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 4) << "failed to read sync status info with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + // read shard markers + using ReadMarkersCR = RGWReadSyncStatusMarkersCR; + yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards, + sync_status->sync_markers)); + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 4) << "failed to read sync status markers with " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +class RGWFetchAllMetaCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + int num_shards; + + + int ret_status; + + list sections; + list::iterator sections_iter; + + struct meta_list_result { + list keys; + string marker; + uint64_t count{0}; + bool truncated{false}; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("keys", keys, obj); + JSONDecoder::decode_json("marker", marker, obj); + JSONDecoder::decode_json("count", count, obj); + JSONDecoder::decode_json("truncated", truncated, obj); + } + } result; + list::iterator iter; + + std::unique_ptr entries_index; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + bool lost_lock; + bool failed; + + string marker; + + map& markers; + + RGWSyncTraceNodeRef tn; + +public: + RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards, + map& _markers, + RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + num_shards(_num_shards), + ret_status(0), 
lease_cr(nullptr), lease_stack(nullptr), + lost_lock(false), failed(false), markers(_markers) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta"); + } + + ~RGWFetchAllMetaCR() override { + } + + void append_section_from_set(set& all_sections, const string& name) { + set::iterator iter = all_sections.find(name); + if (iter != all_sections.end()) { + sections.emplace_back(std::move(*iter)); + all_sections.erase(iter); + } + } + /* + * meta sync should go in the following order: user, bucket.instance, bucket + * then whatever other sections exist (if any) + */ + void rearrange_sections() { + set all_sections; + std::move(sections.begin(), sections.end(), + std::inserter(all_sections, all_sections.end())); + sections.clear(); + + append_section_from_set(all_sections, "user"); + append_section_from_set(all_sections, "bucket.instance"); + append_section_from_set(all_sections, "bucket"); + + std::move(all_sections.begin(), all_sections.end(), + std::back_inserter(sections)); + } + + int operate() override { + RGWRESTConn *conn = sync_env->conn; + + reenter(this) { + yield { + set_status(string("acquiring lock (") + sync_env->status_oid() + ")"); + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, + sync_env->store, + rgw_raw_obj(sync_env->store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()), + lock_name, lock_duration, this)); + lease_stack.reset(spawn(lease_cr.get(), false)); + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + ldpp_dout(sync_env->dpp, 5) << "lease cr failed, done early " << dendl; + set_status("failed acquiring lock"); + return set_cr_error(lease_cr->get_ret_status()); + } + set_sleeping(true); + yield; + } + entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards, + sync_env->store->svc.zone->get_zone_params().log_pool, + 
mdlog_sync_full_sync_index_prefix)); + yield { + call(new RGWReadRESTResourceCR >(cct, conn, sync_env->http_manager, + "/admin/metadata", NULL, §ions)); + } + if (get_ret_status() < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl; + yield entries_index->finish(); + yield lease_cr->go_down(); + drain_all(); + return set_cr_error(get_ret_status()); + } + rearrange_sections(); + sections_iter = sections.begin(); + for (; sections_iter != sections.end(); ++sections_iter) { + do { + yield { +#define META_FULL_SYNC_CHUNK_SIZE "1000" + string entrypoint = string("/admin/metadata/") + *sections_iter; + rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE }, + { "marker", result.marker.c_str() }, + { NULL, NULL } }; + result.keys.clear(); + call(new RGWReadRESTResourceCR(cct, conn, sync_env->http_manager, + entrypoint, pairs, &result)); + } + ret_status = get_ret_status(); + if (ret_status == -ENOENT) { + set_retcode(0); /* reset coroutine status so that we don't return it */ + ret_status = 0; + } + if (ret_status < 0) { + tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter)); + yield entries_index->finish(); + yield lease_cr->go_down(); + drain_all(); + return set_cr_error(ret_status); + } + iter = result.keys.begin(); + for (; iter != result.keys.end(); ++iter) { + if (!lease_cr->is_locked()) { + lost_lock = true; + break; + } + yield; // allow entries_index consumer to make progress + + tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter)); + string s = *sections_iter + ":" + *iter; + int shard_id; + RGWRados *store = sync_env->store; + int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id); + if (ret < 0) { + tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter)); + ret_status = ret; + break; + } + if (!entries_index->append(s, shard_id)) { + break; + } + } + } while (result.truncated); + } 
+ yield { + if (!entries_index->finish()) { + failed = true; + } + } + if (!failed) { + for (map::iterator iter = markers.begin(); iter != markers.end(); ++iter) { + int shard_id = (int)iter->first; + rgw_meta_sync_marker& marker = iter->second; + marker.total_entries = entries_index->get_total_entries(shard_id); + spawn(new RGWSimpleRadosWriteCR(sync_env->async_rados, sync_env->store->svc.sysobj, + rgw_raw_obj(sync_env->store->svc.zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)), + marker), true); + } + } + + drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */ + + yield lease_cr->go_down(); + + int ret; + while (collect(&ret, NULL)) { + if (ret < 0) { + return set_cr_error(ret); + } + yield; + } + drain_all(); + if (failed) { + yield return set_cr_error(-EIO); + } + if (lost_lock) { + yield return set_cr_error(-EBUSY); + } + + if (ret_status < 0) { + yield return set_cr_error(ret_status); + } + + yield return set_cr_done(); + } + return 0; + } +}; + +static string full_sync_index_shard_oid(int shard_id) +{ + char buf[mdlog_sync_full_sync_index_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id); + return string(buf); +} + +class RGWReadRemoteMetadataCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + RGWRESTReadResource *http_op; + + string section; + string key; + + bufferlist *pbl; + + RGWSyncTraceNodeRef tn; + +public: + RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env, + const string& _section, const string& _key, bufferlist *_pbl, + const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + http_op(NULL), + section(_section), + key(_key), + pbl(_pbl) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta", + section + ":" + key); + } + + int operate() override { + RGWRESTConn *conn = sync_env->conn; + reenter(this) { + yield { + rgw_http_param_pair pairs[] = { { "key" , key.c_str()}, + { NULL, 
NULL } }; + + string p = string("/admin/metadata/") + section + "/" + key; + + http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(); + if (ret < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + int ret = http_op->wait(pbl); + http_op->put(); + if (ret < 0) { + return set_cr_error(ret); + } + return set_cr_done(); + } + } + return 0; + } +}; + +class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest { + RGWRados *store; + string raw_key; + bufferlist bl; +protected: + int _send_request() override { + int ret = store->meta_mgr->put(raw_key, bl, RGWMetadataHandler::APPLY_ALWAYS); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl; + return ret; + } + return 0; + } +public: + RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + const string& _raw_key, + bufferlist& _bl) : RGWAsyncRadosRequest(caller, cn), store(_store), + raw_key(_raw_key), bl(_bl) {} +}; + + +class RGWMetaStoreEntryCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + string raw_key; + bufferlist bl; + + RGWAsyncMetaStoreEntry *req; + +public: + RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env, + const string& _raw_key, + bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), + raw_key(_raw_key), bl(_bl), req(NULL) { + } + + ~RGWMetaStoreEntryCR() override { + if (req) { + req->finish(); + } + } + + int send_request() override { + req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(), + sync_env->store, raw_key, bl); + sync_env->async_rados->queue(req); + return 0; + } + + int request_complete() override { + return req->get_ret_status(); + } 
+}; + +class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest { + RGWRados *store; + string raw_key; +protected: + int _send_request() override { + int ret = store->meta_mgr->remove(raw_key); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl; + return ret; + } + return 0; + } +public: + RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store, + const string& _raw_key) : RGWAsyncRadosRequest(caller, cn), store(_store), + raw_key(_raw_key) {} +}; + + +class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine { + RGWMetaSyncEnv *sync_env; + string raw_key; + + RGWAsyncMetaRemoveEntry *req; + +public: + RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env, + const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), + raw_key(_raw_key), req(NULL) { + } + + ~RGWMetaRemoveEntryCR() override { + if (req) { + req->finish(); + } + } + + int send_request() override { + req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(), + sync_env->store, raw_key); + sync_env->async_rados->queue(req); + return 0; + } + + int request_complete() override { + int r = req->get_ret_status(); + if (r == -ENOENT) { + r = 0; + } + return r; + } +}; + +#define META_SYNC_UPDATE_MARKER_WINDOW 10 + + +int RGWLastCallerWinsCR::operate() { + RGWCoroutine *call_cr; + reenter(this) { + while (cr) { + call_cr = cr; + cr = nullptr; + yield call(call_cr); + /* cr might have been modified at this point */ + } + return set_cr_done(); + } + return 0; +} + +class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack { + RGWMetaSyncEnv *sync_env; + + string marker_oid; + rgw_meta_sync_marker sync_marker; + + RGWSyncTraceNodeRef tn; + +public: + RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env, + const string& _marker_oid, + const rgw_meta_sync_marker& _marker, + RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW), + 
sync_env(_sync_env), + marker_oid(_marker_oid), + sync_marker(_marker), + tn(_tn){} + + RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override { + sync_marker.marker = new_marker; + if (index_pos > 0) { + sync_marker.pos = index_pos; + } + + if (!real_clock::is_zero(timestamp)) { + sync_marker.timestamp = timestamp; + } + + ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl; + tn->log(20, SSTR("new marker=" << new_marker)); + RGWRados *store = sync_env->store; + return new RGWSimpleRadosWriteCR(sync_env->async_rados, + store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, marker_oid), + sync_marker); + } + + RGWOrderCallCR *allocate_order_control_cr() override { + return new RGWLastCallerWinsCR(sync_env->cct); + } +}; + +RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env, + const string& _raw_key, const string& _entry_marker, + const RGWMDLogStatus& _op_status, + RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + raw_key(_raw_key), entry_marker(_entry_marker), + op_status(_op_status), + pos(0), sync_status(0), + marker_tracker(_marker_tracker), tries(0) { + error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0); + tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key); +} + +int RGWMetaSyncSingleEntryCR::operate() { + reenter(this) { +#define NUM_TRANSIENT_ERROR_RETRIES 10 + + if (error_injection && + rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) { + ldpp_dout(sync_env->dpp, 0) << __FILE__ << ":" << __LINE__ << ": injecting meta sync error on key=" << raw_key << dendl; + return set_cr_error(-EIO); + } + + if (op_status != MDLOG_STATUS_COMPLETE) { + tn->log(20, "skipping pending 
operation"); + yield call(marker_tracker->finish(entry_marker)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + tn->set_flag(RGW_SNS_FLAG_ACTIVE); + for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) { + yield { + pos = raw_key.find(':'); + section = raw_key.substr(0, pos); + key = raw_key.substr(pos + 1); + tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? "" : " (retry)"))); + call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn)); + } + + sync_status = retcode; + + if (sync_status == -ENOENT) { + /* FIXME: do we need to remove the entry from the local zone? */ + break; + } + + if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + ldpp_dout(sync_env->dpp, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl; + continue; + } + + if (sync_status < 0) { + tn->log(10, SSTR("failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status)); + log_error() << "failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl; + yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), section, key, -sync_status, + string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status))); + return set_cr_error(sync_status); + } + + break; + } + + retcode = 0; + for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) { + if (sync_status != -ENOENT) { + tn->log(10, SSTR("storing local metadata entry")); + yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl)); + } else { + tn->log(10, SSTR("removing local metadata entry")); + yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key)); + } + if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + ldpp_dout(sync_env->dpp, 20) << *this << ": failed to 
store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl; + continue; + } + break; + } + + sync_status = retcode; + + if (sync_status == 0 && marker_tracker) { + /* update marker */ + yield call(marker_tracker->finish(entry_marker)); + sync_status = retcode; + } + if (sync_status < 0) { + tn->log(10, SSTR("failed, status=" << sync_status)); + return set_cr_error(sync_status); + } + tn->log(10, "success"); + return set_cr_done(); + } + return 0; +} + +class RGWCloneMetaLogCoroutine : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + RGWMetadataLog *mdlog; + + const std::string& period; + int shard_id; + string marker; + bool truncated = false; + string *new_marker; + + int max_entries = CLONE_MAX_ENTRIES; + + RGWRESTReadResource *http_op = nullptr; + boost::intrusive_ptr completion; + + RGWMetadataLogInfo shard_info; + rgw_mdlog_shard_data data; + +public: + RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog, + const std::string& period, int _id, + const string& _marker, string *_new_marker) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog), + period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) { + if (new_marker) { + *new_marker = marker; + } + } + ~RGWCloneMetaLogCoroutine() override { + if (http_op) { + http_op->put(); + } + if (completion) { + completion->cancel(); + } + } + + int operate() override; + + int state_init(); + int state_read_shard_status(); + int state_read_shard_status_complete(); + int state_send_rest_request(); + int state_receive_rest_response(); + int state_store_mdlog_entries(); + int state_store_mdlog_entries_complete(); +}; + +class RGWMetaSyncShardCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + const rgw_pool& pool; + const std::string& period; //< currently syncing period id + const epoch_t realm_epoch; //< realm_epoch of period + RGWMetadataLog* mdlog; //< log of syncing period + uint32_t shard_id; + rgw_meta_sync_marker& sync_marker; + 
boost::optional temp_marker; //< for pending updates + string marker; + string max_marker; + const std::string& period_marker; //< max marker stored in next period + + RGWRadosGetOmapKeysCR::ResultPtr omapkeys; + std::set entries; + std::set::iterator iter; + + string oid; + + RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr; + + list log_entries; + list::iterator log_iter; + bool truncated = false; + + string mdlog_marker; + string raw_key; + rgw_mdlog_entry mdlog_entry; + + Mutex inc_lock; + Cond inc_cond; + + boost::asio::coroutine incremental_cr; + boost::asio::coroutine full_cr; + + boost::intrusive_ptr lease_cr; + boost::intrusive_ptr lease_stack; + + bool lost_lock = false; + + bool *reset_backoff; + + // hold a reference to the cr stack while it's in the map + using StackRef = boost::intrusive_ptr; + map stack_to_pos; + map pos_to_prev; + + bool can_adjust_marker = false; + bool done_with_period = false; + + int total_entries = 0; + + RGWSyncTraceNodeRef tn; +public: + RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool, + const std::string& period, epoch_t realm_epoch, + RGWMetadataLog* mdlog, uint32_t _shard_id, + rgw_meta_sync_marker& _marker, + const std::string& period_marker, bool *_reset_backoff, + RGWSyncTraceNodeRef& _tn) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool), + period(period), realm_epoch(realm_epoch), mdlog(mdlog), + shard_id(_shard_id), sync_marker(_marker), + period_marker(period_marker), inc_lock("RGWMetaSyncShardCR::inc_lock"), + reset_backoff(_reset_backoff), tn(_tn) { + *reset_backoff = false; + } + + ~RGWMetaSyncShardCR() override { + delete marker_tracker; + if (lease_cr) { + lease_cr->abort(); + } + } + + void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) { + delete marker_tracker; + marker_tracker = mt; + } + + int operate() override { + int r; + while (true) { + switch (sync_marker.state) { + case rgw_meta_sync_marker::FullSync: + r = full_sync(); + if (r < 0) { + 
ldpp_dout(sync_env->dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl; + return set_cr_error(r); + } + return 0; + case rgw_meta_sync_marker::IncrementalSync: + r = incremental_sync(); + if (r < 0) { + ldpp_dout(sync_env->dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl; + return set_cr_error(r); + } + return 0; + } + } + /* unreachable */ + return 0; + } + + void collect_children() + { + int child_ret; + RGWCoroutinesStack *child; + while (collect_next(&child_ret, &child)) { + auto iter = stack_to_pos.find(child); + if (iter == stack_to_pos.end()) { + /* some other stack that we don't care about */ + continue; + } + + string& pos = iter->second; + + if (child_ret < 0) { + ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl; + } + + map::iterator prev_iter = pos_to_prev.find(pos); + ceph_assert(prev_iter != pos_to_prev.end()); + + /* + * we should get -EAGAIN for transient errors, for which we want to retry, so we don't + * update the marker and abort. We'll get called again for these. 
Permanent errors will be + * handled by marking the entry at the error log shard, so that we retry on it separately + */ + if (child_ret == -EAGAIN) { + can_adjust_marker = false; + } + + if (pos_to_prev.size() == 1) { + if (can_adjust_marker) { + sync_marker.marker = pos; + } + pos_to_prev.erase(prev_iter); + } else { + ceph_assert(pos_to_prev.size() > 1); + pos_to_prev.erase(prev_iter); + prev_iter = pos_to_prev.begin(); + if (can_adjust_marker) { + sync_marker.marker = prev_iter->second; + } + } + + ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl; + stack_to_pos.erase(iter); + } + } + + int full_sync() { +#define OMAP_GET_MAX_ENTRIES 100 + int max_entries = OMAP_GET_MAX_ENTRIES; + reenter(&full_cr) { + set_status("full_sync"); + tn->log(10, "start full sync"); + oid = full_sync_index_shard_oid(shard_id); + can_adjust_marker = true; + /* grab lock */ + yield { + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + RGWRados *store = sync_env->store; + lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + lock_name, lock_duration, this)); + lease_stack.reset(spawn(lease_cr.get(), false)); + lost_lock = false; + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + drain_all(); + tn->log(5, "failed to take lease"); + return lease_cr->get_ret_status(); + } + set_sleeping(true); + yield; + } + tn->log(10, "took lease"); + + /* lock succeeded, a retry now should avoid previous backoff status */ + *reset_backoff = true; + + /* prepare marker tracker */ + set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env, + sync_env->shard_obj_name(shard_id), + sync_marker, tn)); + + marker = sync_marker.marker; + + total_entries = sync_marker.pos; + + /* sync! 
*/ + do { + if (!lease_cr->is_locked()) { + tn->log(10, "lost lease"); + lost_lock = true; + break; + } + omapkeys = std::make_shared(); + yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid), + marker, max_entries, omapkeys)); + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl; + tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode)); + yield lease_cr->go_down(); + drain_all(); + return retcode; + } + entries = std::move(omapkeys->entries); + tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync")); + if (entries.size() > 0) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + } + iter = entries.begin(); + for (; iter != entries.end(); ++iter) { + marker = *iter; + tn->log(20, SSTR("full sync: " << marker)); + total_entries++; + if (!marker_tracker->start(marker, total_entries, real_time())) { + tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". 
Duplicate entry?")); + } else { + // fetch remote and write locally + yield { + RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false); + // stack_to_pos holds a reference to the stack + stack_to_pos[stack] = marker; + pos_to_prev[marker] = marker; + } + } + } + collect_children(); + } while (omapkeys->more && can_adjust_marker); + + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + + while (num_spawned() > 1) { + yield wait_for_child(); + collect_children(); + } + + if (!lost_lock) { + /* update marker to reflect we're done with full sync */ + if (can_adjust_marker) { + // apply updates to a temporary marker, or operate() will send us + // to incremental_sync() after we yield + temp_marker = sync_marker; + temp_marker->state = rgw_meta_sync_marker::IncrementalSync; + temp_marker->marker = std::move(temp_marker->next_step_marker); + temp_marker->next_step_marker.clear(); + temp_marker->realm_epoch = realm_epoch; + ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl; + + using WriteMarkerCR = RGWSimpleRadosWriteCR; + yield call(new WriteMarkerCR(sync_env->async_rados, sync_env->store->svc.sysobj, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + *temp_marker)); + } + + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl; + yield lease_cr->go_down(); + drain_all(); + return retcode; + } + } + + /* + * if we reached here, it means that lost_lock is true, otherwise the state + * change in the previous block will prevent us from reaching here + */ + + yield lease_cr->go_down(); + + lease_cr.reset(); + + drain_all(); + + if (!can_adjust_marker) { + return -EAGAIN; + } + + if (lost_lock) { + return -EBUSY; + } + + tn->log(10, "full sync complete"); + + // apply the sync marker update + ceph_assert(temp_marker); + 
sync_marker = std::move(*temp_marker); + temp_marker = boost::none; + // must not yield after this point! + } + return 0; + } + + + int incremental_sync() { + reenter(&incremental_cr) { + set_status("incremental_sync"); + tn->log(10, "start incremental sync"); + can_adjust_marker = true; + /* grab lock */ + if (!lease_cr) { /* could have had a lease_cr lock from previous state */ + yield { + uint32_t lock_duration = cct->_conf->rgw_sync_lease_period; + string lock_name = "sync_lock"; + RGWRados *store = sync_env->store; + lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + lock_name, lock_duration, this)); + lease_stack.reset(spawn(lease_cr.get(), false)); + lost_lock = false; + } + while (!lease_cr->is_locked()) { + if (lease_cr->is_done()) { + drain_all(); + tn->log(10, "failed to take lease"); + return lease_cr->get_ret_status(); + } + set_sleeping(true); + yield; + } + } + tn->log(10, "took lease"); + // if the period has advanced, we can't use the existing marker + if (sync_marker.realm_epoch < realm_epoch) { + ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker + << " from old realm_epoch=" << sync_marker.realm_epoch + << " (now " << realm_epoch << ')' << dendl; + sync_marker.realm_epoch = realm_epoch; + sync_marker.marker.clear(); + } + mdlog_marker = sync_marker.marker; + set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env, + sync_env->shard_obj_name(shard_id), + sync_marker, tn)); + + /* + * mdlog_marker: the remote sync marker positiion + * sync_marker: the local sync marker position + * max_marker: the max mdlog position that we fetched + * marker: the current position we try to sync + * period_marker: the last marker before the next period begins (optional) + */ + marker = max_marker = sync_marker.marker; + /* inc sync */ + do { + if (!lease_cr->is_locked()) { + lost_lock = true; + tn->log(10, "lost lease"); + break; + } +#define 
INCREMENTAL_MAX_ENTRIES 100 + ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl; + if (!period_marker.empty() && period_marker <= mdlog_marker) { + tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker)); + done_with_period = true; + break; + } + if (mdlog_marker <= max_marker) { + /* we're at the tip, try to bring more entries */ + ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl; + yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog, + period, shard_id, + mdlog_marker, &mdlog_marker)); + } + if (retcode < 0) { + tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode)); + yield lease_cr->go_down(); + drain_all(); + *reset_backoff = false; // back off and try again later + return retcode; + } + *reset_backoff = true; /* if we got to this point, all systems function */ + if (mdlog_marker > max_marker) { + tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */ + tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker)); + marker = max_marker; + yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id, + &max_marker, INCREMENTAL_MAX_ENTRIES, + &log_entries, &truncated)); + if (retcode < 0) { + tn->log(10, SSTR("failed to list mdlog entries, retcode=" << retcode)); + yield lease_cr->go_down(); + drain_all(); + *reset_backoff = false; // back off and try again later + return retcode; + } + for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) { + if (!period_marker.empty() && period_marker <= log_iter->id) { + done_with_period = true; + if (period_marker < log_iter->id) { 
+ tn->log(10, SSTR("found key=" << log_iter->id + << " past period_marker=" << period_marker)); + break; + } + ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl; + // sync this entry, then return control to RGWMetaSyncCR + } + if (!mdlog_entry.convert_from(*log_iter)) { + tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry")); + continue; + } + tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp)); + if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl; + } else { + raw_key = log_iter->section + ":" + log_iter->name; + yield { + RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false); + ceph_assert(stack); + // stack_to_pos holds a reference to the stack + stack_to_pos[stack] = log_iter->id; + pos_to_prev[log_iter->id] = marker; + } + } + marker = log_iter->id; + } + } + collect_children(); + ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl; + if (done_with_period) { + // return control to RGWMetaSyncCR and advance to the next period + tn->log(10, SSTR(*this << ": done with period")); + break; + } + if (mdlog_marker == max_marker && can_adjust_marker) { + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); +#define INCREMENTAL_INTERVAL 20 + yield wait(utime_t(INCREMENTAL_INTERVAL, 0)); + } + } while (can_adjust_marker); + + tn->unset_flag(RGW_SNS_FLAG_ACTIVE); + + 
while (num_spawned() > 1) { + yield wait_for_child(); + collect_children(); + } + + yield lease_cr->go_down(); + + drain_all(); + + if (lost_lock) { + return -EBUSY; + } + + if (!can_adjust_marker) { + return -EAGAIN; + } + + return set_cr_done(); + } + /* TODO */ + return 0; + } +}; + +class RGWMetaSyncShardControlCR : public RGWBackoffControlCR +{ + RGWMetaSyncEnv *sync_env; + + const rgw_pool& pool; + const std::string& period; + epoch_t realm_epoch; + RGWMetadataLog* mdlog; + uint32_t shard_id; + rgw_meta_sync_marker sync_marker; + const std::string period_marker; + + RGWSyncTraceNodeRef tn; + + static constexpr bool exit_on_error = false; // retry on all errors +public: + RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool, + const std::string& period, epoch_t realm_epoch, + RGWMetadataLog* mdlog, uint32_t _shard_id, + const rgw_meta_sync_marker& _marker, + std::string&& period_marker, + RGWSyncTraceNodeRef& _tn_parent) + : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env), + pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog), + shard_id(_shard_id), sync_marker(_marker), + period_marker(std::move(period_marker)) { + tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", + std::to_string(shard_id)); + } + + RGWCoroutine *alloc_cr() override { + return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog, + shard_id, sync_marker, period_marker, backoff_ptr(), tn); + } + + RGWCoroutine *alloc_finisher_cr() override { + RGWRados *store = sync_env->store; + return new RGWSimpleRadosReadCR(sync_env->async_rados, store->svc.sysobj, + rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)), + &sync_marker); + } +}; + +class RGWMetaSyncCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + const rgw_pool& pool; + RGWPeriodHistory::Cursor cursor; //< sync position in period history + RGWPeriodHistory::Cursor next; //< next period in history + rgw_meta_sync_status sync_status; + 
RGWSyncTraceNodeRef tn;
+
+  std::mutex mutex; //< protect access to shard_crs
+
+  // TODO: it should be enough to hold a reference on the stack only, as calling
+  // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
+  // already completed
+  // NOTE(review): the template arguments below were lost in extraction and are
+  // restored from their uses (RefPair{cr, stack}, wakeup(int shard_id)).
+  using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
+  using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+  using RefPair = std::pair<ControlCRRef, StackRef>;
+  map<int, RefPair> shard_crs; //< per-shard control cr and its stack, keyed by shard id
+  int ret{0};
+
+public:
+  RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor,
+                const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+      pool(sync_env->store->svc.zone->get_zone_params().log_pool),
+      cursor(cursor), sync_status(_sync_status), tn(_tn) {}
+
+  ~RGWMetaSyncCR() {
+  }
+
+  /// Sync one period at a time: spawn a shard control cr per sync marker,
+  /// wait for them, then advance the cursor and persist the new sync info.
+  int operate() override {
+    reenter(this) {
+      // loop through one period at a time
+      tn->log(1, "start");
+      for (;;) {
+        if (cursor == sync_env->store->period_history->get_current()) {
+          next = RGWPeriodHistory::Cursor{};
+          if (cursor) {
+            ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR on current period="
+                << cursor.get_period().get_id() << dendl;
+          } else {
+            ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR with no period" << dendl;
+          }
+        } else {
+          next = cursor;
+          next.next();
+          ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR on period="
+              << cursor.get_period().get_id() << ", next="
+              << next.get_period().get_id() << dendl;
+        }
+
+        yield {
+          // get the mdlog for the current period (may be empty)
+          auto& period_id = sync_status.sync_info.period;
+          auto realm_epoch = sync_status.sync_info.realm_epoch;
+          auto mdlog = sync_env->store->meta_mgr->get_log(period_id);
+
+          tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id));
+
+          // prevent wakeup() from accessing shard_crs while we're spawning them
+          std::lock_guard<std::mutex> lock(mutex);
+
+          // sync this period on each shard
+          for (const auto& m : sync_status.sync_markers) {
+            uint32_t shard_id = m.first;
+            auto& marker = m.second;
+
+
std::string period_marker; + if (next) { + // read the maximum marker from the next period's sync status + period_marker = next.get_period().get_sync_status()[shard_id]; + if (period_marker.empty()) { + // no metadata changes have occurred on this shard, skip it + ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id + << " with empty period marker" << dendl; + continue; + } + } + + using ShardCR = RGWMetaSyncShardControlCR; + auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch, + mdlog, shard_id, marker, + std::move(period_marker), tn); + auto stack = spawn(cr, false); + shard_crs[shard_id] = RefPair{cr, stack}; + } + } + // wait for each shard to complete + while (ret == 0 && num_spawned() > 0) { + yield wait_for_child(); + collect(&ret, nullptr); + } + drain_all(); + { + // drop shard cr refs under lock + std::lock_guard lock(mutex); + shard_crs.clear(); + } + if (ret < 0) { + return set_cr_error(ret); + } + // advance to the next period + ceph_assert(next); + cursor = next; + + // write the updated sync info + sync_status.sync_info.period = cursor.get_period().get_id(); + sync_status.sync_info.realm_epoch = cursor.get_epoch(); + yield call(new RGWSimpleRadosWriteCR(sync_env->async_rados, + sync_env->store->svc.sysobj, + rgw_raw_obj(pool, sync_env->status_oid()), + sync_status.sync_info)); + } + } + return 0; + } + + void wakeup(int shard_id) { + std::lock_guard lock(mutex); + auto iter = shard_crs.find(shard_id); + if (iter == shard_crs.end()) { + return; + } + iter->second.first->wakeup(); + } +}; + +void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) { + env->dpp = dpp; + env->cct = store->ctx(); + env->store = store; + env->conn = conn; + env->async_rados = async_rados; + env->http_manager = &http_manager; + env->error_logger = error_logger; + env->sync_tracer = store->get_sync_tracer(); +} + +int RGWRemoteMetaLog::read_sync_status(rgw_meta_sync_status *sync_status) +{ + if (store->svc.zone->is_meta_master()) { + return 
0; + } + // cannot run concurrently with run_sync(), so run in a separate manager + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl; + return ret; + } + RGWMetaSyncEnv sync_env_local = sync_env; + sync_env_local.http_manager = &http_manager; + tn->log(20, "read sync status"); + ret = crs.run(new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status)); + http_manager.stop(); + return ret; +} + +int RGWRemoteMetaLog::init_sync_status() +{ + if (store->svc.zone->is_meta_master()) { + return 0; + } + + rgw_mdlog_info mdlog_info; + int r = read_log_info(&mdlog_info); + if (r < 0) { + lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl; + return r; + } + + rgw_meta_sync_info sync_info; + sync_info.num_shards = mdlog_info.num_shards; + auto cursor = store->period_history->get_current(); + if (cursor) { + sync_info.period = cursor.get_period().get_id(); + sync_info.realm_epoch = cursor.get_epoch(); + } + + return run(new RGWInitSyncStatusCoroutine(&sync_env, sync_info)); +} + +int RGWRemoteMetaLog::store_sync_info(const rgw_meta_sync_info& sync_info) +{ + tn->log(20, "store sync info"); + return run(new RGWSimpleRadosWriteCR(async_rados, store->svc.sysobj, + rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env.status_oid()), + sync_info)); +} + +// return a cursor to the period at our sync position +static RGWPeriodHistory::Cursor get_period_at(RGWRados* store, + const rgw_meta_sync_info& info) +{ + if (info.period.empty()) { + // return an empty cursor with error=0 + return RGWPeriodHistory::Cursor{}; + } + + // look for an existing period in our history + auto cursor = store->period_history->lookup(info.realm_epoch); + if (cursor) { + // verify that the period ids match + auto& existing = 
cursor.get_period().get_id(); + if (existing != info.period) { + lderr(store->ctx()) << "ERROR: sync status period=" << info.period + << " does not match period=" << existing + << " in history at realm epoch=" << info.realm_epoch << dendl; + return RGWPeriodHistory::Cursor{-EEXIST}; + } + return cursor; + } + + // read the period from rados or pull it from the master + RGWPeriod period; + int r = store->period_puller->pull(info.period, period); + if (r < 0) { + lderr(store->ctx()) << "ERROR: failed to read period id " + << info.period << ": " << cpp_strerror(r) << dendl; + return RGWPeriodHistory::Cursor{r}; + } + // attach the period to our history + cursor = store->period_history->attach(std::move(period)); + if (!cursor) { + r = cursor.get_error(); + lderr(store->ctx()) << "ERROR: failed to read period history back to " + << info.period << ": " << cpp_strerror(r) << dendl; + } + return cursor; +} + +int RGWRemoteMetaLog::run_sync() +{ + if (store->svc.zone->is_meta_master()) { + return 0; + } + + int r = 0; + + // get shard count and oldest log period from master + rgw_mdlog_info mdlog_info; + for (;;) { + if (going_down) { + ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl; + return 0; + } + r = read_log_info(&mdlog_info); + if (r == -EIO || r == -ENOENT) { + // keep retrying if master isn't alive or hasn't initialized the log + ldpp_dout(dpp, 10) << __func__ << "(): waiting for master.." 
<< dendl; + backoff.backoff_sleep(); + continue; + } + backoff.reset(); + if (r < 0) { + lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl; + return r; + } + break; + } + + rgw_meta_sync_status sync_status; + do { + if (going_down) { + ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl; + return 0; + } + r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status)); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl; + return r; + } + + if (!mdlog_info.period.empty()) { + // restart sync if the remote has a period, but: + // a) our status does not, or + // b) our sync period comes before the remote's oldest log period + if (sync_status.sync_info.period.empty() || + sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) { + sync_status.sync_info.state = rgw_meta_sync_info::StateInit; + string reason; + if (sync_status.sync_info.period.empty()) { + reason = "period is empty"; + } else { + reason = SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch); + } + tn->log(1, "initialize sync (reason: " + reason + ")"); + ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch + << " in sync status comes before remote's oldest mdlog epoch=" + << mdlog_info.realm_epoch << ", restarting sync" << dendl; + } + } + + if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) { + ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl; + sync_status.sync_info.num_shards = mdlog_info.num_shards; + auto cursor = store->period_history->get_current(); + if (cursor) { + // run full sync, then start incremental from the current period/epoch + sync_status.sync_info.period = cursor.get_period().get_id(); + sync_status.sync_info.realm_epoch = cursor.get_epoch(); + } + r = run(new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info)); + if (r == -EBUSY) { + backoff.backoff_sleep(); + continue; + } + 
backoff.reset(); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl; + return r; + } + } + } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit); + + auto num_shards = sync_status.sync_info.num_shards; + if (num_shards != mdlog_info.num_shards) { + lderr(store->ctx()) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl; + return -EINVAL; + } + + RGWPeriodHistory::Cursor cursor; + do { + r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status)); + if (r < 0 && r != -ENOENT) { + tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r)); + return r; + } + + switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) { + case rgw_meta_sync_info::StateBuildingFullSyncMaps: + tn->log(20, "building full sync maps"); + r = run(new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn)); + if (r == -EBUSY || r == -EAGAIN) { + backoff.backoff_sleep(); + continue; + } + backoff.reset(); + if (r < 0) { + tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")")); + return r; + } + + sync_status.sync_info.state = rgw_meta_sync_info::StateSync; + r = store_sync_info(sync_status.sync_info); + if (r < 0) { + tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")")); + return r; + } + /* fall through */ + case rgw_meta_sync_info::StateSync: + tn->log(20, "sync"); + // find our position in the period history (if any) + cursor = get_period_at(store, sync_status.sync_info); + r = cursor.get_error(); + if (r < 0) { + return r; + } + meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn); + r = run(meta_sync_cr); + if (r < 0) { + tn->log(0, "ERROR: failed to fetch all metadata keys"); + return r; + } + break; + default: + tn->log(0, "ERROR: bad sync state!"); + return -EIO; + } + } while (!going_down); + + return 0; +} + +void RGWRemoteMetaLog::wakeup(int 
shard_id) +{ + if (!meta_sync_cr) { + return; + } + meta_sync_cr->wakeup(shard_id); +} + +int RGWCloneMetaLogCoroutine::operate() +{ + reenter(this) { + do { + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl; + return state_init(); + } + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl; + return state_read_shard_status(); + } + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl; + return state_read_shard_status_complete(); + } + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl; + return state_send_rest_request(); + } + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl; + return state_receive_rest_response(); + } + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl; + return state_store_mdlog_entries(); + } + } while (truncated); + yield { + ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl; + return state_store_mdlog_entries_complete(); + } + } + + return 0; +} + +int RGWCloneMetaLogCoroutine::state_init() +{ + data = rgw_mdlog_shard_data(); + + return 0; +} + +int RGWCloneMetaLogCoroutine::state_read_shard_status() +{ + const bool add_ref = false; // default constructs with refs=1 + + completion.reset(new RGWMetadataLogInfoCompletion( + [this](int ret, const cls_log_header& header) { + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with " + << cpp_strerror(ret) << dendl; + } + } else { + shard_info.marker = header.max_marker; + shard_info.last_update = header.max_time.to_real_time(); + } + // wake up parent stack + io_complete(); + }), 
add_ref); + + int ret = mdlog->get_info_async(shard_id, completion.get()); + if (ret < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl; + return set_cr_error(ret); + } + + return io_block(0); +} + +int RGWCloneMetaLogCoroutine::state_read_shard_status_complete() +{ + completion.reset(); + + ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl; + + marker = shard_info.marker; + + return 0; +} + +int RGWCloneMetaLogCoroutine::state_send_rest_request() +{ + RGWRESTConn *conn = sync_env->conn; + + char buf[32]; + snprintf(buf, sizeof(buf), "%d", shard_id); + + char max_entries_buf[32]; + snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries); + + const char *marker_key = (marker.empty() ? "" : "marker"); + + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "id", buf }, + { "period", period.c_str() }, + { "max-entries", max_entries_buf }, + { marker_key, marker.c_str() }, + { NULL, NULL } }; + + http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); + + int ret = http_op->aio_read(); + if (ret < 0) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + http_op = NULL; + return set_cr_error(ret); + } + + return io_block(0); +} + +int RGWCloneMetaLogCoroutine::state_receive_rest_response() +{ + int ret = http_op->wait(&data); + if (ret < 0) { + error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl; + ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl; + http_op->put(); + http_op = NULL; + return set_cr_error(ret); + } + http_op->put(); + http_op = NULL; + + ldpp_dout(sync_env->dpp, 20) << "remote mdlog, 
shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl; + + truncated = ((int)data.entries.size() == max_entries); + + if (data.entries.empty()) { + if (new_marker) { + *new_marker = marker; + } + return set_cr_done(); + } + + if (new_marker) { + *new_marker = data.entries.back().id; + } + + return 0; +} + + +int RGWCloneMetaLogCoroutine::state_store_mdlog_entries() +{ + list dest_entries; + + vector::iterator iter; + for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) { + rgw_mdlog_entry& entry = *iter; + ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl; + + cls_log_entry dest_entry; + dest_entry.id = entry.id; + dest_entry.section = entry.section; + dest_entry.name = entry.name; + dest_entry.timestamp = utime_t(entry.timestamp); + + encode(entry.log_data, dest_entry.data); + + dest_entries.push_back(dest_entry); + + marker = entry.id; + } + + RGWAioCompletionNotifier *cn = stack->create_completion_notifier(); + + int ret = mdlog->store_entries_in_shard(dest_entries, shard_id, cn->completion()); + if (ret < 0) { + cn->put(); + ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl; + return set_cr_error(ret); + } + return io_block(0); +} + +int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete() +{ + return set_cr_done(); +} + + +// TODO: move into rgw_sync_trim.cc +#undef dout_prefix +#define dout_prefix (*_dout << "meta trim: ") + +/// purge all log shards for the given mdlog +class PurgeLogShardsCR : public RGWShardCollectCR { + RGWRados *const store; + const RGWMetadataLog* mdlog; + const int num_shards; + rgw_raw_obj obj; + int i{0}; + + static constexpr int max_concurrent = 16; + + public: + PurgeLogShardsCR(RGWRados *store, const RGWMetadataLog* mdlog, + const rgw_pool& pool, int num_shards) + : RGWShardCollectCR(store->ctx(), max_concurrent), + store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "") + {} + + 
bool spawn_next() override { + if (i == num_shards) { + return false; + } + mdlog->get_shard_oid(i++, obj.oid); + spawn(new RGWRadosRemoveCR(store, obj), false); + return true; + } +}; + +using Cursor = RGWPeriodHistory::Cursor; + +/// purge mdlogs from the oldest up to (but not including) the given realm_epoch +class PurgePeriodLogsCR : public RGWCoroutine { + RGWRados *const store; + RGWMetadataManager *const metadata; + RGWObjVersionTracker objv; + Cursor cursor; + epoch_t realm_epoch; + epoch_t *last_trim_epoch; //< update last trim on success + + public: + PurgePeriodLogsCR(RGWRados *store, epoch_t realm_epoch, epoch_t *last_trim) + : RGWCoroutine(store->ctx()), store(store), metadata(store->meta_mgr), + realm_epoch(realm_epoch), last_trim_epoch(last_trim) + {} + + int operate() override; +}; + +int PurgePeriodLogsCR::operate() +{ + reenter(this) { + // read our current oldest log period + yield call(metadata->read_oldest_log_period_cr(&cursor, &objv)); + if (retcode < 0) { + return set_cr_error(retcode); + } + ceph_assert(cursor); + ldout(cct, 20) << "oldest log realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + + // trim -up to- the given realm_epoch + while (cursor.get_epoch() < realm_epoch) { + ldout(cct, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + yield { + const auto mdlog = metadata->get_log(cursor.get_period().get_id()); + const auto& pool = store->svc.zone->get_zone_params().log_pool; + auto num_shards = cct->_conf->rgw_md_log_max_shards; + call(new PurgeLogShardsCR(store, mdlog, pool, num_shards)); + } + if (retcode < 0) { + ldout(cct, 1) << "failed to remove log shards: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + ldout(cct, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + + // update our mdlog history + yield 
call(metadata->trim_log_period_cr(cursor, &objv)); + if (retcode == -ENOENT) { + // must have raced to update mdlog history. return success and allow the + // winner to continue purging + ldout(cct, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch() + << " period=" << cursor.get_period().get_id() << dendl; + return set_cr_done(); + } else if (retcode < 0) { + ldout(cct, 1) << "failed to remove log shards for realm_epoch=" + << cursor.get_epoch() << " period=" << cursor.get_period().get_id() + << " with: " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + if (*last_trim_epoch < cursor.get_epoch()) { + *last_trim_epoch = cursor.get_epoch(); + } + + ceph_assert(cursor.has_next()); // get_current() should always come after + cursor.next(); + } + return set_cr_done(); + } + return 0; +} + +namespace { + +using connection_map = std::map>; + +/// construct a RGWRESTConn for each zone in the realm +template +connection_map make_peer_connections(RGWRados *store, + const Zonegroups& zonegroups) +{ + connection_map connections; + for (auto& g : zonegroups) { + for (auto& z : g.second.zones) { + std::unique_ptr conn{ + new RGWRESTConn(store->ctx(), store->svc.zone, z.first, z.second.endpoints)}; + connections.emplace(z.first, std::move(conn)); + } + } + return connections; +} + +/// return the marker that it's safe to trim up to +const std::string& get_stable_marker(const rgw_meta_sync_marker& m) +{ + return m.state == m.FullSync ? 
m.next_step_marker : m.marker;
+}
+
+/// comparison operator for take_min_status()
+bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
+{
+  // sort by stable marker
+  return get_stable_marker(lhs) < get_stable_marker(rhs);
+}
+
+/// populate the status with the minimum stable marker of each shard for any
+/// peer whose realm_epoch matches the minimum realm_epoch in the input
+///
+/// Returns -EINVAL for an empty range or when a peer reports a shard count
+/// other than rgw_md_log_max_shards; 0 otherwise. Elements of the input range
+/// are moved from.
+template <typename Iter>
+int take_min_status(CephContext *cct, Iter first, Iter last,
+                    rgw_meta_sync_status *status)
+{
+  if (first == last) {
+    return -EINVAL;
+  }
+  const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
+
+  // start above any real epoch so the first peer always replaces *status
+  status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
+  for (auto p = first; p != last; ++p) {
+    // validate peer's shard count
+    if (p->sync_markers.size() != num_shards) {
+      ldout(cct, 1) << "take_min_status got peer status with "
+          << p->sync_markers.size() << " shards, expected "
+          << num_shards << dendl;
+      return -EINVAL;
+    }
+    if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
+      // earlier epoch, take its entire status
+      *status = std::move(*p);
+    } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
+      // same epoch, take any earlier markers
+      // (both marker maps hold num_shards entries, so they are walked in step)
+      auto m = status->sync_markers.begin();
+      for (auto& shard : p->sync_markers) {
+        if (shard.second < m->second) {
+          m->second = std::move(shard.second);
+        }
+        ++m;
+      }
+    }
+  }
+  return 0;
+}
+
+struct TrimEnv {
+  const DoutPrefixProvider *dpp;
+  RGWRados *const store;
+  RGWHTTPManager *const http;
+  int num_shards;
+  const std::string& zone;
+  Cursor current; //< cursor to current period
+  epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
+
+  TrimEnv(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards)
+    : dpp(dpp), store(store), http(http), num_shards(num_shards),
+      zone(store->svc.zone->get_zone_params().get_id()),
+      current(store->period_history->get_current())
+  {}
+};
+
+struct MasterTrimEnv : public TrimEnv {
+
connection_map connections; //< peer connections + std::vector peer_status; //< sync status for each peer + /// last trim marker for each shard, only applies to current period's mdlog + std::vector last_trim_markers; + + MasterTrimEnv(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards) + : TrimEnv(dpp, store, http, num_shards), + last_trim_markers(num_shards) + { + auto& period = current.get_period(); + connections = make_peer_connections(store, period.get_map().zonegroups); + connections.erase(zone); + peer_status.resize(connections.size()); + } +}; + +struct PeerTrimEnv : public TrimEnv { + /// last trim timestamp for each shard, only applies to current period's mdlog + std::vector last_trim_timestamps; + + PeerTrimEnv(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards) + : TrimEnv(dpp, store, http, num_shards), + last_trim_timestamps(num_shards) + {} + + void set_num_shards(int num_shards) { + this->num_shards = num_shards; + last_trim_timestamps.resize(num_shards); + } +}; + +} // anonymous namespace + + +/// spawn a trim cr for each shard that needs it, while limiting the number +/// of concurrent shards +class MetaMasterTrimShardCollectCR : public RGWShardCollectCR { + private: + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + MasterTrimEnv& env; + RGWMetadataLog *mdlog; + int shard_id{0}; + std::string oid; + const rgw_meta_sync_status& sync_status; + + public: + MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog, + const rgw_meta_sync_status& sync_status) + : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS), + env(env), mdlog(mdlog), sync_status(sync_status) + {} + + bool spawn_next() override; +}; + +bool MetaMasterTrimShardCollectCR::spawn_next() +{ + while (shard_id < env.num_shards) { + auto m = sync_status.sync_markers.find(shard_id); + if (m == sync_status.sync_markers.end()) { + shard_id++; + continue; + } + auto& stable = 
get_stable_marker(m->second); + auto& last_trim = env.last_trim_markers[shard_id]; + + if (stable <= last_trim) { + // already trimmed + ldout(cct, 20) << "skipping log shard " << shard_id + << " at marker=" << stable + << " last_trim=" << last_trim + << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl; + shard_id++; + continue; + } + + mdlog->get_shard_oid(shard_id, oid); + + ldout(cct, 10) << "trimming log shard " << shard_id + << " at marker=" << stable + << " last_trim=" << last_trim + << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl; + spawn(new RGWSyncLogTrimCR(env.store, oid, stable, &last_trim), false); + shard_id++; + return true; + } + return false; +} + +/// spawn rest requests to read each peer's sync status +class MetaMasterStatusCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + MasterTrimEnv& env; + connection_map::iterator c; + std::vector::iterator s; + public: + explicit MetaMasterStatusCollectCR(MasterTrimEnv& env) + : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS), + env(env), c(env.connections.begin()), s(env.peer_status.begin()) + {} + + bool spawn_next() override { + if (c == env.connections.end()) { + return false; + } + static rgw_http_param_pair params[] = { + { "type", "metadata" }, + { "status", nullptr }, + { nullptr, nullptr } + }; + + ldout(cct, 20) << "query sync status from " << c->first << dendl; + auto conn = c->second.get(); + using StatusCR = RGWReadRESTResourceCR; + spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s), + false); + ++c; + ++s; + return true; + } +}; + +class MetaMasterTrimCR : public RGWCoroutine { + MasterTrimEnv& env; + rgw_meta_sync_status min_status; //< minimum sync status of all peers + int ret{0}; + + public: + explicit MetaMasterTrimCR(MasterTrimEnv& env) + : RGWCoroutine(env.store->ctx()), env(env) + {} + + int operate() override; +}; + +int MetaMasterTrimCR::operate() +{ + reenter(this) { + // TODO: 
detect this and fail before we spawn the trim thread? + if (env.connections.empty()) { + ldout(cct, 4) << "no peers, exiting" << dendl; + return set_cr_done(); + } + + ldout(cct, 10) << "fetching sync status for zone " << env.zone << dendl; + // query mdlog sync status from peers + yield call(new MetaMasterStatusCollectCR(env)); + + // must get a successful reply from all peers to consider trimming + if (ret < 0) { + ldout(cct, 4) << "failed to fetch sync status from all peers" << dendl; + return set_cr_error(ret); + } + + // determine the minimum epoch and markers + ret = take_min_status(env.store->ctx(), env.peer_status.begin(), + env.peer_status.end(), &min_status); + if (ret < 0) { + ldout(cct, 4) << "failed to calculate min sync status from peers" << dendl; + return set_cr_error(ret); + } + yield { + auto store = env.store; + auto epoch = min_status.sync_info.realm_epoch; + ldout(cct, 4) << "realm epoch min=" << epoch + << " current=" << env.current.get_epoch()<< dendl; + if (epoch > env.last_trim_epoch + 1) { + // delete any prior mdlog periods + spawn(new PurgePeriodLogsCR(store, epoch, &env.last_trim_epoch), true); + } else { + ldout(cct, 10) << "mdlogs already purged up to realm_epoch " + << env.last_trim_epoch << dendl; + } + + // if realm_epoch == current, trim mdlog based on markers + if (epoch == env.current.get_epoch()) { + auto mdlog = store->meta_mgr->get_log(env.current.get_period().get_id()); + spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true); + } + } + // ignore any errors during purge/trim because we want to hold the lock open + return set_cr_done(); + } + return 0; +} + + +/// read the first entry of the master's mdlog shard and trim to that position +class MetaPeerTrimShardCR : public RGWCoroutine { + RGWMetaSyncEnv& env; + RGWMetadataLog *mdlog; + const std::string& period_id; + const int shard_id; + RGWMetadataLogInfo info; + ceph::real_time stable; //< safe timestamp to trim, according to master + ceph::real_time 
*last_trim; //< last trimmed timestamp, updated on trim + rgw_mdlog_shard_data result; //< result from master's mdlog listing + + public: + MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog, + const std::string& period_id, int shard_id, + ceph::real_time *last_trim) + : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog), + period_id(period_id), shard_id(shard_id), last_trim(last_trim) + {} + + int operate() override; +}; + +int MetaPeerTrimShardCR::operate() +{ + reenter(this) { + // query master's first mdlog entry for this shard + yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id, + "", 1, &result)); + if (retcode < 0) { + ldpp_dout(env.dpp, 5) << "failed to read first entry from master's mdlog shard " + << shard_id << " for period " << period_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + if (result.entries.empty()) { + // if there are no mdlog entries, we don't have a timestamp to compare. we + // can't just trim everything, because there could be racing updates since + // this empty reply. 
query the mdlog shard info to read its max timestamp, + // then retry the listing to make sure it's still empty before trimming to + // that + ldpp_dout(env.dpp, 10) << "empty master mdlog shard " << shard_id + << ", reading last timestamp from shard info" << dendl; + // read the mdlog shard info for the last timestamp + using ShardInfoCR = RGWReadRemoteMDLogShardInfoCR; + yield call(new ShardInfoCR(&env, period_id, shard_id, &info)); + if (retcode < 0) { + ldpp_dout(env.dpp, 5) << "failed to read info from master's mdlog shard " + << shard_id << " for period " << period_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + if (ceph::real_clock::is_zero(info.last_update)) { + return set_cr_done(); // nothing to trim + } + ldpp_dout(env.dpp, 10) << "got mdlog shard info with last update=" + << info.last_update << dendl; + // re-read the master's first mdlog entry to make sure it hasn't changed + yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id, + "", 1, &result)); + if (retcode < 0) { + ldpp_dout(env.dpp, 5) << "failed to read first entry from master's mdlog shard " + << shard_id << " for period " << period_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + // if the mdlog is still empty, trim to max marker + if (result.entries.empty()) { + stable = info.last_update; + } else { + stable = result.entries.front().timestamp; + + // can only trim -up to- master's first timestamp, so subtract a second. 
+ // (this is why we use timestamps instead of markers for the peers) + stable -= std::chrono::seconds(1); + } + } else { + stable = result.entries.front().timestamp; + stable -= std::chrono::seconds(1); + } + + if (stable <= *last_trim) { + ldpp_dout(env.dpp, 10) << "skipping log shard " << shard_id + << " at timestamp=" << stable + << " last_trim=" << *last_trim << dendl; + return set_cr_done(); + } + + ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id + << " at timestamp=" << stable + << " last_trim=" << *last_trim << dendl; + yield { + std::string oid; + mdlog->get_shard_oid(shard_id, oid); + call(new RGWRadosTimelogTrimCR(env.store, oid, real_time{}, stable, "", "")); + } + if (retcode < 0 && retcode != -ENODATA) { + ldpp_dout(env.dpp, 1) << "failed to trim mdlog shard " << shard_id + << ": " << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + *last_trim = stable; + return set_cr_done(); + } + return 0; +} + +class MetaPeerTrimShardCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + + PeerTrimEnv& env; + RGWMetadataLog *mdlog; + const std::string& period_id; + RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR + int shard_id{0}; + + public: + MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog) + : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS), + env(env), mdlog(mdlog), period_id(env.current.get_period().get_id()) + { + meta_env.init(env.dpp, cct, env.store, env.store->svc.zone->get_master_conn(), + env.store->get_async_rados(), env.http, nullptr, + env.store->get_sync_tracer()); + } + + bool spawn_next() override; +}; + +bool MetaPeerTrimShardCollectCR::spawn_next() +{ + if (shard_id >= env.num_shards) { + return false; + } + auto& last_trim = env.last_trim_timestamps[shard_id]; + spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim), + false); + shard_id++; + return true; +} + +class MetaPeerTrimCR : public RGWCoroutine { + 
PeerTrimEnv& env; + rgw_mdlog_info mdlog_info; //< master's mdlog info + + public: + explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {} + + int operate() override; +}; + +int MetaPeerTrimCR::operate() +{ + reenter(this) { + ldout(cct, 10) << "fetching master mdlog info" << dendl; + yield { + // query mdlog_info from master for oldest_log_period + rgw_http_param_pair params[] = { + { "type", "metadata" }, + { nullptr, nullptr } + }; + + using LogInfoCR = RGWReadRESTResourceCR; + call(new LogInfoCR(cct, env.store->svc.zone->get_master_conn(), env.http, + "/admin/log/", params, &mdlog_info)); + } + if (retcode < 0) { + ldout(cct, 4) << "failed to read mdlog info from master" << dendl; + return set_cr_error(retcode); + } + // use master's shard count instead + env.set_num_shards(mdlog_info.num_shards); + + if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) { + // delete any prior mdlog periods + yield call(new PurgePeriodLogsCR(env.store, mdlog_info.realm_epoch, + &env.last_trim_epoch)); + } else { + ldout(cct, 10) << "mdlogs already purged through realm_epoch " + << env.last_trim_epoch << dendl; + } + + // if realm_epoch == current, trim mdlog based on master's markers + if (mdlog_info.realm_epoch == env.current.get_epoch()) { + yield { + auto meta_mgr = env.store->meta_mgr; + auto mdlog = meta_mgr->get_log(env.current.get_period().get_id()); + call(new MetaPeerTrimShardCollectCR(env, mdlog)); + // ignore any errors during purge/trim because we want to hold the lock open + } + } + return set_cr_done(); + } + return 0; +} + +class MetaTrimPollCR : public RGWCoroutine { + RGWRados *const store; + const utime_t interval; //< polling interval + const rgw_raw_obj obj; + const std::string name{"meta_trim"}; //< lock name + const std::string cookie; + + protected: + /// allocate the coroutine to run within the lease + virtual RGWCoroutine* alloc_cr() = 0; + + public: + MetaTrimPollCR(RGWRados *store, utime_t interval) + : 
RGWCoroutine(store->ctx()), store(store), interval(interval), + obj(store->svc.zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid), + cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)) + {} + + int operate() override; +}; + +int MetaTrimPollCR::operate() +{ + reenter(this) { + for (;;) { + set_status("sleeping"); + wait(interval); + + // prevent others from trimming for our entire wait interval + set_status("acquiring trim lock"); + yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store, + obj, name, cookie, interval.sec())); + if (retcode < 0) { + ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl; + continue; + } + + set_status("trimming"); + yield call(alloc_cr()); + + if (retcode < 0) { + // on errors, unlock so other gateways can try + set_status("unlocking"); + yield call(new RGWSimpleRadosUnlockCR(store->get_async_rados(), store, + obj, name, cookie)); + } + } + } + return 0; +} + +class MetaMasterTrimPollCR : public MetaTrimPollCR { + MasterTrimEnv env; //< trim state to share between calls + RGWCoroutine* alloc_cr() override { + return new MetaMasterTrimCR(env); + } + public: + MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, + int num_shards, utime_t interval) + : MetaTrimPollCR(store, interval), + env(dpp, store, http, num_shards) + {} +}; + +class MetaPeerTrimPollCR : public MetaTrimPollCR { + PeerTrimEnv env; //< trim state to share between calls + RGWCoroutine* alloc_cr() override { + return new MetaPeerTrimCR(env); + } + public: + MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, + int num_shards, utime_t interval) + : MetaTrimPollCR(store, interval), + env(dpp, store, http, num_shards) + {} +}; + +RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, + int num_shards, utime_t interval) +{ + if (store->svc.zone->is_meta_master()) { + return new MetaMasterTrimPollCR(dpp, 
store, http, num_shards, interval); + } + return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval); +} + + +struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR { + MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards) + : MasterTrimEnv(dpp, store, http, num_shards), + MetaMasterTrimCR(*static_cast(this)) + {} +}; + +struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR { + MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards) + : PeerTrimEnv(dpp, store, http, num_shards), + MetaPeerTrimCR(*static_cast(this)) + {} +}; + +RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, RGWRados *store, + RGWHTTPManager *http, + int num_shards) +{ + if (store->svc.zone->is_meta_master()) { + return new MetaMasterAdminTrimCR(dpp, store, http, num_shards); + } + return new MetaPeerAdminTrimCR(dpp, store, http, num_shards); +} diff --git a/src/rgw/rgw_sync.h b/src/rgw/rgw_sync.h new file mode 100644 index 00000000..7774e164 --- /dev/null +++ b/src/rgw/rgw_sync.h @@ -0,0 +1,534 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_H +#define CEPH_RGW_SYNC_H + +#include + +#include "include/stringify.h" +#include "common/RWLock.h" + +#include "rgw_coroutine.h" +#include "rgw_http_client.h" +#include "rgw_metadata.h" +#include "rgw_meta_sync_status.h" +#include "rgw_rados.h" +#include "rgw_sync_trace.h" + + +#define ERROR_LOGGER_SHARDS 32 +#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log" + +struct rgw_mdlog_info { + uint32_t num_shards; + std::string period; //< period id of the master's oldest metadata log + epoch_t realm_epoch; //< realm epoch of oldest metadata log + + rgw_mdlog_info() : num_shards(0), realm_epoch(0) {} + + void decode_json(JSONObj *obj); +}; + + +struct rgw_mdlog_entry { + string id; + string section; + 
string name; + ceph::real_time timestamp; + RGWMetadataLogData log_data; + + void decode_json(JSONObj *obj); + + bool convert_from(cls_log_entry& le) { + id = le.id; + section = le.section; + name = le.name; + timestamp = le.timestamp.to_real_time(); + try { + auto iter = le.data.cbegin(); + decode(log_data, iter); + } catch (buffer::error& err) { + return false; + } + return true; + } +}; + +struct rgw_mdlog_shard_data { + string marker; + bool truncated; + vector entries; + + void decode_json(JSONObj *obj); +}; + +class RGWAsyncRadosProcessor; +class RGWMetaSyncStatusManager; +class RGWMetaSyncCR; +class RGWRESTConn; +class RGWSyncTraceManager; + +class RGWSyncErrorLogger { + RGWRados *store; + + vector oids; + int num_shards; + + std::atomic counter = { 0 }; +public: + RGWSyncErrorLogger(RGWRados *_store, const string &oid_prefix, int _num_shards); + RGWCoroutine *log_error_cr(const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message); + + static string get_shard_oid(const string& oid_prefix, int shard_id); +}; + +struct rgw_sync_error_info { + string source_zone; + uint32_t error_code; + string message; + + rgw_sync_error_info() : error_code(0) {} + rgw_sync_error_info(const string& _source_zone, uint32_t _error_code, const string& _message) : source_zone(_source_zone), error_code(_error_code), message(_message) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(source_zone, bl); + encode(error_code, bl); + encode(message, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(source_zone, bl); + decode(error_code, bl); + decode(message, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_sync_error_info) + +#define DEFAULT_BACKOFF_MAX 30 + +class RGWSyncBackoff { + int cur_wait; + int max_secs; + + void update_wait_time(); +public: + explicit RGWSyncBackoff(int _max_secs = 
DEFAULT_BACKOFF_MAX) : cur_wait(0), max_secs(_max_secs) {} + + void backoff_sleep(); + void reset() { + cur_wait = 0; + } + + void backoff(RGWCoroutine *op); +}; + +class RGWBackoffControlCR : public RGWCoroutine +{ + RGWCoroutine *cr; + Mutex lock; + + RGWSyncBackoff backoff; + bool reset_backoff; + + bool exit_on_error; + +protected: + bool *backoff_ptr() { + return &reset_backoff; + } + + Mutex& cr_lock() { + return lock; + } + + RGWCoroutine *get_cr() { + return cr; + } + +public: + RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error) : RGWCoroutine(_cct), cr(NULL), lock("RGWBackoffControlCR::lock:" + stringify(this)), + reset_backoff(false), exit_on_error(_exit_on_error) { + } + + ~RGWBackoffControlCR() override { + if (cr) { + cr->put(); + } + } + + virtual RGWCoroutine *alloc_cr() = 0; + virtual RGWCoroutine *alloc_finisher_cr() { return NULL; } + + int operate() override; +}; + +struct RGWMetaSyncEnv { + const DoutPrefixProvider *dpp{nullptr}; //< null-initialized like the pointer members below + CephContext *cct{nullptr}; + RGWRados *store{nullptr}; + RGWRESTConn *conn{nullptr}; + RGWAsyncRadosProcessor *async_rados{nullptr}; + RGWHTTPManager *http_manager{nullptr}; + RGWSyncErrorLogger *error_logger{nullptr}; + RGWSyncTraceManager *sync_tracer{nullptr}; + + RGWMetaSyncEnv() {} + + void init(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWRados *_store, RGWRESTConn *_conn, + RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager, + RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer); + + string shard_obj_name(int shard_id); + string status_oid(); +}; + +class RGWRemoteMetaLog : public RGWCoroutinesManager { + const DoutPrefixProvider *dpp; + RGWRados *store; + RGWRESTConn *conn; + RGWAsyncRadosProcessor *async_rados; + + RGWHTTPManager http_manager; + RGWMetaSyncStatusManager *status_manager; + RGWSyncErrorLogger *error_logger{nullptr}; + RGWSyncTraceManager *sync_tracer{nullptr}; + + RGWMetaSyncCR *meta_sync_cr{nullptr}; + + RGWSyncBackoff backoff; + + RGWMetaSyncEnv 
sync_env; + + void init_sync_env(RGWMetaSyncEnv *env); + int store_sync_info(const rgw_meta_sync_info& sync_info); + + std::atomic going_down = { false }; + + RGWSyncTraceNodeRef tn; + +public: + RGWRemoteMetaLog(const DoutPrefixProvider *dpp, RGWRados *_store, + RGWAsyncRadosProcessor *async_rados, + RGWMetaSyncStatusManager *_sm) + : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), + dpp(dpp), store(_store), conn(NULL), async_rados(async_rados), + http_manager(store->ctx(), completion_mgr), + status_manager(_sm) {} + + ~RGWRemoteMetaLog() override; + + int init(); + void finish(); + + int read_log_info(rgw_mdlog_info *log_info); + int read_master_log_shards_info(const string& master_period, map *shards_info); + int read_master_log_shards_next(const string& period, map shard_markers, map *result); + int read_sync_status(rgw_meta_sync_status *sync_status); + int init_sync_status(); + int run_sync(); + + void wakeup(int shard_id); + + RGWMetaSyncEnv& get_sync_env() { + return sync_env; + } +}; + +class RGWMetaSyncStatusManager : public DoutPrefixProvider { + RGWRados *store; + librados::IoCtx ioctx; + + RGWRemoteMetaLog master_log; + + map shard_objs; + + struct utime_shard { + real_time ts; + int shard_id; + + utime_shard() : shard_id(-1) {} + + bool operator<(const utime_shard& rhs) const { + if (ts == rhs.ts) { + return shard_id < rhs.shard_id; + } + return ts < rhs.ts; + } + }; + + RWLock ts_to_shard_lock; + map ts_to_shard; + vector clone_markers; + +public: + RGWMetaSyncStatusManager(RGWRados *_store, RGWAsyncRadosProcessor *async_rados) + : store(_store), master_log(this, store, async_rados, this), + ts_to_shard_lock("ts_to_shard_lock") {} + int init(); + + int read_sync_status(rgw_meta_sync_status *sync_status) { + return master_log.read_sync_status(sync_status); + } + int init_sync_status() { return master_log.init_sync_status(); } + int read_log_info(rgw_mdlog_info *log_info) { + return master_log.read_log_info(log_info); + } + int 
read_master_log_shards_info(const string& master_period, map *shards_info) { + return master_log.read_master_log_shards_info(master_period, shards_info); + } + int read_master_log_shards_next(const string& period, map shard_markers, map *result) { + return master_log.read_master_log_shards_next(period, shard_markers, result); + } + + int run() { return master_log.run_sync(); } + + + // implements DoutPrefixProvider + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + + void wakeup(int shard_id) { return master_log.wakeup(shard_id); } + void stop() { + master_log.finish(); + } +}; + +class RGWOrderCallCR : public RGWCoroutine +{ +public: + RGWOrderCallCR(CephContext *cct) : RGWCoroutine(cct) {} + + virtual void call_cr(RGWCoroutine *_cr) = 0; +}; + +class RGWLastCallerWinsCR : public RGWOrderCallCR +{ + RGWCoroutine *cr{nullptr}; + +public: + explicit RGWLastCallerWinsCR(CephContext *cct) : RGWOrderCallCR(cct) {} + ~RGWLastCallerWinsCR() { + if (cr) { + cr->put(); + } + } + + int operate() override; + + void call_cr(RGWCoroutine *_cr) override { + if (cr) { + cr->put(); + } + cr = _cr; + } +}; + +template +class RGWSyncShardMarkerTrack { + struct marker_entry { + uint64_t pos; + real_time timestamp; + + marker_entry() : pos(0) {} + marker_entry(uint64_t _p, const real_time& _ts) : pos(_p), timestamp(_ts) {} + }; + typename std::map pending; + + map finish_markers; + + int window_size; + int updates_since_flush; + + RGWOrderCallCR *order_cr{nullptr}; + +protected: + typename std::set need_retry_set; + + virtual RGWCoroutine *store_marker(const T& new_marker, uint64_t index_pos, const real_time& timestamp) = 0; + virtual RGWOrderCallCR *allocate_order_control_cr() = 0; + virtual void handle_finish(const T& marker) { } + +public: + RGWSyncShardMarkerTrack(int _window_size) : window_size(_window_size), updates_since_flush(0) {} + virtual 
~RGWSyncShardMarkerTrack() { + if (order_cr) { + order_cr->put(); + } + } + + bool start(const T& pos, int index_pos, const real_time& timestamp) { + if (pending.find(pos) != pending.end()) { + return false; + } + pending[pos] = marker_entry(index_pos, timestamp); + return true; + } + + void try_update_high_marker(const T& pos, int index_pos, const real_time& timestamp) { + finish_markers[pos] = marker_entry(index_pos, timestamp); + } + + RGWCoroutine *finish(const T& pos) { + if (pending.empty()) { + /* can happen, due to a bug that ended up with multiple objects with the same name and version + * -- which can happen when versioning is enabled an the version is 'null'. + */ + return NULL; + } + + typename std::map::iterator iter = pending.begin(); + + bool is_first = (pos == iter->first); + + typename std::map::iterator pos_iter = pending.find(pos); + if (pos_iter == pending.end()) { + /* see pending.empty() comment */ + return NULL; + } + + finish_markers[pos] = pos_iter->second; + + pending.erase(pos); + + handle_finish(pos); + + updates_since_flush++; + + if (is_first && (updates_since_flush >= window_size || pending.empty())) { + return flush(); + } + return NULL; + } + + RGWCoroutine *flush() { + if (finish_markers.empty()) { + return NULL; + } + + typename std::map::iterator i; + + if (pending.empty()) { + i = finish_markers.end(); + } else { + i = finish_markers.lower_bound(pending.begin()->first); + } + if (i == finish_markers.begin()) { + return NULL; + } + updates_since_flush = 0; + + auto last = i; + --i; + const T& high_marker = i->first; + marker_entry& high_entry = i->second; + RGWCoroutine *cr = order(store_marker(high_marker, high_entry.pos, high_entry.timestamp)); + finish_markers.erase(finish_markers.begin(), last); + return cr; + } + + /* + * a key needs retry if it was processing when another marker that points + * to the same bucket shards arrives. 
Instead of processing it, we mark + * it as need_retry so that when we finish processing the original, we + * retry the processing on the same bucket shard, in case there are more + * entries to process. This closes a race that can happen. + */ + bool need_retry(const K& key) { + return (need_retry_set.find(key) != need_retry_set.end()); + } + + void set_need_retry(const K& key) { + need_retry_set.insert(key); + } + + void reset_need_retry(const K& key) { + need_retry_set.erase(key); + } + + RGWCoroutine *order(RGWCoroutine *cr) { + /* either returns a new RGWLastWriteWinsCR, or update existing one, in which case it returns + * nothing and the existing one will call the cr + */ + if (order_cr && order_cr->is_done()) { + order_cr->put(); + order_cr = nullptr; + } + if (!order_cr) { + order_cr = allocate_order_control_cr(); + order_cr->get(); + order_cr->call_cr(cr); + return order_cr; + } + order_cr->call_cr(cr); + return nullptr; /* don't call it a second time */ + } +}; + +class RGWMetaSyncShardMarkerTrack; + +class RGWMetaSyncSingleEntryCR : public RGWCoroutine { + RGWMetaSyncEnv *sync_env; + + string raw_key; + string entry_marker; + RGWMDLogStatus op_status; + + ssize_t pos; + string section; + string key; + + int sync_status; + + bufferlist md_bl; + + RGWMetaSyncShardMarkerTrack *marker_tracker; + + int tries; + + bool error_injection; + + RGWSyncTraceNodeRef tn; + +public: + RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env, + const string& _raw_key, const string& _entry_marker, + const RGWMDLogStatus& _op_status, + RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent); + + int operate() override; +}; + +class RGWShardCollectCR : public RGWCoroutine { + int cur_shard{0}; //< not set by the constructor's initializer list, so default it here + int current_running; + int max_concurrent; + int status; + +public: + RGWShardCollectCR(CephContext *_cct, int _max_concurrent) : RGWCoroutine(_cct), + current_running(0), + max_concurrent(_max_concurrent), + status(0) {} + + virtual bool spawn_next() = 0; + int 
operate() override; +}; + +// MetaLogTrimCR factory function +RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, + int num_shards, utime_t interval); + +// factory function for mdlog trim via radosgw-admin +RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, RGWRados *store, + RGWHTTPManager *http, + int num_shards); + +#endif diff --git a/src/rgw/rgw_sync_counters.cc b/src/rgw/rgw_sync_counters.cc new file mode 100644 index 00000000..b4130068 --- /dev/null +++ b/src/rgw/rgw_sync_counters.cc @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_context.h" +#include "rgw_sync_counters.h" + +namespace sync_counters { + +PerfCountersRef build(CephContext *cct, const std::string& name) +{ + PerfCountersBuilder b(cct, name, l_first, l_last); + + // share these counters with ceph-mgr + b.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + b.add_u64_avg(l_fetch, "fetch_bytes", "Number of object bytes replicated"); + b.add_u64_counter(l_fetch_not_modified, "fetch_not_modified", "Number of objects already replicated"); + b.add_u64_counter(l_fetch_err, "fetch_errors", "Number of object replication errors"); + + b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests"); + b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors"); + + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; + cct->get_perfcounters_collection()->add(logger.get()); + return logger; +} + +} // namespace sync_counters diff --git a/src/rgw/rgw_sync_counters.h b/src/rgw/rgw_sync_counters.h new file mode 100644 index 00000000..4c270241 --- /dev/null +++ b/src/rgw/rgw_sync_counters.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "common/perf_counters_collection.h" + 
+namespace sync_counters { + +enum { + l_first = 805000, + + l_fetch, + l_fetch_not_modified, + l_fetch_err, + + l_poll, + l_poll_err, + + l_last, +}; + +PerfCountersRef build(CephContext *cct, const std::string& name); + +} // namespace sync_counters diff --git a/src/rgw/rgw_sync_log_trim.cc b/src/rgw/rgw_sync_log_trim.cc new file mode 100644 index 00000000..a8a3fdee --- /dev/null +++ b/src/rgw/rgw_sync_log_trim.cc @@ -0,0 +1,1094 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * Author: Casey Bodley + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include +#include +#include + +#include "include/scope_guard.h" +#include "common/bounded_key_counter.h" +#include "common/errno.h" +#include "rgw_sync_log_trim.h" +#include "rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_data_sync.h" +#include "rgw_metadata.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_sync.h" + +#include "services/svc_zone.h" + +#include +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rgw + +#undef dout_prefix +#define dout_prefix (*_dout << "trim: ") + +using rgw::BucketTrimConfig; +using BucketChangeCounter = BoundedKeyCounter; + +const std::string rgw::BucketTrimStatus::oid = "bilog.trim"; +using rgw::BucketTrimStatus; + + +// watch/notify api for gateways to coordinate about which buckets to trim +enum TrimNotifyType { + NotifyTrimCounters = 0, + NotifyTrimComplete, +}; +WRITE_RAW_ENCODER(TrimNotifyType); + +struct TrimNotifyHandler { + virtual ~TrimNotifyHandler() = default; + + virtual void handle(bufferlist::const_iterator& input, bufferlist& output) = 0; +}; + +/// api to share the bucket trim counters between gateways in 
the same zone. +/// each gateway will process different datalog shards, so the gateway that runs +/// the trim process needs to accumulate their counters +struct TrimCounters { + /// counter for a single bucket + struct BucketCounter { + std::string bucket; //< bucket instance metadata key + int count{0}; + + BucketCounter() = default; + BucketCounter(const std::string& bucket, int count) + : bucket(bucket), count(count) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + using Vector = std::vector; + + /// request bucket trim counters from peer gateways + struct Request { + uint16_t max_buckets; //< maximum number of bucket counters to return + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + + /// return the current bucket trim counters + struct Response { + Vector bucket_counters; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + + /// server interface to query the hottest buckets + struct Server { + virtual ~Server() = default; + + virtual void get_bucket_counters(int count, Vector& counters) = 0; + virtual void reset_bucket_counters() = 0; + }; + + /// notify handler + class Handler : public TrimNotifyHandler { + Server *const server; + public: + explicit Handler(Server *server) : server(server) {} + + void handle(bufferlist::const_iterator& input, bufferlist& output) override; + }; +}; +std::ostream& operator<<(std::ostream& out, const TrimCounters::BucketCounter& rhs) +{ + return out << rhs.bucket << ":" << rhs.count; +} + +void TrimCounters::BucketCounter::encode(bufferlist& bl) const +{ + using ceph::encode; + // no versioning to save space + encode(bucket, bl); + encode(count, bl); +} +void TrimCounters::BucketCounter::decode(bufferlist::const_iterator& p) +{ + using ceph::decode; + decode(bucket, p); + decode(count, p); +} +WRITE_CLASS_ENCODER(TrimCounters::BucketCounter); + +void TrimCounters::Request::encode(bufferlist& bl) 
const +{ + ENCODE_START(1, 1, bl); + encode(max_buckets, bl); + ENCODE_FINISH(bl); +} +void TrimCounters::Request::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(max_buckets, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimCounters::Request); + +void TrimCounters::Response::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(bucket_counters, bl); + ENCODE_FINISH(bl); +} +void TrimCounters::Response::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(bucket_counters, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimCounters::Response); + +void TrimCounters::Handler::handle(bufferlist::const_iterator& input, + bufferlist& output) +{ + Request request; + decode(request, input); + auto count = std::min(request.max_buckets, 128); + + Response response; + server->get_bucket_counters(count, response.bucket_counters); + encode(response, output); +} + +/// api to notify peer gateways that trim has completed and their bucket change +/// counters can be reset +struct TrimComplete { + struct Request { + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + struct Response { + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + }; + + /// server interface to reset bucket counters + using Server = TrimCounters::Server; + + /// notify handler + class Handler : public TrimNotifyHandler { + Server *const server; + public: + explicit Handler(Server *server) : server(server) {} + + void handle(bufferlist::const_iterator& input, bufferlist& output) override; + }; +}; + +void TrimComplete::Request::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ENCODE_FINISH(bl); +} +void TrimComplete::Request::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimComplete::Request); + +void TrimComplete::Response::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ENCODE_FINISH(bl); +} +void 
TrimComplete::Response::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + DECODE_FINISH(p); +} +WRITE_CLASS_ENCODER(TrimComplete::Response); + +void TrimComplete::Handler::handle(bufferlist::const_iterator& input, + bufferlist& output) +{ + Request request; + decode(request, input); + + server->reset_bucket_counters(); + + Response response; + encode(response, output); +} + + +/// rados watcher for bucket trim notifications +class BucketTrimWatcher : public librados::WatchCtx2 { + RGWRados *const store; + const rgw_raw_obj& obj; + rgw_rados_ref ref; + uint64_t handle{0}; + + using HandlerPtr = std::unique_ptr; + boost::container::flat_map handlers; + + public: + BucketTrimWatcher(RGWRados *store, const rgw_raw_obj& obj, + TrimCounters::Server *counters) + : store(store), obj(obj) { + handlers.emplace(NotifyTrimCounters, new TrimCounters::Handler(counters)); + handlers.emplace(NotifyTrimComplete, new TrimComplete::Handler(counters)); + } + + ~BucketTrimWatcher() { + stop(); + } + + int start() { + int r = store->get_raw_obj_ref(obj, &ref); + if (r < 0) { + return r; + } + + // register a watch on the realm's control object + r = ref.ioctx.watch2(ref.obj.oid, &handle, this); + if (r == -ENOENT) { + constexpr bool exclusive = true; + r = ref.ioctx.create(ref.obj.oid, exclusive); + if (r == -EEXIST || r == 0) { + r = ref.ioctx.watch2(ref.obj.oid, &handle, this); + } + } + if (r < 0) { + lderr(store->ctx()) << "Failed to watch " << ref.obj + << " with " << cpp_strerror(-r) << dendl; + ref.ioctx.close(); + return r; + } + + ldout(store->ctx(), 10) << "Watching " << ref.obj.oid << dendl; + return 0; + } + + int restart() { + int r = ref.ioctx.unwatch2(handle); + if (r < 0) { + lderr(store->ctx()) << "Failed to unwatch on " << ref.obj + << " with " << cpp_strerror(-r) << dendl; + } + r = ref.ioctx.watch2(ref.obj.oid, &handle, this); + if (r < 0) { + lderr(store->ctx()) << "Failed to restart watch on " << ref.obj + << " with " << cpp_strerror(-r) << dendl; + 
ref.ioctx.close(); + } + return r; + } + + void stop() { + if (handle) { + ref.ioctx.unwatch2(handle); + ref.ioctx.close(); + } + } + + /// respond to bucket trim notifications + void handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) override { + if (cookie != handle) { + return; + } + bufferlist reply; + try { + auto p = bl.cbegin(); + TrimNotifyType type; + decode(type, p); + + auto handler = handlers.find(type); + if (handler != handlers.end()) { + handler->second->handle(p, reply); + } else { + lderr(store->ctx()) << "no handler for notify type " << type << dendl; + } + } catch (const buffer::error& e) { + lderr(store->ctx()) << "Failed to decode notification: " << e.what() << dendl; + } + ref.ioctx.notify_ack(ref.obj.oid, notify_id, cookie, reply); + } + + /// reestablish the watch if it gets disconnected + void handle_error(uint64_t cookie, int err) override { + if (cookie != handle) { + return; + } + if (err == -ENOTCONN) { + ldout(store->ctx(), 4) << "Disconnected watch on " << ref.obj << dendl; + restart(); + } + } +}; + + +/// Interface to communicate with the trim manager about completed operations +struct BucketTrimObserver { + virtual ~BucketTrimObserver() = default; + + virtual void on_bucket_trimmed(std::string&& bucket_instance) = 0; + virtual bool trimmed_recently(const boost::string_view& bucket_instance) = 0; +}; + +/// populate the status with the minimum stable marker of each shard +template +int take_min_status(CephContext *cct, Iter first, Iter last, + std::vector *status) +{ + for (auto peer = first; peer != last; ++peer) { + if (peer->size() != status->size()) { + // all peers must agree on the number of shards + return -EINVAL; + } + auto m = status->begin(); + for (auto& shard : *peer) { + auto& marker = *m++; + // only consider incremental sync markers + if (shard.state != rgw_bucket_shard_sync_info::StateIncrementalSync) { + continue; + } + // always take the first marker, or any later marker 
that's smaller + if (peer == first || marker > shard.inc_marker.position) { + marker = std::move(shard.inc_marker.position); + } + } + } + return 0; +} + +/// trim each bilog shard to the given marker, while limiting the number of +/// concurrent requests +class BucketTrimShardCollectCR : public RGWShardCollectCR { + static constexpr int MAX_CONCURRENT_SHARDS = 16; + RGWRados *const store; + const RGWBucketInfo& bucket_info; + const std::vector& markers; //< shard markers to trim + size_t i{0}; //< index of current shard marker + public: + BucketTrimShardCollectCR(RGWRados *store, const RGWBucketInfo& bucket_info, + const std::vector& markers) + : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS), + store(store), bucket_info(bucket_info), markers(markers) + {} + bool spawn_next() override; +}; + +bool BucketTrimShardCollectCR::spawn_next() +{ + while (i < markers.size()) { + const auto& marker = markers[i]; + const auto shard_id = i++; + + // skip empty markers + if (!marker.empty()) { + ldout(cct, 10) << "trimming bilog shard " << shard_id + << " of " << bucket_info.bucket << " at marker " << marker << dendl; + spawn(new RGWRadosBILogTrimCR(store, bucket_info, shard_id, + std::string{}, marker), + false); + return true; + } + } + return false; +} + +/// trim the bilog of all of the given bucket instance's shards +class BucketTrimInstanceCR : public RGWCoroutine { + RGWRados *const store; + RGWHTTPManager *const http; + BucketTrimObserver *const observer; + std::string bucket_instance; + const std::string& zone_id; //< my zone id + RGWBucketInfo bucket_info; //< bucket instance info to locate bucket indices + int child_ret = 0; + + using StatusShards = std::vector; + std::vector peer_status; //< sync status for each peer + std::vector min_markers; //< min marker per shard + + public: + BucketTrimInstanceCR(RGWRados *store, RGWHTTPManager *http, + BucketTrimObserver *observer, + const std::string& bucket_instance) + : RGWCoroutine(store->ctx()), store(store), + 
http(http), observer(observer), + bucket_instance(bucket_instance), + zone_id(store->svc.zone->get_zone().id), + peer_status(store->svc.zone->get_zone_data_notify_to_map().size()) + {} + + int operate() override; +}; + +int BucketTrimInstanceCR::operate() +{ + reenter(this) { + ldout(cct, 4) << "starting trim on bucket=" << bucket_instance << dendl; + + // query peers for sync status + set_status("fetching sync status from peers"); + yield { + // query data sync status from each sync peer + rgw_http_param_pair params[] = { + { "type", "bucket-index" }, + { "status", nullptr }, + { "bucket", bucket_instance.c_str() }, + { "source-zone", zone_id.c_str() }, + { nullptr, nullptr } + }; + + auto p = peer_status.begin(); + for (auto& c : store->svc.zone->get_zone_data_notify_to_map()) { + using StatusCR = RGWReadRESTResourceCR; + spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p), + false); + ++p; + } + // in parallel, read the local bucket instance info + spawn(new RGWGetBucketInstanceInfoCR(store->get_async_rados(), store, + bucket_instance, &bucket_info), + false); + } + // wait for a response from each peer. 
all must respond to attempt trim + while (num_spawned()) { + yield wait_for_child(); + collect(&child_ret, nullptr); + if (child_ret < 0) { + drain_all(); + return set_cr_error(child_ret); + } + } + + // initialize each shard with the maximum marker, which is only used when + // there are no peers syncing from us + min_markers.assign(std::max(1u, bucket_info.num_shards), + RGWSyncLogTrimCR::max_marker); + + // determine the minimum marker for each shard + retcode = take_min_status(cct, peer_status.begin(), peer_status.end(), + &min_markers); + if (retcode < 0) { + ldout(cct, 4) << "failed to correlate bucket sync status from peers" << dendl; + return set_cr_error(retcode); + } + + // trim shards with a ShardCollectCR + ldout(cct, 10) << "trimming bilogs for bucket=" << bucket_info.bucket + << " markers=" << min_markers << ", shards=" << min_markers.size() << dendl; + set_status("trimming bilog shards"); + yield call(new BucketTrimShardCollectCR(store, bucket_info, min_markers)); + // ENODATA just means there were no keys to trim + if (retcode == -ENODATA) { + retcode = 0; + } + if (retcode < 0) { + ldout(cct, 4) << "failed to trim bilog shards: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + + observer->on_bucket_trimmed(std::move(bucket_instance)); + return set_cr_done(); + } + return 0; +} + +/// trim each bucket instance while limiting the number of concurrent operations +class BucketTrimInstanceCollectCR : public RGWShardCollectCR { + RGWRados *const store; + RGWHTTPManager *const http; + BucketTrimObserver *const observer; + std::vector::const_iterator bucket; + std::vector::const_iterator end; + public: + BucketTrimInstanceCollectCR(RGWRados *store, RGWHTTPManager *http, + BucketTrimObserver *observer, + const std::vector& buckets, + int max_concurrent) + : RGWShardCollectCR(store->ctx(), max_concurrent), + store(store), http(http), observer(observer), + bucket(buckets.begin()), end(buckets.end()) + {} + bool spawn_next() 
override; +}; + +bool BucketTrimInstanceCollectCR::spawn_next() +{ + if (bucket == end) { + return false; + } + spawn(new BucketTrimInstanceCR(store, http, observer, *bucket), false); + ++bucket; + return true; +} + +/// correlate the replies from each peer gateway into the given counter +int accumulate_peer_counters(bufferlist& bl, BucketChangeCounter& counter) +{ + counter.clear(); + + try { + // decode notify responses + auto p = bl.cbegin(); + std::map, bufferlist> replies; + std::set> timeouts; + decode(replies, p); + decode(timeouts, p); + + for (auto& peer : replies) { + auto q = peer.second.cbegin(); + TrimCounters::Response response; + decode(response, q); + for (const auto& b : response.bucket_counters) { + counter.insert(b.bucket, b.count); + } + } + } catch (const buffer::error& e) { + return -EIO; + } + return 0; +} + +/// metadata callback has the signature bool(string&& key, string&& marker) +using MetadataListCallback = std::function; + +/// lists metadata keys, passing each to a callback until it returns false. 
+/// on reaching the end, it will restart at the beginning and list up to the +/// initial marker +class AsyncMetadataList : public RGWAsyncRadosRequest { + CephContext *const cct; + RGWMetadataManager *const mgr; + const std::string section; + const std::string start_marker; + MetadataListCallback callback; + + int _send_request() override; + public: + AsyncMetadataList(CephContext *cct, RGWCoroutine *caller, + RGWAioCompletionNotifier *cn, RGWMetadataManager *mgr, + const std::string& section, const std::string& start_marker, + const MetadataListCallback& callback) + : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr), + section(section), start_marker(start_marker), callback(callback) + {} +}; + +int AsyncMetadataList::_send_request() +{ + void* handle = nullptr; + std::list keys; + bool truncated{false}; + std::string marker; + + // start a listing at the given marker + int r = mgr->list_keys_init(section, start_marker, &handle); + if (r == -EINVAL) { + // restart with empty marker below + } else if (r < 0) { + ldout(cct, 10) << "failed to init metadata listing: " + << cpp_strerror(r) << dendl; + return r; + } else { + ldout(cct, 20) << "starting metadata listing at " << start_marker << dendl; + + // release the handle when scope exits + auto g = make_scope_guard([=] { mgr->list_keys_complete(handle); }); + + do { + // get the next key and marker + r = mgr->list_keys_next(handle, 1, keys, &truncated); + if (r < 0) { + ldout(cct, 10) << "failed to list metadata: " + << cpp_strerror(r) << dendl; + return r; + } + marker = mgr->get_marker(handle); + + if (!keys.empty()) { + ceph_assert(keys.size() == 1); + auto& key = keys.front(); + if (!callback(std::move(key), std::move(marker))) { + return 0; + } + } + } while (truncated); + + if (start_marker.empty()) { + // already listed all keys + return 0; + } + } + + // restart the listing from the beginning (empty marker) + handle = nullptr; + + r = mgr->list_keys_init(section, "", &handle); + if (r < 0) { + 
ldout(cct, 10) << "failed to restart metadata listing: " + << cpp_strerror(r) << dendl; + return r; + } + ldout(cct, 20) << "restarting metadata listing" << dendl; + + // release the handle when scope exits + auto g = make_scope_guard([=] { mgr->list_keys_complete(handle); }); + do { + // get the next key and marker + r = mgr->list_keys_next(handle, 1, keys, &truncated); + if (r < 0) { + ldout(cct, 10) << "failed to list metadata: " + << cpp_strerror(r) << dendl; + return r; + } + marker = mgr->get_marker(handle); + + if (!keys.empty()) { + ceph_assert(keys.size() == 1); + auto& key = keys.front(); + // stop at original marker + if (marker > start_marker) { + return 0; + } + if (!callback(std::move(key), std::move(marker))) { + return 0; + } + } + } while (truncated); + + return 0; +} + +/// coroutine wrapper for AsyncMetadataList +class MetadataListCR : public RGWSimpleCoroutine { + RGWAsyncRadosProcessor *const async_rados; + RGWMetadataManager *const mgr; + const std::string& section; + const std::string& start_marker; + MetadataListCallback callback; + RGWAsyncRadosRequest *req{nullptr}; + public: + MetadataListCR(CephContext *cct, RGWAsyncRadosProcessor *async_rados, + RGWMetadataManager *mgr, const std::string& section, + const std::string& start_marker, + const MetadataListCallback& callback) + : RGWSimpleCoroutine(cct), async_rados(async_rados), mgr(mgr), + section(section), start_marker(start_marker), callback(callback) + {} + ~MetadataListCR() override { + request_cleanup(); + } + + int send_request() override { + req = new AsyncMetadataList(cct, this, stack->create_completion_notifier(), + mgr, section, start_marker, callback); + async_rados->queue(req); + return 0; + } + int request_complete() override { + return req->get_ret_status(); + } + void request_cleanup() override { + if (req) { + req->finish(); + req = nullptr; + } + } +}; + +class BucketTrimCR : public RGWCoroutine { + RGWRados *const store; + RGWHTTPManager *const http; + const 
BucketTrimConfig& config; + BucketTrimObserver *const observer; + const rgw_raw_obj& obj; + ceph::mono_time start_time; + bufferlist notify_replies; + BucketChangeCounter counter; + std::vector buckets; //< buckets selected for trim + BucketTrimStatus status; + RGWObjVersionTracker objv; //< version tracker for trim status object + std::string last_cold_marker; //< position for next trim marker + + static const std::string section; //< metadata section for bucket instances + public: + BucketTrimCR(RGWRados *store, RGWHTTPManager *http, + const BucketTrimConfig& config, BucketTrimObserver *observer, + const rgw_raw_obj& obj) + : RGWCoroutine(store->ctx()), store(store), http(http), config(config), + observer(observer), obj(obj), counter(config.counter_size) + {} + + int operate() override; +}; + +const std::string BucketTrimCR::section{"bucket.instance"}; + +int BucketTrimCR::operate() +{ + reenter(this) { + start_time = ceph::mono_clock::now(); + + if (config.buckets_per_interval) { + // query watch/notify for hot buckets + ldout(cct, 10) << "fetching active bucket counters" << dendl; + set_status("fetching active bucket counters"); + yield { + // request the top bucket counters from each peer gateway + const TrimNotifyType type = NotifyTrimCounters; + TrimCounters::Request request{32}; + bufferlist bl; + encode(type, bl); + encode(request, bl); + call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms, + ¬ify_replies)); + } + if (retcode < 0) { + ldout(cct, 10) << "failed to fetch peer bucket counters" << dendl; + return set_cr_error(retcode); + } + + // select the hottest buckets for trim + retcode = accumulate_peer_counters(notify_replies, counter); + if (retcode < 0) { + ldout(cct, 4) << "failed to correlate peer bucket counters" << dendl; + return set_cr_error(retcode); + } + buckets.reserve(config.buckets_per_interval); + + const int max_count = config.buckets_per_interval - + config.min_cold_buckets_per_interval; + counter.get_highest(max_count, + 
[this] (const std::string& bucket, int count) { + buckets.push_back(bucket); + }); + } + + if (buckets.size() < config.buckets_per_interval) { + // read BucketTrimStatus for marker position + set_status("reading trim status"); + using ReadStatus = RGWSimpleRadosReadCR; + yield call(new ReadStatus(store->get_async_rados(), store->svc.sysobj, obj, + &status, true, &objv)); + if (retcode < 0) { + ldout(cct, 10) << "failed to read bilog trim status: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + if (status.marker == "MAX") { + status.marker.clear(); // restart at the beginning + } + ldout(cct, 10) << "listing cold buckets from marker=" + << status.marker << dendl; + + set_status("listing cold buckets for trim"); + yield { + // capture a reference so 'this' remains valid in the callback + auto ref = boost::intrusive_ptr{this}; + // list cold buckets to consider for trim + auto cb = [this, ref] (std::string&& bucket, std::string&& marker) { + // filter out keys that we trimmed recently + if (observer->trimmed_recently(bucket)) { + return true; + } + // filter out active buckets that we've already selected + auto i = std::find(buckets.begin(), buckets.end(), bucket); + if (i != buckets.end()) { + return true; + } + buckets.emplace_back(std::move(bucket)); + // remember the last cold bucket spawned to update the status marker + last_cold_marker = std::move(marker); + // return true if there's room for more + return buckets.size() < config.buckets_per_interval; + }; + + call(new MetadataListCR(cct, store->get_async_rados(), store->meta_mgr, + section, status.marker, cb)); + } + if (retcode < 0) { + ldout(cct, 4) << "failed to list bucket instance metadata: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + } + + // trim bucket instances with limited concurrency + set_status("trimming buckets"); + ldout(cct, 4) << "collected " << buckets.size() << " buckets for trim" << dendl; + yield call(new 
BucketTrimInstanceCollectCR(store, http, observer, buckets, + config.concurrent_buckets)); + // ignore errors from individual buckets + + // write updated trim status + if (!last_cold_marker.empty() && status.marker != last_cold_marker) { + set_status("writing updated trim status"); + status.marker = std::move(last_cold_marker); + ldout(cct, 20) << "writing bucket trim marker=" << status.marker << dendl; + using WriteStatus = RGWSimpleRadosWriteCR; + yield call(new WriteStatus(store->get_async_rados(), store->svc.sysobj, obj, + status, &objv)); + if (retcode < 0) { + ldout(cct, 4) << "failed to write updated trim status: " + << cpp_strerror(retcode) << dendl; + return set_cr_error(retcode); + } + } + + // notify peers that trim completed + set_status("trim completed"); + yield { + const TrimNotifyType type = NotifyTrimComplete; + TrimComplete::Request request; + bufferlist bl; + encode(type, bl); + encode(request, bl); + call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms, + nullptr)); + } + if (retcode < 0) { + ldout(cct, 10) << "failed to notify peers of trim completion" << dendl; + return set_cr_error(retcode); + } + + ldout(cct, 4) << "bucket index log processing completed in " + << ceph::mono_clock::now() - start_time << dendl; + return set_cr_done(); + } + return 0; +} + +class BucketTrimPollCR : public RGWCoroutine { + RGWRados *const store; + RGWHTTPManager *const http; + const BucketTrimConfig& config; + BucketTrimObserver *const observer; + const rgw_raw_obj& obj; + const std::string name{"trim"}; //< lock name + const std::string cookie; + + public: + BucketTrimPollCR(RGWRados *store, RGWHTTPManager *http, + const BucketTrimConfig& config, + BucketTrimObserver *observer, const rgw_raw_obj& obj) + : RGWCoroutine(store->ctx()), store(store), http(http), + config(config), observer(observer), obj(obj), + cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)) + {} + + int operate() override; +}; + +int BucketTrimPollCR::operate() +{ + 
reenter(this) { + for (;;) { + set_status("sleeping"); + wait(utime_t{static_cast(config.trim_interval_sec), 0}); + + // prevent others from trimming for our entire wait interval + set_status("acquiring trim lock"); + yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store, + obj, name, cookie, + config.trim_interval_sec)); + if (retcode < 0) { + ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl; + continue; + } + + set_status("trimming"); + yield call(new BucketTrimCR(store, http, config, observer, obj)); + if (retcode < 0) { + // on errors, unlock so other gateways can try + set_status("unlocking"); + yield call(new RGWSimpleRadosUnlockCR(store->get_async_rados(), store, + obj, name, cookie)); + } + } + } + return 0; +} + +/// tracks a bounded list of events with timestamps. old events can be expired, +/// and recent events can be searched by key. expiration depends on events being +/// inserted in temporal order +template +class RecentEventList { + public: + using clock_type = Clock; + using time_point = typename clock_type::time_point; + + RecentEventList(size_t max_size, const ceph::timespan& max_duration) + : events(max_size), max_duration(max_duration) + {} + + /// insert an event at the given point in time. 
this time must be at least as + /// recent as the last inserted event + void insert(T&& value, const time_point& now) { + // ceph_assert(events.empty() || now >= events.back().time) + events.push_back(Event{std::move(value), now}); + } + + /// performs a linear search for an event matching the given key, whose type + /// U can be any that provides operator==(U, T) + template + bool lookup(const U& key) const { + for (const auto& event : events) { + if (key == event.value) { + return true; + } + } + return false; + } + + /// remove events that are no longer recent compared to the given point in time + void expire_old(const time_point& now) { + const auto expired_before = now - max_duration; + while (!events.empty() && events.front().time < expired_before) { + events.pop_front(); + } + } + + private: + struct Event { + T value; + time_point time; + }; + boost::circular_buffer events; + const ceph::timespan max_duration; +}; + +namespace rgw { + +// read bucket trim configuration from ceph context +void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config) +{ + const auto& conf = cct->_conf; + + config.trim_interval_sec = + conf.get_val("rgw_sync_log_trim_interval"); + config.counter_size = 512; + config.buckets_per_interval = + conf.get_val("rgw_sync_log_trim_max_buckets"); + config.min_cold_buckets_per_interval = + conf.get_val("rgw_sync_log_trim_min_cold_buckets"); + config.concurrent_buckets = + conf.get_val("rgw_sync_log_trim_concurrent_buckets"); + config.notify_timeout_ms = 10000; + config.recent_size = 128; + config.recent_duration = std::chrono::hours(2); +} + +class BucketTrimManager::Impl : public TrimCounters::Server, + public BucketTrimObserver { + public: + RGWRados *const store; + const BucketTrimConfig config; + + const rgw_raw_obj status_obj; + + /// count frequency of bucket instance entries in the data changes log + BucketChangeCounter counter; + + using RecentlyTrimmedBucketList = RecentEventList; + using clock_type = 
RecentlyTrimmedBucketList::clock_type; + /// track recently trimmed buckets to focus trim activity elsewhere + RecentlyTrimmedBucketList trimmed; + + /// serve the bucket trim watch/notify api + BucketTrimWatcher watcher; + + /// protect data shared between data sync, trim, and watch/notify threads + std::mutex mutex; + + Impl(RGWRados *store, const BucketTrimConfig& config) + : store(store), config(config), + status_obj(store->svc.zone->get_zone_params().log_pool, BucketTrimStatus::oid), + counter(config.counter_size), + trimmed(config.recent_size, config.recent_duration), + watcher(store, status_obj, this) + {} + + /// TrimCounters::Server interface for watch/notify api + void get_bucket_counters(int count, TrimCounters::Vector& buckets) { + buckets.reserve(count); + std::lock_guard lock(mutex); + counter.get_highest(count, [&buckets] (const std::string& key, int count) { + buckets.emplace_back(key, count); + }); + ldout(store->ctx(), 20) << "get_bucket_counters: " << buckets << dendl; + } + + void reset_bucket_counters() override { + ldout(store->ctx(), 20) << "bucket trim completed" << dendl; + std::lock_guard lock(mutex); + counter.clear(); + trimmed.expire_old(clock_type::now()); + } + + /// BucketTrimObserver interface to remember successfully-trimmed buckets + void on_bucket_trimmed(std::string&& bucket_instance) override { + ldout(store->ctx(), 20) << "trimmed bucket instance " << bucket_instance << dendl; + std::lock_guard lock(mutex); + trimmed.insert(std::move(bucket_instance), clock_type::now()); + } + + bool trimmed_recently(const boost::string_view& bucket_instance) override { + std::lock_guard lock(mutex); + return trimmed.lookup(bucket_instance); + } +}; + +BucketTrimManager::BucketTrimManager(RGWRados *store, + const BucketTrimConfig& config) + : impl(new Impl(store, config)) +{ +} +BucketTrimManager::~BucketTrimManager() = default; + +int BucketTrimManager::init() +{ + return impl->watcher.start(); +} + +void 
BucketTrimManager::on_bucket_changed(const boost::string_view& bucket) +{ + std::lock_guard lock(impl->mutex); + // filter recently trimmed bucket instances out of bucket change counter + if (impl->trimmed.lookup(bucket)) { + return; + } + impl->counter.insert(bucket.to_string()); +} + +RGWCoroutine* BucketTrimManager::create_bucket_trim_cr(RGWHTTPManager *http) +{ + return new BucketTrimPollCR(impl->store, http, impl->config, + impl.get(), impl->status_obj); +} + +RGWCoroutine* BucketTrimManager::create_admin_bucket_trim_cr(RGWHTTPManager *http) +{ + // return the trim coroutine without any polling + return new BucketTrimCR(impl->store, http, impl->config, + impl.get(), impl->status_obj); +} + +} // namespace rgw diff --git a/src/rgw/rgw_sync_log_trim.h b/src/rgw/rgw_sync_log_trim.h new file mode 100644 index 00000000..13d1f63a --- /dev/null +++ b/src/rgw/rgw_sync_log_trim.h @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * Author: Casey Bodley + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef RGW_SYNC_LOG_TRIM_H +#define RGW_SYNC_LOG_TRIM_H + +#include +#include +#include "include/encoding.h" +#include "common/ceph_time.h" + +class CephContext; +class RGWCoroutine; +class RGWHTTPManager; +class RGWRados; + +namespace rgw { + +/// Interface to inform the trim process about which buckets are most active +struct BucketChangeObserver { + virtual ~BucketChangeObserver() = default; + + virtual void on_bucket_changed(const boost::string_view& bucket_instance) = 0; +}; + +/// Configuration for BucketTrimManager +struct BucketTrimConfig { + /// time interval in seconds between bucket trim attempts + uint32_t trim_interval_sec{0}; + /// maximum number of buckets to track with BucketChangeObserver + size_t counter_size{0}; + /// maximum number of buckets to process each trim interval + uint32_t buckets_per_interval{0}; + /// minimum number of buckets to choose from the global bucket instance list + uint32_t min_cold_buckets_per_interval{0}; + /// maximum number of buckets to process in parallel + uint32_t concurrent_buckets{0}; + /// timeout in ms for bucket trim notify replies + uint64_t notify_timeout_ms{0}; + /// maximum number of recently trimmed buckets to remember (should be small + /// enough for a linear search) + size_t recent_size{0}; + /// maximum duration to consider a trim as 'recent' (should be some multiple + /// of the trim interval, at least) + ceph::timespan recent_duration{0}; +}; + +/// fill out the BucketTrimConfig from the ceph context +void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config); + +/// Determines the buckets on which to focus trim activity, using two sources of +/// input: the frequency of entries read from the data changes log, and a global +/// listing of the bucket.instance metadata. 
This allows us to trim active +/// buckets quickly, while also ensuring that all buckets will eventually trim +class BucketTrimManager : public BucketChangeObserver { + class Impl; + std::unique_ptr impl; + public: + BucketTrimManager(RGWRados *store, const BucketTrimConfig& config); + ~BucketTrimManager(); + + int init(); + + /// increment a counter for the given bucket instance + void on_bucket_changed(const boost::string_view& bucket_instance) override; + + /// create a coroutine to run the bucket trim process every trim interval + RGWCoroutine* create_bucket_trim_cr(RGWHTTPManager *http); + + /// create a coroutine to trim buckets directly via radosgw-admin + RGWCoroutine* create_admin_bucket_trim_cr(RGWHTTPManager *http); +}; + +/// provides persistent storage for the trim manager's current position in the +/// list of bucket instance metadata +struct BucketTrimStatus { + std::string marker; //< metadata key of current bucket instance + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(marker, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(marker, p); + DECODE_FINISH(p); + } + + static const std::string oid; +}; + +} // namespace rgw + +WRITE_CLASS_ENCODER(rgw::BucketTrimStatus); + +#endif // RGW_SYNC_LOG_TRIM_H diff --git a/src/rgw/rgw_sync_module.cc b/src/rgw/rgw_sync_module.cc new file mode 100644 index 00000000..91f31adb --- /dev/null +++ b/src/rgw/rgw_sync_module.cc @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_bucket.h" + +#include "rgw_sync_module_log.h" +#include "rgw_sync_module_es.h" +#include "rgw_sync_module_aws.h" +#include "rgw_sync_module_pubsub.h" + +#include + +#define dout_subsys ceph_subsys_rgw + +RGWMetadataHandler 
*RGWSyncModuleInstance::alloc_bucket_meta_handler() +{ + return RGWBucketMetaHandlerAllocator::alloc(); +} + +RGWMetadataHandler *RGWSyncModuleInstance::alloc_bucket_instance_meta_handler() +{ + return RGWBucketInstanceMetaHandlerAllocator::alloc(); +} + +RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + bucket_info(_bucket_info), key(_key) { +} + +RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + bucket_info(_bucket_info), key(_key) { +} + +int RGWCallStatRemoteObjCR::operate() { + reenter(this) { + yield { + call(new RGWStatRemoteObjCR(sync_env->async_rados, sync_env->store, + sync_env->source_zone, + bucket_info, key, &mtime, &size, &etag, &attrs, &headers)); + } + if (retcode < 0) { + ldout(sync_env->cct, 10) << "RGWStatRemoteObjCR() returned " << retcode << dendl; + return set_cr_error(retcode); + } + ldout(sync_env->cct, 20) << "stat of remote obj: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key + << " size=" << size << " mtime=" << mtime << dendl; + yield { + RGWStatRemoteObjCBCR *cb = allocate_callback(); + if (cb) { + cb->set_result(mtime, size, etag, std::move(attrs), std::move(headers)); + call(cb); + } + } + if (retcode < 0) { + ldout(sync_env->cct, 10) << "RGWStatRemoteObjCR() callback returned " << retcode << dendl; + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; +} + +void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager) +{ + RGWSyncModuleRef default_module(std::make_shared()); + modules_manager->register_module("rgw", default_module, true); + + RGWSyncModuleRef archive_module(std::make_shared()); + modules_manager->register_module("archive", archive_module); + + RGWSyncModuleRef log_module(std::make_shared()); + 
modules_manager->register_module("log", log_module); + + RGWSyncModuleRef es_module(std::make_shared()); + modules_manager->register_module("elasticsearch", es_module); + + RGWSyncModuleRef aws_module(std::make_shared()); + modules_manager->register_module("cloud", aws_module); + + RGWSyncModuleRef pubsub_module(std::make_shared()); + modules_manager->register_module("pubsub", pubsub_module); +} diff --git a/src/rgw/rgw_sync_module.h b/src/rgw/rgw_sync_module.h new file mode 100644 index 00000000..aa68934c --- /dev/null +++ b/src/rgw/rgw_sync_module.h @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_MODULE_H +#define CEPH_RGW_SYNC_MODULE_H + +#include "rgw_common.h" +#include "rgw_coroutine.h" + +class RGWBucketInfo; +class RGWRemoteDataLog; +struct RGWDataSyncEnv; +struct rgw_bucket_entry_owner; +struct rgw_obj_key; + + +class RGWDataSyncModule { +public: + RGWDataSyncModule() {} + virtual ~RGWDataSyncModule() {} + + virtual void init(RGWDataSyncEnv *sync_env, uint64_t instance_id) {} + + virtual RGWCoroutine *init_sync(RGWDataSyncEnv *sync_env) { + return nullptr; + } + + virtual RGWCoroutine *start_sync(RGWDataSyncEnv *sync_env) { + return nullptr; + } + virtual RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) = 0; + virtual RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, + bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0; + virtual RGWCoroutine *create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0; +}; + +class RGWRESTMgr; +class RGWMetadataHandler; + +class RGWSyncModuleInstance { +public: + 
RGWSyncModuleInstance() {} + virtual ~RGWSyncModuleInstance() {} + virtual RGWDataSyncModule *get_data_handler() = 0; + virtual RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) { + return orig; + } + virtual bool supports_user_writes() { + return false; + } + virtual RGWMetadataHandler *alloc_bucket_meta_handler(); + virtual RGWMetadataHandler *alloc_bucket_instance_meta_handler(); + + // indication whether the sync module start with full sync (default behavior) + // incremental sync would follow anyway + virtual bool should_full_sync() const { + return true; + } +}; + +typedef std::shared_ptr RGWSyncModuleInstanceRef; + +class JSONFormattable; + +class RGWSyncModule { + +public: + RGWSyncModule() {} + virtual ~RGWSyncModule() {} + + virtual bool supports_writes() { + return false; + } + virtual bool supports_data_export() = 0; + virtual int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) = 0; +}; + +typedef std::shared_ptr RGWSyncModuleRef; + + +class RGWSyncModulesManager { + Mutex lock; + + map modules; +public: + RGWSyncModulesManager() : lock("RGWSyncModulesManager") {} + + void register_module(const string& name, RGWSyncModuleRef& module, bool is_default = false) { + Mutex::Locker l(lock); + modules[name] = module; + if (is_default) { + modules[string()] = module; + } + } + + bool get_module(const string& name, RGWSyncModuleRef *module) { + Mutex::Locker l(lock); + auto iter = modules.find(name); + if (iter == modules.end()) { + return false; + } + if (module != nullptr) { + *module = iter->second; + } + return true; + } + + + int supports_data_export(const string& name) { + RGWSyncModuleRef module; + if (!get_module(name, &module)) { + return -ENOENT; + } + + return module.get()->supports_data_export(); + } + + int create_instance(CephContext *cct, const string& name, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + RGWSyncModuleRef module; + if (!get_module(name, &module)) 
{ + return -ENOENT; + } + + return module.get()->create_instance(cct, config, instance); + } + + vector get_registered_module_names() const { + vector names; + for (auto& i: modules) { + if (!i.first.empty()) { + names.push_back(i.first); + } + } + return names; + } +}; + +class RGWStatRemoteObjCBCR : public RGWCoroutine { +protected: + RGWDataSyncEnv *sync_env; + + RGWBucketInfo bucket_info; + rgw_obj_key key; + + ceph::real_time mtime; + uint64_t size = 0; + string etag; + map attrs; + map headers; +public: + RGWStatRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key); + ~RGWStatRemoteObjCBCR() override {} + + void set_result(ceph::real_time& _mtime, + uint64_t _size, + const string& _etag, + map&& _attrs, + map&& _headers) { + mtime = _mtime; + size = _size; + etag = _etag; + attrs = std::move(_attrs); + headers = std::move(_headers); + } +}; + +class RGWCallStatRemoteObjCR : public RGWCoroutine { + ceph::real_time mtime; + uint64_t size{0}; + string etag; + map attrs; + map headers; + +protected: + RGWDataSyncEnv *sync_env; + + RGWBucketInfo bucket_info; + rgw_obj_key key; + +public: + RGWCallStatRemoteObjCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key); + + ~RGWCallStatRemoteObjCR() override {} + + int operate() override; + + virtual RGWStatRemoteObjCBCR *allocate_callback() { + return nullptr; + } +}; + +void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager); + +#endif diff --git a/src/rgw/rgw_sync_module_aws.cc b/src/rgw/rgw_sync_module_aws.cc new file mode 100644 index 00000000..e8074d8b --- /dev/null +++ b/src/rgw/rgw_sync_module_aws.cc @@ -0,0 +1,1807 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" + +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_aws.h" +#include "rgw_cr_rados.h" +#include 
"rgw_rest_conn.h" +#include "rgw_cr_rest.h" +#include "rgw_acl.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include + +#define dout_subsys ceph_subsys_rgw + + +#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024) + +static string default_target_path = "rgw-${zonegroup}-${sid}/${bucket}"; + +static string get_key_oid(const rgw_obj_key& key) +{ + string oid = key.name; + if (!key.instance.empty() && + !key.have_null_instance()) { + oid += string(":") + key.instance; + } + return oid; +} + +static string obj_to_aws_path(const rgw_obj& obj) +{ + string path = obj.bucket.name + "/" + get_key_oid(obj.key); + + + return path; +} + +/* + + json configuration definition: + + { + "connection": { + "access_key": , + "secret": , + "endpoint": , + "host_style": , + }, + "acls": [ { "type": , + "source_id": , + "dest_id": } ... ], # optional, acl mappings, no mappings if does not exist + "target_path": , # override default + + + # anything below here is for non trivial configuration + # can be used in conjuction with the above + + "default": { + "connection": { + "access_key": , + "secret": , + "endpoint": , + "host_style" , + }, + "acls": [ # list of source uids and how they map into destination uids in the dest objects acls + { + "type" : , # optional, default is id + "source_id": , + "dest_id": + } ... ] + "target_path": "rgwx-${sid}/${bucket}" # how a bucket name is mapped to destination path, + # final object name will be target_path + "/" + obj + }, + "connections": [ + { + "id": , + "access_key": , + "secret": , + "endpoint": , + } ... ], + "acl_profiles": [ + { + "id": , # acl mappings + "acls": [ { + "type": , + "source_id": , + "dest_id": + } ... ] + } + ], + "profiles": [ + { + "source_bucket": , # can specify either specific bucket name (foo), or prefix (foo*) + "target_path": , # (override default) + "connection_id": , # optional, if empty references default connection + "acls_id": , # optional, if empty references default mappings + } ... 
], + } + +target path optional variables: + +(evaluated at init) +sid: sync instance id, randomly generated by sync process on first sync initialization +zonegroup: zonegroup name +zonegroup_id: zonegroup id +zone: zone name +zone_id: zone id + +(evaluated when syncing) +bucket: bucket name +owner: bucket owner + +*/ + +struct ACLMapping { + ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER}; + string source_id; + string dest_id; + + ACLMapping() = default; + + ACLMapping(ACLGranteeTypeEnum t, + const string& s, + const string& d) : type(t), + source_id(s), + dest_id(d) {} + + void init(const JSONFormattable& config) { + const string& t = config["type"]; + + if (t == "email") { + type = ACL_TYPE_EMAIL_USER; + } else if (t == "uri") { + type = ACL_TYPE_GROUP; + } else { + type = ACL_TYPE_CANON_USER; + } + + source_id = config["source_id"]; + dest_id = config["dest_id"]; + } + + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection os(jf, "acl_mapping"); + string s; + switch (type) { + case ACL_TYPE_EMAIL_USER: + s = "email"; + break; + case ACL_TYPE_GROUP: + s = "uri"; + break; + default: + s = "id"; + break; + } + encode_json("type", s, &jf); + encode_json("source_id", source_id, &jf); + encode_json("dest_id", dest_id, &jf); + } +}; + +struct ACLMappings { + map acl_mappings; + + void init(const JSONFormattable& config) { + for (auto& c : config.array()) { + ACLMapping m; + m.init(c); + + acl_mappings.emplace(std::make_pair(m.source_id, m)); + } + } + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ArraySection os(jf, "acls"); + + for (auto& i : acl_mappings) { + i.second.dump_conf(cct, jf); + } + } +}; + +struct AWSSyncConfig_ACLProfiles { + map > acl_profiles; + + void init(const JSONFormattable& config) { + for (auto& c : config.array()) { + const string& profile_id = c["id"]; + + std::shared_ptr ap{new ACLMappings}; + ap->init(c["acls"]); + + acl_profiles[profile_id] = ap; + } + } + void
dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ArraySection section(jf, "acl_profiles"); + + for (auto& p : acl_profiles) { + Formatter::ObjectSection section(jf, "profile"); + encode_json("id", p.first, &jf); + p.second->dump_conf(cct, jf); + } + } + + bool find(const string& profile_id, ACLMappings *result) const { + auto iter = acl_profiles.find(profile_id); + if (iter == acl_profiles.end()) { + return false; + } + *result = *iter->second; + return true; + } +}; + +struct AWSSyncConfig_Connection { + string connection_id; + string endpoint; + RGWAccessKey key; + HostStyle host_style{PathStyle}; + + bool has_endpoint{false}; + bool has_key{false}; + bool has_host_style{false}; + + void init(const JSONFormattable& config) { + has_endpoint = config.exists("endpoint"); + has_key = config.exists("access_key") || config.exists("secret"); + has_host_style = config.exists("host_style"); + + connection_id = config["id"]; + endpoint = config["endpoint"]; + + key = RGWAccessKey(config["access_key"], config["secret"]); + string host_style_str = config["host_style"]; + if (host_style_str != "virtual") { + host_style = PathStyle; + } else { + host_style = VirtualStyle; + } + } + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection section(jf, "connection"); + encode_json("id", connection_id, &jf); + encode_json("endpoint", endpoint, &jf); + string s = (host_style == PathStyle ? "path" : "virtual"); + encode_json("host_style", s, &jf); + + { + Formatter::ObjectSection os(jf, "key"); + encode_json("access_key", key.id, &jf); + string secret = (key.key.empty() ? 
"" : "******"); + encode_json("secret", secret, &jf); + } + } +}; + +static int conf_to_uint64(CephContext *cct, const JSONFormattable& config, const string& key, uint64_t *pval) +{ + string sval; + if (config.find(key, &sval)) { + string err; + uint64_t val = strict_strtoll(sval.c_str(), 10, &err); + if (!err.empty()) { + ldout(cct, 0) << "ERROR: could not parse configurable value for cloud sync module: " << key << ": " << sval << dendl; + return -EINVAL; + } + *pval = val; + } + return 0; +} + +struct AWSSyncConfig_S3 { + uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE}; + uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE}; + + int init(CephContext *cct, const JSONFormattable& config) { + int r = conf_to_uint64(cct, config, "multipart_sync_threshold", &multipart_sync_threshold); + if (r < 0) { + return r; + } + + r = conf_to_uint64(cct, config, "multipart_min_part_size", &multipart_min_part_size); + if (r < 0) { + return r; + } +#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024) + if (multipart_min_part_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) { + multipart_min_part_size = MULTIPART_MIN_POSSIBLE_PART_SIZE; + } + return 0; + } + + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection section(jf, "s3"); + encode_json("multipart_sync_threshold", multipart_sync_threshold, &jf); + encode_json("multipart_min_part_size", multipart_min_part_size, &jf); + } +}; + +struct AWSSyncConfig_Profile { + string source_bucket; + bool prefix{false}; + string target_path; + string connection_id; + string acls_id; + + std::shared_ptr conn_conf; + std::shared_ptr acls; + + std::shared_ptr conn; + + void init(const JSONFormattable& config) { + source_bucket = config["source_bucket"]; + + prefix = (!source_bucket.empty() && source_bucket[source_bucket.size() - 1] == '*'); + + if (prefix) { + source_bucket = source_bucket.substr(0, source_bucket.size() - 1); + } + + target_path = config["target_path"]; + 
connection_id = config["connection_id"]; + acls_id = config["acls_id"]; + + if (config.exists("connection")) { + conn_conf = make_shared(); + conn_conf->init(config["connection"]); + } + + if (config.exists("acls")) { + acls = make_shared(); + acls->init(config["acls"]); + } + } + + void dump_conf(CephContext *cct, JSONFormatter& jf, const char *section = "config") const { + Formatter::ObjectSection config(jf, section); + string sb{source_bucket}; + if (prefix) { + sb.append("*"); + } + encode_json("source_bucket", sb, &jf); + encode_json("target_path", target_path, &jf); + encode_json("connection_id", connection_id, &jf); + encode_json("acls_id", acls_id, &jf); + if (conn_conf.get()) { + conn_conf->dump_conf(cct, jf); + } + if (acls.get()) { + acls->dump_conf(cct, jf); + } + } +}; + +static void find_and_replace(const string& src, const string& find, const string& replace, string *dest) +{ + string s = src; + + size_t pos = s.find(find); + while (pos != string::npos) { + size_t next_ofs = pos + find.size(); + s = s.substr(0, pos) + replace + s.substr(next_ofs); + pos = s.find(find, next_ofs); + } + + *dest = s; +} + +static void apply_meta_param(const string& src, const string& param, const string& val, string *dest) +{ + string s = string("${") + param + "}"; + find_and_replace(src, s, val, dest); +} + + +struct AWSSyncConfig { + AWSSyncConfig_Profile default_profile; + std::shared_ptr root_profile; + + map > connections; + AWSSyncConfig_ACLProfiles acl_profiles; + + map > explicit_profiles; + + AWSSyncConfig_S3 s3; + + int init_profile(CephContext *cct, const JSONFormattable& profile_conf, AWSSyncConfig_Profile& profile, + bool connection_must_exist) { + if (!profile.connection_id.empty()) { + if (profile.conn_conf) { + ldout(cct, 0) << "ERROR: ambiguous profile connection configuration, connection_id=" << profile.connection_id << dendl; + return -EINVAL; + } + if (connections.find(profile.connection_id) == connections.end()) { + ldout(cct, 0) << "ERROR: profile 
configuration reference non-existent connection_id=" << profile.connection_id << dendl; + return -EINVAL; + } + profile.conn_conf = connections[profile.connection_id]; + } else if (!profile.conn_conf) { + profile.connection_id = default_profile.connection_id; + auto i = connections.find(profile.connection_id); + if (i != connections.end()) { + profile.conn_conf = i->second; + } + } + + if (connection_must_exist && !profile.conn_conf) { + ldout(cct, 0) << "ERROR: remote connection undefined for sync profile" << dendl; + return -EINVAL; + } + + if (profile.conn_conf && default_profile.conn_conf) { + if (!profile.conn_conf->has_endpoint) { + profile.conn_conf->endpoint = default_profile.conn_conf->endpoint; + } + if (!profile.conn_conf->has_host_style) { + profile.conn_conf->host_style = default_profile.conn_conf->host_style; + } + if (!profile.conn_conf->has_key) { + profile.conn_conf->key = default_profile.conn_conf->key; + } + } + + ACLMappings acl_mappings; + + if (!profile.acls_id.empty()) { + if (!acl_profiles.find(profile.acls_id, &acl_mappings)) { + ldout(cct, 0) << "ERROR: profile configuration reference non-existent acls id=" << profile.acls_id << dendl; + return -EINVAL; + } + profile.acls = acl_profiles.acl_profiles[profile.acls_id]; + } else if (!profile.acls) { + if (default_profile.acls) { + profile.acls = default_profile.acls; + profile.acls_id = default_profile.acls_id; + } + } + + if (profile.target_path.empty()) { + profile.target_path = default_profile.target_path; + } + if (profile.target_path.empty()) { + profile.target_path = default_target_path; + } + + return 0; + } + + int init_target(CephContext *cct, const JSONFormattable& profile_conf, std::shared_ptr *ptarget) { + std::shared_ptr profile; + profile.reset(new AWSSyncConfig_Profile); + profile->init(profile_conf); + + int ret = init_profile(cct, profile_conf, *profile, true); + if (ret < 0) { + return ret; + } + + auto& sb = profile->source_bucket; + + if (explicit_profiles.find(sb) != 
explicit_profiles.end()) { + ldout(cct, 0) << "WARNING: duplicate target configuration in sync module" << dendl; + } + + explicit_profiles[sb] = profile; + if (ptarget) { + *ptarget = profile; + } + return 0; + } + + bool do_find_profile(const rgw_bucket bucket, std::shared_ptr *result) { + const string& name = bucket.name; + auto iter = explicit_profiles.upper_bound(name); + if (iter == explicit_profiles.begin()) { + return false; + } + + --iter; + if (iter->first.size() > name.size()) { + return false; + } + if (name.compare(0, iter->first.size(), iter->first) != 0) { + return false; + } + + std::shared_ptr& target = iter->second; + + if (!target->prefix && + name.size() != iter->first.size()) { + return false; + } + + *result = target; + return true; + } + + void find_profile(const rgw_bucket bucket, std::shared_ptr *result) { + if (!do_find_profile(bucket, result)) { + *result = root_profile; + } + } + + AWSSyncConfig() {} + + int init(CephContext *cct, const JSONFormattable& config) { + auto& default_conf = config["default"]; + + if (config.exists("default")) { + default_profile.init(default_conf); + init_profile(cct, default_conf, default_profile, false); + } + + for (auto& conn : config["connections"].array()) { + auto new_conn = conn; + + std::shared_ptr c{new AWSSyncConfig_Connection}; + c->init(new_conn); + + connections[new_conn["id"]] = c; + } + + acl_profiles.init(config["acl_profiles"]); + + int r = s3.init(cct, config["s3"]); + if (r < 0) { + return r; + } + + auto new_root_conf = config; + + r = init_target(cct, new_root_conf, &root_profile); /* the root profile config */ + if (r < 0) { + return r; + } + + for (auto target_conf : config["profiles"].array()) { + int r = init_target(cct, target_conf, nullptr); + if (r < 0) { + return r; + } + } + + JSONFormatter jf(true); + dump_conf(cct, jf); + stringstream ss; + jf.flush(ss); + + ldout(cct, 5) << "sync module config (parsed representation):\n" << ss.str() << dendl; + + return 0; + } + + void 
expand_target(RGWDataSyncEnv *sync_env, const string& sid, const string& path, string *dest) { + apply_meta_param(path, "sid", sid, dest); + + const RGWZoneGroup& zg = sync_env->store->svc.zone->get_zonegroup(); + apply_meta_param(path, "zonegroup", zg.get_name(), dest); + apply_meta_param(path, "zonegroup_id", zg.get_id(), dest); + + const RGWZone& zone = sync_env->store->svc.zone->get_zone(); + apply_meta_param(path, "zone", zone.name, dest); + apply_meta_param(path, "zone_id", zone.id, dest); + } + + void update_config(RGWDataSyncEnv *sync_env, const string& sid) { + expand_target(sync_env, sid, root_profile->target_path, &root_profile->target_path); + ldout(sync_env->cct, 20) << "updated target: (root) -> " << root_profile->target_path << dendl; + for (auto& t : explicit_profiles) { + expand_target(sync_env, sid, t.second->target_path, &t.second->target_path); + ldout(sync_env->cct, 20) << "updated target: " << t.first << " -> " << t.second->target_path << dendl; + } + } + + void dump_conf(CephContext *cct, JSONFormatter& jf) const { + Formatter::ObjectSection config(jf, "config"); + root_profile->dump_conf(cct, jf); + jf.open_array_section("connections"); + for (auto c : connections) { + c.second->dump_conf(cct, jf); + } + jf.close_section(); + + acl_profiles.dump_conf(cct, jf); + + { // targets + Formatter::ArraySection as(jf, "profiles"); + for (auto& t : explicit_profiles) { + Formatter::ObjectSection target_section(jf, "profile"); + encode_json("name", t.first, &jf); + t.second->dump_conf(cct, jf); + } + } + } + + string get_path(std::shared_ptr& profile, + const RGWBucketInfo& bucket_info, + const rgw_obj_key& obj) { + string bucket_str; + string owner; + if (!bucket_info.owner.tenant.empty()) { + bucket_str = owner = bucket_info.owner.tenant + "-"; + owner += bucket_info.owner.id; + } + bucket_str += bucket_info.bucket.name; + + const string& path = profile->target_path; + + string new_path; + apply_meta_param(path, "bucket", bucket_str, &new_path); + 
apply_meta_param(new_path, "owner", owner, &new_path); + + new_path += string("/") + get_key_oid(obj); + + return new_path; + } + + void get_target(std::shared_ptr& profile, + const RGWBucketInfo& bucket_info, + const rgw_obj_key& obj, + string *bucket_name, + string *obj_name) { + string path = get_path(profile, bucket_info, obj); + size_t pos = path.find('/'); + + *bucket_name = path.substr(0, pos); + *obj_name = path.substr(pos + 1); + } + + void init_conns(RGWDataSyncEnv *sync_env, const string& id) { + update_config(sync_env, id); + + auto& root_conf = root_profile->conn_conf; + + root_profile->conn.reset(new S3RESTConn(sync_env->cct, + sync_env->store->svc.zone, + id, + { root_conf->endpoint }, + root_conf->key, + root_conf->host_style)); + + for (auto i : explicit_profiles) { + auto& c = i.second; + + c->conn.reset(new S3RESTConn(sync_env->cct, + sync_env->store->svc.zone, + id, + { c->conn_conf->endpoint }, + c->conn_conf->key, + c->conn_conf->host_style)); + } + } +}; + + +struct AWSSyncInstanceEnv { + AWSSyncConfig conf; + string id; + + explicit AWSSyncInstanceEnv(AWSSyncConfig& _conf) : conf(_conf) {} + + void init(RGWDataSyncEnv *sync_env, uint64_t instance_id) { + char buf[32]; + snprintf(buf, sizeof(buf), "%llx", (unsigned long long)instance_id); + id = buf; + + conf.init_conns(sync_env, id); + } + + void get_profile(const rgw_bucket& bucket, std::shared_ptr *ptarget) { + conf.find_profile(bucket, ptarget); + ceph_assert(ptarget); + } +}; + +static int do_decode_rest_obj(CephContext *cct, map& attrs, map& headers, rgw_rest_obj *info) +{ + for (auto header : headers) { + const string& val = header.second; + if (header.first == "RGWX_OBJECT_SIZE") { + info->content_len = atoi(val.c_str()); + } else { + info->attrs[header.first] = val; + } + } + + info->acls.set_ctx(cct); + auto aiter = attrs.find(RGW_ATTR_ACL); + if (aiter != attrs.end()) { + bufferlist& bl = aiter->second; + auto bliter = bl.cbegin(); + try { + info->acls.decode(bliter); + } catch 
(buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode policy off attrs" << dendl; + return -EIO; + } + } else { + ldout(cct, 0) << "WARNING: acl attrs not provided" << dendl; + } + + return 0; +} + +class RGWRESTStreamGetCRF : public RGWStreamReadHTTPResourceCRF +{ + RGWDataSyncEnv *sync_env; + RGWRESTConn *conn; + rgw_obj src_obj; + RGWRESTConn::get_obj_params req_params; + + rgw_sync_aws_src_obj_properties src_properties; +public: + RGWRESTStreamGetCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWDataSyncEnv *_sync_env, + RGWRESTConn *_conn, + rgw_obj& _src_obj, + const rgw_sync_aws_src_obj_properties& _src_properties) : RGWStreamReadHTTPResourceCRF(_cct, _env, _caller, + _sync_env->http_manager, _src_obj.key), + sync_env(_sync_env), conn(_conn), src_obj(_src_obj), + src_properties(_src_properties) { + } + + int init() override { + /* init input connection */ + + + req_params.get_op = true; + req_params.prepend_metadata = true; + + req_params.unmod_ptr = &src_properties.mtime; + req_params.etag = src_properties.etag; + req_params.mod_zone_id = src_properties.zone_short_id; + req_params.mod_pg_ver = src_properties.pg_ver; + + if (range.is_set) { + req_params.range_is_set = true; + req_params.range_start = range.ofs; + req_params.range_end = range.ofs + range.size - 1; + } + + RGWRESTStreamRWRequest *in_req; + int ret = conn->get_obj(src_obj, req_params, false /* send */, &in_req); + if (ret < 0) { + ldout(sync_env->cct, 0) << "ERROR: " << __func__ << "(): conn->get_obj() returned ret=" << ret << dendl; + return ret; + } + + set_req(in_req); + + return RGWStreamReadHTTPResourceCRF::init(); + } + + int decode_rest_obj(map& headers, bufferlist& extra_data) override { + map src_attrs; + + ldout(sync_env->cct, 20) << __func__ << ":" << " headers=" << headers << " extra_data.length()=" << extra_data.length() << dendl; + + if (extra_data.length() > 0) { + JSONParser jp; + if (!jp.parse(extra_data.c_str(), extra_data.length())) { 
+ ldout(sync_env->cct, 0) << "ERROR: failed to parse response extra data. len=" << extra_data.length() << " data=" << extra_data.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + } + return do_decode_rest_obj(sync_env->cct, src_attrs, headers, &rest_obj); + } + + bool need_extra_data() override { + return true; + } +}; + +static std::set keep_headers = { "CONTENT_TYPE", + "CONTENT_ENCODING", + "CONTENT_DISPOSITION", + "CONTENT_LANGUAGE" }; + +class RGWAWSStreamPutCRF : public RGWStreamWriteHTTPResourceCRF +{ + RGWDataSyncEnv *sync_env; + rgw_sync_aws_src_obj_properties src_properties; + std::shared_ptr target; + rgw_obj dest_obj; + string etag; +public: + RGWAWSStreamPutCRF(CephContext *_cct, + RGWCoroutinesEnv *_env, + RGWCoroutine *_caller, + RGWDataSyncEnv *_sync_env, + const rgw_sync_aws_src_obj_properties& _src_properties, + std::shared_ptr& _target, + rgw_obj& _dest_obj) : RGWStreamWriteHTTPResourceCRF(_cct, _env, _caller, _sync_env->http_manager), + sync_env(_sync_env), src_properties(_src_properties), target(_target), dest_obj(_dest_obj) { + } + + int init() override { + /* init output connection */ + RGWRESTStreamS3PutObj *out_req{nullptr}; + + if (multipart.is_multipart) { + char buf[32]; + snprintf(buf, sizeof(buf), "%d", multipart.part_num); + rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() }, + { "partNumber", buf }, + { nullptr, nullptr } }; + target->conn->put_obj_send_init(dest_obj, params, &out_req); + } else { + target->conn->put_obj_send_init(dest_obj, nullptr, &out_req); + } + + set_req(out_req); + + return RGWStreamWriteHTTPResourceCRF::init(); + } + + static bool keep_attr(const string& h) { + return (keep_headers.find(h) != keep_headers.end() || + boost::algorithm::starts_with(h, "X_AMZ_")); + } + + static void init_send_attrs(CephContext *cct, + const rgw_rest_obj& rest_obj, + const rgw_sync_aws_src_obj_properties& src_properties, + const AWSSyncConfig_Profile *target, + 
map *attrs) { + auto& new_attrs = *attrs; + + new_attrs.clear(); + + for (auto& hi : rest_obj.attrs) { + if (keep_attr(hi.first)) { + new_attrs.insert(hi); + } + } + + auto acl = rest_obj.acls.get_acl(); + + map > access_map; + + if (target->acls) { + for (auto& grant : acl.get_grant_map()) { + auto& orig_grantee = grant.first; + auto& perm = grant.second; + + string grantee; + + const auto& am = target->acls->acl_mappings; + + auto iter = am.find(orig_grantee); + if (iter == am.end()) { + ldout(cct, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl; + continue; + } + + grantee = iter->second.dest_id; + + string type; + + switch (iter->second.type) { + case ACL_TYPE_CANON_USER: + type = "id"; + break; + case ACL_TYPE_EMAIL_USER: + type = "emailAddress"; + break; + case ACL_TYPE_GROUP: + type = "uri"; + break; + default: + continue; + } + + string tv = type + "=" + grantee; + + int flags = perm.get_permission().get_permissions(); + if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) { + access_map[flags].push_back(tv); + continue; + } + + for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) { + if (flags & i) { + access_map[i].push_back(tv); + } + } + } + } + + for (auto aiter : access_map) { + int grant_type = aiter.first; + + string header_str("x-amz-grant-"); + + switch (grant_type) { + case RGW_PERM_READ: + header_str.append("read"); + break; + case RGW_PERM_WRITE: + header_str.append("write"); + break; + case RGW_PERM_READ_ACP: + header_str.append("read-acp"); + break; + case RGW_PERM_WRITE_ACP: + header_str.append("write-acp"); + break; + case RGW_PERM_FULL_CONTROL: + header_str.append("full-control"); + break; + } + + string s; + + for (auto viter : aiter.second) { + if (!s.empty()) { + s.append(", "); + } + s.append(viter); + } + + ldout(cct, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl; + + new_attrs[header_str] = s; + } + + char buf[32]; + snprintf(buf, sizeof(buf), "%llu", (long 
long)src_properties.versioned_epoch); + new_attrs["x-amz-meta-rgwx-versioned-epoch"] = buf; + + utime_t ut(src_properties.mtime); + snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + (long long)ut.nsec()); + + new_attrs["x-amz-meta-rgwx-source-mtime"] = buf; + new_attrs["x-amz-meta-rgwx-source-etag"] = src_properties.etag; + new_attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name; + if (!rest_obj.key.instance.empty()) { + new_attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance; + } + } + + void send_ready(const rgw_rest_obj& rest_obj) override { + RGWRESTStreamS3PutObj *r = static_cast(req); + + map new_attrs; + if (!multipart.is_multipart) { + init_send_attrs(sync_env->cct, rest_obj, src_properties, target.get(), &new_attrs); + } + + r->set_send_length(rest_obj.content_len); + + RGWAccessControlPolicy policy; + + r->send_ready(target->conn->get_key(), new_attrs, policy, false); + } + + void handle_headers(const map& headers) { + for (auto h : headers) { + if (h.first == "ETAG") { + etag = h.second; + } + } + } + + bool get_etag(string *petag) { + if (etag.empty()) { + return false; + } + *petag = etag; + return true; + } +}; + + +class RGWAWSStreamObjToCloudPlainCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTConn *source_conn; + std::shared_ptr target; + rgw_obj src_obj; + rgw_obj dest_obj; + + rgw_sync_aws_src_obj_properties src_properties; + + std::shared_ptr in_crf; + std::shared_ptr out_crf; + +public: + RGWAWSStreamObjToCloudPlainCR(RGWDataSyncEnv *_sync_env, + RGWRESTConn *_source_conn, + const rgw_obj& _src_obj, + const rgw_sync_aws_src_obj_properties& _src_properties, + std::shared_ptr _target, + const rgw_obj& _dest_obj) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + source_conn(_source_conn), + target(_target), + src_obj(_src_obj), + dest_obj(_dest_obj), + src_properties(_src_properties) {} + + int operate() override { + reenter(this) { + /* init input */ + in_crf.reset(new 
RGWRESTStreamGetCRF(cct, get_env(), this, sync_env, + source_conn, src_obj, + src_properties)); + + /* init output */ + out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sync_env, + src_properties, target, dest_obj)); + + yield call(new RGWStreamSpliceCR(cct, sync_env->http_manager, in_crf, out_crf)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSStreamObjToCloudMultipartPartCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTConn *source_conn; + std::shared_ptr target; + rgw_obj src_obj; + rgw_obj dest_obj; + + rgw_sync_aws_src_obj_properties src_properties; + + string upload_id; + + rgw_sync_aws_multipart_part_info part_info; + + std::shared_ptr in_crf; + std::shared_ptr out_crf; + + string *petag; + +public: + RGWAWSStreamObjToCloudMultipartPartCR(RGWDataSyncEnv *_sync_env, + RGWRESTConn *_source_conn, + const rgw_obj& _src_obj, + std::shared_ptr& _target, + const rgw_obj& _dest_obj, + const rgw_sync_aws_src_obj_properties& _src_properties, + const string& _upload_id, + const rgw_sync_aws_multipart_part_info& _part_info, + string *_petag) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + source_conn(_source_conn), + target(_target), + src_obj(_src_obj), + dest_obj(_dest_obj), + src_properties(_src_properties), + upload_id(_upload_id), + part_info(_part_info), + petag(_petag) {} + + int operate() override { + reenter(this) { + /* init input */ + in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sync_env, + source_conn, src_obj, + src_properties)); + + in_crf->set_range(part_info.ofs, part_info.size); + + /* init output */ + out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sync_env, + src_properties, target, dest_obj)); + + out_crf->set_multipart(upload_id, part_info.part_num, part_info.size); + + yield call(new RGWStreamSpliceCR(cct, sync_env->http_manager, in_crf, out_crf)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + if 
(!(static_cast(out_crf.get()))->get_etag(petag)) { + ldout(sync_env->cct, 0) << "ERROR: failed to get etag from PUT request" << dendl; + return set_cr_error(-EIO); + } + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSAbortMultipartCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTConn *dest_conn; + rgw_obj dest_obj; + + string upload_id; + +public: + RGWAWSAbortMultipartCR(RGWDataSyncEnv *_sync_env, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + const string& _upload_id) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + upload_id(_upload_id) {} + + int operate() override { + reenter(this) { + + yield { + rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} }; + bufferlist bl; + call(new RGWDeleteRESTResourceCR(sync_env->cct, dest_conn, sync_env->http_manager, + obj_to_aws_path(dest_obj), params)); + } + + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (retcode=" << retcode << ")" << dendl; + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSInitMultipartCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTConn *dest_conn; + rgw_obj dest_obj; + + uint64_t obj_size; + map attrs; + + bufferlist out_bl; + + string *upload_id; + + struct InitMultipartResult { + string bucket; + string key; + string upload_id; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Bucket", bucket, obj); + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("UploadId", upload_id, obj); + } + } result; + +public: + RGWAWSInitMultipartCR(RGWDataSyncEnv *_sync_env, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + uint64_t _obj_size, + const map& _attrs, + string *_upload_id) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + 
obj_size(_obj_size), + attrs(_attrs), + upload_id(_upload_id) {} + + int operate() override { + reenter(this) { + + yield { + rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} }; + bufferlist bl; + call(new RGWPostRawRESTResourceCR (sync_env->cct, dest_conn, sync_env->http_manager, + obj_to_aws_path(dest_obj), params, &attrs, bl, &out_bl)); + } + + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl; + return set_cr_error(retcode); + } + { + /* + * If one of the following fails we cannot abort upload, as we cannot + * extract the upload id. If one of these fail it's very likely that that's + * the least of our problem. + */ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldout(sync_env->cct, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return set_cr_error(-EIO); + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldout(sync_env->cct, 5) << "ERROR: failed to parse xml: " << str << dendl; + return set_cr_error(-EIO); + } + + try { + RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldout(sync_env->cct, 5) << "ERROR: unexpected xml: " << str << dendl; + return set_cr_error(-EIO); + } + } + + ldout(sync_env->cct, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl; + + *upload_id = result.upload_id; + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSCompleteMultipartCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTConn *dest_conn; + rgw_obj dest_obj; + + bufferlist out_bl; + + string upload_id; + + struct CompleteMultipartReq { + map parts; + + explicit CompleteMultipartReq(const map& _parts) : 
parts(_parts) {} + + void dump_xml(Formatter *f) const { + for (auto p : parts) { + f->open_object_section("Part"); + encode_xml("PartNumber", p.first, f); + encode_xml("ETag", p.second.etag, f); + f->close_section(); + }; + } + } req_enc; + + struct CompleteMultipartResult { + string location; + string bucket; + string key; + string etag; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Location", bucket, obj); + RGWXMLDecoder::decode_xml("Bucket", bucket, obj); + RGWXMLDecoder::decode_xml("Key", key, obj); + RGWXMLDecoder::decode_xml("ETag", etag, obj); + } + } result; + +public: + RGWAWSCompleteMultipartCR(RGWDataSyncEnv *_sync_env, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + string _upload_id, + const map& _parts) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + upload_id(_upload_id), + req_enc(_parts) {} + + int operate() override { + reenter(this) { + + yield { + rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} }; + stringstream ss; + XMLFormatter formatter; + + encode_xml("CompleteMultipartUpload", req_enc, &formatter); + + formatter.flush(ss); + + bufferlist bl; + bl.append(ss.str()); + + call(new RGWPostRawRESTResourceCR (sync_env->cct, dest_conn, sync_env->http_manager, + obj_to_aws_path(dest_obj), params, nullptr, bl, &out_bl)); + } + + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl; + return set_cr_error(retcode); + } + { + /* + * If one of the following fails we cannot abort upload, as we cannot + * extract the upload id. If one of these fail it's very likely that that's + * the least of our problem. 
+ */ + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldout(sync_env->cct, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return set_cr_error(-EIO); + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldout(sync_env->cct, 5) << "ERROR: failed to parse xml: " << str << dendl; + return set_cr_error(-EIO); + } + + try { + RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldout(sync_env->cct, 5) << "ERROR: unexpected xml: " << str << dendl; + return set_cr_error(-EIO); + } + } + + ldout(sync_env->cct, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl; + + return set_cr_done(); + } + + return 0; + } +}; + + +class RGWAWSStreamAbortMultipartUploadCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWRESTConn *dest_conn; + const rgw_obj dest_obj; + const rgw_raw_obj status_obj; + + string upload_id; + +public: + + RGWAWSStreamAbortMultipartUploadCR(RGWDataSyncEnv *_sync_env, + RGWRESTConn *_dest_conn, + const rgw_obj& _dest_obj, + const rgw_raw_obj& _status_obj, + const string& _upload_id) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + dest_conn(_dest_conn), + dest_obj(_dest_obj), + status_obj(_status_obj), + upload_id(_upload_id) {} + + int operate() override { + reenter(this) { + yield call(new RGWAWSAbortMultipartCR(sync_env, dest_conn, dest_obj, upload_id)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " retcode=" << retcode << dendl; + /* ignore error, best effort */ + } + yield call(new RGWRadosRemoveCR(sync_env->store, status_obj)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed 
to remove sync status obj obj=" << status_obj << " retcode=" << retcode << dendl; + /* ignore error, best effort */ + } + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSStreamObjToCloudMultipartCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + AWSSyncConfig& conf; + RGWRESTConn *source_conn; + std::shared_ptr target; + rgw_obj src_obj; + rgw_obj dest_obj; + + uint64_t obj_size; + string src_etag; + rgw_sync_aws_src_obj_properties src_properties; + rgw_rest_obj rest_obj; + + rgw_sync_aws_multipart_upload_info status; + + map new_attrs; + + rgw_sync_aws_multipart_part_info *pcur_part_info{nullptr}; + + int ret_err{0}; + + rgw_raw_obj status_obj; + +public: + RGWAWSStreamObjToCloudMultipartCR(RGWDataSyncEnv *_sync_env, + AWSSyncConfig& _conf, + RGWRESTConn *_source_conn, + const rgw_obj& _src_obj, + std::shared_ptr& _target, + const rgw_obj& _dest_obj, + uint64_t _obj_size, + const rgw_sync_aws_src_obj_properties& _src_properties, + const rgw_rest_obj& _rest_obj) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + conf(_conf), + source_conn(_source_conn), + target(_target), + src_obj(_src_obj), + dest_obj(_dest_obj), + obj_size(_obj_size), + src_properties(_src_properties), + rest_obj(_rest_obj), + status_obj(sync_env->store->svc.zone->get_zone_params().log_pool, + RGWBucketSyncStatusManager::obj_status_oid(sync_env->source_zone, src_obj)) { + } + + + int operate() override { + reenter(this) { + yield call(new RGWSimpleRadosReadCR(sync_env->async_rados, sync_env->store->svc.sysobj, + status_obj, &status, false)); + + if (retcode < 0 && retcode != -ENOENT) { + ldout(sync_env->cct, 0) << "ERROR: failed to read sync status of object " << src_obj << " retcode=" << retcode << dendl; + return retcode; + } + + if (retcode >= 0) { + /* check here that mtime and size did not change */ + + if (status.src_properties.mtime != src_properties.mtime || status.obj_size != obj_size || + status.src_properties.etag != src_properties.etag) { + yield call(new 
RGWAWSStreamAbortMultipartUploadCR(sync_env, target->conn.get(), dest_obj, status_obj, status.upload_id)); + retcode = -ENOENT; + } + } + + if (retcode == -ENOENT) { + RGWAWSStreamPutCRF::init_send_attrs(sync_env->cct, rest_obj, src_properties, target.get(), &new_attrs); + + yield call(new RGWAWSInitMultipartCR(sync_env, target->conn.get(), dest_obj, status.obj_size, std::move(new_attrs), &status.upload_id)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + status.obj_size = obj_size; + status.src_properties = src_properties; +#define MULTIPART_MAX_PARTS 10000 + uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS; + status.part_size = std::max(conf.s3.multipart_min_part_size, min_part_size); + status.num_parts = (obj_size + status.part_size - 1) / status.part_size; + status.cur_part = 1; + } + + for (; (uint32_t)status.cur_part <= status.num_parts; ++status.cur_part) { + yield { + rgw_sync_aws_multipart_part_info& cur_part_info = status.parts[status.cur_part]; + cur_part_info.part_num = status.cur_part; + cur_part_info.ofs = status.cur_ofs; + cur_part_info.size = std::min((uint64_t)status.part_size, status.obj_size - status.cur_ofs); + + pcur_part_info = &cur_part_info; + + status.cur_ofs += status.part_size; + + call(new RGWAWSStreamObjToCloudMultipartPartCR(sync_env, + source_conn, src_obj, + target, + dest_obj, + status.src_properties, + status.upload_id, + cur_part_info, + &cur_part_info.etag)); + } + + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to sync obj=" << src_obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << status.cur_part << " (error: " << cpp_strerror(-retcode) << ")" << dendl; + ret_err = retcode; + yield call(new RGWAWSStreamAbortMultipartUploadCR(sync_env, target->conn.get(), dest_obj, status_obj, status.upload_id)); + return set_cr_error(ret_err); + } + + yield call(new RGWSimpleRadosWriteCR(sync_env->async_rados, sync_env->store->svc.sysobj, status_obj, status)); + if 
(retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to store multipart upload state, retcode=" << retcode << dendl; + /* continue with upload anyway */ + } + ldout(sync_env->cct, 20) << "sync of object=" << src_obj << " via multipart upload, finished sending part #" << status.cur_part << " etag=" << pcur_part_info->etag << dendl; + } + + yield call(new RGWAWSCompleteMultipartCR(sync_env, target->conn.get(), dest_obj, status.upload_id, status.parts)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to complete multipart upload of obj=" << src_obj << " (error: " << cpp_strerror(-retcode) << ")" << dendl; + ret_err = retcode; + yield call(new RGWAWSStreamAbortMultipartUploadCR(sync_env, target->conn.get(), dest_obj, status_obj, status.upload_id)); + return set_cr_error(ret_err); + } + + /* remove status obj */ + yield call(new RGWRadosRemoveCR(sync_env->store, status_obj)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to abort multipart upload obj=" << src_obj << " upload_id=" << status.upload_id << " part number " << status.cur_part << " (" << cpp_strerror(-retcode) << ")" << dendl; + /* ignore error, best effort */ + } + return set_cr_done(); + } + + return 0; + } +}; +template +int decode_attr(map& attrs, const char *attr_name, T *result, T def_val) +{ + map::iterator iter = attrs.find(attr_name); + if (iter == attrs.end()) { + *result = def_val; + return 0; + } + bufferlist& bl = iter->second; + if (bl.length() == 0) { + *result = def_val; + return 0; + } + auto bliter = bl.cbegin(); + try { + decode(*result, bliter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +// maybe use Fetch Remote Obj instead? 
+class RGWAWSHandleRemoteObjCBCR: public RGWStatRemoteObjCBCR { + AWSSyncInstanceEnv& instance; + + uint64_t versioned_epoch{0}; + + RGWRESTConn *source_conn{nullptr}; + std::shared_ptr target; + bufferlist res; + unordered_map bucket_created; + string target_bucket_name; + string target_obj_name; + rgw_rest_obj rest_obj; + int ret{0}; + + uint32_t src_zone_short_id{0}; + uint64_t src_pg_ver{0}; + + bufferlist out_bl; + + struct CreateBucketResult { + string code; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Code", code, obj); + } + } result; + +public: + RGWAWSHandleRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, + rgw_obj_key& _key, + AWSSyncInstanceEnv& _instance, + uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sync_env, _bucket_info, _key), + instance(_instance), versioned_epoch(_versioned_epoch) + {} + + ~RGWAWSHandleRemoteObjCBCR(){ + } + + int operate() override { + reenter(this) { + ret = decode_attr(attrs, RGW_ATTR_PG_VER, &src_pg_ver, (uint64_t)0); + if (ret < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl; + } else { + ret = decode_attr(attrs, RGW_ATTR_SOURCE_ZONE, &src_zone_short_id, (uint32_t)0); + if (ret < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to decode source zone short_id attr, ignoring" << dendl; + src_pg_ver = 0; /* all or nothing */ + } + } + ldout(sync_env->cct, 4) << "AWS: download begin: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key << " size=" << size + << " mtime=" << mtime << " etag=" << etag + << " zone_short_id=" << src_zone_short_id << " pg_ver=" << src_pg_ver + << dendl; + + + source_conn = sync_env->store->svc.zone->get_zone_conn_by_id(sync_env->source_zone); + if (!source_conn) { + ldout(sync_env->cct, 0) << "ERROR: cannot find http connection to zone " << sync_env->source_zone << dendl; + return set_cr_error(-EINVAL); + } + + instance.get_profile(bucket_info.bucket, &target); + 
instance.conf.get_target(target, bucket_info, key, &target_bucket_name, &target_obj_name); + + if (bucket_created.find(target_bucket_name) == bucket_created.end()){ + yield { + ldout(sync_env->cct,0) << "AWS: creating bucket " << target_bucket_name << dendl; + bufferlist bl; + call(new RGWPutRawRESTResourceCR (sync_env->cct, target->conn.get(), + sync_env->http_manager, + target_bucket_name, nullptr, bl, &out_bl)); + } + if (retcode < 0 ) { + RGWXMLDecoder::XMLParser parser; + if (!parser.init()) { + ldout(sync_env->cct, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl; + return set_cr_error(retcode); + } + + if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) { + string str(out_bl.c_str(), out_bl.length()); + ldout(sync_env->cct, 5) << "ERROR: failed to parse xml: " << str << dendl; + return set_cr_error(retcode); + } + + try { + RGWXMLDecoder::decode_xml("Error", result, &parser, true); + } catch (RGWXMLDecoder::err& err) { + string str(out_bl.c_str(), out_bl.length()); + ldout(sync_env->cct, 5) << "ERROR: unexpected xml: " << str << dendl; + return set_cr_error(retcode); + } + + if (result.code != "BucketAlreadyOwnedByYou") { + return set_cr_error(retcode); + } + } + + bucket_created[target_bucket_name] = true; + } + + yield { + rgw_obj src_obj(bucket_info.bucket, key); + + /* init output */ + rgw_bucket target_bucket; + target_bucket.name = target_bucket_name; /* this is only possible because we only use bucket name for + uri resolution */ + rgw_obj dest_obj(target_bucket, target_obj_name); + + + rgw_sync_aws_src_obj_properties src_properties; + src_properties.mtime = mtime; + src_properties.etag = etag; + src_properties.zone_short_id = src_zone_short_id; + src_properties.pg_ver = src_pg_ver; + src_properties.versioned_epoch = versioned_epoch; + + if (size < instance.conf.s3.multipart_sync_threshold) { + call(new RGWAWSStreamObjToCloudPlainCR(sync_env, source_conn, src_obj, + src_properties, + target, + 
dest_obj)); + } else { + rgw_rest_obj rest_obj; + rest_obj.init(key); + if (do_decode_rest_obj(sync_env->cct, attrs, headers, &rest_obj)) { + ldout(sync_env->cct, 0) << "ERROR: failed to decode rest obj out of headers=" << headers << ", attrs=" << attrs << dendl; + return set_cr_error(-EINVAL); + } + call(new RGWAWSStreamObjToCloudMultipartCR(sync_env, instance.conf, source_conn, src_obj, + target, dest_obj, size, src_properties, rest_obj)); + } + } + if (retcode < 0) { + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + return 0; + } +}; + +class RGWAWSHandleRemoteObjCR : public RGWCallStatRemoteObjCR { + AWSSyncInstanceEnv& instance; + uint64_t versioned_epoch; +public: + RGWAWSHandleRemoteObjCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, + AWSSyncInstanceEnv& _instance, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sync_env, _bucket_info, _key), + instance(_instance), versioned_epoch(_versioned_epoch) { + } + + ~RGWAWSHandleRemoteObjCR() {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWAWSHandleRemoteObjCBCR(sync_env, bucket_info, key, instance, versioned_epoch); + } +}; + +class RGWAWSRemoveRemoteObjCBCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env{nullptr}; + std::shared_ptr target; + RGWBucketInfo bucket_info; + rgw_obj_key key; + ceph::real_time mtime; + AWSSyncInstanceEnv& instance; + int ret{0}; +public: + RGWAWSRemoveRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, const ceph::real_time& _mtime, + AWSSyncInstanceEnv& _instance) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + bucket_info(_bucket_info), key(_key), + mtime(_mtime), instance(_instance) {} + int operate() override { + reenter(this) { + ldout(sync_env->cct, 0) << ": remove remote obj: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime << dendl; + yield { + instance.get_profile(bucket_info.bucket, &target); + 
string path = instance.conf.get_path(target, bucket_info, key); + ldout(sync_env->cct, 0) << "AWS: removing aws object at" << path << dendl; + + call(new RGWDeleteRESTResourceCR(sync_env->cct, target->conn.get(), + sync_env->http_manager, + path, nullptr /* params */)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } + +}; + + +class RGWAWSDataSyncModule: public RGWDataSyncModule { + CephContext *cct; + AWSSyncInstanceEnv instance; +public: + RGWAWSDataSyncModule(CephContext *_cct, AWSSyncConfig& _conf) : + cct(_cct), + instance(_conf) { + } + + void init(RGWDataSyncEnv *sync_env, uint64_t instance_id) override { + instance.init(sync_env, instance_id); + } + + ~RGWAWSDataSyncModule() {} + + RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, + std::optional versioned_epoch, + rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 0) << instance.id << ": sync_object: b=" << bucket_info.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + return new RGWAWSHandleRemoteObjCR(sync_env, bucket_info, key, instance, versioned_epoch.value_or(0)); + } + RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, + rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 0) <<"rm_object: b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return new RGWAWSRemoveRemoteObjCBCR(sync_env, bucket_info, key, mtime, instance); + } + RGWCoroutine *create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, + rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 0) <<"AWS Not implemented: create_delete_marker: b=" 
<< bucket_info.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return NULL; + } +}; + +class RGWAWSSyncModuleInstance : public RGWSyncModuleInstance { + RGWAWSDataSyncModule data_handler; +public: + RGWAWSSyncModuleInstance(CephContext *cct, AWSSyncConfig& _conf) : data_handler(cct, _conf) {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } +}; + +int RGWAWSSyncModule::create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance){ + AWSSyncConfig conf; + + int r = conf.init(cct, config); + if (r < 0) { + return r; + } + + instance->reset(new RGWAWSSyncModuleInstance(cct, conf)); + return 0; +} diff --git a/src/rgw/rgw_sync_module_aws.h b/src/rgw/rgw_sync_module_aws.h new file mode 100644 index 00000000..a44202b5 --- /dev/null +++ b/src/rgw/rgw_sync_module_aws.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_SYNC_MODULE_AWS_H +#define RGW_SYNC_MODULE_AWS_H + +#include "rgw_sync_module.h" + +struct rgw_sync_aws_multipart_part_info { + int part_num{0}; + uint64_t ofs{0}; + uint64_t size{0}; + string etag; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(part_num, bl); + encode(ofs, bl); + encode(size, bl); + encode(etag, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(part_num, bl); + decode(ofs, bl); + decode(size, bl); + decode(etag, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_part_info) + +struct rgw_sync_aws_src_obj_properties { + ceph::real_time mtime; + string etag; + uint32_t zone_short_id{0}; + uint64_t pg_ver{0}; + uint64_t versioned_epoch{0}; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(mtime, bl); + encode(etag, bl); + encode(zone_short_id, bl); + encode(pg_ver, bl); + 
encode(versioned_epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(mtime, bl); + decode(etag, bl); + decode(zone_short_id, bl); + decode(pg_ver, bl); + decode(versioned_epoch, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_aws_src_obj_properties) + +struct rgw_sync_aws_multipart_upload_info { + string upload_id; + uint64_t obj_size; + rgw_sync_aws_src_obj_properties src_properties; + uint32_t part_size{0}; + uint32_t num_parts{0}; + + int cur_part{0}; + uint64_t cur_ofs{0}; + + std::map parts; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(upload_id, bl); + encode(obj_size, bl); + encode(src_properties, bl); + encode(part_size, bl); + encode(num_parts, bl); + encode(cur_part, bl); + encode(cur_ofs, bl); + encode(parts, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(upload_id, bl); + decode(obj_size, bl); + decode(src_properties, bl); + decode(part_size, bl); + decode(num_parts, bl); + decode(cur_part, bl); + decode(cur_ofs, bl); + decode(parts, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_upload_info) + +class RGWAWSSyncModule : public RGWSyncModule { + public: + RGWAWSSyncModule() {} + bool supports_data_export() override { return false;} + int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +#endif /* RGW_SYNC_MODULE_AWS_H */ diff --git a/src/rgw/rgw_sync_module_es.cc b/src/rgw/rgw_sync_module_es.cc new file mode 100644 index 00000000..36b652a1 --- /dev/null +++ b/src/rgw/rgw_sync_module_es.cc @@ -0,0 +1,918 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_b64.h" +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_es.h" +#include 
"rgw_sync_module_es_rest.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rest.h" +#include "rgw_op.h" +#include "rgw_es_query.h" +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include "include/str_list.h" + +#include + +#define dout_subsys ceph_subsys_rgw + + +/* + * whitelist utility. Config string is a list of entries, where an entry is either an item, + * a prefix, or a suffix. An item would be the name of the entity that we'd look up, + * a prefix would be a string ending with an asterisk, a suffix would be a string starting + * with an asterisk. For example: + * + * bucket1, bucket2, foo*, *bar + */ +class ItemList { + bool approve_all{false}; + + set entries; + set prefixes; + set suffixes; + + void parse(const string& str) { + list l; + + get_str_list(str, ",", l); + + for (auto& entry : l) { + entry = rgw_trim_whitespace(entry); + if (entry.empty()) { + continue; + } + + if (entry == "*") { + approve_all = true; + return; + } + + if (entry[0] == '*') { + suffixes.insert(entry.substr(1)); + continue; + } + + if (entry.back() == '*') { + prefixes.insert(entry.substr(0, entry.size() - 1)); + continue; + } + + entries.insert(entry); + } + } + +public: + ItemList() {} + void init(const string& str, bool def_val) { + if (str.empty()) { + approve_all = def_val; + } else { + parse(str); + } + } + + bool exists(const string& entry) { + if (approve_all) { + return true; + } + + if (entries.find(entry) != entries.end()) { + return true; + } + + auto i = prefixes.upper_bound(entry); + if (i != prefixes.begin()) { + --i; + if (boost::algorithm::starts_with(entry, *i)) { + return true; + } + } + + for (i = suffixes.begin(); i != suffixes.end(); ++i) { + if (boost::algorithm::ends_with(entry, *i)) { + return true; + } + } + + return false; + } +}; + +#define ES_NUM_SHARDS_MIN 5 + +#define ES_NUM_SHARDS_DEFAULT 16 +#define ES_NUM_REPLICAS_DEFAULT 1 + +using ESVersion = std::pair; +static constexpr ESVersion ES_V5{5,0}; +static constexpr ESVersion ES_V7{7,0}; 
+ +struct ESInfo { + std::string name; + std::string cluster_name; + std::string cluster_uuid; + ESVersion version; + + void decode_json(JSONObj *obj); + + std::string get_version_str(){ + return std::to_string(version.first) + "." + std::to_string(version.second); + } +}; + +// simple wrapper structure to wrap the es version nested type +struct es_version_decoder { + ESVersion version; + + int parse_version(const std::string& s) { + int major, minor; + int ret = sscanf(s.c_str(), "%d.%d", &major, &minor); + if (ret < 0) { + return ret; + } + version = std::make_pair(major,minor); + return 0; + } + + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("number",s,obj); + if (parse_version(s) < 0) + throw JSONDecoder::err("Failed to parse ElasticVersion"); + } +}; + + +void ESInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("cluster_name", cluster_name, obj); + JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj); + es_version_decoder esv; + JSONDecoder::decode_json("version", esv, obj); + version = std::move(esv.version); +} + +struct ElasticConfig { + uint64_t sync_instance{0}; + string id; + string index_path; + std::unique_ptr conn; + bool explicit_custom_meta{true}; + string override_index_path; + ItemList index_buckets; + ItemList allow_owners; + uint32_t num_shards{0}; + uint32_t num_replicas{0}; + std::map default_headers = {{ "Content-Type", "application/json" }}; + ESInfo es_info; + + void init(CephContext *cct, const JSONFormattable& config) { + string elastic_endpoint = config["endpoint"]; + id = string("elastic:") + elastic_endpoint; + conn.reset(new RGWRESTConn(cct, nullptr, id, { elastic_endpoint })); + explicit_custom_meta = config["explicit_custom_meta"](true); + index_buckets.init(config["index_buckets_list"], true); /* approve all buckets by default */ + allow_owners.init(config["approved_owners_list"], true); /* approve all bucket owners by default */ + 
override_index_path = config["override_index_path"]; + num_shards = config["num_shards"](ES_NUM_SHARDS_DEFAULT); + if (num_shards < ES_NUM_SHARDS_MIN) { + num_shards = ES_NUM_SHARDS_MIN; + } + num_replicas = config["num_replicas"](ES_NUM_REPLICAS_DEFAULT); + if (string user = config["username"], pw = config["password"]; + !user.empty() && !pw.empty()) { + auto auth_string = user + ":" + pw; + default_headers.emplace("AUTHORIZATION", "Basic " + rgw::to_base64(auth_string)); + } + + } + + void init_instance(const RGWRealm& realm, uint64_t instance_id) { + sync_instance = instance_id; + + if (!override_index_path.empty()) { + index_path = override_index_path; + return; + } + + char buf[32]; + snprintf(buf, sizeof(buf), "-%08x", (uint32_t)(sync_instance & 0xFFFFFFFF)); + + index_path = "/rgw-" + realm.get_name() + buf; + } + + string get_index_path() { + return index_path; + } + + map& get_request_headers() { + return default_headers; + } + + string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) { + if (es_info.version >= ES_V7) { + return index_path+ "/_doc/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance)); +; + } else { + return index_path + "/object/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? 
"null" : key.instance)); + } + } + + bool should_handle_operation(RGWBucketInfo& bucket_info) { + return index_buckets.exists(bucket_info.bucket.name) && + allow_owners.exists(bucket_info.owner.to_str()); + } +}; + +using ElasticConfigRef = std::shared_ptr; + +static const char *es_type_to_str(const ESType& t) { + switch (t) { + case ESType::String: return "string"; + case ESType::Text: return "text"; + case ESType::Keyword: return "keyword"; + case ESType::Long: return "long"; + case ESType::Integer: return "integer"; + case ESType::Short: return "short"; + case ESType::Byte: return "byte"; + case ESType::Double: return "double"; + case ESType::Float: return "float"; + case ESType::Half_Float: return "half_float"; + case ESType::Scaled_Float: return "scaled_float"; + case ESType::Date: return "date"; + case ESType::Boolean: return "boolean"; + case ESType::Integer_Range: return "integer_range"; + case ESType::Float_Range: return "float_range"; + case ESType::Double_Range: return "date_range"; + case ESType::Date_Range: return "date_range"; + case ESType::Geo_Point: return "geo_point"; + case ESType::Ip: return "ip"; + default: + return ""; + } +} + +struct es_type_v2 { + ESType estype; + const char *format{nullptr}; + std::optional analyzed; + + es_type_v2(ESType et) : estype(et) {} + + void dump(Formatter *f) const { + const char *type_str = es_type_to_str(estype); + encode_json("type", type_str, f); + if (format) { + encode_json("format", format, f); + } + + auto is_analyzed = analyzed; + + if (estype == ESType::String && + !is_analyzed) { + is_analyzed = false; + } + + if (is_analyzed) { + encode_json("index", (is_analyzed.value() ? 
"analyzed" : "not_analyzed"), f); + } + } +}; + +struct es_type_v5 { + ESType estype; + const char *format{nullptr}; + std::optional analyzed; + std::optional index; + + es_type_v5(ESType et) : estype(et) {} + + void dump(Formatter *f) const { + ESType new_estype; + if (estype != ESType::String) { + new_estype = estype; + } else { + bool is_analyzed = analyzed.value_or(false); + new_estype = (is_analyzed ? ESType::Text : ESType::Keyword); + /* index = true; ... Not setting index=true, because that's the default, + * and dumping a boolean value *might* be a problem when backporting this + * because value might get quoted + */ + } + + const char *type_str = es_type_to_str(new_estype); + encode_json("type", type_str, f); + if (format) { + encode_json("format", format, f); + } + if (index) { + encode_json("index", index.value(), f); + } + } +}; + +template +struct es_type : public T { + es_type(T t) : T(t) {} + es_type& set_format(const char *f) { + T::format = f; + return *this; + } + + es_type& set_analyzed(bool a) { + T::analyzed = a; + return *this; + } +}; + +template +struct es_index_mappings { + ESVersion es_version; + ESType string_type {ESType::String}; + + es_index_mappings(ESVersion esv):es_version(esv) { + } + + es_type est(ESType t) const { + return es_type(t); + } + + void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const { + f->open_object_section(section); + ::encode_json("type", "nested", f); + f->open_object_section("properties"); + encode_json("name", est(string_type), f); + encode_json("value", est(type).set_format(format), f); + f->close_section(); // entry + f->close_section(); // custom-string + } + + void dump(Formatter *f) const { + if (es_version <= ES_V7) + f->open_object_section("object"); + f->open_object_section("properties"); + encode_json("bucket", est(string_type), f); + encode_json("name", est(string_type), f); + encode_json("instance", est(string_type), f); + encode_json("versioned_epoch", 
est(ESType::Long), f); + f->open_object_section("meta"); + f->open_object_section("properties"); + encode_json("cache_control", est(string_type), f); + encode_json("content_disposition", est(string_type), f); + encode_json("content_encoding", est(string_type), f); + encode_json("content_language", est(string_type), f); + encode_json("content_type", est(string_type), f); + encode_json("storage_class", est(string_type), f); + encode_json("etag", est(string_type), f); + encode_json("expires", est(string_type), f); + encode_json("mtime", est(ESType::Date) + .set_format("strict_date_optional_time||epoch_millis"), f); + encode_json("size", est(ESType::Long), f); + dump_custom("custom-string", string_type, nullptr, f); + dump_custom("custom-int", ESType::Long, nullptr, f); + dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f); + f->close_section(); // properties + f->close_section(); // meta + f->close_section(); // properties + + if (es_version <= ES_V7) + f->close_section(); // object + } +}; + +struct es_index_settings { + uint32_t num_replicas; + uint32_t num_shards; + + es_index_settings(uint32_t _replicas, uint32_t _shards) : num_replicas(_replicas), num_shards(_shards) {} + + void dump(Formatter *f) const { + encode_json("number_of_replicas", num_replicas, f); + encode_json("number_of_shards", num_shards, f); + } +}; + +struct es_index_config_base { + virtual ~es_index_config_base() {} + virtual void dump(Formatter *f) const = 0; +}; + +template +struct es_index_config : public es_index_config_base { + es_index_settings settings; + es_index_mappings mappings; + + es_index_config(es_index_settings& _s, ESVersion esv) : settings(_s), mappings(esv) { + } + + void dump(Formatter *f) const { + encode_json("settings", settings, f); + encode_json("mappings", mappings, f); + } +}; + +static bool is_sys_attr(const std::string& attr_name){ + static constexpr std::initializer_list rgw_sys_attrs = + {RGW_ATTR_PG_VER, + RGW_ATTR_SOURCE_ZONE, + 
RGW_ATTR_ID_TAG, + RGW_ATTR_TEMPURL_KEY1, + RGW_ATTR_TEMPURL_KEY2, + RGW_ATTR_UNIX1, + RGW_ATTR_UNIX_KEY1 + }; + + return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end(); +} + +static size_t attr_len(const bufferlist& val) +{ + size_t len = val.length(); + if (len && val[len - 1] == '\0') { + --len; + } + + return len; +} + +struct es_obj_metadata { + CephContext *cct; + ElasticConfigRef es_conf; + RGWBucketInfo bucket_info; + rgw_obj_key key; + ceph::real_time mtime; + uint64_t size; + map attrs; + uint64_t versioned_epoch; + + es_obj_metadata(CephContext *_cct, ElasticConfigRef _es_conf, const RGWBucketInfo& _bucket_info, + const rgw_obj_key& _key, ceph::real_time& _mtime, uint64_t _size, + map& _attrs, uint64_t _versioned_epoch) : cct(_cct), es_conf(_es_conf), bucket_info(_bucket_info), key(_key), + mtime(_mtime), size(_size), attrs(std::move(_attrs)), versioned_epoch(_versioned_epoch) {} + + void dump(Formatter *f) const { + map out_attrs; + map custom_meta; + RGWAccessControlPolicy policy; + set permissions; + RGWObjTags obj_tags; + + for (auto i : attrs) { + const string& attr_name = i.first; + bufferlist& val = i.second; + + if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) { + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) { + custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1), + string(val.c_str(), attr_len(val))); + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) { + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) { + // skip versioned object olh info + continue; + } + + if (attr_name == RGW_ATTR_ACL) { + try { + auto i = val.cbegin(); + decode(policy, i); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl; + continue; + } + + const RGWAccessControlList& acl = policy.get_acl(); + + 
permissions.insert(policy.get_owner().get_id().to_str()); + for (auto acliter : acl.get_grant_map()) { + const ACLGrant& grant = acliter.second; + if (grant.get_type().get_type() == ACL_TYPE_CANON_USER && + ((uint32_t)grant.get_permission().get_permissions() & RGW_PERM_READ) != 0) { + rgw_user user; + if (grant.get_id(user)) { + permissions.insert(user.to_str()); + } + } + } + } else if (attr_name == RGW_ATTR_TAGS) { + try { + auto tags_bl = val.cbegin(); + decode(obj_tags, tags_bl); + } catch (buffer::error& err) { + ldout(cct,0) << "ERROR: failed to decode obj tags for " + << bucket_info.bucket << "/" << key << dendl; + continue; + } + } else if (attr_name == RGW_ATTR_COMPRESSION) { + RGWCompressionInfo cs_info; + try { + auto vals_bl = val.cbegin(); + decode(cs_info, vals_bl); + } catch (buffer::error& err) { + ldout(cct,0) << "ERROR: failed to decode compression attr for " + << bucket_info.bucket << "/" << key << dendl; + continue; + } + out_attrs.emplace("compression",std::move(cs_info.compression_type)); + } else { + if (!is_sys_attr(attr_name)) { + out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1), + std::string(val.c_str(), attr_len(val))); + } + } + } + ::encode_json("bucket", bucket_info.bucket.name, f); + ::encode_json("name", key.name, f); + string instance = key.instance; + if (instance.empty()) + instance = "null"; + ::encode_json("instance", instance, f); + ::encode_json("versioned_epoch", versioned_epoch, f); + ::encode_json("owner", policy.get_owner(), f); + ::encode_json("permissions", permissions, f); + f->open_object_section("meta"); + ::encode_json("size", size, f); + + string mtime_str; + rgw_to_iso8601(mtime, &mtime_str); + ::encode_json("mtime", mtime_str, f); + for (auto i : out_attrs) { + ::encode_json(i.first.c_str(), i.second, f); + } + map custom_str; + map custom_int; + map custom_date; + + for (auto i : custom_meta) { + auto config = bucket_info.mdsearch_config.find(i.first); + if (config == 
bucket_info.mdsearch_config.end()) { + if (!es_conf->explicit_custom_meta) { + /* default custom meta is of type string */ + custom_str[i.first] = i.second; + } else { + ldout(cct, 20) << "custom meta entry key=" << i.first << " not found in bucket mdsearch config: " << bucket_info.mdsearch_config << dendl; + } + continue; + } + switch (config->second) { + case ESEntityTypeMap::ES_ENTITY_DATE: + custom_date[i.first] = i.second; + break; + case ESEntityTypeMap::ES_ENTITY_INT: + custom_int[i.first] = i.second; + break; + default: + custom_str[i.first] = i.second; + } + } + + if (!custom_str.empty()) { + f->open_array_section("custom-string"); + for (auto i : custom_str) { + f->open_object_section("entity"); + ::encode_json("name", i.first.c_str(), f); + ::encode_json("value", i.second, f); + f->close_section(); + } + f->close_section(); + } + if (!custom_int.empty()) { + f->open_array_section("custom-int"); + for (auto i : custom_int) { + f->open_object_section("entity"); + ::encode_json("name", i.first.c_str(), f); + ::encode_json("value", i.second, f); + f->close_section(); + } + f->close_section(); + } + if (!custom_date.empty()) { + f->open_array_section("custom-date"); + for (auto i : custom_date) { + /* + * try to explicitly parse date field, otherwise elasticsearch could reject the whole doc, + * which will end up with failed sync + */ + real_time t; + int r = parse_time(i.second.c_str(), &t); + if (r < 0) { + ldout(cct, 20) << __func__ << "(): failed to parse time (" << i.second << "), skipping encoding of custom date attribute" << dendl; + continue; + } + + string time_str; + rgw_to_iso8601(t, &time_str); + + f->open_object_section("entity"); + ::encode_json("name", i.first.c_str(), f); + ::encode_json("value", time_str.c_str(), f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); // meta + const auto& m = obj_tags.get_tags(); + if (m.size() > 0){ + f->open_array_section("tagging"); + for (const auto &it : m) { + 
f->open_object_section("tag"); + ::encode_json("key", it.first, f); + ::encode_json("value",it.second, f); + f->close_section(); + } + f->close_section(); // tagging + } + } +}; + +class RGWElasticInitConfigCBCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + ElasticConfigRef conf; + ESInfo es_info; + + struct _err_response { + struct err_reason { + vector root_cause; + string type; + string reason; + string index; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("root_cause", root_cause, obj); + JSONDecoder::decode_json("type", type, obj); + JSONDecoder::decode_json("reason", reason, obj); + JSONDecoder::decode_json("index", index, obj); + } + } error; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("error", error, obj); + } + } err_response; + +public: + RGWElasticInitConfigCBCR(RGWDataSyncEnv *_sync_env, + ElasticConfigRef _conf) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + conf(_conf) {} + int operate() override { + reenter(this) { + ldout(sync_env->cct, 0) << ": init elasticsearch config zone=" << sync_env->source_zone << dendl; + yield call(new RGWReadRESTResourceCR (sync_env->cct, + conf->conn.get(), + sync_env->http_manager, + "/", nullptr /*params*/, + &(conf->default_headers), + &es_info)); + if (retcode < 0) { + return set_cr_error(retcode); + } + + yield { + string path = conf->get_index_path(); + ldout(sync_env->cct, 5) << "got elastic version=" << es_info.get_version_str() << dendl; + + es_index_settings settings(conf->num_replicas, conf->num_shards); + + std::unique_ptr index_conf; + + if (es_info.version >= ES_V5) { + ldout(sync_env->cct, 0) << "elasticsearch: index mapping: version >= 5" << dendl; + index_conf.reset(new es_index_config(settings, es_info.version)); + } else { + ldout(sync_env->cct, 0) << "elasticsearch: index mapping: version < 5" << dendl; + index_conf.reset(new es_index_config(settings, es_info.version)); + } + call(new RGWPutRESTResourceCR (sync_env->cct, + conf->conn.get(), + 
sync_env->http_manager, + path, nullptr /*params*/, + &(conf->default_headers), + *index_conf, nullptr, &err_response)); + } + if (retcode < 0) { + ldout(sync_env->cct, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl; + + if (err_response.error.type != "index_already_exists_exception" && + err_response.error.type != "resource_already_exists_exception") { + return set_cr_error(retcode); + } + + ldout(sync_env->cct, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl; + } + return set_cr_done(); + } + return 0; + } + +}; + +class RGWElasticHandleRemoteObjCBCR : public RGWStatRemoteObjCBCR { + ElasticConfigRef conf; + uint64_t versioned_epoch; +public: + RGWElasticHandleRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, + ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sync_env, _bucket_info, _key), conf(_conf), + versioned_epoch(_versioned_epoch) {} + int operate() override { + reenter(this) { + ldout(sync_env->cct, 10) << ": stat of remote obj: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key + << " size=" << size << " mtime=" << mtime << dendl; + + yield { + string path = conf->get_obj_path(bucket_info, key); + es_obj_metadata doc(sync_env->cct, conf, bucket_info, key, mtime, size, attrs, versioned_epoch); + + call(new RGWPutRESTResourceCR(sync_env->cct, conf->conn.get(), + sync_env->http_manager, + path, nullptr /* params */, + &(conf->default_headers), + doc, nullptr /* result */)); + + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +class RGWElasticHandleRemoteObjCR : public RGWCallStatRemoteObjCR { + ElasticConfigRef conf; + uint64_t versioned_epoch; +public: + RGWElasticHandleRemoteObjCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, + 
ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sync_env, _bucket_info, _key), + conf(_conf), versioned_epoch(_versioned_epoch) { + } + + ~RGWElasticHandleRemoteObjCR() override {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWElasticHandleRemoteObjCBCR(sync_env, bucket_info, key, conf, versioned_epoch); + } +}; + +class RGWElasticRemoveRemoteObjCBCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + RGWBucketInfo bucket_info; + rgw_obj_key key; + ceph::real_time mtime; + ElasticConfigRef conf; +public: + RGWElasticRemoveRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, const ceph::real_time& _mtime, + ElasticConfigRef _conf) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + bucket_info(_bucket_info), key(_key), + mtime(_mtime), conf(_conf) {} + int operate() override { + reenter(this) { + ldout(sync_env->cct, 10) << ": remove remote obj: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime << dendl; + yield { + string path = conf->get_obj_path(bucket_info, key); + + call(new RGWDeleteRESTResourceCR(sync_env->cct, conf->conn.get(), + sync_env->http_manager, + path, nullptr /* params */)); + } + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } + +}; + +class RGWElasticDataSyncModule : public RGWDataSyncModule { + ElasticConfigRef conf; +public: + RGWElasticDataSyncModule(CephContext *cct, const JSONFormattable& config) : conf(std::make_shared()) { + conf->init(cct, config); + } + ~RGWElasticDataSyncModule() override {} + + void init(RGWDataSyncEnv *sync_env, uint64_t instance_id) override { + conf->init_instance(sync_env->store->svc.zone->get_realm(), instance_id); + // try to get elastic search version + RGWCoroutinesManager crs(sync_env->store->ctx(), sync_env->store->get_cr_registry()); + RGWHTTPManager http_manager(sync_env->store->ctx(), 
crs.get_completion_mgr()); + int ret = http_manager.start(); + if (ret < 0) { + return; + } + ret = crs.run(new RGWReadRESTResourceCR(sync_env->cct, + conf->conn.get(), + &http_manager, + "/", nullptr, + &(conf->default_headers), + &(conf->es_info))); + http_manager.stop(); + if (ret < 0) { + ldout(sync_env->cct, 1) << conf->id << ": fetch elastic info failed: " << ret << dendl; + } else { + ldout(sync_env->cct, 5) << conf->id << ": got elastic version=" << conf->es_info.get_version_str() << dendl; + } + } + + RGWCoroutine *init_sync(RGWDataSyncEnv *sync_env) override { + ldout(sync_env->cct, 5) << conf->id << ": init" << dendl; + return new RGWElasticInitConfigCBCR(sync_env, conf); + } + RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 10) << conf->id << ": sync_object: b=" << bucket_info.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + if (!conf->should_handle_operation(bucket_info)) { + ldout(sync_env->cct, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl; + return nullptr; + } + return new RGWElasticHandleRemoteObjCR(sync_env, bucket_info, key, conf, versioned_epoch.value_or(0)); + } + RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + /* versioned and versioned epoch params are useless in the elasticsearch backend case */ + ldout(sync_env->cct, 10) << conf->id << ": rm_object: b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + if (!conf->should_handle_operation(bucket_info)) { + ldout(sync_env->cct, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl; + return nullptr; + } + return new 
RGWElasticRemoveRemoteObjCBCR(sync_env, bucket_info, key, mtime, conf); + } + RGWCoroutine *create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 10) << conf->id << ": create_delete_marker: b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + ldout(sync_env->cct, 10) << conf->id << ": skipping operation (not handled)" << dendl; + return NULL; + } + RGWRESTConn *get_rest_conn() { + return conf->conn.get(); + } + + string get_index_path() { + return conf->get_index_path(); + } + + map& get_request_headers() { + return conf->get_request_headers(); + } +}; + +RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(CephContext *cct, const JSONFormattable& config) +{ + data_handler = std::unique_ptr(new RGWElasticDataSyncModule(cct, config)); +} + +RGWDataSyncModule *RGWElasticSyncModuleInstance::get_data_handler() +{ + return data_handler.get(); +} + +RGWRESTConn *RGWElasticSyncModuleInstance::get_rest_conn() +{ + return data_handler->get_rest_conn(); +} + +string RGWElasticSyncModuleInstance::get_index_path() { + return data_handler->get_index_path(); +} + +map& RGWElasticSyncModuleInstance::get_request_headers() { + return data_handler->get_request_headers(); +} + +RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) { + if (dialect != RGW_REST_S3) { + return orig; + } + delete orig; + return new RGWRESTMgr_MDSearch_S3(); +} + +int RGWElasticSyncModule::create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + string endpoint = config["endpoint"]; + instance->reset(new RGWElasticSyncModuleInstance(cct, config)); + return 0; +} + diff --git a/src/rgw/rgw_sync_module_es.h 
b/src/rgw/rgw_sync_module_es.h new file mode 100644 index 00000000..cb5c9106 --- /dev/null +++ b/src/rgw/rgw_sync_module_es.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_MODULE_ES_H +#define CEPH_RGW_SYNC_MODULE_ES_H + +#include "rgw_sync_module.h" + +enum class ESType { + /* string datatypes */ + String, /* Deprecated Since 5.X+ */ + Text, + Keyword, + + /* Numeric Types */ + Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float, + + /* Date Type */ + Date, + + /* Boolean */ + Boolean, + + /* Binary; Must Be Base64 Encoded */ + Binary, + + /* Range Types */ + Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range, + + /* A Few Specialized Types */ + Geo_Point, + Ip +}; + + +class RGWElasticSyncModule : public RGWSyncModule { +public: + RGWElasticSyncModule() {} + bool supports_data_export() override { + return false; + } + int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +class RGWElasticDataSyncModule; +class RGWRESTConn; + +class RGWElasticSyncModuleInstance : public RGWSyncModuleInstance { + std::unique_ptr data_handler; +public: + RGWElasticSyncModuleInstance(CephContext *cct, const JSONFormattable& config); + RGWDataSyncModule *get_data_handler() override; + RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override; + RGWRESTConn *get_rest_conn(); + std::string get_index_path(); + map& get_request_headers(); + bool supports_user_writes() override { + return true; + } +}; + +#endif diff --git a/src/rgw/rgw_sync_module_es_rest.cc b/src/rgw/rgw_sync_module_es_rest.cc new file mode 100644 index 00000000..751d8220 --- /dev/null +++ b/src/rgw/rgw_sync_module_es_rest.cc @@ -0,0 +1,423 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_sync_module_es.h" +#include "rgw_sync_module_es_rest.h" 
+#include "rgw_es_query.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +struct es_index_obj_response { + string bucket; + rgw_obj_key key; + uint64_t versioned_epoch{0}; + ACLOwner owner; + set read_permissions; + + struct { + uint64_t size{0}; + ceph::real_time mtime; + string etag; + string content_type; + string storage_class; + map custom_str; + map custom_int; + map custom_date; + + template + struct _custom_entry { + string name; + T value; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("value", value, obj); + } + }; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("size", size, obj); + string mtime_str; + JSONDecoder::decode_json("mtime", mtime_str, obj); + parse_time(mtime_str.c_str(), &mtime); + JSONDecoder::decode_json("etag", etag, obj); + JSONDecoder::decode_json("content_type", content_type, obj); + JSONDecoder::decode_json("storage_class", storage_class, obj); + list<_custom_entry > str_entries; + JSONDecoder::decode_json("custom-string", str_entries, obj); + for (auto& e : str_entries) { + custom_str[e.name] = e.value; + } + list<_custom_entry > int_entries; + JSONDecoder::decode_json("custom-int", int_entries, obj); + for (auto& e : int_entries) { + custom_int[e.name] = e.value; + } + list<_custom_entry > date_entries; + JSONDecoder::decode_json("custom-date", date_entries, obj); + for (auto& e : date_entries) { + custom_date[e.name] = e.value; + } + } + } meta; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("bucket", bucket, obj); + JSONDecoder::decode_json("name", key.name, obj); + JSONDecoder::decode_json("instance", key.instance, obj); + JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj); + JSONDecoder::decode_json("permissions", read_permissions, obj); + JSONDecoder::decode_json("owner", owner, obj); + JSONDecoder::decode_json("meta", 
meta, obj); + } +}; + +struct es_search_response { + uint32_t took; + bool timed_out; + struct { + uint32_t total; + uint32_t successful; + uint32_t failed; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("total", total, obj); + JSONDecoder::decode_json("successful", successful, obj); + JSONDecoder::decode_json("failed", failed, obj); + } + } shards; + struct obj_hit { + string index; + string type; + string id; + // double score + es_index_obj_response source; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("_index", index, obj); + JSONDecoder::decode_json("_type", type, obj); + JSONDecoder::decode_json("_id", id, obj); + JSONDecoder::decode_json("_source", source, obj); + } + }; + struct { + uint32_t total; + // double max_score; + list hits; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("total", total, obj); + // JSONDecoder::decode_json("max_score", max_score, obj); + JSONDecoder::decode_json("hits", hits, obj); + } + } hits; + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("took", took, obj); + JSONDecoder::decode_json("timed_out", timed_out, obj); + JSONDecoder::decode_json("_shards", shards, obj); + JSONDecoder::decode_json("hits", hits, obj); + } +}; + +class RGWMetadataSearchOp : public RGWOp { + RGWSyncModuleInstanceRef sync_module_ref; + RGWElasticSyncModuleInstance *es_module; +protected: + string expression; + string custom_prefix; +#define MAX_KEYS_DEFAULT 100 + uint64_t max_keys{MAX_KEYS_DEFAULT}; + string marker_str; + uint64_t marker{0}; + string next_marker; + bool is_truncated{false}; + string err; + + es_search_response response; + +public: + RGWMetadataSearchOp(const RGWSyncModuleInstanceRef& sync_module) : sync_module_ref(sync_module) { + es_module = static_cast(sync_module_ref.get()); + } + + int verify_permission() override { + return 0; + } + virtual int get_params() = 0; + void pre_exec() override; + void execute() override; + + const char* name() const override { return 
"metadata_search"; } + virtual RGWOpType get_type() override { return RGW_OP_METADATA_SEARCH; } + virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; + +void RGWMetadataSearchOp::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWMetadataSearchOp::execute() +{ + op_ret = get_params(); + if (op_ret < 0) + return; + + list > conds; + + if (!s->user->system) { + conds.push_back(make_pair("permissions", s->user->user_id.to_str())); + } + + if (!s->bucket_name.empty()) { + conds.push_back(make_pair("bucket", s->bucket_name)); + } + + ESQueryCompiler es_query(expression, &conds, custom_prefix); + + static map aliases = { + { "bucket", "bucket" }, /* forces lowercase */ + { "name", "name" }, + { "key", "name" }, + { "instance", "instance" }, + { "etag", "meta.etag" }, + { "size", "meta.size" }, + { "mtime", "meta.mtime" }, + { "lastmodified", "meta.mtime" }, + { "last_modified", "meta.mtime" }, + { "contenttype", "meta.content_type" }, + { "content_type", "meta.content_type" }, + { "storageclass", "meta.storage_class" }, + { "storage_class", "meta.storage_class" }, + }; + es_query.set_field_aliases(&aliases); + + static map generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR}, + {"name", ESEntityTypeMap::ES_ENTITY_STR}, + {"instance", ESEntityTypeMap::ES_ENTITY_STR}, + {"permissions", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE}, + {"meta.size", ESEntityTypeMap::ES_ENTITY_INT}, + {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} }; + ESEntityTypeMap gm(generic_map); + es_query.set_generic_type_map(&gm); + + static set restricted_fields = { {"permissions"} }; + es_query.set_restricted_fields(&restricted_fields); + + map custom_map; + for (auto& i : s->bucket_info.mdsearch_config) { + custom_map[i.first] = (ESEntityTypeMap::EntityType)i.second; + } + + ESEntityTypeMap em(custom_map); + 
es_query.set_custom_type_map(&em); + + bool valid = es_query.compile(&err); + if (!valid) { + ldout(s->cct, 10) << "invalid query, failed generating request json" << dendl; + op_ret = -EINVAL; + return; + } + + JSONFormatter f; + encode_json("root", es_query, &f); + + RGWRESTConn *conn = es_module->get_rest_conn(); + + bufferlist in; + bufferlist out; + + stringstream ss; + + f.flush(ss); + in.append(ss.str()); + + string resource = es_module->get_index_path() + "/_search"; + param_vec_t params; + static constexpr int BUFSIZE = 32; + char buf[BUFSIZE]; + snprintf(buf, sizeof(buf), "%lld", (long long)max_keys); + params.push_back(param_pair_t("size", buf)); + if (marker > 0) { + params.push_back(param_pair_t("from", marker_str.c_str())); + } + ldout(s->cct, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl; + auto& extra_headers = es_module->get_request_headers(); + op_ret = conn->get_resource(resource, &params, &extra_headers, out, &in); + if (op_ret < 0) { + ldout(s->cct, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl; + return; + } + + ldout(s->cct, 20) << "response: " << string(out.c_str(), out.length()) << dendl; + + JSONParser jparser; + if (!jparser.parse(out.c_str(), out.length())) { + ldout(s->cct, 0) << "ERROR: failed to parse elasticsearch response" << dendl; + op_ret = -EINVAL; + return; + } + + try { + decode_json_obj(response, &jparser); + } catch (JSONDecoder::err& e) { + ldout(s->cct, 0) << "ERROR: failed to decode JSON input: " << e.message << dendl; + op_ret = -EINVAL; + return; + } + +} + +class RGWMetadataSearch_ObjStore_S3 : public RGWMetadataSearchOp { +public: + explicit RGWMetadataSearch_ObjStore_S3(const RGWSyncModuleInstanceRef& _sync_module) : RGWMetadataSearchOp(_sync_module) { + custom_prefix = "x-amz-meta-"; + } + + int get_params() override { + expression = s->info.args.get("query"); + bool exists; + string max_keys_str = 
s->info.args.get("max-keys", &exists); +#define MAX_KEYS_MAX 10000 + if (exists) { + string err; + max_keys = strict_strtoll(max_keys_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + if (max_keys > MAX_KEYS_MAX) { + max_keys = MAX_KEYS_MAX; + } + } + marker_str = s->info.args.get("marker", &exists); + if (exists) { + string err; + marker = strict_strtoll(marker_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + } + uint64_t nm = marker + max_keys; + static constexpr int BUFSIZE = 32; + char buf[BUFSIZE]; + snprintf(buf, sizeof(buf), "%lld", (long long)nm); + next_marker = buf; + return 0; + } + void send_response() override { + if (op_ret) { + s->err.message = err; + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret < 0) { + return; + } + + is_truncated = (response.hits.hits.size() >= max_keys); + + s->formatter->open_object_section("SearchMetadataResponse"); + s->formatter->dump_string("Marker", marker_str); + s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false")); + if (is_truncated) { + s->formatter->dump_string("NextMarker", next_marker); + } + if (s->format == RGW_FORMAT_JSON) { + s->formatter->open_array_section("Objects"); + } + for (auto& i : response.hits.hits) { + s->formatter->open_object_section("Contents"); + es_index_obj_response& e = i.source; + s->formatter->dump_string("Bucket", e.bucket); + s->formatter->dump_string("Key", e.key.name); + string instance = (!e.key.instance.empty() ? 
e.key.instance : "null"); + s->formatter->dump_string("Instance", instance.c_str()); + s->formatter->dump_int("VersionedEpoch", e.versioned_epoch); + dump_time(s, "LastModified", &e.meta.mtime); + s->formatter->dump_int("Size", e.meta.size); + s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str()); + s->formatter->dump_string("ContentType", e.meta.content_type.c_str()); + s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str()); + dump_owner(s, e.owner.get_id(), e.owner.get_display_name()); + s->formatter->open_array_section("CustomMetadata"); + for (auto& m : e.meta.custom_str) { + s->formatter->open_object_section("Entry"); + s->formatter->dump_string("Name", m.first.c_str()); + s->formatter->dump_string("Value", m.second); + s->formatter->close_section(); + } + for (auto& m : e.meta.custom_int) { + s->formatter->open_object_section("Entry"); + s->formatter->dump_string("Name", m.first.c_str()); + s->formatter->dump_int("Value", m.second); + s->formatter->close_section(); + } + for (auto& m : e.meta.custom_date) { + s->formatter->open_object_section("Entry"); + s->formatter->dump_string("Name", m.first.c_str()); + s->formatter->dump_string("Value", m.second); + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + s->formatter->close_section(); + }; + if (s->format == RGW_FORMAT_JSON) { + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +class RGWHandler_REST_MDSearch_S3 : public RGWHandler_REST_S3 { +protected: + RGWOp *op_get() override { + if (s->info.args.exists("query")) { + return new RGWMetadataSearch_ObjStore_S3(store->get_sync_module()); + } + if (!s->init_state.url_bucket.empty() && + s->info.args.exists("mdsearch")) { + return new RGWGetBucketMetaSearch_ObjStore_S3; + } + return nullptr; + } + RGWOp *op_head() override { + return nullptr; + } + RGWOp *op_post() override { + return nullptr; + } 
+public: + explicit RGWHandler_REST_MDSearch_S3(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {} + virtual ~RGWHandler_REST_MDSearch_S3() {} +}; + + +RGWHandler_REST* RGWRESTMgr_MDSearch_S3::get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + int ret = + RGWHandler_REST_S3::init_from_header(s, + RGW_FORMAT_XML, true); + if (ret < 0) { + return nullptr; + } + + if (!s->object.empty()) { + return nullptr; + } + + RGWHandler_REST *handler = new RGWHandler_REST_MDSearch_S3(auth_registry); + + ldout(s->cct, 20) << __func__ << " handler=" << typeid(*handler).name() + << dendl; + return handler; +} + diff --git a/src/rgw/rgw_sync_module_es_rest.h b/src/rgw/rgw_sync_module_es_rest.h new file mode 100644 index 00000000..b31b8e2c --- /dev/null +++ b/src/rgw/rgw_sync_module_es_rest.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_MODULE_ES_REST_H +#define CEPH_RGW_SYNC_MODULE_ES_REST_H + +#include "rgw_rest.h" + +class RGWElasticSyncModuleInstance; + +class RGWRESTMgr_MDSearch_S3 : public RGWRESTMgr { +public: + explicit RGWRESTMgr_MDSearch_S3() {} + + RGWHandler_REST *get_handler(struct req_state* s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override; +}; + +#endif diff --git a/src/rgw/rgw_sync_module_log.cc b/src/rgw/rgw_sync_module_log.cc new file mode 100644 index 00000000..2b893aad --- /dev/null +++ b/src/rgw/rgw_sync_module_log.cc @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_cr_rados.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_log.h" + +#define dout_subsys ceph_subsys_rgw + +class RGWLogStatRemoteObjCBCR : public 
RGWStatRemoteObjCBCR { +public: + RGWLogStatRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key) : RGWStatRemoteObjCBCR(_sync_env, _bucket_info, _key) {} + int operate() override { + ldout(sync_env->cct, 0) << "SYNC_LOG: stat of remote obj: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key << " size=" << size << " mtime=" << mtime + << " attrs=" << attrs << dendl; + return set_cr_done(); + } + +}; + +class RGWLogStatRemoteObjCR : public RGWCallStatRemoteObjCR { +public: + RGWLogStatRemoteObjCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key) : RGWCallStatRemoteObjCR(_sync_env, _bucket_info, _key) { + } + + ~RGWLogStatRemoteObjCR() override {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWLogStatRemoteObjCBCR(sync_env, bucket_info, key); + } +}; + +class RGWLogDataSyncModule : public RGWDataSyncModule { + string prefix; +public: + explicit RGWLogDataSyncModule(const string& _prefix) : prefix(_prefix) {} + + RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 0) << prefix << ": SYNC_LOG: sync_object: b=" << bucket_info.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + return new RGWLogStatRemoteObjCR(sync_env, bucket_info, key); + } + RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 0) << prefix << ": SYNC_LOG: rm_object: b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return NULL; + } + RGWCoroutine *create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, rgw_obj_key& key, real_time& 
mtime, + rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 0) << prefix << ": SYNC_LOG: create_delete_marker: b=" << bucket_info.bucket << " k=" << key << " mtime=" << mtime + << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return NULL; + } +}; + +class RGWLogSyncModuleInstance : public RGWSyncModuleInstance { + RGWLogDataSyncModule data_handler; +public: + explicit RGWLogSyncModuleInstance(const string& prefix) : data_handler(prefix) {} + RGWDataSyncModule *get_data_handler() override { + return &data_handler; + } +}; + +int RGWLogSyncModule::create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + string prefix = config["prefix"]; + instance->reset(new RGWLogSyncModuleInstance(prefix)); + return 0; +} + diff --git a/src/rgw/rgw_sync_module_log.h b/src/rgw/rgw_sync_module_log.h new file mode 100644 index 00000000..d0059e32 --- /dev/null +++ b/src/rgw/rgw_sync_module_log.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_MODULE_LOG_H +#define CEPH_RGW_SYNC_MODULE_LOG_H + +#include "rgw_sync_module.h" + +class RGWLogSyncModule : public RGWSyncModule { +public: + RGWLogSyncModule() {} + bool supports_data_export() override { + return false; + } + int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +#endif diff --git a/src/rgw/rgw_sync_module_pubsub.cc b/src/rgw/rgw_sync_module_pubsub.cc new file mode 100644 index 00000000..fd514b81 --- /dev/null +++ b/src/rgw/rgw_sync_module_pubsub.cc @@ -0,0 +1,1578 @@ +#include "rgw_common.h" +#include "rgw_coroutine.h" +#include "rgw_sync_module.h" +#include "rgw_data_sync.h" +#include "rgw_sync_module_pubsub.h" +#include "rgw_sync_module_pubsub_rest.h" +#include "rgw_rest_conn.h" +#include "rgw_cr_rados.h" 
+#include "rgw_cr_rest.h" +#include "rgw_cr_tools.h" +#include "rgw_op.h" +#include "rgw_pubsub.h" +#include "rgw_pubsub_push.h" +#include "rgw_notify_event_type.h" +#include "rgw_perf_counters.h" +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +#include "rgw_amqp.h" +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +#include "rgw_kafka.h" +#endif + +#include +#include + +#define dout_subsys ceph_subsys_rgw + + +#define PUBSUB_EVENTS_RETENTION_DEFAULT 7 + +/* + +config: + +{ + "tenant": <tenant>, # default: <empty> + "uid": <uid>, # default: "pubsub" + "data_bucket_prefix": <prefix> # default: "pubsub-" + "data_oid_prefix": <prefix> # + "events_retention_days": <days> # default: 7 + "start_with_full_sync": <bool> # default: false + + # non-dynamic config + "notifications": [ + { + "path": <notification-path>, # this can be either an explicit path: <bucket>, or <bucket>/<object>, + # or a prefix if it ends with a wildcard + "topic": <topic-name> + }, + ... + ], + "subscriptions": [ + { + "name": <subscription-name>, + "topic": <topic>, + "push_endpoint": <endpoint>, + "push_endpoint_args": <arg list>, # any push endpoint specific args (include all args) + "data_bucket": <bucket>, # override name of bucket where subscription data will be stored + "data_oid_prefix": <prefix> # set prefix for subscription data object ids + "s3_id": <id> # in case of S3 compatible notifications, the notification ID will be set here + }, + ... 
+ ] +} + +*/ + +// utility function to convert the args list from string format +// (ampersand separated with equal sign) to parsed structure +RGWHTTPArgs string_to_args(const std::string& str_args) { + RGWHTTPArgs args; + args.set(str_args); + args.parse(); + return args; +} + +struct PSSubConfig { /* configuration of a single subscription */ + std::string name; + std::string topic; + std::string push_endpoint_name; + std::string push_endpoint_args; + std::string data_bucket_name; + std::string data_oid_prefix; + std::string s3_id; + std::string arn_topic; + RGWPubSubEndpoint::Ptr push_endpoint; + + void from_user_conf(CephContext *cct, const rgw_pubsub_sub_config& uc) { /* fill this config from a stored rgw_pubsub_sub_config */ + name = uc.name; + topic = uc.topic; + push_endpoint_name = uc.dest.push_endpoint; + data_bucket_name = uc.dest.bucket_name; + data_oid_prefix = uc.dest.oid_prefix; + s3_id = uc.s3_id; + arn_topic = uc.dest.arn_topic; + if (!push_endpoint_name.empty()) { + push_endpoint_args = uc.dest.push_endpoint_args; + try { + push_endpoint = RGWPubSubEndpoint::create(push_endpoint_name, arn_topic, string_to_args(push_endpoint_args), cct); + ldout(cct, 20) << "push endpoint created: " << push_endpoint->to_str() << dendl; + } catch (const RGWPubSubEndpoint::configuration_error& e) { + ldout(cct, 1) << "ERROR: failed to create push endpoint: " + << push_endpoint_name << " due to: " << e.what() << dendl; /* the failure is only logged; push_endpoint is not assigned on this path */ + } + } + } + + void dump(Formatter *f) const { /* note: arn_topic and the push_endpoint object are not part of the dump */ + encode_json("name", name, f); + encode_json("topic", topic, f); + encode_json("push_endpoint", push_endpoint_name, f); + encode_json("push_endpoint_args", push_endpoint_args, f); + encode_json("data_bucket_name", data_bucket_name, f); + encode_json("data_oid_prefix", data_oid_prefix, f); + encode_json("s3_id", s3_id, f); + } + + void init(CephContext *cct, const JSONFormattable& config, + const string& data_bucket_prefix, + const string& default_oid_prefix) { /* fill this config from a JSON "subscriptions" entry of the module config */ + name = config["name"]; + topic = config["topic"]; + push_endpoint_name = config["push_endpoint"]; + string default_bucket_name = data_bucket_prefix 
+ name; + data_bucket_name = config["data_bucket"](default_bucket_name.c_str()); + data_oid_prefix = config["data_oid_prefix"](default_oid_prefix.c_str()); + s3_id = config["s3_id"]; + arn_topic = config["arn_topic"]; + if (!push_endpoint_name.empty()) { + push_endpoint_args = config["push_endpoint_args"]; + try { + push_endpoint = RGWPubSubEndpoint::create(push_endpoint_name, arn_topic, string_to_args(push_endpoint_args), cct); + ldout(cct, 20) << "push endpoint created: " << push_endpoint->to_str() << dendl; + } catch (const RGWPubSubEndpoint::configuration_error& e) { + ldout(cct, 1) << "ERROR: failed to create push endpoint: " + << push_endpoint_name << " due to: " << e.what() << dendl; + } + } + } +}; + +using PSSubConfigRef = std::shared_ptr; + +struct PSTopicConfig { + std::string name; + std::set subs; + std::string opaque_data; + + void dump(Formatter *f) const { + encode_json("name", name, f); + encode_json("subs", subs, f); + encode_json("opaque", opaque_data, f); + } +}; + +struct PSNotificationConfig { + uint64_t id{0}; + string path; /* a path or a path prefix that would trigger the event (prefix: if ends with a wildcard) */ + string topic; + bool is_prefix{false}; + + + void dump(Formatter *f) const { + encode_json("id", id, f); + encode_json("path", path, f); + encode_json("topic", topic, f); + encode_json("is_prefix", is_prefix, f); + } + + void init(CephContext *cct, const JSONFormattable& config) { + path = config["path"]; + if (!path.empty() && path[path.size() - 1] == '*') { + path = path.substr(0, path.size() - 1); + is_prefix = true; + } + topic = config["topic"]; + } +}; + +template +static string json_str(const char *name, const T& obj, bool pretty = false) +{ + stringstream ss; + JSONFormatter f(pretty); + + encode_json(name, obj, &f); + f.flush(ss); + + return ss.str(); +} + +using PSTopicConfigRef = std::shared_ptr; +using TopicsRef = std::shared_ptr>; + +struct PSConfig { + const std::string id{"pubsub"}; + rgw_user user; + std::string 
data_bucket_prefix; + std::string data_oid_prefix; + + int events_retention_days{0}; + + uint64_t sync_instance{0}; + uint64_t max_id{0}; + + /* FIXME: no hard coded buckets, we'll have configurable topics */ + std::map subs; + std::map topics; + std::multimap notifications; + + bool start_with_full_sync{false}; + + void dump(Formatter *f) const { + encode_json("id", id, f); + encode_json("user", user, f); + encode_json("data_bucket_prefix", data_bucket_prefix, f); + encode_json("data_oid_prefix", data_oid_prefix, f); + encode_json("events_retention_days", events_retention_days, f); + encode_json("sync_instance", sync_instance, f); + encode_json("max_id", max_id, f); + { + Formatter::ArraySection section(*f, "subs"); + for (auto& sub : subs) { + encode_json("sub", *sub.second, f); + } + } + { + Formatter::ArraySection section(*f, "topics"); + for (auto& topic : topics) { + encode_json("topic", *topic.second, f); + } + } + { + Formatter::ObjectSection section(*f, "notifications"); + std::string last; + for (auto& notif : notifications) { + const string& n = notif.first; + if (n != last) { + if (!last.empty()) { + f->close_section(); + } + f->open_array_section(n.c_str()); + } + last = n; + encode_json("notifications", notif.second, f); + } + if (!last.empty()) { + f->close_section(); + } + } + encode_json("start_with_full_sync", start_with_full_sync, f); + } + + void init(CephContext *cct, const JSONFormattable& config) { + string uid = config["uid"]("pubsub"); + user = rgw_user(config["tenant"], uid); + data_bucket_prefix = config["data_bucket_prefix"]("pubsub-"); + data_oid_prefix = config["data_oid_prefix"]; + events_retention_days = config["events_retention_days"](PUBSUB_EVENTS_RETENTION_DEFAULT); + + for (auto& c : config["notifications"].array()) { + PSNotificationConfig nc; + nc.id = ++max_id; + nc.init(cct, c); + notifications.insert(std::make_pair(nc.path, nc)); + + PSTopicConfig topic_config = { .name = nc.topic }; + topics[nc.topic] = 
make_shared(topic_config); + } + for (auto& c : config["subscriptions"].array()) { + auto sc = std::make_shared(); + sc->init(cct, c, data_bucket_prefix, data_oid_prefix); + subs[sc->name] = sc; + auto iter = topics.find(sc->topic); + if (iter != topics.end()) { + iter->second->subs.insert(sc->name); + } + } + start_with_full_sync = config["start_with_full_sync"](false); + + ldout(cct, 5) << "pubsub: module config (parsed representation):\n" << json_str("config", *this, true) << dendl; + } + + void init_instance(const RGWRealm& realm, uint64_t instance_id) { + sync_instance = instance_id; + } + + void get_topics(CephContext *cct, const rgw_bucket& bucket, const rgw_obj_key& key, TopicsRef *result) { + const std::string path = bucket.name + "/" + key.name; + + auto iter = notifications.upper_bound(path); + if (iter == notifications.begin()) { + return; + } + + do { + --iter; + if (iter->first.size() > path.size()) { + break; + } + if (path.compare(0, iter->first.size(), iter->first) != 0) { + break; + } + + PSNotificationConfig& target = iter->second; + + if (!target.is_prefix && + path.size() != iter->first.size()) { + continue; + } + + auto topic = topics.find(target.topic); + if (topic == topics.end()) { + continue; + } + + ldout(cct, 20) << ": found topic for path=" << bucket << "/" << key << ": id=" << target.id << + " target_path=" << target.path << ", topic=" << target.topic << dendl; + (*result)->push_back(topic->second); + } while (iter != notifications.begin()); + } + + bool find_sub(const string& name, PSSubConfigRef *ref) { + auto iter = subs.find(name); + if (iter != subs.end()) { + *ref = iter->second; + return true; + } + return false; + } +}; + +using PSConfigRef = std::shared_ptr; +template +using EventRef = std::shared_ptr; + +struct objstore_event { + string id; + const rgw_bucket& bucket; + const rgw_obj_key& key; + const ceph::real_time& mtime; + const std::vector > *attrs; + + objstore_event(const rgw_bucket& _bucket, + const rgw_obj_key& _key, 
+ const ceph::real_time& _mtime, + const std::vector > *_attrs) : bucket(_bucket), + key(_key), + mtime(_mtime), + attrs(_attrs) {} + + string get_hash() { + string etag; + RGWMD5Etag hash; + hash.update(bucket.bucket_id); + hash.update(key.name); + hash.update(key.instance); + hash.finish(&etag); + + assert(etag.size() > 8); + + return etag.substr(0, 8); + } + + void dump(Formatter *f) const { + { + Formatter::ObjectSection s(*f, "bucket"); + encode_json("name", bucket.name, f); + encode_json("tenant", bucket.tenant, f); + encode_json("bucket_id", bucket.bucket_id, f); + } + { + Formatter::ObjectSection s(*f, "key"); + encode_json("name", key.name, f); + encode_json("instance", key.instance, f); + } + utime_t mt(mtime); + encode_json("mtime", mt, f); + Formatter::ObjectSection s(*f, "attrs"); + if (attrs) { + for (auto& attr : *attrs) { + encode_json(attr.first.c_str(), attr.second.c_str(), f); + } + } + } +}; + +static void make_event_ref(CephContext *cct, const rgw_bucket& bucket, + const rgw_obj_key& key, + const ceph::real_time& mtime, + const std::vector > *attrs, + rgw::notify::EventType event_type, + EventRef *event) { + *event = std::make_shared(); + + EventRef& e = *event; + e->event_name = rgw::notify::to_ceph_string(event_type); + e->source = bucket.name + "/" + key.name; + e->timestamp = real_clock::now(); + + objstore_event oevent(bucket, key, mtime, attrs); + + const utime_t ts(e->timestamp); + set_event_id(e->id, oevent.get_hash(), ts); + + encode_json("info", oevent, &e->info); +} + +static void make_s3_record_ref(CephContext *cct, const rgw_bucket& bucket, + const rgw_user& owner, + const rgw_obj_key& key, + const ceph::real_time& mtime, + const std::vector > *attrs, + rgw::notify::EventType event_type, + EventRef *record) { + *record = std::make_shared(); + + EventRef& r = *record; + r->eventTime = mtime; + r->eventName = rgw::notify::to_string(event_type); + // userIdentity: not supported in sync module + // x_amz_request_id: not supported in 
sync module + // x_amz_id_2: not supported in sync module + // configurationId is filled from subscription configuration + r->bucket_name = bucket.name; + r->bucket_ownerIdentity = owner.to_str(); + r->bucket_arn = to_string(rgw::ARN(bucket)); + r->bucket_id = bucket.bucket_id; // rgw extension + r->object_key = key.name; + // object_size not supported in sync module + objstore_event oevent(bucket, key, mtime, attrs); + r->object_etag = oevent.get_hash(); + r->object_versionId = key.instance; + + // use timestamp as per key sequence id (hex encoded) + const utime_t ts(real_clock::now()); + boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t), + std::back_inserter(r->object_sequencer)); + + set_event_id(r->id, r->object_etag, ts); +} + +class PSManager; +using PSManagerRef = std::shared_ptr; + +struct PSEnv { + PSConfigRef conf; + shared_ptr data_user_info; + PSManagerRef manager; + + PSEnv() : conf(make_shared()), + data_user_info(make_shared()) {} + + void init(CephContext *cct, const JSONFormattable& config) { + conf->init(cct, config); + } + + void init_instance(const RGWRealm& realm, uint64_t instance_id, PSManagerRef& mgr); +}; + +using PSEnvRef = std::shared_ptr; + +template +class PSEvent { + const EventRef event; + +public: + PSEvent(const EventRef& _event) : event(_event) {} + + void format(bufferlist *bl) const { + bl->append(json_str("", *event)); + } + + void encode_event(bufferlist& bl) const { + encode(*event, bl); + } + + const string& id() const { + return event->id; + } +}; + +template +class RGWSingletonCR : public RGWCoroutine { + friend class WrapperCR; + + boost::asio::coroutine wrapper_state; + bool started{false}; + int operate_ret{0}; + + struct WaiterInfo { + RGWCoroutine *cr{nullptr}; + T *result; + }; + using WaiterInfoRef = std::shared_ptr; + + deque waiters; + + void add_waiter(RGWCoroutine *cr, T *result) { + auto waiter = std::make_shared(); + waiter->cr = cr; + waiter->result = result; + 
waiters.push_back(waiter); + }; + + bool get_next_waiter(WaiterInfoRef *waiter) { + if (waiters.empty()) { + waiter->reset(); + return false; + } + + *waiter = waiters.front(); + waiters.pop_front(); + return true; + } + + int operate_wrapper() override { + reenter(&wrapper_state) { + while (!is_done()) { + ldout(cct, 20) << __func__ << "(): operate_wrapper() -> operate()" << dendl; + operate_ret = operate(); + if (operate_ret < 0) { + ldout(cct, 20) << *this << ": operate() returned r=" << operate_ret << dendl; + } + if (!is_done()) { + yield; + } + } + + ldout(cct, 20) << __func__ << "(): RGWSingletonCR: operate_wrapper() done, need to wake up " << waiters.size() << " waiters" << dendl; + /* we're done, can't yield anymore */ + + WaiterInfoRef waiter; + while (get_next_waiter(&waiter)) { + ldout(cct, 20) << __func__ << "(): RGWSingletonCR: waking up waiter" << dendl; + waiter->cr->set_retcode(retcode); + waiter->cr->set_sleeping(false); + return_result(waiter->result); + put(); + } + + return retcode; + } + return 0; + } + + virtual void return_result(T *result) {} + +public: + RGWSingletonCR(CephContext *_cct) + : RGWCoroutine(_cct) {} + + int execute(RGWCoroutine *caller, T *result = nullptr) { + if (!started) { + ldout(cct, 20) << __func__ << "(): singleton not started, starting" << dendl; + started = true; + caller->call(this); + return 0; + } else if (!is_done()) { + ldout(cct, 20) << __func__ << "(): singleton not done yet, registering as waiter" << dendl; + get(); + add_waiter(caller, result); + caller->set_sleeping(true); + return 0; + } + + ldout(cct, 20) << __func__ << "(): singleton done, returning retcode=" << retcode << dendl; + caller->set_retcode(retcode); + return_result(result); + return retcode; + } +}; + + +class PSSubscription; +using PSSubscriptionRef = std::shared_ptr; + +class PSSubscription { + class InitCR; + friend class InitCR; + friend class RGWPSHandleObjEventCR; + + RGWDataSyncEnv *sync_env; + PSEnvRef env; + PSSubConfigRef sub_conf; 
+ std::shared_ptr get_bucket_info_result; + RGWBucketInfo *bucket_info{nullptr}; + RGWDataAccessRef data_access; + RGWDataAccess::BucketRef bucket; + + InitCR *init_cr{nullptr}; + + class InitBucketLifecycleCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + PSConfigRef& conf; + LCRule rule; + + int retention_days; + + rgw_bucket_lifecycle_config_params lc_config; + + public: + InitBucketLifecycleCR(RGWDataSyncEnv *_sync_env, + PSConfigRef& _conf, + RGWBucketInfo& _bucket_info, + std::map& _bucket_attrs) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + conf(_conf) { + lc_config.bucket_info = _bucket_info; + lc_config.bucket_attrs = _bucket_attrs; + retention_days = conf->events_retention_days; + } + + int operate() override { + reenter(this) { + + rule.init_simple_days_rule("Pubsub Expiration", "" /* all objects in bucket */, retention_days); + + { + /* maybe we already have it configured? */ + RGWLifecycleConfiguration old_config; + auto aiter = lc_config.bucket_attrs.find(RGW_ATTR_LC); + if (aiter != lc_config.bucket_attrs.end()) { + bufferlist::const_iterator iter{&aiter->second}; + try { + old_config.decode(iter); + } catch (const buffer::error& e) { + ldout(cct, 0) << __func__ << "(): decode life cycle config failed" << dendl; + } + } + + auto old_rules = old_config.get_rule_map(); + for (auto ori : old_rules) { + auto& old_rule = ori.second; + + if (old_rule.get_prefix().empty() && + old_rule.get_expiration().get_days() == retention_days && + old_rule.is_enabled()) { + ldout(sync_env->cct, 20) << "no need to set lifecycle rule on bucketi, existing rule matches config" << dendl; + return set_cr_done(); + } + } + } + + lc_config.config.add_rule(rule); + yield call(new RGWBucketLifecycleConfigCR(sync_env->async_rados, + sync_env->store, + lc_config)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to set lifecycle on bucket: ret=" << retcode << dendl; + return set_cr_error(retcode); + } + + return set_cr_done(); + } + return 0; + } 
+ }; + + class InitCR : public RGWSingletonCR { + RGWDataSyncEnv *sync_env; + PSSubscriptionRef sub; + rgw_get_bucket_info_params get_bucket_info; + rgw_bucket_create_local_params create_bucket; + PSConfigRef& conf; + PSSubConfigRef& sub_conf; + int i; + + public: + InitCR(RGWDataSyncEnv *_sync_env, + PSSubscriptionRef& _sub) : RGWSingletonCR(_sync_env->cct), + sync_env(_sync_env), + sub(_sub), conf(sub->env->conf), + sub_conf(sub->sub_conf) { + } + + int operate() override { + reenter(this) { + get_bucket_info.tenant = conf->user.tenant; + get_bucket_info.bucket_name = sub_conf->data_bucket_name; + sub->get_bucket_info_result = make_shared(); + + for (i = 0; i < 2; ++i) { + yield call(new RGWGetBucketInfoCR(sync_env->async_rados, + sync_env->store, + get_bucket_info, + sub->get_bucket_info_result)); + if (retcode < 0 && retcode != -ENOENT) { + ldout(sync_env->cct, 0) << "ERROR: failed to geting bucket info: " << "tenant=" + << get_bucket_info.tenant << " name=" << get_bucket_info.bucket_name << ": ret=" << retcode << dendl; + } + if (retcode == 0) { + { + auto& result = sub->get_bucket_info_result; + sub->bucket_info = &result->bucket_info; + + int ret = sub->data_access->get_bucket(result->bucket_info, result->attrs, &sub->bucket); + if (ret < 0) { + ldout(sync_env->cct, 0) << "ERROR: data_access.get_bucket() bucket=" << result->bucket_info.bucket << " failed, ret=" << ret << dendl; + return set_cr_error(ret); + } + } + + yield call(new InitBucketLifecycleCR(sync_env, conf, + sub->get_bucket_info_result->bucket_info, + sub->get_bucket_info_result->attrs)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to init lifecycle on bucket (bucket=" << sub_conf->data_bucket_name << ") ret=" << retcode << dendl; + return set_cr_error(retcode); + } + + return set_cr_done(); + } + + create_bucket.user_info = sub->env->data_user_info; + create_bucket.bucket_name = sub_conf->data_bucket_name; + ldout(sync_env->cct, 20) << "pubsub: bucket create: using user 
info: " << json_str("obj", *sub->env->data_user_info, true) << dendl; + yield call(new RGWBucketCreateLocalCR(sync_env->async_rados, + sync_env->store, + create_bucket)); + if (retcode < 0) { + ldout(sync_env->cct, 0) << "ERROR: failed to create bucket: " << "tenant=" + << get_bucket_info.tenant << " name=" << get_bucket_info.bucket_name << ": ret=" << retcode << dendl; + return set_cr_error(retcode); + } + + /* second iteration: we got -ENOENT and created a bucket */ + } + + /* failed twice on -ENOENT, unexpected */ + ldout(sync_env->cct, 0) << "ERROR: failed to create bucket " << "tenant=" << get_bucket_info.tenant + << " name=" << get_bucket_info.bucket_name << dendl; + return set_cr_error(-EIO); + } + return 0; + } + }; + + template + class StoreEventCR : public RGWCoroutine { + RGWDataSyncEnv* const sync_env; + const PSSubscriptionRef sub; + const PSEvent pse; + const string oid_prefix; + + public: + StoreEventCR(RGWDataSyncEnv* const _sync_env, + const PSSubscriptionRef& _sub, + const EventRef& _event) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + sub(_sub), + pse(_event), + oid_prefix(sub->sub_conf->data_oid_prefix) { + } + + int operate() override { + rgw_object_simple_put_params put_obj; + reenter(this) { + + put_obj.bucket = sub->bucket; + put_obj.key = rgw_obj_key(oid_prefix + pse.id()); + + pse.format(&put_obj.data); + + { + bufferlist bl; + pse.encode_event(bl); + bufferlist bl64; + bl.encode_base64(bl64); + put_obj.user_data = bl64.to_str(); + } + + yield call(new RGWObjectSimplePutCR(sync_env->async_rados, + sync_env->store, + put_obj)); + if (retcode < 0) { + ldpp_dout(sync_env->dpp, 10) << "failed to store event: " << put_obj.bucket << "/" << put_obj.key << " ret=" << retcode << dendl; + return set_cr_error(retcode); + } else { + ldpp_dout(sync_env->dpp, 20) << "event stored: " << put_obj.bucket << "/" << put_obj.key << dendl; + } + + return set_cr_done(); + } + return 0; + } + }; + + template + class PushEventCR : public RGWCoroutine { 
+ RGWDataSyncEnv* const sync_env; + const EventRef event; + const PSSubConfigRef& sub_conf; + + public: + PushEventCR(RGWDataSyncEnv* const _sync_env, + const PSSubscriptionRef& _sub, + const EventRef& _event) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + event(_event), + sub_conf(_sub->sub_conf) { + } + + int operate() override { + reenter(this) { + ceph_assert(sub_conf->push_endpoint); + yield call(sub_conf->push_endpoint->send_to_completion_async(*event.get(), sync_env)); + + if (retcode < 0) { + ldout(sync_env->cct, 10) << "failed to push event: " << event->id << + " to endpoint: " << sub_conf->push_endpoint_name << " ret=" << retcode << dendl; + return set_cr_error(retcode); + } + + ldout(sync_env->cct, 20) << "event: " << event->id << + " pushed to endpoint: " << sub_conf->push_endpoint_name << dendl; + return set_cr_done(); + } + return 0; + } + }; + +public: + PSSubscription(RGWDataSyncEnv *_sync_env, + PSEnvRef _env, + PSSubConfigRef& _sub_conf) : sync_env(_sync_env), + env(_env), + sub_conf(_sub_conf), + data_access(std::make_shared(sync_env->store)) {} + + PSSubscription(RGWDataSyncEnv *_sync_env, + PSEnvRef _env, + rgw_pubsub_sub_config& user_sub_conf) : sync_env(_sync_env), + env(_env), + sub_conf(std::make_shared()), + data_access(std::make_shared(sync_env->store)) { + sub_conf->from_user_conf(sync_env->cct, user_sub_conf); + } + virtual ~PSSubscription() { + if (init_cr) { + init_cr->put(); + } + } + + template + static PSSubscriptionRef get_shared(RGWDataSyncEnv *_sync_env, + PSEnvRef _env, + C& _sub_conf) { + auto sub = std::make_shared(_sync_env, _env, _sub_conf); + sub->init_cr = new InitCR(_sync_env, sub); + sub->init_cr->get(); + return sub; + } + + int call_init_cr(RGWCoroutine *caller) { + return init_cr->execute(caller); + } + + template + static RGWCoroutine *store_event_cr(RGWDataSyncEnv* const sync_env, const PSSubscriptionRef& sub, const EventRef& event) { + return new StoreEventCR(sync_env, sub, event); + } + + template + 
static RGWCoroutine *push_event_cr(RGWDataSyncEnv* const sync_env, const PSSubscriptionRef& sub, const EventRef& event) { + return new PushEventCR(sync_env, sub, event); + } + friend class InitCR; +}; + +class PSManager +{ + RGWDataSyncEnv *sync_env; + PSEnvRef env; + + std::map subs; + + class GetSubCR : public RGWSingletonCR { + RGWDataSyncEnv *sync_env; + PSManagerRef mgr; + rgw_user owner; + string sub_name; + string sub_id; + PSSubscriptionRef *ref; + + PSConfigRef conf; + + PSSubConfigRef sub_conf; + rgw_pubsub_sub_config user_sub_conf; + + public: + GetSubCR(RGWDataSyncEnv *_sync_env, + PSManagerRef& _mgr, + const rgw_user& _owner, + const string& _sub_name, + PSSubscriptionRef *_ref) : RGWSingletonCR(_sync_env->cct), + sync_env(_sync_env), + mgr(_mgr), + owner(_owner), + sub_name(_sub_name), + ref(_ref), + conf(mgr->env->conf) { + } + ~GetSubCR() { } + + int operate() override { + reenter(this) { + if (owner.empty()) { + if (!conf->find_sub(sub_name, &sub_conf)) { + ldout(sync_env->cct, 10) << "failed to find subscription config: name=" << sub_name << dendl; + mgr->remove_get_sub(owner, sub_name); + return set_cr_error(-ENOENT); + } + + *ref = PSSubscription::get_shared(sync_env, mgr->env, sub_conf); + } else { + using ReadInfoCR = RGWSimpleRadosReadCR; + yield { + RGWUserPubSub ups(sync_env->store, owner); + rgw_raw_obj obj; + ups.get_sub_meta_obj(sub_name, &obj); + bool empty_on_enoent = false; + call(new ReadInfoCR(sync_env->async_rados, sync_env->store->svc.sysobj, + obj, + &user_sub_conf, empty_on_enoent)); + } + if (retcode < 0) { + mgr->remove_get_sub(owner, sub_name); + return set_cr_error(retcode); + } + + *ref = PSSubscription::get_shared(sync_env, mgr->env, user_sub_conf); + } + + yield (*ref)->call_init_cr(this); + if (retcode < 0) { + ldout(sync_env->cct, 10) << "failed to init subscription" << dendl; + mgr->remove_get_sub(owner, sub_name); + return set_cr_error(retcode); + } + + if (owner.empty()) { + mgr->subs[sub_name] = *ref; + } + 
mgr->remove_get_sub(owner, sub_name); + + return set_cr_done(); + } + return 0; + } + + void return_result(PSSubscriptionRef *result) override { + ldout(cct, 20) << __func__ << "(): returning result: retcode=" << retcode << " resultp=" << (void *)result << dendl; + if (retcode >= 0) { + *result = *ref; + } + } + }; + + string sub_id(const rgw_user& owner, const string& sub_name) { + string owner_prefix; + if (!owner.empty()) { + owner_prefix = owner.to_str() + "/"; + } + + return owner_prefix + sub_name; + } + + std::map get_subs; + + GetSubCR *& get_get_subs(const rgw_user& owner, const string& name) { + return get_subs[sub_id(owner, name)]; + } + + void remove_get_sub(const rgw_user& owner, const string& name) { + get_subs.erase(sub_id(owner, name)); + } + + bool find_sub_instance(const rgw_user& owner, const string& sub_name, PSSubscriptionRef *sub) { + auto iter = subs.find(sub_id(owner, sub_name)); + if (iter != subs.end()) { + *sub = iter->second; + return true; + } + return false; + } + + PSManager(RGWDataSyncEnv *_sync_env, + PSEnvRef _env) : sync_env(_sync_env), + env(_env) {} + +public: + static PSManagerRef get_shared(RGWDataSyncEnv *_sync_env, + PSEnvRef _env) { + return std::shared_ptr(new PSManager(_sync_env, _env)); + } + + static int call_get_subscription_cr(RGWDataSyncEnv *sync_env, PSManagerRef& mgr, + RGWCoroutine *caller, const rgw_user& owner, const string& sub_name, PSSubscriptionRef *ref) { + if (mgr->find_sub_instance(owner, sub_name, ref)) { + /* found it! 
nothing to execute */ + ldout(sync_env->cct, 20) << __func__ << "(): found sub instance" << dendl; + } + auto& gs = mgr->get_get_subs(owner, sub_name); + if (!gs) { + ldout(sync_env->cct, 20) << __func__ << "(): first get subs" << dendl; + gs = new GetSubCR(sync_env, mgr, owner, sub_name, ref); + } + ldout(sync_env->cct, 20) << __func__ << "(): executing get subs" << dendl; + return gs->execute(caller, ref); + } + + friend class GetSubCR; +}; + +void PSEnv::init_instance(const RGWRealm& realm, uint64_t instance_id, PSManagerRef& mgr) { + manager = mgr; + conf->init_instance(realm, instance_id); +} + +class RGWPSInitEnvCBCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + PSEnvRef env; + PSConfigRef& conf; + + rgw_user_create_params create_user; + rgw_get_user_info_params get_user_info; +public: + RGWPSInitEnvCBCR(RGWDataSyncEnv *_sync_env, + PSEnvRef& _env) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + env(_env), conf(env->conf) {} + int operate() override { + reenter(this) { + ldout(sync_env->cct, 0) << ": init pubsub config zone=" << sync_env->source_zone << dendl; + + /* nothing to do here right now */ + create_user.user = conf->user; + create_user.max_buckets = 0; /* unlimited */ + create_user.display_name = "pubsub"; + create_user.generate_key = false; + yield call(new RGWUserCreateCR(sync_env->async_rados, sync_env->store, create_user)); + if (retcode < 0) { + ldout(sync_env->store->ctx(), 0) << "ERROR: failed to create rgw user: ret=" << retcode << dendl; + return set_cr_error(retcode); + } + + get_user_info.user = conf->user; + yield call(new RGWGetUserInfoCR(sync_env->async_rados, sync_env->store, get_user_info, env->data_user_info)); + if (retcode < 0) { + ldout(sync_env->store->ctx(), 0) << "ERROR: failed to create rgw user: ret=" << retcode << dendl; + return set_cr_error(retcode); + } + + ldout(sync_env->cct, 20) << "pubsub: get user info cr returned: " << json_str("obj", *env->data_user_info, true) << dendl; + + + return set_cr_done(); 
+ } + return 0; + } +}; + +bool match(const rgw_pubsub_topic_filter& filter, const std::string& key_name, rgw::notify::EventType event_type) { + if (!match(filter.events, event_type)) { + return false; + } + if (!match(filter.s3_filter.key_filter, key_name)) { + return false; + } + return true; +} + +class RGWPSFindBucketTopicsCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + PSEnvRef env; + rgw_user owner; + rgw_bucket bucket; + rgw_obj_key key; + rgw::notify::EventType event_type; + + RGWUserPubSub ups; + + rgw_raw_obj bucket_obj; + rgw_raw_obj user_obj; + rgw_pubsub_bucket_topics bucket_topics; + rgw_pubsub_user_topics user_topics; + TopicsRef *topics; +public: + RGWPSFindBucketTopicsCR(RGWDataSyncEnv *_sync_env, + PSEnvRef& _env, + const rgw_user& _owner, + const rgw_bucket& _bucket, + const rgw_obj_key& _key, + rgw::notify::EventType _event_type, + TopicsRef *_topics) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + env(_env), + owner(_owner), + bucket(_bucket), + key(_key), + event_type(_event_type), + ups(_sync_env->store, owner), + topics(_topics) { + *topics = std::make_shared >(); + } + int operate() override { + reenter(this) { + ups.get_bucket_meta_obj(bucket, &bucket_obj); + ups.get_user_meta_obj(&user_obj); + + using ReadInfoCR = RGWSimpleRadosReadCR; + yield { + bool empty_on_enoent = true; + call(new ReadInfoCR(sync_env->async_rados, sync_env->store->svc.sysobj, + bucket_obj, + &bucket_topics, empty_on_enoent)); + } + if (retcode < 0 && retcode != -ENOENT) { + return set_cr_error(retcode); + } + + ldout(sync_env->cct, 20) << "RGWPSFindBucketTopicsCR(): found " << bucket_topics.topics.size() << " topics for bucket " << bucket << dendl; + + if (!bucket_topics.topics.empty()) { + using ReadUserTopicsInfoCR = RGWSimpleRadosReadCR; + yield { + bool empty_on_enoent = true; + call(new ReadUserTopicsInfoCR(sync_env->async_rados, sync_env->store->svc.sysobj, + user_obj, + &user_topics, empty_on_enoent)); + } + if (retcode < 0 && retcode != 
-ENOENT) { + return set_cr_error(retcode); + } + } + + for (auto& titer : bucket_topics.topics) { + auto& topic_filter = titer.second; + auto& info = topic_filter.topic; + if (!match(topic_filter, key.name, event_type)) { + continue; + } + std::shared_ptr tc = std::make_shared(); + tc->name = info.name; + tc->subs = user_topics.topics[info.name].subs; + tc->opaque_data = info.opaque_data; + (*topics)->push_back(tc); + } + + env->conf->get_topics(sync_env->cct, bucket, key, topics); + return set_cr_done(); + } + return 0; + } +}; + +class RGWPSHandleObjEventCR : public RGWCoroutine { + RGWDataSyncEnv* const sync_env; + const PSEnvRef env; + const rgw_user& owner; + const EventRef event; + const EventRef record; + const TopicsRef topics; + const std::array owners; + bool has_subscriptions; + bool event_handled; + bool sub_conf_found; + PSSubscriptionRef sub; + std::array::const_iterator oiter; + std::vector::const_iterator titer; + std::set::const_iterator siter; + int last_sub_conf_error; + +public: + RGWPSHandleObjEventCR(RGWDataSyncEnv* const _sync_env, + const PSEnvRef _env, + const rgw_user& _owner, + const EventRef& _event, + const EventRef& _record, + const TopicsRef& _topics) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + env(_env), + owner(_owner), + event(_event), + record(_record), + topics(_topics), + owners({owner, rgw_user{}}), + has_subscriptions(false), + event_handled(false) {} + + int operate() override { + reenter(this) { + ldout(sync_env->cct, 20) << ": handle event: obj: z=" << sync_env->source_zone + << " event=" << json_str("event", *event, false) + << " owner=" << owner << dendl; + + ldout(sync_env->cct, 20) << "pubsub: " << topics->size() << " topics found for path" << dendl; + + // outside caller should check that + ceph_assert(!topics->empty()); + + if (perfcounter) perfcounter->inc(l_rgw_pubsub_event_triggered); + + // loop over all topics related to the bucket/object + for (titer = topics->begin(); titer != topics->end(); 
++titer) { + ldout(sync_env->cct, 20) << ": notification for " << event->source << ": topic=" << + (*titer)->name << ", has " << (*titer)->subs.size() << " subscriptions" << dendl; + // loop over all subscriptions of the topic + for (siter = (*titer)->subs.begin(); siter != (*titer)->subs.end(); ++siter) { + ldout(sync_env->cct, 20) << ": subscription: " << *siter << dendl; + has_subscriptions = true; + sub_conf_found = false; + // try to read subscription configuration from global/user cond + // configuration is considered missing only if does not exist in either + for (oiter = owners.begin(); oiter != owners.end(); ++oiter) { + yield PSManager::call_get_subscription_cr(sync_env, env->manager, this, *oiter, *siter, &sub); + if (retcode < 0) { + if (sub_conf_found) { + // not a real issue, sub conf already found + retcode = 0; + } + last_sub_conf_error = retcode; + continue; + } + sub_conf_found = true; + if (sub->sub_conf->s3_id.empty()) { + // subscription was not made by S3 compatible API + ldout(sync_env->cct, 20) << "storing event for subscription=" << *siter << " owner=" << *oiter << " ret=" << retcode << dendl; + yield call(PSSubscription::store_event_cr(sync_env, sub, event)); + if (retcode < 0) { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_store_fail); + ldout(sync_env->cct, 1) << "ERROR: failed to store event for subscription=" << *siter << " ret=" << retcode << dendl; + } else { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_store_ok); + event_handled = true; + } + if (sub->sub_conf->push_endpoint) { + ldout(sync_env->cct, 20) << "push event for subscription=" << *siter << " owner=" << *oiter << " ret=" << retcode << dendl; + yield call(PSSubscription::push_event_cr(sync_env, sub, event)); + if (retcode < 0) { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed); + ldout(sync_env->cct, 1) << "ERROR: failed to push event for subscription=" << *siter << " ret=" << retcode << dendl; + } else { + if (perfcounter) 
perfcounter->inc(l_rgw_pubsub_push_ok); + event_handled = true; + } + } + } else { + // subscription was made by S3 compatible API + ldout(sync_env->cct, 20) << "storing record for subscription=" << *siter << " owner=" << *oiter << " ret=" << retcode << dendl; + record->configurationId = sub->sub_conf->s3_id; + record->opaque_data = (*titer)->opaque_data; + yield call(PSSubscription::store_event_cr(sync_env, sub, record)); + if (retcode < 0) { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_store_fail); + ldout(sync_env->cct, 1) << "ERROR: failed to store record for subscription=" << *siter << " ret=" << retcode << dendl; + } else { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_store_ok); + event_handled = true; + } + if (sub->sub_conf->push_endpoint) { + ldout(sync_env->cct, 20) << "push record for subscription=" << *siter << " owner=" << *oiter << " ret=" << retcode << dendl; + yield call(PSSubscription::push_event_cr(sync_env, sub, record)); + if (retcode < 0) { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed); + ldout(sync_env->cct, 1) << "ERROR: failed to push record for subscription=" << *siter << " ret=" << retcode << dendl; + } else { + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok); + event_handled = true; + } + } + } + } + if (!sub_conf_found) { + // could not find conf for subscription at user or global levels + if (perfcounter) perfcounter->inc(l_rgw_pubsub_missing_conf); + ldout(sync_env->cct, 1) << "ERROR: failed to find subscription config for subscription=" << *siter + << " ret=" << last_sub_conf_error << dendl; + if (retcode == -ENOENT) { + // missing subscription info should be reflected back as invalid argument + // and not as missing object + retcode = -EINVAL; + } + } + } + } + if (has_subscriptions && !event_handled) { + // event is considered "lost" of it has subscriptions on any of its topics + // but it was not stored in, or pushed to, any of them + if (perfcounter) perfcounter->inc(l_rgw_pubsub_event_lost); + } + 
if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +// coroutine invoked on remote object creation: builds a ceph event and an S3 record from the stat result and passes both to RGWPSHandleObjEventCR +class RGWPSHandleRemoteObjCBCR : public RGWStatRemoteObjCBCR { + RGWDataSyncEnv *sync_env; + PSEnvRef env; + std::optional versioned_epoch; + EventRef event; + EventRef record; + TopicsRef topics; +public: + RGWPSHandleRemoteObjCBCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, + PSEnvRef _env, std::optional _versioned_epoch, + TopicsRef& _topics) : RGWStatRemoteObjCBCR(_sync_env, _bucket_info, _key), + sync_env(_sync_env), + env(_env), + versioned_epoch(_versioned_epoch), + topics(_topics) { + } + int operate() override { + reenter(this) { + ldout(sync_env->cct, 20) << ": stat of remote obj: z=" << sync_env->source_zone + << " b=" << bucket_info.bucket << " k=" << key << " size=" << size << " mtime=" << mtime + << " attrs=" << attrs << dendl; + { + std::vector > attrs; // attribute list given to the event/record builders; shadows the attrs logged above + for (const auto& attr : this->attrs) { // walk the object's attrs (the member), not the still-empty local list + std::string k = attr.first; + if (boost::algorithm::starts_with(k, RGW_ATTR_PREFIX)) { + k = k.substr(sizeof(RGW_ATTR_PREFIX) - 1); + } + attrs.push_back(std::make_pair(k, attr.second.to_str())); // NOTE(review): assumes the member attrs values are bufferlists exposing to_str() - confirm + } + // at this point we don't know whether we need the ceph event or S3 record + // this is why both are created here, once we have information about the + // subscription, we will store/push only the relevant ones + make_event_ref(sync_env->cct, + bucket_info.bucket, key, + mtime, &attrs, + rgw::notify::ObjectCreated, &event); + make_s3_record_ref(sync_env->cct, + bucket_info.bucket, bucket_info.owner, key, + mtime, &attrs, + rgw::notify::ObjectCreated, &record); + } + + yield call(new RGWPSHandleObjEventCR(sync_env, env, bucket_info.owner, event, record, topics)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +class RGWPSHandleRemoteObjCR : public RGWCallStatRemoteObjCR { + PSEnvRef env; + std::optional versioned_epoch; + TopicsRef topics; 
+public: + RGWPSHandleRemoteObjCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, + PSEnvRef _env, std::optional _versioned_epoch, + TopicsRef& _topics) : RGWCallStatRemoteObjCR(_sync_env, _bucket_info, _key), + env(_env), versioned_epoch(_versioned_epoch), + topics(_topics) { + } + + ~RGWPSHandleRemoteObjCR() override {} + + RGWStatRemoteObjCBCR *allocate_callback() override { + return new RGWPSHandleRemoteObjCBCR(sync_env, bucket_info, key, env, versioned_epoch, topics); + } +}; + +class RGWPSHandleObjCreateCR : public RGWCoroutine { + + RGWDataSyncEnv *sync_env; + RGWBucketInfo bucket_info; + rgw_obj_key key; + PSEnvRef env; + std::optional versioned_epoch; + TopicsRef topics; +public: + RGWPSHandleObjCreateCR(RGWDataSyncEnv *_sync_env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, + PSEnvRef _env, std::optional _versioned_epoch) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + bucket_info(_bucket_info), + key(_key), + env(_env), + versioned_epoch(_versioned_epoch) { + } + + ~RGWPSHandleObjCreateCR() override {} + + int operate() override { + reenter(this) { + yield call(new RGWPSFindBucketTopicsCR(sync_env, env, bucket_info.owner, + bucket_info.bucket, key, + rgw::notify::ObjectCreated, + &topics)); + if (retcode < 0) { + ldout(sync_env->cct, 1) << "ERROR: RGWPSFindBucketTopicsCR returned ret=" << retcode << dendl; + return set_cr_error(retcode); + } + if (topics->empty()) { + ldout(sync_env->cct, 20) << "no topics found for " << bucket_info.bucket << "/" << key << dendl; + return set_cr_done(); + } + yield call(new RGWPSHandleRemoteObjCR(sync_env, bucket_info, key, env, versioned_epoch, topics)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } +}; + +// coroutine invoked on remote object deletion +class RGWPSGenericObjEventCBCR : public RGWCoroutine { + RGWDataSyncEnv *sync_env; + PSEnvRef env; + rgw_user owner; + rgw_bucket bucket; + rgw_obj_key key; + 
ceph::real_time mtime; + rgw::notify::EventType event_type; + EventRef event; + EventRef record; + TopicsRef topics; +public: + RGWPSGenericObjEventCBCR(RGWDataSyncEnv *_sync_env, + PSEnvRef _env, + RGWBucketInfo& _bucket_info, rgw_obj_key& _key, const ceph::real_time& _mtime, + rgw::notify::EventType _event_type) : RGWCoroutine(_sync_env->cct), + sync_env(_sync_env), + env(_env), + owner(_bucket_info.owner), + bucket(_bucket_info.bucket), + key(_key), + mtime(_mtime), event_type(_event_type) {} + int operate() override { + reenter(this) { + ldout(sync_env->cct, 20) << ": remove remote obj: z=" << sync_env->source_zone + << " b=" << bucket << " k=" << key << " mtime=" << mtime << dendl; + yield call(new RGWPSFindBucketTopicsCR(sync_env, env, owner, bucket, key, event_type, &topics)); + if (retcode < 0) { + ldout(sync_env->cct, 1) << "ERROR: RGWPSFindBucketTopicsCR returned ret=" << retcode << dendl; + return set_cr_error(retcode); + } + if (topics->empty()) { + ldout(sync_env->cct, 20) << "no topics found for " << bucket << "/" << key << dendl; + return set_cr_done(); + } + // at this point we don't know whether we need the ceph event or S3 record + // this is why both are created here, once we have information about the + // subscription, we will store/push only the relevant ones + make_event_ref(sync_env->cct, + bucket, key, + mtime, nullptr, + event_type, &event); + make_s3_record_ref(sync_env->cct, + bucket, owner, key, + mtime, nullptr, + event_type, &record); + yield call(new RGWPSHandleObjEventCR(sync_env, env, owner, event, record, topics)); + if (retcode < 0) { + return set_cr_error(retcode); + } + return set_cr_done(); + } + return 0; + } + +}; + +class RGWPSDataSyncModule : public RGWDataSyncModule { + PSEnvRef env; + PSConfigRef& conf; + +public: + RGWPSDataSyncModule(CephContext *cct, const JSONFormattable& config) : env(std::make_shared()), conf(env->conf) { + env->init(cct, config); + } + + ~RGWPSDataSyncModule() override {} + + void 
init(RGWDataSyncEnv *sync_env, uint64_t instance_id) override { + PSManagerRef mgr = PSManager::get_shared(sync_env, env); + env->init_instance(sync_env->store->svc.zone->get_realm(), instance_id, mgr); + } + + RGWCoroutine *start_sync(RGWDataSyncEnv *sync_env) override { + ldout(sync_env->cct, 5) << conf->id << ": start" << dendl; + return new RGWPSInitEnvCBCR(sync_env, env); + } + + RGWCoroutine *sync_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, + rgw_obj_key& key, std::optional versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 10) << conf->id << ": sync_object: b=" << bucket_info.bucket << + " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl; + return new RGWPSHandleObjCreateCR(sync_env, bucket_info, key, env, versioned_epoch); + } + + RGWCoroutine *remove_object(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, + rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 10) << conf->id << ": rm_object: b=" << bucket_info.bucket << + " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return new RGWPSGenericObjEventCBCR(sync_env, env, bucket_info, key, mtime, rgw::notify::ObjectRemovedDelete); + } + + RGWCoroutine *create_delete_marker(RGWDataSyncEnv *sync_env, RGWBucketInfo& bucket_info, + rgw_obj_key& key, real_time& mtime, rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override { + ldout(sync_env->cct, 10) << conf->id << ": create_delete_marker: b=" << bucket_info.bucket << + " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl; + return new RGWPSGenericObjEventCBCR(sync_env, env, bucket_info, key, mtime, rgw::notify::ObjectRemovedDeleteMarkerCreated); + } + + PSConfigRef& get_conf() { return conf; } +}; + 
+RGWPSSyncModuleInstance::RGWPSSyncModuleInstance(CephContext *cct, const JSONFormattable& config) +{ + data_handler = std::unique_ptr(new RGWPSDataSyncModule(cct, config)); + const std::string jconf = json_str("conf", *data_handler->get_conf()); + JSONParser p; + if (!p.parse(jconf.c_str(), jconf.size())) { + ldout(cct, 1) << "ERROR: failed to parse sync module effective conf: " << jconf << dendl; + effective_conf = config; + } else { + effective_conf.decode_json(&p); + } +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + if (!rgw::amqp::init(cct)) { + ldout(cct, 1) << "ERROR: failed to initialize AMQP manager in pubsub sync module" << dendl; + } +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + if (!rgw::kafka::init(cct)) { + ldout(cct, 1) << "ERROR: failed to initialize Kafka manager in pubsub sync module" << dendl; + } +#endif +} + +RGWPSSyncModuleInstance::~RGWPSSyncModuleInstance() { +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + rgw::amqp::shutdown(); +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + rgw::kafka::shutdown(); +#endif +} + +RGWDataSyncModule *RGWPSSyncModuleInstance::get_data_handler() +{ + return data_handler.get(); +} + +RGWRESTMgr *RGWPSSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) { + if (dialect != RGW_REST_S3) { + return orig; + } + return new RGWRESTMgr_PubSub(); +} + +bool RGWPSSyncModuleInstance::should_full_sync() const { + return data_handler->get_conf()->start_with_full_sync; +} + +int RGWPSSyncModule::create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) { + instance->reset(new RGWPSSyncModuleInstance(cct, config)); + return 0; +} + + diff --git a/src/rgw/rgw_sync_module_pubsub.h b/src/rgw/rgw_sync_module_pubsub.h new file mode 100644 index 00000000..68d39786 --- /dev/null +++ b/src/rgw/rgw_sync_module_pubsub.h @@ -0,0 +1,40 @@ +#ifndef CEPH_RGW_SYNC_MODULE_PUBSUB_H +#define CEPH_RGW_SYNC_MODULE_PUBSUB_H + +#include "rgw_sync_module.h" + +class RGWPSSyncModule : public RGWSyncModule { +public: + 
RGWPSSyncModule() {} + bool supports_data_export() override { + return false; + } + bool supports_writes() override { + return true; + } + int create_instance(CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; +}; + +class RGWPSDataSyncModule; +class RGWRESTConn; + +class RGWPSSyncModuleInstance : public RGWSyncModuleInstance { + std::unique_ptr data_handler; + JSONFormattable effective_conf; +public: + RGWPSSyncModuleInstance(CephContext *cct, const JSONFormattable& config); + ~RGWPSSyncModuleInstance(); + RGWDataSyncModule *get_data_handler() override; + RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override; + bool supports_user_writes() override { + return true; + } + const JSONFormattable& get_effective_conf() { + return effective_conf; + } + // start with full sync based on configuration + // default to incremental only + virtual bool should_full_sync() const override; +}; + +#endif diff --git a/src/rgw/rgw_sync_module_pubsub_rest.cc b/src/rgw/rgw_sync_module_pubsub_rest.cc new file mode 100644 index 00000000..aec5a346 --- /dev/null +++ b/src/rgw/rgw_sync_module_pubsub_rest.cc @@ -0,0 +1,526 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include +#include "rgw_rest_pubsub_common.h" +#include "rgw_rest_pubsub.h" +#include "rgw_sync_module_pubsub.h" +#include "rgw_pubsub_push.h" +#include "rgw_sync_module_pubsub_rest.h" +#include "rgw_pubsub.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_arn.h" +#include "rgw_zone.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +// command: PUT /topics/[&push-endpoint=[&=]] +class RGWPSCreateTopic_ObjStore : public RGWPSCreateTopicOp { +public: + int get_params() override { + + topic_name = s->object.name; + + opaque_data = s->info.args.get("OpaqueData"); + dest.push_endpoint = s->info.args.get("push-endpoint"); + + if 
(!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) { + return -EINVAL; + } + dest.push_endpoint_args = s->info.args.get_str(); + // dest object only stores endpoint info + // bucket to store events/records will be set only when subscription is created + dest.bucket_name = ""; + dest.oid_prefix = ""; + dest.arn_topic = topic_name; + // the topic ARN will be sent in the reply + const rgw::ARN arn(rgw::Partition::aws, rgw::Service::sns, + store->svc.zone->get_zonegroup().get_name(), + s->user->user_id.tenant, topic_name); + topic_arn = arn.to_string(); + return 0; + } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + + if (op_ret < 0) { + return; + } + + { + Formatter::ObjectSection section(*s->formatter, "result"); + encode_json("arn", topic_arn, s->formatter); + } + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +// command: GET /topics +class RGWPSListTopics_ObjStore : public RGWPSListTopicsOp { +public: + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + + if (op_ret < 0) { + return; + } + + encode_json("result", result, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +// command: GET /topics/ +class RGWPSGetTopic_ObjStore : public RGWPSGetTopicOp { +public: + int get_params() override { + topic_name = s->object.name; + return 0; + } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + + if (op_ret < 0) { + return; + } + + encode_json("result", result, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +// command: DELETE /topics/ +class RGWPSDeleteTopic_ObjStore : public RGWPSDeleteTopicOp { +public: + int get_params() override { + topic_name = s->object.name; + return 0; + } +}; + +// ceph 
specifc topics handler factory +class RGWHandler_REST_PSTopic : public RGWHandler_REST_S3 { +protected: + int init_permissions(RGWOp* op) override { + return 0; + } + + int read_permissions(RGWOp* op) override { + return 0; + } + + bool supports_quota() override { + return false; + } + + RGWOp *op_get() override { + if (s->init_state.url_bucket.empty()) { + return nullptr; + } + if (s->object.empty()) { + return new RGWPSListTopics_ObjStore(); + } + return new RGWPSGetTopic_ObjStore(); + } + RGWOp *op_put() override { + if (!s->object.empty()) { + return new RGWPSCreateTopic_ObjStore(); + } + return nullptr; + } + RGWOp *op_delete() override { + if (!s->object.empty()) { + return new RGWPSDeleteTopic_ObjStore(); + } + return nullptr; + } +public: + explicit RGWHandler_REST_PSTopic(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {} + virtual ~RGWHandler_REST_PSTopic() = default; +}; + +// command: PUT /subscriptions/?topic=[&push-endpoint=[&=]]... +class RGWPSCreateSub_ObjStore : public RGWPSCreateSubOp { +public: + int get_params() override { + sub_name = s->object.name; + + bool exists; + topic_name = s->info.args.get("topic", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'topic'" << dendl; + return -EINVAL; + } + + const auto psmodule = static_cast(store->get_sync_module().get()); + const auto& conf = psmodule->get_effective_conf(); + + dest.push_endpoint = s->info.args.get("push-endpoint"); + if (!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) { + return -EINVAL; + } + dest.push_endpoint_args = s->info.args.get_str(); + dest.bucket_name = string(conf["data_bucket_prefix"]) + s->owner.get_id().to_str() + "-" + topic_name; + dest.oid_prefix = string(conf["data_oid_prefix"]) + sub_name + "/"; + dest.arn_topic = topic_name; + + return 0; + } +}; + +// command: GET /subscriptions/ +class RGWPSGetSub_ObjStore : public RGWPSGetSubOp { +public: + int get_params() override { + sub_name = 
s->object.name; + return 0; + } + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + + if (op_ret < 0) { + return; + } + + encode_json("result", result, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +// command: DELETE /subscriptions/ +class RGWPSDeleteSub_ObjStore : public RGWPSDeleteSubOp { +public: + int get_params() override { + sub_name = s->object.name; + topic_name = s->info.args.get("topic"); + return 0; + } +}; + +// command: POST /subscriptions/?ack&event-id= +class RGWPSAckSubEvent_ObjStore : public RGWPSAckSubEventOp { +public: + explicit RGWPSAckSubEvent_ObjStore() {} + + int get_params() override { + sub_name = s->object.name; + + bool exists; + + event_id = s->info.args.get("event-id", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'event-id'" << dendl; + return -EINVAL; + } + return 0; + } +}; + +// command: GET /subscriptions/?events[&max-entries=][&marker=] +class RGWPSPullSubEvents_ObjStore : public RGWPSPullSubEventsOp { +public: + int get_params() override { + sub_name = s->object.name; + marker = s->info.args.get("marker"); + const int ret = s->info.args.get_int("max-entries", &max_entries, + RGWUserPubSub::Sub::DEFAULT_MAX_EVENTS); + if (ret < 0) { + ldout(s->cct, 1) << "failed to parse 'max-entries' param" << dendl; + return -EINVAL; + } + return 0; + } + + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + + if (op_ret < 0) { + return; + } + + encode_json("result", *sub, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } +}; + +// subscriptions handler factory +class RGWHandler_REST_PSSub : public RGWHandler_REST_S3 { +protected: + int init_permissions(RGWOp* op) override { + return 0; + } + + int read_permissions(RGWOp* op) override { + return 0; + } + bool supports_quota() 
override { + return false; + } + RGWOp *op_get() override { + if (s->object.empty()) { + return nullptr; + } + if (s->info.args.exists("events")) { + return new RGWPSPullSubEvents_ObjStore(); + } + return new RGWPSGetSub_ObjStore(); + } + RGWOp *op_put() override { + if (!s->object.empty()) { + return new RGWPSCreateSub_ObjStore(); + } + return nullptr; + } + RGWOp *op_delete() override { + if (!s->object.empty()) { + return new RGWPSDeleteSub_ObjStore(); + } + return nullptr; + } + RGWOp *op_post() override { + if (s->info.args.exists("ack")) { + return new RGWPSAckSubEvent_ObjStore(); + } + return nullptr; + } +public: + explicit RGWHandler_REST_PSSub(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {} + virtual ~RGWHandler_REST_PSSub() = default; +}; + +namespace { +// extract bucket name from ceph specific notification command, with the format: +// /notifications/ +int notif_bucket_path(const string& path, std::string& bucket_name) { + if (path.empty()) { + return -EINVAL; + } + size_t pos = path.find('/'); + if (pos == string::npos) { + return -EINVAL; + } + if (pos >= path.size()) { + return -EINVAL; + } + + string type = path.substr(0, pos); + if (type != "bucket") { + return -EINVAL; + } + + bucket_name = path.substr(pos + 1); + return 0; +} +} + +// command (ceph specific): PUT /notification/bucket/?topic= +class RGWPSCreateNotif_ObjStore : public RGWPSCreateNotifOp { +private: + std::string topic_name; + rgw::notify::EventTypeList events; + + int get_params() override { + bool exists; + topic_name = s->info.args.get("topic", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'topic'" << dendl; + return -EINVAL; + } + + std::string events_str = s->info.args.get("events", &exists); + if (!exists) { + // if no events are provided, we notify on all of them + events_str = "OBJECT_CREATE,OBJECT_DELETE,DELETE_MARKER_CREATE"; + } + rgw::notify::from_string_list(events_str, events); + if 
(std::find(events.begin(), events.end(), rgw::notify::UnknownEvent) != events.end()) { + ldout(s->cct, 1) << "invalid event type in list: " << events_str << dendl; + return -EINVAL; + } + return notif_bucket_path(s->object.name, bucket_name); + } + +public: + const char* name() const override { return "pubsub_notification_create"; } + void execute() override; +}; + +void RGWPSCreateNotif_ObjStore::execute() +{ + ups.emplace(store, s->owner.get_id()); + + auto b = ups->get_bucket(bucket_info.bucket); + op_ret = b->create_notification(topic_name, events); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to create notification for topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully created notification for topic '" << topic_name << "'" << dendl; +} + +// command: DELETE /notifications/bucket/?topic= +class RGWPSDeleteNotif_ObjStore : public RGWPSDeleteNotifOp { +private: + std::string topic_name; + + int get_params() override { + bool exists; + topic_name = s->info.args.get("topic", &exists); + if (!exists) { + ldout(s->cct, 1) << "missing required param 'topic'" << dendl; + return -EINVAL; + } + return notif_bucket_path(s->object.name, bucket_name); + } + +public: + void execute() override; + const char* name() const override { return "pubsub_notification_delete"; } +}; + +void RGWPSDeleteNotif_ObjStore::execute() { + op_ret = get_params(); + if (op_ret < 0) { + return; + } + + ups.emplace(store, s->owner.get_id()); + auto b = ups->get_bucket(bucket_info.bucket); + op_ret = b->remove_notification(topic_name); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to remove notification from topic '" << topic_name << "', ret=" << op_ret << dendl; + return; + } + ldout(s->cct, 20) << "successfully removed notification from topic '" << topic_name << "'" << dendl; +} + +// command: GET /notifications/bucket/ +class RGWPSListNotifs_ObjStore : public RGWPSListNotifsOp { +private: + rgw_pubsub_bucket_topics result; + + int 
get_params() override { + return notif_bucket_path(s->object.name, bucket_name); + } + +public: + void execute() override; + void send_response() override { + if (op_ret) { + set_req_state_err(s, op_ret); + } + dump_errno(s); + end_header(s, this, "application/json"); + + if (op_ret < 0) { + return; + } + encode_json("result", result, s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); + } + const char* name() const override { return "pubsub_notifications_list"; } +}; + +void RGWPSListNotifs_ObjStore::execute() +{ + ups.emplace(store, s->owner.get_id()); + auto b = ups->get_bucket(bucket_info.bucket); + op_ret = b->get_topics(&result); + if (op_ret < 0) { + ldout(s->cct, 1) << "failed to get topics, ret=" << op_ret << dendl; + return; + } +} + +// ceph specific notification handler factory +class RGWHandler_REST_PSNotifs : public RGWHandler_REST_S3 { +protected: + int init_permissions(RGWOp* op) override { + return 0; + } + + int read_permissions(RGWOp* op) override { + return 0; + } + bool supports_quota() override { + return false; + } + RGWOp *op_get() override { + if (s->object.empty()) { + return nullptr; + } + return new RGWPSListNotifs_ObjStore(); + } + RGWOp *op_put() override { + if (!s->object.empty()) { + return new RGWPSCreateNotif_ObjStore(); + } + return nullptr; + } + RGWOp *op_delete() override { + if (!s->object.empty()) { + return new RGWPSDeleteNotif_ObjStore(); + } + return nullptr; + } +public: + explicit RGWHandler_REST_PSNotifs(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {} + virtual ~RGWHandler_REST_PSNotifs() = default; +}; + +// factory for ceph specific PubSub REST handlers +RGWHandler_REST* RGWRESTMgr_PubSub::get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + if (RGWHandler_REST_S3::init_from_header(s, RGW_FORMAT_JSON, true) < 0) { + return nullptr; + } + + RGWHandler_REST* handler{nullptr}; + + // 
ceph specific PubSub API: topics/subscriptions/notification are reserved bucket names + // this API is available only on RGW that belong to a pubsub zone + if (s->init_state.url_bucket == "topics") { + handler = new RGWHandler_REST_PSTopic(auth_registry); + } else if (s->init_state.url_bucket == "subscriptions") { + handler = new RGWHandler_REST_PSSub(auth_registry); + } else if (s->init_state.url_bucket == "notifications") { + handler = new RGWHandler_REST_PSNotifs(auth_registry); + } else if (s->info.args.exists("notification")) { + const int ret = RGWHandler_REST::allocate_formatter(s, RGW_FORMAT_XML, true); + if (ret == 0) { + handler = new RGWHandler_REST_PSNotifs_S3(auth_registry); + } + } + + ldout(s->cct, 20) << __func__ << " handler=" << (handler ? typeid(*handler).name() : "") << dendl; + + return handler; +} + diff --git a/src/rgw/rgw_sync_module_pubsub_rest.h b/src/rgw/rgw_sync_module_pubsub_rest.h new file mode 100644 index 00000000..92fd8fe7 --- /dev/null +++ b/src/rgw/rgw_sync_module_pubsub_rest.h @@ -0,0 +1,13 @@ +#ifndef CEPH_RGW_SYNC_MODULE_PUBSUB_REST_H +#define CEPH_RGW_SYNC_MODULE_PUBSUB_REST_H + +#include "rgw_rest.h" + +class RGWRESTMgr_PubSub : public RGWRESTMgr { +public: + virtual RGWHandler_REST* get_handler(struct req_state* s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) override; +}; + +#endif diff --git a/src/rgw/rgw_sync_trace.cc b/src/rgw/rgw_sync_trace.cc new file mode 100644 index 00000000..34aa00e9 --- /dev/null +++ b/src/rgw/rgw_sync_trace.cc @@ -0,0 +1,288 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_TRACE_H +#define CEPH_RGW_SYNC_TRACE_H + +#include + +#include "common/debug.h" +#include "common/ceph_json.h" + +#include "rgw_sync_trace.h" +#include "rgw_rados.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw_sync + +RGWSyncTraceNode::RGWSyncTraceNode(CephContext *_cct, 
uint64_t _handle, + const RGWSyncTraceNodeRef& _parent, + const string& _type, const string& _id) : cct(_cct), + parent(_parent), + type(_type), + id(_id), + handle(_handle), + history(cct->_conf->rgw_sync_trace_per_node_log_size) +{ + if (parent.get()) { + prefix = parent->get_prefix(); + } + + if (!type.empty()) { + prefix += type; + if (!id.empty()) { + prefix += "[" + id + "]"; + } + prefix += ":"; + } +} + +void RGWSyncTraceNode::log(int level, const string& s) +{ + status = s; + history.push_back(status); + /* dump output on either rgw_sync, or rgw -- but only once */ + if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_sync, level)) { + lsubdout(cct, rgw_sync, + ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl; + } else { + lsubdout(cct, rgw, + ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl; + } +} + + +class RGWSyncTraceServiceMapThread : public RGWRadosThread { + RGWRados *store; + RGWSyncTraceManager *manager; + + uint64_t interval_msec() override { + return cct->_conf->rgw_sync_trace_servicemap_update_interval * 1000; + } +public: + RGWSyncTraceServiceMapThread(RGWRados *_store, RGWSyncTraceManager *_manager) + : RGWRadosThread(_store, "sync-trace"), store(_store), manager(_manager) {} + + int process() override; +}; + +int RGWSyncTraceServiceMapThread::process() +{ + map status; + status["current_sync"] = manager->get_active_names(); + int ret = store->update_service_map(std::move(status)); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: update_service_map() returned ret=" << ret << dendl; + } + return 0; +} + +RGWSyncTraceNodeRef RGWSyncTraceManager::add_node(const RGWSyncTraceNodeRef& parent, + const std::string& type, + const std::string& id) +{ + shunique_lock wl(lock, ceph::acquire_unique); + auto handle = alloc_handle(); + RGWSyncTraceNodeRef& ref = nodes[handle]; + ref.reset(new RGWSyncTraceNode(cct, handle, parent, type, id)); + // return a separate shared_ptr that calls finish() on the node 
instead of + // deleting it. the lambda capture holds a reference to the original 'ref' + auto deleter = [ref, this] (RGWSyncTraceNode *node) { finish_node(node); }; + return {ref.get(), deleter}; +} + +bool RGWSyncTraceNode::match(const string& search_term, bool search_history) +{ + try { + std::regex expr(search_term); + std::smatch m; + + if (regex_search(prefix, m, expr)) { + return true; + } + if (regex_search(status, m,expr)) { + return true; + } + if (!search_history) { + return false; + } + + for (auto h : history) { + if (regex_search(h, m, expr)) { + return true; + } + } + } catch (const std::regex_error& e) { + ldout(cct, 5) << "NOTICE: sync trace: bad expression: bad regex search term" << dendl; + } + + return false; +} + +void RGWSyncTraceManager::init(RGWRados *store) +{ + service_map_thread = new RGWSyncTraceServiceMapThread(store, this); + service_map_thread->start(); +} + +RGWSyncTraceManager::~RGWSyncTraceManager() +{ + cct->get_admin_socket()->unregister_commands(this); + service_map_thread->stop(); + delete service_map_thread; + + nodes.clear(); +} + +int RGWSyncTraceManager::hook_to_admin_command() +{ + AdminSocket *admin_socket = cct->get_admin_socket(); + + admin_commands = { { "sync trace show", "sync trace show name=search,type=CephString,req=false", "sync trace show [filter_str]: show current multisite tracing information" }, + { "sync trace history", "sync trace history name=search,type=CephString,req=false", "sync trace history [filter_str]: show history of multisite tracing information" }, + { "sync trace active", "sync trace active name=search,type=CephString,req=false", "show active multisite sync entities information" }, + { "sync trace active_short", "sync trace active_short name=search,type=CephString,req=false", "show active multisite sync entities entries" } }; + for (auto cmd : admin_commands) { + int r = admin_socket->register_command(cmd[0], cmd[1], this, + cmd[2]); + if (r < 0) { + lderr(cct) << "ERROR: fail to register admin 
socket command (r=" << r << ")" << dendl; + return r; + } + } + return 0; +} + +static void dump_node(RGWSyncTraceNode *entry, bool show_history, JSONFormatter& f) +{ + f.open_object_section("entry"); + ::encode_json("status", entry->to_str(), &f); + if (show_history) { + f.open_array_section("history"); + for (auto h : entry->get_history()) { + ::encode_json("entry", h, &f); + } + f.close_section(); + } + f.close_section(); +} + +string RGWSyncTraceManager::get_active_names() +{ + shunique_lock rl(lock, ceph::acquire_shared); + + stringstream ss; + JSONFormatter f; + + f.open_array_section("result"); + for (auto n : nodes) { + auto& entry = n.second; + + if (!entry->test_flags(RGW_SNS_FLAG_ACTIVE)) { + continue; + } + const string& name = entry->get_resource_name(); + if (!name.empty()) { + ::encode_json("entry", name, &f); + } + f.flush(ss); + } + f.close_section(); + f.flush(ss); + + return ss.str(); +} + +bool RGWSyncTraceManager::call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) { + + bool show_history = (command == "sync trace history"); + bool show_short = (command == "sync trace active_short"); + bool show_active = (command == "sync trace active") || show_short; + + string search; + + auto si = cmdmap.find("search"); + if (si != cmdmap.end()) { + search = boost::get(si->second); + } + + shunique_lock rl(lock, ceph::acquire_shared); + + stringstream ss; + JSONFormatter f(true); + + f.open_object_section("result"); + f.open_array_section("running"); + for (auto n : nodes) { + auto& entry = n.second; + + if (!search.empty() && !entry->match(search, show_history)) { + continue; + } + if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) { + continue; + } + if (show_short) { + const string& name = entry->get_resource_name(); + if (!name.empty()) { + ::encode_json("entry", name, &f); + } + } else { + dump_node(entry.get(), show_history, f); + } + f.flush(ss); + } + f.close_section(); + + 
f.open_array_section("complete"); + for (auto& entry : complete_nodes) { + if (!search.empty() && !entry->match(search, show_history)) { + continue; + } + if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) { + continue; + } + dump_node(entry.get(), show_history, f); + f.flush(ss); + } + f.close_section(); + + f.close_section(); + f.flush(ss); + out.append(ss); + + return true; +} + +void RGWSyncTraceManager::finish_node(RGWSyncTraceNode *node) +{ + RGWSyncTraceNodeRef old_node; + + { + shunique_lock wl(lock, ceph::acquire_unique); + if (!node) { + return; + } + auto iter = nodes.find(node->handle); + if (iter == nodes.end()) { + /* not found, already finished */ + return; + } + + if (complete_nodes.full()) { + /* take a reference to the entry that is going to be evicted, + * can't let it get evicted under lock held, otherwise + * it's a deadlock as it will call finish_node() + */ + old_node = complete_nodes.front(); + } + + complete_nodes.push_back(iter->second); + nodes.erase(iter); + } +}; + +#endif + diff --git a/src/rgw/rgw_sync_trace.h b/src/rgw/rgw_sync_trace.h new file mode 100644 index 00000000..d2925cf1 --- /dev/null +++ b/src/rgw/rgw_sync_trace.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_SYNC_LOG_H +#define CEPH_RGW_SYNC_LOG_H + +#include + +#include "common/Mutex.h" +#include "common/shunique_lock.h" +#include "common/admin_socket.h" + +#include +#include +#include +#include +#include + +#define SSTR(o) ({ \ + std::stringstream ss; \ + ss << o; \ + ss.str(); \ +}) + +#define RGW_SNS_FLAG_ACTIVE 1 +#define RGW_SNS_FLAG_ERROR 2 + +class RGWRados; +class RGWSyncTraceManager; +class RGWSyncTraceNode; +class RGWSyncTraceServiceMapThread; + +using RGWSyncTraceNodeRef = std::shared_ptr; + +class RGWSyncTraceNode final { + friend class RGWSyncTraceManager; + + CephContext *cct; + RGWSyncTraceNodeRef parent; + + uint16_t state{0}; + std::string status; + + 
Mutex lock{"RGWSyncTraceNode::lock"}; + + std::string type; + std::string id; + + std::string prefix; + + std::string resource_name; + + uint64_t handle; + + boost::circular_buffer history; + + // private constructor, create with RGWSyncTraceManager::add_node() + RGWSyncTraceNode(CephContext *_cct, uint64_t _handle, + const RGWSyncTraceNodeRef& _parent, + const std::string& _type, const std::string& _id); + + public: + void set_resource_name(const string& s) { + resource_name = s; + } + + const string& get_resource_name() { + return resource_name; + } + + void set_flag(uint16_t s) { + state |= s; + } + void unset_flag(uint16_t s) { + state &= ~s; + } + bool test_flags(uint16_t f) { + return (state & f) == f; + } + void log(int level, const std::string& s); + + std::string to_str() { + return prefix + " " + status; + } + + const string& get_prefix() { + return prefix; + } + + std::ostream& operator<<(std::ostream& os) { + os << to_str(); + return os; + } + + boost::circular_buffer& get_history() { + return history; + } + + bool match(const string& search_term, bool search_history); +}; + +class RGWSyncTraceManager : public AdminSocketHook { + friend class RGWSyncTraceNode; + + mutable std::shared_timed_mutex lock; + using shunique_lock = ceph::shunique_lock; + + CephContext *cct; + RGWSyncTraceServiceMapThread *service_map_thread{nullptr}; + + std::map nodes; + boost::circular_buffer complete_nodes; + + std::atomic count = { 0 }; + + std::list > admin_commands; + + uint64_t alloc_handle() { + return ++count; + } + void finish_node(RGWSyncTraceNode *node); + +public: + RGWSyncTraceManager(CephContext *_cct, int max_lru) : cct(_cct), complete_nodes(max_lru) {} + ~RGWSyncTraceManager(); + + void init(RGWRados *store); + + const RGWSyncTraceNodeRef root_node; + + RGWSyncTraceNodeRef add_node(const RGWSyncTraceNodeRef& parent, + const std::string& type, + const std::string& id = ""); + + int hook_to_admin_command(); + bool call(std::string_view command, const cmdmap_t& 
cmdmap, + std::string_view format, bufferlist& out) override; + string get_active_names(); +}; + + +#endif diff --git a/src/rgw/rgw_tag.cc b/src/rgw/rgw_tag.cc new file mode 100644 index 00000000..05c48bb1 --- /dev/null +++ b/src/rgw/rgw_tag.cc @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include +#include + +#include "rgw_tag.h" +#include "rgw_common.h" + +static constexpr uint32_t MAX_OBJ_TAGS=10; +static constexpr uint32_t MAX_TAG_KEY_SIZE=128; +static constexpr uint32_t MAX_TAG_VAL_SIZE=256; + +bool RGWObjTags::add_tag(const string&key, const string& val){ + return tag_map.emplace(std::make_pair(key,val)).second; +} + +bool RGWObjTags::emplace_tag(std::string&& key, std::string&& val){ + return tag_map.emplace(std::move(key), std::move(val)).second; +} + +int RGWObjTags::check_and_add_tag(const string&key, const string& val){ + if (tag_map.size() == MAX_OBJ_TAGS || + key.size() > MAX_TAG_KEY_SIZE || + val.size() > MAX_TAG_VAL_SIZE || + key.size() == 0){ + return -ERR_INVALID_TAG; + } + + // if we get a conflicting key, either the XML is malformed or the user + // supplied an invalid string + if (!add_tag(key,val)) + return -EINVAL; + + return 0; +} + +int RGWObjTags::set_from_string(const string& input){ + int ret=0; + vector kvs; + boost::split(kvs, input, boost::is_any_of("&")); + for (const auto& kv: kvs){ + auto p = kv.find("="); + string key,val; + if (p != string::npos) { + ret = check_and_add_tag(url_decode(kv.substr(0,p)), + url_decode(kv.substr(p+1))); + } else { + ret = check_and_add_tag(url_decode(kv)); + } + + if (ret < 0) + return ret; + } + return ret; +} diff --git a/src/rgw/rgw_tag.h b/src/rgw/rgw_tag.h new file mode 100644 index 00000000..80a18ae3 --- /dev/null +++ b/src/rgw/rgw_tag.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_TAG_H +#define RGW_TAG_H + 
+#include +#include +#include + +class RGWObjTags +{ +public: + using tag_map_t = boost::container::flat_map ; + +protected: + tag_map_t tag_map; + public: + RGWObjTags() = default; + ~RGWObjTags() = default; + + void encode(bufferlist& bl) const { + ENCODE_START(1,1,bl); + encode(tag_map, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + decode(tag_map,bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + bool add_tag(const std::string& key, const std::string& val=""); + bool emplace_tag(std::string&& key, std::string&& val); + int check_and_add_tag(const std::string& key, const std::string& val=""); + size_t count() const {return tag_map.size();} + int set_from_string(const std::string& input); + void clear() { tag_map.clear(); } + bool empty() const noexcept { return tag_map.empty(); } + const tag_map_t& get_tags() const {return tag_map;} +}; +WRITE_CLASS_ENCODER(RGWObjTags) + +#endif /* RGW_TAG_H */ diff --git a/src/rgw/rgw_tag_s3.cc b/src/rgw/rgw_tag_s3.cc new file mode 100644 index 00000000..c5ad87ca --- /dev/null +++ b/src/rgw/rgw_tag_s3.cc @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include "include/types.h" + +#include "rgw_tag_s3.h" + +void RGWObjTagEntry_S3::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Key", key, obj, true); + RGWXMLDecoder::decode_xml("Value", val, obj, true); +} + +void RGWObjTagEntry_S3::dump_xml(Formatter *f) const { + encode_xml("Key", key, f); + encode_xml("Value", val, f); + + if (key.empty()) { + throw RGWXMLDecoder::err("empty key"); + } + + if (val.empty()) { + throw RGWXMLDecoder::err("empty val"); + } +} + +void RGWObjTagSet_S3::decode_xml(XMLObj *obj) { + vector entries; + + RGWXMLDecoder::decode_xml("Tag", entries, obj, true); + + for (auto& entry : entries) { + const std::string& key = entry.get_key(); + const 
std::string& val = entry.get_val(); + if (!add_tag(key,val)) { + throw RGWXMLDecoder::err("failed to add tag"); + } + } +} + +int RGWObjTagSet_S3::rebuild(RGWObjTags& dest) { + int ret; + for (const auto &it : tag_map){ + ret = dest.check_and_add_tag(it.first, it.second); + if (ret < 0) + return ret; + } + return 0; +} + +void RGWObjTagging_S3::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("TagSet", tagset, obj, true); +} + +void RGWObjTagSet_S3::dump_xml(Formatter *f) const { + for (const auto& tag : tag_map){ + Formatter::ObjectSection os(*f, "Tag"); + encode_xml("Key", tag.first, f); + encode_xml("Value", tag.second, f); + } +} + diff --git a/src/rgw/rgw_tag_s3.h b/src/rgw/rgw_tag_s3.h new file mode 100644 index 00000000..7ed02277 --- /dev/null +++ b/src/rgw/rgw_tag_s3.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef RGW_TAG_S3_H +#define RGW_TAG_S3_H + +#include +#include +#include +#include +#include +#include + +#include "rgw_tag.h" +#include "rgw_xml.h" + +class RGWObjTagEntry_S3 +{ + std::string key; + std::string val; +public: + RGWObjTagEntry_S3() {} + RGWObjTagEntry_S3(const std::string &k, const std::string &v):key(k),val(v) {}; + ~RGWObjTagEntry_S3() {} + + const std::string& get_key () const { return key; } + const std::string& get_val () const { return val; } + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class RGWObjTagSet_S3: public RGWObjTags +{ +public: + int rebuild(RGWObjTags& dest); + + void dump_xml(Formatter *f) const; + void decode_xml(XMLObj *obj); +}; + +class RGWObjTagging_S3 +{ + RGWObjTagSet_S3 tagset; +public: + void decode_xml(XMLObj *obj); + int rebuild(RGWObjTags& dest) { + return tagset.rebuild(dest); + } +}; + + +#endif /* RGW_TAG_S3_H */ diff --git a/src/rgw/rgw_tar.h b/src/rgw/rgw_tar.h new file mode 100644 index 00000000..b322a291 --- /dev/null +++ b/src/rgw/rgw_tar.h @@ -0,0 +1,156 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_TAR_H +#define CEPH_RGW_TAR_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace rgw { +namespace tar { + +static constexpr size_t BLOCK_SIZE = 512; + + +static inline std::pair> +interpret_block(const StatusIndicator& status, ceph::bufferlist& bl); + + +class StatusIndicator { + friend std::pair> + interpret_block(const StatusIndicator& status, ceph::bufferlist& bl); + + bool is_empty; + bool is_eof; + + StatusIndicator() + : is_empty(false), + is_eof(false) { + } + + StatusIndicator(const StatusIndicator& prev_status, + const bool is_empty) + : is_empty(is_empty), + is_eof(is_empty && prev_status.empty()) { + } + +public: + bool empty() const { + return is_empty; + } + + bool eof() const { + return is_eof; + } + + static StatusIndicator create() { + return StatusIndicator(); + } +} /* class StatusIndicator */; + + +enum class FileType : char { + UNKNOWN = '\0', + + /* The tar format uses ASCII encoding. */ + NORMAL_FILE = '0', + DIRECTORY = '5' +}; /* enum class FileType */ + +class HeaderView { +protected: + /* Everythng is char here (ASCII encoding), so we don't need to worry about + * the struct padding. */ + const struct header_t { + char filename[100]; + char __filemode[8]; + char __owner_id[8]; + char __group_id[8]; + char filesize[12]; + char lastmod[12]; + char checksum[8]; + char filetype; + char __padding[355]; + } *header; + + static_assert(sizeof(*header) == BLOCK_SIZE, + "The TAR header must be exactly BLOCK_SIZE length"); + + /* The label is far more imporant from what the code really does. 
*/ + static size_t pos2len(const size_t pos) { + return pos + 1; + } + +public: + explicit HeaderView(const char (&header)[BLOCK_SIZE]) + : header(reinterpret_cast(header)) { + } + + FileType get_filetype() const { + switch (header->filetype) { + case static_cast(FileType::NORMAL_FILE): + return FileType::NORMAL_FILE; + case static_cast(FileType::DIRECTORY): + return FileType::DIRECTORY; + default: + return FileType::UNKNOWN; + } + } + + boost::string_ref get_filename() const { + return boost::string_ref(header->filename, + std::min(sizeof(header->filename), + strlen(header->filename))); + } + + size_t get_filesize() const { + /* The string_ref is pretty suitable here because tar encodes its + * metadata in ASCII. */ + const boost::string_ref raw(header->filesize, sizeof(header->filesize)); + + /* We need to find where the padding ends. */ + const auto pad_ends_at = std::min(raw.find_last_not_of('\0'), + raw.find_last_not_of(' ')); + const auto trimmed = raw.substr(0, + pad_ends_at == boost::string_ref::npos ? 
boost::string_ref::npos + : pos2len(pad_ends_at)); + + size_t sum = 0, mul = 1; + for (const char c : boost::adaptors::reverse(trimmed)) { + sum += (c - '0') * mul; + mul *= 8; + } + + return sum; + } +}; /* class Header */ + + +static inline std::pair> +interpret_block(const StatusIndicator& status, ceph::bufferlist& bl) { + static constexpr std::array zero_block = {0, }; + const char (&block)[BLOCK_SIZE] = \ + reinterpret_cast(*bl.c_str()); + + if (std::memcmp(zero_block.data(), block, BLOCK_SIZE) == 0) { + return std::make_pair(StatusIndicator(status, true), boost::none); + } else { + return std::make_pair(StatusIndicator(status, false), HeaderView(block)); + } +} + +} /* namespace tar */ +} /* namespace rgw */ + +#endif /* CEPH_RGW_TAR_H */ diff --git a/src/rgw/rgw_token.cc b/src/rgw/rgw_token.cc new file mode 100644 index 00000000..a5c6f76e --- /dev/null +++ b/src/rgw/rgw_token.cc @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include + +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/debug.h" +#include "global/global_init.h" +#include "include/ceph_assert.h" +#include "include/str_list.h" + +#include "rgw_token.h" +#include "rgw_b64.h" + +#define dout_subsys ceph_subsys_rgw + +namespace { + + using namespace rgw; + using std::get; + using std::string; + + RGWToken::token_type type{RGWToken::TOKEN_NONE}; + string access_key{""}; + string secret_key{""}; + + Formatter* formatter{nullptr}; + + bool verbose {false}; + bool do_encode {false}; + bool do_decode {false}; + +} + +void usage() +{ + cout << "usage: radosgw-token --encode --ttype= [options...]" << std::endl; + cout << "\t(maybe exporting RGW_ACCESS_KEY_ID and RGW_SECRET_ACCESS_KEY)" + << std::endl; + cout << "\t := ad | ldap" << std::endl; + cout << "\n"; + generic_client_usage(); +} +
/**
 * radosgw-token entry point: build an RGW token from the access/secret key
 * (environment or command line) and print it base64-encoded.
 *
 * Keys are read from RGW_ACCESS_KEY_ID / RGW_SECRET_ACCESS_KEY first and may
 * be overridden with --access / --secret. --encode and a --ttype of "ad" or
 * "ldap" are mandatory.
 *
 * @return 0 on success, -EINVAL when --encode or a valid --ttype is missing
 */
int main(int argc, char **argv)
{
  std::string val;
  vector<const char*> args;
  argv_to_vec(argc, (const char **)argv, args);
  if (args.empty()) {
    cerr << argv[0] << ": -h or --help for usage" << std::endl;
    exit(1);
  }
  if (ceph_argparse_need_usage(args)) {
    usage();
    exit(0);
  }

  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
                         CODE_ENVIRONMENT_UTILITY, 0);
  common_init_finish(g_ceph_context);

  // environment provides the defaults; command line options override them
  if (const char *v = getenv("RGW_ACCESS_KEY_ID")) {
    access_key = v;
  }
  if (const char *v = getenv("RGW_SECRET_ACCESS_KEY")) {
    secret_key = v;
  }

  for (auto arg_iter = args.begin(); arg_iter != args.end();) {
    if (ceph_argparse_witharg(args, arg_iter, &val, "--access",
                              (char*) nullptr)) {
      access_key = val;
    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
                                     (char*) nullptr)) {
      secret_key = val;
    } else if (ceph_argparse_witharg(args, arg_iter, &val, "--ttype",
                                     (char*) nullptr)) {
      // only these token types may be generated by this tool
      for (const auto& ttype : {"ad", "ldap"}) {
        if (boost::iequals(val, ttype)) {
          type = RGWToken::to_type(val);
          break;
        }
      }
    } else if (ceph_argparse_flag(args, arg_iter, "--encode",
                                  (char*) nullptr)) {
      do_encode = true;
    } else if (ceph_argparse_flag(args, arg_iter, "--decode",
                                  (char*) nullptr)) {
      do_decode = true;
    } else if (ceph_argparse_flag(args, arg_iter, "--verbose",
                                  (char*) nullptr)) {
      verbose = true;
    } else {
      ++arg_iter;
    }
  }

  if ((! do_encode) ||
      (type == RGWToken::TOKEN_NONE)) {
    // tell the user why nothing was produced instead of failing silently
    cerr << argv[0] << ": --encode and a valid --ttype are required"
         << " (-h or --help for usage)" << std::endl;
    return -EINVAL;
  }

  // the formatter only has to live for the rest of main(), so keep it on the
  // stack instead of leaking a heap allocation
  JSONFormatter json_formatter(true /* pretty */);
  formatter = &json_formatter;

  RGWToken token(type, access_key, secret_key);
  token.encode_json(formatter);
  std::ostringstream os;
  formatter->flush(os);
  string token_str = os.str();
  if (verbose) {
    std::cout << "expanded token: " << token_str << std::endl;
    if (do_decode) {
      RGWToken token2(token_str);
      std::cout << "decoded expanded token: " << token2 << std::endl;
    }
  }
  std::cout << to_base64(token_str) << std::endl;

  formatter = nullptr;
  return 0;
}
diff --git a/src/rgw/rgw_token.h b/src/rgw/rgw_token.h new file mode 100644 index 00000000..8f50133d --- /dev/null +++ b/src/rgw/rgw_token.h @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef RGW_TOKEN_H +#define RGW_TOKEN_H + +#include +#include +#include + +#include "common/ceph_json.h" +#include "common/Formatter.h" +#include "rgw/rgw_b64.h" + +namespace rgw { + + using std::string; + + class RGWToken { + public: + static constexpr auto type_name = "RGW_TOKEN"; + + enum token_type : uint32_t { + TOKEN_NONE, + TOKEN_AD, + TOKEN_KEYSTONE, + TOKEN_LDAP, + }; + + static enum token_type to_type(const string& s) { + if (boost::iequals(s, "ad")) + return TOKEN_AD; + if (boost::iequals(s, "ldap")) + return TOKEN_LDAP; + if (boost::iequals(s, "keystone")) + return TOKEN_KEYSTONE; + return TOKEN_NONE; + } + + static const char* from_type(enum token_type type) { + switch (type) { + case TOKEN_AD: + return "ad"; + break; + case TOKEN_LDAP: + return "ldap"; + break; + case TOKEN_KEYSTONE: + return "keystone"; + break; + default: + return "none"; + }; + } + + token_type type; + string id; + string key; + + virtual uint32_t version() const { return 1; }; + + bool valid() const{ + return ((type != TOKEN_NONE) && + (! id.empty()) && + (! 
key.empty())); + } + + RGWToken() + : type(TOKEN_NONE) {}; + + RGWToken(enum token_type _type, const std::string& _id, + const std::string& _key) + : type(_type), id(_id), key(_key) {}; + + RGWToken(const string& json) { + JSONParser p; + p.parse(json.c_str(), json.length()); + JSONDecoder::decode_json(RGWToken::type_name, *this, &p); + } + + void encode(bufferlist& bl) const { + uint32_t ver = version(); + string typestr{from_type(type)}; + ENCODE_START(1, 1, bl); + encode(type_name, bl); + encode(ver, bl); + encode(typestr, bl); + encode(id, bl); + encode(key, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + string name; + string typestr; + uint32_t version; + DECODE_START(1, bl); + decode(name, bl); + decode(version, bl); + decode(typestr, bl); + type = to_type(typestr); + decode(id, bl); + decode(key, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter* f) const { + ::encode_json("version", uint32_t(version()), f); + ::encode_json("type", from_type(type), f); + ::encode_json("id", id, f); + ::encode_json("key", key, f); + } + + void encode_json(Formatter* f) { + RGWToken& token = *this; + f->open_object_section(type_name); + ::encode_json(type_name, token, f); + f->close_section(); + } + + void decode_json(JSONObj* obj) { + uint32_t version; + string type_name; + string typestr; + JSONDecoder::decode_json("version", version, obj); + JSONDecoder::decode_json("type", typestr, obj); + type = to_type(typestr); + JSONDecoder::decode_json("id", id, obj); + JSONDecoder::decode_json("key", key, obj); + } + + std::string encode_json_base64(Formatter* f) { + encode_json(f); + std::ostringstream os; + f->flush(os); + return to_base64(std::move(os.str())); + } + + friend inline ostream& operator<<(ostream& os, const RGWToken& token); + + virtual ~RGWToken() {}; + }; + WRITE_CLASS_ENCODER(RGWToken) + + inline ostream& operator<<(ostream& os, const RGWToken& token) + { + os << "<>"; + return os; + } + +} /* namespace rgw */ + +#endif /* 
RGW_TOKEN_H */ diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc new file mode 100644 index 00000000..057535e4 --- /dev/null +++ b/src/rgw/rgw_tools.cc @@ -0,0 +1,527 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "common/errno.h" +#include "common/safe_io.h" +#include "librados/librados_asio.h" +#include "common/async/yield_context.h" + +#include "include/types.h" +#include "include/stringify.h" + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_tools.h" +#include "rgw_acl_s3.h" +#include "rgw_op.h" +#include "rgw_putobj_processor.h" +#include "rgw_aio_throttle.h" +#include "rgw_compression.h" +#include "rgw_zone.h" +#include "osd/osd_types.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_zone_utils.h" + +#define dout_subsys ceph_subsys_rgw +#define dout_context g_ceph_context + +#define READ_CHUNK_LEN (512 * 1024) + +static std::map* ext_mime_map; + +int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, + librados::IoCtx& ioctx, bool create, + bool mostly_omap) +{ + int r = rados->ioctx_create(pool.name.c_str(), ioctx); + if (r == -ENOENT && create) { + r = rados->pool_create(pool.name.c_str()); + if (r == -ERANGE) { + dout(0) + << __func__ + << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r) + << " (this can be due to a pool or placement group misconfiguration, e.g." 
+ << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)" + << dendl; + } + if (r < 0 && r != -EEXIST) { + return r; + } + + r = rados->ioctx_create(pool.name.c_str(), ioctx); + if (r < 0) { + return r; + } + + r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false); + if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + if (mostly_omap) { + // set pg_autoscale_bias + bufferlist inbl; + float bias = g_conf().get_val("rgw_rados_pool_autoscale_bias"); + int r = rados->mon_command( + "{\"prefix\": \"osd pool set\", \"pool\": \"" + + pool.name + "\", \"var\": \"pg_autoscale_bias\": \"" + + stringify(bias) + "\"}", + inbl, NULL, NULL); + if (r < 0) { + dout(10) << __func__ << " warning: failed to set pg_autoscale_bias on " + << pool.name << dendl; + } + // set pg_num_min + int min = g_conf().get_val("rgw_rados_pool_pg_num_min"); + r = rados->mon_command( + "{\"prefix\": \"osd pool set\", \"pool\": \"" + + pool.name + "\", \"var\": \"pg_num_min\": \"" + + stringify(min) + "\"}", + inbl, NULL, NULL); + if (r < 0) { + dout(10) << __func__ << " warning: failed to set pg_num_min on " + << pool.name << dendl; + } + } + } else if (r < 0) { + return r; + } + if (!pool.ns.empty()) { + ioctx.set_namespace(pool.ns); + } + return 0; +} + +int rgw_put_system_obj(RGWRados *rgwstore, const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive, + RGWObjVersionTracker *objv_tracker, real_time set_mtime, map *pattrs) +{ + map no_attrs; + if (!pattrs) { + pattrs = &no_attrs; + } + + rgw_raw_obj obj(pool, oid); + + auto obj_ctx = rgwstore->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + int ret = sysobj.wop() + .set_objv_tracker(objv_tracker) + .set_exclusive(exclusive) + .set_mtime(set_mtime) + .set_attrs(*pattrs) + .write(data); + + if (ret == -ENOENT) { + ret = rgwstore->create_pool(pool); + if (ret >= 0) { + ret = sysobj.wop() + .set_objv_tracker(objv_tracker) + .set_exclusive(exclusive) + .set_mtime(set_mtime) + .set_attrs(*pattrs) + 
.write(data); + } + } + + return ret; +} + +int rgw_get_system_obj(RGWRados *rgwstore, RGWSysObjectCtx& obj_ctx, const rgw_pool& pool, const string& key, bufferlist& bl, + RGWObjVersionTracker *objv_tracker, real_time *pmtime, map *pattrs, + rgw_cache_entry_info *cache_info, boost::optional refresh_version) +{ + bufferlist::iterator iter; + int request_len = READ_CHUNK_LEN; + rgw_raw_obj obj(pool, key); + + obj_version original_readv; + if (objv_tracker && !objv_tracker->read_version.empty()) { + original_readv = objv_tracker->read_version; + } + + do { + auto sysobj = obj_ctx.get_obj(obj); + auto rop = sysobj.rop(); + + int ret = rop.set_attrs(pattrs) + .set_last_mod(pmtime) + .set_objv_tracker(objv_tracker) + .stat(); + if (ret < 0) + return ret; + + ret = rop.set_cache_info(cache_info) + .set_refresh_version(refresh_version) + .read(&bl); + if (ret == -ECANCELED) { + /* raced, restart */ + if (!original_readv.empty()) { + /* we were asked to read a specific obj_version, failed */ + return ret; + } + if (objv_tracker) { + objv_tracker->read_version.clear(); + } + sysobj.invalidate(); + continue; + } + if (ret < 0) + return ret; + + if (ret < request_len) + break; + bl.clear(); + request_len *= 2; + } while (true); + + return 0; +} + +int rgw_delete_system_obj(RGWRados *rgwstore, const rgw_pool& pool, const string& oid, + RGWObjVersionTracker *objv_tracker) +{ + auto obj_ctx = rgwstore->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(rgw_raw_obj{pool, oid}); + rgw_raw_obj obj(pool, oid); + return sysobj.wop() + .set_objv_tracker(objv_tracker) + .remove(); +} + +thread_local bool is_asio_thread = false; + +int rgw_rados_operate(librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectReadOperation *op, bufferlist* pbl, + optional_yield y) +{ +#ifdef HAVE_BOOST_CONTEXT + // given a yield_context, call async_operate() to yield the coroutine instead + // of blocking + if (y) { + auto& context = y.get_io_context(); + auto& yield = 
y.get_yield_context(); + boost::system::error_code ec; + auto bl = librados::async_operate(context, ioctx, oid, op, 0, yield[ec]); + if (pbl) { + *pbl = std::move(bl); + } + return -ec.value(); + } + // work on asio threads should be asynchronous, so warn when they block + if (is_asio_thread) { + dout(20) << "WARNING: blocking librados call" << dendl; + } +#endif + return ioctx.operate(oid, op, nullptr); +} + +int rgw_rados_operate(librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectWriteOperation *op, optional_yield y) +{ +#ifdef HAVE_BOOST_CONTEXT + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + boost::system::error_code ec; + librados::async_operate(context, ioctx, oid, op, 0, yield[ec]); + return -ec.value(); + } + if (is_asio_thread) { + dout(20) << "WARNING: blocking librados call" << dendl; + } +#endif + return ioctx.operate(oid, op); +} + +void parse_mime_map_line(const char *start, const char *end) +{ + char line[end - start + 1]; + strncpy(line, start, end - start); + line[end - start] = '\0'; + char *l = line; +#define DELIMS " \t\n\r" + + while (isspace(*l)) + l++; + + char *mime = strsep(&l, DELIMS); + if (!mime) + return; + + char *ext; + do { + ext = strsep(&l, DELIMS); + if (ext && *ext) { + (*ext_mime_map)[ext] = mime; + } + } while (ext); +} + + +void parse_mime_map(const char *buf) +{ + const char *start = buf, *end = buf; + while (*end) { + while (*end && *end != '\n') { + end++; + } + parse_mime_map_line(start, end); + end++; + start = end; + } +} + +static int ext_mime_map_init(CephContext *cct, const char *ext_map) +{ + int fd = open(ext_map, O_RDONLY); + char *buf = NULL; + int ret; + if (fd < 0) { + ret = -errno; + ldout(cct, 0) << __func__ << " failed to open file=" << ext_map + << " : " << cpp_strerror(-ret) << dendl; + return ret; + } + + struct stat st; + ret = fstat(fd, &st); + if (ret < 0) { + ret = -errno; + ldout(cct, 0) << __func__ << " failed to stat file=" << ext_map + << " : 
" << cpp_strerror(-ret) << dendl; + goto done; + } + + buf = (char *)malloc(st.st_size + 1); + if (!buf) { + ret = -ENOMEM; + ldout(cct, 0) << __func__ << " failed to allocate buf" << dendl; + goto done; + } + + ret = safe_read(fd, buf, st.st_size + 1); + if (ret != st.st_size) { + // huh? file size has changed? + ldout(cct, 0) << __func__ << " raced! will retry.." << dendl; + free(buf); + close(fd); + return ext_mime_map_init(cct, ext_map); + } + buf[st.st_size] = '\0'; + + parse_mime_map(buf); + ret = 0; +done: + free(buf); + close(fd); + return ret; +} + +const char *rgw_find_mime_by_ext(string& ext) +{ + map::iterator iter = ext_mime_map->find(ext); + if (iter == ext_mime_map->end()) + return NULL; + + return iter->second.c_str(); +} + +void rgw_filter_attrset(map& unfiltered_attrset, const string& check_prefix, + map *attrset) +{ + attrset->clear(); + map::iterator iter; + for (iter = unfiltered_attrset.lower_bound(check_prefix); + iter != unfiltered_attrset.end(); ++iter) { + if (!boost::algorithm::starts_with(iter->first, check_prefix)) + break; + (*attrset)[iter->first] = iter->second; + } +} + +RGWDataAccess::RGWDataAccess(RGWRados *_store) : store(_store) +{ + sysobj_ctx = std::make_unique(store->svc.sysobj->init_obj_ctx()); +} + + +int RGWDataAccess::Bucket::finish_init() +{ + auto iter = attrs.find(RGW_ATTR_ACL); + if (iter == attrs.end()) { + return 0; + } + + bufferlist::const_iterator bliter = iter->second.begin(); + try { + policy.decode(bliter); + } catch (buffer::error& err) { + return -EIO; + } + + return 0; +} + +int RGWDataAccess::Bucket::init() +{ + int ret = sd->store->get_bucket_info(*sd->sysobj_ctx, + tenant, name, + bucket_info, + &mtime, + &attrs); + if (ret < 0) { + return ret; + } + + return finish_init(); +} + +int RGWDataAccess::Bucket::init(const RGWBucketInfo& _bucket_info, + const map& _attrs) +{ + bucket_info = _bucket_info; + attrs = _attrs; + + return finish_init(); +} + +int RGWDataAccess::Bucket::get_object(const rgw_obj_key& 
key, + ObjectRef *obj) { + obj->reset(new Object(sd, shared_from_this(), key)); + return 0; +} + +int RGWDataAccess::Object::put(bufferlist& data, + map& attrs) +{ + RGWRados *store = sd->store; + CephContext *cct = store->ctx(); + + string tag; + append_rand_alpha(cct, tag, tag, 32); + + RGWBucketInfo& bucket_info = bucket->bucket_info; + + using namespace rgw::putobj; + rgw::AioThrottle aio(store->ctx()->_conf->rgw_put_obj_min_window_size); + + RGWObjectCtx obj_ctx(store); + rgw_obj obj(bucket_info.bucket, key); + + auto& owner = bucket->policy.get_owner(); + + string req_id = store->svc.zone_utils->unique_id(store->get_new_req_id()); + + AtomicObjectProcessor processor(&aio, store, bucket_info, + nullptr, + owner.get_id(), + obj_ctx, obj, olh_epoch, req_id); + + int ret = processor.prepare(); + if (ret < 0) + return ret; + + using namespace rgw::putobj; + + DataProcessor *filter = &processor; + + CompressorRef plugin; + boost::optional compressor; + + const auto& compression_type = store->svc.zone->get_zone_params().get_compression_type(bucket_info.placement_rule); + if (compression_type != "none") { + plugin = Compressor::create(store->ctx(), compression_type); + if (!plugin) { + ldout(store->ctx(), 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } else { + compressor.emplace(store->ctx(), plugin, filter); + filter = &*compressor; + } + } + + off_t ofs = 0; + auto obj_size = data.length(); + + RGWMD5Etag etag_calc; + + do { + size_t read_len = std::min(data.length(), (unsigned int)cct->_conf->rgw_max_chunk_size); + + bufferlist bl; + + data.splice(0, read_len, &bl); + etag_calc.update(bl); + + ret = filter->process(std::move(bl), ofs); + if (ret < 0) + return ret; + + ofs += read_len; + } while (data.length() > 0); + + ret = filter->process({}, ofs); + if (ret < 0) { + return ret; + } + bool has_etag_attr = false; + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + etag = 
bl.to_str(); + has_etag_attr = true; + } + + if (!aclbl) { + RGWAccessControlPolicy_S3 policy(cct); + + policy.create_canned(bucket->policy.get_owner(), bucket->policy.get_owner(), string()); /* default private policy */ + + policy.encode(aclbl.emplace()); + } + + if (etag.empty()) { + etag_calc.finish(&etag); + } + + if (!has_etag_attr) { + bufferlist etagbl; + etagbl.append(etag); + attrs[RGW_ATTR_ETAG] = etagbl; + } + attrs[RGW_ATTR_ACL] = *aclbl; + + string *puser_data = nullptr; + if (user_data) { + puser_data = &(*user_data); + } + + return processor.complete(obj_size, etag, + &mtime, mtime, + attrs, delete_at, + nullptr, nullptr, + puser_data, + nullptr, nullptr); +} + +void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy) +{ + policy.encode(aclbl.emplace()); +} + +int rgw_tools_init(CephContext *cct) +{ + ext_mime_map = new std::map; + ext_mime_map_init(cct, cct->_conf->rgw_mime_types_file.c_str()); + // ignore errors; missing mime.types is not fatal + return 0; +} + +void rgw_tools_cleanup() +{ + delete ext_mime_map; + ext_mime_map = nullptr; +} diff --git a/src/rgw/rgw_tools.h b/src/rgw/rgw_tools.h new file mode 100644 index 00000000..0e8b1621 --- /dev/null +++ b/src/rgw/rgw_tools.h @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_TOOLS_H +#define CEPH_RGW_TOOLS_H + +#include + +#include "include/types.h" +#include "common/ceph_time.h" +#include "rgw_common.h" + +class RGWRados; +class RGWSysObjectCtx; +struct RGWObjVersionTracker; +class optional_yield; + +struct obj_version; + +int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, + librados::IoCtx& ioctx, + bool create = false, + bool mostly_omap = false); + +int rgw_put_system_obj(RGWRados *rgwstore, const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive, + RGWObjVersionTracker *objv_tracker, real_time set_mtime, map *pattrs = NULL); +int 
rgw_get_system_obj(RGWRados *rgwstore, RGWSysObjectCtx& obj_ctx, const rgw_pool& pool, const string& key, bufferlist& bl, + RGWObjVersionTracker *objv_tracker, real_time *pmtime, map *pattrs = NULL, + rgw_cache_entry_info *cache_info = NULL, + boost::optional refresh_version = boost::none); +int rgw_delete_system_obj(RGWRados *rgwstore, const rgw_pool& pool, const string& oid, + RGWObjVersionTracker *objv_tracker); + +const char *rgw_find_mime_by_ext(string& ext); + +void rgw_filter_attrset(map& unfiltered_attrset, const string& check_prefix, + map *attrset); + +/// indicates whether the current thread is in boost::asio::io_context::run(), +/// used to log warnings if synchronous librados calls are made +extern thread_local bool is_asio_thread; + +/// perform the rados operation, using the yield context when given +int rgw_rados_operate(librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectReadOperation *op, bufferlist* pbl, + optional_yield y); +int rgw_rados_operate(librados::IoCtx& ioctx, const std::string& oid, + librados::ObjectWriteOperation *op, optional_yield y); + +int rgw_tools_init(CephContext *cct); +void rgw_tools_cleanup(); + +template +class RGWEtag +{ + H hash; + +public: + RGWEtag() {} + + void update(const char *buf, size_t len) { + hash.Update((const unsigned char *)buf, len); + } + + void update(bufferlist& bl) { + if (bl.length() > 0) { + update(bl.c_str(), bl.length()); + } + } + + void update(const string& s) { + if (!s.empty()) { + update(s.c_str(), s.size()); + } + } + void finish(string *etag) { + char etag_buf[S]; + char etag_buf_str[S * 2 + 16]; + + hash.Final((unsigned char *)etag_buf); + buf_to_hex((const unsigned char *)etag_buf, S, + etag_buf_str); + + *etag = etag_buf_str; + } +}; + +using RGWMD5Etag = RGWEtag; + +class RGWDataAccess +{ + RGWRados *store; + std::unique_ptr sysobj_ctx; + +public: + RGWDataAccess(RGWRados *_store); + + class Object; + class Bucket; + + using BucketRef = std::shared_ptr; + using ObjectRef = 
std::shared_ptr; + + class Bucket : public enable_shared_from_this { + friend class RGWDataAccess; + friend class Object; + + RGWDataAccess *sd{nullptr}; + RGWBucketInfo bucket_info; + string tenant; + string name; + string bucket_id; + ceph::real_time mtime; + map attrs; + + RGWAccessControlPolicy policy; + int finish_init(); + + Bucket(RGWDataAccess *_sd, + const string& _tenant, + const string& _name, + const string& _bucket_id) : sd(_sd), + tenant(_tenant), + name(_name), + bucket_id(_bucket_id) {} + Bucket(RGWDataAccess *_sd) : sd(_sd) {} + int init(); + int init(const RGWBucketInfo& _bucket_info, const map& _attrs); + public: + int get_object(const rgw_obj_key& key, + ObjectRef *obj); + + }; + + + class Object { + RGWDataAccess *sd{nullptr}; + BucketRef bucket; + rgw_obj_key key; + + ceph::real_time mtime; + string etag; + std::optional olh_epoch; + ceph::real_time delete_at; + std::optional user_data; + + std::optional aclbl; + + Object(RGWDataAccess *_sd, + BucketRef&& _bucket, + const rgw_obj_key& _key) : sd(_sd), + bucket(_bucket), + key(_key) {} + public: + int put(bufferlist& data, map& attrs); /* might modify attrs */ + + void set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + } + + void set_etag(const string& _etag) { + etag = _etag; + } + + void set_olh_epoch(uint64_t epoch) { + olh_epoch = epoch; + } + + void set_delete_at(ceph::real_time _delete_at) { + delete_at = _delete_at; + } + + void set_user_data(const string& _user_data) { + user_data = _user_data; + } + + void set_policy(const RGWAccessControlPolicy& policy); + + friend class Bucket; + }; + + int get_bucket(const string& tenant, + const string name, + const string bucket_id, + BucketRef *bucket) { + bucket->reset(new Bucket(this, tenant, name, bucket_id)); + return (*bucket)->init(); + } + + int get_bucket(const RGWBucketInfo& bucket_info, + const map& attrs, + BucketRef *bucket) { + bucket->reset(new Bucket(this)); + return (*bucket)->init(bucket_info, attrs); + } + friend 
class Bucket; + friend class Object; +}; + +using RGWDataAccessRef = std::shared_ptr; + +#endif diff --git a/src/rgw/rgw_torrent.cc b/src/rgw/rgw_torrent.cc new file mode 100644 index 00000000..3fca9ba9 --- /dev/null +++ b/src/rgw/rgw_torrent.cc @@ -0,0 +1,266 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include + +#include "rgw_torrent.h" +#include "include/str_list.h" +#include "include/rados/librados.hpp" + +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +using ceph::crypto::MD5; +using namespace librados; +using namespace boost; +using ceph::crypto::SHA1; + +seed::seed() +{ + seed::info.piece_length = 0; + seed::info.len = 0; + sha_len = 0; + is_torrent = false; +} + +seed::~seed() +{ + seed::info.sha1_bl.clear(); + bl.clear(); + s = NULL; + store = NULL; +} + +void seed::init(struct req_state *p_req, RGWRados *p_store) +{ + s = p_req; + store = p_store; +} + +int seed::get_torrent_file(RGWRados::Object::Read &read_op, + uint64_t &total_len, + ceph::bufferlist &bl_data, + rgw_obj &obj) +{ + /* add other field if config is set */ + dencode.bencode_dict(bl); + set_announce(); + if (!comment.empty()) + { + dencode.bencode(COMMENT, comment, bl); + } + if (!create_by.empty()) + { + dencode.bencode(CREATED_BY, create_by, bl); + } + if (!encoding.empty()) + { + dencode.bencode(ENCODING, encoding, bl); + } + + string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + ldout(s->cct, 20) << "NOTICE: head obj oid= " << oid << dendl; + + const set obj_key{RGW_OBJ_TORRENT}; + map m; + const int r = read_op.state.cur_ioctx->omap_get_vals_by_keys(oid, obj_key, &m); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: omap_get_vals_by_keys failed: " << r << dendl; + return r; + } + if (m.size() != 1) { + ldout(s->cct, 0) << "ERROR: omap key " RGW_OBJ_TORRENT " not found" << dendl; + return -EINVAL; + } + bl.append(std::move(m.begin()->second)); + 
dencode.bencode_end(bl); + + bl_data = bl; + total_len = bl.length(); + return 0; +} + +bool seed::get_flag() +{ + return is_torrent; +} + +void seed::update(bufferlist &bl) +{ + if (!is_torrent) + { + return; + } + info.len += bl.length(); + sha1(&h, bl, bl.length()); +} + +int seed::complete() +{ + uint64_t remain = info.len%info.piece_length; + uint8_t remain_len = ((remain > 0)? 1 : 0); + sha_len = (info.len/info.piece_length + remain_len)*CEPH_CRYPTO_SHA1_DIGESTSIZE; + + int ret = 0; + /* produce torrent data */ + do_encode(); + + /* save torrent data into OMAP */ + ret = save_torrent_file(); + if (0 != ret) + { + ldout(s->cct, 0) << "ERROR: failed to save_torrent_file() ret= "<< ret << dendl; + return ret; + } + + return 0; +} + +off_t seed::get_data_len() +{ + return info.len; +} + +void seed::set_create_date(ceph::real_time& value) +{ + utime_t date = ceph::real_clock::to_timespec(value); + create_date = date.sec(); +} + +void seed::set_info_pieces(char *buff) +{ + info.sha1_bl.append(buff, CEPH_CRYPTO_SHA1_DIGESTSIZE); +} + +void seed::set_info_name(const string& value) +{ + info.name = value; +} + +void seed::sha1(SHA1 *h, bufferlist &bl, off_t bl_len) +{ + off_t num = bl_len/info.piece_length; + off_t remain = 0; + remain = bl_len%info.piece_length; + + char *pstr = bl.c_str(); + char sha[25]; + + /* get sha1 */ + for (off_t i = 0; i < num; i++) + { + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. + memset(sha, 0x00, sizeof(sha)); + h->Update((unsigned char *)pstr, info.piece_length); + h->Final((unsigned char *)sha); + set_info_pieces(sha); + pstr += info.piece_length; + } + + /* process remain */ + if (0 != remain) + { + // FIPS zeroization audit 20191116: this memset is not intended to + // wipe out a secret after use. 
+ memset(sha, 0x00, sizeof(sha)); + h->Update((unsigned char *)pstr, remain); + h->Final((unsigned char *)sha); + set_info_pieces(sha); + } + ::ceph::crypto::zeroize_for_security(sha, sizeof(sha)); +} + +int seed::get_params() +{ + is_torrent = true; + info.piece_length = g_conf()->rgw_torrent_sha_unit; + create_by = g_conf()->rgw_torrent_createby; + encoding = g_conf()->rgw_torrent_encoding; + origin = g_conf()->rgw_torrent_origin; + comment = g_conf()->rgw_torrent_comment; + announce = g_conf()->rgw_torrent_tracker; + + /* tracker and tracker list is empty, set announce to origin */ + if (announce.empty() && !origin.empty()) + { + announce = origin; + } + + return 0; +} + +void seed::set_announce() +{ + list announce_list; // used to get announce list from conf + get_str_list(announce, ",", announce_list); + + if (announce_list.empty()) + { + ldout(s->cct, 5) << "NOTICE: announce_list is empty " << dendl; + return; + } + + list::iterator iter = announce_list.begin(); + dencode.bencode_key(ANNOUNCE, bl); + dencode.bencode_key((*iter), bl); + + dencode.bencode_key(ANNOUNCE_LIST, bl); + dencode.bencode_list(bl); + for (; iter != announce_list.end(); ++iter) + { + dencode.bencode_list(bl); + dencode.bencode_key((*iter), bl); + dencode.bencode_end(bl); + } + dencode.bencode_end(bl); +} + +void seed::do_encode() +{ + /*Only encode create_date and sha1 info*/ + /*Other field will be added if confi is set when run get torrent*/ + dencode.bencode(CREATION_DATE, create_date, bl); + + dencode.bencode_key(INFO_PIECES, bl); + dencode.bencode_dict(bl); + dencode.bencode(LENGTH, info.len, bl); + dencode.bencode(NAME, info.name, bl); + dencode.bencode(PIECE_LENGTH, info.piece_length, bl); + + char info_sha[100] = { 0 }; + sprintf(info_sha, "%" PRIu64, sha_len); + string sha_len_str = info_sha; + dencode.bencode_key(PIECES, bl); + bl.append(sha_len_str.c_str(), sha_len_str.length()); + bl.append(':'); + bl.append(info.sha1_bl.c_str(), sha_len); + dencode.bencode_end(bl); +} + 
+int seed::save_torrent_file() +{ + int op_ret = 0; + string key = RGW_OBJ_TORRENT; + rgw_obj obj(s->bucket, s->object.name); + + rgw_raw_obj raw_obj; + store->obj_to_raw(s->bucket_info.placement_rule, obj, &raw_obj); + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(raw_obj); + + op_ret = sysobj.omap().set(key, bl); + if (op_ret < 0) + { + ldout(s->cct, 0) << "ERROR: failed to omap_set() op_ret = " << op_ret << dendl; + return op_ret; + } + + return op_ret; +} diff --git a/src/rgw/rgw_torrent.h b/src/rgw/rgw_torrent.h new file mode 100644 index 00000000..c135323d --- /dev/null +++ b/src/rgw/rgw_torrent.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_TORRENT_H +#define CEPH_RGW_TORRENT_H + +#include +#include +#include +#include + +#include "common/ceph_time.h" + +#include "rgw_rados.h" +#include "rgw_common.h" + +using ceph::crypto::SHA1; + +struct req_state; + +#define RGW_OBJ_TORRENT "rgw.torrent" + +#define ANNOUNCE "announce" +#define ANNOUNCE_LIST "announce-list" +#define COMMENT "comment" +#define CREATED_BY "created by" +#define CREATION_DATE "creation date" +#define ENCODING "encoding" +#define LENGTH "length" +#define NAME "name" +#define PIECE_LENGTH "piece length" +#define PIECES "pieces" +#define INFO_PIECES "info" +#define GET_TORRENT "torrent" + +class TorrentBencode +{ +public: + TorrentBencode() {} + ~TorrentBencode() {} + + //control characters + void bencode_dict(bufferlist& bl) { bl.append('d'); } + void bencode_list(bufferlist& bl) { bl.append('l'); } + void bencode_end(bufferlist& bl) { bl.append('e'); } + + //single values + void bencode(int value, bufferlist& bl) + { + bl.append('i'); + char info[100] = { 0 }; + sprintf(info, "%d", value); + bl.append(info, strlen(info)); + bencode_end(bl); + } + + //single values + void bencode(const std::string& str, bufferlist& bl) + { + bencode_key(str, bl); + } + + //dictionary 
elements + void bencode(const std::string& key, int value, bufferlist& bl) + { + bencode_key(key, bl); + bencode(value, bl); + } + + //dictionary elements + void bencode(const std::string& key, const std::string& value, bufferlist& bl) + { + bencode_key(key, bl); + bencode(value, bl); + } + + //key len + void bencode_key(const std::string& key, bufferlist& bl) + { + int len = key.length(); + char info[100] = { 0 }; + sprintf(info, "%d:", len); + bl.append(info, strlen(info)); + bl.append(key.c_str(), len); + } +}; + +/* torrent file struct */ +class seed +{ +private: + struct + { + int piece_length; // each piece length + bufferlist sha1_bl; // save sha1 + string name; // file name + off_t len; // file total bytes + }info; + + string announce; // tracker + string origin; // origin + time_t create_date{0}; // time of the file created + string comment; // comment + string create_by; // app name and version + string encoding; // if encode use gbk rather than gtf-8 use this field + uint64_t sha_len; // sha1 length + bool is_torrent; // flag + bufferlist bl; // bufflist ready to send + + struct req_state *s{nullptr}; + RGWRados *store{nullptr}; + SHA1 h; + + TorrentBencode dencode; +public: + seed(); + ~seed(); + + int get_params(); + void init(struct req_state *p_req, RGWRados *p_store); + int get_torrent_file(RGWRados::Object::Read &read_op, + uint64_t &total_len, + ceph::bufferlist &bl_data, + rgw_obj &obj); + + off_t get_data_len(); + bool get_flag(); + + void set_create_date(ceph::real_time& value); + void set_info_name(const string& value); + void update(bufferlist &bl); + int complete(); + +private: + void do_encode (); + void set_announce(); + void set_exist(bool exist); + void set_info_pieces(char *buff); + void sha1(SHA1 *h, bufferlist &bl, off_t bl_len); + int save_torrent_file(); +}; +#endif /* CEPH_RGW_TORRENT_H */ diff --git a/src/rgw/rgw_url.cc b/src/rgw/rgw_url.cc new file mode 100644 index 00000000..58f7b549 --- /dev/null +++ b/src/rgw/rgw_url.cc @@ 
namespace rgw {

namespace {
  // capture-group positions in the combined pattern below
  const auto USER_GROUP_IDX = 3;
  const auto PASSWORD_GROUP_IDX = 4;
  const auto HOST_GROUP_IDX = 5;

  const std::string schema_re = "([[:alpha:]]+:\\/\\/)";
  const std::string user_pass_re = "(([^:\\s]+):([^@\\s]+)@)?";
  const std::string host_port_re = "([[:alnum:].:-]+)";
  const std::string path_re = "(/[[:print:]]+)?";

  // Match the whole of url against schema + [user:password@] + host[:port]
  // + [path]. The regex is built once, on first use, instead of on every
  // call, and is shared by both parsers so they cannot drift apart.
  bool match_url(const std::string& url, std::smatch& result) {
    static const std::regex url_regex(schema_re + user_pass_re + host_port_re + path_re,
                                      std::regex::icase);
    return std::regex_match(url, result, url_regex);
  }
}

// Extract host[:port], user and password from url.
// Returns false, leaving the outputs untouched, when url does not match;
// user and password are set to empty strings when url has no userinfo part.
bool parse_url_authority(const std::string& url, std::string& host, std::string& user, std::string& password) {
  std::smatch url_match_result;
  if (!match_url(url, url_match_result)) {
    return false;
  }

  host = url_match_result[HOST_GROUP_IDX];
  user = url_match_result[USER_GROUP_IDX];
  password = url_match_result[PASSWORD_GROUP_IDX];
  return true;
}

// Extract only user and password from url.
// Returns false, leaving the outputs untouched, when url does not match.
bool parse_url_userinfo(const std::string& url, std::string& user, std::string& password) {
  std::smatch url_match_result;
  if (!match_url(url, url_match_result)) {
    return false;
  }

  user = url_match_result[USER_GROUP_IDX];
  password = url_match_result[PASSWORD_GROUP_IDX];
  return true;
}
}
std::string& user, std::string& password); +} + diff --git a/src/rgw/rgw_usage.cc b/src/rgw/rgw_usage.cc new file mode 100644 index 00000000..a82bc66c --- /dev/null +++ b/src/rgw/rgw_usage.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "rgw_rados.h" +#include "rgw_usage.h" +#include "rgw_formats.h" + + + +static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log_entry& entry, map *categories) +{ + formatter->open_array_section("categories"); + map::const_iterator uiter; + for (uiter = entry.usage_map.begin(); uiter != entry.usage_map.end(); ++uiter) { + if (categories && !categories->empty() && !categories->count(uiter->first)) + continue; + const rgw_usage_data& usage = uiter->second; + formatter->open_object_section("entry"); + formatter->dump_string("category", uiter->first); + formatter->dump_int("bytes_sent", usage.bytes_sent); + formatter->dump_int("bytes_received", usage.bytes_received); + formatter->dump_int("ops", usage.ops); + formatter->dump_int("successful_ops", usage.successful_ops); + formatter->close_section(); // entry + } + formatter->close_section(); // categories +} + +int RGWUsage::show(RGWRados *store, const rgw_user& uid, const string& bucket_name, uint64_t start_epoch, + uint64_t end_epoch, bool show_log_entries, bool show_log_sum, map *categories, + RGWFormatterFlusher& flusher) +{ + uint32_t max_entries = 1000; + + bool is_truncated = true; + + RGWUsageIter usage_iter; + Formatter *formatter = flusher.get_formatter(); + + map usage; + + flusher.start(0); + + formatter->open_object_section("usage"); + if (show_log_entries) { + formatter->open_array_section("entries"); + } + string last_owner; + bool user_section_open = false; + map summary_map; + while (is_truncated) { + int ret = store->read_usage(uid, bucket_name, start_epoch, end_epoch, max_entries, + &is_truncated, usage_iter, usage); + + if (ret == -ENOENT) 
{ + ret = 0; + is_truncated = false; + } + + if (ret < 0) { + return ret; + } + + map::iterator iter; + for (iter = usage.begin(); iter != usage.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + const rgw_usage_log_entry& entry = iter->second; + + if (show_log_entries) { + if (ub.user.compare(last_owner) != 0) { + if (user_section_open) { + formatter->close_section(); + formatter->close_section(); + } + formatter->open_object_section("user"); + formatter->dump_string("user", ub.user); + formatter->open_array_section("buckets"); + user_section_open = true; + last_owner = ub.user; + } + formatter->open_object_section("bucket"); + formatter->dump_string("bucket", ub.bucket); + utime_t ut(entry.epoch, 0); + ut.gmtime(formatter->dump_stream("time")); + formatter->dump_int("epoch", entry.epoch); + string owner = entry.owner.to_str(); + string payer = entry.payer.to_str(); + formatter->dump_string("owner", owner); + if (!payer.empty() && payer != owner) { + formatter->dump_string("payer", payer); + } + dump_usage_categories_info(formatter, entry, categories); + formatter->close_section(); // bucket + flusher.flush(); + } + + summary_map[ub.user].aggregate(entry, categories); + } + } + if (show_log_entries) { + if (user_section_open) { + formatter->close_section(); // buckets + formatter->close_section(); //user + } + formatter->close_section(); // entries + } + + if (show_log_sum) { + formatter->open_array_section("summary"); + map::iterator siter; + for (siter = summary_map.begin(); siter != summary_map.end(); ++siter) { + const rgw_usage_log_entry& entry = siter->second; + formatter->open_object_section("user"); + formatter->dump_string("user", siter->first); + dump_usage_categories_info(formatter, entry, categories); + rgw_usage_data total_usage; + entry.sum(total_usage, *categories); + formatter->open_object_section("total"); + formatter->dump_int("bytes_sent", total_usage.bytes_sent); + formatter->dump_int("bytes_received", total_usage.bytes_received); + 
formatter->dump_int("ops", total_usage.ops); + formatter->dump_int("successful_ops", total_usage.successful_ops); + formatter->close_section(); // total + + formatter->close_section(); // user + + flusher.flush(); + } + + formatter->close_section(); // summary + } + + formatter->close_section(); // usage + flusher.flush(); + + return 0; +} + +int RGWUsage::trim(RGWRados *store, const rgw_user& uid, const string& bucket_name, uint64_t start_epoch, + uint64_t end_epoch) +{ + return store->trim_usage(uid, bucket_name, start_epoch, end_epoch); +} + +int RGWUsage::clear(RGWRados *store) +{ + return store->clear_usage(); +} diff --git a/src/rgw/rgw_usage.h b/src/rgw/rgw_usage.h new file mode 100644 index 00000000..da39e596 --- /dev/null +++ b/src/rgw/rgw_usage.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_USAGE_H +#define CEPH_RGW_USAGE_H + +#include +#include + +#include "common/Formatter.h" +#include "rgw_formats.h" + +class RGWRados; + + +class RGWUsage +{ +public: + static int show(RGWRados *store, const rgw_user& uid, const string& bucket_name, uint64_t start_epoch, + uint64_t end_epoch, bool show_log_entries, bool show_log_sum, + std::map *categories, RGWFormatterFlusher& flusher); + + static int trim(RGWRados *store, const rgw_user& uid, const string& bucket_name, uint64_t start_epoch, + uint64_t end_epoch); + + static int clear(RGWRados *store); +}; + + +#endif diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc new file mode 100644 index 00000000..d70ec879 --- /dev/null +++ b/src/rgw/rgw_user.cc @@ -0,0 +1,2958 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include +#include + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include "common/RWLock.h" +#include "rgw_rados.h" +#include "rgw_zone.h" +#include "rgw_acl.h" + +#include 
"include/types.h" +#include "rgw_user.h" +#include "rgw_string.h" + +// until everything is moved from rgw_common +#include "rgw_common.h" + +#include "rgw_bucket.h" +#include "rgw_quota.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" +#include "services/svc_sys_obj_cache.h" + +#define dout_subsys ceph_subsys_rgw + + + +static RGWMetadataHandler *user_meta_handler = NULL; +extern void op_type_to_str(uint32_t mask, char *buf, int len); + +/** + * Get the anonymous (ie, unauthenticated) user info. + */ +void rgw_get_anon_user(RGWUserInfo& info) +{ + info.user_id = RGW_USER_ANON_ID; + info.display_name.clear(); + info.access_keys.clear(); +} + +int rgw_user_sync_all_stats(RGWRados *store, const rgw_user& user_id) +{ + CephContext *cct = store->ctx(); + size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk; + bool is_truncated = false; + string marker; + int ret; + RGWSysObjectCtx obj_ctx = store->svc.sysobj->init_obj_ctx(); + + do { + RGWUserBuckets user_buckets; + ret = rgw_read_user_buckets(store, user_id, user_buckets, marker, + string(), max_entries, false, &is_truncated); + if (ret < 0) { + ldout(cct, 0) << "failed to read user buckets: ret=" << ret << dendl; + return ret; + } + map& buckets = user_buckets.get_buckets(); + for (map::iterator i = buckets.begin(); + i != buckets.end(); + ++i) { + marker = i->first; + + RGWBucketEnt& bucket_ent = i->second; + RGWBucketInfo bucket_info; + + ret = store->get_bucket_info(obj_ctx, user_id.tenant, bucket_ent.bucket.name, + bucket_info, nullptr, nullptr); + if (ret < 0) { + ldout(cct, 0) << "ERROR: could not read bucket info: bucket=" << bucket_ent.bucket << " ret=" << ret << dendl; + continue; + } + ret = rgw_bucket_sync_user_stats(store, user_id, bucket_info); + if (ret < 0) { + ldout(cct, 0) << "ERROR: could not sync bucket stats: ret=" << ret << dendl; + return ret; + } + RGWQuotaInfo bucket_quota; + ret = store->check_bucket_shards(bucket_info, bucket_info.bucket, bucket_quota); + if 
(ret < 0) { + ldout(cct, 0) << "ERROR in check_bucket_shards: " << cpp_strerror(-ret)<< dendl; + } + } + } while (is_truncated); + + ret = store->complete_sync_user_stats(user_id); + if (ret < 0) { + cerr << "ERROR: failed to complete syncing user stats: ret=" << ret << std::endl; + return ret; + } + + return 0; +} + +int rgw_user_get_all_buckets_stats(RGWRados *store, const rgw_user& user_id, map&buckets_usage_map) +{ + CephContext *cct = store->ctx(); + size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk; + bool done; + bool is_truncated; + string marker; + int ret; + + do { + RGWUserBuckets user_buckets; + ret = rgw_read_user_buckets(store, user_id, user_buckets, marker, + string(), max_entries, false, &is_truncated); + if (ret < 0) { + ldout(cct, 0) << "failed to read user buckets: ret=" << ret << dendl; + return ret; + } + map& buckets = user_buckets.get_buckets(); + for (const auto& i : buckets) { + marker = i.first; + + const RGWBucketEnt& bucket_ent = i.second; + cls_user_bucket_entry entry; + ret = store->cls_user_get_bucket_stats(bucket_ent.bucket, entry); + if (ret < 0) { + ldout(cct, 0) << "ERROR: could not get bucket stats: ret=" << ret << dendl; + return ret; + } + buckets_usage_map.emplace(bucket_ent.bucket.name, entry); + } + done = (buckets.size() < max_entries); + } while (!done); + + return 0; +} + +/** + * Save the given user information to storage. + * Returns: 0 on success, -ERR# on failure. 
+ */ +int rgw_store_user_info(RGWRados *store, + RGWUserInfo& info, + RGWUserInfo *old_info, + RGWObjVersionTracker *objv_tracker, + real_time mtime, + bool exclusive, + map *pattrs) +{ + int ret; + RGWObjVersionTracker ot; + + if (objv_tracker) { + ot = *objv_tracker; + } + + if (ot.write_version.tag.empty()) { + if (ot.read_version.tag.empty()) { + ot.generate_new_write_ver(store->ctx()); + } else { + ot.write_version = ot.read_version; + ot.write_version.ver++; + } + } + + map::iterator iter; + for (iter = info.swift_keys.begin(); iter != info.swift_keys.end(); ++iter) { + if (old_info && old_info->swift_keys.count(iter->first) != 0) + continue; + RGWAccessKey& k = iter->second; + /* check if swift mapping exists */ + RGWUserInfo inf; + int r = rgw_get_user_info_by_swift(store, k.id, inf); + if (r >= 0 && inf.user_id.compare(info.user_id) != 0) { + ldout(store->ctx(), 0) << "WARNING: can't store user info, swift id (" << k.id + << ") already mapped to another user (" << info.user_id << ")" << dendl; + return -EEXIST; + } + } + + if (!info.access_keys.empty()) { + /* check if access keys already exist */ + RGWUserInfo inf; + map::iterator iter = info.access_keys.begin(); + for (; iter != info.access_keys.end(); ++iter) { + RGWAccessKey& k = iter->second; + if (old_info && old_info->access_keys.count(iter->first) != 0) + continue; + int r = rgw_get_user_info_by_access_key(store, k.id, inf); + if (r >= 0 && inf.user_id.compare(info.user_id) != 0) { + ldout(store->ctx(), 0) << "WARNING: can't store user info, access key already mapped to another user" << dendl; + return -EEXIST; + } + } + } + + RGWUID ui; + ui.user_id = info.user_id; + + bufferlist link_bl; + encode(ui, link_bl); + + bufferlist data_bl; + encode(ui, data_bl); + encode(info, data_bl); + + string key; + info.user_id.to_str(key); + + ret = store->meta_mgr->put_entry(user_meta_handler, key, data_bl, exclusive, &ot, mtime, pattrs); + if (ret < 0) + return ret; + + if (!info.user_email.empty()) { + if 
(!old_info || + old_info->user_email.compare(info.user_email) != 0) { /* only if new index changed */ + ret = rgw_put_system_obj(store, store->svc.zone->get_zone_params().user_email_pool, info.user_email, + link_bl, exclusive, NULL, real_time()); + if (ret < 0) + return ret; + } + } + + if (!info.access_keys.empty()) { + map::iterator iter = info.access_keys.begin(); + for (; iter != info.access_keys.end(); ++iter) { + RGWAccessKey& k = iter->second; + if (old_info && old_info->access_keys.count(iter->first) != 0) + continue; + + ret = rgw_put_system_obj(store, store->svc.zone->get_zone_params().user_keys_pool, k.id, + link_bl, exclusive, NULL, real_time()); + if (ret < 0) + return ret; + } + } + + map::iterator siter; + for (siter = info.swift_keys.begin(); siter != info.swift_keys.end(); ++siter) { + RGWAccessKey& k = siter->second; + if (old_info && old_info->swift_keys.count(siter->first) != 0) + continue; + + ret = rgw_put_system_obj(store, store->svc.zone->get_zone_params().user_swift_pool, k.id, + link_bl, exclusive, NULL, real_time()); + if (ret < 0) + return ret; + } + + return ret; +} + +struct user_info_entry { + RGWUserInfo info; + RGWObjVersionTracker objv_tracker; + real_time mtime; +}; + +static RGWChainedCacheImpl uinfo_cache; + +int rgw_get_user_info_from_index(RGWRados * const store, + const string& key, + const rgw_pool& pool, + RGWUserInfo& info, + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime) +{ + if (auto e = uinfo_cache.find(key)) { + info = e->info; + if (objv_tracker) + *objv_tracker = e->objv_tracker; + if (pmtime) + *pmtime = e->mtime; + return 0; + } + + user_info_entry e; + bufferlist bl; + RGWUID uid; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + + int ret = rgw_get_system_obj(store, obj_ctx, pool, key, bl, NULL, &e.mtime); + if (ret < 0) + return ret; + + rgw_cache_entry_info cache_info; + + auto iter = bl.cbegin(); + try { + decode(uid, iter); + int ret = rgw_get_user_info_by_uid(store, uid.user_id, 
e.info, &e.objv_tracker, NULL, &cache_info); + if (ret < 0) { + return ret; + } + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; + return -EIO; + } + + uinfo_cache.put(store->svc.cache, key, &e, { &cache_info }); + + info = e.info; + if (objv_tracker) + *objv_tracker = e.objv_tracker; + if (pmtime) + *pmtime = e.mtime; + + return 0; +} + +/** + * Given a uid, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +int rgw_get_user_info_by_uid(RGWRados *store, + const rgw_user& uid, + RGWUserInfo& info, + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime, + rgw_cache_entry_info * const cache_info, + map * const pattrs) +{ + bufferlist bl; + RGWUID user_id; + + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + string oid = uid.to_str(); + int ret = rgw_get_system_obj(store, obj_ctx, store->svc.zone->get_zone_params().user_uid_pool, oid, bl, objv_tracker, pmtime, pattrs, cache_info); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(user_id, iter); + if (user_id.user_id.compare(uid) != 0) { + lderr(store->ctx()) << "ERROR: rgw_get_user_info_by_uid(): user id mismatch: " << user_id.user_id << " != " << uid << dendl; + return -EIO; + } + if (!iter.end()) { + decode(info, iter); + } + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; + return -EIO; + } + + return 0; +} + +/** + * Given an email, finds the user info associated with it. 
+ * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +int rgw_get_user_info_by_email(RGWRados *store, string& email, RGWUserInfo& info, + RGWObjVersionTracker *objv_tracker, real_time *pmtime) +{ + return rgw_get_user_info_from_index(store, email, store->svc.zone->get_zone_params().user_email_pool, info, objv_tracker, pmtime); +} + +/** + * Given an swift username, finds the user_info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +extern int rgw_get_user_info_by_swift(RGWRados * const store, + const string& swift_name, + RGWUserInfo& info, /* out */ + RGWObjVersionTracker * const objv_tracker, + real_time * const pmtime) +{ + return rgw_get_user_info_from_index(store, swift_name, + store->svc.zone->get_zone_params().user_swift_pool, + info, objv_tracker, pmtime); +} + +/** + * Given an access key, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +extern int rgw_get_user_info_by_access_key(RGWRados* store, + const std::string& access_key, + RGWUserInfo& info, + RGWObjVersionTracker* objv_tracker, + real_time *pmtime) +{ + return rgw_get_user_info_from_index(store, access_key, + store->svc.zone->get_zone_params().user_keys_pool, + info, objv_tracker, pmtime); +} + +int rgw_get_user_attrs_by_uid(RGWRados *store, + const rgw_user& user_id, + map& attrs, + RGWObjVersionTracker *objv_tracker) +{ + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_uid_pool, user_id.to_str()); + auto src = obj_ctx.get_obj(obj); + + return src.rop() + .set_attrs(&attrs) + .set_objv_tracker(objv_tracker) + .stat(); +} + +int rgw_remove_key_index(RGWRados *store, RGWAccessKey& access_key) +{ + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_keys_pool, access_key.id); + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + return sysobj.wop().remove(); +} + +int 
rgw_remove_uid_index(RGWRados *store, rgw_user& uid) +{ + RGWObjVersionTracker objv_tracker; + RGWUserInfo info; + int ret = rgw_get_user_info_by_uid(store, uid, info, &objv_tracker, NULL); + if (ret < 0) + return ret; + + string oid = uid.to_str(); + ret = store->meta_mgr->remove_entry(user_meta_handler, oid, &objv_tracker); + if (ret < 0) + return ret; + + return 0; +} + +int rgw_remove_email_index(RGWRados *store, string& email) +{ + if (email.empty()) { + return 0; + } + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_email_pool, email); + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + return sysobj.wop().remove(); +} + +int rgw_remove_swift_name_index(RGWRados *store, string& swift_name) +{ + rgw_raw_obj obj(store->svc.zone->get_zone_params().user_swift_pool, swift_name); + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + return sysobj.wop().remove(); +} + +/** + * delete a user's presence from the RGW system. + * First remove their bucket ACLs, then delete them + * from the user and user email pools. This leaves the pools + * themselves alone, as well as any ACLs embedded in object xattrs. 
+ */ +int rgw_delete_user(RGWRados *store, RGWUserInfo& info, RGWObjVersionTracker& objv_tracker) { + int ret; + + map::iterator kiter = info.access_keys.begin(); + for (; kiter != info.access_keys.end(); ++kiter) { + ldout(store->ctx(), 10) << "removing key index: " << kiter->first << dendl; + ret = rgw_remove_key_index(store, kiter->second); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: could not remove " << kiter->first << " (access key object), should be fixed (err=" << ret << ")" << dendl; + return ret; + } + } + + map::iterator siter = info.swift_keys.begin(); + for (; siter != info.swift_keys.end(); ++siter) { + RGWAccessKey& k = siter->second; + ldout(store->ctx(), 10) << "removing swift subuser index: " << k.id << dendl; + /* check if swift mapping exists */ + ret = rgw_remove_swift_name_index(store, k.id); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: could not remove " << k.id << " (swift name object), should be fixed (err=" << ret << ")" << dendl; + return ret; + } + } + + ldout(store->ctx(), 10) << "removing email index: " << info.user_email << dendl; + ret = rgw_remove_email_index(store, info.user_email); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: could not remove email index object for " + << info.user_email << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + string buckets_obj_id; + rgw_get_buckets_obj(info.user_id, buckets_obj_id); + rgw_raw_obj uid_bucks(store->svc.zone->get_zone_params().user_uid_pool, buckets_obj_id); + ldout(store->ctx(), 10) << "removing user buckets index" << dendl; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(uid_bucks); + ret = sysobj.wop().remove(); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + string key; + info.user_id.to_str(key); 
+ + rgw_raw_obj uid_obj(store->svc.zone->get_zone_params().user_uid_pool, key); + ldout(store->ctx(), 10) << "removing user index: " << info.user_id << dendl; + ret = store->meta_mgr->remove_entry(user_meta_handler, key, &objv_tracker); + if (ret < 0 && ret != -ENOENT && ret != -ECANCELED) { + ldout(store->ctx(), 0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + return 0; +} + +static bool char_is_unreserved_url(char c) +{ + if (isalnum(c)) + return true; + + switch (c) { + case '-': + case '.': + case '_': + case '~': + return true; + default: + return false; + } +} + +struct rgw_flags_desc { + uint32_t mask; + const char *str; +}; + +static struct rgw_flags_desc rgw_perms[] = { + { RGW_PERM_FULL_CONTROL, "full-control" }, + { RGW_PERM_READ | RGW_PERM_WRITE, "read-write" }, + { RGW_PERM_READ, "read" }, + { RGW_PERM_WRITE, "write" }, + { RGW_PERM_READ_ACP, "read-acp" }, + { RGW_PERM_WRITE_ACP, "write-acp" }, + { 0, NULL } +}; + +void rgw_perm_to_str(uint32_t mask, char *buf, int len) +{ + const char *sep = ""; + int pos = 0; + if (!mask) { + snprintf(buf, len, ""); + return; + } + while (mask) { + uint32_t orig_mask = mask; + for (int i = 0; rgw_perms[i].mask; i++) { + struct rgw_flags_desc *desc = &rgw_perms[i]; + if ((mask & desc->mask) == desc->mask) { + pos += snprintf(buf + pos, len - pos, "%s%s", sep, desc->str); + if (pos == len) + return; + sep = ", "; + mask &= ~desc->mask; + if (!mask) + return; + } + } + if (mask == orig_mask) // no change + break; + } +} + +uint32_t rgw_str_to_perm(const char *str) +{ + if (strcasecmp(str, "") == 0) + return RGW_PERM_NONE; + else if (strcasecmp(str, "read") == 0) + return RGW_PERM_READ; + else if (strcasecmp(str, "write") == 0) + return RGW_PERM_WRITE; + else if (strcasecmp(str, "readwrite") == 0) + return RGW_PERM_READ | RGW_PERM_WRITE; + else if (strcasecmp(str, "full") == 0) + return RGW_PERM_FULL_CONTROL; + + return 
RGW_PERM_INVALID; +} + +int rgw_validate_tenant_name(const string& t) +{ + struct tench { + static bool is_good(char ch) { + return isalnum(ch) || ch == '_'; + } + }; + std::string::const_iterator it = + std::find_if_not(t.begin(), t.end(), tench::is_good); + return (it == t.end())? 0: -ERR_INVALID_TENANT_NAME; +} + +static bool validate_access_key(string& key) +{ + const char *p = key.c_str(); + while (*p) { + if (!char_is_unreserved_url(*p)) + return false; + p++; + } + return true; +} + +static void set_err_msg(std::string *sink, std::string msg) +{ + if (sink && !msg.empty()) + *sink = msg; +} + +static bool remove_old_indexes(RGWRados *store, + RGWUserInfo& old_info, RGWUserInfo& new_info, std::string *err_msg) +{ + int ret; + bool success = true; + + if (!old_info.user_id.empty() && + old_info.user_id.compare(new_info.user_id) != 0) { + if (old_info.user_id.tenant != new_info.user_id.tenant) { + ldout(store->ctx(), 0) << "ERROR: tenant mismatch: " << old_info.user_id.tenant << " != " << new_info.user_id.tenant << dendl; + return false; + } + ret = rgw_remove_uid_index(store, old_info.user_id); + if (ret < 0 && ret != -ENOENT) { + set_err_msg(err_msg, "ERROR: could not remove index for uid " + old_info.user_id.to_str()); + success = false; + } + } + + if (!old_info.user_email.empty() && + old_info.user_email.compare(new_info.user_email) != 0) { + ret = rgw_remove_email_index(store, old_info.user_email); + if (ret < 0 && ret != -ENOENT) { + set_err_msg(err_msg, "ERROR: could not remove index for email " + old_info.user_email); + success = false; + } + } + + map::iterator old_iter; + for (old_iter = old_info.swift_keys.begin(); old_iter != old_info.swift_keys.end(); ++old_iter) { + RGWAccessKey& swift_key = old_iter->second; + map::iterator new_iter = new_info.swift_keys.find(swift_key.id); + if (new_iter == new_info.swift_keys.end()) { + ret = rgw_remove_swift_name_index(store, swift_key.id); + if (ret < 0 && ret != -ENOENT) { + set_err_msg(err_msg, "ERROR: 
could not remove index for swift_name " + swift_key.id); + success = false; + } + } + } + + return success; +} + +/* + * Dump either the full user info or a subset to a formatter. + * + * NOTE: It is the caller's respnsibility to ensure that the + * formatter is flushed at the correct time. + */ + +static void dump_subusers_info(Formatter *f, RGWUserInfo &info) +{ + map::iterator uiter; + + f->open_array_section("subusers"); + for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) { + RGWSubUser& u = uiter->second; + f->open_object_section("user"); + string s; + info.user_id.to_str(s); + f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str()); + char buf[256]; + rgw_perm_to_str(u.perm_mask, buf, sizeof(buf)); + f->dump_string("permissions", buf); + f->close_section(); + } + f->close_section(); +} + +static void dump_access_keys_info(Formatter *f, RGWUserInfo &info) +{ + map::iterator kiter; + f->open_array_section("keys"); + for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) { + RGWAccessKey& k = kiter->second; + const char *sep = (k.subuser.empty() ? "" : ":"); + const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str()); + f->open_object_section("key"); + string s; + info.user_id.to_str(s); + f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser); + f->dump_string("access_key", k.id); + f->dump_string("secret_key", k.key); + f->close_section(); + } + f->close_section(); +} + +static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info) +{ + map::iterator kiter; + f->open_array_section("swift_keys"); + for (kiter = info.swift_keys.begin(); kiter != info.swift_keys.end(); ++kiter) { + RGWAccessKey& k = kiter->second; + const char *sep = (k.subuser.empty() ? "" : ":"); + const char *subuser = (k.subuser.empty() ? 
"" : k.subuser.c_str()); + f->open_object_section("key"); + string s; + info.user_id.to_str(s); + f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser); + f->dump_string("secret_key", k.key); + f->close_section(); + } + f->close_section(); +} + +static void dump_user_info(Formatter *f, RGWUserInfo &info, + RGWStorageStats *stats = NULL) +{ + f->open_object_section("user_info"); + encode_json("tenant", info.user_id.tenant, f); + encode_json("user_id", info.user_id.id, f); + encode_json("display_name", info.display_name, f); + encode_json("email", info.user_email, f); + encode_json("suspended", (int)info.suspended, f); + encode_json("max_buckets", (int)info.max_buckets, f); + + dump_subusers_info(f, info); + dump_access_keys_info(f, info); + dump_swift_keys_info(f, info); + + encode_json("caps", info.caps, f); + + char buf[256]; + op_type_to_str(info.op_mask, buf, sizeof(buf)); + encode_json("op_mask", (const char *)buf, f); + encode_json("system", (bool)info.system, f); + encode_json("admin", (bool)info.admin, f); + encode_json("default_placement", info.default_placement.name, f); + encode_json("default_storage_class", info.default_placement.storage_class, f); + encode_json("placement_tags", info.placement_tags, f); + encode_json("bucket_quota", info.bucket_quota, f); + encode_json("user_quota", info.user_quota, f); + encode_json("temp_url_keys", info.temp_url_keys, f); + + string user_source_type; + switch ((RGWIdentityType)info.type) { + case TYPE_RGW: + user_source_type = "rgw"; + break; + case TYPE_KEYSTONE: + user_source_type = "keystone"; + break; + case TYPE_LDAP: + user_source_type = "ldap"; + break; + case TYPE_NONE: + user_source_type = "none"; + break; + default: + user_source_type = "none"; + break; + } + encode_json("type", user_source_type, f); + encode_json("mfa_ids", info.mfa_ids, f); + if (stats) { + encode_json("stats", *stats, f); + } + f->close_section(); +} + + +RGWAccessKeyPool::RGWAccessKeyPool(RGWUser* usr) +{ + user = usr; + swift_keys = 
NULL; + access_keys = NULL; + + if (!user) { + keys_allowed = false; + store = NULL; + return; + } + + keys_allowed = true; + + store = user->get_store(); +} + +RGWAccessKeyPool::~RGWAccessKeyPool() +{ + +} + +int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state) +{ + if (!op_state.is_initialized()) { + keys_allowed = false; + return -EINVAL; + } + + rgw_user& uid = op_state.get_user_id(); + if (uid.compare(RGW_USER_ANON_ID) == 0) { + keys_allowed = false; + return -EACCES; + } + + swift_keys = op_state.get_swift_keys(); + access_keys = op_state.get_access_keys(); + + keys_allowed = true; + + return 0; +} + +/* + * Do a fairly exhaustive search for an existing key matching the parameters + * given. Also handles the case where no key type was specified and updates + * the operation state if needed. + */ + +bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state) +{ + bool existing_key = false; + + int key_type = op_state.get_key_type(); + std::string kid = op_state.get_access_key(); + std::map::iterator kiter; + std::string swift_kid = op_state.build_default_swift_kid(); + + RGWUserInfo dup_info; + + if (kid.empty() && swift_kid.empty()) + return false; + + switch (key_type) { + case KEY_TYPE_SWIFT: + kiter = swift_keys->find(swift_kid); + + existing_key = (kiter != swift_keys->end()); + if (existing_key) + op_state.set_access_key(swift_kid); + + break; + case KEY_TYPE_S3: + kiter = access_keys->find(kid); + existing_key = (kiter != access_keys->end()); + + break; + default: + kiter = access_keys->find(kid); + + existing_key = (kiter != access_keys->end()); + if (existing_key) { + op_state.set_key_type(KEY_TYPE_S3); + break; + } + + kiter = swift_keys->find(kid); + + existing_key = (kiter != swift_keys->end()); + if (existing_key) { + op_state.set_key_type(KEY_TYPE_SWIFT); + break; + } + + // handle the case where the access key was not provided in user:key format + if (swift_kid.empty()) + return false; + + kiter = swift_keys->find(swift_kid); + 
+ existing_key = (kiter != swift_keys->end()); + if (existing_key) { + op_state.set_access_key(swift_kid); + op_state.set_key_type(KEY_TYPE_SWIFT); + } + } + + op_state.set_existing_key(existing_key); + + return existing_key; +} + +int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state, + std::string *err_msg) +{ + RGWUserInfo dup_info; + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!keys_allowed) { + set_err_msg(err_msg, "keys not allowed for this user"); + return -EACCES; + } + + int32_t key_type = op_state.get_key_type(); + + // if a key type wasn't specified + if (key_type < 0) { + if (op_state.has_subuser()) { + key_type = KEY_TYPE_SWIFT; + } else { + key_type = KEY_TYPE_S3; + } + } + + op_state.set_key_type(key_type); + + /* see if the access key was specified */ + if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() && + op_state.get_access_key().empty()) { + set_err_msg(err_msg, "empty access key"); + return -ERR_INVALID_ACCESS_KEY; + } + + // don't check for secret key because we may be doing a removal + + check_existing_key(op_state); + + return 0; +} + +// Generate a new random key +int RGWAccessKeyPool::generate_key(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string id; + std::string key; + + std::pair key_pair; + RGWAccessKey new_key; + RGWUserInfo duplicate_check; + + int key_type = op_state.get_key_type(); + bool gen_access = op_state.will_gen_access(); + bool gen_secret = op_state.will_gen_secret(); + + if (!keys_allowed) { + set_err_msg(err_msg, "access keys not allowed for this user"); + return -EACCES; + } + + if (op_state.has_existing_key()) { + set_err_msg(err_msg, "cannot create existing key"); + return -ERR_KEY_EXIST; + } + + if (!gen_access) { + id = op_state.get_access_key(); + } + + if (!id.empty()) { + switch (key_type) { + case KEY_TYPE_SWIFT: + if (rgw_get_user_info_by_swift(store, id, duplicate_check) >= 0) { + set_err_msg(err_msg, 
"existing swift key in RGW system:" + id); + return -ERR_KEY_EXIST; + } + break; + case KEY_TYPE_S3: + if (rgw_get_user_info_by_access_key(store, id, duplicate_check) >= 0) { + set_err_msg(err_msg, "existing S3 key in RGW system:" + id); + return -ERR_KEY_EXIST; + } + } + } + + //key's subuser + if (op_state.has_subuser()) { + //create user and subuser at the same time, user's s3 key should not be set this + if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) { + new_key.subuser = op_state.get_subuser(); + } + } + + //Secret key + if (!gen_secret) { + if (op_state.get_secret_key().empty()) { + set_err_msg(err_msg, "empty secret key"); + return -ERR_INVALID_SECRET_KEY; + } + + key = op_state.get_secret_key(); + } else { + char secret_key_buf[SECRET_KEY_LEN + 1]; + gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf)); + key = secret_key_buf; + } + + // Generate the access key + if (key_type == KEY_TYPE_S3 && gen_access) { + char public_id_buf[PUBLIC_ID_LEN + 1]; + + do { + int id_buf_size = sizeof(public_id_buf); + gen_rand_alphanumeric_upper(g_ceph_context, public_id_buf, id_buf_size); + id = public_id_buf; + if (!validate_access_key(id)) + continue; + + } while (!rgw_get_user_info_by_access_key(store, id, duplicate_check)); + } + + if (key_type == KEY_TYPE_SWIFT) { + id = op_state.build_default_swift_kid(); + if (id.empty()) { + set_err_msg(err_msg, "empty swift access key"); + return -ERR_INVALID_ACCESS_KEY; + } + + // check that the access key doesn't exist + if (rgw_get_user_info_by_swift(store, id, duplicate_check) >= 0) { + set_err_msg(err_msg, "cannot create existing swift key"); + return -ERR_KEY_EXIST; + } + } + + // finally create the new key + new_key.id = id; + new_key.key = key; + + key_pair.first = id; + key_pair.second = new_key; + + if (key_type == KEY_TYPE_S3) { + access_keys->insert(key_pair); + } else if (key_type == KEY_TYPE_SWIFT) { + swift_keys->insert(key_pair); + } + + return 0; +} + +// modify 
an existing key +int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string id; + std::string key = op_state.get_secret_key(); + int key_type = op_state.get_key_type(); + + RGWAccessKey modify_key; + + pair key_pair; + map::iterator kiter; + + switch (key_type) { + case KEY_TYPE_S3: + id = op_state.get_access_key(); + if (id.empty()) { + set_err_msg(err_msg, "no access key specified"); + return -ERR_INVALID_ACCESS_KEY; + } + break; + case KEY_TYPE_SWIFT: + id = op_state.build_default_swift_kid(); + if (id.empty()) { + set_err_msg(err_msg, "no subuser specified"); + return -EINVAL; + } + break; + default: + set_err_msg(err_msg, "invalid key type"); + return -ERR_INVALID_KEY_TYPE; + } + + if (!op_state.has_existing_key()) { + set_err_msg(err_msg, "key does not exist"); + return -ERR_INVALID_ACCESS_KEY; + } + + key_pair.first = id; + + if (key_type == KEY_TYPE_SWIFT) { + modify_key.id = id; + modify_key.subuser = op_state.get_subuser(); + } else if (key_type == KEY_TYPE_S3) { + kiter = access_keys->find(id); + if (kiter != access_keys->end()) { + modify_key = kiter->second; + } + } + + if (op_state.will_gen_secret()) { + char secret_key_buf[SECRET_KEY_LEN + 1]; + int key_buf_size = sizeof(secret_key_buf); + gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size); + key = secret_key_buf; + } + + if (key.empty()) { + set_err_msg(err_msg, "empty secret key"); + return -ERR_INVALID_SECRET_KEY; + } + + // update the access key with the new secret key + modify_key.key = key; + + key_pair.second = modify_key; + + + if (key_type == KEY_TYPE_S3) { + (*access_keys)[id] = modify_key; + } else if (key_type == KEY_TYPE_SWIFT) { + (*swift_keys)[id] = modify_key; + } + + return 0; +} + +int RGWAccessKeyPool::execute_add(RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update) +{ + int ret = 0; + + std::string subprocess_msg; + int key_op = GENERATE_KEY; + + // set the op + if 
(op_state.has_existing_key()) + key_op = MODIFY_KEY; + + switch (key_op) { + case GENERATE_KEY: + ret = generate_key(op_state, &subprocess_msg); + break; + case MODIFY_KEY: + ret = modify_key(op_state, &subprocess_msg); + break; + } + + if (ret < 0) { + set_err_msg(err_msg, subprocess_msg); + return ret; + } + + // store the updated info + if (!defer_user_update) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWAccessKeyPool::add(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return add(op_state, err_msg, false); +} + +int RGWAccessKeyPool::add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + int ret; + std::string subprocess_msg; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_add(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to add access key, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWAccessKeyPool::execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + int ret = 0; + + int key_type = op_state.get_key_type(); + std::string id = op_state.get_access_key(); + map::iterator kiter; + map *keys_map; + + if (!op_state.has_existing_key()) { + set_err_msg(err_msg, "unable to find access key"); + return -ERR_INVALID_ACCESS_KEY; + } + + if (key_type == KEY_TYPE_S3) { + keys_map = access_keys; + } else if (key_type == KEY_TYPE_SWIFT) { + keys_map = swift_keys; + } else { + keys_map = NULL; + set_err_msg(err_msg, "invalid access key"); + return -ERR_INVALID_ACCESS_KEY; + } + + kiter = keys_map->find(id); + if (kiter == keys_map->end()) { + set_err_msg(err_msg, "key not found"); + return -ERR_INVALID_ACCESS_KEY; + } + + rgw_remove_key_index(store, kiter->second); + keys_map->erase(kiter); + + if (!defer_user_update) + ret = user->update(op_state, 
err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWAccessKeyPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return remove(op_state, err_msg, false); +} + +int RGWAccessKeyPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + int ret; + + std::string subprocess_msg; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_remove(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove access key, " + subprocess_msg); + return ret; + } + + return 0; +} + +// remove all keys associated with a subuser +int RGWAccessKeyPool::remove_subuser_keys(RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update) +{ + int ret = 0; + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!op_state.has_subuser()) { + set_err_msg(err_msg, "no subuser specified"); + return -EINVAL; + } + + std::string swift_kid = op_state.build_default_swift_kid(); + if (swift_kid.empty()) { + set_err_msg(err_msg, "empty swift access key"); + return -EINVAL; + } + + map::iterator kiter; + map *keys_map; + + // a subuser can have at most one swift key + keys_map = swift_keys; + kiter = keys_map->find(swift_kid); + if (kiter != keys_map->end()) { + rgw_remove_key_index(store, kiter->second); + keys_map->erase(kiter); + } + + // a subuser may have multiple s3 key pairs + std::string subuser_str = op_state.get_subuser(); + keys_map = access_keys; + RGWUserInfo user_info = op_state.get_user_info(); + map::iterator user_kiter = user_info.access_keys.begin(); + for (; user_kiter != user_info.access_keys.end(); ++user_kiter) { + if (user_kiter->second.subuser == subuser_str) { + kiter = keys_map->find(user_kiter->first); + if (kiter != keys_map->end()) { + rgw_remove_key_index(store, 
kiter->second); + keys_map->erase(kiter); + } + } + } + + if (!defer_user_update) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +RGWSubUserPool::RGWSubUserPool(RGWUser *usr) +{ + subusers_allowed = (usr != NULL); + if (usr) + store = usr->get_store(); + else + store = NULL; + user = usr; + subuser_map = NULL; +} + +RGWSubUserPool::~RGWSubUserPool() +{ + +} + +int RGWSubUserPool::init(RGWUserAdminOpState& op_state) +{ + if (!op_state.is_initialized()) { + subusers_allowed = false; + return -EINVAL; + } + + rgw_user& uid = op_state.get_user_id(); + if (uid.compare(RGW_USER_ANON_ID) == 0) { + subusers_allowed = false; + return -EACCES; + } + + subuser_map = op_state.get_subusers(); + if (subuser_map == NULL) { + subusers_allowed = false; + return -EINVAL; + } + + subusers_allowed = true; + + return 0; +} + +bool RGWSubUserPool::exists(std::string subuser) +{ + if (subuser.empty()) + return false; + + if (!subuser_map) + return false; + + if (subuser_map->count(subuser)) + return true; + + return false; +} + +int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state, + std::string *err_msg) +{ + bool existing = false; + std::string subuser = op_state.get_subuser(); + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!subusers_allowed) { + set_err_msg(err_msg, "subusers not allowed for this user"); + return -EACCES; + } + + if (subuser.empty() && !op_state.will_gen_subuser()) { + set_err_msg(err_msg, "empty subuser name"); + return -EINVAL; + } + + if (op_state.get_subuser_perm() == RGW_PERM_INVALID) { + set_err_msg(err_msg, "invaild subuser access"); + return -EINVAL; + } + + //set key type when it not set or set by context + if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) { + op_state.set_key_type(KEY_TYPE_SWIFT); + op_state.key_type_setbycontext = true; + } + + // check if the subuser exists + if (!subuser.empty()) + existing = 
exists(subuser); + + op_state.set_existing_subuser(existing); + + return 0; +} + +int RGWSubUserPool::execute_add(RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update) +{ + int ret = 0; + std::string subprocess_msg; + + RGWSubUser subuser; + std::pair subuser_pair; + std::string subuser_str = op_state.get_subuser(); + + subuser_pair.first = subuser_str; + + // assumes key should be created + if (op_state.has_key_op()) { + ret = user->keys.add(op_state, &subprocess_msg, true); + if (ret < 0) { + set_err_msg(err_msg, "unable to create subuser key, " + subprocess_msg); + return ret; + } + } + + // create the subuser + subuser.name = subuser_str; + + if (op_state.has_subuser_perm()) + subuser.perm_mask = op_state.get_subuser_perm(); + + // insert the subuser into user info + subuser_pair.second = subuser; + subuser_map->insert(subuser_pair); + + // attempt to save the subuser + if (!defer_user_update) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWSubUserPool::add(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return add(op_state, err_msg, false); +} + +int RGWSubUserPool::add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + std::string subprocess_msg; + int ret; + int32_t key_type = op_state.get_key_type(); + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + if (key_type == KEY_TYPE_S3 && op_state.get_access_key().empty()) { + op_state.set_gen_access(); + } + + if (op_state.get_secret_key().empty()) { + op_state.set_gen_secret(); + } + + ret = execute_add(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWSubUserPool::execute_remove(RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update) +{ + int 
ret = 0; + std::string subprocess_msg; + + std::string subuser_str = op_state.get_subuser(); + + map::iterator siter; + siter = subuser_map->find(subuser_str); + if (siter == subuser_map->end()){ + set_err_msg(err_msg, "subuser not found: " + subuser_str); + return -ERR_NO_SUCH_SUBUSER; + } + if (!op_state.has_existing_subuser()) { + set_err_msg(err_msg, "subuser not found: " + subuser_str); + return -ERR_NO_SUCH_SUBUSER; + } + + // always purge all associate keys + user->keys.remove_subuser_keys(op_state, &subprocess_msg, true); + + // remove the subuser from the user info + subuser_map->erase(siter); + + // attempt to save the subuser + if (!defer_user_update) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWSubUserPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return remove(op_state, err_msg, false); +} + +int RGWSubUserPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_remove(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove subuser, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWSubUserPool::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + int ret = 0; + std::string subprocess_msg; + std::map::iterator siter; + std::pair subuser_pair; + + std::string subuser_str = op_state.get_subuser(); + RGWSubUser subuser; + + if (!op_state.has_existing_subuser()) { + set_err_msg(err_msg, "subuser does not exist"); + return -ERR_NO_SUCH_SUBUSER; + } + + subuser_pair.first = subuser_str; + + siter = subuser_map->find(subuser_str); + subuser = siter->second; + + if (op_state.has_key_op()) { + ret = user->keys.add(op_state, 
&subprocess_msg, true); + if (ret < 0) { + set_err_msg(err_msg, "unable to create subuser keys, " + subprocess_msg); + return ret; + } + } + + if (op_state.has_subuser_perm()) + subuser.perm_mask = op_state.get_subuser_perm(); + + subuser_pair.second = subuser; + + subuser_map->erase(siter); + subuser_map->insert(subuser_pair); + + // attempt to save the subuser + if (!defer_user_update) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWSubUserPool::modify(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return RGWSubUserPool::modify(op_state, err_msg, false); +} + +int RGWSubUserPool::modify(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update) +{ + std::string subprocess_msg; + int ret; + + RGWSubUser subuser; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse request, " + subprocess_msg); + return ret; + } + + ret = execute_modify(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to modify subuser, " + subprocess_msg); + return ret; + } + + return 0; +} + +RGWUserCapPool::RGWUserCapPool(RGWUser *usr) +{ + user = usr; + caps = NULL; + caps_allowed = (user != NULL); +} + +RGWUserCapPool::~RGWUserCapPool() +{ + +} + +int RGWUserCapPool::init(RGWUserAdminOpState& op_state) +{ + if (!op_state.is_initialized()) { + caps_allowed = false; + return -EINVAL; + } + + rgw_user& uid = op_state.get_user_id(); + if (uid.compare(RGW_USER_ANON_ID) == 0) { + caps_allowed = false; + return -EACCES; + } + + caps = op_state.get_caps_obj(); + if (!caps) { + caps_allowed = false; + return -ERR_INVALID_CAP; + } + + caps_allowed = true; + + return 0; +} + +int RGWUserCapPool::add(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return add(op_state, err_msg, false); +} + +int RGWUserCapPool::add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save) +{ + int ret = 0; + std::string 
caps_str = op_state.get_caps(); + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!caps_allowed) { + set_err_msg(err_msg, "caps not allowed for this user"); + return -EACCES; + } + + if (caps_str.empty()) { + set_err_msg(err_msg, "empty user caps"); + return -ERR_INVALID_CAP; + } + + int r = caps->add_from_string(caps_str); + if (r < 0) { + set_err_msg(err_msg, "unable to add caps: " + caps_str); + return r; + } + + if (!defer_save) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserCapPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + return remove(op_state, err_msg, false); +} + +int RGWUserCapPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save) +{ + int ret = 0; + + std::string caps_str = op_state.get_caps(); + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!caps_allowed) { + set_err_msg(err_msg, "caps not allowed for this user"); + return -EACCES; + } + + if (caps_str.empty()) { + set_err_msg(err_msg, "empty user caps"); + return -ERR_INVALID_CAP; + } + + int r = caps->remove_from_string(caps_str); + if (r < 0) { + set_err_msg(err_msg, "unable to remove caps: " + caps_str); + return r; + } + + if (!defer_save) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + +RGWUser::RGWUser() : store(NULL), info_stored(false), caps(this), keys(this), subusers(this) +{ + init_default(); +} + +int RGWUser::init(RGWRados *storage, RGWUserAdminOpState& op_state) +{ + init_default(); + int ret = init_storage(storage); + if (ret < 0) + return ret; + + ret = init(op_state); + if (ret < 0) + return ret; + + return 0; +} + +RGWUser::~RGWUser() +{ +} + +void RGWUser::init_default() +{ + // use anonymous user info as a placeholder + rgw_get_anon_user(old_info); + user_id = RGW_USER_ANON_ID; + + 
clear_populated(); +} + +int RGWUser::init_storage(RGWRados *storage) +{ + if (!storage) { + return -EINVAL; + } + + store = storage; + + clear_populated(); + + /* API wrappers */ + keys = RGWAccessKeyPool(this); + caps = RGWUserCapPool(this); + subusers = RGWSubUserPool(this); + + return 0; +} + +int RGWUser::init(RGWUserAdminOpState& op_state) +{ + bool found = false; + std::string swift_user; + user_id = op_state.get_user_id(); + std::string user_email = op_state.get_user_email(); + std::string access_key = op_state.get_access_key(); + std::string subuser = op_state.get_subuser(); + + int key_type = op_state.get_key_type(); + if (key_type == KEY_TYPE_SWIFT) { + swift_user = op_state.get_access_key(); + access_key.clear(); + } + + RGWUserInfo user_info; + + clear_populated(); + + if (user_id.empty() && !subuser.empty()) { + size_t pos = subuser.find(':'); + if (pos != string::npos) { + user_id = subuser.substr(0, pos); + op_state.set_user_id(user_id); + } + } + + if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) { + found = (rgw_get_user_info_by_uid(store, user_id, user_info, &op_state.objv) >= 0); + op_state.found_by_uid = found; + } + if (store->ctx()->_conf.get_val("rgw_user_unique_email")) { + if (!user_email.empty() && !found) { + found = (rgw_get_user_info_by_email(store, user_email, user_info, &op_state.objv) >= 0); + op_state.found_by_email = found; + } + } + if (!swift_user.empty() && !found) { + found = (rgw_get_user_info_by_swift(store, swift_user, user_info, &op_state.objv) >= 0); + op_state.found_by_key = found; + } + if (!access_key.empty() && !found) { + found = (rgw_get_user_info_by_access_key(store, access_key, user_info, &op_state.objv) >= 0); + op_state.found_by_key = found; + } + + op_state.set_existing_user(found); + if (found) { + op_state.set_user_info(user_info); + op_state.set_populated(); + + old_info = user_info; + set_populated(); + } + + if (user_id.empty()) { + user_id = user_info.user_id; + } + 
op_state.set_initialized(); + + // this may have been called by a helper object + int ret = init_members(op_state); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::init_members(RGWUserAdminOpState& op_state) +{ + int ret = 0; + + ret = keys.init(op_state); + if (ret < 0) + return ret; + + ret = subusers.init(op_state); + if (ret < 0) + return ret; + + ret = caps.init(op_state); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::update(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + int ret; + std::string subprocess_msg; + RGWUserInfo user_info = op_state.get_user_info(); + + if (!store) { + set_err_msg(err_msg, "couldn't initialize storage"); + return -EINVAL; + } + + if (is_populated()) { + ret = rgw_store_user_info(store, user_info, &old_info, &op_state.objv, real_time(), false); + if (ret < 0) { + set_err_msg(err_msg, "unable to store user info"); + return ret; + } + + ret = remove_old_indexes(store, old_info, user_info, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove old user info, " + subprocess_msg); + return ret; + } + } else { + ret = rgw_store_user_info(store, user_info, NULL, &op_state.objv, real_time(), false); + if (ret < 0) { + set_err_msg(err_msg, "unable to store user info"); + return ret; + } + } + + old_info = user_info; + set_populated(); + + return 0; +} + +int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + bool same_id; + bool populated; + rgw_user& op_id = op_state.get_user_id(); + + RGWUserInfo user_info; + + same_id = (user_id.compare(op_id) == 0); + populated = is_populated(); + + if (op_id.compare(RGW_USER_ANON_ID) == 0) { + set_err_msg(err_msg, "unable to perform operations on the anonymous user"); + return -EINVAL; + } + + if (populated && !same_id) { + set_err_msg(err_msg, "user id mismatch, operation id: " + op_id.to_str() + + " does not match: " + user_id.to_str()); + + return -EINVAL; + } + + int ret = rgw_validate_tenant_name(op_id.tenant); + 
if (ret) { + set_err_msg(err_msg, + "invalid tenant only alphanumeric and _ characters are allowed"); + return ret; + } + + //set key type when it not set or set by context + if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) { + op_state.set_key_type(KEY_TYPE_S3); + op_state.key_type_setbycontext = true; + } + + return 0; +} + +int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string subprocess_msg; + int ret = 0; + bool defer_user_update = true; + + RGWUserInfo user_info; + + rgw_user& uid = op_state.get_user_id(); + std::string user_email = op_state.get_user_email(); + std::string display_name = op_state.get_display_name(); + + // fail if the user exists already + if (op_state.has_existing_user()) { + if (!op_state.exclusive && + (user_email.empty() || + boost::iequals(user_email, old_info.user_email)) && + old_info.display_name == display_name) { + return execute_modify(op_state, err_msg); + } + + if (op_state.found_by_email) { + set_err_msg(err_msg, "email: " + user_email + + " is the email address an existing user"); + ret = -ERR_EMAIL_EXIST; + } else if (op_state.found_by_key) { + set_err_msg(err_msg, "duplicate key provided"); + ret = -ERR_KEY_EXIST; + } else { + set_err_msg(err_msg, "user: " + op_state.user_id.to_str() + " exists"); + ret = -EEXIST; + } + return ret; + } + + // fail if the user_info has already been populated + if (op_state.is_populated()) { + set_err_msg(err_msg, "cannot overwrite already populated user"); + return -EEXIST; + } + + // fail if the display name was not included + if (display_name.empty()) { + set_err_msg(err_msg, "no display name specified"); + return -EINVAL; + } + + + // set the user info + user_id = uid; + user_info.user_id = user_id; + user_info.display_name = display_name; + user_info.type = TYPE_RGW; + + if (!user_email.empty()) + user_info.user_email = user_email; + + CephContext *cct = store->ctx(); + if (op_state.max_buckets_specified) { + 
user_info.max_buckets = op_state.get_max_buckets(); + } else { + user_info.max_buckets = + cct->_conf.get_val("rgw_user_max_buckets"); + } + + user_info.suspended = op_state.get_suspension_status(); + user_info.admin = op_state.admin; + user_info.system = op_state.system; + + if (op_state.op_mask_specified) + user_info.op_mask = op_state.get_op_mask(); + + if (op_state.has_bucket_quota()) { + user_info.bucket_quota = op_state.get_bucket_quota(); + } else { + rgw_apply_default_bucket_quota(user_info.bucket_quota, cct->_conf); + } + + if (op_state.temp_url_key_specified) { + map::iterator iter; + for (iter = op_state.temp_url_keys.begin(); + iter != op_state.temp_url_keys.end(); ++iter) { + user_info.temp_url_keys[iter->first] = iter->second; + } + } + + if (op_state.has_user_quota()) { + user_info.user_quota = op_state.get_user_quota(); + } else { + rgw_apply_default_user_quota(user_info.user_quota, cct->_conf); + } + + // update the request + op_state.set_user_info(user_info); + op_state.set_populated(); + + // update the helper objects + ret = init_members(op_state); + if (ret < 0) { + set_err_msg(err_msg, "unable to initialize user"); + return ret; + } + + // see if we need to add an access key + if (op_state.has_key_op()) { + ret = keys.add(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to create access key, " + subprocess_msg); + return ret; + } + } + + // see if we need to add some caps + if (op_state.has_caps_op()) { + ret = caps.add(op_state, &subprocess_msg, defer_user_update); + if (ret < 0) { + set_err_msg(err_msg, "unable to add user capabilities, " + subprocess_msg); + return ret; + } + } + + ret = update(op_state, err_msg); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::add(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse parameters, " + 
subprocess_msg); + return ret; + } + + ret = execute_add(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to create user, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWUser::execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + int ret; + + bool purge_data = op_state.will_purge_data(); + rgw_user& uid = op_state.get_user_id(); + RGWUserInfo user_info = op_state.get_user_info(); + + if (!op_state.has_existing_user()) { + set_err_msg(err_msg, "user does not exist"); + return -ENOENT; + } + + bool is_truncated = false; + string marker; + CephContext *cct = store->ctx(); + size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk; + do { + RGWUserBuckets buckets; + ret = rgw_read_user_buckets(store, uid, buckets, marker, string(), + max_buckets, false, &is_truncated); + if (ret < 0) { + set_err_msg(err_msg, "unable to read user bucket info"); + return ret; + } + + map& m = buckets.get_buckets(); + if (!m.empty() && !purge_data) { + set_err_msg(err_msg, "must specify purge data to remove user with buckets"); + return -EEXIST; // change to code that maps to 409: conflict + } + + std::map::iterator it; + for (it = m.begin(); it != m.end(); ++it) { + ret = rgw_remove_bucket(store, ((*it).second).bucket, true); + if (ret < 0) { + set_err_msg(err_msg, "unable to delete user data"); + return ret; + } + + marker = it->first; + } + + } while (is_truncated); + + ret = rgw_delete_user(store, user_info, op_state.objv); + if (ret < 0) { + set_err_msg(err_msg, "unable to remove user from RADOS"); + return ret; + } + + op_state.clear_populated(); + clear_populated(); + + return 0; +} + +int RGWUser::remove(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg); + return ret; + } + + ret = execute_remove(op_state, &subprocess_msg); + if (ret < 
0) { + set_err_msg(err_msg, "unable to remove user, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + bool populated = op_state.is_populated(); + int ret = 0; + std::string subprocess_msg; + std::string op_email = op_state.get_user_email(); + std::string display_name = op_state.get_display_name(); + + RGWUserInfo user_info; + RGWUserInfo duplicate_check; + + // ensure that the user info has been populated or is populate-able + if (!op_state.has_existing_user() && !populated) { + set_err_msg(err_msg, "user not found"); + return -ENOENT; + } + + // if the user hasn't already been populated...attempt to + if (!populated) { + ret = init(op_state); + if (ret < 0) { + set_err_msg(err_msg, "unable to retrieve user info"); + return ret; + } + } + + // ensure that we can modify the user's attributes + if (user_id.compare(RGW_USER_ANON_ID) == 0) { + set_err_msg(err_msg, "unable to modify anonymous user's info"); + return -EACCES; + } + + user_info = old_info; + + std::string old_email = old_info.user_email; + if (!op_email.empty()) { + // make sure we are not adding a duplicate email + if (old_email.compare(op_email) != 0) { + ret = rgw_get_user_info_by_email(store, op_email, duplicate_check); + if (ret >= 0 && duplicate_check.user_id.compare(user_id) != 0) { + set_err_msg(err_msg, "cannot add duplicate email"); + return -ERR_EMAIL_EXIST; + } + } + user_info.user_email = op_email; + } else if (op_email.empty() && op_state.user_email_specified) { + + ldout(store->ctx(), 10) << "removing email index: " << user_info.user_email << dendl; + ret = rgw_remove_email_index(store, user_info.user_email); + if (ret < 0 && ret != -ENOENT) { + ldout(store->ctx(), 0) << "ERROR: could not remove " << user_info.user_id << " index (err=" << ret << ")" << dendl; + return ret; + } + user_info.user_email = ""; + } + + // update the remaining user info + if (!display_name.empty()) + 
user_info.display_name = display_name; + + if (op_state.max_buckets_specified) + user_info.max_buckets = op_state.get_max_buckets(); + + if (op_state.admin_specified) + user_info.admin = op_state.admin; + + if (op_state.system_specified) + user_info.system = op_state.system; + + if (op_state.temp_url_key_specified) { + map::iterator iter; + for (iter = op_state.temp_url_keys.begin(); + iter != op_state.temp_url_keys.end(); ++iter) { + user_info.temp_url_keys[iter->first] = iter->second; + } + } + + if (op_state.op_mask_specified) + user_info.op_mask = op_state.get_op_mask(); + + if (op_state.has_bucket_quota()) + user_info.bucket_quota = op_state.get_bucket_quota(); + + if (op_state.has_user_quota()) + user_info.user_quota = op_state.get_user_quota(); + + if (op_state.has_suspension_op()) { + __u8 suspended = op_state.get_suspension_status(); + user_info.suspended = suspended; + + RGWUserBuckets buckets; + + if (user_id.empty()) { + set_err_msg(err_msg, "empty user id passed...aborting"); + return -EINVAL; + } + + bool is_truncated = false; + string marker; + CephContext *cct = store->ctx(); + size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk; + do { + ret = rgw_read_user_buckets(store, user_id, buckets, marker, string(), + max_buckets, false, &is_truncated); + if (ret < 0) { + set_err_msg(err_msg, "could not get buckets for uid: " + user_id.to_str()); + return ret; + } + + map& m = buckets.get_buckets(); + map::iterator iter; + + vector bucket_names; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt obj = iter->second; + bucket_names.push_back(obj.bucket); + + marker = iter->first; + } + + ret = store->set_buckets_enabled(bucket_names, !suspended); + if (ret < 0) { + set_err_msg(err_msg, "failed to modify bucket"); + return ret; + } + + } while (is_truncated); + } + + if (op_state.mfa_ids_specified) { + user_info.mfa_ids = op_state.mfa_ids; + } + op_state.set_user_info(user_info); + + // if we're supposed to modify keys, do so + if 
(op_state.has_key_op()) { + ret = keys.add(op_state, &subprocess_msg, true); + if (ret < 0) { + set_err_msg(err_msg, "unable to create or modify keys, " + subprocess_msg); + return ret; + } + } + + ret = update(op_state, err_msg); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUser::modify(RGWUserAdminOpState& op_state, std::string *err_msg) +{ + std::string subprocess_msg; + int ret; + + ret = check_op(op_state, &subprocess_msg); + if (ret < 0) { + if (is_populated() && (user_id.compare(op_state.get_user_id()) != 0)) { + set_err_msg(err_msg, "unable to create user " + user_id.to_str() + + " because user id " + op_state.get_user_id().to_str() + + " already exists with email " + + op_state.get_user_email()); + } else { + set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg); + } + return ret; + } + + ret = execute_modify(op_state, &subprocess_msg); + if (ret < 0) { + set_err_msg(err_msg, "unable to modify user, " + subprocess_msg); + return ret; + } + + return 0; +} + +int RGWUser::info(RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, std::string *err_msg) +{ + int ret = init(op_state); + if (ret < 0) { + set_err_msg(err_msg, "unable to fetch user info"); + return ret; + } + + fetched_info = op_state.get_user_info(); + + return 0; +} + +int RGWUser::info(RGWUserInfo& fetched_info, std::string *err_msg) +{ + if (!is_populated()) { + set_err_msg(err_msg, "no user info saved"); + return -EINVAL; + } + + fetched_info = old_info; + + return 0; +} + +int RGWUser::list(RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher) +{ + Formatter *formatter = flusher.get_formatter(); + void *handle = nullptr; + std::string metadata_key = "user"; + if (op_state.max_entries > 1000) { + op_state.max_entries = 1000; + } + + int ret = store->meta_mgr->list_keys_init(metadata_key, op_state.marker, &handle); + if (ret < 0) { + return ret; + } + + bool truncated = false; + uint64_t count = 0; + uint64_t left = 0; + flusher.start(0); + + // open the 
result object section + formatter->open_object_section("result"); + + // open the user id list array section + formatter->open_array_section("keys"); + do { + std::list keys; + left = op_state.max_entries - count; + ret = store->meta_mgr->list_keys_next(handle, left, keys, &truncated); + if (ret < 0 && ret != -ENOENT) { + return ret; + } if (ret != -ENOENT) { + for (std::list::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + formatter->dump_string("key", *iter); + ++count; + } + } + } while (truncated && left > 0); + // close user id list section + formatter->close_section(); + + formatter->dump_bool("truncated", truncated); + formatter->dump_int("count", count); + if (truncated) { + formatter->dump_string("marker", store->meta_mgr->get_marker(handle)); + } + + // close result object section + formatter->close_section(); + + store->meta_mgr->list_keys_complete(handle); + + flusher.flush(); + return 0; +} + +int RGWUserAdminOp_User::list(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUser user; + + int ret = user.init_storage(store); + if (ret < 0) + return ret; + + ret = user.list(op_state, flusher); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserAdminOp_User::info(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (op_state.sync_stats) { + ret = rgw_user_sync_all_stats(store, info.user_id); + if (ret < 0) { + return ret; + } + } + + RGWStorageStats stats; + RGWStorageStats *arg_stats = NULL; + if (op_state.fetch_stats) { + int ret = store->get_user_stats(info.user_id, stats); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + + arg_stats = &stats; + } + + if (formatter) { + 
flusher.start(0); + + dump_user_info(formatter, info, arg_stats); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_User::create(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.add(op_state, NULL); + if (ret < 0) { + if (ret == -EEXIST) + ret = -ERR_USER_EXIST; + return ret; + } + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_user_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_User::modify(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + Formatter *formatter = flusher.get_formatter(); + + ret = user.modify(op_state, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = -ERR_NO_SUCH_USER; + return ret; + } + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_user_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_User::remove(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + + ret = user.remove(op_state, NULL); + + if (ret == -ENOENT) + ret = -ERR_NO_SUCH_USER; + return ret; +} + +int RGWUserAdminOp_Subuser::create(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.subusers.add(op_state, NULL); + if (ret < 0) + 
return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_subusers_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_Subuser::modify(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.subusers.modify(op_state, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + dump_subusers_info(formatter, info); + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_Subuser::remove(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + ret = user.subusers.remove(op_state, NULL); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserAdminOp_Key::create(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.keys.add(op_state, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + int key_type = op_state.get_key_type(); + + if (key_type == KEY_TYPE_SWIFT) + dump_swift_keys_info(formatter, info); + + else if (key_type == KEY_TYPE_S3) + dump_access_keys_info(formatter, info); + + flusher.flush(); + } + + return 0; +} + +int RGWUserAdminOp_Key::remove(RGWRados *store, 
RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + + ret = user.keys.remove(op_state, NULL); + if (ret < 0) + return ret; + + return 0; +} + +int RGWUserAdminOp_Caps::add(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.caps.add(op_state, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + info.caps.dump(formatter); + flusher.flush(); + } + + return 0; +} + + +int RGWUserAdminOp_Caps::remove(RGWRados *store, RGWUserAdminOpState& op_state, + RGWFormatterFlusher& flusher) +{ + RGWUserInfo info; + RGWUser user; + int ret = user.init(store, op_state); + if (ret < 0) + return ret; + + if (!op_state.has_existing_user()) + return -ERR_NO_SUCH_USER; + + Formatter *formatter = flusher.get_formatter(); + + ret = user.caps.remove(op_state, NULL); + if (ret < 0) + return ret; + + ret = user.info(info, NULL); + if (ret < 0) + return ret; + + if (formatter) { + flusher.start(0); + + info.caps.dump(formatter); + flusher.flush(); + } + + return 0; +} + +struct RGWUserCompleteInfo { + RGWUserInfo info; + map attrs; + bool has_attrs; + + RGWUserCompleteInfo() + : has_attrs(false) + {} + + void dump(Formatter * const f) const { + info.dump(f); + encode_json("attrs", attrs, f); + } + + void decode_json(JSONObj *obj) { + decode_json_obj(info, obj); + has_attrs = JSONDecoder::decode_json("attrs", attrs, obj); + } +}; + +class RGWUserMetadataObject : public RGWMetadataObject { + RGWUserCompleteInfo uci; +public: + RGWUserMetadataObject(const 
RGWUserCompleteInfo& _uci, obj_version& v, real_time m) + : uci(_uci) { + objv = v; + mtime = m; + } + + void dump(Formatter *f) const override { + uci.dump(f); + } +}; + +class RGWUserMetadataHandler : public RGWMetadataHandler { +public: + string get_type() override { return "user"; } + + int get(RGWRados *store, string& entry, RGWMetadataObject **obj) override { + RGWUserCompleteInfo uci; + RGWObjVersionTracker objv_tracker; + real_time mtime; + + rgw_user uid(entry); + + int ret = rgw_get_user_info_by_uid(store, uid, uci.info, &objv_tracker, + &mtime, NULL, &uci.attrs); + if (ret < 0) { + return ret; + } + + RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime); + *obj = mdo; + + return 0; + } + + int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, + real_time mtime, JSONObj *obj, sync_type_t sync_mode) override { + RGWUserCompleteInfo uci; + + try { + decode_json_obj(uci, obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + + map *pattrs = NULL; + if (uci.has_attrs) { + pattrs = &uci.attrs; + } + + rgw_user uid(entry); + + RGWUserInfo old_info; + real_time orig_mtime; + int ret = rgw_get_user_info_by_uid(store, uid, old_info, &objv_tracker, &orig_mtime); + if (ret < 0 && ret != -ENOENT) + return ret; + + // are we actually going to perform this put, or is it too old? 
+ if (ret != -ENOENT && + !check_versions(objv_tracker.read_version, orig_mtime, + objv_tracker.write_version, mtime, sync_mode)) { + return STATUS_NO_APPLY; + } + + ret = rgw_store_user_info(store, uci.info, &old_info, &objv_tracker, mtime, false, pattrs); + if (ret < 0) { + return ret; + } + + return STATUS_APPLIED; + } + + struct list_keys_info { + RGWRados *store; + RGWListRawObjsCtx ctx; + }; + + int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override { + RGWUserInfo info; + + rgw_user uid(entry); + + int ret = rgw_get_user_info_by_uid(store, uid, info, &objv_tracker); + if (ret < 0) + return ret; + + return rgw_delete_user(store, info, objv_tracker); + } + + void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override { + oid = key; + pool = store->svc.zone->get_zone_params().user_uid_pool; + } + + int list_keys_init(RGWRados *store, const string& marker, void **phandle) override + { + auto info = std::make_unique(); + + info->store = store; + + int ret = store->list_raw_objects_init(store->svc.zone->get_zone_params().user_uid_pool, marker, + &info->ctx); + if (ret < 0) { + return ret; + } + + *phandle = (void *)info.release(); + + return 0; + } + + int list_keys_next(void *handle, int max, list& keys, bool *truncated) override { + list_keys_info *info = static_cast(handle); + + string no_filter; + + keys.clear(); + + RGWRados *store = info->store; + + list unfiltered_keys; + + int ret = store->list_raw_objects_next(no_filter, max, info->ctx, + unfiltered_keys, truncated); + if (ret < 0 && ret != -ENOENT) + return ret; + if (ret == -ENOENT) { + if (truncated) + *truncated = false; + return 0; + } + + // now filter out the buckets entries + list::iterator iter; + for (iter = unfiltered_keys.begin(); iter != unfiltered_keys.end(); ++iter) { + string& k = *iter; + + if (k.find(".buckets") == string::npos) { + keys.push_back(k); + } + } + + return 0; + } + + void list_keys_complete(void *handle) 
override { + list_keys_info *info = static_cast(handle); + delete info; + } + + string get_marker(void *handle) override { + list_keys_info *info = static_cast(handle); + return info->store->list_raw_objs_get_cursor(info->ctx); + } +}; + +void rgw_user_init(RGWRados *store) +{ + uinfo_cache.init(store->svc.cache); + + user_meta_handler = new RGWUserMetadataHandler; + store->meta_mgr->register_handler(user_meta_handler); +} diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h new file mode 100644 index 00000000..942648b5 --- /dev/null +++ b/src/rgw/rgw_user.h @@ -0,0 +1,774 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_USER_H +#define CEPH_RGW_USER_H + +#include +#include +#include "include/ceph_assert.h" + +#include "include/types.h" +#include "rgw_common.h" +#include "rgw_tools.h" + +#include "rgw_rados.h" + +#include "rgw_string.h" + +#include "common/Formatter.h" +#include "rgw_formats.h" + +#define RGW_USER_ANON_ID "anonymous" + +#define SECRET_KEY_LEN 40 +#define PUBLIC_ID_LEN 20 +#define RAND_SUBUSER_LEN 5 + +#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/" + +/** + * A string wrapper that includes encode/decode functions + * for easily accessing a UID in all forms + */ +struct RGWUID +{ + rgw_user user_id; + void encode(bufferlist& bl) const { + string s; + user_id.to_str(s); + using ceph::encode; + encode(s, bl); + } + void decode(bufferlist::const_iterator& bl) { + string s; + using ceph::decode; + decode(s, bl); + user_id.from_str(s); + } +}; +WRITE_CLASS_ENCODER(RGWUID) + +extern int rgw_user_sync_all_stats(RGWRados *store, const rgw_user& user_id); +extern int rgw_user_get_all_buckets_stats(RGWRados *store, const rgw_user& user_id, map&buckets_usage_map); + +/** + * Get the anonymous (ie, unauthenticated) user info. + */ +extern void rgw_get_anon_user(RGWUserInfo& info); + +/** + * Save the given user information to storage. 
+ * Returns: 0 on success, -ERR# on failure. + */ +extern int rgw_store_user_info(RGWRados *store, + RGWUserInfo& info, + RGWUserInfo *old_info, + RGWObjVersionTracker *objv_tracker, + real_time mtime, + bool exclusive, + map *pattrs = NULL); + +/** + * Given an user_id, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +extern int rgw_get_user_info_by_uid(RGWRados *store, + const rgw_user& user_id, + RGWUserInfo& info, + RGWObjVersionTracker *objv_tracker = NULL, + real_time *pmtime = NULL, + rgw_cache_entry_info *cache_info = NULL, + map *pattrs = NULL); +/** + * Given an email, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +extern int rgw_get_user_info_by_email(RGWRados *store, string& email, RGWUserInfo& info, + RGWObjVersionTracker *objv_tracker = NULL, real_time *pmtime = NULL); +/** + * Given an swift username, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +extern int rgw_get_user_info_by_swift(RGWRados *store, + const string& swift_name, + RGWUserInfo& info, /* out */ + RGWObjVersionTracker *objv_tracker = nullptr, + real_time *pmtime = nullptr); +/** + * Given an access key, finds the user info associated with it. + * returns: 0 on success, -ERR# on failure (including nonexistence) + */ +extern int rgw_get_user_info_by_access_key(RGWRados* store, + const std::string& access_key, + RGWUserInfo& info, + RGWObjVersionTracker* objv_tracker = nullptr, + real_time* pmtime = nullptr); +/** + * Get all the custom metadata stored for user specified in @user_id + * and put it into @attrs. + * Returns: 0 on success, -ERR# on failure. + */ +extern int rgw_get_user_attrs_by_uid(RGWRados *store, + const rgw_user& user_id, + map& attrs, + RGWObjVersionTracker *objv_tracker = NULL); +/** + * Given an RGWUserInfo, deletes the user and its bucket ACLs. 
+ */ +extern int rgw_delete_user(RGWRados *store, RGWUserInfo& user, RGWObjVersionTracker& objv_tracker); + +/* + * remove the different indexes + */ +extern int rgw_remove_key_index(RGWRados *store, RGWAccessKey& access_key); +extern int rgw_remove_uid_index(RGWRados *store, rgw_user& uid); +extern int rgw_remove_email_index(RGWRados *store, string& email); +extern int rgw_remove_swift_name_index(RGWRados *store, string& swift_name); + +extern void rgw_perm_to_str(uint32_t mask, char *buf, int len); +extern uint32_t rgw_str_to_perm(const char *str); + +extern int rgw_validate_tenant_name(const string& t); + +enum ObjectKeyType { + KEY_TYPE_SWIFT, + KEY_TYPE_S3, + KEY_TYPE_UNDEFINED +}; + +enum RGWKeyPoolOp { + GENERATE_KEY, + MODIFY_KEY +}; + +enum RGWUserId { + RGW_USER_ID, + RGW_SWIFT_USERNAME, + RGW_USER_EMAIL, + RGW_ACCESS_KEY, +}; + +/* + * An RGWUser class along with supporting classes created + * to support the creation of an RESTful administrative API + */ +struct RGWUserAdminOpState { + // user attributes + RGWUserInfo info; + rgw_user user_id; + std::string user_email; + std::string display_name; + int32_t max_buckets; + __u8 suspended; + __u8 admin; + __u8 system; + __u8 exclusive; + __u8 fetch_stats; + __u8 sync_stats; + std::string caps; + RGWObjVersionTracker objv; + uint32_t op_mask; + map temp_url_keys; + + // subuser attributes + std::string subuser; + uint32_t perm_mask; + + // key_attributes + std::string id; // access key + std::string key; // secret key + int32_t key_type; + + std::set mfa_ids; + + // operation attributes + bool existing_user; + bool existing_key; + bool existing_subuser; + bool existing_email; + bool subuser_specified; + bool gen_secret; + bool gen_access; + bool gen_subuser; + bool id_specified; + bool key_specified; + bool type_specified; + bool key_type_setbycontext; // key type set by user or subuser context + bool purge_data; + bool purge_keys; + bool display_name_specified; + bool user_email_specified; + bool 
max_buckets_specified; + bool perm_specified; + bool op_mask_specified; + bool caps_specified; + bool suspension_op; + bool admin_specified = false; + bool system_specified; + bool key_op; + bool temp_url_key_specified; + bool found_by_uid; + bool found_by_email; + bool found_by_key; + bool mfa_ids_specified; + + // req parameters + bool populated; + bool initialized; + bool key_params_checked; + bool subuser_params_checked; + bool user_params_checked; + + bool bucket_quota_specified; + bool user_quota_specified; + + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + + // req parameters for listing user + std::string marker; + uint32_t max_entries; + + void set_access_key(const std::string& access_key) { + if (access_key.empty()) + return; + + id = access_key; + id_specified = true; + gen_access = false; + key_op = true; + } + + void set_secret_key(const std::string& secret_key) { + if (secret_key.empty()) + return; + + key = secret_key; + key_specified = true; + gen_secret = false; + key_op = true; + } + + void set_user_id(rgw_user& id) { + if (id.empty()) + return; + + user_id = id; + } + + void set_user_email(std::string& email) { + /* always lowercase email address */ + boost::algorithm::to_lower(email); + user_email = email; + user_email_specified = true; + } + + void set_display_name(const std::string& name) { + if (name.empty()) + return; + + display_name = name; + display_name_specified = true; + } + + void set_subuser(std::string& _subuser) { + if (_subuser.empty()) + return; + + size_t pos = _subuser.find(":"); + if (pos != string::npos) { + rgw_user tmp_id; + tmp_id.from_str(_subuser.substr(0, pos)); + if (tmp_id.tenant.empty()) { + user_id.id = tmp_id.id; + } else { + user_id = tmp_id; + } + subuser = _subuser.substr(pos+1); + } else { + subuser = _subuser; + } + + subuser_specified = true; + } + + void set_caps(const std::string& _caps) { + if (_caps.empty()) + return; + + caps = _caps; + caps_specified = true; + } + + void set_perm(uint32_t perm) 
{ + perm_mask = perm; + perm_specified = true; + } + + void set_op_mask(uint32_t mask) { + op_mask = mask; + op_mask_specified = true; + } + + void set_temp_url_key(const string& key, int index) { + temp_url_keys[index] = key; + temp_url_key_specified = true; + } + + void set_key_type(int32_t type) { + key_type = type; + type_specified = true; + } + + void set_suspension(__u8 is_suspended) { + suspended = is_suspended; + suspension_op = true; + } + + void set_admin(__u8 is_admin) { + admin = is_admin; + admin_specified = true; + } + + void set_system(__u8 is_system) { + system = is_system; + system_specified = true; + } + + void set_exclusive(__u8 is_exclusive) { + exclusive = is_exclusive; + } + + void set_fetch_stats(__u8 is_fetch_stats) { + fetch_stats = is_fetch_stats; + } + + void set_sync_stats(__u8 is_sync_stats) { + sync_stats = is_sync_stats; + } + + void set_user_info(RGWUserInfo& user_info) { + user_id = user_info.user_id; + info = user_info; + } + + void set_max_buckets(int32_t mb) { + max_buckets = mb; + max_buckets_specified = true; + } + + void set_gen_access() { + gen_access = true; + key_op = true; + } + + void set_gen_secret() { + gen_secret = true; + key_op = true; + } + + void set_generate_key() { + if (id.empty()) + gen_access = true; + if (key.empty()) + gen_secret = true; + key_op = true; + } + + void clear_generate_key() { + gen_access = false; + gen_secret = false; + } + + void set_purge_keys() { + purge_keys = true; + key_op = true; + } + + void set_bucket_quota(RGWQuotaInfo& quota) { + bucket_quota = quota; + bucket_quota_specified = true; + } + + void set_user_quota(RGWQuotaInfo& quota) { + user_quota = quota; + user_quota_specified = true; + } + + void set_mfa_ids(const std::set& ids) { + mfa_ids = ids; + mfa_ids_specified = true; + } + + bool is_populated() { return populated; } + bool is_initialized() { return initialized; } + bool has_existing_user() { return existing_user; } + bool has_existing_key() { return existing_key; } + bool 
has_existing_subuser() { return existing_subuser; } + bool has_existing_email() { return existing_email; } + bool has_subuser() { return subuser_specified; } + bool has_key_op() { return key_op; } + bool has_caps_op() { return caps_specified; } + bool has_suspension_op() { return suspension_op; } + bool has_subuser_perm() { return perm_specified; } + bool has_op_mask() { return op_mask_specified; } + bool will_gen_access() { return gen_access; } + bool will_gen_secret() { return gen_secret; } + bool will_gen_subuser() { return gen_subuser; } + bool will_purge_keys() { return purge_keys; } + bool will_purge_data() { return purge_data; } + bool will_generate_subuser() { return gen_subuser; } + bool has_bucket_quota() { return bucket_quota_specified; } + bool has_user_quota() { return user_quota_specified; } + void set_populated() { populated = true; } + void clear_populated() { populated = false; } + void set_initialized() { initialized = true; } + void set_existing_user(bool flag) { existing_user = flag; } + void set_existing_key(bool flag) { existing_key = flag; } + void set_existing_subuser(bool flag) { existing_subuser = flag; } + void set_existing_email(bool flag) { existing_email = flag; } + void set_purge_data(bool flag) { purge_data = flag; } + void set_generate_subuser(bool flag) { gen_subuser = flag; } + __u8 get_suspension_status() { return suspended; } + int32_t get_key_type() {return key_type; } + uint32_t get_subuser_perm() { return perm_mask; } + int32_t get_max_buckets() { return max_buckets; } + uint32_t get_op_mask() { return op_mask; } + RGWQuotaInfo& get_bucket_quota() { return bucket_quota; } + RGWQuotaInfo& get_user_quota() { return user_quota; } + set& get_mfa_ids() { return mfa_ids; } + + rgw_user& get_user_id() { return user_id; } + std::string get_subuser() { return subuser; } + std::string get_access_key() { return id; } + std::string get_secret_key() { return key; } + std::string get_caps() { return caps; } + std::string get_user_email() { 
return user_email; } + std::string get_display_name() { return display_name; } + map& get_temp_url_keys() { return temp_url_keys; } + + RGWUserInfo& get_user_info() { return info; } + + map *get_swift_keys() { return &info.swift_keys; } + map *get_access_keys() { return &info.access_keys; } + map *get_subusers() { return &info.subusers; } + + RGWUserCaps *get_caps_obj() { return &info.caps; } + + std::string build_default_swift_kid() { + if (user_id.empty() || subuser.empty()) + return ""; + + std::string kid; + user_id.to_str(kid); + kid.append(":"); + kid.append(subuser); + + return kid; + } + + std::string generate_subuser() { + if (user_id.empty()) + return ""; + + std::string generated_subuser; + user_id.to_str(generated_subuser); + std::string rand_suffix; + + int sub_buf_size = RAND_SUBUSER_LEN + 1; + char sub_buf[RAND_SUBUSER_LEN + 1]; + + gen_rand_alphanumeric_upper(g_ceph_context, sub_buf, sub_buf_size); + + rand_suffix = sub_buf; + if (rand_suffix.empty()) + return ""; + + generated_subuser.append(rand_suffix); + subuser = generated_subuser; + + return generated_subuser; + } + + RGWUserAdminOpState() : user_id(RGW_USER_ANON_ID) + { + max_buckets = RGW_DEFAULT_MAX_BUCKETS; + key_type = -1; + perm_mask = RGW_PERM_NONE; + suspended = 0; + admin = 0; + system = 0; + exclusive = 0; + fetch_stats = 0; + op_mask = 0; + + existing_user = false; + existing_key = false; + existing_subuser = false; + existing_email = false; + subuser_specified = false; + caps_specified = false; + purge_keys = false; + gen_secret = false; + gen_access = false; + gen_subuser = false; + id_specified = false; + key_specified = false; + type_specified = false; + key_type_setbycontext = false; + purge_data = false; + display_name_specified = false; + user_email_specified = false; + max_buckets_specified = false; + perm_specified = false; + op_mask_specified = false; + suspension_op = false; + system_specified = false; + key_op = false; + populated = false; + initialized = false; + 
key_params_checked = false; + subuser_params_checked = false; + user_params_checked = false; + bucket_quota_specified = false; + temp_url_key_specified = false; + user_quota_specified = false; + found_by_uid = false; + found_by_email = false; + found_by_key = false; + mfa_ids_specified = false; + max_entries = 1000; + marker = ""; + } +}; + +class RGWUser; + +class RGWAccessKeyPool +{ + RGWUser *user; + + std::map key_type_map; + rgw_user user_id; + RGWRados *store; + + map *swift_keys; + map *access_keys; + + // we don't want to allow keys for the anonymous user or a null user + bool keys_allowed; + +private: + int create_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int generate_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int modify_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + int check_key_owner(RGWUserAdminOpState& op_state); + bool check_existing_key(RGWUserAdminOpState& op_state); + int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + /* API Contract Fulfilment */ + int execute_add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int remove_subuser_keys(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + + int add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); +public: + explicit RGWAccessKeyPool(RGWUser* usr); + ~RGWAccessKeyPool(); + + int init(RGWUserAdminOpState& op_state); + + /* API Contracted Methods */ + int add(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + friend class RGWUser; + friend class RGWSubUserPool; +}; + +class RGWSubUserPool +{ + RGWUser *user; + + rgw_user user_id; + RGWRados *store; + bool subusers_allowed; + + map 
*subuser_map; + +private: + int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + /* API Contract Fulfillment */ + int execute_add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + + int add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int modify(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); +public: + explicit RGWSubUserPool(RGWUser *user); + ~RGWSubUserPool(); + + bool exists(std::string subuser); + int init(RGWUserAdminOpState& op_state); + + /* API contracted methods */ + int add(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int modify(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + friend class RGWUser; +}; + +class RGWUserCapPool +{ + RGWUserCaps *caps; + bool caps_allowed; + RGWUser *user; + +private: + int add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + +public: + explicit RGWUserCapPool(RGWUser *user); + ~RGWUserCapPool(); + + int init(RGWUserAdminOpState& op_state); + + /* API contracted methods */ + int add(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + friend class RGWUser; +}; + +class RGWUser +{ + +private: + RGWUserInfo old_info; + RGWRados *store; + + rgw_user user_id; + bool info_stored; + + void set_populated() { info_stored = true; } + void clear_populated() { info_stored = false; } + bool is_populated() { return info_stored; } + + int check_op(RGWUserAdminOpState& req, 
std::string *err_msg); + int update(RGWUserAdminOpState& op_state, std::string *err_msg); + + void clear_members(); + void init_default(); + + /* API Contract Fulfillment */ + int execute_add(RGWUserAdminOpState& op_state, std::string *err_msg); + int execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg); + int execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg); + +public: + RGWUser(); + ~RGWUser(); + + int init(RGWRados *storage, RGWUserAdminOpState& op_state); + + int init_storage(RGWRados *storage); + int init(RGWUserAdminOpState& op_state); + int init_members(RGWUserAdminOpState& op_state); + + RGWRados *get_store() { return store; } + + /* API Contracted Members */ + RGWUserCapPool caps; + RGWAccessKeyPool keys; + RGWSubUserPool subusers; + + /* API Contracted Methods */ + int add(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + int remove(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + /* remove an already populated RGWUser */ + int remove(std::string *err_msg = NULL); + + int modify(RGWUserAdminOpState& op_state, std::string *err_msg = NULL); + + /* retrieve info from an existing user in the RGW system */ + int info(RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, std::string *err_msg = NULL); + + /* info from an already populated RGWUser */ + int info (RGWUserInfo& fetched_info, std::string *err_msg = NULL); + + /* list the existing users */ + int list(RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + friend class RGWAccessKeyPool; + friend class RGWSubUserPool; + friend class RGWUserCapPool; +}; + +/* Wrappers for admin API functionality */ + +class RGWUserAdminOp_User +{ +public: + static int list(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int info(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int create(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& 
flusher); + + static int modify(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int remove(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); +}; + +class RGWUserAdminOp_Subuser +{ +public: + static int create(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int modify(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int remove(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); +}; + +class RGWUserAdminOp_Key +{ +public: + static int create(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int remove(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); +}; + +class RGWUserAdminOp_Caps +{ +public: + static int add(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); + + static int remove(RGWRados *store, + RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher); +}; + +class RGWMetadataManager; + +extern void rgw_user_init(RGWRados *store); + +#endif diff --git a/src/rgw/rgw_web_idp.h b/src/rgw/rgw_web_idp.h new file mode 100644 index 00000000..b357338c --- /dev/null +++ b/src/rgw/rgw_web_idp.h @@ -0,0 +1,29 @@ +#ifndef CEPH_RGW_WEB_IDP_H +#define CEPH_RGW_WEB_IDP_H + +#include +#include +#include + +#include "rgw_auth.h" +#include "rgw_common.h" + +namespace rgw { +namespace web_idp { + +//WebToken contains some claims from the decoded token which are of interest to us. 
+struct WebTokenClaims { + //Subject of the token + string sub; + //Intended audience for this token + string aud; + //Issuer of this token + string iss; + //Human-readable id for the resource owner + string user_name; +}; + +}; /* namespace web_idp */ +}; /* namespace rgw */ + +#endif /* CEPH_RGW_WEB_IDP_H */ diff --git a/src/rgw/rgw_website.cc b/src/rgw/rgw_website.cc new file mode 100644 index 00000000..13a3b1de --- /dev/null +++ b/src/rgw/rgw_website.cc @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Yehuda Sadeh + * Copyright (C) 2015 Robin H. Johnson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/debug.h" +#include "common/ceph_json.h" + +#include "acconfig.h" + +#include +#include +#include +#include "include/types.h" +#include "rgw_website.h" + + + +bool RGWBWRoutingRuleCondition::check_key_condition(const string& key) { + return (key.size() >= key_prefix_equals.size() && + key.compare(0, key_prefix_equals.size(), key_prefix_equals) == 0); +} + + +void RGWBWRoutingRule::apply_rule(const string& default_protocol, const string& default_hostname, + const string& key, string *new_url, int *redirect_code) +{ + RGWRedirectInfo& redirect = redirect_info.redirect; + + string protocol = (!redirect.protocol.empty() ? redirect.protocol : default_protocol); + string hostname = (!redirect.hostname.empty() ? 
redirect.hostname : default_hostname); + + *new_url = protocol + "://" + hostname + "/"; + + if (!redirect_info.replace_key_prefix_with.empty()) { + *new_url += redirect_info.replace_key_prefix_with; + *new_url += key.substr(condition.key_prefix_equals.size()); + } else if (!redirect_info.replace_key_with.empty()) { + *new_url += redirect_info.replace_key_with; + } else { + *new_url += key; + } + + if(redirect.http_redirect_code > 0) + *redirect_code = redirect.http_redirect_code; +} + +bool RGWBWRoutingRules::check_key_and_error_code_condition(const string &key, int error_code, RGWBWRoutingRule **rule) +{ + for (list::iterator iter = rules.begin(); iter != rules.end(); ++iter) { + if (iter->check_key_condition(key) && iter->check_error_code_condition(error_code)) { + *rule = &(*iter); + return true; + } + } + return false; +} + +bool RGWBWRoutingRules::check_key_condition(const string& key, RGWBWRoutingRule **rule) +{ + for (list::iterator iter = rules.begin(); iter != rules.end(); ++iter) { + if (iter->check_key_condition(key)) { + *rule = &(*iter); + return true; + } + } + return false; +} + +bool RGWBWRoutingRules::check_error_code_condition(const int http_error_code, RGWBWRoutingRule **rule) +{ + for (list::iterator iter = rules.begin(); iter != rules.end(); ++iter) { + if (iter->check_error_code_condition(http_error_code)) { + *rule = &(*iter); + return true; + } + } + return false; +} + +bool RGWBucketWebsiteConf::should_redirect(const string& key, const int http_error_code, RGWBWRoutingRule *redirect) +{ + RGWBWRoutingRule *rule; + if(!redirect_all.hostname.empty()) { + RGWBWRoutingRule redirect_all_rule; + redirect_all_rule.redirect_info.redirect = redirect_all; + redirect_all.http_redirect_code = 301; + *redirect = redirect_all_rule; + return true; + } else if (!routing_rules.check_key_and_error_code_condition(key, http_error_code, &rule)) { + return false; + } + + *redirect = *rule; + + return true; +} + +bool 
RGWBucketWebsiteConf::get_effective_key(const string& key, string *effective_key, bool is_file) const +{ + if (index_doc_suffix.empty()) { + return false; + } + + if (key.empty()) { + *effective_key = index_doc_suffix; + } else if (key[key.size() - 1] == '/') { + *effective_key = key + index_doc_suffix; + } else if (! is_file) { + *effective_key = key + "/" + index_doc_suffix; + } else { + *effective_key = key; + } + + return true; +} diff --git a/src/rgw/rgw_website.h b/src/rgw/rgw_website.h new file mode 100644 index 00000000..8366f39c --- /dev/null +++ b/src/rgw/rgw_website.h @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Yehuda Sadeh + * Copyright (C) 2015 Robin H. Johnson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef RGW_WEBSITE_H +#define RGW_WEBSITE_H + +#include +#include + +#include "common/ceph_json.h" + +#include "rgw_xml.h" + +struct RGWRedirectInfo +{ + std::string protocol; + std::string hostname; + uint16_t http_redirect_code = 0; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(protocol, bl); + encode(hostname, bl); + encode(http_redirect_code, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(protocol, bl); + decode(hostname, bl); + decode(http_redirect_code, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWRedirectInfo) + + +struct RGWBWRedirectInfo +{ + RGWRedirectInfo redirect; + std::string replace_key_prefix_with; + std::string replace_key_with; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(redirect, bl); + encode(replace_key_prefix_with, bl); + encode(replace_key_with, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(redirect, bl); + decode(replace_key_prefix_with, bl); + decode(replace_key_with, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj *obj); +}; +WRITE_CLASS_ENCODER(RGWBWRedirectInfo) + +struct RGWBWRoutingRuleCondition +{ + std::string key_prefix_equals; + uint16_t http_error_code_returned_equals = 0; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(key_prefix_equals, bl); + encode(http_error_code_returned_equals, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(key_prefix_equals, bl); + decode(http_error_code_returned_equals, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj 
*obj); + + bool check_key_condition(const std::string& key); + bool check_error_code_condition(const int error_code) { + return (uint16_t)error_code == http_error_code_returned_equals; + } +}; +WRITE_CLASS_ENCODER(RGWBWRoutingRuleCondition) + +struct RGWBWRoutingRule +{ + RGWBWRoutingRuleCondition condition; + RGWBWRedirectInfo redirect_info; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(condition, bl); + encode(redirect_info, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(condition, bl); + decode(redirect_info, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj *obj); + + bool check_key_condition(const std::string& key) { + return condition.check_key_condition(key); + } + bool check_error_code_condition(int error_code) { + return condition.check_error_code_condition(error_code); + } + + void apply_rule(const std::string& default_protocol, + const std::string& default_hostname, + const std::string& key, + std::string *redirect, + int *redirect_code); +}; +WRITE_CLASS_ENCODER(RGWBWRoutingRule) + +struct RGWBWRoutingRules +{ + std::list rules; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rules, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(rules, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void decode_json(JSONObj *obj); + + bool check_key_condition(const std::string& key, RGWBWRoutingRule **rule); + bool check_error_code_condition(int error_code, RGWBWRoutingRule **rule); + bool check_key_and_error_code_condition(const std::string& key, + const int error_code, + RGWBWRoutingRule **rule); +}; +WRITE_CLASS_ENCODER(RGWBWRoutingRules) + +struct RGWBucketWebsiteConf +{ + RGWRedirectInfo redirect_all; + std::string 
index_doc_suffix; + std::string error_doc; + std::string subdir_marker; + std::string listing_css_doc; + bool listing_enabled; + bool is_redirect_all; + bool is_set_index_doc; + RGWBWRoutingRules routing_rules; + + RGWBucketWebsiteConf() + : listing_enabled(false) { + is_redirect_all = false; + is_set_index_doc = false; + } + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(index_doc_suffix, bl); + encode(error_doc, bl); + encode(routing_rules, bl); + encode(redirect_all, bl); + encode(subdir_marker, bl); + encode(listing_css_doc, bl); + encode(listing_enabled, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(index_doc_suffix, bl); + decode(error_doc, bl); + decode(routing_rules, bl); + decode(redirect_all, bl); + if (struct_v >= 2) { + decode(subdir_marker, bl); + decode(listing_css_doc, bl); + decode(listing_enabled, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + void decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + bool should_redirect(const std::string& key, + const int http_error_code, + RGWBWRoutingRule *redirect); + + bool get_effective_key(const std::string& key, + std::string *effective_key, bool is_file) const; + + const std::string& get_index_doc() const { + return index_doc_suffix; + } + + bool is_empty() const { + return index_doc_suffix.empty() && + error_doc.empty() && + subdir_marker.empty() && + listing_css_doc.empty() && + ! 
listing_enabled; + } +}; +WRITE_CLASS_ENCODER(RGWBucketWebsiteConf) + +#endif diff --git a/src/rgw/rgw_xml.cc b/src/rgw/rgw_xml.cc new file mode 100755 index 00000000..4ecd9d66 --- /dev/null +++ b/src/rgw/rgw_xml.cc @@ -0,0 +1,500 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include +#include + +#include + +#include "include/types.h" +#include "include/utime.h" + +#include "rgw_xml.h" + +XMLObjIter:: +XMLObjIter() +{ +} + +XMLObjIter:: +~XMLObjIter() +{ +} + +void XMLObjIter:: +set(const XMLObjIter::map_iter_t &_cur, const XMLObjIter::map_iter_t &_end) +{ + cur = _cur; + end = _end; +} + +XMLObj *XMLObjIter:: +get_next() +{ + XMLObj *obj = NULL; + if (cur != end) { + obj = cur->second; + ++cur; + } + return obj; +} + +bool XMLObjIter::get_name(std::string& name) const +{ + if (cur == end) { + return false; + } + + name = cur->first; + return true; +} + +ostream& operator<<(ostream &out, const XMLObj &obj) { + out << obj.obj_type << ": " << obj.data; + return out; +} + +XMLObj:: +~XMLObj() +{ +} + +bool XMLObj:: +xml_start(XMLObj *parent, const char *el, const char **attr) +{ + this->parent = parent; + obj_type = el; + for (int i = 0; attr[i]; i += 2) { + attr_map[attr[i]] = std::string(attr[i + 1]); + } + return true; +} + +bool XMLObj:: +xml_end(const char *el) +{ + return true; +} + +void XMLObj:: +xml_handle_data(const char *s, int len) +{ + data.append(s, len); +} + +const std::string& XMLObj:: +XMLObj::get_data() const +{ + return data; +} + +const std::string& XMLObj:: +XMLObj::get_obj_type() const +{ + return obj_type; +} + +XMLObj *XMLObj:: +XMLObj::get_parent() +{ + return parent; +} + +void XMLObj:: +add_child(const std::string& el, XMLObj *obj) +{ + children.insert(std::pair(el, obj)); +} + +bool XMLObj:: +get_attr(const std::string& name, std::string& attr) const +{ + const std::map::const_iterator iter = attr_map.find(name); + if (iter == attr_map.end()) + return false; + attr = 
iter->second; + return true; +} + +XMLObjIter XMLObj:: +find(const std::string& name) +{ + XMLObjIter iter; + const XMLObjIter::const_map_iter_t first = children.find(name); + XMLObjIter::const_map_iter_t last; + if (first != children.end()) { + last = children.upper_bound(name); + }else + last = children.end(); + iter.set(first, last); + return iter; +} + +XMLObjIter XMLObj::find_first() +{ + XMLObjIter iter; + const XMLObjIter::const_map_iter_t first = children.begin(); + const XMLObjIter::const_map_iter_t last = children.end(); + iter.set(first, last); + return iter; +} + +XMLObj *XMLObj:: +find_first(const std::string& name) +{ + const XMLObjIter::const_map_iter_t first = children.find(name); + if (first != children.end()) + return first->second; + return nullptr; +} + +RGWXMLParser:: +RGWXMLParser() : buf(nullptr), buf_len(0), cur_obj(nullptr), success(true), init_called(false) +{ + p = XML_ParserCreate(nullptr); +} + +RGWXMLParser:: +~RGWXMLParser() +{ + XML_ParserFree(p); + + free(buf); + std::list::const_iterator iter; + for (iter = allocated_objs.begin(); iter != allocated_objs.end(); ++iter) { + XMLObj *obj = *iter; + delete obj; + } +} + +void RGWXMLParser::call_xml_start(void* user_data, const char *el, const char **attr) { + RGWXMLParser *handler = static_cast(user_data); + XMLObj * obj = handler->alloc_obj(el); + if (!obj) { + handler->unallocated_objs.push_back(XMLObj()); + obj = &handler->unallocated_objs.back(); + } else { + handler->allocated_objs.push_back(obj); + } + if (!obj->xml_start(handler->cur_obj, el, attr)) { + handler->success = false; + return; + } + if (handler->cur_obj) { + handler->cur_obj->add_child(el, obj); + } else { + handler->children.insert(std::pair(el, obj)); + } + handler->cur_obj = obj; + + handler->objs.push_back(obj); +} + +void RGWXMLParser::call_xml_end(void* user_data, const char *el) { + RGWXMLParser *handler = static_cast(user_data); + XMLObj *parent_obj = handler->cur_obj->get_parent(); + if 
(!handler->cur_obj->xml_end(el)) { + handler->success = false; + return; + } + handler->cur_obj = parent_obj; +} + +void RGWXMLParser::call_xml_handle_data(void* user_data, const char *s, int len) +{ + RGWXMLParser *handler = static_cast(user_data); + handler->cur_obj->xml_handle_data(s, len); +} + +bool RGWXMLParser::init() +{ + if (!p) { + return false; + } + init_called = true; + XML_SetElementHandler(p, RGWXMLParser::call_xml_start, RGWXMLParser::call_xml_end); + XML_SetCharacterDataHandler(p, RGWXMLParser::call_xml_handle_data); + XML_SetUserData(p, (void *)this); + return true; +} + +bool RGWXMLParser::parse(const char *_buf, int len, int done) +{ + ceph_assert(init_called); + int pos = buf_len; + char *tmp_buf; + tmp_buf = (char *)realloc(buf, buf_len + len); + if (tmp_buf == NULL){ + free(buf); + buf = NULL; + return false; + } else { + buf = tmp_buf; + } + + memcpy(&buf[buf_len], _buf, len); + buf_len += len; + + success = true; + if (!XML_Parse(p, &buf[pos], len, done)) { + fprintf(stderr, "Parse error at line %d:\n%s\n", + (int)XML_GetCurrentLineNumber(p), + XML_ErrorString(XML_GetErrorCode(p))); + success = false; + } + + return success; +} + +void decode_xml_obj(unsigned long& val, XMLObj *obj) +{ + auto& s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtoul(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && val == ULONG_MAX) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + + +void decode_xml_obj(long& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtol(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && (val == LONG_MAX || 
val == LONG_MIN)) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + +void decode_xml_obj(long long& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtoll(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + +void decode_xml_obj(unsigned long long& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + const char *start = s.c_str(); + char *p; + + errno = 0; + val = strtoull(start, &p, 10); + + /* Check for various possible errors */ + + if ((errno == ERANGE && val == ULLONG_MAX) || + (errno != 0 && val == 0)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + if (p == start) { + throw RGWXMLDecoder::err("failed to parse number"); + } + + while (*p != '\0') { + if (!isspace(*p)) { + throw RGWXMLDecoder::err("failed to parse number"); + } + p++; + } +} + +void decode_xml_obj(int& val, XMLObj *obj) +{ + long l; + decode_xml_obj(l, obj); +#if LONG_MAX > INT_MAX + if (l > INT_MAX || l < INT_MIN) { + throw RGWXMLDecoder::err("integer out of range"); + } +#endif + + val = (int)l; +} + +void decode_xml_obj(unsigned& val, XMLObj *obj) +{ + unsigned long l; + decode_xml_obj(l, obj); +#if ULONG_MAX > UINT_MAX + if (l > UINT_MAX) { + throw RGWXMLDecoder::err("unsigned integer out of range"); + } +#endif + + val = (unsigned)l; +} + +void decode_xml_obj(bool& val, XMLObj *obj) 
+{ + const std::string s = obj->get_data(); + if (strncasecmp(s.c_str(), "true", 8) == 0) { + val = true; + return; + } + if (strncasecmp(s.c_str(), "false", 8) == 0) { + val = false; + return; + } + int i; + decode_xml_obj(i, obj); + val = (bool)i; +} + +void decode_xml_obj(bufferlist& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + + bufferlist bl; + bl.append(s.c_str(), s.size()); + try { + val.decode_base64(bl); + } catch (buffer::error& err) { + throw RGWXMLDecoder::err("failed to decode base64"); + } +} + +void decode_xml_obj(utime_t& val, XMLObj *obj) +{ + const std::string s = obj->get_data(); + uint64_t epoch; + uint64_t nsec; + int r = utime_t::parse_date(s, &epoch, &nsec); + if (r == 0) { + val = utime_t(epoch, nsec); + } else { + throw RGWXMLDecoder::err("failed to decode utime_t"); + } +} + +void encode_xml(const char *name, const string& val, Formatter *f) +{ + f->dump_string(name, val); +} + +void encode_xml(const char *name, const char *val, Formatter *f) +{ + f->dump_string(name, val); +} + +void encode_xml(const char *name, bool val, Formatter *f) +{ + std::string s; + if (val) + s = "True"; + else + s = "False"; + + f->dump_string(name, s); +} + +void encode_xml(const char *name, int val, Formatter *f) +{ + f->dump_int(name, val); +} + +void encode_xml(const char *name, long val, Formatter *f) +{ + f->dump_int(name, val); +} + +void encode_xml(const char *name, unsigned val, Formatter *f) +{ + f->dump_unsigned(name, val); +} + +void encode_xml(const char *name, unsigned long val, Formatter *f) +{ + f->dump_unsigned(name, val); +} + +void encode_xml(const char *name, unsigned long long val, Formatter *f) +{ + f->dump_unsigned(name, val); +} + +void encode_xml(const char *name, long long val, Formatter *f) +{ + f->dump_int(name, val); +} + +void encode_xml(const char *name, const utime_t& val, Formatter *f) +{ + val.gmtime(f->dump_stream(name)); +} + +void encode_xml(const char *name, const bufferlist& bl, Formatter *f) +{ + /* need 
to copy data from bl, as it is const bufferlist */ + bufferlist src = bl; + + bufferlist b64; + src.encode_base64(b64); + + const std::string s(b64.c_str(), b64.length()); + + encode_xml(name, s, f); +} + diff --git a/src/rgw/rgw_xml.h b/src/rgw/rgw_xml.h new file mode 100644 index 00000000..227e1cba --- /dev/null +++ b/src/rgw/rgw_xml.h @@ -0,0 +1,352 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RGW_XML_H +#define CEPH_RGW_XML_H + +#include +#include +#include +#include +#include + +class XMLObj; +class RGWXMLParser; + +class XMLObjIter { +public: + typedef map::iterator map_iter_t; + typedef map::iterator const_map_iter_t; + + XMLObjIter(); + ~XMLObjIter(); + void set(const XMLObjIter::const_map_iter_t &_cur, const XMLObjIter::const_map_iter_t &_end); + XMLObj *get_next(); + bool get_name(std::string& name) const; + +private: + map_iter_t cur; + map_iter_t end; +}; + +/** + * Represents a block of XML. + * Give the class an XML blob, and it will parse the blob into + * an attr_name->value map. + * It shouldn't be the start point for any parsing. Look at RGWXMLParser for that. 
+ */ +class XMLObj +{ +private: + XMLObj *parent; + std::string obj_type; + +protected: + std::string data; + std::multimap children; + std::map attr_map; + + // invoked at the beginning of the XML tag, and populate any attributes + bool xml_start(XMLObj *parent, const char *el, const char **attr); + // callback invoked at the end of the XML tag + // if objects are created while parsing, this should be overwritten in the drived class + virtual bool xml_end(const char *el); + // callback invoked for storing the data of the XML tag + // if data manipulation is needed this could be overwritten in the drived class + virtual void xml_handle_data(const char *s, int len); + // get the parent object + XMLObj *get_parent(); + // add a child XML object + void add_child(const std::string& el, XMLObj *obj); + +public: + XMLObj() : parent(nullptr) {} + virtual ~XMLObj(); + + // get the data (as string) + const std::string& get_data() const; + // get the type of the object (as string) + const std::string& get_obj_type() const; + bool get_attr(const std::string& name, std::string& attr) const; + // return a list of sub-tags matching the name + XMLObjIter find(const std::string& name); + // return the first sub-tag + XMLObjIter find_first(); + // return the first sub-tags matching the name + XMLObj *find_first(const std::string& name); + + friend ostream& operator<<(ostream &out, const XMLObj &obj); + friend RGWXMLParser; +}; + +struct XML_ParserStruct; + +// an XML parser is an XML object without a parent (root of the tree) +// the parser could be used in 2 ways: +// +// (1) lazy object creation/intrusive API: usually used within the RGWXMLDecode namespace (as RGWXMLDecode::XMLParser) +// the parser will parse the input and store info, but will not generate the target object. The object can be allocated outside +// of the parser (stack or heap), and require to implement the decode_xml() API for the values to be populated. 
+// note that the decode_xml() calls may throw exceptions if parsing fails +// +// (2) object creation while parsing: a new class needs to be derived from RGWXMLParser and implement alloc_obj() +// API that should create a set of classes derived from XMLObj implementing xml_end() to create the actual target objects +// +// There could be a mix-and-match of the 2 types, control over that is in the alloc_obj() call +// deciding for which tags objects are allocate during parsing and for which tags object allocation is external + +class RGWXMLParser : public XMLObj +{ +private: + XML_ParserStruct *p; + char *buf; + int buf_len; + XMLObj *cur_obj; + std::vector objs; + std::list allocated_objs; + std::list unallocated_objs; + bool success; + bool init_called; + + // calls xml_start() on each parsed object + // passed as static callback to actual parser, passes itself as user_data + static void call_xml_start(void* user_data, const char *el, const char **attr); + // calls xml_end() on each parsed object + // passed as static callback to actual parser, passes itself as user_data + static void call_xml_end(void* user_data, const char *el); + // calls xml_handle_data() on each parsed object + // passed as static callback to actual parser, passes itself as user_data + static void call_xml_handle_data(void* user_data, const char *s, int len); + +protected: + // if objects are created while parsing, this should be implemented in the derived class + // and be a factory for creating the classes derived from XMLObj + // note that not all sub-tags has to be constructed here, any such tag which is not + // constructed will be lazily created when decode_xml() is invoked on it + // + // note that in case of different tags sharing the same name at different levels + // this method should not be used + virtual XMLObj *alloc_obj(const char *el) { + return nullptr; + } + +public: + RGWXMLParser(); + ~RGWXMLParser() override; + + // initialize the parser, must be called before parsing + 
bool init(); + // parse the XML buffer (can be invoked multiple times for incremental parsing) + // receives the buffer to parse, its length, and boolean indication (0,1) + // whether this is the final chunk of the buffer + bool parse(const char *buf, int len, int done); + // get the XML blob being parsed + const char *get_xml() const { return buf; } +}; + +namespace RGWXMLDecoder { + struct err { + std::string message; + + explicit err(const std::string& m) : message(m) {} + }; + + typedef RGWXMLParser XMLParser; + + template + bool decode_xml(const char *name, T& val, XMLObj* obj, bool mandatory = false); + + template + bool decode_xml(const char *name, std::vector& v, XMLObj* obj, bool mandatory = false); + + template + bool decode_xml(const char *name, C& container, void (*cb)(C&, XMLObj *obj), XMLObj *obj, bool mandatory = false); + + template + void decode_xml(const char *name, T& val, T& default_val, XMLObj* obj); +} + +static inline ostream& operator<<(ostream &out, RGWXMLDecoder::err& err) +{ + return out << err.message; +} + +template +void decode_xml_obj(T& val, XMLObj *obj) +{ + val.decode_xml(obj); +} + +static inline void decode_xml_obj(string& val, XMLObj *obj) +{ + val = obj->get_data(); +} + +void decode_xml_obj(unsigned long long& val, XMLObj *obj); +void decode_xml_obj(long long& val, XMLObj *obj); +void decode_xml_obj(unsigned long& val, XMLObj *obj); +void decode_xml_obj(long& val, XMLObj *obj); +void decode_xml_obj(unsigned& val, XMLObj *obj); +void decode_xml_obj(int& val, XMLObj *obj); +void decode_xml_obj(bool& val, XMLObj *obj); +void decode_xml_obj(bufferlist& val, XMLObj *obj); +class utime_t; +void decode_xml_obj(utime_t& val, XMLObj *obj); + +template +void do_decode_xml_obj(list& l, const string& name, XMLObj *obj) +{ + l.clear(); + + XMLObjIter iter = obj->find(name); + XMLObj *o; + + while ((o = iter.get_next())) { + T val; + decode_xml_obj(val, o); + l.push_back(val); + } +} + +template +bool RGWXMLDecoder::decode_xml(const char 
*name, T& val, XMLObj *obj, bool mandatory) +{ + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + if (!o) { + if (mandatory) { + string s = "missing mandatory field " + string(name); + throw err(s); + } + val = T(); + return false; + } + + try { + decode_xml_obj(val, o); + } catch (err& e) { + string s = string(name) + ": "; + s.append(e.message); + throw err(s); + } + + return true; +} + +template +bool RGWXMLDecoder::decode_xml(const char *name, std::vector& v, XMLObj *obj, bool mandatory) +{ + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + + v.clear(); + + if (!o) { + if (mandatory) { + string s = "missing mandatory field " + string(name); + throw err(s); + } + return false; + } + + do { + T val; + try { + decode_xml_obj(val, o); + } catch (err& e) { + string s = string(name) + ": "; + s.append(e.message); + throw err(s); + } + v.push_back(val); + } while ((o = iter.get_next())); + return true; +} + +template +bool RGWXMLDecoder::decode_xml(const char *name, C& container, void (*cb)(C&, XMLObj *), XMLObj *obj, bool mandatory) +{ + container.clear(); + + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + if (!o) { + if (mandatory) { + string s = "missing mandatory field " + string(name); + throw err(s); + } + return false; + } + + try { + decode_xml_obj(container, cb, o); + } catch (err& e) { + string s = string(name) + ": "; + s.append(e.message); + throw err(s); + } + + return true; +} + +template +void RGWXMLDecoder::decode_xml(const char *name, T& val, T& default_val, XMLObj *obj) +{ + XMLObjIter iter = obj->find(name); + XMLObj *o = iter.get_next(); + if (!o) { + val = default_val; + return; + } + + try { + decode_xml_obj(val, o); + } catch (err& e) { + val = default_val; + string s = string(name) + ": "; + s.append(e.message); + throw err(s); + } +} + +template +static void encode_xml(const char *name, const T& val, ceph::Formatter *f) +{ + f->open_object_section(name); + val.dump_xml(f); + 
f->close_section(); +} + +template +static void encode_xml(const char *name, const char *ns, const T& val, ceph::Formatter *f) +{ + f->open_object_section_in_ns(name, ns); + val.dump_xml(f); + f->close_section(); +} + +void encode_xml(const char *name, const string& val, ceph::Formatter *f); +void encode_xml(const char *name, const char *val, ceph::Formatter *f); +void encode_xml(const char *name, bool val, ceph::Formatter *f); +void encode_xml(const char *name, int val, ceph::Formatter *f); +void encode_xml(const char *name, unsigned val, ceph::Formatter *f); +void encode_xml(const char *name, long val, ceph::Formatter *f); +void encode_xml(const char *name, unsigned long val, ceph::Formatter *f); +void encode_xml(const char *name, long long val, ceph::Formatter *f); +void encode_xml(const char *name, const utime_t& val, ceph::Formatter *f); +void encode_xml(const char *name, const bufferlist& bl, ceph::Formatter *f); +void encode_xml(const char *name, long long unsigned val, ceph::Formatter *f); + +template +static void do_encode_xml(const char *name, const std::list& l, const char *entry_name, ceph::Formatter *f) +{ + f->open_array_section(name); + for (typename std::list::const_iterator iter = l.begin(); iter != l.end(); ++iter) { + encode_xml(entry_name, *iter, f); + } + f->close_section(); +} + + + +#endif diff --git a/src/rgw/rgw_xml_enc.cc b/src/rgw/rgw_xml_enc.cc new file mode 100644 index 00000000..5473c2f6 --- /dev/null +++ b/src/rgw/rgw_xml_enc.cc @@ -0,0 +1,152 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Yehuda Sadeh + * Copyright (C) 2015 Robin H. Johnson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "rgw_common.h" +#include "rgw_xml.h" + +#include "common/Formatter.h" + +#define dout_subsys ceph_subsys_rgw + +void RGWBWRedirectInfo::dump_xml(Formatter *f) const +{ + if (!redirect.protocol.empty()) { + encode_xml("Protocol", redirect.protocol, f); + } + if (!redirect.hostname.empty()) { + encode_xml("HostName", redirect.hostname, f); + } + if (redirect.http_redirect_code > 0) { + encode_xml("HttpRedirectCode", (int)redirect.http_redirect_code, f); + } + if (!replace_key_prefix_with.empty()) { + encode_xml("ReplaceKeyPrefixWith", replace_key_prefix_with, f); + } + if (!replace_key_with.empty()) { + encode_xml("ReplaceKeyWith", replace_key_with, f); + } +} + +#define WEBSITE_HTTP_REDIRECT_CODE_MIN 300 +#define WEBSITE_HTTP_REDIRECT_CODE_MAX 400 +void RGWBWRedirectInfo::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Protocol", redirect.protocol, obj); + RGWXMLDecoder::decode_xml("HostName", redirect.hostname, obj); + int code = 0; + bool has_http_redirect_code = RGWXMLDecoder::decode_xml("HttpRedirectCode", code, obj); + if (has_http_redirect_code && + !(code > WEBSITE_HTTP_REDIRECT_CODE_MIN && + code < WEBSITE_HTTP_REDIRECT_CODE_MAX)) { + throw RGWXMLDecoder::err("The provided HTTP redirect code is not valid. 
Valid codes are 3XX except 300."); + } + redirect.http_redirect_code = code; + bool has_replace_key_prefix_with = RGWXMLDecoder::decode_xml("ReplaceKeyPrefixWith", replace_key_prefix_with, obj); + bool has_replace_key_with = RGWXMLDecoder::decode_xml("ReplaceKeyWith", replace_key_with, obj); + if (has_replace_key_prefix_with && has_replace_key_with) { + throw RGWXMLDecoder::err("You can only define ReplaceKeyPrefix or ReplaceKey but not both."); + } +} + +void RGWBWRoutingRuleCondition::dump_xml(Formatter *f) const +{ + if (!key_prefix_equals.empty()) { + encode_xml("KeyPrefixEquals", key_prefix_equals, f); + } + if (http_error_code_returned_equals > 0) { + encode_xml("HttpErrorCodeReturnedEquals", (int)http_error_code_returned_equals, f); + } +} + +#define WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MIN 400 +#define WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MAX 600 +void RGWBWRoutingRuleCondition::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("KeyPrefixEquals", key_prefix_equals, obj); + int code = 0; + bool has_http_error_code_returned_equals = RGWXMLDecoder::decode_xml("HttpErrorCodeReturnedEquals", code, obj); + if (has_http_error_code_returned_equals && + !(code >= WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MIN && + code < WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MAX)) { + throw RGWXMLDecoder::err("The provided HTTP redirect code is not valid. 
Valid codes are 4XX or 5XX."); + } + http_error_code_returned_equals = code; +} + +void RGWBWRoutingRule::dump_xml(Formatter *f) const +{ + encode_xml("Condition", condition, f); + encode_xml("Redirect", redirect_info, f); +} + +void RGWBWRoutingRule::decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Condition", condition, obj); + RGWXMLDecoder::decode_xml("Redirect", redirect_info, obj); +} + +static void encode_xml(const char *name, const std::list& l, ceph::Formatter *f) +{ + do_encode_xml("RoutingRules", l, "RoutingRule", f); +} + +void RGWBucketWebsiteConf::dump_xml(Formatter *f) const +{ + if (!redirect_all.hostname.empty()) { + f->open_object_section("RedirectAllRequestsTo"); + encode_xml("HostName", redirect_all.hostname, f); + if (!redirect_all.protocol.empty()) { + encode_xml("Protocol", redirect_all.protocol, f); + } + f->close_section(); + } + if (!index_doc_suffix.empty()) { + f->open_object_section("IndexDocument"); + encode_xml("Suffix", index_doc_suffix, f); + f->close_section(); + } + if (!error_doc.empty()) { + f->open_object_section("ErrorDocument"); + encode_xml("Key", error_doc, f); + f->close_section(); + } + if (!routing_rules.rules.empty()) { + encode_xml("RoutingRules", routing_rules.rules, f); + } +} + +void decode_xml_obj(list& l, XMLObj *obj) +{ + do_decode_xml_obj(l, "RoutingRule", obj); +} + +void RGWBucketWebsiteConf::decode_xml(XMLObj *obj) { + XMLObj *o = obj->find_first("RedirectAllRequestsTo"); + if (o) { + is_redirect_all = true; + RGWXMLDecoder::decode_xml("HostName", redirect_all.hostname, o, true); + RGWXMLDecoder::decode_xml("Protocol", redirect_all.protocol, o); + } else { + o = obj->find_first("IndexDocument"); + if (o) { + is_set_index_doc = true; + RGWXMLDecoder::decode_xml("Suffix", index_doc_suffix, o); + } + o = obj->find_first("ErrorDocument"); + if (o) { + RGWXMLDecoder::decode_xml("Key", error_doc, o); + } + RGWXMLDecoder::decode_xml("RoutingRules", routing_rules.rules, obj); + } +} + diff --git 
a/src/rgw/rgw_zone.cc b/src/rgw/rgw_zone.cc new file mode 100644 index 00000000..667782d1 --- /dev/null +++ b/src/rgw/rgw_zone.cc @@ -0,0 +1,1937 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" + +#include "rgw_zone.h" +#include "rgw_realm_watcher.h" +#include "rgw_meta_sync_status.h" +#include "rgw_sync.h" + +#include "services/svc_zone.h" +#include "services/svc_sys_obj.h" + +#define dout_subsys ceph_subsys_rgw + +namespace rgw_zone_defaults { + +std::string zone_info_oid_prefix = "zone_info."; +std::string zone_names_oid_prefix = "zone_names."; +std::string region_info_oid_prefix = "region_info."; +std::string realm_names_oid_prefix = "realms_names."; +std::string zone_group_info_oid_prefix = "zonegroup_info."; +std::string realm_info_oid_prefix = "realms."; +std::string default_region_info_oid = "default.region"; +std::string default_zone_group_info_oid = "default.zonegroup"; +std::string period_info_oid_prefix = "periods."; +std::string period_latest_epoch_info_oid = ".latest_epoch"; +std::string region_map_oid = "region_map"; +std::string default_realm_info_oid = "default.realm"; +std::string default_zonegroup_name = "default"; +std::string default_zone_name = "default"; +std::string zonegroup_names_oid_prefix = "zonegroups_names."; +std::string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root"; +std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root"; +std::string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root"; +std::string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root"; +std::string default_bucket_index_pool_suffix = "rgw.buckets.index"; +std::string default_storage_extra_pool_suffix = "rgw.buckets.non-ec"; +std::string avail_pools = ".pools.avail"; +std::string default_storage_pool_suffix = "rgw.buckets.data"; + +} + +using namespace rgw_zone_defaults; + +#define FIRST_EPOCH 1 + +void RGWDefaultZoneGroupInfo::dump(Formatter *f) const { + encode_json("default_zonegroup", default_zonegroup, f); 
+} + +void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) { + + JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj); + /* backward compatability with region */ + if (default_zonegroup.empty()) { + JSONDecoder::decode_json("default_region", default_zonegroup, obj); + } +} + +rgw_pool RGWZoneGroup::get_pool(CephContext *cct_) const +{ + if (cct_->_conf->rgw_zonegroup_root_pool.empty()) { + return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL); + } + + return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool); +} + +int RGWZoneGroup::create_default(bool old_format) +{ + name = default_zonegroup_name; + api_name = default_zonegroup_name; + is_master = true; + + RGWZoneGroupPlacementTarget placement_target; + placement_target.name = "default-placement"; + placement_targets[placement_target.name] = placement_target; + default_placement.name = "default-placement"; + + RGWZoneParams zone_params(default_zone_name); + + int r = zone_params.init(cct, sysobj_svc, false); + if (r < 0) { + ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl; + return r; + } + + r = zone_params.create_default(); + if (r < 0 && r != -EEXIST) { + ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl; + return r; + } else if (r == -EEXIST) { + ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl; + zone_params.clear_id(); + r = zone_params.init(cct, sysobj_svc); + if (r < 0) { + ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl; + return r; + } + ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id() + << dendl; + } + + RGWZone& default_zone = zones[zone_params.get_id()]; + default_zone.name = zone_params.get_name(); + default_zone.id = zone_params.get_id(); + master_zone = default_zone.id; + + r = create(); + if (r 
< 0 && r != -EEXIST) { + ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl; + return r; + } + + if (r == -EEXIST) { + ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl; + id.clear(); + r = init(cct, sysobj_svc); + if (r < 0) { + return r; + } + } + + if (old_format) { + name = id; + } + + post_process_params(); + + return 0; +} + +const string RGWZoneGroup::get_default_oid(bool old_region_format) const +{ + if (old_region_format) { + if (cct->_conf->rgw_default_region_info_oid.empty()) { + return default_region_info_oid; + } + return cct->_conf->rgw_default_region_info_oid; + } + + string default_oid = cct->_conf->rgw_default_zonegroup_info_oid; + + if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) { + default_oid = default_zone_group_info_oid; + } + + default_oid += "." + realm_id; + + return default_oid; +} + +const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format) const +{ + if (old_region_format) { + return region_info_oid_prefix; + } + return zone_group_info_oid_prefix; +} + +const string& RGWZoneGroup::get_names_oid_prefix() const +{ + return zonegroup_names_oid_prefix; +} + +const string& RGWZoneGroup::get_predefined_name(CephContext *cct) const { + return cct->_conf->rgw_zonegroup; +} + +int RGWZoneGroup::equals(const string& other_zonegroup) const +{ + if (is_master && other_zonegroup.empty()) + return true; + + return (id == other_zonegroup); +} + +int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only, + const list& endpoints, const string *ptier_type, + bool *psync_from_all, list& sync_from, list& sync_from_rm, + string *predirect_zone, RGWSyncModulesManager *sync_mgr) +{ + auto& zone_id = zone_params.get_id(); + auto& zone_name = zone_params.get_name(); + + // check for duplicate zone name on insert + if (!zones.count(zone_id)) { + for (const auto& zone : zones) { + if (zone.second.name == zone_name) { + 
ldout(cct, 0) << "ERROR: found existing zone name " << zone_name + << " (" << zone.first << ") in zonegroup " << get_name() << dendl; + return -EEXIST; + } + } + } + + if (is_master) { + if (*is_master) { + if (!master_zone.empty() && master_zone != zone_id) { + ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl; + } + master_zone = zone_id; + } else if (master_zone == zone_id) { + master_zone.clear(); + } + } + + RGWZone& zone = zones[zone_id]; + zone.name = zone_name; + zone.id = zone_id; + if (!endpoints.empty()) { + zone.endpoints = endpoints; + } + if (read_only) { + zone.read_only = *read_only; + } + if (ptier_type) { + zone.tier_type = *ptier_type; + if (!sync_mgr->get_module(*ptier_type, nullptr)) { + ldout(cct, 0) << "ERROR: could not found sync module: " << *ptier_type + << ", valid sync modules: " + << sync_mgr->get_registered_module_names() + << dendl; + return -ENOENT; + } + } + + if (psync_from_all) { + zone.sync_from_all = *psync_from_all; + } + + if (predirect_zone) { + zone.redirect_zone = *predirect_zone; + } + + for (auto add : sync_from) { + zone.sync_from.insert(add); + } + + for (auto rm : sync_from_rm) { + zone.sync_from.erase(rm); + } + + post_process_params(); + + return update(); +} + + +int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params) +{ + RGWZone& zone = zones[zone_params.get_id()]; + zone.name = zone_params.get_name(); + + return update(); +} + +void RGWZoneGroup::post_process_params() +{ + bool log_data = zones.size() > 1; + + if (master_zone.empty()) { + map::iterator iter = zones.begin(); + if (iter != zones.end()) { + master_zone = iter->first; + } + } + + for (map::iterator iter = zones.begin(); iter != zones.end(); ++iter) { + RGWZone& zone = iter->second; + zone.log_data = log_data; + + RGWZoneParams zone_params(zone.id, zone.name); + int ret = zone_params.init(cct, sysobj_svc); + if (ret < 0) { + ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << 
zone.name << dendl; + continue; + } + + for (map::iterator iter = zone_params.placement_pools.begin(); + iter != zone_params.placement_pools.end(); ++iter) { + const string& placement_name = iter->first; + if (placement_targets.find(placement_name) == placement_targets.end()) { + RGWZoneGroupPlacementTarget placement_target; + placement_target.name = placement_name; + placement_targets[placement_name] = placement_target; + } + } + } + + if (default_placement.empty() && !placement_targets.empty()) { + default_placement.init(placement_targets.begin()->first, RGW_STORAGE_CLASS_STANDARD); + } +} + +int RGWZoneGroup::remove_zone(const std::string& zone_id) +{ + map::iterator iter = zones.find(zone_id); + if (iter == zones.end()) { + ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup " + << name << dendl; + return -ENOENT; + } + + zones.erase(iter); + + post_process_params(); + + return update(); +} + +int RGWZoneGroup::read_default_id(string& default_id, bool old_format) +{ + if (realm_id.empty()) { + /* try using default realm */ + RGWRealm realm; + int ret = realm.init(cct, sysobj_svc); + // no default realm exist + if (ret < 0) { + return read_id(default_zonegroup_name, default_id); + } + realm_id = realm.get_id(); + } + + return RGWSystemMetaObj::read_default_id(default_id, old_format); +} + +int RGWZoneGroup::set_as_default(bool exclusive) +{ + if (realm_id.empty()) { + /* try using default realm */ + RGWRealm realm; + int ret = realm.init(cct, sysobj_svc); + if (ret < 0) { + ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl; + return -EINVAL; + } + realm_id = realm.get_id(); + } + + return RGWSystemMetaObj::set_as_default(exclusive); +} + +void RGWSystemMetaObj::reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) +{ + cct = _cct; + sysobj_svc = _sysobj_svc; + zone_svc = _sysobj_svc->get_zone_svc(); +} + +int RGWSystemMetaObj::init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, bool setup_obj, bool old_format) 
+{ + reinit_instance(_cct, _sysobj_svc); + + if (!setup_obj) + return 0; + + if (old_format && id.empty()) { + id = name; + } + + if (id.empty()) { + int r; + if (name.empty()) { + name = get_predefined_name(cct); + } + if (name.empty()) { + r = use_default(old_format); + if (r < 0) { + return r; + } + } else if (!old_format) { + r = read_id(name, id); + if (r < 0) { + if (r != -ENOENT) { + ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl; + } + return r; + } + } + } + + return read_info(id, old_format); +} + +int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid) +{ + using ceph::decode; + auto pool = get_pool(cct); + bufferlist bl; + + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid)); + int ret = sysobj.rop().read(&bl); + if (ret < 0) + return ret; + + try { + auto iter = bl.cbegin(); + decode(default_info, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl; + return -EIO; + } + + return 0; +} + +int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format) +{ + RGWDefaultSystemMetaObjInfo default_info; + + int ret = read_default(default_info, get_default_oid(old_format)); + if (ret < 0) { + return ret; + } + + default_id = default_info.default_id; + + return 0; +} + +int RGWSystemMetaObj::use_default(bool old_format) +{ + return read_default_id(id, old_format); +} + +int RGWSystemMetaObj::set_as_default(bool exclusive) +{ + using ceph::encode; + string oid = get_default_oid(); + + rgw_pool pool(get_pool(cct)); + bufferlist bl; + + RGWDefaultSystemMetaObjInfo default_info; + default_info.default_id = id; + + encode(default_info, bl); + + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid)); + int ret = sysobj.wop() + .set_exclusive(exclusive) + .write(bl); + if (ret < 0) + 
return ret; + + return 0; +} + +int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id) +{ + using ceph::decode; + rgw_pool pool(get_pool(cct)); + bufferlist bl; + + string oid = get_names_oid_prefix() + obj_name; + + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid)); + int ret = sysobj.rop().read(&bl); + if (ret < 0) { + return ret; + } + + RGWNameToId nameToId; + try { + auto iter = bl.cbegin(); + decode(nameToId, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl; + return -EIO; + } + object_id = nameToId.obj_id; + return 0; +} + +int RGWSystemMetaObj::delete_obj(bool old_format) +{ + rgw_pool pool(get_pool(cct)); + + auto obj_ctx = sysobj_svc->init_obj_ctx(); + + /* check to see if obj is the default */ + RGWDefaultSystemMetaObjInfo default_info; + int ret = read_default(default_info, get_default_oid(old_format)); + if (ret < 0 && ret != -ENOENT) + return ret; + if (default_info.default_id == id || (old_format && default_info.default_id == name)) { + string oid = get_default_oid(old_format); + rgw_raw_obj default_named_obj(pool, oid); + auto sysobj = sysobj_svc->get_obj(obj_ctx, default_named_obj); + ret = sysobj.wop().remove(); + if (ret < 0) { + ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + } + if (!old_format) { + string oid = get_names_oid_prefix() + name; + rgw_raw_obj object_name(pool, oid); + auto sysobj = sysobj_svc->get_obj(obj_ctx, object_name); + ret = sysobj.wop().remove(); + if (ret < 0) { + ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + string oid = get_info_oid_prefix(old_format); + if (old_format) { + oid += name; + } else { + oid += id; + } + + rgw_raw_obj object_id(pool, oid); + auto sysobj = sysobj_svc->get_obj(obj_ctx, object_id); + ret = 
sysobj.wop().remove();
+  if (ret < 0) {
+    ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
+  }
+
+  return ret;
+}
+
+// Writes the name->id mapping object for the current name and id.
+// `exclusive` is forwarded to the write op via set_exclusive().
+int RGWSystemMetaObj::store_name(bool exclusive)
+{
+  rgw_pool pool(get_pool(cct));
+  string oid = get_names_oid_prefix() + name;
+
+  RGWNameToId nameToId;
+  nameToId.obj_id = id;
+
+  bufferlist bl;
+  using ceph::encode;
+  encode(nameToId, bl);
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid));
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .write(bl);
+}
+
+// Renames the object: returns -EEXIST if new_name already maps to an id,
+// otherwise updates the info, stores the new name object (exclusively) and
+// removes the old name object.
+int RGWSystemMetaObj::rename(const string& new_name)
+{
+  string new_id;
+  int ret = read_id(new_name, new_id);
+  // a successful lookup means the new name is already taken
+  if (!ret) {
+    return -EEXIST;
+  }
+  if (ret < 0 && ret != -ENOENT) {
+    ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  string old_name = name;
+  name = new_name;
+  ret = update();
+  if (ret < 0) {
+    ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  ret = store_name(true);
+  if (ret < 0) {
+    ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  /* delete old name */
+  rgw_pool pool(get_pool(cct));
+  string oid = get_names_oid_prefix() + old_name;
+  rgw_raw_obj old_name_obj(pool, oid);
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, old_name_obj);
+  ret = sysobj.wop().remove();
+  if (ret < 0) {
+    ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return ret;
+}
+
+// Reads the info object (info oid prefix + obj_id) and decodes it into *this.
+int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
+{
+  rgw_pool pool(get_pool(cct));
+
+  bufferlist bl;
+
+  string oid = get_info_oid_prefix(old_format) + obj_id;
+
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, oid});
+  int ret = 
sysobj.rop().read(&bl);
+  if (ret < 0) {
+    ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  using ceph::decode;
+
+  try {
+    auto iter = bl.cbegin();
+    decode(*this, iter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Resolves the id from the current name, then loads the info object.
+int RGWSystemMetaObj::read()
+{
+  int ret = read_id(name, id);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return read_info(id);
+}
+
+// Creates the object: fails with -EEXIST when `exclusive` and the name already
+// maps to an id, generates a random uuid if no id is set, then stores the
+// info object followed by the name object.
+int RGWSystemMetaObj::create(bool exclusive)
+{
+  int ret;
+
+  /* check to see the name is not used */
+  ret = read_id(name, id);
+  if (exclusive && ret == 0) {
+    ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
+    return -EEXIST;
+  } else if ( ret < 0 && ret != -ENOENT) {
+    ldout(cct, 0) << "failed reading obj id  " << id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  if (id.empty()) {
+    /* create unique id */
+    uuid_d new_uuid;
+    char uuid_str[37];
+    new_uuid.generate_random();
+    new_uuid.print(uuid_str);
+    id = uuid_str;
+  }
+
+  ret = store_info(exclusive);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR:  storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return store_name(exclusive);
+}
+
+// Encodes *this and writes it to the info object (info oid prefix + id).
+int RGWSystemMetaObj::store_info(bool exclusive)
+{
+  rgw_pool pool(get_pool(cct));
+
+  string oid = get_info_oid_prefix() + id;
+
+  bufferlist bl;
+  using ceph::encode;
+  encode(*this, bl);
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, oid});
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .write(bl);
+}
+
+// Stores the info object, then the name object; stops at the first failure.
+int RGWSystemMetaObj::write(bool exclusive)
+{
+  int ret = store_info(exclusive);
+  if (ret < 0) {
+    ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
+    return ret;
+  }
+  ret = store_name(exclusive);
+  if (ret < 0) {
+    ldout(cct, 20) << __func__ << "(): 
store_name() returned ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+
+// Returns the rgw_realm configuration value.
+const string& RGWRealm::get_predefined_name(CephContext *cct) const {
+  return cct->_conf->rgw_realm;
+}
+
+// Creates the realm object and its control object, creates (or initializes)
+// the realm's period, makes it the current period, and then tries to set the
+// realm as the default.
+int RGWRealm::create(bool exclusive)
+{
+  int ret = RGWSystemMetaObj::create(exclusive);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  // create the control object for watch/notify
+  ret = create_control(exclusive);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  RGWPeriod period;
+  if (current_period.empty()) {
+    /* create new period for the realm */
+    ret = period.init(cct, sysobj_svc, id, name, false);
+    if (ret < 0 ) {
+      return ret;
+    }
+    ret = period.create(true);
+    if (ret < 0) {
+      ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+  } else {
+    period = RGWPeriod(current_period, 0);
+    // NOTE(review): this `ret` shadows the outer one; harmless because the
+    // failure path returns immediately
+    int ret = period.init(cct, sysobj_svc, id, name);
+    if (ret < 0) {
+      ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
+      return ret;
+    }
+  }
+  ret = set_current_period(period);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
+    return ret;
+  }
+  // try to set as default. 
may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  ret = set_as_default(true);
+  // failing to become the default is only logged; create() still succeeds
+  if (ret < 0 && ret != -EEXIST) {
+    ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
+  }
+
+  return 0;
+}
+
+// Deletes the realm's system objects, then its control object.
+int RGWRealm::delete_obj()
+{
+  int ret = RGWSystemMetaObj::delete_obj();
+  if (ret < 0) {
+    return ret;
+  }
+  return delete_control();
+}
+
+// Writes an empty control object (oid from get_control_oid()).
+int RGWRealm::create_control(bool exclusive)
+{
+  auto pool = rgw_pool{get_pool(cct)};
+  auto oid = get_control_oid();
+  bufferlist bl;
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, oid});
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .write(bl);
+}
+
+// Removes the control object.
+int RGWRealm::delete_control()
+{
+  auto pool = rgw_pool{get_pool(cct)};
+  auto obj = rgw_raw_obj{pool, get_control_oid()};
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, obj);
+  return sysobj.wop().remove();
+}
+
+// Returns rgw_realm_root_pool, or RGW_DEFAULT_REALM_ROOT_POOL when unset.
+rgw_pool RGWRealm::get_pool(CephContext *cct) const
+{
+  if (cct->_conf->rgw_realm_root_pool.empty()) {
+    return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
+  }
+  return rgw_pool(cct->_conf->rgw_realm_root_pool);
+}
+
+// Returns rgw_default_realm_info_oid, or default_realm_info_oid when unset.
+// old_format is not used here.
+const string RGWRealm::get_default_oid(bool old_format) const
+{
+  if (cct->_conf->rgw_default_realm_info_oid.empty()) {
+    return default_realm_info_oid;
+  }
+  return cct->_conf->rgw_default_realm_info_oid;
+}
+
+const string& RGWRealm::get_names_oid_prefix() const
+{
+  return realm_names_oid_prefix;
+}
+
+// old_format is not used here.
+const string& RGWRealm::get_info_oid_prefix(bool old_format) const
+{
+  return realm_info_oid_prefix;
+}
+
+// Makes `period` the realm's current period. Returns -EINVAL if the period's
+// realm epoch is older than ours, or equal but with a different period id.
+int RGWRealm::set_current_period(RGWPeriod& period)
+{
+  // update realm epoch to match the period's
+  if (epoch > period.get_realm_epoch()) {
+    ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
+        << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
+    return -EINVAL;
+  }
+  if (epoch == period.get_realm_epoch() && current_period != 
period.get_id()) {
+    ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
+        << period.get_realm_epoch() << ", but different period id "
+        << period.get_id() << " != " << current_period << dendl;
+    return -EINVAL;
+  }
+
+  epoch = period.get_realm_epoch();
+  current_period = period.get_id();
+
+  // persist the realm, then call period.reflect()
+  int ret = update();
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  ret = period.reflect();
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Control object oid: info oid prefix + realm id + ".control".
+string RGWRealm::get_control_oid() const
+{
+  return get_info_oid_prefix() + id + ".control";
+}
+
+// Sends `bl` as a notify on the realm's control object.
+int RGWRealm::notify_zone(bufferlist& bl)
+{
+  rgw_pool pool{get_pool(cct)};
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, get_control_oid()});
+  int ret = sysobj.wn().notify(bl, 0, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+  return 0;
+}
+
+// Encodes ZonesNeedPeriod + the period, followed by Reload, into a single
+// notification and sends it with notify_zone().
+int RGWRealm::notify_new_period(const RGWPeriod& period)
+{
+  bufferlist bl;
+  using ceph::encode;
+  // push the period to dependent zonegroups/zones
+  encode(RGWRealmNotify::ZonesNeedPeriod, bl);
+  encode(period, bl);
+  // reload the gateway with the new period
+  encode(RGWRealmNotify::Reload, bl);
+
+  return notify_zone(bl);
+}
+
+// Returns "period_config.default" for an empty realm id, otherwise
+// "period_config." followed by the realm id.
+std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
+{
+  if (realm_id.empty()) {
+    return "period_config.default";
+  }
+  return "period_config." 
+ realm_id;
+}
+
+// Returns rgw_period_root_pool, or RGW_DEFAULT_PERIOD_ROOT_POOL when unset.
+rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
+{
+  const auto& pool_name = cct->_conf->rgw_period_root_pool;
+  if (pool_name.empty()) {
+    return {RGW_DEFAULT_PERIOD_ROOT_POOL};
+  }
+  return {pool_name};
+}
+
+// Reads the period config object for realm_id and decodes it into *this.
+// Returns the read error, or -EIO on decode failure (nothing is logged).
+int RGWPeriodConfig::read(RGWSI_SysObj *sysobj_svc, const std::string& realm_id)
+{
+  const auto& pool = get_pool(sysobj_svc->ctx());
+  const auto& oid = get_oid(realm_id);
+  bufferlist bl;
+
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, oid});
+  int ret = sysobj.rop().read(&bl);
+  if (ret < 0) {
+    return ret;
+  }
+  using ceph::decode;
+  try {
+    auto iter = bl.cbegin();
+    decode(*this, iter);
+  } catch (buffer::error& err) {
+    return -EIO;
+  }
+  return 0;
+}
+
+// Encodes *this and writes it (non-exclusively) to the period config object.
+int RGWPeriodConfig::write(RGWSI_SysObj *sysobj_svc, const std::string& realm_id)
+{
+  const auto& pool = get_pool(sysobj_svc->ctx());
+  const auto& oid = get_oid(realm_id);
+  bufferlist bl;
+  using ceph::encode;
+  encode(*this, bl);
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, oid});
+  return sysobj.wop()
+               .set_exclusive(false)
+               .write(bl);
+}
+
+// Records the context, service and realm id/name; when setup_obj is set,
+// continues with the three-argument init() below.
+int RGWPeriod::init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const string& period_realm_id,
+		    const string& period_realm_name, bool setup_obj)
+{
+  cct = _cct;
+  sysobj_svc = _sysobj_svc;
+
+  realm_id = period_realm_id;
+  realm_name = period_realm_name;
+
+  if (!setup_obj)
+    return 0;
+
+  return init(_cct, _sysobj_svc, setup_obj);
+}
+
+
+// When setup_obj is set: if no period id is set, takes the current period
+// from the realm.
+int RGWPeriod::init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, bool setup_obj)
+{
+  cct = _cct;
+  sysobj_svc = _sysobj_svc;
+
+  if (!setup_obj)
+    return 0;
+
+  if (id.empty()) {
+    RGWRealm realm(realm_id, realm_name);
+    int ret = realm.init(cct, sysobj_svc);
+    if (ret < 0) {
+      ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name  << " id " << realm_id << " : " <<
+	cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    id = realm.get_current_period();
+    
realm_id = realm.get_id();
+  }
+
+  // no epoch given: use the latest one recorded for this period
+  if (!epoch) {
+    int ret = use_latest_epoch();
+    if (ret < 0) {
+      ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name  << " id " << realm_id
+	   << " : " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+  }
+
+  return read_info();
+}
+
+
+// Copies the zonegroup with the given id out of the period map; an empty id
+// looks up "default". Returns -ENOENT when not found.
+// NOTE(review): `map::const_iterator` below has lost its template arguments
+// (looks like markup stripping during extraction) - restore from upstream.
+int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup,
+			     const string& zonegroup_id) const
+{
+  map::const_iterator iter;
+  if (!zonegroup_id.empty()) {
+    iter = period_map.zonegroups.find(zonegroup_id);
+  } else {
+    iter = period_map.zonegroups.find("default");
+  }
+  if (iter != period_map.zonegroups.end()) {
+    zonegroup = iter->second;
+    return 0;
+  }
+
+  return -ENOENT;
+}
+
+// Returns rgw_period_latest_epoch_info_oid, or period_latest_epoch_info_oid
+// when unset.
+const string& RGWPeriod::get_latest_epoch_oid() const
+{
+  if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
+    return period_latest_epoch_info_oid;
+  }
+  return cct->_conf->rgw_period_latest_epoch_info_oid;
+}
+
+const string& RGWPeriod::get_info_oid_prefix() const
+{
+  return period_info_oid_prefix;
+}
+
+// Info oid prefix + period id.
+const string RGWPeriod::get_period_oid_prefix() const
+{
+  return get_info_oid_prefix() + id;
+}
+
+// Period oid prefix, plus "." and the epoch unless this is the staging period.
+const string RGWPeriod::get_period_oid() const
+{
+  std::ostringstream oss;
+  oss << get_period_oid_prefix();
+  // skip the epoch for the staging period
+  if (id != get_staging_id(realm_id))
+    oss << "." 
<< epoch;
+  return oss.str();
+}
+
+// Reads and decodes the period's latest-epoch object into `info`.
+// `objv` (may be null) is attached to the read op so the caller can later
+// issue a write guarded by the same version tracker.
+// Returns the read error as-is, or -EIO when decoding fails.
+int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
+                                 RGWObjVersionTracker *objv)
+{
+  string oid = get_period_oid_prefix() + get_latest_epoch_oid();
+
+  rgw_pool pool(get_pool(cct));
+  bufferlist bl;
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, oid});
+  // previously objv was accepted but never handed to the read op
+  int ret = sysobj.rop()
+                  .set_objv_tracker(objv)
+                  .read(&bl);
+  if (ret < 0) {
+    ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
+    return ret;
+  }
+  try {
+    auto iter = bl.cbegin();
+    using ceph::decode;
+    decode(info, iter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Stores the latest recorded epoch of this period in latest_epoch.
+int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
+{
+  RGWPeriodLatestEpochInfo info;
+
+  int ret = read_latest_epoch(info);
+  if (ret < 0) {
+    return ret;
+  }
+
+  latest_epoch = info.epoch;
+
+  return 0;
+}
+
+// Sets this period's epoch to the latest recorded epoch.
+int RGWPeriod::use_latest_epoch()
+{
+  RGWPeriodLatestEpochInfo info;
+  int ret = read_latest_epoch(info);
+  if (ret < 0) {
+    return ret;
+  }
+
+  epoch = info.epoch;
+
+  return 0;
+}
+
+// Writes `epoch` to the period's latest-epoch object. `exclusive` is forwarded
+// to set_exclusive(); `objv` (may be null) is attached to the write op so that
+// update_latest_epoch()'s -ECANCELED retry path can take effect.
+int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
+                                RGWObjVersionTracker *objv)
+{
+  string oid = get_period_oid_prefix() + get_latest_epoch_oid();
+
+  rgw_pool pool(get_pool(cct));
+  bufferlist bl;
+
+  RGWPeriodLatestEpochInfo info;
+  info.epoch = epoch;
+
+  using ceph::encode;
+  encode(info, bl);
+
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid));
+  // previously objv was silently dropped here as well
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .set_objv_tracker(objv)
+               .write(bl);
+}
+
+// Advances the latest-epoch object to `epoch`, retrying up to MAX_RETRIES
+// times when the write races with another writer.
+int RGWPeriod::update_latest_epoch(epoch_t epoch)
+{
+  static constexpr int MAX_RETRIES = 20;
+
+  for (int i = 0; i < MAX_RETRIES; i++) {
+    RGWPeriodLatestEpochInfo info;
+    RGWObjVersionTracker objv;
+    bool exclusive = false;
+
+    // read existing epoch
+    int r = read_latest_epoch(info, &objv);
+    if (r == -ENOENT) {
+      // use an exclusive create to set 
the epoch atomically
+      exclusive = true;
+      ldout(cct, 20) << "creating initial latest_epoch=" << epoch
+          << " for period=" << id << dendl;
+    } else if (r < 0) {
+      ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
+      return r;
+    } else if (epoch <= info.epoch) {
+      r = -EEXIST; // fail with EEXIST if epoch is not newer
+      ldout(cct, 10) << "found existing latest_epoch " << info.epoch
+          << " >= given epoch " << epoch << ", returning r=" << r << dendl;
+      return r;
+    } else {
+      ldout(cct, 20) << "updating latest_epoch from " << info.epoch
+          << " -> " << epoch << " on period=" << id << dendl;
+    }
+
+    r = set_latest_epoch(epoch, exclusive, &objv);
+    if (r == -EEXIST) {
+      continue; // exclusive create raced with another update, retry
+    } else if (r == -ECANCELED) {
+      continue; // write raced with a conflicting version, retry
+    }
+    if (r < 0) {
+      ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
+      return r;
+    }
+    return 0; // return success
+  }
+
+  return -ECANCELED; // fail after max retries
+}
+
+// Removes the period object of every epoch from 1 to `epoch`, then the
+// latest-epoch object. Per-epoch failures are only logged; the return value
+// is the result of removing the latest-epoch object.
+int RGWPeriod::delete_obj()
+{
+  rgw_pool pool(get_pool(cct));
+
+  // delete the object for each period epoch
+  for (epoch_t e = 1; e <= epoch; e++) {
+    RGWPeriod p{get_id(), e};
+    rgw_raw_obj oid{pool, p.get_period_oid()};
+    auto obj_ctx = sysobj_svc->init_obj_ctx();
+    auto sysobj = sysobj_svc->get_obj(obj_ctx, oid);
+    int ret = sysobj.wop().remove();
+    if (ret < 0) {
+      ldout(cct, 0) << "WARNING: failed to delete period object " << oid
+          << ": " << cpp_strerror(-ret) << dendl;
+    }
+  }
+
+  // delete the .latest_epoch object
+  rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, oid);
+  int ret = sysobj.wop().remove();
+  if (ret < 0) {
+    ldout(cct, 0) << "WARNING: failed to delete period object " << oid
+        << ": " << cpp_strerror(-ret) << dendl;
+  }
+  return ret;
+}
+
+// Reads the period object (get_period_oid()) and decodes it into *this.
+int RGWPeriod::read_info()
+{
+  rgw_pool pool(get_pool(cct));
+
+  bufferlist 
bl;
+
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj{pool, get_period_oid()});
+  int ret = sysobj.rop().read(&bl);
+  if (ret < 0) {
+    ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  try {
+    using ceph::decode;
+    auto iter = bl.cbegin();
+    decode(*this, iter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Assigns a fresh random uuid as the period id, resets the epoch to
+// FIRST_EPOCH, stores the period info and records the latest epoch.
+int RGWPeriod::create(bool exclusive)
+{
+  int ret;
+
+  /* create unique id */
+  uuid_d new_uuid;
+  char uuid_str[37];
+  new_uuid.generate_random();
+  new_uuid.print(uuid_str);
+  id = uuid_str;
+
+  epoch = FIRST_EPOCH;
+
+  period_map.id = id;
+
+  ret = store_info(exclusive);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR:  storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  ret = set_latest_epoch(epoch);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
+  }
+
+  return ret;
+}
+
+// Encodes *this and writes it to the period object (get_period_oid()).
+int RGWPeriod::store_info(bool exclusive)
+{
+  rgw_pool pool(get_pool(cct));
+
+  string oid = get_period_oid();
+  bufferlist bl;
+  using ceph::encode;
+  encode(*this, bl);
+
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid));
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .write(bl);
+}
+
+// Returns rgw_period_root_pool, or RGW_DEFAULT_PERIOD_ROOT_POOL when unset.
+rgw_pool RGWPeriod::get_pool(CephContext *cct) const
+{
+  if (cct->_conf->rgw_period_root_pool.empty()) {
+    return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
+  }
+  return rgw_pool(cct->_conf->rgw_period_root_pool);
+}
+
+// Adds/updates the zonegroup in the period map and stores the period.
+// A zonegroup that belongs to a different realm is ignored (returns 0).
+int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
+{
+  if (zonegroup.realm_id != realm_id) {
+    return 0;
+  }
+  int ret = period_map.update(zonegroup, cct);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << 
dendl;
+    return ret;
+  }
+
+  return store_info(false);
+}
+
+// Rebuilds the period map from the zonegroups returned by the zone service
+// that belong to this realm, records the master zonegroup/zone, and reloads
+// the period config (a missing config object is tolerated).
+// NOTE(review): `list zonegroups;` below has lost its template argument
+// (looks like markup stripping during extraction) - restore from upstream.
+int RGWPeriod::update()
+{
+  auto zone_svc = sysobj_svc->get_zone_svc();
+  ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
+  list zonegroups;
+  int ret = zone_svc->list_zonegroups(zonegroups);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  // clear zone short ids of removed zones. period_map.update() will add the
+  // remaining zones back
+  period_map.short_zone_ids.clear();
+
+  for (auto& iter : zonegroups) {
+    RGWZoneGroup zg(string(), iter);
+    ret = zg.init(cct, sysobj_svc);
+    if (ret < 0) {
+      // a zonegroup that fails to init is skipped, not treated as fatal
+      ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
+      continue;
+    }
+
+    if (zg.realm_id != realm_id) {
+      ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
+      continue;
+    }
+
+    if (zg.master_zone.empty()) {
+      ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+      return -EINVAL;
+    }
+
+    if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+      ldout(cct,0) << "ERROR: zonegroup " << zg.get_name()
+                   << " has a non existent master zone "<< dendl;
+      return -EINVAL;
+    }
+
+    if (zg.is_master_zonegroup()) {
+      master_zonegroup = zg.get_id();
+      master_zone = zg.master_zone;
+    }
+
+    // NOTE(review): this `ret` shadows the function-level one
+    int ret = period_map.update(zg, cct);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  ret = period_config.read(sysobj_svc, realm_id);
+  if (ret < 0 && ret != -ENOENT) {
+    ldout(cct, 0) << "ERROR: failed to read period config: "
+        << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Writes every zonegroup of the period map back to its own object, tries to
+// make the master zonegroup the default, and stores the period config.
+int RGWPeriod::reflect()
+{
+  for (auto& iter : period_map.zonegroups) {
+    RGWZoneGroup& zg = iter.second;
+    zg.reinit_instance(cct, sysobj_svc);
+    int r = zg.write(false);
+    if (r < 0) {
+      ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << 
cpp_strerror(-r) << dendl; + return r; + } + if (zg.is_master_zonegroup()) { + // set master as default if no default exists + r = zg.set_as_default(true); + if (r == 0) { + ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id() + << " as the default" << dendl; + } + } + } + + int r = period_config.write(sysobj_svc, realm_id); + if (r < 0) { + ldout(cct, 0) << "ERROR: failed to store period config: " + << cpp_strerror(-r) << dendl; + return r; + } + return 0; +} + +void RGWPeriod::fork() +{ + ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl; + predecessor_uuid = id; + id = get_staging_id(realm_id); + period_map.reset(); + realm_epoch++; +} + +static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status) +{ + // initialize a sync status manager to read the status + RGWMetaSyncStatusManager mgr(store, store->get_async_rados()); + int r = mgr.init(); + if (r < 0) { + return r; + } + r = mgr.read_sync_status(sync_status); + mgr.stop(); + return r; +} + +int RGWPeriod::update_sync_status(RGWRados *store, /* for now */ + const RGWPeriod ¤t_period, + std::ostream& error_stream, + bool force_if_stale) +{ + rgw_meta_sync_status status; + int r = read_sync_status(store, &status); + if (r < 0) { + ldout(cct, 0) << "period failed to read sync status: " + << cpp_strerror(-r) << dendl; + return r; + } + + std::vector markers; + + const auto current_epoch = current_period.get_realm_epoch(); + if (current_epoch != status.sync_info.realm_epoch) { + // no sync status markers for the current period + ceph_assert(current_epoch > status.sync_info.realm_epoch); + const int behind = current_epoch - status.sync_info.realm_epoch; + if (!force_if_stale && current_epoch > 1) { + error_stream << "ERROR: This zone is " << behind << " period(s) behind " + "the current master zone in metadata sync. 
If this zone is promoted "
+          "to master, any metadata changes during that time are likely to "
+          "be lost.\n"
+          "Waiting for this zone to catch up on metadata sync (see "
+          "'radosgw-admin sync status') is recommended.\n"
+          "To promote this zone to master anyway, add the flag "
+          "--yes-i-really-mean-it." << std::endl;
+      return -EINVAL;
+    }
+    // empty sync status markers - other zones will skip this period during
+    // incremental metadata sync
+    markers.resize(status.sync_info.num_shards);
+  } else {
+    markers.reserve(status.sync_info.num_shards);
+    for (auto& i : status.sync_markers) {
+      auto& marker = i.second;
+      // filter out markers from other periods
+      if (marker.realm_epoch != current_epoch) {
+        marker.marker.clear();
+      }
+      markers.emplace_back(std::move(marker.marker));
+    }
+  }
+
+  std::swap(sync_status, markers);
+  return 0;
+}
+
+// Commits this (staging) period on top of current_period. The checks below
+// reject the commit with -EINVAL and write an explanation to error_stream.
+int RGWPeriod::commit(RGWRados *store,
+                      RGWRealm& realm, const RGWPeriod& current_period,
+                      std::ostream& error_stream, bool force_if_stale)
+{
+  auto zone_svc = sysobj_svc->get_zone_svc();
+  ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
+  // gateway must be in the master zone to commit
+  if (master_zone != zone_svc->get_zone_params().get_id()) {
+    error_stream << "Cannot commit period on zone "
+        << zone_svc->get_zone_params().get_id() << ", it must be sent to "
+        "the period's master zone " << master_zone << '.' << std::endl;
+    return -EINVAL;
+  }
+  // period predecessor must match current period
+  if (predecessor_uuid != current_period.get_id()) {
+    error_stream << "Period predecessor " << predecessor_uuid
+        << " does not match current period " << current_period.get_id()
+        << ". Use 'period pull' to get the latest period from the master, "
+        "reapply your changes, and try again." 
<< std::endl;
+    return -EINVAL;
+  }
+  // realm epoch must be 1 greater than current period
+  if (realm_epoch != current_period.get_realm_epoch() + 1) {
+    error_stream << "Period's realm epoch " << realm_epoch
+        << " does not come directly after current realm epoch "
+        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
+        "latest realm and period from the master zone, reapply your changes, "
+        "and try again." << std::endl;
+    return -EINVAL;
+  }
+  // did the master zone change?
+  if (master_zone != current_period.get_master_zone()) {
+    // store the current metadata sync status in the period
+    int r = update_sync_status(store, current_period, error_stream, force_if_stale);
+    if (r < 0) {
+      ldout(cct, 0) << "failed to update metadata sync status: "
+          << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    // create an object with a new period id
+    r = create(true);
+    if (r < 0) {
+      ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    // set as current period
+    r = realm.set_current_period(*this);
+    if (r < 0) {
+      ldout(cct, 0) << "failed to update realm's current period: "
+          << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    ldout(cct, 4) << "Promoted to master zone and committed new period "
+        << id << dendl;
+    // the notification result is not checked
+    realm.notify_new_period(*this);
+    return 0;
+  }
+  // period must be based on current epoch
+  if (epoch != current_period.get_epoch()) {
+    error_stream << "Period epoch " << epoch << " does not match "
+        "predecessor epoch " << current_period.get_epoch()
+        << ". Use 'period pull' to get the latest epoch from the master zone, "
+        "reapply your changes, and try again." 
<< std::endl;
+    return -EINVAL;
+  }
+  // set period as next epoch
+  set_id(current_period.get_id());
+  set_epoch(current_period.get_epoch() + 1);
+  set_predecessor(current_period.get_predecessor());
+  realm_epoch = current_period.get_realm_epoch();
+  // write the period to rados
+  int r = store_info(false);
+  if (r < 0) {
+    ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  // set as latest epoch
+  r = update_latest_epoch(epoch);
+  if (r == -EEXIST) {
+    // already have this epoch (or a more recent one)
+    return 0;
+  }
+  if (r < 0) {
+    ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  r = reflect();
+  if (r < 0) {
+    ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  ldout(cct, 4) << "Committed new epoch " << epoch
+      << " for period " << id << dendl;
+  // the notification result is not checked
+  realm.notify_new_period(*this);
+  return 0;
+}
+
+// Creates the zone under default_zone_name; with old_format the name is then
+// replaced by the id.
+int RGWZoneParams::create_default(bool old_format)
+{
+  name = default_zone_name;
+
+  int r = create();
+  if (r < 0) {
+    return r;
+  }
+
+  if (old_format) {
+    name = id;
+  }
+
+  return r;
+}
+
+
+// Collects into pool_names the pools used by every zone in `zones` other than
+// my_zone_id. Fails on the first zone that cannot be initialized.
+// NOTE(review): `const list& zones` and `set& pool_names` have lost their
+// template arguments (looks like markup stripping) - restore from upstream.
+// NOTE(review): zone.lc_pool is not collected here although fix_pool_names()
+// de-duplicates lc_pool - confirm whether that is intentional.
+int get_zones_pool_set(CephContext* cct,
+                       RGWSI_SysObj* sysobj_svc,
+                       const list& zones,
+                       const string& my_zone_id,
+                       set& pool_names)
+{
+  for(auto const& iter : zones) {
+    RGWZoneParams zone(iter);
+    int r = zone.init(cct, sysobj_svc);
+    if (r < 0) {
+      ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    if (zone.get_id() != my_zone_id) {
+      pool_names.insert(zone.domain_root);
+      pool_names.insert(zone.metadata_heap);
+      pool_names.insert(zone.control_pool);
+      pool_names.insert(zone.gc_pool);
+      pool_names.insert(zone.log_pool);
+      pool_names.insert(zone.intent_log_pool);
+      pool_names.insert(zone.usage_log_pool);
+      pool_names.insert(zone.user_keys_pool);
+      pool_names.insert(zone.user_email_pool);
+      pool_names.insert(zone.user_swift_pool);
+      pool_names.insert(zone.user_uid_pool);
+      
pool_names.insert(zone.otp_pool);
+      pool_names.insert(zone.roles_pool);
+      pool_names.insert(zone.reshard_pool);
+      for(auto& iter : zone.placement_pools) {
+	pool_names.insert(iter.second.index_pool);
+        for (auto& pi : iter.second.storage_classes.get_all()) {
+          if (pi.second.data_pool) {
+            pool_names.insert(pi.second.data_pool.get());
+          }
+        }
+	pool_names.insert(iter.second.data_extra_pool);
+      }
+    }
+  }
+  return 0;
+}
+
+// Returns a pool name that is not in `pools`. The candidate is prefix+suffix,
+// taken from suggested_pool (split at its first '.') when that is non-empty,
+// otherwise from the defaults. On a clash, "_<rand>" is inserted after the
+// prefix until an unused name is found.
+// NOTE(review): `set pools` has lost its template argument (markup stripping);
+// it is also taken by value - confirm against upstream.
+rgw_pool fix_zone_pool_dup(set pools,
+                           const string& default_prefix,
+                           const string& default_suffix,
+                           const rgw_pool& suggested_pool)
+{
+  string suggested_name = suggested_pool.to_str();
+
+  string prefix = default_prefix;
+  string suffix = default_suffix;
+
+  if (!suggested_pool.empty()) {
+    prefix = suggested_name.substr(0, suggested_name.find("."));
+    suffix = suggested_name.substr(prefix.length());
+  }
+
+  rgw_pool pool(prefix + suffix);
+
+  if (pools.find(pool) == pools.end()) {
+    return pool;
+  } else {
+    while(true) {
+      pool =  prefix + "_" + std::to_string(std::rand()) + suffix;
+      if (pools.find(pool) == pools.end()) {
+	return pool;
+      }
+    }
+  }
+}
+
+// Assigns every pool of this zone a name that does not collide with the pools
+// of the other zones, falling back to "<zone name><suffix>" defaults.
+// NOTE(review): `list zones;` and `set pools;` have lost their template
+// arguments (markup stripping) - restore from upstream.
+int RGWZoneParams::fix_pool_names()
+{
+
+  list zones;
+  int r = zone_svc->list_zones(zones);
+  if (r < 0) {
+    // listing failure is only logged; processing continues with what we have
+    ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
+  }
+
+  set pools;
+  r = get_zones_pool_set(cct, sysobj_svc, zones, id, pools);
+  if (r < 0) {
+    ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
+    return r;
+  }
+
+  domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
+  // metadata_heap is only fixed up when it is already configured
+  if (!metadata_heap.name.empty()) {
+    metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
+  }
+  control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
+  gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
+  lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
+  log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
+  intent_log_pool = fix_zone_pool_dup(pools, name, 
".rgw.log:intent", intent_log_pool);
+  usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
+  user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
+  user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
+  user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
+  user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
+  roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
+  reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
+  otp_pool = fix_zone_pool_dup(pools, name, ".rgw.otp", otp_pool);
+
+  // per-placement pools: index, one data pool per storage class, data-extra
+  for(auto& iter : placement_pools) {
+    iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
+                                               iter.second.index_pool);
+    for (auto& pi : iter.second.storage_classes.get_all()) {
+      if (pi.second.data_pool) {
+        rgw_pool& pool = pi.second.data_pool.get();
+        pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
+                                 pool);
+      }
+    }
+    iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
+                                                   iter.second.data_extra_pool);
+  }
+
+  return 0;
+}
+
+// Creates the zone: sets up a "default-placement" rule when the old
+// avail_pools object cannot be stat'ed, de-duplicates pool names, stores the
+// zone, and finally tries to make it the default zone.
+int RGWZoneParams::create(bool exclusive)
+{
+  /* check for old pools config */
+  rgw_raw_obj obj(domain_root, avail_pools);
+  auto obj_ctx = sysobj_svc->init_obj_ctx();
+  auto sysobj = sysobj_svc->get_obj(obj_ctx, obj);
+  int r = sysobj.rop().stat();
+  // any stat failure is treated as "no old config present"
+  if (r < 0) {
+    ldout(cct, 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
+    /* a new system, let's set new placement info */
+    RGWZonePlacementInfo default_placement;
+    default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
+    rgw_pool pool = name + "." 
+ default_storage_pool_suffix;
+    default_placement.storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
+    default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
+    placement_pools["default-placement"] = default_placement;
+  }
+
+  r = fix_pool_names();
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
+    return r;
+  }
+
+  r = RGWSystemMetaObj::create(exclusive);
+  if (r < 0) {
+    return r;
+  }
+
+  // try to set as default. may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  r = set_as_default(true);
+  // failing to become the default is only logged; create() still succeeds
+  if (r < 0 && r != -EEXIST) {
+    ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
+  }
+
+  return 0;
+}
+
+// Returns rgw_zone_root_pool, or RGW_DEFAULT_ZONE_ROOT_POOL when unset.
+rgw_pool RGWZoneParams::get_pool(CephContext *cct) const
+{
+  if (cct->_conf->rgw_zone_root_pool.empty()) {
+    return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
+  }
+
+  return rgw_pool(cct->_conf->rgw_zone_root_pool);
+}
+
+// Old format: rgw_default_zone_info_oid as-is. Otherwise the same value
+// followed by "." and the realm id.
+const string RGWZoneParams::get_default_oid(bool old_format) const
+{
+  if (old_format) {
+    return cct->_conf->rgw_default_zone_info_oid;
+  }
+
+  return cct->_conf->rgw_default_zone_info_oid + "." 
+ realm_id;
+}
+
+const string& RGWZoneParams::get_names_oid_prefix() const
+{
+  return zone_names_oid_prefix;
+}
+
+// old_format is not used here.
+const string& RGWZoneParams::get_info_oid_prefix(bool old_format) const
+{
+  return zone_info_oid_prefix;
+}
+
+// Returns the rgw_zone configuration value.
+const string& RGWZoneParams::get_predefined_name(CephContext *cct) const {
+  return cct->_conf->rgw_zone;
+}
+
+// Falls back to the rgw_zone config value when no name is set, then runs the
+// base-class init.
+int RGWZoneParams::init(CephContext *cct, RGWSI_SysObj *sysobj_svc, bool setup_obj, bool old_format)
+{
+  if (name.empty()) {
+    name = cct->_conf->rgw_zone;
+  }
+
+  return RGWSystemMetaObj::init(cct, sysobj_svc, setup_obj, old_format);
+}
+
+// Resolves the default zone id. With no realm id set, the realm is
+// initialized first; if that fails, the id of default_zone_name is looked up
+// instead.
+int RGWZoneParams::read_default_id(string& default_id, bool old_format)
+{
+  if (realm_id.empty()) {
+    /* try using default realm */
+    RGWRealm realm;
+    int ret = realm.init(cct, sysobj_svc);
+    //no default realm exist
+    if (ret < 0) {
+      return read_id(default_zone_name, default_id);
+    }
+    realm_id = realm.get_id();
+  }
+
+  return RGWSystemMetaObj::read_default_id(default_id, old_format);
+}
+
+
+// Marks this zone as the default. With no realm id set, the realm is
+// initialized first; any failure there is reported as -EINVAL.
+int RGWZoneParams::set_as_default(bool exclusive)
+{
+  if (realm_id.empty()) {
+    /* try using default realm */
+    RGWRealm realm;
+    int ret = realm.init(cct, sysobj_svc);
+    if (ret < 0) {
+      ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
+      return -EINVAL;
+    }
+    realm_id = realm.get_id();
+  }
+
+  return RGWSystemMetaObj::set_as_default(exclusive);
+}
+
+// Returns the compression type configured for the placement rule's storage
+// class, or "none" when the placement target is unknown or the type is empty.
+const string& RGWZoneParams::get_compression_type(const rgw_placement_rule& placement_rule) const
+{
+  static const std::string NONE{"none"};
+  auto p = placement_pools.find(placement_rule.name);
+  if (p == placement_pools.end()) {
+    return NONE;
+  }
+  const auto& type = p->second.get_compression_type(placement_rule.get_storage_class());
+  return !type.empty() ? 
type : NONE;
+}
+
+void RGWPeriodMap::encode(bufferlist& bl) const {
+  ENCODE_START(2, 1, bl);
+  encode(id, bl);
+  encode(zonegroups, bl);
+  encode(master_zonegroup, bl);
+  encode(short_zone_ids, bl);
+  ENCODE_FINISH(bl);
+}
+
+// short_zone_ids is only present from struct_v 2. After decoding, the by-api
+// index and master_zonegroup are rebuilt from the decoded zonegroups.
+void RGWPeriodMap::decode(bufferlist::const_iterator& bl) {
+  DECODE_START(2, bl);
+  decode(id, bl);
+  decode(zonegroups, bl);
+  decode(master_zonegroup, bl);
+  if (struct_v >= 2) {
+    decode(short_zone_ids, bl);
+  }
+  DECODE_FINISH(bl);
+
+  zonegroups_by_api.clear();
+  for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
+       iter != zonegroups.end(); ++iter) {
+    RGWZoneGroup& zonegroup = iter->second;
+    zonegroups_by_api[zonegroup.api_name] = zonegroup;
+    if (zonegroup.is_master_zonegroup()) {
+      master_zonegroup = zonegroup.get_id();
+    }
+  }
+}
+
+// run an MD5 hash on the zone_id and return the first 32 bits
+// (clamped to a minimum of 1, so 0 is never returned)
+static uint32_t gen_short_zone_id(const std::string zone_id)
+{
+  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  MD5 hash;
+  hash.Update((const unsigned char *)zone_id.c_str(), zone_id.size());
+  hash.Final(md5);
+
+  uint32_t short_id;
+  memcpy((char *)&short_id, md5, sizeof(short_id));
+  return std::max(short_id, 1u);
+}
+
+// Inserts or replaces a zonegroup in the map. Returns -EINVAL when it claims
+// to be master while a different master zonegroup is already recorded.
+int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
+{
+  if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
+    ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
+    ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
+    return -EINVAL;
+  }
+  map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
+  if (iter != zonegroups.end()) {
+    // drop the stale by-api entry of the zonegroup being replaced
+    RGWZoneGroup& old_zonegroup = iter->second;
+    if (!old_zonegroup.api_name.empty()) {
+      zonegroups_by_api.erase(old_zonegroup.api_name);
+    }
+  }
+  zonegroups[zonegroup.get_id()] = zonegroup;
+
+  if (!zonegroup.api_name.empty()) {
+    zonegroups_by_api[zonegroup.api_name] = zonegroup;
+  }
+
+  if (zonegroup.is_master_zonegroup()) {
+    master_zonegroup =
zonegroup.get_id();
+  } else if (master_zonegroup == zonegroup.get_id()) {
+    master_zonegroup = "";
+  }
+
+  for (auto& i : zonegroup.zones) {
+    auto& zone = i.second;
+    // zones that already have a short id keep it
+    if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
+      continue;
+    }
+    // calculate the zone's short id
+    uint32_t short_id = gen_short_zone_id(zone.id);
+
+    // search for an existing zone with the same short id
+    for (auto& s : short_zone_ids) {
+      if (s.second == short_id) {
+        ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
+                      << ") generates the same short_zone_id " << short_id
+                      << " as existing zone id " << s.first << dendl;
+        return -EEXIST;
+      }
+    }
+
+    short_zone_ids[zone.id] = short_id;
+  }
+
+  return 0;
+}
+
+// Returns 0 when the zone id has no recorded short id.
+uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
+{
+  auto i = short_zone_ids.find(zone_id);
+  if (i == short_zone_ids.end()) {
+    return 0;
+  }
+  return i->second;
+}
+
+// Fills this map (quotas, zonegroups, by-api index, master) from an
+// RGWPeriod initialised via period.init().
+int RGWZoneGroupMap::read(CephContext *cct, RGWSI_SysObj *sysobj_svc)
+{
+
+  RGWPeriod period;
+  int ret = period.init(cct, sysobj_svc);
+  if (ret < 0) {
+    cerr << "failed to read current period info: " << cpp_strerror(ret);
+    return ret;
+  }
+
+  bucket_quota = period.get_config().bucket_quota;
+  user_quota = period.get_config().user_quota;
+  zonegroups = period.get_map().zonegroups;
+  zonegroups_by_api = period.get_map().zonegroups_by_api;
+  master_zonegroup = period.get_map().master_zonegroup;
+
+  return 0;
+}
+
+void RGWRegionMap::encode(bufferlist& bl) const {
+  ENCODE_START( 3, 1, bl);
+  encode(regions, bl);
+  encode(master_region, bl);
+  encode(bucket_quota, bl);
+  encode(user_quota, bl);
+  ENCODE_FINISH(bl);
+}
+
+// bucket_quota is present from struct_v 2, user_quota from struct_v 3.
+void RGWRegionMap::decode(bufferlist::const_iterator& bl) {
+  DECODE_START(3, bl);
+  decode(regions, bl);
+  decode(master_region, bl);
+  if (struct_v >= 2)
+    decode(bucket_quota, bl);
+  if (struct_v >= 3)
+    decode(user_quota, bl);
+  DECODE_FINISH(bl);
+}
+
+void RGWZoneGroupMap::encode(bufferlist& bl) const {
+  ENCODE_START( 3, 1, bl);
+  encode(zonegroups, bl);
+  encode(master_zonegroup, bl);
+  encode(bucket_quota, bl);
+  encode(user_quota, bl);
+  ENCODE_FINISH(bl);
+}
+
+// bucket_quota is present from struct_v 2, user_quota from struct_v 3.
+// The by-api index is rebuilt after decoding; note that master_zonegroup is
+// set from get_name() here, whereas RGWPeriodMap::decode uses get_id().
+void RGWZoneGroupMap::decode(bufferlist::const_iterator& bl) {
+  DECODE_START(3, bl);
+  decode(zonegroups, bl);
+  decode(master_zonegroup, bl);
+  if (struct_v >= 2)
+    decode(bucket_quota, bl);
+  if (struct_v >= 3)
+    decode(user_quota, bl);
+  DECODE_FINISH(bl);
+
+  zonegroups_by_api.clear();
+  for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
+       iter != zonegroups.end(); ++iter) {
+    RGWZoneGroup& zonegroup = iter->second;
+    zonegroups_by_api[zonegroup.api_name] = zonegroup;
+    if (zonegroup.is_master_zonegroup()) {
+      master_zonegroup = zonegroup.get_name();
+    }
+  }
+}
+
+
diff --git a/src/rgw/rgw_zone.h b/src/rgw/rgw_zone.h
new file mode 100644
index 00000000..89f635a5
--- /dev/null
+++ b/src/rgw/rgw_zone.h
@@ -0,0 +1,1145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_ZONE_H
+#define CEPH_RGW_ZONE_H
+
+#include "rgw_common.h"
+
+// Shared oid prefixes, default names and default root pools; declared here,
+// defined elsewhere.
+namespace rgw_zone_defaults {
+
+extern std::string zone_names_oid_prefix;
+extern std::string region_info_oid_prefix;
+extern std::string realm_names_oid_prefix;
+extern std::string zone_group_info_oid_prefix;
+extern std::string realm_info_oid_prefix;
+extern std::string default_region_info_oid;
+extern std::string default_zone_group_info_oid;
+extern std::string region_map_oid;
+extern std::string default_realm_info_oid;
+extern std::string default_zonegroup_name;
+extern std::string default_zone_name;
+extern std::string zonegroup_names_oid_prefix;
+extern std::string RGW_DEFAULT_ZONE_ROOT_POOL;
+extern std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL;
+extern std::string RGW_DEFAULT_REALM_ROOT_POOL;
+extern std::string RGW_DEFAULT_PERIOD_ROOT_POOL;
+extern std::string avail_pools;
+extern std::string default_storage_pool_suffix;
+
+}
+
+class JSONObj;
+class RGWSyncModulesManager;
+
+// Encodable wrapper around a single object id string.
+struct RGWNameToId {
+  std::string obj_id;
+
+  void encode(bufferlist& bl)
const {
+    ENCODE_START(1, 1, bl);
+    encode(obj_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(obj_id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWNameToId)
+
+// Encodable wrapper around the id of the default object.
+struct RGWDefaultSystemMetaObjInfo {
+  std::string default_id;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(default_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(default_id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWDefaultSystemMetaObjInfo)
+
+class RGWSI_SysObj;
+class RGWSI_Zone;
+
+// Base class for objects identified by an (id, name) pair. Subclasses supply
+// the pool and oid prefixes through the pure virtual accessors.
+class RGWSystemMetaObj {
+protected:
+  std::string id;
+  std::string name;
+
+  CephContext *cct{nullptr};
+  RGWSI_SysObj *sysobj_svc{nullptr};
+  RGWSI_Zone *zone_svc{nullptr};
+
+  int store_name(bool exclusive);
+  int store_info(bool exclusive);
+  int read_info(const std::string& obj_id, bool old_format = false);
+  int read_id(const std::string& obj_name, std::string& obj_id);
+  int read_default(RGWDefaultSystemMetaObjInfo& default_info,
+                   const std::string& oid);
+  /* read and use default id */
+  int use_default(bool old_format = false);
+
+public:
+  RGWSystemMetaObj() {}
+  RGWSystemMetaObj(const std::string& _name): name(_name) {}
+  RGWSystemMetaObj(const std::string& _id, const std::string& _name) : id(_id), name(_name) {}
+  RGWSystemMetaObj(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) {
+    reinit_instance(_cct, _sysobj_svc);
+  }
+  RGWSystemMetaObj(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): name(_name) {
+    reinit_instance(_cct, _sysobj_svc);
+  }
+
+  const std::string& get_name() const { return name; }
+  const std::string& get_id() const { return id; }
+
+  void set_name(const std::string& _name) { name = _name;}
+  void set_id(const std::string& _id) { id = _id;}
+  void clear_id() { id.clear(); }
+
+  virtual ~RGWSystemMetaObj() {}
+
+  virtual void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(id, bl);
+    encode(name, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  virtual void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(id, bl);
+    decode(name, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc);
+  int init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, bool setup_obj = true, bool old_format = false);
+  virtual int read_default_id(std::string& default_id, bool old_format = false);
+  virtual int set_as_default(bool exclusive = false);
+  int delete_default();
+  virtual int create(bool exclusive = true);
+  int delete_obj(bool old_format = false);
+  int rename(const std::string& new_name);
+  // update()/update_name() are non-exclusive stores of the info/name objects
+  int update() { return store_info(false);}
+  int update_name() { return store_name(false);}
+  int read();
+  int write(bool exclusive);
+
+  virtual rgw_pool get_pool(CephContext *cct) const = 0;
+  virtual const std::string get_default_oid(bool old_format = false) const = 0;
+  virtual const std::string& get_names_oid_prefix() const = 0;
+  virtual const std::string& get_info_oid_prefix(bool old_format = false) const = 0;
+  virtual const std::string& get_predefined_name(CephContext *cct) const = 0;
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWSystemMetaObj)
+
+// Per-storage-class settings; both fields are optional so that "unset" can be
+// told apart from an empty value.
+struct RGWZoneStorageClass {
+  boost::optional<rgw_pool> data_pool;
+  boost::optional<std::string> compression_type;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(data_pool, bl);
+    encode(compression_type, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(data_pool, bl);
+    decode(compression_type, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneStorageClass)
+
+
+// Storage classes keyed by name; the STANDARD entry always exists.
+class RGWZoneStorageClasses {
+  map<string, RGWZoneStorageClass> m;
+
+  /* in
memory only */
+  RGWZoneStorageClass *standard_class;
+
+public:
+  // standard_class points into m, so every constructor, assignment and
+  // decode re-derives it from this object's own map.
+  RGWZoneStorageClasses() {
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+  }
+  RGWZoneStorageClasses(const RGWZoneStorageClasses& rhs) {
+    m = rhs.m;
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+  }
+  RGWZoneStorageClasses& operator=(const RGWZoneStorageClasses& rhs) {
+    m = rhs.m;
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+    return *this;
+  }
+
+  const RGWZoneStorageClass& get_standard() const {
+    return *standard_class;
+  }
+
+  // Exact-name lookup; *pstorage_class is only written on success.
+  bool find(const string& sc, const RGWZoneStorageClass **pstorage_class) const {
+    auto iter = m.find(sc);
+    if (iter == m.end()) {
+      return false;
+    }
+    *pstorage_class = &iter->second;
+    return true;
+  }
+
+  // An empty name is always reported as existing.
+  bool exists(const string& sc) const {
+    if (sc.empty()) {
+      return true;
+    }
+    auto iter = m.find(sc);
+    return (iter != m.end());
+  }
+
+  const map<string, RGWZoneStorageClass>& get_all() const {
+    return m;
+  }
+
+  map<string, RGWZoneStorageClass>& get_all() {
+    return m;
+  }
+
+  // An empty sc means STANDARD. Null data_pool / compression_type leave the
+  // corresponding field untouched.
+  void set_storage_class(const string& sc, const rgw_pool *data_pool, const string *compression_type) {
+    const string *psc = &sc;
+    if (sc.empty()) {
+      psc = &RGW_STORAGE_CLASS_STANDARD;
+    }
+    RGWZoneStorageClass& storage_class = m[*psc];
+    if (data_pool) {
+      storage_class.data_pool = *data_pool;
+    }
+    if (compression_type) {
+      storage_class.compression_type = *compression_type;
+    }
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(m, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(m, bl);
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneStorageClasses)
+
+struct RGWZonePlacementInfo {
+  rgw_pool index_pool;
+  rgw_pool data_extra_pool; /* if not set we should use data_pool */
+  RGWZoneStorageClasses storage_classes;
+  RGWBucketIndexType index_type;
+
+  RGWZonePlacementInfo() : index_type(RGWBIType_Normal) {}
+
+  void
encode(bufferlist& bl) const {
+    ENCODE_START(7, 1, bl);
+    encode(index_pool.to_str(), bl);
+    // the STANDARD pool and compression type are still written as separate
+    // fields in addition to the full storage_classes map
+    rgw_pool standard_data_pool = get_data_pool(RGW_STORAGE_CLASS_STANDARD);
+    encode(standard_data_pool.to_str(), bl);
+    encode(data_extra_pool.to_str(), bl);
+    encode((uint32_t)index_type, bl);
+    string standard_compression_type = get_compression_type(RGW_STORAGE_CLASS_STANDARD);
+    encode(standard_compression_type, bl);
+    encode(storage_classes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Field availability by version: data_extra_pool >= 4, index_type >= 5,
+  // compression type >= 6, storage_classes >= 7. Before v7 the STANDARD
+  // class is synthesised from the separately decoded pool and compression.
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(7, bl);
+    string index_pool_str;
+    string data_pool_str;
+    decode(index_pool_str, bl);
+    index_pool = rgw_pool(index_pool_str);
+    decode(data_pool_str, bl);
+    rgw_pool standard_data_pool(data_pool_str);
+    if (struct_v >= 4) {
+      string data_extra_pool_str;
+      decode(data_extra_pool_str, bl);
+      data_extra_pool = rgw_pool(data_extra_pool_str);
+    }
+    if (struct_v >= 5) {
+      uint32_t it;
+      decode(it, bl);
+      index_type = (RGWBucketIndexType)it;
+    }
+    string standard_compression_type;
+    if (struct_v >= 6) {
+      decode(standard_compression_type, bl);
+    }
+    if (struct_v >= 7) {
+      decode(storage_classes, bl);
+    } else {
+      storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &standard_data_pool,
+                                        (!standard_compression_type.empty() ?
&standard_compression_type : nullptr));
+    }
+    DECODE_FINISH(bl);
+  }
+  // Falls back to the STANDARD class data pool when no extra pool is set.
+  const rgw_pool& get_data_extra_pool() const {
+    static rgw_pool no_pool;
+    if (data_extra_pool.empty()) {
+      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
+    }
+    return data_extra_pool;
+  }
+  // Unknown storage classes resolve to the STANDARD class data pool; an
+  // unset pool yields an empty rgw_pool.
+  const rgw_pool& get_data_pool(const string& sc) const {
+    const RGWZoneStorageClass *storage_class;
+    static rgw_pool no_pool;
+
+    if (!storage_classes.find(sc, &storage_class)) {
+      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
+    }
+
+    return storage_class->data_pool.get_value_or(no_pool);
+  }
+  const rgw_pool& get_standard_data_pool() const {
+    return get_data_pool(RGW_STORAGE_CLASS_STANDARD);
+  }
+
+  // Returns an empty string for an unknown class or an unset compression type.
+  const string& get_compression_type(const string& sc) const {
+    const RGWZoneStorageClass *storage_class;
+    static string no_compression;
+
+    if (!storage_classes.find(sc, &storage_class)) {
+      return no_compression;
+    }
+    return storage_class->compression_type.get_value_or(no_compression);
+  }
+
+  bool storage_class_exists(const string& sc) const {
+    return storage_classes.exists(sc);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+};
+WRITE_CLASS_ENCODER(RGWZonePlacementInfo)
+
+// Per-zone parameters: the pools used by the zone, the system key, the
+// placement targets and the owning realm id.
+struct RGWZoneParams : RGWSystemMetaObj {
+  rgw_pool domain_root;
+  rgw_pool metadata_heap;
+  rgw_pool control_pool;
+  rgw_pool gc_pool;
+  rgw_pool lc_pool;
+  rgw_pool log_pool;
+  rgw_pool intent_log_pool;
+  rgw_pool usage_log_pool;
+
+  rgw_pool user_keys_pool;
+  rgw_pool user_email_pool;
+  rgw_pool user_swift_pool;
+  rgw_pool user_uid_pool;
+  rgw_pool roles_pool;
+  rgw_pool reshard_pool;
+  rgw_pool otp_pool;
+
+  RGWAccessKey system_key;
+
+  // keyed by placement id (see get_placement / valid_placement)
+  map<std::string, RGWZonePlacementInfo> placement_pools;
+
+  std::string realm_id;
+
+  JSONFormattable tier_config;
+
+  RGWZoneParams() : RGWSystemMetaObj() {}
+  explicit RGWZoneParams(const std::string& name) : RGWSystemMetaObj(name){}
+  RGWZoneParams(const std::string& id, const std::string& name) : RGWSystemMetaObj(id, name) {}
+  RGWZoneParams(const
std::string& id, const std::string& name, const std::string& _realm_id)
+    : RGWSystemMetaObj(id, name), realm_id(_realm_id) {}
+
+  rgw_pool get_pool(CephContext *cct) const override;
+  const std::string get_default_oid(bool old_format = false) const override;
+  const std::string& get_names_oid_prefix() const override;
+  const std::string& get_info_oid_prefix(bool old_format = false) const override;
+  const std::string& get_predefined_name(CephContext *cct) const override;
+
+  int init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, bool setup_obj = true,
+           bool old_format = false);
+  using RGWSystemMetaObj::init;
+  int read_default_id(std::string& default_id, bool old_format = false) override;
+  int set_as_default(bool exclusive = false) override;
+  int create_default(bool old_format = false);
+  int create(bool exclusive = true) override;
+  int fix_pool_names();
+
+  const string& get_compression_type(const rgw_placement_rule& placement_rule) const;
+
+  // Field order is the wire format and must stay in step with decode().
+  void encode(bufferlist& bl) const override {
+    ENCODE_START(12, 1, bl);
+    encode(domain_root, bl);
+    encode(control_pool, bl);
+    encode(gc_pool, bl);
+    encode(log_pool, bl);
+    encode(intent_log_pool, bl);
+    encode(usage_log_pool, bl);
+    encode(user_keys_pool, bl);
+    encode(user_email_pool, bl);
+    encode(user_swift_pool, bl);
+    encode(user_uid_pool, bl);
+    RGWSystemMetaObj::encode(bl);
+    encode(system_key, bl);
+    encode(placement_pools, bl);
+    encode(metadata_heap, bl);
+    encode(realm_id, bl);
+    encode(lc_pool, bl);
+    // an empty legacy tier-config map is still written to keep its slot;
+    // the real configuration follows as tier_config
+    map<std::string, std::string, ltstr_nocase> old_tier_config;
+    encode(old_tier_config, bl);
+    encode(roles_pool, bl);
+    encode(reshard_pool, bl);
+    encode(otp_pool, bl);
+    encode(tier_config, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) override {
+    DECODE_START(12, bl);
+    decode(domain_root, bl);
+    decode(control_pool, bl);
+    decode(gc_pool, bl);
+    decode(log_pool, bl);
+    decode(intent_log_pool, bl);
+    decode(usage_log_pool, bl);
+    decode(user_keys_pool, bl);
+    decode(user_email_pool, bl);
+    decode(user_swift_pool, bl);
+    decode(user_uid_pool, bl);
+    // v2..v5 stored only the name, which then doubles as the id
+    if (struct_v >= 6) {
+      RGWSystemMetaObj::decode(bl);
+    } else if (struct_v >= 2) {
+      decode(name, bl);
+      id = name;
+    }
+    if (struct_v >= 3)
+      decode(system_key, bl);
+    if (struct_v >= 4)
+      decode(placement_pools, bl);
+    if (struct_v >= 5)
+      decode(metadata_heap, bl);
+    if (struct_v >= 6) {
+      decode(realm_id, bl);
+    }
+    // pools missing from older encodings get derived defaults
+    if (struct_v >= 7) {
+      decode(lc_pool, bl);
+    } else {
+      lc_pool = log_pool.name + ":lc";
+    }
+    map<std::string, std::string, ltstr_nocase> old_tier_config;
+    if (struct_v >= 8) {
+      decode(old_tier_config, bl);
+    }
+    if (struct_v >= 9) {
+      decode(roles_pool, bl);
+    } else {
+      roles_pool = name + ".rgw.meta:roles";
+    }
+    if (struct_v >= 10) {
+      decode(reshard_pool, bl);
+    } else {
+      reshard_pool = log_pool.name + ":reshard";
+    }
+    if (struct_v >= 11) {
+      ::decode(otp_pool, bl);
+    } else {
+      otp_pool = name + ".rgw.otp";
+    }
+    if (struct_v >= 12) {
+      ::decode(tier_config, bl);
+    } else {
+      // before v12 the tier config only exists as the legacy key/value map
+      for (auto& kv : old_tier_config) {
+        tier_config.set(kv.first, kv.second);
+      }
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(list<RGWZoneParams*>& o);
+
+  // Copies the placement info into *placement; false if the id is unknown.
+  bool get_placement(const std::string& placement_id, RGWZonePlacementInfo *placement) const {
+    auto iter = placement_pools.find(placement_id);
+    if (iter == placement_pools.end()) {
+      return false;
+    }
+    *placement = iter->second;
+    return true;
+  }
+
+  /*
+   * return data pool of the head object
+   */
+  bool get_head_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool) const {
+    // an explicit placement on the bucket takes precedence over the rule
+    const rgw_data_placement_target& explicit_placement = obj.bucket.explicit_placement;
+    if (!explicit_placement.data_pool.empty()) {
+      if (!obj.in_extra_data) {
+        *pool = explicit_placement.data_pool;
+      } else {
+        *pool = explicit_placement.get_data_extra_pool();
+      }
+      return true;
+    }
+    if (placement_rule.empty()) {
+      return false;
+    }
+    auto iter = placement_pools.find(placement_rule.name);
+    if (iter ==
placement_pools.end()) {
+      return false;
+    }
+    if (!obj.in_extra_data) {
+      *pool = iter->second.get_data_pool(placement_rule.storage_class);
+    } else {
+      *pool = iter->second.get_data_extra_pool();
+    }
+    return true;
+  }
+
+  // True when the rule names a known placement target and a storage class
+  // that exists in it.
+  bool valid_placement(const rgw_placement_rule& rule) const {
+    auto iter = placement_pools.find(rule.name);
+    if (iter == placement_pools.end()) {
+      return false;
+    }
+    return iter->second.storage_class_exists(rule.storage_class);
+  }
+};
+WRITE_CLASS_ENCODER(RGWZoneParams)
+
+// A zone entry: identity, endpoints, logging and read-only flags, tier type
+// and sync sources.
+struct RGWZone {
+  std::string id;
+  std::string name;
+  list<std::string> endpoints;
+  bool log_meta;
+  bool log_data;
+  bool read_only;
+  std::string tier_type;
+
+  std::string redirect_zone;
+
+/**
+ * Represents the number of shards for the bucket index object, a value of zero
+ * indicates there is no sharding. By default (no sharding, the name of the object
+ * is '.dir.{marker}', with sharding, the name is '.dir.{marker}.{sharding_id}',
+ * sharding_id is zero-based value. It is not recommended to set a too large value
+ * (e.g. thousand) as it increases the cost for bucket listing.
+ */
+  uint32_t bucket_index_max_shards;
+
+  bool sync_from_all;
+  set<std::string> sync_from; /* list of zones to sync from */
+
+  RGWZone() : log_meta(false), log_data(false), read_only(false), bucket_index_max_shards(0),
+              sync_from_all(true) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(7, 1, bl);
+    encode(name, bl);
+    encode(endpoints, bl);
+    encode(log_meta, bl);
+    encode(log_data, bl);
+    encode(bucket_index_max_shards, bl);
+    encode(id, bl);
+    encode(read_only, bl);
+    encode(tier_type, bl);
+    encode(sync_from_all, bl);
+    encode(sync_from, bl);
+    encode(redirect_zone, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Fields are version-gated; before v4 there is no separate id, so the
+  // name is used as the id.
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(7, bl);
+    decode(name, bl);
+    if (struct_v < 4) {
+      id = name;
+    }
+    decode(endpoints, bl);
+    if (struct_v >= 2) {
+      decode(log_meta, bl);
+      decode(log_data, bl);
+    }
+    if (struct_v >= 3) {
+      decode(bucket_index_max_shards, bl);
+    }
+    if (struct_v >= 4) {
+      decode(id, bl);
+      decode(read_only, bl);
+    }
+    if (struct_v >= 5) {
+      decode(tier_type, bl);
+    }
+    if (struct_v >= 6) {
+      decode(sync_from_all, bl);
+      decode(sync_from, bl);
+    }
+    if (struct_v >= 7) {
+      decode(redirect_zone, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(list<RGWZone*>& o);
+
+  bool is_read_only() const { return read_only; }
+
+  // sync_from_all overrides the explicit sync_from set
+  bool syncs_from(const std::string& zone_name) const {
+    return (sync_from_all || sync_from.find(zone_name) != sync_from.end());
+  }
+};
+WRITE_CLASS_ENCODER(RGWZone)
+
+struct RGWDefaultZoneGroupInfo {
+  std::string default_zonegroup;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(default_zonegroup, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(default_zonegroup, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  //todo: implement ceph-dencoder
+};
+WRITE_CLASS_ENCODER(RGWDefaultZoneGroupInfo)
+
+// A placement target offered by a zonegroup, optionally restricted by tags.
+struct RGWZoneGroupPlacementTarget {
+  std::string name;
+  set<std::string> tags;
+  set<std::string> storage_classes;
+
+  // A target without tags is open to everyone; otherwise at least one of
+  // the user's tags must match.
+  bool user_permitted(const list<std::string>& user_tags) const {
+    if (tags.empty()) {
+      return true;
+    }
+    for (auto& rule : user_tags) {
+      if (tags.find(rule) != tags.end()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(name, bl);
+    encode(tags, bl);
+    encode(storage_classes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(name, bl);
+    decode(tags, bl);
+    if (struct_v >= 2) {
+      decode(storage_classes, bl);
+    }
+    // guarantee that STANDARD is present when nothing was decoded
+    if (storage_classes.empty()) {
+      storage_classes.insert(RGW_STORAGE_CLASS_STANDARD);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTarget)
+
+struct RGWZoneGroup : public RGWSystemMetaObj {
+  std::string api_name;
+  list<std::string> endpoints;
+  bool is_master = false;
+
+  std::string master_zone;
+  map<std::string, RGWZone> zones;
+
+  map<std::string, RGWZoneGroupPlacementTarget> placement_targets;
+  rgw_placement_rule default_placement;
+
+  list<std::string> hostnames;
+  list<std::string> hostnames_s3website;
+  // TODO: Maybe convert hostnames to a map<std::string,list<std::string>> for
+  // endpoint_type->hostnames
+/*
+20:05 < _robbat21irssi> maybe I do someting like: if (hostname_map.empty()) { populate all map keys from hostnames; };
+20:05 < _robbat21irssi> but that's a later compatability migration planning bit
+20:06 < yehudasa> more like if (!hostnames.empty()) {
+20:06 < yehudasa> for (list<string>::iterator iter = hostnames.begin(); iter != hostnames.end(); ++iter) {
+20:06 < yehudasa> hostname_map["s3"].append(iter->second);
+20:07 < yehudasa> hostname_map["s3website"].append(iter->second);
+20:07 < yehudasa> s/append/push_back/g
+20:08 < _robbat21irssi> inner loop over APIs
+20:08 < yehudasa> yeah, probably
+20:08 < _robbat21irssi> s3, s3website, swift, swith_auth, swift_website
+*/
+  // NOTE(review): neither api_* map is encoded, decoded or otherwise used in
+  // the code visible here, so their element types cannot be cross-checked
+  map<std::string, list<std::string> > api_hostname_map;
+  map<std::string, list<std::string> >
api_endpoints_map;
+
+  std::string realm_id;
+
+  RGWZoneGroup(): is_master(false){}
+  RGWZoneGroup(const std::string &id, const std::string &name):RGWSystemMetaObj(id, name) {}
+  explicit RGWZoneGroup(const std::string &_name):RGWSystemMetaObj(_name) {}
+  RGWZoneGroup(const std::string &_name, bool _is_master, CephContext *cct, RGWSI_SysObj* sysobj_svc,
+               const std::string& _realm_id, const list<std::string>& _endpoints)
+    : RGWSystemMetaObj(_name, cct , sysobj_svc), endpoints(_endpoints), is_master(_is_master),
+      realm_id(_realm_id) {}
+
+  bool is_master_zonegroup() const { return is_master;}
+  // changing the master flag also re-runs post_process_params()
+  void update_master(bool _is_master) {
+    is_master = _is_master;
+    post_process_params();
+  }
+  void post_process_params();
+
+  // name is written first (legacy layout) and again, together with the id,
+  // by the base-class encode further down.
+  void encode(bufferlist& bl) const override {
+    ENCODE_START(4, 1, bl);
+    encode(name, bl);
+    encode(api_name, bl);
+    encode(is_master, bl);
+    encode(endpoints, bl);
+    encode(master_zone, bl);
+    encode(zones, bl);
+    encode(placement_targets, bl);
+    encode(default_placement, bl);
+    encode(hostnames, bl);
+    encode(hostnames_s3website, bl);
+    RGWSystemMetaObj::encode(bl);
+    encode(realm_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // hostnames >= 2, hostnames_s3website >= 3, id and realm_id >= 4; before
+  // v4 the name is used as the id.
+  void decode(bufferlist::const_iterator& bl) override {
+    DECODE_START(4, bl);
+    decode(name, bl);
+    decode(api_name, bl);
+    decode(is_master, bl);
+    decode(endpoints, bl);
+    decode(master_zone, bl);
+    decode(zones, bl);
+    decode(placement_targets, bl);
+    decode(default_placement, bl);
+    if (struct_v >= 2) {
+      decode(hostnames, bl);
+    }
+    if (struct_v >= 3) {
+      decode(hostnames_s3website, bl);
+    }
+    if (struct_v >= 4) {
+      RGWSystemMetaObj::decode(bl);
+      decode(realm_id, bl);
+    } else {
+      id = name;
+    }
+    DECODE_FINISH(bl);
+  }
+
+  int read_default_id(std::string& default_id, bool old_format = false) override;
+  int set_as_default(bool exclusive = false) override;
+  int create_default(bool old_format = false);
+  int equals(const std::string& other_zonegroup) const;
+  int add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
+
const list& endpoints, const std::string *ptier_type, + bool *psync_from_all, list& sync_from, list& sync_from_rm, + std::string *predirect_zone, RGWSyncModulesManager *sync_mgr); + int remove_zone(const std::string& zone_id); + int rename_zone(const RGWZoneParams& zone_params); + rgw_pool get_pool(CephContext *cct) const override; + const std::string get_default_oid(bool old_region_format = false) const override; + const std::string& get_info_oid_prefix(bool old_region_format = false) const override; + const std::string& get_names_oid_prefix() const override; + const std::string& get_predefined_name(CephContext *cct) const override; + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(RGWZoneGroup) + +struct RGWPeriodMap +{ + std::string id; + map zonegroups; + map zonegroups_by_api; + map short_zone_ids; + + std::string master_zonegroup; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + + int update(const RGWZoneGroup& zonegroup, CephContext *cct); + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + void reset() { + zonegroups.clear(); + zonegroups_by_api.clear(); + master_zonegroup.clear(); + } + + uint32_t get_zone_short_id(const std::string& zone_id) const; +}; +WRITE_CLASS_ENCODER(RGWPeriodMap) + +struct RGWPeriodConfig +{ + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(bucket_quota, bl); + encode(user_quota, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(bucket_quota, bl); + decode(user_quota, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + + // the period config must be stored in a local object outside of the period, + // so that it can be used in a default configuration where no realm/period + // exists + int 
read(RGWSI_SysObj *sysobj_svc, const std::string& realm_id); + int write(RGWSI_SysObj *sysobj_svc, const std::string& realm_id); + + static std::string get_oid(const std::string& realm_id); + static rgw_pool get_pool(CephContext *cct); +}; +WRITE_CLASS_ENCODER(RGWPeriodConfig) + +/* for backward comaptability */ +struct RGWRegionMap { + + map regions; + + std::string master_region; + + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWRegionMap) + +struct RGWZoneGroupMap { + + map zonegroups; + map zonegroups_by_api; + + std::string master_zonegroup; + + RGWQuotaInfo bucket_quota; + RGWQuotaInfo user_quota; + + /* construct the map */ + int read(CephContext *cct, RGWSI_SysObj *sysobj_svc); + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWZoneGroupMap) + +class RGWRealm; +class RGWPeriod; + +class RGWRealm : public RGWSystemMetaObj +{ + std::string current_period; + epoch_t epoch{0}; //< realm epoch, incremented for each new period + + int create_control(bool exclusive); + int delete_control(); +public: + RGWRealm() {} + RGWRealm(const std::string& _id, const std::string& _name = "") : RGWSystemMetaObj(_id, _name) {} + RGWRealm(CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_cct, _sysobj_svc) {} + RGWRealm(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_name, _cct, _sysobj_svc){} + + void encode(bufferlist& bl) const override { + ENCODE_START(1, 1, bl); + RGWSystemMetaObj::encode(bl); + encode(current_period, bl); + encode(epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) override { + DECODE_START(1, bl); + RGWSystemMetaObj::decode(bl); + 
decode(current_period, bl); + decode(epoch, bl); + DECODE_FINISH(bl); + } + + int create(bool exclusive = true) override; + int delete_obj(); + rgw_pool get_pool(CephContext *cct) const override; + const std::string get_default_oid(bool old_format = false) const override; + const std::string& get_names_oid_prefix() const override; + const std::string& get_info_oid_prefix(bool old_format = false) const override; + const std::string& get_predefined_name(CephContext *cct) const override; + + using RGWSystemMetaObj::read_id; // expose as public for radosgw-admin + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(list& o); + + const std::string& get_current_period() const { + return current_period; + } + int set_current_period(RGWPeriod& period); + void clear_current_period_and_epoch() { + current_period.clear(); + epoch = 0; + } + epoch_t get_epoch() const { return epoch; } + + std::string get_control_oid() const; + /// send a notify on the realm control object + int notify_zone(bufferlist& bl); + /// notify the zone of a new period + int notify_new_period(const RGWPeriod& period); +}; +WRITE_CLASS_ENCODER(RGWRealm) + +struct RGWPeriodLatestEpochInfo { + epoch_t epoch; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(epoch, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(epoch, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo) + +class RGWPeriod +{ + std::string id; + epoch_t epoch{0}; + std::string predecessor_uuid; + std::vector sync_status; + RGWPeriodMap period_map; + RGWPeriodConfig period_config; + std::string master_zonegroup; + std::string master_zone; + + std::string realm_id; + std::string realm_name; + epoch_t realm_epoch{1}; //< realm epoch when period was made current + + CephContext *cct{nullptr}; + RGWSI_SysObj 
*sysobj_svc{nullptr}; + + int read_info(); + int read_latest_epoch(RGWPeriodLatestEpochInfo& epoch_info, + RGWObjVersionTracker *objv = nullptr); + int use_latest_epoch(); + int use_current_period(); + + const std::string get_period_oid() const; + const std::string get_period_oid_prefix() const; + + // gather the metadata sync status for each shard; only for use on master zone + int update_sync_status(RGWRados *store, + const RGWPeriod ¤t_period, + std::ostream& error_stream, bool force_if_stale); + +public: + RGWPeriod() {} + + RGWPeriod(const std::string& period_id, epoch_t _epoch = 0) + : id(period_id), epoch(_epoch) {} + + const std::string& get_id() const { return id; } + epoch_t get_epoch() const { return epoch; } + epoch_t get_realm_epoch() const { return realm_epoch; } + const std::string& get_predecessor() const { return predecessor_uuid; } + const std::string& get_master_zone() const { return master_zone; } + const std::string& get_master_zonegroup() const { return master_zonegroup; } + const std::string& get_realm() const { return realm_id; } + const RGWPeriodMap& get_map() const { return period_map; } + RGWPeriodConfig& get_config() { return period_config; } + const RGWPeriodConfig& get_config() const { return period_config; } + const std::vector& get_sync_status() const { return sync_status; } + rgw_pool get_pool(CephContext *cct) const; + const std::string& get_latest_epoch_oid() const; + const std::string& get_info_oid_prefix() const; + + void set_user_quota(RGWQuotaInfo& user_quota) { + period_config.user_quota = user_quota; + } + + void set_bucket_quota(RGWQuotaInfo& bucket_quota) { + period_config.bucket_quota = bucket_quota; + } + + void set_id(const std::string& id) { + this->id = id; + period_map.id = id; + } + void set_epoch(epoch_t epoch) { this->epoch = epoch; } + void set_realm_epoch(epoch_t epoch) { realm_epoch = epoch; } + + void set_predecessor(const std::string& predecessor) + { + predecessor_uuid = predecessor; + } + + void 
set_realm_id(const std::string& _realm_id) { + realm_id = _realm_id; + } + + int reflect(); + + int get_zonegroup(RGWZoneGroup& zonegroup, + const std::string& zonegroup_id) const; + + bool is_single_zonegroup() const + { + return (period_map.zonegroups.size() <= 1); + } + + /* + returns true if there are several zone groups with a least one zone + */ + bool is_multi_zonegroups_with_zones() const + { + int count = 0; + for (const auto& zg: period_map.zonegroups) { + if (zg.second.zones.size() > 0) { + if (count++ > 0) { + return true; + } + } + } + return false; + } + + int get_latest_epoch(epoch_t& epoch); + int set_latest_epoch(epoch_t epoch, bool exclusive = false, + RGWObjVersionTracker *objv = nullptr); + // update latest_epoch if the given epoch is higher, else return -EEXIST + int update_latest_epoch(epoch_t epoch); + + int init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const std::string &period_realm_id, const std::string &period_realm_name = "", + bool setup_obj = true); + int init(CephContext *_cct, RGWSI_SysObj *_sysobj_svc, bool setup_obj = true); + + int create(bool exclusive = true); + int delete_obj(); + int store_info(bool exclusive); + int add_zonegroup(const RGWZoneGroup& zonegroup); + + void fork(); + int update(); + + // commit a staging period; only for use on master zone + int commit(RGWRados *store, + RGWRealm& realm, const RGWPeriod ¤t_period, + std::ostream& error_stream, bool force_if_stale = false); + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(id, bl); + encode(epoch, bl); + encode(realm_epoch, bl); + encode(predecessor_uuid, bl); + encode(sync_status, bl); + encode(period_map, bl); + encode(master_zone, bl); + encode(master_zonegroup, bl); + encode(period_config, bl); + encode(realm_id, bl); + encode(realm_name, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(id, bl); + decode(epoch, bl); + decode(realm_epoch, bl); + decode(predecessor_uuid, 
bl); + decode(sync_status, bl); + decode(period_map, bl); + decode(master_zone, bl); + decode(master_zonegroup, bl); + decode(period_config, bl); + decode(realm_id, bl); + decode(realm_name, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(list& o); + + static std::string get_staging_id(const std::string& realm_id) { + return realm_id + ":staging"; + } +}; +WRITE_CLASS_ENCODER(RGWPeriod) + +#endif diff --git a/src/rgw/services/svc_finisher.cc b/src/rgw/services/svc_finisher.cc new file mode 100644 index 00000000..d239ff3c --- /dev/null +++ b/src/rgw/services/svc_finisher.cc @@ -0,0 +1,53 @@ +#include "common/Finisher.h" + +#include "svc_finisher.h" + +int RGWSI_Finisher::do_start() +{ + finisher = new Finisher(cct); + finisher->start(); + + return 0; +} + +void RGWSI_Finisher::shutdown() +{ + if (finalized) { + return; + } + + if (finisher) { + finisher->stop(); + + map cbs; + cbs.swap(shutdown_cbs); /* move cbs out, in case caller unregisetrs */ + for (auto& iter : cbs) { + iter.second->call(); + } + delete finisher; + } + + finalized = true; +} + +RGWSI_Finisher::~RGWSI_Finisher() +{ + shutdown(); +} + +void RGWSI_Finisher::register_caller(ShutdownCB *cb, int *phandle) +{ + *phandle = ++handles_counter; + shutdown_cbs[*phandle] = cb; +} + +void RGWSI_Finisher::unregister_caller(int handle) +{ + shutdown_cbs.erase(handle); +} + +void RGWSI_Finisher::schedule_context(Context *c) +{ + finisher->queue(c); +} + diff --git a/src/rgw/services/svc_finisher.h b/src/rgw/services/svc_finisher.h new file mode 100644 index 00000000..116fd8fd --- /dev/null +++ b/src/rgw/services/svc_finisher.h @@ -0,0 +1,45 @@ +#ifndef CEPH_RGW_SERVICES_FINISHER_H +#define CEPH_RGW_SERVICES_FINISHER_H + + +#include "rgw/rgw_service.h" + +class Context; +class Finisher; + +class RGWSI_Finisher : public RGWServiceInstance +{ + friend struct RGWServices_Def; +public: + class ShutdownCB; + +private: + Finisher 
*finisher{nullptr}; + bool finalized{false}; + + void shutdown() override; + + std::map shutdown_cbs; + std::atomic handles_counter{0}; + +protected: + void init() {} + int do_start() override; + +public: + RGWSI_Finisher(CephContext *cct): RGWServiceInstance(cct) {} + ~RGWSI_Finisher(); + + class ShutdownCB { + public: + virtual ~ShutdownCB() {} + virtual void call() = 0; + }; + + void register_caller(ShutdownCB *cb, int *phandle); + void unregister_caller(int handle); + + void schedule_context(Context *c); +}; + +#endif diff --git a/src/rgw/services/svc_notify.cc b/src/rgw/services/svc_notify.cc new file mode 100644 index 00000000..9ee7f295 --- /dev/null +++ b/src/rgw/services/svc_notify.cc @@ -0,0 +1,484 @@ +#include "include/random.h" +#include "common/errno.h" + +#include "svc_notify.h" +#include "svc_finisher.h" +#include "svc_zone.h" +#include "svc_rados.h" + +#include "rgw/rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +static string notify_oid_prefix = "notify"; + +class RGWWatcher : public librados::WatchCtx2 { + CephContext *cct; + RGWSI_Notify *svc; + int index; + RGWSI_RADOS::Obj obj; + uint64_t watch_handle; + int register_ret{0}; + librados::AioCompletion *register_completion{nullptr}; + + class C_ReinitWatch : public Context { + RGWWatcher *watcher; + public: + explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {} + void finish(int r) override { + watcher->reinit(); + } + }; +public: + RGWWatcher(CephContext *_cct, RGWSI_Notify *s, int i, RGWSI_RADOS::Obj& o) : cct(_cct), svc(s), index(i), obj(o), watch_handle(0) {} + void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) override { + ldout(cct, 10) << "RGWWatcher::handle_notify() " + << " notify_id " << notify_id + << " cookie " << cookie + << " notifier " << notifier_id + << " bl.length()=" << bl.length() << dendl; + + if (unlikely(svc->inject_notify_timeout_probability == 1) || + (svc->inject_notify_timeout_probability > 0 && + 
(svc->inject_notify_timeout_probability > + ceph::util::generate_random_number(0.0, 1.0)))) { + ldout(cct, 0) + << "RGWWatcher::handle_notify() dropping notification! " + << "If this isn't what you want, set " + << "rgw_inject_notify_timeout_probability to zero!" << dendl; + return; + } + + svc->watch_cb(notify_id, cookie, notifier_id, bl); + + bufferlist reply_bl; // empty reply payload + obj.notify_ack(notify_id, cookie, reply_bl); + } + void handle_error(uint64_t cookie, int err) override { + lderr(cct) << "RGWWatcher::handle_error cookie " << cookie + << " err " << cpp_strerror(err) << dendl; + svc->remove_watcher(index); + svc->schedule_context(new C_ReinitWatch(this)); + } + + void reinit() { + int ret = unregister_watch(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl; + return; + } + ret = register_watch(); + if (ret < 0) { + ldout(cct, 0) << "ERROR: register_watch() returned ret=" << ret << dendl; + return; + } + } + + int unregister_watch() { + int r = svc->unwatch(obj, watch_handle); + if (r < 0) { + return r; + } + svc->remove_watcher(index); + return 0; + } + + int register_watch_async() { + if (register_completion) { + register_completion->release(); + register_completion = nullptr; + } + register_completion = librados::Rados::aio_create_completion(nullptr, nullptr, nullptr); + register_ret = obj.aio_watch(register_completion, &watch_handle, this); + if (register_ret < 0) { + register_completion->release(); + return register_ret; + } + return 0; + } + + int register_watch_finish() { + if (register_ret < 0) { + return register_ret; + } + if (!register_completion) { + return -EINVAL; + } + register_completion->wait_for_safe(); + int r = register_completion->get_return_value(); + register_completion->release(); + register_completion = nullptr; + if (r < 0) { + return r; + } + svc->add_watcher(index); + return 0; + } + + int register_watch() { + int r = obj.watch(&watch_handle, this); + if (r < 0) { + return 
r; + } + svc->add_watcher(index); + return 0; + } +}; + + +class RGWSI_Notify_ShutdownCB : public RGWSI_Finisher::ShutdownCB +{ + RGWSI_Notify *svc; +public: + RGWSI_Notify_ShutdownCB(RGWSI_Notify *_svc) : svc(_svc) {} + void call() override { + svc->shutdown(); + } +}; + +string RGWSI_Notify::get_control_oid(int i) +{ + char buf[notify_oid_prefix.size() + 16]; + snprintf(buf, sizeof(buf), "%s.%d", notify_oid_prefix.c_str(), i); + + return string(buf); +} + +// do not call pick_obj_control before init_watch +RGWSI_RADOS::Obj RGWSI_Notify::pick_control_obj(const string& key) +{ + uint32_t r = ceph_str_hash_linux(key.c_str(), key.size()); + + int i = r % num_watchers; + return notify_objs[i]; +} + +int RGWSI_Notify::init_watch() +{ + num_watchers = cct->_conf->rgw_num_control_oids; + + bool compat_oid = (num_watchers == 0); + + if (num_watchers <= 0) + num_watchers = 1; + + watchers = new RGWWatcher *[num_watchers]; + + int error = 0; + + notify_objs.resize(num_watchers); + + for (int i=0; i < num_watchers; i++) { + string notify_oid; + + if (!compat_oid) { + notify_oid = get_control_oid(i); + } else { + notify_oid = notify_oid_prefix; + } + + notify_objs[i] = rados_svc->handle().obj({control_pool, notify_oid}); + auto& notify_obj = notify_objs[i]; + + int r = notify_obj.open(); + if (r < 0) { + ldout(cct, 0) << "ERROR: notify_obj.open() returned r=" << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + op.create(false); + r = notify_obj.operate(&op, null_yield); + if (r < 0 && r != -EEXIST) { + ldout(cct, 0) << "ERROR: notify_obj.operate() returned r=" << r << dendl; + return r; + } + + RGWWatcher *watcher = new RGWWatcher(cct, this, i, notify_obj); + watchers[i] = watcher; + + r = watcher->register_watch_async(); + if (r < 0) { + ldout(cct, 0) << "WARNING: register_watch_aio() returned " << r << dendl; + error = r; + continue; + } + } + + for (int i = 0; i < num_watchers; ++i) { + int r = watchers[i]->register_watch_finish(); + if (r < 0) { + 
ldout(cct, 0) << "WARNING: async watch returned " << r << dendl; + error = r; + } + } + + if (error < 0) { + return error; + } + + return 0; +} + +void RGWSI_Notify::finalize_watch() +{ + for (int i = 0; i < num_watchers; i++) { + RGWWatcher *watcher = watchers[i]; + watcher->unregister_watch(); + delete watcher; + } + + delete[] watchers; +} + +int RGWSI_Notify::do_start() +{ + int r = zone_svc->start(); + if (r < 0) { + return r; + } + + assert(zone_svc->is_started()); /* otherwise there's an ordering problem */ + + r = rados_svc->start(); + if (r < 0) { + return r; + } + r = finisher_svc->start(); + if (r < 0) { + return r; + } + + control_pool = zone_svc->get_zone_params().control_pool; + + int ret = init_watch(); + if (ret < 0) { + lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl; + return ret; + } + + shutdown_cb = new RGWSI_Notify_ShutdownCB(this); + int handle; + finisher_svc->register_caller(shutdown_cb, &handle); + finisher_handle = handle; + + return 0; +} + +void RGWSI_Notify::shutdown() +{ + if (finalized) { + return; + } + + if (finisher_handle) { + finisher_svc->unregister_caller(*finisher_handle); + } + finalize_watch(); + + delete shutdown_cb; + + finalized = true; +} + +RGWSI_Notify::~RGWSI_Notify() +{ + shutdown(); +} + +int RGWSI_Notify::unwatch(RGWSI_RADOS::Obj& obj, uint64_t watch_handle) +{ + int r = obj.unwatch(watch_handle); + if (r < 0) { + ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl; + return r; + } + r = rados_svc->handle().watch_flush(); + if (r < 0) { + ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl; + return r; + } + return 0; +} + +void RGWSI_Notify::add_watcher(int i) +{ + ldout(cct, 20) << "add_watcher() i=" << i << dendl; + RWLock::WLocker l(watchers_lock); + watchers_set.insert(i); + if (watchers_set.size() == (size_t)num_watchers) { + ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl; + _set_enabled(true); 
+ } +} + +void RGWSI_Notify::remove_watcher(int i) +{ + ldout(cct, 20) << "remove_watcher() i=" << i << dendl; + RWLock::WLocker l(watchers_lock); + size_t orig_size = watchers_set.size(); + watchers_set.erase(i); + if (orig_size == (size_t)num_watchers && + watchers_set.size() < orig_size) { /* actually removed */ + ldout(cct, 2) << "removed watcher, disabling cache" << dendl; + _set_enabled(false); + } +} + +int RGWSI_Notify::watch_cb(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + RWLock::RLocker l(watchers_lock); + if (cb) { + return cb->watch_cb(notify_id, cookie, notifier_id, bl); + } + return 0; +} + +void RGWSI_Notify::set_enabled(bool status) +{ + RWLock::WLocker l(watchers_lock); + _set_enabled(status); +} + +void RGWSI_Notify::_set_enabled(bool status) +{ + enabled = status; + if (cb) { + cb->set_enabled(status); + } +} + +int RGWSI_Notify::distribute(const string& key, bufferlist& bl) +{ + /* The RGW uses the control pool to store the watch notify objects. + The precedence in RGWSI_Notify::do_start is to call to zone_svc->start and later to init_watch(). + The first time, RGW starts in the cluster, the RGW will try to create zone and zonegroup system object. + In that case RGW will try to distribute the cache before it ran init_watch, + which will lead to division by 0 in pick_obj_control (num_watchers is 0). + */ + if (num_watchers > 0) { + RGWSI_RADOS::Obj notify_obj = pick_control_obj(key); + + ldout(cct, 10) << "distributing notification oid=" << notify_obj.get_ref().obj + << " bl.length()=" << bl.length() << dendl; + return robust_notify(notify_obj, bl); + } + return 0; +} + +int RGWSI_Notify::robust_notify(RGWSI_RADOS::Obj& notify_obj, bufferlist& bl) +{ + // The reply of every machine that acks goes in here. + boost::container::flat_set> acks; + bufferlist rbl; + + // First, try to send, without being fancy about it. + auto r = notify_obj.notify(bl, 0, &rbl); + + // If that doesn't work, get serious. 
+ if (r < 0) { + ldout(cct, 1) << "robust_notify: If at first you don't succeed: " + << cpp_strerror(-r) << dendl; + + + auto p = rbl.cbegin(); + // Gather up the replies to the first attempt. + try { + uint32_t num_acks; + decode(num_acks, p); + // Doing this ourselves since we don't care about the payload; + for (auto i = 0u; i < num_acks; ++i) { + std::pair id; + decode(id, p); + acks.insert(id); + ldout(cct, 20) << "robust_notify: acked by " << id << dendl; + uint32_t blen; + decode(blen, p); + p.advance(blen); + } + } catch (const buffer::error& e) { + ldout(cct, 0) << "robust_notify: notify response parse failed: " + << e.what() << dendl; + acks.clear(); // Throw away junk on failed parse. + } + + + // Every machine that fails to reply and hasn't acked a previous + // attempt goes in here. + boost::container::flat_set> timeouts; + + auto tries = 1u; + while (r < 0 && tries < max_notify_retries) { + ++tries; + rbl.clear(); + // Reset the timeouts, we're only concerned with new ones. + timeouts.clear(); + r = notify_obj.notify(bl, 0, &rbl); + if (r < 0) { + ldout(cct, 1) << "robust_notify: retry " << tries << " failed: " + << cpp_strerror(-r) << dendl; + p = rbl.begin(); + try { + uint32_t num_acks; + decode(num_acks, p); + // Not only do we not care about the payload, but we don't + // want to empty the container; we just want to augment it + // with any new members. + for (auto i = 0u; i < num_acks; ++i) { + std::pair id; + decode(id, p); + auto ir = acks.insert(id); + if (ir.second) { + ldout(cct, 20) << "robust_notify: acked by " << id << dendl; + } + uint32_t blen; + decode(blen, p); + p.advance(blen); + } + + uint32_t num_timeouts; + decode(num_timeouts, p); + for (auto i = 0u; i < num_timeouts; ++i) { + std::pair id; + decode(id, p); + // Only track timeouts from hosts that haven't acked previously. + if (acks.find(id) != acks.cend()) { + ldout(cct, 20) << "robust_notify: " << id << " timed out." 
+ << dendl; + timeouts.insert(id); + } + } + } catch (const buffer::error& e) { + ldout(cct, 0) << "robust_notify: notify response parse failed: " + << e.what() << dendl; + continue; + } + // If we got a good parse and timeouts is empty, that means + // everyone who timed out in one call received the update in a + // previous one. + if (timeouts.empty()) { + r = 0; + } + } + } + } + return r; +} + +void RGWSI_Notify::register_watch_cb(CB *_cb) +{ + RWLock::WLocker l(watchers_lock); + cb = _cb; + _set_enabled(enabled); +} + +void RGWSI_Notify::schedule_context(Context *c) +{ + finisher_svc->schedule_context(c); +} diff --git a/src/rgw/services/svc_notify.h b/src/rgw/services/svc_notify.h new file mode 100644 index 00000000..cd9d9eb8 --- /dev/null +++ b/src/rgw/services/svc_notify.h @@ -0,0 +1,100 @@ +#ifndef CEPH_RGW_SERVICES_NOTIFY_H +#define CEPH_RGW_SERVICES_NOTIFY_H + + +#include "rgw/rgw_service.h" + +#include "svc_rados.h" + + +class RGWSI_Zone; +class RGWSI_Finisher; + +class RGWWatcher; +class RGWSI_Notify_ShutdownCB; + +class RGWSI_Notify : public RGWServiceInstance +{ + friend class RGWWatcher; + friend class RGWSI_Notify_ShutdownCB; + friend class RGWServices_Def; + +public: + class CB; + +private: + RGWSI_Zone *zone_svc{nullptr}; + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_Finisher *finisher_svc{nullptr}; + + RWLock watchers_lock{"watchers_lock"}; + rgw_pool control_pool; + + int num_watchers{0}; + RGWWatcher **watchers{nullptr}; + std::set watchers_set; + vector notify_objs; + + bool enabled{false}; + + double inject_notify_timeout_probability{0}; + unsigned max_notify_retries{0}; + + string get_control_oid(int i); + RGWSI_RADOS::Obj pick_control_obj(const string& key); + + CB *cb{nullptr}; + + std::optional finisher_handle; + RGWSI_Notify_ShutdownCB *shutdown_cb{nullptr}; + + bool finalized{false}; + + int init_watch(); + void finalize_watch(); + + void init(RGWSI_Zone *_zone_svc, + RGWSI_RADOS *_rados_svc, + RGWSI_Finisher *_finisher_svc) { + zone_svc 
= _zone_svc; + rados_svc = _rados_svc; + finisher_svc = _finisher_svc; + } + int do_start() override; + void shutdown() override; + + int unwatch(RGWSI_RADOS::Obj& obj, uint64_t watch_handle); + void add_watcher(int i); + void remove_watcher(int i); + + int watch_cb(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl); + void _set_enabled(bool status); + void set_enabled(bool status); + + int robust_notify(RGWSI_RADOS::Obj& notify_obj, bufferlist& bl); + + void schedule_context(Context *c); +public: + RGWSI_Notify(CephContext *cct): RGWServiceInstance(cct) {} + ~RGWSI_Notify(); + + class CB { + public: + virtual ~CB() {} + virtual int watch_cb(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + virtual void set_enabled(bool status) = 0; + }; + + int distribute(const string& key, bufferlist& bl); + + void register_watch_cb(CB *cb); +}; + +#endif + diff --git a/src/rgw/services/svc_quota.cc b/src/rgw/services/svc_quota.cc new file mode 100644 index 00000000..f2baac36 --- /dev/null +++ b/src/rgw/services/svc_quota.cc @@ -0,0 +1,15 @@ +#include "svc_quota.h" +#include "svc_zone.h" + +#include "rgw/rgw_zone.h" + +const RGWQuotaInfo& RGWSI_Quota::get_bucket_quota() const +{ + return zone_svc->get_current_period().get_config().bucket_quota; +} + +const RGWQuotaInfo& RGWSI_Quota::get_user_quota() const +{ + return zone_svc->get_current_period().get_config().user_quota; +} + diff --git a/src/rgw/services/svc_quota.h b/src/rgw/services/svc_quota.h new file mode 100644 index 00000000..7dfbf19b --- /dev/null +++ b/src/rgw/services/svc_quota.h @@ -0,0 +1,23 @@ +#ifndef CEPH_RGW_SERVICES_QUOTA_H +#define CEPH_RGW_SERVICES_QUOTA_H + + +#include "rgw/rgw_service.h" + + +class RGWSI_Quota : public RGWServiceInstance +{ + RGWSI_Zone *zone_svc{nullptr}; + +public: + RGWSI_Quota(CephContext *cct): RGWServiceInstance(cct) {} + + void init(RGWSI_Zone *_zone_svc) { + zone_svc = _zone_svc; + } + + const RGWQuotaInfo& 
get_bucket_quota() const; + const RGWQuotaInfo& get_user_quota() const; +}; + +#endif diff --git a/src/rgw/services/svc_rados.cc b/src/rgw/services/svc_rados.cc new file mode 100644 index 00000000..408d25d9 --- /dev/null +++ b/src/rgw/services/svc_rados.cc @@ -0,0 +1,308 @@ +#include "svc_rados.h" + +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "osd/osd_types.h" +#include "rgw/rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw + +int RGWSI_RADOS::do_start() +{ + int ret = rados.init_with_context(cct); + if (ret < 0) { + return ret; + } + ret = rados.connect(); + if (ret < 0) { + return ret; + } + return 0; +} + +librados::Rados* RGWSI_RADOS::get_rados_handle() +{ + return &rados; +} + +uint64_t RGWSI_RADOS::instance_id() +{ + return get_rados_handle()->get_instance_id(); +} + +int RGWSI_RADOS::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx) +{ + constexpr bool create = true; // create the pool if it doesn't exist + return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create); +} + +int RGWSI_RADOS::pool_iterate(librados::IoCtx& io_ctx, + librados::NObjectIterator& iter, + uint32_t num, vector& objs, + RGWAccessListFilter *filter, + bool *is_truncated) +{ + if (iter == io_ctx.nobjects_end()) + return -ENOENT; + + uint32_t i; + + for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) { + rgw_bucket_dir_entry e; + + string oid = iter->get_oid(); + ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + + // fill it in with initial values; we may correct later + if (filter && !filter->filter(oid, oid)) + continue; + + e.key = oid; + objs.push_back(e); + } + + if (is_truncated) + *is_truncated = (iter != io_ctx.nobjects_end()); + + return objs.size(); +} + +void RGWSI_RADOS::Obj::init(const rgw_raw_obj& obj) +{ + ref.obj = obj; +} + +int RGWSI_RADOS::Obj::open() +{ + int r = rados_svc->open_pool_ctx(ref.obj.pool, ref.ioctx); + if (r < 0) { + return r; + } + + ref.ioctx.locator_set_key(ref.obj.loc); + 
+ return 0; +} + +int RGWSI_RADOS::Obj::operate(librados::ObjectWriteOperation *op, + optional_yield y) +{ + return rgw_rados_operate(ref.ioctx, ref.obj.oid, op, y); +} + +int RGWSI_RADOS::Obj::operate(librados::ObjectReadOperation *op, bufferlist *pbl, + optional_yield y) +{ + return rgw_rados_operate(ref.ioctx, ref.obj.oid, op, pbl, y); +} + +int RGWSI_RADOS::Obj::aio_operate(librados::AioCompletion *c, librados::ObjectWriteOperation *op) +{ + return ref.ioctx.aio_operate(ref.obj.oid, c, op); +} + +int RGWSI_RADOS::Obj::aio_operate(librados::AioCompletion *c, librados::ObjectReadOperation *op, + bufferlist *pbl) +{ + return ref.ioctx.aio_operate(ref.obj.oid, c, op, pbl); +} + +int RGWSI_RADOS::Obj::watch(uint64_t *handle, librados::WatchCtx2 *ctx) +{ + return ref.ioctx.watch2(ref.obj.oid, handle, ctx); +} + +int RGWSI_RADOS::Obj::aio_watch(librados::AioCompletion *c, uint64_t *handle, librados::WatchCtx2 *ctx) +{ + return ref.ioctx.aio_watch(ref.obj.oid, c, handle, ctx); +} + +int RGWSI_RADOS::Obj::unwatch(uint64_t handle) +{ + return ref.ioctx.unwatch2(handle); +} + +int RGWSI_RADOS::Obj::notify(bufferlist& bl, + uint64_t timeout_ms, + bufferlist *pbl) +{ + return ref.ioctx.notify2(ref.obj.oid, bl, timeout_ms, pbl); +} + +void RGWSI_RADOS::Obj::notify_ack(uint64_t notify_id, + uint64_t cookie, + bufferlist& bl) +{ + ref.ioctx.notify_ack(ref.obj.oid, notify_id, cookie, bl); +} + +uint64_t RGWSI_RADOS::Obj::get_last_version() +{ + return ref.ioctx.get_last_version(); +} + +int RGWSI_RADOS::Pool::create() +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + int r = rad->pool_create(pool.name.c_str()); + if (r < 0) { + ldout(rados_svc->cct, 0) << "WARNING: pool_create returned " << r << dendl; + return r; + } + librados::IoCtx io_ctx; + r = rad->ioctx_create(pool.name.c_str(), io_ctx); + if (r < 0) { + ldout(rados_svc->cct, 0) << "WARNING: ioctx_create returned " << r << dendl; + return r; + } + r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, 
false); + if (r < 0) { + ldout(rados_svc->cct, 0) << "WARNING: application_enable returned " << r << dendl; + return r; + } + return 0; +} + +int RGWSI_RADOS::Pool::create(const vector& pools, vector *retcodes) +{ + vector completions; + vector rets; + + librados::Rados *rad = rados_svc->get_rados_handle(); + for (auto iter = pools.begin(); iter != pools.end(); ++iter) { + librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion(); + completions.push_back(c); + auto& pool = *iter; + int ret = rad->pool_create_async(pool.name.c_str(), c); + rets.push_back(ret); + } + + vector::iterator riter; + vector::iterator citer; + + bool error = false; + ceph_assert(rets.size() == completions.size()); + for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) { + int r = *riter; + librados::PoolAsyncCompletion *c = *citer; + if (r == 0) { + c->wait(); + r = c->get_return_value(); + if (r < 0) { + ldout(rados_svc->cct, 0) << "WARNING: async pool_create returned " << r << dendl; + error = true; + } + } + c->release(); + retcodes->push_back(r); + } + if (error) { + return 0; + } + + std::vector io_ctxs; + retcodes->clear(); + for (auto pool : pools) { + io_ctxs.emplace_back(); + int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back()); + if (ret < 0) { + ldout(rados_svc->cct, 0) << "WARNING: ioctx_create returned " << ret << dendl; + error = true; + } + retcodes->push_back(ret); + } + if (error) { + return 0; + } + + completions.clear(); + for (auto &io_ctx : io_ctxs) { + librados::PoolAsyncCompletion *c = + librados::Rados::pool_async_create_completion(); + completions.push_back(c); + int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW, + false, c); + ceph_assert(ret == 0); + } + + retcodes->clear(); + for (auto c : completions) { + c->wait(); + int ret = c->get_return_value(); + if (ret == -EOPNOTSUPP) { + ret = 0; + } else if (ret < 0) { + ldout(rados_svc->cct, 0) << "WARNING: async 
application_enable returned " << ret + << dendl; + error = true; + } + c->release(); + retcodes->push_back(ret); + } + return 0; +} + +int RGWSI_RADOS::Pool::lookup() +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + int ret = rad->pool_lookup(pool.name.c_str()); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWSI_RADOS::Pool::List::init(const string& marker, RGWAccessListFilter *filter) +{ + if (ctx.initialized) { + return -EINVAL; + } + + int r = pool.rados_svc->open_pool_ctx(pool.pool, ctx.ioctx); + if (r < 0) { + return r; + } + + librados::ObjectCursor oc; + if (!oc.from_str(marker)) { + ldout(pool.rados_svc->cct, 10) << "failed to parse cursor: " << marker << dendl; + return -EINVAL; + } + + ctx.iter = ctx.ioctx.nobjects_begin(oc); + ctx.filter = filter; + ctx.initialized = true; + + return 0; +} + +int RGWSI_RADOS::Pool::List::get_next(int max, + std::list *oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + return -EINVAL; + } + vector objs; + int r = pool.rados_svc->pool_iterate(ctx.ioctx, ctx.iter, max, objs, ctx.filter, is_truncated); + if (r < 0) { + if(r != -ENOENT) { + ldout(pool.rados_svc->cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl; + } + return r; + } + + vector::iterator iter; + for (auto& o : objs) { + oids->push_back(o.key.name); + } + + return oids->size(); +} + +int RGWSI_RADOS::Handle::watch_flush() +{ + librados::Rados *rad = rados_svc->get_rados_handle(); + return rad->watch_flush(); +} diff --git a/src/rgw/services/svc_rados.h b/src/rgw/services/svc_rados.h new file mode 100644 index 00000000..0453eb0c --- /dev/null +++ b/src/rgw/services/svc_rados.h @@ -0,0 +1,178 @@ +#ifndef CEPH_RGW_SERVICES_RADOS_H +#define CEPH_RGW_SERVICES_RADOS_H + + +#include "rgw/rgw_service.h" + +#include "include/rados/librados.hpp" +#include "common/async/yield_context.h" + +class RGWAccessListFilter { +public: + virtual ~RGWAccessListFilter() {} + virtual bool filter(const string& name, string& 
key) = 0; +}; + +struct RGWAccessListFilterPrefix : public RGWAccessListFilter { + string prefix; + + explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {} + bool filter(const string& name, string& key) override { + return (prefix.compare(key.substr(0, prefix.size())) == 0); + } +}; + +struct rgw_rados_ref { + rgw_raw_obj obj; + librados::IoCtx ioctx; +}; + +class RGWSI_RADOS : public RGWServiceInstance +{ + librados::Rados rados; + + int do_start() override; + + librados::Rados* get_rados_handle(); + int open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx); + int pool_iterate(librados::IoCtx& ioctx, + librados::NObjectIterator& iter, + uint32_t num, vector& objs, + RGWAccessListFilter *filter, + bool *is_truncated); + +public: + RGWSI_RADOS(CephContext *cct) : RGWServiceInstance(cct) {} + + void init() {} + + uint64_t instance_id(); + + class Handle; + + class Obj { + friend class RGWSI_RADOS; + friend Handle; + + RGWSI_RADOS *rados_svc{nullptr}; + rgw_rados_ref ref; + + void init(const rgw_raw_obj& obj); + + Obj(RGWSI_RADOS *_rados_svc, const rgw_raw_obj& _obj) + : rados_svc(_rados_svc) { + init(_obj); + } + + public: + Obj() {} + + int open(); + + int operate(librados::ObjectWriteOperation *op, optional_yield y); + int operate(librados::ObjectReadOperation *op, bufferlist *pbl, + optional_yield y); + int aio_operate(librados::AioCompletion *c, librados::ObjectWriteOperation *op); + int aio_operate(librados::AioCompletion *c, librados::ObjectReadOperation *op, + bufferlist *pbl); + + int watch(uint64_t *handle, librados::WatchCtx2 *ctx); + int aio_watch(librados::AioCompletion *c, uint64_t *handle, librados::WatchCtx2 *ctx); + int unwatch(uint64_t handle); + int notify(bufferlist& bl, + uint64_t timeout_ms, + bufferlist *pbl); + void notify_ack(uint64_t notify_id, + uint64_t cookie, + bufferlist& bl); + + uint64_t get_last_version(); + + rgw_rados_ref& get_ref() { return ref; } + const rgw_rados_ref& get_ref() const { return ref; 
} + }; + + class Pool { + friend class RGWSI_RADOS; + friend Handle; + + RGWSI_RADOS *rados_svc{nullptr}; + rgw_pool pool; + + Pool(RGWSI_RADOS *_rados_svc, + const rgw_pool& _pool) : rados_svc(_rados_svc), + pool(_pool) {} + + Pool(RGWSI_RADOS *_rados_svc) : rados_svc(_rados_svc) {} + public: + Pool() {} + + int create(); + int create(const std::vector& pools, std::vector *retcodes); + int lookup(); + + struct List { + Pool& pool; + + struct Ctx { + bool initialized{false}; + librados::IoCtx ioctx; + librados::NObjectIterator iter; + RGWAccessListFilter *filter{nullptr}; + } ctx; + + List(Pool& _pool) : pool(_pool) {} + + int init(const string& marker, RGWAccessListFilter *filter = nullptr); + int get_next(int max, + std::list *oids, + bool *is_truncated); + }; + + List op() { + return List(*this); + } + + friend List; + }; + + class Handle { + friend class RGWSI_RADOS; + + RGWSI_RADOS *rados_svc{nullptr}; + + Handle(RGWSI_RADOS *_rados_svc) : rados_svc(_rados_svc) {} + public: + Obj obj(const rgw_raw_obj& o) { + return Obj(rados_svc, o); + } + + Pool pool(const rgw_pool& p) { + return Pool(rados_svc, p); + } + + int watch_flush(); + }; + + Handle handle() { + return Handle(this); + } + + Obj obj(const rgw_raw_obj& o) { + return Obj(this, o); + } + + Pool pool() { + return Pool(this); + } + + Pool pool(const rgw_pool& p) { + return Pool(this, p); + } + + friend Obj; + friend Pool; + friend Pool::List; +}; + +#endif diff --git a/src/rgw/services/svc_sync_modules.cc b/src/rgw/services/svc_sync_modules.cc new file mode 100644 index 00000000..ca6a7a30 --- /dev/null +++ b/src/rgw/services/svc_sync_modules.cc @@ -0,0 +1,15 @@ +#include "svc_sync_modules.h" + +#include "rgw/rgw_sync_module.h" + +void RGWSI_SyncModules::init() +{ + sync_modules_manager = new RGWSyncModulesManager(); + rgw_register_sync_modules(sync_modules_manager); +} + +RGWSI_SyncModules::~RGWSI_SyncModules() +{ + delete sync_modules_manager; +} + diff --git a/src/rgw/services/svc_sync_modules.h 
b/src/rgw/services/svc_sync_modules.h new file mode 100644 index 00000000..19c4ec57 --- /dev/null +++ b/src/rgw/services/svc_sync_modules.h @@ -0,0 +1,26 @@ +#ifndef CEPH_RGW_SERVICES_SYNC_MODULES_H +#define CEPH_RGW_SERVICES_SYNC_MODULES_H + + +#include "rgw/rgw_service.h" + + +class RGWSyncModulesManager; + +class RGWSI_SyncModules : public RGWServiceInstance +{ + RGWSyncModulesManager *sync_modules_manager{nullptr}; + +public: + RGWSI_SyncModules(CephContext *cct): RGWServiceInstance(cct) {} + ~RGWSI_SyncModules(); + + RGWSyncModulesManager *get_manager() { + return sync_modules_manager; + } + + void init(); +}; + +#endif + diff --git a/src/rgw/services/svc_sys_obj.cc b/src/rgw/services/svc_sys_obj.cc new file mode 100644 index 00000000..1eda37f8 --- /dev/null +++ b/src/rgw/services/svc_sys_obj.cc @@ -0,0 +1,192 @@ +#include "svc_sys_obj.h" +#include "svc_sys_obj_core.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw/rgw_zone.h" + +#define dout_subsys ceph_subsys_rgw + +RGWSysObjectCtx RGWSI_SysObj::init_obj_ctx() +{ + return RGWSysObjectCtx(this); +} + +RGWSI_SysObj::Obj RGWSI_SysObj::get_obj(RGWSysObjectCtx& obj_ctx, const rgw_raw_obj& obj) +{ + return Obj(core_svc, obj_ctx, obj); +} + +void RGWSI_SysObj::Obj::invalidate() +{ + ctx.invalidate(obj); +} + +int RGWSI_SysObj::Obj::ROp::stat() +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->stat(source.get_ctx(), state, obj, + attrs, raw_attrs, + lastmod, obj_size, + objv_tracker); +} + +int RGWSI_SysObj::Obj::ROp::read(int64_t ofs, int64_t end, bufferlist *bl) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->read(source.get_ctx(), state, + objv_tracker, + obj, bl, ofs, end, + attrs, + raw_attrs, + cache_info, + refresh_version); +} + +int RGWSI_SysObj::Obj::ROp::get_attr(const char *name, bufferlist *dest) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + 
return svc->get_attr(obj, name, dest); +} + +int RGWSI_SysObj::Obj::WOp::remove() +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->remove(source.get_ctx(), + objv_tracker, + obj); +} + +int RGWSI_SysObj::Obj::WOp::write(bufferlist& bl) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->write(obj, pmtime, attrs, exclusive, + bl, objv_tracker, mtime); +} + +int RGWSI_SysObj::Obj::WOp::write_data(bufferlist& bl) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->write_data(obj, bl, exclusive, objv_tracker); +} + +int RGWSI_SysObj::Obj::WOp::write_attrs() +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + return svc->set_attrs(obj, attrs, nullptr, objv_tracker); +} + +int RGWSI_SysObj::Obj::WOp::write_attr(const char *name, bufferlist& bl) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.get_obj(); + + map m; + m[name] = bl; + + return svc->set_attrs(obj, m, nullptr, objv_tracker); +} + +int RGWSI_SysObj::Pool::Op::list_prefixed_objs(const string& prefix, list *result) +{ + bool is_truncated; + + auto rados_pool = source.rados_svc->pool(source.pool); + + auto op = rados_pool.op(); + + RGWAccessListFilterPrefix filter(prefix); + + int r = op.init(string(), &filter); + if (r < 0) { + return r; + } + + do { + list oids; +#define MAX_OBJS_DEFAULT 1000 + int r = op.get_next(MAX_OBJS_DEFAULT, &oids, &is_truncated); + if (r < 0) { + return r; + } + for (auto& val : oids) { + if (val.size() > prefix.size()) { + result->push_back(val.substr(prefix.size())); + } + } + } while (is_truncated); + + return 0; +} + +int RGWSI_SysObj::Obj::OmapOp::get_all(std::map *m) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_get_all(obj, m); +} + +int RGWSI_SysObj::Obj::OmapOp::get_vals(const string& marker, + uint64_t count, 
+ std::map *m, + bool *pmore) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_get_vals(obj, marker, count, m, pmore); +} + +int RGWSI_SysObj::Obj::OmapOp::set(const std::string& key, bufferlist& bl) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_set(obj, key, bl, must_exist); +} + +int RGWSI_SysObj::Obj::OmapOp::set(const map& m) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_set(obj, m, must_exist); +} + +int RGWSI_SysObj::Obj::OmapOp::del(const std::string& key) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->omap_del(obj, key); +} + +int RGWSI_SysObj::Obj::WNOp::notify(bufferlist& bl, + uint64_t timeout_ms, + bufferlist *pbl) +{ + RGWSI_SysObj_Core *svc = source.core_svc; + rgw_raw_obj& obj = source.obj; + + return svc->notify(obj, bl, timeout_ms, pbl); +} + +RGWSI_Zone *RGWSI_SysObj::get_zone_svc() +{ + return core_svc->get_zone_svc(); +} diff --git a/src/rgw/services/svc_sys_obj.h b/src/rgw/services/svc_sys_obj.h new file mode 100644 index 00000000..f6cd77ce --- /dev/null +++ b/src/rgw/services/svc_sys_obj.h @@ -0,0 +1,275 @@ +#ifndef CEPH_RGW_SERVICES_SYS_OBJ_H +#define CEPH_RGW_SERVICES_SYS_OBJ_H + + +#include "rgw/rgw_service.h" + +#include "svc_rados.h" +#include "svc_sys_obj_core.h" + + +class RGWSI_Zone; +class RGWSI_SysObj; +class RGWSysObjectCtx; + +struct rgw_cache_entry_info; + +class RGWSI_SysObj : public RGWServiceInstance +{ + friend struct RGWServices_Def; + +public: + class Obj { + friend class ROp; + + RGWSI_SysObj_Core *core_svc; + RGWSysObjectCtx& ctx; + rgw_raw_obj obj; + + public: + Obj(RGWSI_SysObj_Core *_core_svc, + RGWSysObjectCtx& _ctx, + const rgw_raw_obj& _obj) : core_svc(_core_svc), + ctx(_ctx), + obj(_obj) {} + + void invalidate(); + + RGWSysObjectCtx& get_ctx() { + return ctx; + } + + rgw_raw_obj& get_obj() { + return obj; + } 
+ + struct ROp { + Obj& source; + + RGWSI_SysObj_Core::GetObjState state; + + RGWObjVersionTracker *objv_tracker{nullptr}; + map *attrs{nullptr}; + bool raw_attrs{false}; + boost::optional refresh_version{boost::none}; + ceph::real_time *lastmod{nullptr}; + uint64_t *obj_size{nullptr}; + rgw_cache_entry_info *cache_info{nullptr}; + + ROp& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + ROp& set_last_mod(ceph::real_time *_lastmod) { + lastmod = _lastmod; + return *this; + } + + ROp& set_obj_size(uint64_t *_obj_size) { + obj_size = _obj_size; + return *this; + } + + ROp& set_attrs(map *_attrs) { + attrs = _attrs; + return *this; + } + + ROp& set_raw_attrs(bool ra) { + raw_attrs = ra; + return *this; + } + + ROp& set_refresh_version(boost::optional& rf) { + refresh_version = rf; + return *this; + } + + ROp& set_cache_info(rgw_cache_entry_info *ci) { + cache_info = ci; + return *this; + } + + ROp(Obj& _source) : source(_source) {} + + int stat(); + int read(int64_t ofs, int64_t end, bufferlist *pbl); + int read(bufferlist *pbl) { + return read(0, -1, pbl); + } + int get_attr(const char *name, bufferlist *dest); + }; + + struct WOp { + Obj& source; + + RGWObjVersionTracker *objv_tracker{nullptr}; + map attrs; + ceph::real_time mtime; + ceph::real_time *pmtime{nullptr}; + bool exclusive{false}; + + WOp& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) { + objv_tracker = _objv_tracker; + return *this; + } + + WOp& set_attrs(map& _attrs) { + attrs = _attrs; + return *this; + } + + WOp& set_attrs(map&& _attrs) { + attrs = _attrs; + return *this; + } + + WOp& set_mtime(const ceph::real_time& _mtime) { + mtime = _mtime; + return *this; + } + + WOp& set_pmtime(ceph::real_time *_pmtime) { + pmtime = _pmtime; + return *this; + } + + WOp& set_exclusive(bool _exclusive = true) { + exclusive = _exclusive; + return *this; + } + + WOp(Obj& _source) : source(_source) {} + + int remove(); + int write(bufferlist& bl); 
+ + int write_data(bufferlist& bl); /* write data only */ + int write_attrs(); /* write attrs only */ + int write_attr(const char *name, bufferlist& bl); /* write attrs only */ + }; + + struct OmapOp { + Obj& source; + + bool must_exist{false}; + + OmapOp& set_must_exist(bool _must_exist = true) { + must_exist = _must_exist; + return *this; + } + + OmapOp(Obj& _source) : source(_source) {} + + int get_all(std::map *m); + int get_vals(const string& marker, + uint64_t count, + std::map *m, + bool *pmore); + int set(const std::string& key, bufferlist& bl); + int set(const map& m); + int del(const std::string& key); + }; + + struct WNOp { + Obj& source; + + WNOp(Obj& _source) : source(_source) {} + + int notify(bufferlist& bl, + uint64_t timeout_ms, + bufferlist *pbl); + }; + ROp rop() { + return ROp(*this); + } + + WOp wop() { + return WOp(*this); + } + + OmapOp omap() { + return OmapOp(*this); + } + + WNOp wn() { + return WNOp(*this); + } + }; + + class Pool { + friend class Op; + + RGWSI_RADOS *rados_svc; + RGWSI_SysObj_Core *core_svc; + rgw_pool pool; + + public: + Pool(RGWSI_RADOS *_rados_svc, + RGWSI_SysObj_Core *_core_svc, + const rgw_pool& _pool) : rados_svc(_rados_svc), + core_svc(_core_svc), + pool(_pool) {} + + rgw_pool& get_pool() { + return pool; + } + + struct Op { + Pool& source; + + Op(Pool& _source) : source(_source) {} + + int list_prefixed_objs(const std::string& prefix, std::list *result); + }; + + Op op() { + return Op(*this); + } + }; + + friend class Obj; + friend class Obj::ROp; + friend class Obj::WOp; + friend class Pool; + friend class Pool::Op; + +protected: + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_SysObj_Core *core_svc{nullptr}; + + void init(RGWSI_RADOS *_rados_svc, + RGWSI_SysObj_Core *_core_svc) { + rados_svc = _rados_svc; + core_svc = _core_svc; + } + +public: + RGWSI_SysObj(CephContext *cct): RGWServiceInstance(cct) {} + + RGWSysObjectCtx init_obj_ctx(); + Obj get_obj(RGWSysObjectCtx& obj_ctx, const rgw_raw_obj& obj); + + Pool 
get_pool(const rgw_pool& pool) { + return Pool(rados_svc, core_svc, pool); + } + + RGWSI_Zone *get_zone_svc(); +}; + +using RGWSysObj = RGWSI_SysObj::Obj; + +class RGWSysObjectCtx : public RGWSysObjectCtxBase +{ + RGWSI_SysObj *sysobj_svc; +public: + RGWSysObjectCtx(RGWSI_SysObj *_sysobj_svc) : sysobj_svc(_sysobj_svc) {} + + RGWSI_SysObj::Obj get_obj(const rgw_raw_obj& obj) { + return sysobj_svc->get_obj(*this, obj); + } +}; + +#endif + diff --git a/src/rgw/services/svc_sys_obj_cache.cc b/src/rgw/services/svc_sys_obj_cache.cc new file mode 100644 index 00000000..9130e054 --- /dev/null +++ b/src/rgw/services/svc_sys_obj_cache.cc @@ -0,0 +1,506 @@ +#include "svc_sys_obj_cache.h" +#include "svc_zone.h" +#include "svc_notify.h" + +#include "rgw/rgw_zone.h" +#include "rgw/rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw + +class RGWSI_SysObj_Cache_CB : public RGWSI_Notify::CB +{ + RGWSI_SysObj_Cache *svc; +public: + RGWSI_SysObj_Cache_CB(RGWSI_SysObj_Cache *_svc) : svc(_svc) {} + int watch_cb(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) { + return svc->watch_cb(notify_id, cookie, notifier_id, bl); + } + + void set_enabled(bool status) { + svc->set_enabled(status); + } +}; + +int RGWSI_SysObj_Cache::do_start() +{ + int r = RGWSI_SysObj_Core::do_start(); + if (r < 0) { + return r; + } + + r = notify_svc->start(); + if (r < 0) { + return r; + } + + assert(notify_svc->is_started()); + + cb.reset(new RGWSI_SysObj_Cache_CB(this)); + + notify_svc->register_watch_cb(cb.get()); + + return 0; +} + +static string normal_name(rgw_pool& pool, const std::string& oid) { + std::string buf; + buf.reserve(pool.name.size() + pool.ns.size() + oid.size() + 2); + buf.append(pool.name).append("+").append(pool.ns).append("+").append(oid); + return buf; +} + +void RGWSI_SysObj_Cache::normalize_pool_and_obj(const rgw_pool& src_pool, const string& src_obj, rgw_pool& dst_pool, string& dst_obj) +{ + if (src_obj.size()) { + dst_pool = src_pool; + dst_obj = 
src_obj; + } else { + dst_pool = zone_svc->get_zone_params().domain_root; + dst_obj = src_pool.name; + } +} + + +int RGWSI_SysObj_Cache::remove(RGWSysObjectCtxBase& obj_ctx, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj) + +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + + string name = normal_name(pool, oid); + cache.remove(name); + + ObjectCacheInfo info; + int r = distribute_cache(name, obj, info, REMOVE_OBJ); + if (r < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to distribute cache: r=" << r << dendl; + } + + return RGWSI_SysObj_Core::remove(obj_ctx, objv_tracker, obj); +} + +int RGWSI_SysObj_Cache::read(RGWSysObjectCtxBase& obj_ctx, + GetObjState& read_state, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + bufferlist *obl, off_t ofs, off_t end, + map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional refresh_version) +{ + rgw_pool pool; + string oid; + if (ofs != 0) { + return RGWSI_SysObj_Core::read(obj_ctx, read_state, objv_tracker, + obj, obl, ofs, end, attrs, raw_attrs, + cache_info, refresh_version); + } + + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + string name = normal_name(pool, oid); + + ObjectCacheInfo info; + + uint32_t flags = (end != 0 ? 
CACHE_FLAG_DATA : 0); + if (objv_tracker) + flags |= CACHE_FLAG_OBJV; + if (attrs) + flags |= CACHE_FLAG_XATTRS; + + int r = cache.get(name, info, flags, cache_info); + if (r == 0 && + (!refresh_version || !info.version.compare(&(*refresh_version)))) { + if (info.status < 0) + return info.status; + + bufferlist& bl = info.data; + + bufferlist::iterator i = bl.begin(); + + obl->clear(); + + i.copy_all(*obl); + if (objv_tracker) + objv_tracker->read_version = info.version; + if (attrs) { + if (raw_attrs) { + *attrs = info.xattrs; + } else { + rgw_filter_attrset(info.xattrs, RGW_ATTR_PREFIX, attrs); + } + } + return obl->length(); + } + if(r == -ENODATA) + return -ENOENT; + + map unfiltered_attrset; + r = RGWSI_SysObj_Core::read(obj_ctx, read_state, objv_tracker, + obj, obl, ofs, end, + (attrs ? &unfiltered_attrset : nullptr), + true, /* cache unfiltered attrs */ + cache_info, + refresh_version); + if (r < 0) { + if (r == -ENOENT) { // only update ENOENT, we'd rather retry other errors + info.status = r; + cache.put(name, info, cache_info); + } + return r; + } + + if (obl->length() == end + 1) { + /* in this case, most likely object contains more data, we can't cache it */ + flags &= ~CACHE_FLAG_DATA; + } else { + bufferptr p(r); + bufferlist& bl = info.data; + bl.clear(); + bufferlist::iterator o = obl->begin(); + o.copy_all(bl); + } + + info.status = 0; + info.flags = flags; + if (objv_tracker) { + info.version = objv_tracker->read_version; + } + if (attrs) { + info.xattrs = std::move(unfiltered_attrset); + if (raw_attrs) { + *attrs = info.xattrs; + } else { + rgw_filter_attrset(info.xattrs, RGW_ATTR_PREFIX, attrs); + } + } + cache.put(name, info, cache_info); + return r; +} + +int RGWSI_SysObj_Cache::get_attr(const rgw_raw_obj& obj, + const char *attr_name, + bufferlist *dest) +{ + rgw_pool pool; + string oid; + + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + string name = normal_name(pool, oid); + + ObjectCacheInfo info; + + uint32_t flags = 
CACHE_FLAG_XATTRS; + + int r = cache.get(name, info, flags, nullptr); + if (r == 0) { + if (info.status < 0) + return info.status; + + auto iter = info.xattrs.find(attr_name); + if (iter == info.xattrs.end()) { + return -ENODATA; + } + + *dest = iter->second; + return dest->length(); + } else if (r == -ENODATA) { + return -ENOENT; + } + /* don't try to cache this one */ + return RGWSI_SysObj_Core::get_attr(obj, attr_name, dest); +} + +int RGWSI_SysObj_Cache::set_attrs(const rgw_raw_obj& obj, + map& attrs, + map *rmattrs, + RGWObjVersionTracker *objv_tracker) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + ObjectCacheInfo info; + info.xattrs = attrs; + if (rmattrs) { + info.rm_xattrs = *rmattrs; + } + info.status = 0; + info.flags = CACHE_FLAG_MODIFY_XATTRS; + int ret = RGWSI_SysObj_Core::set_attrs(obj, attrs, rmattrs, objv_tracker); + string name = normal_name(pool, oid); + if (ret >= 0) { + if (objv_tracker && objv_tracker->read_version.ver) { + info.version = objv_tracker->read_version; + info.flags |= CACHE_FLAG_OBJV; + } + cache.put(name, info, NULL); + int r = distribute_cache(name, obj, info, UPDATE_OBJ); + if (r < 0) + ldout(cct, 0) << "ERROR: failed to distribute cache for " << obj << dendl; + } else { + cache.remove(name); + } + + return ret; +} + +int RGWSI_SysObj_Cache::write(const rgw_raw_obj& obj, + real_time *pmtime, + map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + ObjectCacheInfo info; + info.xattrs = attrs; + info.status = 0; + info.data = data; + info.flags = CACHE_FLAG_XATTRS | CACHE_FLAG_DATA | CACHE_FLAG_META; + ceph::real_time result_mtime; + int ret = RGWSI_SysObj_Core::write(obj, &result_mtime, attrs, + exclusive, data, + objv_tracker, set_mtime); + if (pmtime) { + *pmtime = result_mtime; + } + if (objv_tracker && 
objv_tracker->read_version.ver) { + info.version = objv_tracker->read_version; + info.flags |= CACHE_FLAG_OBJV; + } + info.meta.mtime = result_mtime; + info.meta.size = data.length(); + string name = normal_name(pool, oid); + if (ret >= 0) { + cache.put(name, info, NULL); + int r = distribute_cache(name, obj, info, UPDATE_OBJ); + if (r < 0) + ldout(cct, 0) << "ERROR: failed to distribute cache for " << obj << dendl; + } else { + cache.remove(name); + } + + return ret; +} + +int RGWSI_SysObj_Cache::write_data(const rgw_raw_obj& obj, + const bufferlist& data, + bool exclusive, + RGWObjVersionTracker *objv_tracker) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + + ObjectCacheInfo info; + info.data = data; + info.meta.size = data.length(); + info.status = 0; + info.flags = CACHE_FLAG_DATA; + + int ret = RGWSI_SysObj_Core::write_data(obj, data, exclusive, objv_tracker); + string name = normal_name(pool, oid); + if (ret >= 0) { + if (objv_tracker && objv_tracker->read_version.ver) { + info.version = objv_tracker->read_version; + info.flags |= CACHE_FLAG_OBJV; + } + cache.put(name, info, NULL); + int r = distribute_cache(name, obj, info, UPDATE_OBJ); + if (r < 0) + ldout(cct, 0) << "ERROR: failed to distribute cache for " << obj << dendl; + } else { + cache.remove(name); + } + + return ret; +} + +int RGWSI_SysObj_Cache::raw_stat(const rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *pepoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker) +{ + rgw_pool pool; + string oid; + normalize_pool_and_obj(obj.pool, obj.oid, pool, oid); + + string name = normal_name(pool, oid); + + uint64_t size; + real_time mtime; + uint64_t epoch; + + ObjectCacheInfo info; + uint32_t flags = CACHE_FLAG_META | CACHE_FLAG_XATTRS; + if (objv_tracker) + flags |= CACHE_FLAG_OBJV; + int r = cache.get(name, info, flags, NULL); + if (r == 0) { + if (info.status < 0) + return info.status; + + size = info.meta.size; + 
mtime = info.meta.mtime; + epoch = info.epoch; + if (objv_tracker) + objv_tracker->read_version = info.version; + goto done; + } + if (r == -ENODATA) { + return -ENOENT; + } + r = RGWSI_SysObj_Core::raw_stat(obj, &size, &mtime, &epoch, &info.xattrs, first_chunk, objv_tracker); + if (r < 0) { + if (r == -ENOENT) { + info.status = r; + cache.put(name, info, NULL); + } + return r; + } + info.status = 0; + info.epoch = epoch; + info.meta.mtime = mtime; + info.meta.size = size; + info.flags = CACHE_FLAG_META | CACHE_FLAG_XATTRS; + if (objv_tracker) { + info.flags |= CACHE_FLAG_OBJV; + info.version = objv_tracker->read_version; + } + cache.put(name, info, NULL); +done: + if (psize) + *psize = size; + if (pmtime) + *pmtime = mtime; + if (pepoch) + *pepoch = epoch; + if (attrs) + *attrs = info.xattrs; + return 0; +} + +int RGWSI_SysObj_Cache::distribute_cache(const string& normal_name, const rgw_raw_obj& obj, ObjectCacheInfo& obj_info, int op) +{ + RGWCacheNotifyInfo info; + + info.op = op; + + info.obj_info = obj_info; + info.obj = obj; + bufferlist bl; + encode(info, bl); + return notify_svc->distribute(normal_name, bl); +} + +int RGWSI_SysObj_Cache::watch_cb(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + RGWCacheNotifyInfo info; + + try { + auto iter = bl.cbegin(); + decode(info, iter); + } catch (buffer::end_of_buffer& err) { + ldout(cct, 0) << "ERROR: got bad notification" << dendl; + return -EIO; + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: buffer::error" << dendl; + return -EIO; + } + + rgw_pool pool; + string oid; + normalize_pool_and_obj(info.obj.pool, info.obj.oid, pool, oid); + string name = normal_name(pool, oid); + + switch (info.op) { + case UPDATE_OBJ: + cache.put(name, info.obj_info, NULL); + break; + case REMOVE_OBJ: + cache.remove(name); + break; + default: + ldout(cct, 0) << "WARNING: got unknown notification op: " << info.op << dendl; + return -EINVAL; + } + + return 0; +} + +void 
RGWSI_SysObj_Cache::set_enabled(bool status) +{ + cache.set_enabled(status); +} + +bool RGWSI_SysObj_Cache::chain_cache_entry(std::initializer_list cache_info_entries, + RGWChainedCache::Entry *chained_entry) +{ + return cache.chain_cache_entry(cache_info_entries, chained_entry); +} + +void RGWSI_SysObj_Cache::register_chained_cache(RGWChainedCache *cc) +{ + cache.chain_cache(cc); +} + +void RGWSI_SysObj_Cache::unregister_chained_cache(RGWChainedCache *cc) +{ + cache.unchain_cache(cc); +} + +static void cache_list_dump_helper(Formatter* f, + const std::string& name, + const ceph::real_time mtime, + const std::uint64_t size) +{ + f->dump_string("name", name); + f->dump_string("mtime", ceph::to_iso_8601(mtime)); + f->dump_unsigned("size", size); +} + +void RGWSI_SysObj_Cache::call_list(const std::optional& filter, Formatter* f) +{ + cache.for_each( + [this, &filter, f] (const string& name, const ObjectCacheEntry& entry) { + if (!filter || name.find(*filter) != name.npos) { + cache_list_dump_helper(f, name, entry.info.meta.mtime, + entry.info.meta.size); + } + }); +} + +int RGWSI_SysObj_Cache::call_inspect(const std::string& target, Formatter* f) +{ + if (const auto entry = cache.get(target)) { + f->open_object_section("cache_entry"); + f->dump_string("name", target.c_str()); + entry->dump(f); + f->close_section(); + return true; + } else { + return false; + } +} + +int RGWSI_SysObj_Cache::call_erase(const std::string& target) +{ + return cache.remove(target); +} + +int RGWSI_SysObj_Cache::call_zap() +{ + cache.invalidate_all(); + return 0; +} diff --git a/src/rgw/services/svc_sys_obj_cache.h b/src/rgw/services/svc_sys_obj_cache.h new file mode 100644 index 00000000..e48b64f2 --- /dev/null +++ b/src/rgw/services/svc_sys_obj_cache.h @@ -0,0 +1,176 @@ + +#ifndef CEPH_RGW_SERVICES_SYS_OBJ_CACHE_H +#define CEPH_RGW_SERVICES_SYS_OBJ_CACHE_H + + +#include "rgw/rgw_service.h" +#include "rgw/rgw_cache.h" + +#include "svc_sys_obj_core.h" + +class RGWSI_Notify; + +class 
RGWSI_SysObj_Cache_CB; + +class RGWSI_SysObj_Cache : public RGWSI_SysObj_Core +{ + friend class RGWSI_SysObj_Cache_CB; + friend class RGWServices_Def; + + RGWSI_Notify *notify_svc{nullptr}; + ObjectCache cache; + + std::shared_ptr cb; + + void normalize_pool_and_obj(const rgw_pool& src_pool, const string& src_obj, rgw_pool& dst_pool, string& dst_obj); +protected: + void init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc, + RGWSI_Notify *_notify_svc) { + core_init(_rados_svc, _zone_svc); + notify_svc = _notify_svc; + } + + int do_start() override; + + int raw_stat(const rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker) override; + + int read(RGWSysObjectCtxBase& obj_ctx, + GetObjState& read_state, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + bufferlist *bl, off_t ofs, off_t end, + map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional) override; + + int get_attr(const rgw_raw_obj& obj, const char *name, bufferlist *dest) override; + + int set_attrs(const rgw_raw_obj& obj, + map& attrs, + map *rmattrs, + RGWObjVersionTracker *objv_tracker); + + int remove(RGWSysObjectCtxBase& obj_ctx, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj) override; + + int write(const rgw_raw_obj& obj, + real_time *pmtime, + map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime) override; + + int write_data(const rgw_raw_obj& obj, + const bufferlist& bl, + bool exclusive, + RGWObjVersionTracker *objv_tracker); + + int distribute_cache(const string& normal_name, const rgw_raw_obj& obj, ObjectCacheInfo& obj_info, int op); + + int watch_cb(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl); + + void set_enabled(bool status); + +public: + RGWSI_SysObj_Cache(CephContext *cct) : RGWSI_SysObj_Core(cct) { + cache.set_ctx(cct); + } + + bool 
chain_cache_entry(std::initializer_list cache_info_entries, + RGWChainedCache::Entry *chained_entry); + void register_chained_cache(RGWChainedCache *cc); + void unregister_chained_cache(RGWChainedCache *cc); + + void call_list(const std::optional& filter, Formatter* f); + int call_inspect(const std::string& target, Formatter* f); + int call_erase(const std::string& target); + int call_zap(); +}; + +template +class RGWChainedCacheImpl : public RGWChainedCache { + RGWSI_SysObj_Cache *svc{nullptr}; + ceph::timespan expiry; + RWLock lock; + + std::unordered_map> entries; + +public: + RGWChainedCacheImpl() : lock("RGWChainedCacheImpl::lock") {} + ~RGWChainedCacheImpl() { + if (!svc) { + return; + } + svc->unregister_chained_cache(this); + } + + void unregistered() override { + svc = nullptr; + } + + void init(RGWSI_SysObj_Cache *_svc) { + if (!_svc) { + return; + } + svc = _svc; + svc->register_chained_cache(this); + expiry = std::chrono::seconds(svc->ctx()->_conf.get_val( + "rgw_cache_expiry_interval")); + } + + boost::optional find(const string& key) { + RWLock::RLocker rl(lock); + auto iter = entries.find(key); + if (iter == entries.end()) { + return boost::none; + } + if (expiry.count() && + (ceph::coarse_mono_clock::now() - iter->second.second) > expiry) { + return boost::none; + } + + return iter->second.first; + } + + bool put(RGWSI_SysObj_Cache *svc, const string& key, T *entry, + std::initializer_list cache_info_entries) { + if (!svc) { + return false; + } + + Entry chain_entry(this, key, entry); + + /* we need the svc cache to call us under its lock to maintain lock ordering */ + return svc->chain_cache_entry(cache_info_entries, &chain_entry); + } + + void chain_cb(const string& key, void *data) override { + T *entry = static_cast(data); + RWLock::WLocker wl(lock); + entries[key].first = *entry; + if (expiry.count() > 0) { + entries[key].second = ceph::coarse_mono_clock::now(); + } + } + + void invalidate(const string& key) override { + RWLock::WLocker 
wl(lock); + entries.erase(key); + } + + void invalidate_all() override { + RWLock::WLocker wl(lock); + entries.clear(); + } +}; /* RGWChainedCacheImpl */ + +#endif diff --git a/src/rgw/services/svc_sys_obj_core.cc b/src/rgw/services/svc_sys_obj_core.cc new file mode 100644 index 00000000..ead6aebd --- /dev/null +++ b/src/rgw/services/svc_sys_obj_core.cc @@ -0,0 +1,595 @@ +#include "svc_sys_obj_core.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw/rgw_tools.h" + +#define dout_subsys ceph_subsys_rgw + +int RGWSI_SysObj_Core::GetObjState::get_rados_obj(RGWSI_RADOS *rados_svc, + RGWSI_Zone *zone_svc, + const rgw_raw_obj& obj, + RGWSI_RADOS::Obj **pobj) +{ + if (!has_rados_obj) { + if (obj.oid.empty()) { + ldout(rados_svc->ctx(), 0) << "ERROR: obj.oid is empty" << dendl; + return -EINVAL; + } + + rados_obj = rados_svc->obj(obj); + int r = rados_obj.open(); + if (r < 0) { + return r; + } + has_rados_obj = true; + } + *pobj = &rados_obj; + return 0; +} + +int RGWSI_SysObj_Core::get_rados_obj(RGWSI_Zone *zone_svc, + const rgw_raw_obj& obj, + RGWSI_RADOS::Obj *pobj) +{ + if (obj.oid.empty()) { + ldout(rados_svc->ctx(), 0) << "ERROR: obj.oid is empty" << dendl; + return -EINVAL; + } + + *pobj = std::move(rados_svc->obj(obj)); + int r = pobj->open(); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWSI_SysObj_Core::get_system_obj_state_impl(RGWSysObjectCtxBase *rctx, const rgw_raw_obj& obj, RGWSysObjState **state, RGWObjVersionTracker *objv_tracker) +{ + if (obj.empty()) { + return -EINVAL; + } + + RGWSysObjState *s = rctx->get_state(obj); + ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl; + *state = s; + if (s->has_attrs) { + return 0; + } + + s->obj = obj; + + int r = raw_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? 
&s->data : nullptr), objv_tracker); + if (r == -ENOENT) { + s->exists = false; + s->has_attrs = true; + s->mtime = real_time(); + return 0; + } + if (r < 0) + return r; + + s->exists = true; + s->has_attrs = true; + s->obj_tag = s->attrset[RGW_ATTR_ID_TAG]; + + if (s->obj_tag.length()) + ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to " + << s->obj_tag.c_str() << dendl; + else + ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl; + + return 0; +} + +int RGWSI_SysObj_Core::get_system_obj_state(RGWSysObjectCtxBase *rctx, const rgw_raw_obj& obj, RGWSysObjState **state, RGWObjVersionTracker *objv_tracker) +{ + int ret; + + do { + ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker); + } while (ret == -EAGAIN); + + return ret; +} + +int RGWSI_SysObj_Core::raw_stat(const rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + return r; + } + + uint64_t size = 0; + struct timespec mtime_ts; + + librados::ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + op.getxattrs(attrs, nullptr); + if (psize || pmtime) { + op.stat2(&size, &mtime_ts, nullptr); + } + if (first_chunk) { + op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, nullptr); + } + bufferlist outbl; + r = rados_obj.operate(&op, &outbl, null_yield); + + if (epoch) { + *epoch = rados_obj.get_last_version(); + } + + if (r < 0) + return r; + + if (psize) + *psize = size; + if (pmtime) + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + + return 0; +} + +int RGWSI_SysObj_Core::stat(RGWSysObjectCtxBase& obj_ctx, + GetObjState& state, + const rgw_raw_obj& obj, + map *attrs, + bool raw_attrs, + real_time *lastmod, + uint64_t *obj_size, + RGWObjVersionTracker *objv_tracker) +{ + RGWSysObjState *astate = nullptr; + + int r = 
get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker); + if (r < 0) + return r; + + if (!astate->exists) { + return -ENOENT; + } + + if (attrs) { + if (raw_attrs) { + *attrs = astate->attrset; + } else { + rgw_filter_attrset(astate->attrset, RGW_ATTR_PREFIX, attrs); + } + if (cct->_conf->subsys.should_gather()) { + map::iterator iter; + for (iter = attrs->begin(); iter != attrs->end(); ++iter) { + ldout(cct, 20) << "Read xattr: " << iter->first << dendl; + } + } + } + + if (obj_size) + *obj_size = astate->size; + if (lastmod) + *lastmod = astate->mtime; + + return 0; +} + +int RGWSI_SysObj_Core::read(RGWSysObjectCtxBase& obj_ctx, + GetObjState& read_state, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + bufferlist *bl, off_t ofs, off_t end, + map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional) +{ + uint64_t len; + librados::ObjectReadOperation op; + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + + ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl; + op.read(ofs, len, bl, nullptr); + + map unfiltered_attrset; + + if (attrs) { + if (raw_attrs) { + op.getxattrs(attrs, nullptr); + } else { + op.getxattrs(&unfiltered_attrset, nullptr); + } + } + + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + r = rados_obj.operate(&op, nullptr, null_yield); + if (r < 0) { + ldout(cct, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl; + return r; + } + ldout(cct, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl; + + uint64_t op_ver = rados_obj.get_last_version(); + + if (read_state.last_ver > 0 && + read_state.last_ver != op_ver) { + ldout(cct, 5) << "raced with an object write, abort" << dendl; + return -ECANCELED; + } + + 
if (attrs && !raw_attrs) { + rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs); + } + + read_state.last_ver = op_ver; + + return bl->length(); +} + +/** + * Get an attribute for a system object. + * obj: the object to get attr + * name: name of the attr to retrieve + * dest: bufferlist to store the result in + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWSI_SysObj_Core::get_attr(const rgw_raw_obj& obj, + const char *name, + bufferlist *dest) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectReadOperation op; + + int rval; + op.getxattr(name, dest, &rval); + + r = rados_obj.operate(&op, nullptr, null_yield); + if (r < 0) + return r; + + return 0; +} + +int RGWSI_SysObj_Core::set_attrs(const rgw_raw_obj& obj, + map& attrs, + map *rmattrs, + RGWObjVersionTracker *objv_tracker) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + + map::iterator iter; + if (rmattrs) { + for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + } + + if (!op.size()) + return 0; + + bufferlist bl; + + r = rados_obj.operate(&op, null_yield); + if (r < 0) + return r; + + return 0; +} + +int RGWSI_SysObj_Core::omap_get_vals(const rgw_raw_obj& obj, + const string& marker, + uint64_t count, + std::map *m, + bool *pmore) +{ + RGWSI_RADOS::Obj rados_obj; + int r = 
get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + string start_after = marker; + bool more; + + do { + librados::ObjectReadOperation op; + + std::map t; + int rval; + op.omap_get_vals2(start_after, count, &t, &more, &rval); + + r = rados_obj.operate(&op, nullptr, null_yield); + if (r < 0) { + return r; + } + if (t.empty()) { + break; + } + count -= t.size(); + start_after = t.rbegin()->first; + m->insert(t.begin(), t.end()); + } while (more && count > 0); + + if (pmore) { + *pmore = more; + } + return 0; +} + +int RGWSI_SysObj_Core::omap_get_all(const rgw_raw_obj& obj, std::map *m) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + +#define MAX_OMAP_GET_ENTRIES 1024 + const int count = MAX_OMAP_GET_ENTRIES; + string start_after; + bool more; + + do { + librados::ObjectReadOperation op; + + std::map t; + int rval; + op.omap_get_vals2(start_after, count, &t, &more, &rval); + + r = rados_obj.operate(&op, nullptr, null_yield); + if (r < 0) { + return r; + } + if (t.empty()) { + break; + } + start_after = t.rbegin()->first; + m->insert(t.begin(), t.end()); + } while (more); + return 0; +} + +int RGWSI_SysObj_Core::omap_set(const rgw_raw_obj& obj, const std::string& key, bufferlist& bl, bool must_exist) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl; + + map m; + m[key] = bl; + librados::ObjectWriteOperation op; + if (must_exist) + op.assert_exists(); + op.omap_set(m); + r = rados_obj.operate(&op, null_yield); + return r; +} + +int RGWSI_SysObj_Core::omap_set(const rgw_raw_obj& obj, const 
std::map& m, bool must_exist) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + if (must_exist) + op.assert_exists(); + op.omap_set(m); + r = rados_obj.operate(&op, null_yield); + return r; +} + +int RGWSI_SysObj_Core::omap_del(const rgw_raw_obj& obj, const std::string& key) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + set k; + k.insert(key); + + librados::ObjectWriteOperation op; + + op.omap_rm_keys(k); + + r = rados_obj.operate(&op, null_yield); + return r; +} + +int RGWSI_SysObj_Core::notify(const rgw_raw_obj& obj, + bufferlist& bl, + uint64_t timeout_ms, + bufferlist *pbl) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + r = rados_obj.notify(bl, timeout_ms, pbl); + return r; +} + +int RGWSI_SysObj_Core::remove(RGWSysObjectCtxBase& obj_ctx, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + + op.remove(); + r = rados_obj.operate(&op, null_yield); + if (r < 0) + return r; + + return 0; +} + +int RGWSI_SysObj_Core::write(const rgw_raw_obj& obj, + real_time *pmtime, + map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, 
obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + + if (exclusive) { + op.create(true); // exclusive create + } else { + op.remove(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.create(false); + } + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + + if (real_clock::is_zero(set_mtime)) { + set_mtime = real_clock::now(); + } + + struct timespec mtime_ts = real_clock::to_timespec(set_mtime); + op.mtime2(&mtime_ts); + op.write_full(data); + + bufferlist acl_bl; + + for (map::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + } + + r = rados_obj.operate(&op, null_yield); + if (r < 0) { + return r; + } + + if (objv_tracker) { + objv_tracker->apply_write(); + } + + if (pmtime) { + *pmtime = set_mtime; + } + + return 0; +} + + +int RGWSI_SysObj_Core::write_data(const rgw_raw_obj& obj, + const bufferlist& bl, + bool exclusive, + RGWObjVersionTracker *objv_tracker) +{ + RGWSI_RADOS::Obj rados_obj; + int r = get_rados_obj(zone_svc, obj, &rados_obj); + if (r < 0) { + ldout(cct, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; + return r; + } + + librados::ObjectWriteOperation op; + + if (exclusive) { + op.create(true); + } + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + op.write_full(bl); + r = rados_obj.operate(&op, null_yield); + if (r < 0) + return r; + + if (objv_tracker) { + objv_tracker->apply_write(); + } + return 0; +} + diff --git a/src/rgw/services/svc_sys_obj_core.h b/src/rgw/services/svc_sys_obj_core.h new file mode 100644 index 00000000..d033267e --- /dev/null +++ b/src/rgw/services/svc_sys_obj_core.h @@ -0,0 +1,201 @@ +#ifndef CEPH_RGW_SERVICES_SYS_OBJ_CORE_H +#define CEPH_RGW_SERVICES_SYS_OBJ_CORE_H + + +#include 
"rgw/rgw_service.h" + +#include "svc_rados.h" + + +class RGWSI_Zone; + +struct rgw_cache_entry_info; + +struct RGWSysObjState { + rgw_raw_obj obj; + bool has_attrs{false}; + bool exists{false}; + uint64_t size{0}; + ceph::real_time mtime; + uint64_t epoch{0}; + bufferlist obj_tag; + bool has_data{false}; + bufferlist data; + bool prefetch_data{false}; + uint64_t pg_ver{0}; + + /* important! don't forget to update copy constructor */ + + RGWObjVersionTracker objv_tracker; + + map attrset; + RGWSysObjState() {} + RGWSysObjState(const RGWSysObjState& rhs) : obj (rhs.obj) { + has_attrs = rhs.has_attrs; + exists = rhs.exists; + size = rhs.size; + mtime = rhs.mtime; + epoch = rhs.epoch; + if (rhs.obj_tag.length()) { + obj_tag = rhs.obj_tag; + } + has_data = rhs.has_data; + if (rhs.data.length()) { + data = rhs.data; + } + prefetch_data = rhs.prefetch_data; + pg_ver = rhs.pg_ver; + objv_tracker = rhs.objv_tracker; + } +}; + +class RGWSysObjectCtxBase { + std::map objs_state; + RWLock lock; + +public: + explicit RGWSysObjectCtxBase() : lock("RGWSysObjectCtxBase") {} + + RGWSysObjectCtxBase(const RGWSysObjectCtxBase& rhs) : objs_state(rhs.objs_state), + lock("RGWSysObjectCtxBase") {} + RGWSysObjectCtxBase(RGWSysObjectCtxBase&& rhs) : objs_state(std::move(rhs.objs_state)), /* rhs must be non-const: std::move on a const map silently copies; each instance gets its own fresh lock */ + lock("RGWSysObjectCtxBase") {} + + RGWSysObjState *get_state(const rgw_raw_obj& obj) { + RGWSysObjState *result; + std::map::iterator iter; + lock.get_read(); + assert (!obj.empty()); + iter = objs_state.find(obj); + if (iter != objs_state.end()) { + result = &iter->second; + lock.unlock(); + } else { + lock.unlock(); + lock.get_write(); + result = &objs_state[obj]; + lock.unlock(); + } + return result; + } + + void set_prefetch_data(rgw_raw_obj& obj) { + RWLock::WLocker wl(lock); + assert (!obj.empty()); + objs_state[obj].prefetch_data = true; + } + void invalidate(rgw_raw_obj& obj) { + 
RWLock::WLocker wl(lock); + auto iter = objs_state.find(obj); + if (iter == objs_state.end()) { + return; + } + 
objs_state.erase(iter); + } +}; + +class RGWSI_SysObj_Core : public RGWServiceInstance +{ + friend class RGWServices_Def; + friend class RGWSI_SysObj; + +protected: + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_Zone *zone_svc{nullptr}; + + struct GetObjState { + RGWSI_RADOS::Obj rados_obj; + bool has_rados_obj{false}; + uint64_t last_ver{0}; + + GetObjState() {} + + int get_rados_obj(RGWSI_RADOS *rados_svc, + RGWSI_Zone *zone_svc, + const rgw_raw_obj& obj, + RGWSI_RADOS::Obj **pobj); + }; + + + void core_init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc) { + rados_svc = _rados_svc; + zone_svc = _zone_svc; + } + int get_rados_obj(RGWSI_Zone *zone_svc, const rgw_raw_obj& obj, RGWSI_RADOS::Obj *pobj); + + virtual int raw_stat(const rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker); + + virtual int read(RGWSysObjectCtxBase& obj_ctx, + GetObjState& read_state, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj, + bufferlist *bl, off_t ofs, off_t end, + map *attrs, + bool raw_attrs, + rgw_cache_entry_info *cache_info, + boost::optional); + + virtual int remove(RGWSysObjectCtxBase& obj_ctx, + RGWObjVersionTracker *objv_tracker, + const rgw_raw_obj& obj); + + virtual int write(const rgw_raw_obj& obj, + real_time *pmtime, + map& attrs, + bool exclusive, + const bufferlist& data, + RGWObjVersionTracker *objv_tracker, + real_time set_mtime); + + virtual int write_data(const rgw_raw_obj& obj, + const bufferlist& bl, + bool exclusive, + RGWObjVersionTracker *objv_tracker); + + virtual int get_attr(const rgw_raw_obj& obj, const char *name, bufferlist *dest); + + virtual int set_attrs(const rgw_raw_obj& obj, + map& attrs, + map *rmattrs, + RGWObjVersionTracker *objv_tracker); + + virtual int omap_get_all(const rgw_raw_obj& obj, std::map *m); + virtual int omap_get_vals(const rgw_raw_obj& obj, + const string& marker, + uint64_t count, + std::map *m, + bool *pmore); + 
virtual int omap_set(const rgw_raw_obj& obj, const std::string& key, bufferlist& bl, bool must_exist = false); + virtual int omap_set(const rgw_raw_obj& obj, const map& m, bool must_exist = false); + virtual int omap_del(const rgw_raw_obj& obj, const std::string& key); + + virtual int notify(const rgw_raw_obj& obj, + bufferlist& bl, + uint64_t timeout_ms, + bufferlist *pbl); + + /* wrappers */ + int get_system_obj_state_impl(RGWSysObjectCtxBase *rctx, const rgw_raw_obj& obj, RGWSysObjState **state, RGWObjVersionTracker *objv_tracker); + int get_system_obj_state(RGWSysObjectCtxBase *rctx, const rgw_raw_obj& obj, RGWSysObjState **state, RGWObjVersionTracker *objv_tracker); + + int stat(RGWSysObjectCtxBase& obj_ctx, + GetObjState& state, + const rgw_raw_obj& obj, + map *attrs, + bool raw_attrs, + real_time *lastmod, + uint64_t *obj_size, + RGWObjVersionTracker *objv_tracker); + +public: + RGWSI_SysObj_Core(CephContext *cct): RGWServiceInstance(cct) {} + + RGWSI_Zone *get_zone_svc() { + return zone_svc; + } +}; + +#endif diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc new file mode 100644 index 00000000..724f83ae --- /dev/null +++ b/src/rgw/services/svc_zone.cc @@ -0,0 +1,1250 @@ +#include "svc_zone.h" +#include "svc_rados.h" +#include "svc_sys_obj.h" +#include "svc_sync_modules.h" + +#include "rgw/rgw_zone.h" +#include "rgw/rgw_rest_conn.h" + +#include "common/errno.h" +#include "include/random.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace rgw_zone_defaults; + +RGWSI_Zone::RGWSI_Zone(CephContext *cct) : RGWServiceInstance(cct) +{ +} + +void RGWSI_Zone::init(RGWSI_SysObj *_sysobj_svc, + RGWSI_RADOS * _rados_svc, + RGWSI_SyncModules * _sync_modules_svc) +{ + sysobj_svc = _sysobj_svc; + rados_svc = _rados_svc; + sync_modules_svc = _sync_modules_svc; + + realm = new RGWRealm(); + zonegroup = new RGWZoneGroup(); + zone_public_config = new RGWZone(); + zone_params = new RGWZoneParams(); + current_period = new RGWPeriod(); +} + 
+RGWSI_Zone::~RGWSI_Zone() +{ + delete realm; + delete zonegroup; + delete zone_public_config; + delete zone_params; + delete current_period; +} + +bool RGWSI_Zone::zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const +{ + return target_zone.syncs_from(source_zone.name) && + sync_modules_svc->get_manager()->supports_data_export(source_zone.tier_type); +} + +int RGWSI_Zone::do_start() +{ + int ret = sysobj_svc->start(); + if (ret < 0) { + return ret; + } + + assert(sysobj_svc->is_started()); /* if not then there's ordering issue */ + + ret = rados_svc->start(); + if (ret < 0) { + return ret; + } + ret = sync_modules_svc->start(); + if (ret < 0) { + return ret; + } + ret = realm->init(cct, sysobj_svc); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } else if (ret != -ENOENT) { + ldout(cct, 20) << "realm " << realm->get_name() << " " << realm->get_id() << dendl; + ret = current_period->init(cct, sysobj_svc, realm->get_id(), realm->get_name()); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl; + return ret; + } + ldout(cct, 20) << "current period " << current_period->get_id() << dendl; + } + + ret = replace_region_with_zonegroup(); + if (ret < 0) { + lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = convert_regionmap(); + if (ret < 0) { + lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl; + return ret; + } + + bool zg_initialized = false; + + if (!current_period->get_id().empty()) { + ret = init_zg_from_period(&zg_initialized); + if (ret < 0) { + return ret; + } + } + + bool creating_defaults = false; + bool using_local = (!zg_initialized); + if (using_local) { + ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl; + 
ret = init_zg_from_local(&creating_defaults); + if (ret < 0) { + return ret; + } + // read period_config into current_period + auto& period_config = current_period->get_config(); + ret = period_config.read(sysobj_svc, zonegroup->realm_id); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "ERROR: failed to read period config: " + << cpp_strerror(ret) << dendl; + return ret; + } + } + + ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl; + if (creating_defaults && cct->_conf->rgw_zone.empty()) { + ldout(cct, 10) << " Using default name "<< default_zone_name << dendl; + zone_params->set_name(default_zone_name); + } + + ret = zone_params->init(cct, sysobj_svc); + if (ret < 0 && ret != -ENOENT) { + lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + auto zone_iter = zonegroup->zones.find(zone_params->get_id()); + if (zone_iter == zonegroup->zones.end()) { + if (using_local) { + lderr(cct) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ")" << dendl; + return -EINVAL; + } + ldout(cct, 1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << "), switching to local zonegroup configuration" << dendl; + ret = init_zg_from_local(&creating_defaults); + if (ret < 0) { + return ret; + } + zone_iter = zonegroup->zones.find(zone_params->get_id()); + } + if (zone_iter != zonegroup->zones.end()) { + *zone_public_config = zone_iter->second; + ldout(cct, 20) << "zone " << zone_params->get_name() << dendl; + } else { + lderr(cct) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ")" << dendl; + return -EINVAL; + } + + zone_short_id = current_period->get_map().get_zone_short_id(zone_params->get_id()); + + RGWSyncModuleRef sm; + if (!sync_modules_svc->get_manager()->get_module(zone_public_config->tier_type, &sm)) { + lderr(cct) << "ERROR: tier type not found: " << 
zone_public_config->tier_type << dendl; + return -EINVAL; + } + + writeable_zone = sm->supports_writes(); + + /* first build all zones index */ + for (const auto& ziter : zonegroup->zones) { /* by reference: no per-iteration copy of the (id, RGWZone) pair */ + const string& id = ziter.first; + const RGWZone& z = ziter.second; + zone_id_by_name[z.name] = id; + zone_by_id[id] = z; + } + + if (zone_by_id.find(zone_id()) == zone_by_id.end()) { + ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl; + } + *zone_public_config = zone_by_id[zone_id()]; + for (const auto& ziter : zonegroup->zones) { + const string& id = ziter.first; + const RGWZone& z = ziter.second; + if (id == zone_id()) { + continue; + } + if (z.endpoints.empty()) { + ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.name << " id " << z.id << ": no endpoints defined" << dendl; /* name/id order matches the other zone log lines */ + continue; + } + ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl; + RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints); + zone_conn_map[id] = conn; + if (zone_syncs_from(*zone_public_config, z) || + zone_syncs_from(z, *zone_public_config)) { + if (zone_syncs_from(*zone_public_config, z)) { + data_sync_source_zones.push_back(&z); + } + if (zone_syncs_from(z, *zone_public_config)) { + zone_data_notify_to_map[id] = conn; + } + } else { + ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl; + } + } + + return 0; +} + +void RGWSI_Zone::shutdown() +{ + delete rest_master_conn; + + map::iterator iter; + for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + delete conn; + } + + for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + delete conn; + } +} + +int RGWSI_Zone::list_regions(list& regions) +{ + RGWZoneGroup zonegroup; + RGWSI_SysObj::Pool syspool = 
sysobj_svc->get_pool(zonegroup.get_pool(cct)); + + return syspool.op().list_prefixed_objs(region_info_oid_prefix, ®ions); +} + +int RGWSI_Zone::list_zonegroups(list& zonegroups) +{ + RGWZoneGroup zonegroup; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zonegroup.get_pool(cct)); + + return syspool.op().list_prefixed_objs(zonegroup_names_oid_prefix, &zonegroups); +} + +int RGWSI_Zone::list_zones(list& zones) +{ + RGWZoneParams zoneparams; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zoneparams.get_pool(cct)); + + return syspool.op().list_prefixed_objs(zone_names_oid_prefix, &zones); +} + +int RGWSI_Zone::list_realms(list& realms) +{ + RGWRealm realm(cct, sysobj_svc); + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(realm.get_pool(cct)); + + return syspool.op().list_prefixed_objs(realm_names_oid_prefix, &realms); +} + +int RGWSI_Zone::list_periods(list& periods) +{ + RGWPeriod period; + list raw_periods; + RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(period.get_pool(cct)); + int ret = syspool.op().list_prefixed_objs(period.get_info_oid_prefix(), &raw_periods); + if (ret < 0) { + return ret; + } + for (const auto& oid : raw_periods) { + size_t pos = oid.find("."); + if (pos != std::string::npos) { + periods.push_back(oid.substr(0, pos)); + } else { + periods.push_back(oid); + } + } + periods.sort(); // unique() only detects duplicates if they're adjacent + periods.unique(); + return 0; +} + + +int RGWSI_Zone::list_periods(const string& current_period, list& periods) +{ + int ret = 0; + string period_id = current_period; + while(!period_id.empty()) { + RGWPeriod period(period_id); + ret = period.init(cct, sysobj_svc); + if (ret < 0) { + return ret; + } + periods.push_back(period.get_id()); + period_id = period.get_predecessor(); + } + + return ret; +} + +/** + * Replace all region configuration with zonegroup for + * backward compatability + * Returns 0 on success, -ERR# on failure. 
+ */ +int RGWSI_Zone::replace_region_with_zonegroup() +{ + /* copy default region */ + /* convert default region to default zonegroup */ + string default_oid = cct->_conf->rgw_default_region_info_oid; + if (default_oid.empty()) { + default_oid = default_region_info_oid; + } + + RGWZoneGroup default_zonegroup; + rgw_pool pool{default_zonegroup.get_pool(cct)}; + string oid = "converted"; + bufferlist bl; + + RGWSysObjectCtx obj_ctx = sysobj_svc->init_obj_ctx(); + RGWSysObj sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid)); + + int ret = sysobj.rop().read(&bl); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } else if (ret != -ENOENT) { + ldout(cct, 20) << "System already converted " << dendl; + return 0; + } + + string default_region; + ret = default_zonegroup.init(cct, sysobj_svc, false, true); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = default_zonegroup.read_default_id(default_region, true); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + + /* convert regions to zonegroups */ + list regions; + ret = list_regions(regions); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } else if (ret == -ENOENT || regions.empty()) { + RGWZoneParams zoneparams(default_zone_name); + int ret = zoneparams.init(cct, sysobj_svc); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl; + return ret; + } + /* update master zone */ + RGWZoneGroup default_zg(default_zonegroup_name); + ret = default_zg.init(cct, sysobj_svc); + if (ret < 0 
&& ret != -ENOENT) { + ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl; + return ret; + } + if (ret != -ENOENT && default_zg.master_zone.empty()) { + default_zg.master_zone = zoneparams.get_id(); + return default_zg.update(); + } + return 0; + } + + string master_region, master_zone; + for (list::iterator iter = regions.begin(); iter != regions.end(); ++iter) { + if (*iter != default_zonegroup_name){ + RGWZoneGroup region(*iter); + int ret = region.init(cct, sysobj_svc, true, true); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + if (region.is_master_zonegroup()) { + master_region = region.get_id(); + master_zone = region.master_zone; + } + } + } + + /* create realm if there is none. + The realm name will be the region and zone concatenated + realm id will be mds of its name */ + if (realm->get_id().empty() && !master_region.empty() && !master_zone.empty()) { + string new_realm_name = master_region + "." 
+ master_zone; + unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + MD5 hash; + hash.Update((const unsigned char *)new_realm_name.c_str(), new_realm_name.length()); + hash.Final(md5); + buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str); + string new_realm_id(md5_str); + RGWRealm new_realm(new_realm_id,new_realm_name); + ret = new_realm.init(cct, sysobj_svc, false); + if (ret < 0) { + ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = new_realm.create(); + if (ret < 0 && ret != -EEXIST) { + ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = new_realm.set_as_default(); + if (ret < 0) { + ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = realm->init(cct, sysobj_svc); + if (ret < 0) { + ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = current_period->init(cct, sysobj_svc, realm->get_id(), realm->get_name()); + if (ret < 0) { + ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + list::iterator iter; + /* create zonegroups */ + for (iter = regions.begin(); iter != regions.end(); ++iter) + { + ldout(cct, 0) << __func__ << " Converting " << *iter << dendl; + /* check to see if we don't have already a zonegroup with this name */ + RGWZoneGroup new_zonegroup(*iter); + ret = new_zonegroup.init(cct , sysobj_svc); + if (ret == 0 && new_zonegroup.get_id() != *iter) { + ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () << + " skipping conversion " << dendl; + continue; + } + RGWZoneGroup zonegroup(*iter); + zonegroup.set_id(*iter); + int ret = zonegroup.init(cct, sysobj_svc, true, true); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed 
init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + zonegroup.realm_id = realm->get_id(); + /* fix default region master zone */ + if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) { + ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl; + zonegroup.master_zone = default_zone_name; + } + ret = zonegroup.update(); + if (ret < 0 && ret != -EEXIST) { + ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + ret = zonegroup.update_name(); + if (ret < 0 && ret != -EEXIST) { + ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + if (zonegroup.get_name() == default_region) { + ret = zonegroup.set_as_default(); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + } + for (map::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end(); + ++iter) { + ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl; + RGWZoneParams zoneparams(iter->first, iter->first); + zoneparams.set_id(iter->first); + zoneparams.realm_id = realm->get_id(); + ret = zoneparams.init(cct, sysobj_svc); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl; + return ret; + } else if (ret == -ENOENT) { + ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl; + continue; + } + zonegroup.realm_id = realm->get_id(); + ret = zoneparams.update(); + if (ret < 0 && ret != -EEXIST) { + ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = 
zoneparams.update_name(); + if (ret < 0 && ret != -EEXIST) { + ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + if (!current_period->get_id().empty()) { + ret = current_period->add_zonegroup(zonegroup); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl; + return ret; + } + } + } + + if (!current_period->get_id().empty()) { + ret = current_period->update(); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = current_period->store_info(false); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = current_period->reflect(); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + for (auto const& iter : regions) { + RGWZoneGroup zonegroup(iter); + int ret = zonegroup.init(cct, sysobj_svc, true, true); + if (ret < 0) { + ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = zonegroup.delete_obj(true); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + } + + /* mark as converted */ + ret = sysobj.wop() + .set_exclusive(true) + .write(bl); + if (ret < 0 ) { + ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + + return 0; +} + +/** + * Add new connection to connections map + * @param zonegroup_conn_map map which new connection will be added to + * @param zonegroup zonegroup which new connection will connect to + * @param new_connection pointer to new 
connection instance + */ +static void add_new_connection_to_map(map &zonegroup_conn_map, + const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection) +{ + // Delete if connection is already exists + map::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id()); + if (iterZoneGroup != zonegroup_conn_map.end()) { + delete iterZoneGroup->second; + } + + // Add new connection to connections map + zonegroup_conn_map[zonegroup.get_id()] = new_connection; +} + +int RGWSI_Zone::init_zg_from_period(bool *initialized) +{ + *initialized = false; + + if (current_period->get_id().empty()) { + return 0; + } + + int ret = zonegroup->init(cct, sysobj_svc); + ldout(cct, 20) << "period zonegroup init ret " << ret << dendl; + if (ret == -ENOENT) { + return 0; + } + if (ret < 0) { + ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl; + return ret; + } + ldout(cct, 20) << "period zonegroup name " << zonegroup->get_name() << dendl; + + map::const_iterator iter = + current_period->get_map().zonegroups.find(zonegroup->get_id()); + + if (iter != current_period->get_map().zonegroups.end()) { + ldout(cct, 20) << "using current period zonegroup " << zonegroup->get_name() << dendl; + *zonegroup = iter->second; + ret = zonegroup->init(cct, sysobj_svc, false); + if (ret < 0) { + ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl; + return ret; + } + ret = zone_params->init(cct, sysobj_svc); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl; + return ret; + } if (ret ==-ENOENT && zonegroup->get_name() == default_zonegroup_name) { + ldout(cct, 10) << " Using default name "<< default_zone_name << dendl; + zone_params->set_name(default_zone_name); + ret = zone_params->init(cct, sysobj_svc); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl; + return ret; + } + } + } + for 
(iter = current_period->get_map().zonegroups.begin(); + iter != current_period->get_map().zonegroups.end(); ++iter){ + const RGWZoneGroup& zg = iter->second; + // use endpoints from the zonegroup's master zone + auto master = zg.zones.find(zg.master_zone); + if (master == zg.zones.end()) { + // Check for empty zonegroup which can happen if zone was deleted before removal + if (zg.zones.size() == 0) + continue; + // fix missing master zone for a single zone zonegroup + if (zg.master_zone.empty() && zg.zones.size() == 1) { + master = zg.zones.begin(); + ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " << + master->second.name << " id:" << master->second.id << " as master" << dendl; + if (zonegroup->get_id() == zg.get_id()) { + zonegroup->master_zone = master->second.id; + ret = zonegroup->update(); + if (ret < 0) { + ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + } else { + RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name()); + ret = fixed_zg.init(cct, sysobj_svc); + if (ret < 0) { + ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + fixed_zg.master_zone = master->second.id; + ret = fixed_zg.update(); + if (ret < 0) { + ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + } + } else { + ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" << + zg.master_zone << dendl; + return -EINVAL; + } + } + const auto& endpoints = master->second.endpoints; + add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints)); + if (!current_period->get_master_zonegroup().empty() && + zg.get_id() == current_period->get_master_zonegroup()) { + rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints); + } + } + + *initialized = true; + + return 0; +} + +int RGWSI_Zone::init_zg_from_local(bool *creating_defaults) +{ + 
int ret = zonegroup->init(cct, sysobj_svc); + if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) { + ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl; + return ret; + } else if (ret == -ENOENT) { + *creating_defaults = true; + ldout(cct, 10) << "Creating default zonegroup " << dendl; + ret = zonegroup->create_default(); + if (ret < 0) { + ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + ret = zonegroup->init(cct, sysobj_svc); + if (ret < 0) { + ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret) + << dendl; + return ret; + } + } + ldout(cct, 20) << "zonegroup " << zonegroup->get_name() << dendl; + if (zonegroup->is_master_zonegroup()) { + // use endpoints from the zonegroup's master zone + auto master = zonegroup->zones.find(zonegroup->master_zone); + if (master == zonegroup->zones.end()) { + // fix missing master zone for a single zone zonegroup + if (zonegroup->master_zone.empty() && zonegroup->zones.size() == 1) { + master = zonegroup->zones.begin(); + ldout(cct, 0) << "zonegroup " << zonegroup->get_name() << " missing master_zone, setting zone " << + master->second.name << " id:" << master->second.id << " as master" << dendl; + zonegroup->master_zone = master->second.id; + ret = zonegroup->update(); + if (ret < 0) { + ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl; + return ret; + } + } else { + ldout(cct, 0) << "zonegroup " << zonegroup->get_name() << " missing zone for " + "master_zone=" << zonegroup->master_zone << dendl; + return -EINVAL; + } + } + const auto& endpoints = master->second.endpoints; + rest_master_conn = new RGWRESTConn(cct, this, zonegroup->get_id(), endpoints); + } + + return 0; +} + +int RGWSI_Zone::convert_regionmap() +{ + RGWZoneGroupMap zonegroupmap; + + string pool_name = 
cct->_conf->rgw_zone_root_pool; + if (pool_name.empty()) { + pool_name = RGW_DEFAULT_ZONE_ROOT_POOL; + } + string oid = region_map_oid; + + rgw_pool pool(pool_name); + bufferlist bl; + + RGWSysObjectCtx obj_ctx = sysobj_svc->init_obj_ctx(); + RGWSysObj sysobj = sysobj_svc->get_obj(obj_ctx, rgw_raw_obj(pool, oid)); + + int ret = sysobj.rop().read(&bl); + if (ret < 0 && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + return 0; + } + + try { + auto iter = bl.cbegin(); + decode(zonegroupmap, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl; + return -EIO; + } + + for (map::iterator iter = zonegroupmap.zonegroups.begin(); + iter != zonegroupmap.zonegroups.end(); ++iter) { + RGWZoneGroup& zonegroup = iter->second; + ret = zonegroup.init(cct, sysobj_svc, false); + ret = zonegroup.update(); + if (ret < 0 && ret != -ENOENT) { + ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " << + cpp_strerror(-ret) << dendl; + return ret; + } else if (ret == -ENOENT) { + ret = zonegroup.create(); + if (ret < 0) { + ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " << + cpp_strerror(-ret) << dendl; + return ret; + } + } + } + + current_period->set_user_quota(zonegroupmap.user_quota); + current_period->set_bucket_quota(zonegroupmap.bucket_quota); + + // remove the region_map so we don't try to convert again + ret = sysobj.wop().remove(); + if (ret < 0) { + ldout(cct, 0) << "Error could not remove " << sysobj.get_obj() + << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl; + return ret; + } + + return 0; +} + +const RGWZoneParams& RGWSI_Zone::get_zone_params() const +{ + return *zone_params; +} + +const RGWZone& RGWSI_Zone::get_zone() const +{ + return *zone_public_config; +} + +const RGWZoneGroup& RGWSI_Zone::get_zonegroup() const +{ + return *zonegroup; +} + +int RGWSI_Zone::get_zonegroup(const string& id, RGWZoneGroup& 
zg) const +{ + int ret = 0; + if (id == zonegroup->get_id()) { + zg = *zonegroup; + } else if (!current_period->get_id().empty()) { + ret = current_period->get_zonegroup(zg, id); + } + return ret; +} + +const RGWRealm& RGWSI_Zone::get_realm() const +{ + return *realm; +} + +const RGWPeriod& RGWSI_Zone::get_current_period() const +{ + return *current_period; +} + +const string& RGWSI_Zone::get_current_period_id() +{ + return current_period->get_id(); +} + +bool RGWSI_Zone::has_zonegroup_api(const std::string& api) const +{ + if (!current_period->get_id().empty()) { + const auto& zonegroups_by_api = current_period->get_map().zonegroups_by_api; + if (zonegroups_by_api.find(api) != zonegroups_by_api.end()) + return true; + } else if (zonegroup->api_name == api) { + return true; + } + return false; +} + +bool RGWSI_Zone::zone_is_writeable() +{ + return writeable_zone && !get_zone().is_read_only(); +} + +uint32_t RGWSI_Zone::get_zone_short_id() const +{ + return zone_short_id; +} + +const string& RGWSI_Zone::zone_name() +{ + return get_zone_params().get_name(); +} +const string& RGWSI_Zone::zone_id() +{ + return get_zone_params().get_id(); +} + +bool RGWSI_Zone::find_zone_by_id(const string& id, RGWZone **zone) +{ + auto iter = zone_by_id.find(id); + if (iter == zone_by_id.end()) { + return false; + } + *zone = &(iter->second); + return true; +} + +RGWRESTConn *RGWSI_Zone::get_zone_conn_by_id(const string& id) { + auto citer = zone_conn_map.find(id); + if (citer == zone_conn_map.end()) { + return NULL; + } + + return citer->second; +} + +RGWRESTConn *RGWSI_Zone::get_zone_conn_by_name(const string& name) { + auto i = zone_id_by_name.find(name); + if (i == zone_id_by_name.end()) { + return NULL; + } + + return get_zone_conn_by_id(i->second); +} + +bool RGWSI_Zone::find_zone_id_by_name(const string& name, string *id) { + auto i = zone_id_by_name.find(name); + if (i == zone_id_by_name.end()) { + return false; + } + *id = i->second; + return true; +} + +bool 
RGWSI_Zone::need_to_log_data() const +{ + return zone_public_config->log_data; +} + +bool RGWSI_Zone::is_meta_master() const +{ + if (!zonegroup->is_master_zonegroup()) { + return false; + } + + return (zonegroup->master_zone == zone_public_config->id); +} + +bool RGWSI_Zone::need_to_log_metadata() const +{ + return is_meta_master() && + (zonegroup->zones.size() > 1 || current_period->is_multi_zonegroups_with_zones()); +} + +bool RGWSI_Zone::can_reshard() const +{ + return current_period->get_id().empty() || + (zonegroup->zones.size() == 1 && current_period->is_single_zonegroup()); +} + +/** + * Check to see if the bucket metadata could be synced + * bucket: the bucket to check + * Returns false is the bucket is not synced + */ +bool RGWSI_Zone::is_syncing_bucket_meta(const rgw_bucket& bucket) +{ + + /* no current period */ + if (current_period->get_id().empty()) { + return false; + } + + /* zonegroup is not master zonegroup */ + if (!zonegroup->is_master_zonegroup()) { + return false; + } + + /* single zonegroup and a single zone */ + if (current_period->is_single_zonegroup() && zonegroup->zones.size() == 1) { + return false; + } + + /* zone is not master */ + if (zonegroup->master_zone.compare(zone_public_config->id) != 0) { + return false; + } + + return true; +} + + +int RGWSI_Zone::select_new_bucket_location(const RGWUserInfo& user_info, const string& zonegroup_id, + const rgw_placement_rule& request_rule, + rgw_placement_rule *pselected_rule_name, RGWZonePlacementInfo *rule_info) +{ + /* first check that zonegroup exists within current period. */ + RGWZoneGroup zonegroup; + int ret = get_zonegroup(zonegroup_id, zonegroup); + if (ret < 0) { + ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl; + return ret; + } + + const rgw_placement_rule *used_rule; + + /* find placement rule. 
Hierarchy: request rule > user default rule > zonegroup default rule */ + std::map::const_iterator titer; + + if (!request_rule.name.empty()) { + used_rule = &request_rule; + titer = zonegroup.placement_targets.find(request_rule.name); + if (titer == zonegroup.placement_targets.end()) { + ldout(cct, 0) << "could not find requested placement id " << request_rule + << " within zonegroup " << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + } else if (!user_info.default_placement.name.empty()) { + used_rule = &user_info.default_placement; + titer = zonegroup.placement_targets.find(user_info.default_placement.name); + if (titer == zonegroup.placement_targets.end()) { + ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement + << " within zonegroup " << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + } else { + if (zonegroup.default_placement.name.empty()) { // zonegroup default rule as fallback, it should not be empty. + ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." 
<< dendl; + return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION; + } else { + used_rule = &zonegroup.default_placement; + titer = zonegroup.placement_targets.find(zonegroup.default_placement.name); + if (titer == zonegroup.placement_targets.end()) { + ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement + << " within zonegroup " << dendl; + return -ERR_INVALID_LOCATION_CONSTRAINT; + } + } + } + + /* now check tag for the rule, whether user is permitted to use rule */ + const auto& target_rule = titer->second; + if (!target_rule.user_permitted(user_info.placement_tags)) { + ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl; + return -EPERM; + } + + const string *storage_class = &request_rule.storage_class; + + if (storage_class->empty()) { + storage_class = &used_rule->storage_class; + } + + rgw_placement_rule rule(titer->first, *storage_class); + + if (pselected_rule_name) { + *pselected_rule_name = rule; + } + + return select_bucket_location_by_rule(rule, rule_info); +} + +int RGWSI_Zone::select_bucket_location_by_rule(const rgw_placement_rule& location_rule, RGWZonePlacementInfo *rule_info) +{ + if (location_rule.name.empty()) { + /* we can only reach here if we're trying to set a bucket location from a bucket + * created on a different zone, using a legacy / default pool configuration + */ + if (rule_info) { + return select_legacy_bucket_placement(rule_info); + } + + return 0; + } + + /* + * make sure that zone has this rule configured. We're + * checking it for the local zone, because that's where this bucket object is going to + * reside. + */ + auto piter = zone_params->placement_pools.find(location_rule.name); + if (piter == zone_params->placement_pools.end()) { + /* couldn't find, means we cannot really place data for this bucket in this zone */ + ldout(cct, 0) << "ERROR: This zone does not contain placement rule " + << location_rule << " present in the zonegroup!" 
<< dendl; + return -EINVAL; + } + + auto storage_class = location_rule.get_storage_class(); + if (!piter->second.storage_class_exists(storage_class)) { + ldout(cct, 5) << "requested storage class does not exist: " << storage_class << dendl; + return -EINVAL; + } + + + RGWZonePlacementInfo& placement_info = piter->second; + + if (rule_info) { + *rule_info = placement_info; + } + + return 0; +} + +int RGWSI_Zone::select_bucket_placement(const RGWUserInfo& user_info, const string& zonegroup_id, + const rgw_placement_rule& placement_rule, + rgw_placement_rule *pselected_rule, RGWZonePlacementInfo *rule_info) +{ + if (!zone_params->placement_pools.empty()) { + return select_new_bucket_location(user_info, zonegroup_id, placement_rule, + pselected_rule, rule_info); + } + + if (pselected_rule) { + pselected_rule->clear(); + } + + if (rule_info) { + return select_legacy_bucket_placement(rule_info); + } + + return 0; +} + +int RGWSI_Zone::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info) +{ + bufferlist map_bl; + map m; + string pool_name; + bool write_map = false; + + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + + int ret = sysobj.rop().read(&map_bl); + if (ret < 0) { + goto read_omap; + } + + try { + auto iter = map_bl.cbegin(); + decode(m, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl; + } + +read_omap: + if (m.empty()) { + ret = sysobj.omap().get_all(&m); + + write_map = true; + } + + if (ret < 0 || m.empty()) { + vector pools; + string s = string("default.") + default_storage_pool_suffix; + pools.push_back(rgw_pool(s)); + vector retcodes; + bufferlist bl; + ret = rados_svc->pool().create(pools, &retcodes); + if (ret < 0) + return ret; + ret = sysobj.omap().set(s, bl); + if (ret < 0) + return ret; + m[s] = bl; + } + + if (write_map) { + bufferlist new_bl; + encode(m, new_bl); + ret = 
sysobj.wop().write(new_bl); + if (ret < 0) { + ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl; + } + } + + auto miter = m.begin(); + if (m.size() > 1) { + // choose a pool at random + auto r = ceph::util::generate_random_number(0, m.size() - 1); + std::advance(miter, r); + } + pool_name = miter->first; + + rgw_pool pool = pool_name; + + rule_info->storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &pool, nullptr); + rule_info->data_extra_pool = pool_name; + rule_info->index_pool = pool_name; + rule_info->index_type = RGWBIType_Normal; + + return 0; +} + +int RGWSI_Zone::update_placement_map() +{ + bufferlist header; + map m; + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + + int ret = sysobj.omap().get_all(&m); + if (ret < 0) + return ret; + + bufferlist new_bl; + encode(m, new_bl); + ret = sysobj.wop().write(new_bl); + if (ret < 0) { + ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl; + } + + return ret; +} + +int RGWSI_Zone::add_bucket_placement(const rgw_pool& new_pool) +{ + int ret = rados_svc->pool(new_pool).lookup(); + if (ret < 0) { // DNE, or something + return ret; + } + + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + + bufferlist empty_bl; + ret = sysobj.omap().set(new_pool.to_str(), empty_bl); + + // don't care about return value + update_placement_map(); + + return ret; +} + +int RGWSI_Zone::remove_bucket_placement(const rgw_pool& old_pool) +{ + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + + int ret = sysobj.omap().del(old_pool.to_str()); + + // don't care about return value + update_placement_map(); + + return ret; +} + +int RGWSI_Zone::list_placement_set(set& names) +{ + bufferlist 
header; + map m; + + rgw_raw_obj obj(zone_params->domain_root, avail_pools); + auto obj_ctx = sysobj_svc->init_obj_ctx(); + auto sysobj = obj_ctx.get_obj(obj); + int ret = sysobj.omap().get_all(&m); + if (ret < 0) + return ret; + + names.clear(); + map::iterator miter; + for (miter = m.begin(); miter != m.end(); ++miter) { + names.insert(rgw_pool(miter->first)); + } + + return names.size(); +} + +bool RGWSI_Zone::get_redirect_zone_endpoint(string *endpoint) +{ + if (zone_public_config->redirect_zone.empty()) { + return false; + } + + auto iter = zone_conn_map.find(zone_public_config->redirect_zone); + if (iter == zone_conn_map.end()) { + ldout(cct, 0) << "ERROR: cannot find entry for redirect zone: " << zone_public_config->redirect_zone << dendl; + return false; + } + + RGWRESTConn *conn = iter->second; + + int ret = conn->get_url(*endpoint); + if (ret < 0) { + ldout(cct, 0) << "ERROR: redirect zone, conn->get_endpoint() returned ret=" << ret << dendl; + return false; + } + + return true; +} + diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h new file mode 100644 index 00000000..8c8dbeba --- /dev/null +++ b/src/rgw/services/svc_zone.h @@ -0,0 +1,134 @@ +#ifndef CEPH_RGW_SERVICES_ZONE_H +#define CEPH_RGW_SERVICES_ZONE_H + + +#include "rgw/rgw_service.h" + + +class RGWSI_RADOS; +class RGWSI_SysObj; +class RGWSI_SyncModules; + +class RGWRealm; +class RGWZoneGroup; +class RGWZone; +class RGWZoneParams; +class RGWPeriod; +class RGWZonePlacementInfo; + +class RGWRESTConn; + +class RGWSI_Zone : public RGWServiceInstance +{ + friend struct RGWServices_Def; + + RGWSI_SysObj *sysobj_svc{nullptr}; + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_SyncModules *sync_modules_svc{nullptr}; + + RGWRealm *realm{nullptr}; + RGWZoneGroup *zonegroup{nullptr}; + RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. 
*/ + RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */ + RGWPeriod *current_period{nullptr}; + uint32_t zone_short_id{0}; + bool writeable_zone{false}; + + RGWRESTConn *rest_master_conn{nullptr}; + map zone_conn_map; + std::vector data_sync_source_zones; + map zone_data_notify_to_map; + map zonegroup_conn_map; + + map zone_id_by_name; + map zone_by_id; + + void init(RGWSI_SysObj *_sysobj_svc, + RGWSI_RADOS *_rados_svc, + RGWSI_SyncModules *_sync_modules_svc); + int do_start() override; + void shutdown() override; + + int replace_region_with_zonegroup(); + int init_zg_from_period(bool *initialized); + int init_zg_from_local(bool *creating_defaults); + int convert_regionmap(); + + int update_placement_map(); +public: + RGWSI_Zone(CephContext *cct); + ~RGWSI_Zone(); + + const RGWZoneParams& get_zone_params() const; + const RGWPeriod& get_current_period() const; + const RGWRealm& get_realm() const; + const RGWZoneGroup& get_zonegroup() const; + int get_zonegroup(const string& id, RGWZoneGroup& zonegroup) const; + const RGWZone& get_zone() const; + + const string& zone_name(); + const string& zone_id(); + uint32_t get_zone_short_id() const; + + const string& get_current_period_id(); + bool has_zonegroup_api(const std::string& api) const; + + bool zone_is_writeable(); + bool zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const; + bool get_redirect_zone_endpoint(string *endpoint); + + RGWRESTConn *get_master_conn() { + return rest_master_conn; + } + + map& get_zonegroup_conn_map() { + return zonegroup_conn_map; + } + + map& get_zone_conn_map() { + return zone_conn_map; + } + + std::vector& get_data_sync_source_zones() { + return data_sync_source_zones; + } + + map& get_zone_data_notify_to_map() { + return zone_data_notify_to_map; + } + + bool find_zone_by_id(const string& id, RGWZone **zone); + + RGWRESTConn *get_zone_conn_by_id(const string& id); + RGWRESTConn *get_zone_conn_by_name(const string& name); + bool 
find_zone_id_by_name(const string& name, string *id); + + int select_bucket_placement(const RGWUserInfo& user_info, const string& zonegroup_id, + const rgw_placement_rule& rule, + rgw_placement_rule *pselected_rule, RGWZonePlacementInfo *rule_info); + int select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info); + int select_new_bucket_location(const RGWUserInfo& user_info, const string& zonegroup_id, + const rgw_placement_rule& rule, + rgw_placement_rule *pselected_rule_name, RGWZonePlacementInfo *rule_info); + int select_bucket_location_by_rule(const rgw_placement_rule& location_rule, RGWZonePlacementInfo *rule_info); + + int add_bucket_placement(const rgw_pool& new_pool); + int remove_bucket_placement(const rgw_pool& old_pool); + int list_placement_set(set& names); + + bool is_meta_master() const; + + bool need_to_log_data() const; + bool need_to_log_metadata() const; + bool can_reshard() const; + bool is_syncing_bucket_meta(const rgw_bucket& bucket); + + int list_zonegroups(list& zonegroups); + int list_regions(list& regions); + int list_zones(list& zones); + int list_realms(list& realms); + int list_periods(list& periods); + int list_periods(const string& current_period, list& periods); +}; + +#endif diff --git a/src/rgw/services/svc_zone_utils.cc b/src/rgw/services/svc_zone_utils.cc new file mode 100644 index 00000000..ef9c9c88 --- /dev/null +++ b/src/rgw/services/svc_zone_utils.cc @@ -0,0 +1,59 @@ +#include "svc_zone_utils.h" +#include "svc_rados.h" +#include "svc_zone.h" + +#include "rgw/rgw_zone.h" + +int RGWSI_ZoneUtils::do_start() +{ + init_unique_trans_id_deps(); + + return 0; +} + +string RGWSI_ZoneUtils::gen_host_id() { + /* uint64_t needs 16, two '-' separators and a trailing null */ + const string& zone_name = zone_svc->get_zone().name; + const string& zonegroup_name = zone_svc->get_zonegroup().get_name(); + char charbuf[16 + zone_name.size() + zonegroup_name.size() + 2 + 1]; + snprintf(charbuf, sizeof(charbuf), "%llx-%s-%s", (unsigned long 
long)rados_svc->instance_id(), zone_name.c_str(), zonegroup_name.c_str()); + return string(charbuf); +} + +string RGWSI_ZoneUtils::unique_id(uint64_t unique_num) +{ + char buf[32]; + snprintf(buf, sizeof(buf), ".%llu.%llu", (unsigned long long)rados_svc->instance_id(), (unsigned long long)unique_num); + string s = zone_svc->get_zone_params().get_id() + buf; + return s; +} + +void RGWSI_ZoneUtils::init_unique_trans_id_deps() { + char buf[16 + 2 + 1]; /* uint64_t needs 16, 2 hyphens add further 2 */ + + snprintf(buf, sizeof(buf), "-%llx-", (unsigned long long)rados_svc->instance_id()); + url_encode(string(buf) + zone_svc->get_zone().name, trans_id_suffix); +} + +/* In order to preserve compatibility with Swift API, transaction ID + * should contain at least 32 characters satisfying following spec: + * - first 21 chars must be in range [0-9a-f]. Swift uses this + * space for storing fragment of UUID obtained through a call to + * uuid4() function of Python's uuid module; + * - char no. 22 must be a hyphen; + * - at least 10 next characters constitute hex-formatted timestamp + * padded with zeroes if necessary. All bytes must be in [0-9a-f] + * range; + * - last, optional part of transaction ID is any url-encoded string + * without restriction on length. 
*/ +string RGWSI_ZoneUtils::unique_trans_id(const uint64_t unique_num) { + char buf[41]; /* 2 + 21 + 1 + 16 (timestamp can consume up to 16) + 1 */ + time_t timestamp = time(NULL); + + snprintf(buf, sizeof(buf), "tx%021llx-%010llx", + (unsigned long long)unique_num, + (unsigned long long)timestamp); + + return string(buf) + trans_id_suffix; +} + diff --git a/src/rgw/services/svc_zone_utils.h b/src/rgw/services/svc_zone_utils.h new file mode 100644 index 00000000..158d2a92 --- /dev/null +++ b/src/rgw/services/svc_zone_utils.h @@ -0,0 +1,39 @@ +#ifndef CEPH_RGW_SERVICES_ZONEUTILS_H +#define CEPH_RGW_SERVICES_ZONEUTILS_H + + +#include "rgw/rgw_service.h" + + +class RGWSI_RADOS; +class RGWSI_Zone; + +class RGWSI_ZoneUtils : public RGWServiceInstance +{ + friend struct RGWServices_Def; + + RGWSI_RADOS *rados_svc{nullptr}; + RGWSI_Zone *zone_svc{nullptr}; + + string trans_id_suffix; + + void init(RGWSI_RADOS *_rados_svc, + RGWSI_Zone *_zone_svc) { + rados_svc = _rados_svc; + zone_svc = _zone_svc; + } + + int do_start() override; + + void init_unique_trans_id_deps(); + +public: + RGWSI_ZoneUtils(CephContext *cct): RGWServiceInstance(cct) {} + + string gen_host_id(); + string unique_id(uint64_t unique_num); + + string unique_trans_id(const uint64_t unique_num); +}; + +#endif -- cgit v1.2.3