author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/crimson
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip

Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson')
-rw-r--r--src/crimson/CMakeLists.txt203
-rw-r--r--src/crimson/admin/CMakeLists.txt9
-rw-r--r--src/crimson/admin/admin_socket.cc550
-rw-r--r--src/crimson/admin/admin_socket.h187
-rw-r--r--src/crimson/admin/osd_admin.cc574
-rw-r--r--src/crimson/admin/osd_admin.h28
-rw-r--r--src/crimson/admin/pg_commands.cc167
-rw-r--r--src/crimson/admin/pg_commands.h10
-rw-r--r--src/crimson/auth/AuthClient.h71
-rw-r--r--src/crimson/auth/AuthServer.h42
-rw-r--r--src/crimson/auth/DummyAuth.h79
-rw-r--r--src/crimson/auth/KeyRing.cc79
-rw-r--r--src/crimson/auth/KeyRing.h15
-rw-r--r--src/crimson/common/assert.cc81
-rw-r--r--src/crimson/common/auth_handler.h17
-rw-r--r--src/crimson/common/buffer_io.cc57
-rw-r--r--src/crimson/common/buffer_io.h21
-rw-r--r--src/crimson/common/condition_variable.h43
-rw-r--r--src/crimson/common/config_proxy.cc93
-rw-r--r--src/crimson/common/config_proxy.h222
-rw-r--r--src/crimson/common/errorator-loop.h91
-rw-r--r--src/crimson/common/errorator.h1358
-rw-r--r--src/crimson/common/exception.h54
-rw-r--r--src/crimson/common/fatal_signal.cc172
-rw-r--r--src/crimson/common/fatal_signal.h21
-rw-r--r--src/crimson/common/fixed_kv_node_layout.h730
-rw-r--r--src/crimson/common/formatter.cc40
-rw-r--r--src/crimson/common/formatter.h13
-rw-r--r--src/crimson/common/gated.h55
-rw-r--r--src/crimson/common/interruptible_future.h1600
-rw-r--r--src/crimson/common/layout.h737
-rw-r--r--src/crimson/common/local_shared_foreign_ptr.h245
-rw-r--r--src/crimson/common/log.cc21
-rw-r--r--src/crimson/common/log.h88
-rw-r--r--src/crimson/common/logclient.cc364
-rw-r--r--src/crimson/common/logclient.h232
-rw-r--r--src/crimson/common/operation.cc75
-rw-r--r--src/crimson/common/operation.h776
-rw-r--r--src/crimson/common/perf_counters_collection.cc41
-rw-r--r--src/crimson/common/perf_counters_collection.h49
-rw-r--r--src/crimson/common/shared_lru.h180
-rw-r--r--src/crimson/common/simple_lru.h141
-rw-r--r--src/crimson/common/smp_helpers.h92
-rw-r--r--src/crimson/common/throttle.cc64
-rw-r--r--src/crimson/common/throttle.h43
-rw-r--r--src/crimson/common/tmap_helpers.cc131
-rw-r--r--src/crimson/common/tmap_helpers.h40
-rw-r--r--src/crimson/common/tri_mutex.cc225
-rw-r--r--src/crimson/common/tri_mutex.h156
-rw-r--r--src/crimson/common/type_helpers.h8
-rw-r--r--src/crimson/common/utility.h38
-rw-r--r--src/crimson/crush/CrushLocation.cc186
-rw-r--r--src/crimson/crush/CrushLocation.h37
-rw-r--r--src/crimson/mgr/client.cc176
-rw-r--r--src/crimson/mgr/client.h71
-rw-r--r--src/crimson/mon/MonClient.cc1162
-rw-r--r--src/crimson/mon/MonClient.h218
-rw-r--r--src/crimson/net/Connection.h147
-rw-r--r--src/crimson/net/Dispatcher.h62
-rw-r--r--src/crimson/net/Errors.cc51
-rw-r--r--src/crimson/net/Errors.h53
-rw-r--r--src/crimson/net/FrameAssemblerV2.cc461
-rw-r--r--src/crimson/net/FrameAssemblerV2.h257
-rw-r--r--src/crimson/net/Fwd.h52
-rw-r--r--src/crimson/net/Interceptor.h186
-rw-r--r--src/crimson/net/Messenger.cc19
-rw-r--r--src/crimson/net/Messenger.h130
-rw-r--r--src/crimson/net/ProtocolV2.cc2348
-rw-r--r--src/crimson/net/ProtocolV2.h328
-rw-r--r--src/crimson/net/Socket.cc523
-rw-r--r--src/crimson/net/Socket.h201
-rw-r--r--src/crimson/net/SocketConnection.cc220
-rw-r--r--src/crimson/net/SocketConnection.h236
-rw-r--r--src/crimson/net/SocketMessenger.cc485
-rw-r--r--src/crimson/net/SocketMessenger.h192
-rw-r--r--src/crimson/net/chained_dispatchers.cc114
-rw-r--r--src/crimson/net/chained_dispatchers.h39
-rw-r--r--src/crimson/net/io_handler.cc1287
-rw-r--r--src/crimson/net/io_handler.h623
-rw-r--r--src/crimson/os/CMakeLists.txt16
-rw-r--r--src/crimson/os/alienstore/CMakeLists.txt86
-rw-r--r--src/crimson/os/alienstore/alien_collection.h39
-rw-r--r--src/crimson/os/alienstore/alien_log.cc33
-rw-r--r--src/crimson/os/alienstore/alien_log.h31
-rw-r--r--src/crimson/os/alienstore/alien_store.cc620
-rw-r--r--src/crimson/os/alienstore/alien_store.h133
-rw-r--r--src/crimson/os/alienstore/semaphore.h90
-rw-r--r--src/crimson/os/alienstore/thread_pool.cc98
-rw-r--r--src/crimson/os/alienstore/thread_pool.h184
-rw-r--r--src/crimson/os/cyanstore/CMakeLists.txt7
-rw-r--r--src/crimson/os/cyanstore/cyan_collection.cc78
-rw-r--r--src/crimson/os/cyanstore/cyan_collection.h51
-rw-r--r--src/crimson/os/cyanstore/cyan_object.cc89
-rw-r--r--src/crimson/os/cyanstore/cyan_object.h45
-rw-r--r--src/crimson/os/cyanstore/cyan_store.cc952
-rw-r--r--src/crimson/os/cyanstore/cyan_store.h219
-rw-r--r--src/crimson/os/futurized_collection.h37
-rw-r--r--src/crimson/os/futurized_store.cc36
-rw-r--r--src/crimson/os/futurized_store.h195
-rw-r--r--src/crimson/os/seastore/CMakeLists.txt79
-rw-r--r--src/crimson/os/seastore/async_cleaner.cc1817
-rw-r--r--src/crimson/os/seastore/async_cleaner.h1761
-rw-r--r--src/crimson/os/seastore/backref/backref_tree_node.cc14
-rw-r--r--src/crimson/os/seastore/backref/backref_tree_node.h137
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.cc609
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.h121
-rw-r--r--src/crimson/os/seastore/backref_manager.cc18
-rw-r--r--src/crimson/os/seastore/backref_manager.h152
-rw-r--r--src/crimson/os/seastore/btree/btree_range_pin.cc27
-rw-r--r--src/crimson/os/seastore/btree/btree_range_pin.h204
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_btree.h2251
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_node.cc12
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_node.h1220
-rw-r--r--src/crimson/os/seastore/cache.cc2040
-rw-r--r--src/crimson/os/seastore/cache.h1688
-rw-r--r--src/crimson/os/seastore/cached_extent.cc176
-rw-r--r--src/crimson/os/seastore/cached_extent.h1304
-rw-r--r--src/crimson/os/seastore/collection_manager.cc14
-rw-r--r--src/crimson/os/seastore/collection_manager.h84
-rw-r--r--src/crimson/os/seastore/collection_manager/collection_flat_node.cc120
-rw-r--r--src/crimson/os/seastore/collection_manager/collection_flat_node.h184
-rw-r--r--src/crimson/os/seastore/collection_manager/flat_collection_manager.cc133
-rw-r--r--src/crimson/os/seastore/collection_manager/flat_collection_manager.h41
-rw-r--r--src/crimson/os/seastore/device.cc51
-rw-r--r--src/crimson/os/seastore/device.h175
-rw-r--r--src/crimson/os/seastore/extent_placement_manager.cc808
-rw-r--r--src/crimson/os/seastore/extent_placement_manager.h915
-rw-r--r--src/crimson/os/seastore/extentmap_manager.cc33
-rw-r--r--src/crimson/os/seastore/journal.cc25
-rw-r--r--src/crimson/os/seastore/journal.h122
-rw-r--r--src/crimson/os/seastore/journal/circular_bounded_journal.cc387
-rw-r--r--src/crimson/os/seastore/journal/circular_bounded_journal.h224
-rw-r--r--src/crimson/os/seastore/journal/circular_journal_space.cc249
-rw-r--r--src/crimson/os/seastore/journal/circular_journal_space.h261
-rw-r--r--src/crimson/os/seastore/journal/record_submitter.cc533
-rw-r--r--src/crimson/os/seastore/journal/record_submitter.h347
-rw-r--r--src/crimson/os/seastore/journal/segment_allocator.cc283
-rw-r--r--src/crimson/os/seastore/journal/segment_allocator.h131
-rw-r--r--src/crimson/os/seastore/journal/segmented_journal.cc433
-rw-r--r--src/crimson/os/seastore/journal/segmented_journal.h105
-rw-r--r--src/crimson/os/seastore/lba_manager.cc31
-rw-r--r--src/crimson/os/seastore/lba_manager.h237
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc761
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h396
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc55
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h294
-rw-r--r--src/crimson/os/seastore/logging.h30
-rw-r--r--src/crimson/os/seastore/object_data_handler.cc1638
-rw-r--r--src/crimson/os/seastore/object_data_handler.h156
-rw-r--r--src/crimson/os/seastore/omap_manager.cc42
-rw-r--r--src/crimson/os/seastore/omap_manager.h210
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc293
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h111
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h122
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc738
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h250
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_types.h157
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h1550
-rw-r--r--src/crimson/os/seastore/onode.cc18
-rw-r--r--src/crimson/os/seastore/onode.h89
-rw-r--r--src/crimson/os/seastore/onode_manager.h86
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc183
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h171
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h196
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node.cc2282
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node.h743
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h55
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h619
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc32
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h105
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h196
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc90
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h223
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h67
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h113
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc80
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h270
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h948
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h138
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h145
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc202
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h193
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc34
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h910
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc420
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h232
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc153
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h406
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h2488
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h442
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc329
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h368
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/super.cc28
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/super.h143
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/tree.h387
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h565
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/value.cc164
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/value.h337
-rw-r--r--src/crimson/os/seastore/ordering_handle.h181
-rw-r--r--src/crimson/os/seastore/random_block_manager.cc21
-rw-r--r--src/crimson/os/seastore/random_block_manager.h176
-rw-r--r--src/crimson/os/seastore/random_block_manager/avlallocator.cc201
-rw-r--r--src/crimson/os/seastore/random_block_manager/avlallocator.h174
-rw-r--r--src/crimson/os/seastore/random_block_manager/block_rb_manager.cc176
-rw-r--r--src/crimson/os/seastore/random_block_manager/block_rb_manager.h142
-rw-r--r--src/crimson/os/seastore/random_block_manager/extent_allocator.h75
-rw-r--r--src/crimson/os/seastore/random_block_manager/nvme_block_device.cc280
-rw-r--r--src/crimson/os/seastore/random_block_manager/nvme_block_device.h360
-rw-r--r--src/crimson/os/seastore/random_block_manager/rbm_device.cc271
-rw-r--r--src/crimson/os/seastore/random_block_manager/rbm_device.h261
-rw-r--r--src/crimson/os/seastore/randomblock_manager_group.h71
-rw-r--r--src/crimson/os/seastore/record_scanner.cc239
-rw-r--r--src/crimson/os/seastore/record_scanner.h83
-rw-r--r--src/crimson/os/seastore/root_block.cc27
-rw-r--r--src/crimson/os/seastore/root_block.h109
-rw-r--r--src/crimson/os/seastore/seastore.cc2135
-rw-r--r--src/crimson/os/seastore/seastore.h531
-rw-r--r--src/crimson/os/seastore/seastore_types.cc874
-rw-r--r--src/crimson/os/seastore/seastore_types.h2254
-rw-r--r--src/crimson/os/seastore/segment_manager.cc107
-rw-r--r--src/crimson/os/seastore/segment_manager.h216
-rw-r--r--src/crimson/os/seastore/segment_manager/block.cc810
-rw-r--r--src/crimson/os/seastore/segment_manager/block.h262
-rw-r--r--src/crimson/os/seastore/segment_manager/ephemeral.cc294
-rw-r--r--src/crimson/os/seastore/segment_manager/ephemeral.h166
-rw-r--r--src/crimson/os/seastore/segment_manager/zbd.cc823
-rw-r--r--src/crimson/os/seastore/segment_manager/zbd.h246
-rw-r--r--src/crimson/os/seastore/segment_manager_group.cc171
-rw-r--r--src/crimson/os/seastore/segment_manager_group.h150
-rw-r--r--src/crimson/os/seastore/segment_seq_allocator.h50
-rw-r--r--src/crimson/os/seastore/transaction.cc8
-rw-r--r--src/crimson/os/seastore/transaction.h653
-rw-r--r--src/crimson/os/seastore/transaction_manager.cc759
-rw-r--r--src/crimson/os/seastore/transaction_manager.h928
-rw-r--r--src/crimson/osd/CMakeLists.txt72
-rw-r--r--src/crimson/osd/acked_peers.h14
-rw-r--r--src/crimson/osd/backfill_facades.h73
-rw-r--r--src/crimson/osd/backfill_state.cc558
-rw-r--r--src/crimson/osd/backfill_state.h382
-rw-r--r--src/crimson/osd/ec_backend.cc37
-rw-r--r--src/crimson/osd/ec_backend.h41
-rw-r--r--src/crimson/osd/exceptions.h46
-rw-r--r--src/crimson/osd/heartbeat.cc819
-rw-r--r--src/crimson/osd/heartbeat.h461
-rw-r--r--src/crimson/osd/lsan_suppressions.cc20
-rw-r--r--src/crimson/osd/main.cc259
-rw-r--r--src/crimson/osd/main_config_bootstrap_helpers.cc265
-rw-r--r--src/crimson/osd/main_config_bootstrap_helpers.h99
-rw-r--r--src/crimson/osd/objclass.cc584
-rw-r--r--src/crimson/osd/object_context.cc85
-rw-r--r--src/crimson/osd/object_context.h276
-rw-r--r--src/crimson/osd/object_context_loader.cc232
-rw-r--r--src/crimson/osd/object_context_loader.h87
-rw-r--r--src/crimson/osd/ops_executer.cc1461
-rw-r--r--src/crimson/osd/ops_executer.h629
-rw-r--r--src/crimson/osd/osd.cc1357
-rw-r--r--src/crimson/osd/osd.h251
-rw-r--r--src/crimson/osd/osd_connection_priv.h27
-rw-r--r--src/crimson/osd/osd_meta.cc98
-rw-r--r--src/crimson/osd/osd_meta.h60
-rw-r--r--src/crimson/osd/osd_operation.cc227
-rw-r--r--src/crimson/osd/osd_operation.h281
-rw-r--r--src/crimson/osd/osd_operation_external_tracking.h307
-rw-r--r--src/crimson/osd/osd_operations/background_recovery.cc207
-rw-r--r--src/crimson/osd/osd_operations/background_recovery.h144
-rw-r--r--src/crimson/osd/osd_operations/client_request.cc388
-rw-r--r--src/crimson/osd/osd_operations/client_request.h281
-rw-r--r--src/crimson/osd/osd_operations/client_request_common.cc64
-rw-r--r--src/crimson/osd/osd_operations/client_request_common.h20
-rw-r--r--src/crimson/osd/osd_operations/common/pg_pipeline.h31
-rw-r--r--src/crimson/osd/osd_operations/internal_client_request.cc130
-rw-r--r--src/crimson/osd/osd_operations/internal_client_request.h68
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request.cc79
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request.h81
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request_reply.cc68
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request_reply.h79
-rw-r--r--src/crimson/osd/osd_operations/osdop_params.h22
-rw-r--r--src/crimson/osd/osd_operations/peering_event.cc190
-rw-r--r--src/crimson/osd/osd_operations/peering_event.h207
-rw-r--r--src/crimson/osd/osd_operations/pg_advance_map.cc130
-rw-r--r--src/crimson/osd/osd_operations/pg_advance_map.h61
-rw-r--r--src/crimson/osd/osd_operations/recovery_subrequest.cc46
-rw-r--r--src/crimson/osd/osd_operations/recovery_subrequest.h81
-rw-r--r--src/crimson/osd/osd_operations/replicated_request.cc80
-rw-r--r--src/crimson/osd/osd_operations/replicated_request.h80
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.cc569
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.h210
-rw-r--r--src/crimson/osd/osdmap_gate.cc86
-rw-r--r--src/crimson/osd/osdmap_gate.h83
-rw-r--r--src/crimson/osd/osdmap_service.h21
-rw-r--r--src/crimson/osd/pg.cc1544
-rw-r--r--src/crimson/osd/pg.h833
-rw-r--r--src/crimson/osd/pg_activation_blocker.cc36
-rw-r--r--src/crimson/osd/pg_activation_blocker.h35
-rw-r--r--src/crimson/osd/pg_backend.cc1811
-rw-r--r--src/crimson/osd/pg_backend.h448
-rw-r--r--src/crimson/osd/pg_interval_interrupt_condition.cc43
-rw-r--r--src/crimson/osd/pg_interval_interrupt_condition.h56
-rw-r--r--src/crimson/osd/pg_map.cc102
-rw-r--r--src/crimson/osd/pg_map.h201
-rw-r--r--src/crimson/osd/pg_meta.cc110
-rw-r--r--src/crimson/osd/pg_meta.h20
-rw-r--r--src/crimson/osd/pg_recovery.cc569
-rw-r--r--src/crimson/osd/pg_recovery.h118
-rw-r--r--src/crimson/osd/pg_recovery_listener.h39
-rw-r--r--src/crimson/osd/pg_shard_manager.cc108
-rw-r--r--src/crimson/osd/pg_shard_manager.h390
-rw-r--r--src/crimson/osd/recovery_backend.cc328
-rw-r--r--src/crimson/osd/recovery_backend.h233
-rw-r--r--src/crimson/osd/replicated_backend.cc174
-rw-r--r--src/crimson/osd/replicated_backend.h61
-rw-r--r--src/crimson/osd/replicated_recovery_backend.cc1182
-rw-r--r--src/crimson/osd/replicated_recovery_backend.h169
-rw-r--r--src/crimson/osd/scheduler/mclock_scheduler.cc165
-rw-r--r--src/crimson/osd/scheduler/mclock_scheduler.h125
-rw-r--r--src/crimson/osd/scheduler/scheduler.cc181
-rw-r--r--src/crimson/osd/scheduler/scheduler.h82
-rw-r--r--src/crimson/osd/shard_services.cc761
-rw-r--r--src/crimson/osd/shard_services.h589
-rw-r--r--src/crimson/osd/state.h130
-rw-r--r--src/crimson/osd/stop_signal.h83
-rw-r--r--src/crimson/osd/watch.cc354
-rw-r--r--src/crimson/osd/watch.h256
-rw-r--r--src/crimson/tools/CMakeLists.txt22
-rw-r--r--src/crimson/tools/perf_async_msgr.cc151
-rw-r--r--src/crimson/tools/perf_crimson_msgr.cc1222
-rw-r--r--src/crimson/tools/perf_staged_fltree.cc178
-rw-r--r--src/crimson/tools/store_nbd/block_driver.cc19
-rw-r--r--src/crimson/tools/store_nbd/block_driver.h134
-rw-r--r--src/crimson/tools/store_nbd/fs_driver.cc310
-rw-r--r--src/crimson/tools/store_nbd/fs_driver.h72
-rw-r--r--src/crimson/tools/store_nbd/store-nbd.cc456
-rw-r--r--src/crimson/tools/store_nbd/tm_driver.cc222
-rw-r--r--src/crimson/tools/store_nbd/tm_driver.h56
334 files changed, 105416 insertions, 0 deletions
diff --git a/src/crimson/CMakeLists.txt b/src/crimson/CMakeLists.txt
new file mode 100644
index 000000000..e2b37fac9
--- /dev/null
+++ b/src/crimson/CMakeLists.txt
@@ -0,0 +1,203 @@
+add_library(crimson::cflags INTERFACE IMPORTED)
+set(crimson_cflag_definitions "WITH_SEASTAR=1")
+# disable concepts to address https://github.com/boostorg/asio/issues/312
+if((CMAKE_CXX_COMPILER_ID STREQUAL GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) OR
+ (CMAKE_CXX_COMPILER_ID STREQUAL Clang))
+ list(APPEND crimson_cflag_definitions
+ "BOOST_ASIO_DISABLE_CONCEPTS")
+endif()
+set_target_properties(crimson::cflags PROPERTIES
+ INTERFACE_COMPILE_DEFINITIONS "${crimson_cflag_definitions}"
+ INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-Wno-non-virtual-dtor>
+ INTERFACE_LINK_LIBRARIES Seastar::seastar)
+
+set(crimson_common_srcs
+ common/assert.cc
+ common/buffer_io.cc
+ common/config_proxy.cc
+ common/fatal_signal.cc
+ common/formatter.cc
+ common/perf_counters_collection.cc
+ common/log.cc
+ common/logclient.cc
+ common/operation.cc
+ common/throttle.cc
+ common/tmap_helpers.cc
+ common/tri_mutex.cc
+ crush/CrushLocation.cc)
+
+# the specialized version of ceph-common, where
+# - the logging is sent to Seastar backend
+# - and the template parameter of lock_policy is SINGLE
+add_library(crimson-common STATIC
+ ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/common/admin_socket_client.cc
+ ${PROJECT_SOURCE_DIR}/src/common/bit_str.cc
+ ${PROJECT_SOURCE_DIR}/src/common/bloom_filter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/buffer.cc
+ ${PROJECT_SOURCE_DIR}/src/common/buffer_seastar.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_argparse.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_hash.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_time.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_strings.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_releases.cc
+ ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc
+ ${PROJECT_SOURCE_DIR}/src/common/common_init.cc
+ ${PROJECT_SOURCE_DIR}/src/common/compat.cc
+ ${PROJECT_SOURCE_DIR}/src/common/code_environment.cc
+ ${PROJECT_SOURCE_DIR}/src/common/config.cc
+ ${PROJECT_SOURCE_DIR}/src/common/config_values.cc
+ ${PROJECT_SOURCE_DIR}/src/common/dout.cc
+ ${PROJECT_SOURCE_DIR}/src/common/entity_name.cc
+ ${PROJECT_SOURCE_DIR}/src/common/environment.cc
+ ${PROJECT_SOURCE_DIR}/src/common/errno.cc
+ ${PROJECT_SOURCE_DIR}/src/common/escape.cc
+ ${PROJECT_SOURCE_DIR}/src/common/hex.cc
+ ${PROJECT_SOURCE_DIR}/src/common/fs_types.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_json.cc
+ ${PROJECT_SOURCE_DIR}/src/common/histogram.cc
+ ${PROJECT_SOURCE_DIR}/src/common/hobject.cc
+ ${PROJECT_SOURCE_DIR}/src/common/hostname.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ipaddr.cc
+ ${PROJECT_SOURCE_DIR}/src/common/mempool.cc
+ ${PROJECT_SOURCE_DIR}/src/common/options.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters_key.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_histogram.cc
+ ${PROJECT_SOURCE_DIR}/src/common/page.cc
+ ${PROJECT_SOURCE_DIR}/src/common/pick_address.cc
+ ${PROJECT_SOURCE_DIR}/src/common/snap_types.cc
+ ${PROJECT_SOURCE_DIR}/src/common/signal.cc
+ ${PROJECT_SOURCE_DIR}/src/common/str_list.cc
+ ${PROJECT_SOURCE_DIR}/src/common/str_map.cc
+ ${PROJECT_SOURCE_DIR}/src/common/strtol.cc
+ ${PROJECT_SOURCE_DIR}/src/common/reverse.c
+ ${PROJECT_SOURCE_DIR}/src/common/types.cc
+ ${PROJECT_SOURCE_DIR}/src/common/utf8.c
+ ${PROJECT_SOURCE_DIR}/src/common/version.cc
+ ${PROJECT_SOURCE_DIR}/src/common/BackTrace.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ConfUtils.cc
+ ${PROJECT_SOURCE_DIR}/src/common/DecayCounter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/HTMLFormatter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Formatter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Graylog.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Journald.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ostream_temp.cc
+ ${PROJECT_SOURCE_DIR}/src/common/LogEntry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/TextTable.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Thread.cc
+ ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/compressor/Compressor.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/builder.c
+ ${PROJECT_SOURCE_DIR}/src/crush/mapper.c
+ ${PROJECT_SOURCE_DIR}/src/crush/crush.c
+ ${PROJECT_SOURCE_DIR}/src/crush/hash.c
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushWrapper.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushCompiler.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushTester.cc
+ ${PROJECT_SOURCE_DIR}/src/global/global_context.cc
+ ${PROJECT_SOURCE_DIR}/src/global/pidfile.cc
+ ${PROJECT_SOURCE_DIR}/src/librbd/Features.cc
+ ${PROJECT_SOURCE_DIR}/src/librbd/io/IoOperations.cc
+ ${PROJECT_SOURCE_DIR}/src/mgr/ServiceMap.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/inode_backtrace.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/mdstypes.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/cephfs_features.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/FSMap.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/FSMapUser.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/MDSMap.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/msg_types.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/Message.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/PGMap.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/MonCap.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/MonMap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_types.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ECMsgTypes.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/HitSet.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/OSDMap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc
+ ${PROJECT_SOURCE_DIR}/src/xxHash/xxhash.c
+ ${crimson_common_srcs}
+ $<TARGET_OBJECTS:common_mountcephfs_objs>
+ $<TARGET_OBJECTS:common-options-objs>)
+
+target_compile_definitions(crimson-common PRIVATE
+ "CMAKE_INSTALL_LIBDIR=\"${CMAKE_INSTALL_LIBDIR}\""
+ "CEPH_INSTALL_FULL_PKGLIBDIR=\"${CEPH_INSTALL_FULL_PKGLIBDIR}\""
+ "CEPH_INSTALL_DATADIR=\"${CEPH_INSTALL_DATADIR}\"")
+
+set(crimson_common_deps
+ Boost::iostreams
+ Boost::random
+ json_spirit)
+
+set(crimson_common_public_deps crimson::cflags)
+if(WITH_JAEGER)
+ list(APPEND crimson_common_public_deps jaeger_base)
+endif()
+
+if(NOT WITH_SYSTEM_BOOST)
+ list(APPEND crimson_common_deps ${ZLIB_LIBRARIES})
+endif()
+
+target_link_libraries(crimson-common
+ PUBLIC
+ ${crimson_common_public_deps}
+ PRIVATE
+ crc32
+ ${crimson_common_deps}
+ OpenSSL::Crypto)
+
+set(crimson_auth_srcs
+ auth/KeyRing.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthClientHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthMethodList.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthSessionHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/KeyRing.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/RotatingKeyRing.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxAuthorizeHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxClientHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxProtocol.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxSessionHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/none/AuthNoneAuthorizeHandler.cc)
+set(crimson_mgr_srcs
+ mgr/client.cc)
+set(crimson_mon_srcs
+ mon/MonClient.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/MonSub.cc)
+set(crimson_net_srcs
+ ${PROJECT_SOURCE_DIR}/src/msg/async/crypto_onwire.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/async/compression_onwire.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/async/frames_v2.cc
+ net/Errors.cc
+ net/FrameAssemblerV2.cc
+ net/io_handler.cc
+ net/Messenger.cc
+ net/SocketConnection.cc
+ net/SocketMessenger.cc
+ net/Socket.cc
+ net/ProtocolV2.cc
+ net/chained_dispatchers.cc)
+add_library(crimson STATIC
+ ${crimson_auth_srcs}
+ ${crimson_mgr_srcs}
+ ${crimson_mon_srcs}
+ ${crimson_net_srcs})
+target_compile_options(crimson PUBLIC
+ "-ftemplate-backtrace-limit=0")
+set_target_properties(crimson PROPERTIES
+ JOB_POOL_COMPILE heavy_compile_job_pool)
+target_link_libraries(crimson
+ PUBLIC
+ crimson-common
+ crimson::cflags)
+add_subdirectory(admin)
+add_subdirectory(os)
+add_subdirectory(osd)
+add_subdirectory(tools)
diff --git a/src/crimson/admin/CMakeLists.txt b/src/crimson/admin/CMakeLists.txt
new file mode 100644
index 000000000..36a5ae2a9
--- /dev/null
+++ b/src/crimson/admin/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_library(crimson-admin STATIC
+ admin_socket.cc
+ osd_admin.cc
+ pg_commands.cc)
+target_link_libraries(crimson-admin
+ crimson::cflags
+ Boost::MPL)
+add_dependencies(crimson-admin
+ legacy-option-headers)
diff --git a/src/crimson/admin/admin_socket.cc b/src/crimson/admin/admin_socket.cc
new file mode 100644
index 000000000..9db91369a
--- /dev/null
+++ b/src/crimson/admin/admin_socket.cc
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/admin_socket.h"
+
+#include <boost/algorithm/string/join.hpp>
+#include <fmt/format.h>
+#include <fmt/ranges.h>
+#include <seastar/net/api.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "common/options.h"
+#include "common/version.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "crimson/common/log.h"
+#include "crimson/net/Socket.h"
+
+using namespace crimson::common;
+using namespace std::literals;
+using ceph::common::cmdmap_from_json;
+using ceph::common::cmd_getval;
+using ceph::common::bad_cmd_get;
+using ceph::common::validate_cmd;
+using ceph::common::dump_cmd_and_help_to_json;
+
+namespace {
+seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+} // namespace
+
+using std::string;
+using std::string_view;
+using std::stringstream;
+using std::unique_ptr;
+
+namespace crimson::admin {
+
+tell_result_t::tell_result_t(int ret, std::string&& err)
+ : ret{ret}, err(std::move(err))
+{}
+
+tell_result_t::tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out)
+ : ret{ret}, err(std::move(err)), out(std::move(out))
+{}
+
+tell_result_t::tell_result_t(std::unique_ptr<Formatter> formatter)
+{
+ formatter->flush(out);
+}
+
+void AdminSocket::register_command(std::unique_ptr<AdminSocketHook>&& hook)
+{
+ auto prefix = hook->prefix;
+ auto [it, added] = hooks.emplace(prefix, std::move(hook));
+ assert(added);
+ logger().info("register_command(): {}", it->first);
+}
+
+auto AdminSocket::parse_cmd(const std::vector<std::string>& cmd)
+ -> std::variant<parsed_command_t, tell_result_t>
+{
+ // preliminaries:
+ // - create the formatter specified by the cmd parameters
+ // - locate the "op-code" string (the 'prefix' segment)
+ // - prepare for command parameters extraction via cmdmap_t
+ cmdmap_t cmdmap;
+ ceph::bufferlist out;
+
+ try {
+ stringstream errss;
+ // note that cmdmap_from_json() may throw on syntax issues
+ if (!cmdmap_from_json(cmd, &cmdmap, errss)) {
+ logger().error("{}: incoming command error: {}", __func__, errss.str());
+ out.append("error:"s);
+ out.append(errss.str());
+ return tell_result_t{-EINVAL, "invalid json", std::move(out)};
+ }
+ } catch (const std::runtime_error& e) {
+ logger().error("{}: incoming command syntax: {}", __func__, cmd);
+ out.append(string{e.what()});
+ return tell_result_t{-EINVAL, "invalid json", std::move(out)};
+ }
+
+ string format;
+ string prefix;
+ try {
+ cmd_getval(cmdmap, "format", format);
+ cmd_getval(cmdmap, "prefix", prefix);
+ // tolerate old-style pg <pgid> command <args> style formatting
+ if (prefix == "pg") {
+ cmd_getval(cmdmap, "cmd", prefix);
+ }
+ } catch (const bad_cmd_get& e) {
+ logger().error("{}: invalid syntax: {}", __func__, cmd);
+ out.append(string{e.what()});
+ return tell_result_t{-EINVAL, "invalid json", std::move(out)};
+ }
+
+ // match the incoming op-code to one of the registered APIs
+ if (auto found = hooks.find(prefix); found != hooks.end()) {
+ return parsed_command_t{ cmdmap, format, *found->second };
+ } else {
+ return tell_result_t{-EINVAL,
+ fmt::format("unknown command '{}'", prefix),
+ std::move(out)};
+ }
+}
+
+seastar::future<> AdminSocket::finalize_response(
+ seastar::output_stream<char>& out, ceph::bufferlist&& msgs)
+{
+ string outbuf_cont = msgs.to_str();
+ if (outbuf_cont.empty()) {
+ outbuf_cont = " {} ";
+ }
+ uint32_t response_length = htonl(outbuf_cont.length());
+ logger().info("asok response length: {}", outbuf_cont.length());
+
+ return out.write(reinterpret_cast<char*>(&response_length),
+ sizeof(response_length))
+ .then([&out, outbuf_cont] { return out.write(outbuf_cont.c_str()); });
+}
+
+
+seastar::future<> AdminSocket::handle_command(crimson::net::ConnectionRef conn,
+ boost::intrusive_ptr<MCommand> m)
+{
+ return execute_command(m->cmd, std::move(m->get_data())).then(
+ [conn, tid=m->get_tid()](auto result) {
+ auto [ret, err, out] = std::move(result);
+ auto reply = crimson::make_message<MCommandReply>(ret, err);
+ reply->set_tid(tid);
+ reply->set_data(out);
+ return conn->send(std::move(reply));
+ });
+}
+
+seastar::future<> AdminSocket::execute_line(std::string cmdline,
+ seastar::output_stream<char>& out)
+{
+ return execute_command({std::move(cmdline)}, {}).then([&out, this](auto result) {
+ auto [ret, stderr, stdout] = std::move(result);
+ if (ret < 0) {
+ stdout.append(fmt::format("ERROR: {}\n", cpp_strerror(ret)));
+ stdout.append(stderr);
+ }
+ return finalize_response(out, std::move(stdout));
+ });
+}
+
+auto AdminSocket::execute_command(const std::vector<std::string>& cmd,
+ ceph::bufferlist&& buf)
+ -> seastar::future<tell_result_t>
+{
+ auto maybe_parsed = parse_cmd(cmd);
+ if (auto* parsed = std::get_if<parsed_command_t>(&maybe_parsed); parsed) {
+ stringstream os;
+ string desc{parsed->hook.desc};
+ if (!validate_cmd(desc, parsed->params, os)) {
+ logger().error("AdminSocket::execute_command: "
+ "failed to validate '{}': {}", cmd, os.str());
+ ceph::bufferlist out;
+ out.append(os);
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, "invalid command json", std::move(out)});
+ }
+ return parsed->hook.call(parsed->params, parsed->format, std::move(buf));
+ } else {
+ auto& result = std::get<tell_result_t>(maybe_parsed);
+ return seastar::make_ready_future<tell_result_t>(std::move(result));
+ }
+}
+
+// an input_stream consumer that reads the buffer into a std::string up to the
+// first '\0', which indicates the end of the command
+struct line_consumer {
+ using tmp_buf = seastar::temporary_buffer<char>;
+ using consumption_result_type =
+ typename seastar::input_stream<char>::consumption_result_type;
+
+ seastar::future<consumption_result_type> operator()(tmp_buf&& buf) {
+ size_t consumed = 0;
+ for (auto c : buf) {
+ consumed++;
+ if (c == '\0') {
+ buf.trim_front(consumed);
+ return seastar::make_ready_future<consumption_result_type>(
+ consumption_result_type::stop_consuming_type(std::move(buf)));
+ } else {
+ line.push_back(c);
+ }
+ }
+ return seastar::make_ready_future<consumption_result_type>(
+ seastar::continue_consuming{});
+ }
+ std::string line;
+};
+
+seastar::future<> AdminSocket::handle_client(seastar::input_stream<char>& in,
+ seastar::output_stream<char>& out)
+{
+ auto consumer = seastar::make_shared<line_consumer>();
+ return in.consume(*consumer).then([consumer, &out, this] {
+ logger().debug("AdminSocket::handle_client: incoming asok string: {}",
+ consumer->line);
+ return execute_line(consumer->line, out);
+ }).then([&out] {
+ return out.flush();
+ }).finally([&out] {
+ return out.close();
+ }).then([&in] {
+ return in.close();
+ }).handle_exception([](auto ep) {
+ logger().debug("exception on {}: {}", __func__, ep);
+ });
+}
+
+seastar::future<> AdminSocket::start(const std::string& path)
+{
+ if (path.empty()) {
+ logger().error(
+ "{}: Admin Socket socket path missing from the configuration", __func__);
+ return seastar::now();
+ }
+
+ logger().debug("{}: asok socket path={}", __func__, path);
+ auto sock_path = seastar::socket_address{ seastar::unix_domain_addr{ path } };
+ try {
+ server_sock = seastar::engine().listen(sock_path);
+ } catch (const std::system_error& e) {
+ if (e.code() == std::errc::address_in_use) {
+ logger().debug("{}: Admin Socket socket path={} already exists, retrying",
+ __func__, path);
+ return seastar::remove_file(path).then([this, path] {
+ server_sock.reset();
+ return start(path);
+ });
+ }
+ logger().error("{}: unable to listen({}): {}", __func__, path, e.what());
+ server_sock.reset();
+ return seastar::make_ready_future<>();
+ }
+ // listen in background
+ task = seastar::keep_doing([this] {
+ return seastar::try_with_gate(stop_gate, [this] {
+ assert(!connected_sock.has_value());
+ return server_sock->accept().then([this](seastar::accept_result acc) {
+ connected_sock = std::move(acc.connection);
+ return seastar::do_with(connected_sock->input(),
+ connected_sock->output(),
+ [this](auto& input, auto& output) mutable {
+ return handle_client(input, output);
+ }).finally([this] {
+ assert(connected_sock.has_value());
+ connected_sock.reset();
+ });
+ }).handle_exception([this](auto ep) {
+ if (!stop_gate.is_closed()) {
+ logger().error("AdminSocket: terminated: {}", ep);
+ }
+ });
+ });
+ }).handle_exception_type([](const seastar::gate_closed_exception&) {
+ }).finally([path] {
+ return seastar::remove_file(path);
+ });
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<> AdminSocket::stop()
+{
+ if (!server_sock) {
+ return seastar::now();
+ }
+ server_sock->abort_accept();
+ if (connected_sock) {
+ connected_sock->shutdown_input();
+ connected_sock->shutdown_output();
+ }
+ return stop_gate.close().then([this] {
+ assert(task.has_value());
+ return task->then([] {
+ logger().info("AdminSocket: stopped");
+ return seastar::now();
+ });
+ });
+}
+
+/////////////////////////////////////////
+// the internal hooks
+/////////////////////////////////////////
+
+class VersionHook final : public AdminSocketHook {
+ public:
+ VersionHook()
+ : AdminSocketHook{"version", "", "get ceph version"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("version");
+ f->dump_string("version", ceph_version_to_str());
+ f->dump_string("release", ceph_release_to_str());
+ f->dump_string("release_type", ceph_release_type());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/**
+ Note that the git_version command is expected to return a 'version' JSON
+ segment.
+*/
+class GitVersionHook final : public AdminSocketHook {
+ public:
+ GitVersionHook()
+ : AdminSocketHook{"git_version", "", "get git sha1"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("version");
+ f->dump_string("git_version", git_version_to_str());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+class HelpHook final : public AdminSocketHook {
+ const AdminSocket& m_as;
+
+ public:
+ explicit HelpHook(const AdminSocket& as) :
+ AdminSocketHook{"help", "", "list available commands"},
+ m_as{as}
+ {}
+
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty", "json-pretty")};
+ f->open_object_section("help");
+ for (const auto& [prefix, hook] : m_as) {
+ if (!hook->help.empty()) {
+ f->dump_string(prefix.data(), hook->help);
+ }
+ }
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+class GetdescsHook final : public AdminSocketHook {
+ const AdminSocket& m_as;
+
+ public:
+ explicit GetdescsHook(const AdminSocket& as) :
+ AdminSocketHook{"get_command_descriptions",
+ "",
+ "list available commands"},
+ m_as{ as } {}
+
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ int cmdnum = 0;
+ f->open_object_section("command_descriptions");
+ for (const auto& [prefix, hook] : m_as) {
+ auto secname = fmt::format("cmd {:>03}", cmdnum);
+ auto cmd = fmt::format("{} {}", hook->prefix, hook->desc);
+ dump_cmd_and_help_to_json(f.get(), CEPH_FEATURES_ALL, secname,
+ cmd, std::string{hook->help});
+ cmdnum++;
+ }
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+class InjectArgsHook final : public AdminSocketHook {
+public:
+ InjectArgsHook()
+ : AdminSocketHook{"injectargs",
+ "name=injected_args,type=CephString,n=N",
+ "inject configuration arguments into running daemon"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ std::vector<std::string> argv;
+ if (!cmd_getval(cmdmap, "injected_args", argv)) {
+ return seastar::make_ready_future<tell_result_t>();
+ }
+ const std::string args = boost::algorithm::join(argv, " ");
+ return local_conf().inject_args(args).then([] {
+ return seastar::make_ready_future<tell_result_t>();
+ }).handle_exception_type([] (const std::invalid_argument& e) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, e.what()});
+ });
+ }
+};
+
+/**
+ * listing the configuration values
+ */
+class ConfigShowHook : public AdminSocketHook {
+public:
+ ConfigShowHook() :
+ AdminSocketHook{"config show",
+ "",
+ "dump current config settings"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("config_show");
+ local_conf().show_config(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/**
+ * fetching the value of a specific configuration item
+ */
+class ConfigGetHook : public AdminSocketHook {
+public:
+ ConfigGetHook() :
+ AdminSocketHook("config get",
+ "name=var,type=CephString",
+ "config get <field>: get the config value")
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::string var;
+ [[maybe_unused]] bool found = cmd_getval(cmdmap, "var", var);
+ assert(found);
+ std::string conf_val;
+ if (int r = local_conf().get_val(var, &conf_val); r < 0) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{r, fmt::format("error getting {}: {}",
+ var, cpp_strerror(r))});
+ }
+ unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty",
+ "json-pretty")};
+ f->open_object_section("config_get");
+ f->dump_string(var, conf_val);
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/**
+ * setting the value of a specific configuration item (an example:
+ * {"prefix": "config set", "var":"debug_osd", "val": ["30/20"]} )
+ */
+class ConfigSetHook : public AdminSocketHook {
+public:
+ ConfigSetHook()
+ : AdminSocketHook("config set",
+ "name=var,type=CephString "
+ "name=val,type=CephString,n=N",
+ "config set <field> <val> [<val> ...]: set a config variable")
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ std::string var;
+ std::vector<std::string> new_val;
+ cmd_getval(cmdmap, "var", var);
+ cmd_getval(cmdmap, "val", new_val);
+ // val may be multiple words
+ const std::string joined_values = boost::algorithm::join(new_val, " ");
+ return local_conf().set_val(var, joined_values).then([format] {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("config_set");
+ f->dump_string("success", "");
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }).handle_exception_type([](std::invalid_argument& e) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, e.what()});
+ });
+ }
+};
+
+/**
+ * listing the configuration values
+ */
+class ConfigHelpHook : public AdminSocketHook {
+public:
+ ConfigHelpHook() :
+ AdminSocketHook{"config help",
+ "",
+ "get config setting schema and descriptions"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ // Output all
+ f->open_array_section("options");
+ for (const auto &option : ceph_options) {
+ f->dump_object("option", option);
+ }
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/// the hooks that are served directly by the admin_socket server
+void AdminSocket::register_admin_commands()
+{
+ register_command(std::make_unique<VersionHook>());
+ register_command(std::make_unique<GitVersionHook>());
+ register_command(std::make_unique<HelpHook>(*this));
+ register_command(std::make_unique<GetdescsHook>(*this));
+ register_command(std::make_unique<ConfigGetHook>());
+ register_command(std::make_unique<ConfigSetHook>());
+ register_command(std::make_unique<ConfigShowHook>());
+ register_command(std::make_unique<ConfigHelpHook>());
+ register_command(std::make_unique<InjectArgsHook>());
+}
+
+} // namespace crimson::admin
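
The admin_socket.cc implementation above fixes the complete wire format of the asok endpoint: a client writes a JSON command terminated by '\0' (consumed by line_consumer), and the daemon replies with a 4-byte length in network byte order (finalize_response uses htonl) followed by the payload. The stand-alone sketch below, which is not part of this patch, shows one way a plain POSIX client could speak that protocol; the socket path and the "version" command string are illustrative assumptions, not values taken from the patch.

// Not part of the patch: a minimal client sketch for the framing implemented above.
// Assumptions: a POSIX environment and a hypothetical socket path; the daemon reads
// up to the first '\0' (line_consumer) and replies with htonl(length) + payload
// (finalize_response).
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const char* path = "/var/run/ceph/ceph-osd.0.asok";   // hypothetical path
  int fd = ::socket(AF_UNIX, SOCK_STREAM, 0);
  if (fd < 0) { std::perror("socket"); return 1; }
  sockaddr_un addr{};
  addr.sun_family = AF_UNIX;
  std::strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
  if (::connect(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) < 0) {
    std::perror("connect");
    return 1;
  }
  // the command is a JSON object; the trailing '\0' marks the end of the line
  std::string cmd = R"({"prefix": "version"})";
  cmd.push_back('\0');
  if (::write(fd, cmd.data(), cmd.size()) != static_cast<ssize_t>(cmd.size())) {
    std::perror("write");
    return 1;
  }
  // the reply starts with a 4-byte big-endian length
  uint32_t be_len = 0;
  if (::read(fd, &be_len, sizeof(be_len)) != static_cast<ssize_t>(sizeof(be_len))) {
    return 1;
  }
  const uint32_t len = ntohl(be_len);
  std::vector<char> payload(len);
  size_t got = 0;
  while (got < len) {
    ssize_t n = ::read(fd, payload.data() + got, len - got);
    if (n <= 0) break;
    got += static_cast<size_t>(n);
  }
  std::cout.write(payload.data(), got) << '\n';
  ::close(fd);
  return 0;
}

Any client that honors the '\0' terminator on the request and the length-prefixed reply will interoperate; the language and tooling used on the client side are otherwise unconstrained.
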
diff --git a/src/crimson/admin/admin_socket.h b/src/crimson/admin/admin_socket.h
new file mode 100644
index 000000000..8bf9fd4d3
--- /dev/null
+++ b/src/crimson/admin/admin_socket.h
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+/**
+ A Crimson-specific version of src/common/admin_socket.h.
+
+ Note: assumed to be running on a single core.
+*/
+#include <map>
+#include <string>
+#include <string_view>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/core/shared_mutex.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/net/api.hh>
+
+#include "common/cmdparse.h"
+#include "include/buffer.h"
+#include "crimson/net/Fwd.h"
+
+class MCommand;
+
+namespace crimson::admin {
+
+class AdminSocket;
+
+struct tell_result_t {
+ int ret = 0;
+ std::string err;
+ ceph::bufferlist out;
+ tell_result_t() = default;
+ tell_result_t(int ret, std::string&& err);
+ tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out);
+ /**
+ * create a \c tell_result_t indicating the successful completion
+ * of command
+ *
+ * \param formatter the content of formatter will be flushed to the
+ * output buffer
+ */
+ tell_result_t(std::unique_ptr<Formatter> formatter);
+};
+
+/**
+ * An abstract class to be inherited by implementations of asok hooks
+ */
+class AdminSocketHook {
+ public:
+ AdminSocketHook(std::string_view prefix,
+ std::string_view desc,
+ std::string_view help) :
+ prefix{prefix}, desc{desc}, help{help}
+ {}
+ /**
+ * handle command defined by cmdmap
+ *
+ * \param cmdmap dictionary holding the named parameters
+ * \param format the expected format of the output
+ * \param input the binary input of the command
+ * \pre \c cmdmap should be validated with \c desc
+ * \retval an instance of \c tell_result_t
+ * \note a negative \c ret should be set to indicate that the hook failed to
+ * fulfill the command, either because of invalid input or some other
+ * failure. In that case, a brief reason for the failure should be
+ * noted in \c err in the returned value.
+ */
+ virtual seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const = 0;
+ virtual ~AdminSocketHook() {}
+ const std::string_view prefix;
+ const std::string_view desc;
+ const std::string_view help;
+};
+
+class AdminSocket : public seastar::enable_lw_shared_from_this<AdminSocket> {
+ public:
+ AdminSocket() = default;
+ ~AdminSocket() = default;
+
+ AdminSocket(const AdminSocket&) = delete;
+ AdminSocket& operator=(const AdminSocket&) = delete;
+ AdminSocket(AdminSocket&&) = delete;
+ AdminSocket& operator=(AdminSocket&&) = delete;
+
+ /**
+ * create the async Seastar thread that handles asok commands arriving
+ * over the socket.
+ */
+ seastar::future<> start(const std::string& path);
+
+ seastar::future<> stop();
+
+ /**
+ * register an admin socket hook
+ *
+ * Commands (APIs) are registered under a command string. Incoming
+ * commands are split by spaces and matched against the longest
+ * registered command. For example, if 'foo' and 'foo bar' are
+ * registered, and an incoming command is 'foo bar baz', it is
+ * matched with 'foo bar', while 'foo fud' will match 'foo'.
+ *
+ * \param hook a hook which includes its identifying command string, the
+ * expected call syntax, and some help text.
+ *
+ * A note regarding the help text: if empty, the command will not be
+ * included in the 'help' output.
+ */
+ void register_command(std::unique_ptr<AdminSocketHook>&& hook);
+
+ /**
+ * Registering the APIs that are served directly by the admin_socket server.
+ */
+ void register_admin_commands();
+ /**
+ * handle a command message by replying with an MCommandReply carrying the same tid
+ *
+ * \param conn connection over which the incoming command message is received
+ * \param m message carrying the command vector and optional input buffer
+ */
+ seastar::future<> handle_command(crimson::net::ConnectionRef conn,
+ boost::intrusive_ptr<MCommand> m);
+
+private:
+ /**
+ * the result of analyzing an incoming command, and locating it in
+ * the registered APIs collection.
+ */
+ struct parsed_command_t {
+ cmdmap_t params;
+ std::string format;
+ const AdminSocketHook& hook;
+ };
+ // and the shorthand:
+ seastar::future<> handle_client(seastar::input_stream<char>& inp,
+ seastar::output_stream<char>& out);
+
+ seastar::future<> execute_line(std::string cmdline,
+ seastar::output_stream<char>& out);
+
+ seastar::future<> finalize_response(seastar::output_stream<char>& out,
+ ceph::bufferlist&& msgs);
+
+ seastar::future<tell_result_t> execute_command(const std::vector<std::string>& cmd,
+ ceph::bufferlist&& buf);
+
+ std::optional<seastar::future<>> task;
+ std::optional<seastar::server_socket> server_sock;
+ std::optional<seastar::connected_socket> connected_sock;
+
+ /**
+ * a gate used to stop accepting incoming asok requests at shutdown
+ */
+ seastar::gate stop_gate;
+
+ /**
+ * parse the incoming command vector, find a registered hook by looking up by
+ * its prefix, perform sanity checks on the parsed parameters with the hook's
+ * command description
+ *
+ * \param cmd a vector of strings representing a command
+ * \retval on success, a \c parsed_command_t is returned; otherwise, a
+ * \c tell_result_t with a detailed error message is returned
+ */
+ std::variant<parsed_command_t, tell_result_t>
+ parse_cmd(const std::vector<std::string>& cmd);
+
+ using hooks_t = std::map<std::string_view, std::unique_ptr<AdminSocketHook>>;
+ hooks_t hooks;
+
+public:
+ /**
+ * iterator support
+ */
+ hooks_t::const_iterator begin() const {
+ return hooks.cbegin();
+ }
+ hooks_t::const_iterator end() const {
+ return hooks.cend();
+ }
+};
+
+} // namespace crimson::admin
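
The AdminSocketHook interface declared above (a command prefix, a parameter description understood by validate_cmd, help text, and a call() override returning seastar::future<tell_result_t>) is all a new command needs before it is handed to AdminSocket::register_command(). Below is a minimal sketch, not part of this patch, of what such a hook could look like; it is modeled on the built-in hooks in admin_socket.cc, and the "greet" prefix and "who" parameter are invented for illustration.

// Not part of the patch: a hypothetical hook modeled on the hooks in admin_socket.cc.
#include <memory>
#include <string>

#include "crimson/admin/admin_socket.h"

namespace crimson::admin {

class GreetHook final : public AdminSocketHook {
 public:
  GreetHook()
    : AdminSocketHook{"greet",                     // command prefix
                      "name=who,type=CephString",  // parameter description
                      "say hello to <who>"}        // help text
  {}
  seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
                                      std::string_view format,
                                      ceph::bufferlist&&) const final
  {
    std::string who;
    if (!ceph::common::cmd_getval(cmdmap, "who", who)) {
      return seastar::make_ready_future<tell_result_t>(
        tell_result_t{-EINVAL, "missing required parameter 'who'"});
    }
    std::unique_ptr<Formatter> f{
      Formatter::create(format, "json-pretty", "json-pretty")};
    f->open_object_section("greet");
    f->dump_string("greeting", "hello " + who);
    f->close_section();
    return seastar::make_ready_future<tell_result_t>(std::move(f));
  }
};

} // namespace crimson::admin

// Registration (e.g. alongside AdminSocket::register_admin_commands()):
//   admin_socket.register_command(std::make_unique<crimson::admin::GreetHook>());

An empty help string would hide the command from the 'help' output, as noted in the register_command() documentation above.
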
diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc
new file mode 100644
index 000000000..0436e5184
--- /dev/null
+++ b/src/crimson/admin/osd_admin.cc
@@ -0,0 +1,574 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/osd_admin.h"
+#include <string>
+#include <string_view>
+
+#include <fmt/format.h>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/core/scollectd_api.hh>
+
+#include "common/config.h"
+#include "crimson/admin/admin_socket.h"
+#include "crimson/common/log.h"
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/shard_services.h"
+
+namespace {
+seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+} // namespace
+
+using namespace std::literals;
+using std::string_view;
+using std::unique_ptr;
+using crimson::osd::OSD;
+using crimson::common::local_conf;
+using namespace crimson::common;
+using ceph::common::cmd_getval;
+using ceph::common::cmd_getval_or;
+
+namespace crimson::admin {
+
+template <class Hook, class... Args>
+std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args)
+{
+ return std::make_unique<Hook>(std::forward<Args>(args)...);
+}
+
+/**
+ * An OSD admin hook: OSD status
+ */
+class OsdStatusHook : public AdminSocketHook {
+public:
+ explicit OsdStatusHook(const crimson::osd::OSD& osd) :
+ AdminSocketHook{"status", "", "OSD status"},
+ osd(osd)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("status");
+ osd.dump_status(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+private:
+ const crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<OsdStatusHook>(const crimson::osd::OSD& osd);
+
+/**
+ * An OSD admin hook: send beacon
+ */
+class SendBeaconHook : public AdminSocketHook {
+public:
+ explicit SendBeaconHook(crimson::osd::OSD& osd) :
+ AdminSocketHook{"send_beacon",
+ "",
+ "send OSD beacon to mon immediately"},
+ osd(osd)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ return osd.send_beacon().then([] {
+ return seastar::make_ready_future<tell_result_t>();
+ });
+ }
+private:
+ crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<SendBeaconHook>(crimson::osd::OSD& osd);
+
+/**
+ * send the latest pg stats to mgr
+ */
+class FlushPgStatsHook : public AdminSocketHook {
+public:
+ explicit FlushPgStatsHook(crimson::osd::OSD& osd) :
+ AdminSocketHook("flush_pg_stats",
+ "",
+ "flush pg stats"),
+ osd{osd}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ uint64_t seq = osd.send_pg_stats();
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->dump_unsigned("stat_seq", seq);
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+
+private:
+ crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<FlushPgStatsHook>(crimson::osd::OSD& osd);
+
+/// dump the history of PGs' peering state
+class DumpPGStateHistory final: public AdminSocketHook {
+public:
+ explicit DumpPGStateHistory(const crimson::osd::PGShardManager &pg_shard_manager) :
+ AdminSocketHook{"dump_pgstate_history",
+ "",
+ "dump history of PGs' peering state"},
+ pg_shard_manager{pg_shard_manager}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> fref{
+ Formatter::create(format, "json-pretty", "json-pretty")};
+ Formatter *f = fref.get();
+ f->open_object_section("pgstate_history");
+ f->open_array_section("pgs");
+ return pg_shard_manager.for_each_pg([f](auto &pgid, auto &pg) {
+ f->open_object_section("pg");
+ f->dump_stream("pg") << pgid;
+ const auto& peering_state = pg->get_peering_state();
+ f->dump_string("currently", peering_state.get_current_state());
+ peering_state.dump_history(f);
+ f->close_section();
+ }).then([fref=std::move(fref)]() mutable {
+ fref->close_section();
+ fref->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(fref));
+ });
+ }
+
+private:
+ const crimson::osd::PGShardManager &pg_shard_manager;
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpPGStateHistory>(
+ const crimson::osd::PGShardManager &);
+
+//dump the contents of perfcounters in osd and store
+class DumpPerfCountersHook final: public AdminSocketHook {
+public:
+ explicit DumpPerfCountersHook() :
+ AdminSocketHook{"perfcounters_dump",
+ "name=logger,type=CephString,req=false "
+ "name=counter,type=CephString,req=false",
+ "dump perfcounters in osd and store"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty",
+ "json-pretty")};
+ std::string logger;
+ std::string counter;
+ cmd_getval(cmdmap, "logger", logger);
+ cmd_getval(cmdmap, "counter", counter);
+
+ crimson::common::local_perf_coll().dump_formatted(f.get(), false, false, logger, counter);
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpPerfCountersHook>();
+
+
+
+/**
+ * A CephContext admin hook: calling assert (if allowed by
+ * 'debug_asok_assert_abort')
+ */
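+// An illustrative invocation (names/ids are hypothetical): after enabling the
+// guard, e.g. with "ceph config set osd debug_asok_assert_abort true",
+// "ceph tell osd.0 assert" makes this hook trip ceph_assert_always(0) and abort
+// the daemon; with the option unset, the hook returns -EPERM instead.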
+class AssertAlwaysHook : public AdminSocketHook {
+public:
+ AssertAlwaysHook() :
+ AdminSocketHook{"assert",
+ "",
+ "asserts"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ if (local_conf().get_val<bool>("debug_asok_assert_abort")) {
+ ceph_assert_always(0);
+ return seastar::make_ready_future<tell_result_t>();
+ } else {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EPERM, "configuration set to disallow asok assert"});
+ }
+ }
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<AssertAlwaysHook>();
+
+/**
+ * A Seastar admin hook: fetching the values of configured metrics
+ */
+class DumpMetricsHook : public AdminSocketHook {
+public:
+ DumpMetricsHook() :
+ AdminSocketHook("dump_metrics",
+ "name=group,type=CephString,req=false",
+ "dump current configured seastar metrics and their values")
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> fref{Formatter::create(format, "json-pretty", "json-pretty")};
+ auto *f = fref.get();
+ std::string prefix;
+ cmd_getval(cmdmap, "group", prefix);
+ f->open_object_section("metrics");
+ f->open_array_section("metrics");
+ return seastar::do_with(std::move(prefix), [f](auto &prefix) {
+ return crimson::reactor_map_seq([f, &prefix] {
+ for (const auto& [full_name, metric_family]: seastar::scollectd::get_value_map()) {
+ if (!prefix.empty() && full_name.compare(0, prefix.size(), prefix) != 0) {
+ continue;
+ }
+ for (const auto& [labels, metric] : metric_family) {
+ if (metric && metric->is_enabled()) {
+ f->open_object_section(""); // enclosed by array
+ DumpMetricsHook::dump_metric_value(f, full_name, *metric, labels);
+ f->close_section();
+ }
+ }
+ }
+ });
+ }).then([fref = std::move(fref)]() mutable {
+ fref->close_section();
+ fref->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(fref));
+ });
+ }
+private:
+ using registered_metric = seastar::metrics::impl::registered_metric;
+ using data_type = seastar::metrics::impl::data_type;
+
+ static void dump_metric_value(Formatter* f,
+ string_view full_name,
+ const registered_metric& metric,
+ const seastar::metrics::impl::labels_type& labels)
+ {
+ f->open_object_section(full_name);
+ for (const auto& [key, value] : labels) {
+ f->dump_string(key, value);
+ }
+ auto value_name = "value";
+ switch (auto v = metric(); v.type()) {
+ case data_type::GAUGE:
+ f->dump_float(value_name, v.d());
+ break;
+ case data_type::REAL_COUNTER:
+ f->dump_float(value_name, v.d());
+ break;
+ case data_type::COUNTER:
+ double val;
+ try {
+ val = v.ui();
+ } catch (std::range_error&) {
+ // seastar's cpu steal time may be negative
+ val = 0;
+ }
+ f->dump_unsigned(value_name, val);
+ break;
+ case data_type::HISTOGRAM: {
+ f->open_object_section(value_name);
+ auto&& h = v.get_histogram();
+ f->dump_float("sum", h.sample_sum);
+ f->dump_unsigned("count", h.sample_count);
+ f->open_array_section("buckets");
+ for (auto i : h.buckets) {
+ f->open_object_section("bucket");
+ f->dump_float("le", i.upper_bound);
+ f->dump_unsigned("count", i.count);
+ f->close_section(); // "bucket"
+ }
+ {
+ f->open_object_section("bucket");
+ f->dump_string("le", "+Inf");
+ f->dump_unsigned("count", h.sample_count);
+ f->close_section();
+ }
+ f->close_section(); // "buckets"
+ f->close_section(); // value_name
+ }
+ break;
+ default:
+ std::abort();
+ break;
+ }
+ f->close_section(); // full_name
+ }
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpMetricsHook>();
+
+
+static ghobject_t test_ops_get_object_name(
+ const OSDMap& osdmap,
+ const cmdmap_t& cmdmap)
+{
+ auto pool = [&] {
+ auto pool_arg = cmd_getval<std::string>(cmdmap, "pool");
+ if (!pool_arg) {
+ throw std::invalid_argument{"No 'pool' specified"};
+ }
+ int64_t pool = osdmap.lookup_pg_pool_name(*pool_arg);
+ if (pool < 0 && std::isdigit((*pool_arg)[0])) {
+ pool = std::atoll(pool_arg->c_str());
+ }
+ if (pool < 0) {
+ // the return type of `fmt::format` is `std::string`
+ throw std::invalid_argument{
+ fmt::format("Invalid pool '{}'", *pool_arg)
+ };
+ }
+ return pool;
+ }();
+
+ auto [ objname, nspace, raw_pg ] = [&] {
+ auto obj_arg = cmd_getval<std::string>(cmdmap, "objname");
+ if (!obj_arg) {
+ throw std::invalid_argument{"No 'objname' specified"};
+ }
+ std::string objname, nspace;
+ if (std::size_t sep_pos = obj_arg->find_first_of('/');
+ sep_pos != obj_arg->npos) {
+ nspace = obj_arg->substr(0, sep_pos);
+ objname = obj_arg->substr(sep_pos+1);
+ } else {
+ objname = *obj_arg;
+ }
+ pg_t raw_pg;
+ if (object_locator_t oloc(pool, nspace);
+ osdmap.object_locator_to_pg(object_t(objname), oloc, raw_pg) < 0) {
+ throw std::invalid_argument{"Invalid namespace/objname"};
+ }
+ return std::make_tuple(std::move(objname),
+ std::move(nspace),
+ std::move(raw_pg));
+ }();
+
+ auto shard_id = cmd_getval_or<int64_t>(cmdmap,
+ "shardid",
+ shard_id_t::NO_SHARD);
+
+ return ghobject_t{
+ hobject_t{
+ object_t{objname}, std::string{}, CEPH_NOSNAP, raw_pg.ps(), pool, nspace
+ },
+ ghobject_t::NO_GEN,
+ shard_id_t{static_cast<int8_t>(shard_id)}
+ };
+}
+
+// Usage:
+// injectdataerr <pool> [namespace/]<obj-name> [shardid]
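+// e.g. (hypothetical names): "injectdataerr rbd ns1/myobject 0" resolves object
+// "myobject" in namespace "ns1" of pool "rbd" for shard 0 via
+// test_ops_get_object_name() below, then asks the object store to inject a data
+// error for it so that tests (e.g. scrub) can observe the failure.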
+class InjectDataErrorHook : public AdminSocketHook {
+public:
+ InjectDataErrorHook(crimson::osd::ShardServices& shard_services) :
+ AdminSocketHook("injectdataerr",
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=false,range=0|255",
+ "inject data error to an object"),
+ shard_services(shard_services) {
+ }
+
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ ghobject_t obj;
+ try {
+ obj = test_ops_get_object_name(*shard_services.get_map(), cmdmap);
+ } catch (const std::invalid_argument& e) {
+ logger().info("error during data error injection: {}", e.what());
+ return seastar::make_ready_future<tell_result_t>(-EINVAL,
+ e.what());
+ }
+ return shard_services.get_store().inject_data_error(obj).then([=] {
+ logger().info("successfully injected data error for obj={}", obj);
+ ceph::bufferlist bl;
+ bl.append("ok"sv);
+ return seastar::make_ready_future<tell_result_t>(0,
+ std::string{}, // no err
+ std::move(bl));
+ });
+ }
+
+private:
+ crimson::osd::ShardServices& shard_services;
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<InjectDataErrorHook>(
+ crimson::osd::ShardServices&);
+
+
+// Usage:
+// injectmdataerr <pool> [namespace/]<obj-name> [shardid]
+class InjectMDataErrorHook : public AdminSocketHook {
+public:
+ InjectMDataErrorHook(crimson::osd::ShardServices& shard_services) :
+ AdminSocketHook("injectmdataerr",
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=false,range=0|255",
+                    "inject metadata error to an object"),
+ shard_services(shard_services) {
+ }
+
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ ghobject_t obj;
+ try {
+ obj = test_ops_get_object_name(*shard_services.get_map(), cmdmap);
+ } catch (const std::invalid_argument& e) {
+ logger().info("error during metadata error injection: {}", e.what());
+ return seastar::make_ready_future<tell_result_t>(-EINVAL,
+ e.what());
+ }
+ return shard_services.get_store().inject_mdata_error(obj).then([=] {
+ logger().info("successfully injected metadata error for obj={}", obj);
+ ceph::bufferlist bl;
+ bl.append("ok"sv);
+ return seastar::make_ready_future<tell_result_t>(0,
+ std::string{}, // no err
+ std::move(bl));
+ });
+ }
+
+private:
+ crimson::osd::ShardServices& shard_services;
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<InjectMDataErrorHook>(
+ crimson::osd::ShardServices&);
+
+
+/**
+ * An InFlightOps admin hook: dump current in-flight operations
+ */
+class DumpInFlightOpsHook : public AdminSocketHook {
+public:
+ explicit DumpInFlightOpsHook(const crimson::osd::PGShardManager &pg_shard_manager) :
+ AdminSocketHook{"dump_ops_in_flight", "", "show the ops currently in flight"},
+ pg_shard_manager(pg_shard_manager)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> fref{
+ Formatter::create(format, "json-pretty", "json-pretty")};
+ auto *f = fref.get();
+ f->open_object_section("ops_in_flight");
+ f->open_array_section("ops_in_flight");
+ return pg_shard_manager.invoke_on_each_shard_seq([f](const auto &shard_services) {
+ return shard_services.dump_ops_in_flight(f);
+ }).then([fref=std::move(fref)]() mutable {
+ fref->close_section();
+ fref->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(fref));
+ });
+ }
+private:
+ const crimson::osd::PGShardManager &pg_shard_manager;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<DumpInFlightOpsHook>(const crimson::osd::PGShardManager &);
+
+
+class DumpHistoricOpsHook : public AdminSocketHook {
+public:
+ explicit DumpHistoricOpsHook(const crimson::osd::OSDOperationRegistry& op_registry) :
+ AdminSocketHook{"dump_historic_ops", "", "show recent ops"},
+ op_registry(op_registry)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("historic_ops");
+ op_registry.dump_historic_client_requests(f.get());
+ f->close_section();
+ f->dump_int("num_ops", 0);
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+private:
+ const crimson::osd::OSDOperationRegistry& op_registry;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<DumpHistoricOpsHook>(const crimson::osd::OSDOperationRegistry& op_registry);
+
+
+class DumpSlowestHistoricOpsHook : public AdminSocketHook {
+public:
+ explicit DumpSlowestHistoricOpsHook(const crimson::osd::OSDOperationRegistry& op_registry) :
+ AdminSocketHook{"dump_historic_slow_ops", "", "show slowest recent ops"},
+ op_registry(op_registry)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ logger().warn("{}", __func__);
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("historic_slow_ops");
+ op_registry.dump_slowest_historic_client_requests(f.get());
+ f->close_section();
+ f->dump_int("num_ops", 0);
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+private:
+ const crimson::osd::OSDOperationRegistry& op_registry;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<DumpSlowestHistoricOpsHook>(const crimson::osd::OSDOperationRegistry& op_registry);
+
+class DumpRecoveryReservationsHook : public AdminSocketHook {
+public:
+ explicit DumpRecoveryReservationsHook(crimson::osd::ShardServices& shard_services) :
+ AdminSocketHook{"dump_recovery_reservations", "", "show recovery reservations"},
+ shard_services(shard_services)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ logger().debug("{}", __func__);
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ return seastar::do_with(std::move(f), [this](auto&& f) {
+ f->open_object_section("reservations");
+ f->open_object_section("local_reservations");
+ return shard_services.local_dump_reservations(f.get()).then([&f, this] {
+ f->close_section();
+ f->open_object_section("remote_reservations");
+ return shard_services.remote_dump_reservations(f.get()).then([&f] {
+ f->close_section();
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ });
+ });
+ });
+ }
+private:
+ crimson::osd::ShardServices& shard_services;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<DumpRecoveryReservationsHook>(crimson::osd::ShardServices& shard_services);
+
+} // namespace crimson::admin
diff --git a/src/crimson/admin/osd_admin.h b/src/crimson/admin/osd_admin.h
new file mode 100644
index 000000000..a3ddd66b9
--- /dev/null
+++ b/src/crimson/admin/osd_admin.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <memory>
+
+namespace crimson::admin {
+
+class AdminSocketHook;
+
+class AssertAlwaysHook;
+class DumpMetricsHook;
+class DumpPGStateHistory;
+class DumpPerfCountersHook;
+class FlushPgStatsHook;
+class InjectDataErrorHook;
+class InjectMDataErrorHook;
+class OsdStatusHook;
+class SendBeaconHook;
+class DumpInFlightOpsHook;
+class DumpHistoricOpsHook;
+class DumpSlowestHistoricOpsHook;
+class DumpRecoveryReservationsHook;
+
+template<class Hook, class... Args>
+std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args);
+
+} // namespace crimson::admin
diff --git a/src/crimson/admin/pg_commands.cc b/src/crimson/admin/pg_commands.cc
new file mode 100644
index 000000000..f2c84b254
--- /dev/null
+++ b/src/crimson/admin/pg_commands.cc
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/pg_commands.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include <fmt/format.h>
+#include <seastar/core/future.hh>
+
+#include "crimson/admin/admin_socket.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/pg.h"
+
+
+using crimson::osd::OSD;
+using crimson::osd::PG;
+using namespace crimson::common;
+using ceph::common::cmd_getval;
+
+
+namespace crimson::admin::pg {
+
+class PGCommand : public AdminSocketHook {
+public:
+ // TODO: const correctness of osd
+ PGCommand(crimson::osd::OSD& osd,
+ std::string_view prefix,
+ std::string_view desc,
+ std::string_view help)
+ : AdminSocketHook{prefix, desc, help}, osd {osd}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+    // we have "ceph tell <pgid> <cmd>", and it is the ceph cli's responsibility
+    // to add "pgid" to the cmd dict, as rados_pg_command() does not set it for
+    // us. moreover, "pgid" is not listed in the command description, as the user
+    // command format does not follow the convention of "<prefix> [<args>,...]",
+    // so we have to verify it on the server side.
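+    // For example (values are hypothetical), "ceph tell 2.3f query" arrives here
+    // with a cmd dict roughly like {"prefix": "query", "pgid": "2.3f"}, so the
+    // pgid is parsed and validated below before dispatching to the PG.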
+ std::string pgid_str;
+ pg_t pgid;
+ if (!cmd_getval(cmdmap, "pgid", pgid_str)) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, "no pgid specified"});
+ } else if (!pgid.parse(pgid_str.c_str())) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, fmt::format("couldn't parse pgid '{}'", pgid_str)});
+ }
+ // am i the primary for this pg?
+ const auto osdmap = osd.get_shard_services().get_map();
+ spg_t spg_id;
+ if (!osdmap->get_primary_shard(pgid, &spg_id)) {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -ENOENT, fmt::format("pgid '{}' does not exist", pgid_str)});
+ }
+ return osd.get_pg_shard_manager().with_pg(
+ spg_id,
+ [this, spg_id,
+ cmdmap=std::move(cmdmap),
+ format=std::move(format),
+ input=std::move(input)
+ ](auto &&pg) mutable {
+ if (!pg) {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -ENOENT, fmt::format("i don't have pgid '{}'", spg_id)});
+ }
+ if (!pg->is_primary()) {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -EAGAIN, fmt::format("not primary for pgid '{}'", spg_id)});
+ }
+ return this->do_command(pg, cmdmap, format, std::move(input));
+ });
+ }
+
+private:
+ virtual seastar::future<tell_result_t>
+ do_command(Ref<PG> pg,
+ const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const = 0;
+
+ OSD& osd;
+};
+
+class QueryCommand final : public PGCommand {
+public:
+ // TODO: const correctness of osd
+ explicit QueryCommand(crimson::osd::OSD& osd) :
+ PGCommand{osd,
+ "query",
+ "",
+ "show details of a specific pg"}
+ {}
+private:
+ seastar::future<tell_result_t>
+ do_command(Ref<PG> pg,
+ const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty",
+ "json-pretty")};
+ f->open_object_section("pg");
+ pg->dump_primary(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+class MarkUnfoundLostCommand final : public PGCommand {
+public:
+ explicit MarkUnfoundLostCommand(crimson::osd::OSD& osd) :
+ PGCommand{osd,
+ "mark_unfound_lost",
+ "name=pgid,type=CephPgid,req=false"
+ " name=mulcmd,type=CephChoices,strings=revert|delete",
+ "mark all unfound objects in this pg as lost, either"
+ " removing or reverting to a prior version if one is"
+ " available"}
+ {}
+ seastar::future<tell_result_t>
+ do_command(Ref<PG> pg,
+ const cmdmap_t& cmdmap,
+ std::string_view,
+ ceph::bufferlist&&) const final
+ {
+ // what to do with the unfound object specifically.
+ std::string cmd;
+ int op = -1;
+ cmd_getval(cmdmap, "mulcmd", cmd);
+ if (cmd == "revert") {
+ op = pg_log_entry_t::LOST_REVERT;
+ } else if (cmd == "delete") {
+ op = pg_log_entry_t::LOST_DELETE;
+ } else {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -EINVAL, "mode must be 'revert' or 'delete'; mark not yet implemented"});
+ }
+ return pg->mark_unfound_lost(op).then([] {
+ // TODO
+ return seastar::make_ready_future<tell_result_t>();
+ });
+ }
+};
+
+} // namespace crimson::admin::pg
+
+namespace crimson::admin {
+
+template <class Hook, class... Args>
+std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args)
+{
+ return std::make_unique<Hook>(std::forward<Args>(args)...);
+}
+
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<crimson::admin::pg::QueryCommand>(crimson::osd::OSD& osd);
+
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<crimson::admin::pg::MarkUnfoundLostCommand>(crimson::osd::OSD& osd);
+
+} // namespace crimson::admin
diff --git a/src/crimson/admin/pg_commands.h b/src/crimson/admin/pg_commands.h
new file mode 100644
index 000000000..873b3c923
--- /dev/null
+++ b/src/crimson/admin/pg_commands.h
@@ -0,0 +1,10 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+namespace crimson::admin::pg {
+
+class QueryCommand;
+class MarkUnfoundLostCommand;
+
+} // namespace crimson::admin::pg
diff --git a/src/crimson/auth/AuthClient.h b/src/crimson/auth/AuthClient.h
new file mode 100644
index 000000000..2d970c88c
--- /dev/null
+++ b/src/crimson/auth/AuthClient.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <vector>
+#include "include/buffer_fwd.h"
+#include "crimson/net/Fwd.h"
+
+class CryptoKey;
+
+namespace crimson::auth {
+
+class error : public std::logic_error {
+public:
+ using std::logic_error::logic_error;
+};
+
+using method_t = uint32_t;
+
+// TODO: revisit interfaces for non-dummy implementations
+class AuthClient {
+public:
+ virtual ~AuthClient() {}
+
+ struct auth_request_t {
+ method_t auth_method;
+ std::vector<uint32_t> preferred_modes;
+ ceph::bufferlist auth_bl;
+ };
+ /// Build an authentication request to begin the handshake
+ ///
+ /// @throw auth::error if unable to build the request
+ virtual auth_request_t get_auth_request(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta) = 0;
+
+ /// Handle server's request to continue the handshake
+ ///
+ /// @throw auth::error if unable to build the request
+ virtual ceph::bufferlist handle_auth_reply_more(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ const ceph::bufferlist& bl) = 0;
+
+ /// Handle server's indication that authentication succeeded
+ ///
+ /// @return 0 if authenticated, a negative number otherwise
+ virtual int handle_auth_done(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl) = 0;
+
+ /// Handle server's indication that the previous auth attempt failed
+ ///
+ /// @return 0 if will try next auth method, a negative number if we have no
+ /// more options
+ virtual int handle_auth_bad_method(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) = 0;
+};
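+
+// A typical client-side sequence, summarizing the interface above (not an
+// additional contract): get_auth_request() opens the handshake, zero or more
+// handle_auth_reply_more() exchanges may follow, and the server concludes with
+// either handle_auth_done() on success or handle_auth_bad_method(), after which
+// the client can retry with one of the advertised methods or give up.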
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/AuthServer.h b/src/crimson/auth/AuthServer.h
new file mode 100644
index 000000000..a808410d2
--- /dev/null
+++ b/src/crimson/auth/AuthServer.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+#include "crimson/net/Fwd.h"
+
+struct AuthAuthorizeHandler;
+
+namespace crimson::auth {
+
+class AuthServer {
+public:
+ virtual ~AuthServer() {}
+
+ // Get authentication methods and connection modes for the given peer type
+ virtual std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+ get_supported_auth_methods(int peer_type) = 0;
+  // Get supported connection modes for the given peer type and auth method
+ virtual uint32_t pick_con_mode(
+ int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes) = 0;
+ // return an AuthAuthorizeHandler for the given peer type and auth method
+ virtual AuthAuthorizeHandler* get_auth_authorize_handler(
+ int peer_type,
+ int auth_method) = 0;
+ // Handle an authentication request on an incoming connection
+ virtual int handle_auth_request(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+    bool more, ///< true if this is not the first part of the handshake
+ uint32_t auth_method,
+ const bufferlist& bl,
+ uint64_t *p_peer_global_id,
+ bufferlist *reply) = 0;
+};
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/DummyAuth.h b/src/crimson/auth/DummyAuth.h
new file mode 100644
index 000000000..7a3dd7ec4
--- /dev/null
+++ b/src/crimson/auth/DummyAuth.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "AuthClient.h"
+#include "AuthServer.h"
+
+namespace crimson::auth {
+
+class DummyAuthClientServer : public AuthClient,
+ public AuthServer {
+public:
+ DummyAuthClientServer() {}
+
+ // client
+ std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+ get_supported_auth_methods(int peer_type) final {
+ return {{CEPH_AUTH_NONE}, {CEPH_AUTH_NONE}};
+ }
+
+ uint32_t pick_con_mode(int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes) final {
+ ceph_assert(auth_method == CEPH_AUTH_NONE);
+ ceph_assert(preferred_modes.size() &&
+ preferred_modes[0] == CEPH_CON_MODE_CRC);
+ return CEPH_CON_MODE_CRC;
+ }
+
+ AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type,
+ int auth_method) final {
+ return nullptr;
+ }
+
+ AuthClient::auth_request_t get_auth_request(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta) override {
+ return {CEPH_AUTH_NONE, {CEPH_CON_MODE_CRC}, {}};
+ }
+
+ ceph::bufferlist handle_auth_reply_more(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ const bufferlist& bl) override {
+ ceph_abort();
+ }
+
+ int handle_auth_done(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl) override {
+ return 0;
+ }
+
+ int handle_auth_bad_method(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) override {
+ ceph_abort();
+ }
+
+ // server
+ int handle_auth_request(
+ crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const bufferlist& bl,
+ uint64_t *p_peer_global_id,
+ bufferlist *reply) override {
+ return 1;
+ }
+};
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/KeyRing.cc b/src/crimson/auth/KeyRing.cc
new file mode 100644
index 000000000..436e29c1b
--- /dev/null
+++ b/src/crimson/auth/KeyRing.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "KeyRing.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <seastar/core/do_with.hh>
+#include <seastar/core/fstream.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+
+#include "common/buffer_seastar.h"
+#include "auth/KeyRing.h"
+#include "include/denc.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+
+namespace crimson::auth {
+
+seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring)
+{
+ std::vector<std::string> paths;
+ boost::split(paths, crimson::common::local_conf()->keyring,
+ boost::is_any_of(",;"));
+ std::pair<bool, std::string> found;
+ return seastar::map_reduce(paths, [](auto path) {
+ return seastar::engine().file_exists(path).then([path](bool file_exists) {
+ return std::make_pair(file_exists, path);
+ });
+ }, std::move(found), [](auto found, auto file_exists_and_path) {
+ if (!found.first && file_exists_and_path.first) {
+ found = std::move(file_exists_and_path);
+ }
+ return found;
+ }).then([keyring] (auto file_exists_and_path) {
+ const auto& [exists, path] = file_exists_and_path;
+ if (exists) {
+ return read_file(path).then([keyring](auto buf) {
+ bufferlist bl;
+ bl.append(buffer::create(std::move(buf)));
+ auto i = bl.cbegin();
+ keyring->decode(i);
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ });
+ } else {
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ }
+ });
+}
+
+seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring)
+{
+ auto& path = crimson::common::local_conf()->keyfile;
+ if (!path.empty()) {
+ return read_file(path).then([keyring](auto buf) {
+ EntityAuth ea;
+ ea.key.decode_base64(std::string(buf.begin(),
+ buf.end()));
+ keyring->add(crimson::common::local_conf()->name, ea);
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ });
+ } else {
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ }
+}
+
+seastar::future<KeyRing*> load_from_key(KeyRing* keyring)
+{
+ auto& key = crimson::common::local_conf()->key;
+ if (!key.empty()) {
+ EntityAuth ea;
+ ea.key.decode_base64(key);
+ keyring->add(crimson::common::local_conf()->name, ea);
+ }
+ return seastar::make_ready_future<KeyRing*>(keyring);
+}
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/KeyRing.h b/src/crimson/auth/KeyRing.h
new file mode 100644
index 000000000..850f1bb79
--- /dev/null
+++ b/src/crimson/auth/KeyRing.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+class KeyRing;
+
+namespace crimson::auth {
+ // see KeyRing::from_ceph_context
+ seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring);
+ seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring);
+ seastar::future<KeyRing*> load_from_key(KeyRing* keyring);
+}
diff --git a/src/crimson/common/assert.cc b/src/crimson/common/assert.cc
new file mode 100644
index 000000000..07610c33f
--- /dev/null
+++ b/src/crimson/common/assert.cc
@@ -0,0 +1,81 @@
+#include <cstdarg>
+#include <iostream>
+
+#include <seastar/util/backtrace.hh>
+#include <seastar/core/reactor.hh>
+
+#include "include/ceph_assert.h"
+
+#include "crimson/common/log.h"
+
+namespace ceph {
+ [[gnu::cold]] void __ceph_assert_fail(const ceph::assert_data &ctx)
+ {
+ __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function);
+ }
+
+ [[gnu::cold]] void __ceph_assert_fail(const char* assertion,
+ const char* file, int line,
+ const char* func)
+ {
+ seastar::logger& logger = crimson::get_logger(0);
+    logger.error("{}:{} : In function '{}', ceph_assert({})\n"
+ "{}",
+ file, line, func, assertion,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+ [[gnu::cold]] void __ceph_assertf_fail(const char *assertion,
+ const char *file, int line,
+ const char *func, const char* msg,
+ ...)
+ {
+ char buf[8096];
+ va_list args;
+ va_start(args, msg);
+ std::vsnprintf(buf, sizeof(buf), msg, args);
+ va_end(args);
+
+ seastar::logger& logger = crimson::get_logger(0);
+    logger.error("{}:{} : In function '{}', ceph_assert({})\n"
+ "{}\n{}\n",
+ file, line, func, assertion,
+ buf,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+
+ [[gnu::cold]] void __ceph_abort(const char* file, int line,
+ const char* func, const std::string& msg)
+ {
+ seastar::logger& logger = crimson::get_logger(0);
+    logger.error("{}:{} : In function '{}', abort({})\n"
+ "{}",
+ file, line, func, msg,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+
+ [[gnu::cold]] void __ceph_abortf(const char* file, int line,
+ const char* func, const char* fmt,
+ ...)
+ {
+ char buf[8096];
+ va_list args;
+ va_start(args, fmt);
+ std::vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ seastar::logger& logger = crimson::get_logger(0);
+ logger.error("{}:{} : In function '{}', abort()\n"
+ "{}\n{}\n",
+ file, line, func,
+ buf,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+}
diff --git a/src/crimson/common/auth_handler.h b/src/crimson/common/auth_handler.h
new file mode 100644
index 000000000..d4140b6a2
--- /dev/null
+++ b/src/crimson/common/auth_handler.h
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+class EntityName;
+class AuthCapsInfo;
+
+namespace crimson::common {
+class AuthHandler {
+public:
+ // the peer just got authorized
+ virtual void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps) = 0;
+ virtual ~AuthHandler() = default;
+};
+}
diff --git a/src/crimson/common/buffer_io.cc b/src/crimson/common/buffer_io.cc
new file mode 100644
index 000000000..86edf7a6f
--- /dev/null
+++ b/src/crimson/common/buffer_io.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "buffer_io.h"
+
+#include <seastar/core/reactor.hh>
+#include <seastar/core/fstream.hh>
+#include <seastar/core/do_with.hh>
+
+#include "include/buffer.h"
+
+namespace crimson {
+
+seastar::future<> write_file(ceph::buffer::list&& bl,
+ seastar::sstring fn,
+ seastar::file_permissions permissions)
+{
+ const auto flags = (seastar::open_flags::wo |
+ seastar::open_flags::create |
+ seastar::open_flags::truncate);
+ seastar::file_open_options foo;
+ foo.create_permissions = permissions;
+ return seastar::open_file_dma(fn, flags, foo).then(
+ [bl=std::move(bl)](seastar::file f) {
+ return seastar::make_file_output_stream(f).then(
+ [bl=std::move(bl), f=std::move(f)](seastar::output_stream<char> out) {
+ return seastar::do_with(std::move(out),
+ std::move(f),
+ std::move(bl),
+ [](seastar::output_stream<char>& out,
+ seastar::file& f,
+ ceph::buffer::list& bl) {
+ return seastar::do_for_each(bl.buffers(), [&out](auto& buf) {
+ return out.write(buf.c_str(), buf.length());
+ }).then([&out] {
+ return out.close();
+ });
+ });
+ });
+ });
+}
+
+seastar::future<seastar::temporary_buffer<char>>
+read_file(const seastar::sstring fn)
+{
+ return seastar::open_file_dma(fn, seastar::open_flags::ro).then(
+ [] (seastar::file f) {
+ return f.size().then([f = std::move(f)](size_t s) {
+ return seastar::do_with(seastar::make_file_input_stream(f),
+ [s](seastar::input_stream<char>& in) {
+ return in.read_exactly(s);
+ });
+ });
+ });
+}
+
+}
diff --git a/src/crimson/common/buffer_io.h b/src/crimson/common/buffer_io.h
new file mode 100644
index 000000000..c5ece4a6f
--- /dev/null
+++ b/src/crimson/common/buffer_io.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/file-types.hh>
+
+#include "include/buffer_fwd.h"
+
+namespace crimson {
+ seastar::future<> write_file(ceph::buffer::list&& bl,
+ seastar::sstring fn,
+ seastar::file_permissions= // 0644
+ (seastar::file_permissions::user_read |
+ seastar::file_permissions::user_write |
+ seastar::file_permissions::group_read |
+ seastar::file_permissions::others_read));
+ seastar::future<seastar::temporary_buffer<char>>
+ read_file(const seastar::sstring fn);
+}
diff --git a/src/crimson/common/condition_variable.h b/src/crimson/common/condition_variable.h
new file mode 100644
index 000000000..19267f38a
--- /dev/null
+++ b/src/crimson/common/condition_variable.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/condition-variable.hh>
+#include <seastar/core/loop.hh>
+
+#include "crimson/common/interruptible_future.h"
+
+namespace crimson {
+
+class condition_variable : public seastar::condition_variable {
+public:
+ template <typename Pred, typename Func>
+ auto wait(
+ Pred&& pred,
+ Func&& action) noexcept {
+ using func_result_t = std::invoke_result_t<Func>;
+ using intr_errorator_t = typename func_result_t::interrupt_errorator_type;
+ using intr_cond_t = typename func_result_t::interrupt_cond_type;
+ using interruptor = crimson::interruptible::interruptor<intr_cond_t>;
+ return interruptor::repeat(
+ [this, pred=std::forward<Pred>(pred),
+ action=std::forward<Func>(action)]()
+ -> typename intr_errorator_t::template future<seastar::stop_iteration> {
+ if (!pred()) {
+ return seastar::condition_variable::wait().then([] {
+ return seastar::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::no);
+ });
+ } else {
+ return action().si_then([] {
+ return seastar::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::yes);
+ });
+ }
+ });
+ }
+};
+
+} // namespace crimson
diff --git a/src/crimson/common/config_proxy.cc b/src/crimson/common/config_proxy.cc
new file mode 100644
index 000000000..88d4679d5
--- /dev/null
+++ b/src/crimson/common/config_proxy.cc
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "config_proxy.h"
+
+#include <filesystem>
+
+#include "crimson/common/buffer_io.h"
+
+namespace crimson::common {
+
+ConfigProxy::ConfigProxy(const EntityName& name, std::string_view cluster)
+{
+ if (seastar::this_shard_id() != 0) {
+ return;
+ }
+ // set the initial value on CPU#0
+ values.reset(seastar::make_lw_shared<ConfigValues>());
+ values.get()->name = name;
+ values.get()->cluster = cluster;
+ // and the only copy of md_config_impl<> is allocated on CPU#0
+ local_config.reset(new md_config_t{*values, obs_mgr, true});
+ if (name.is_mds()) {
+ local_config->set_val_default(*values, obs_mgr,
+ "keyring", "$mds_data/keyring");
+ } else if (name.is_osd()) {
+ local_config->set_val_default(*values, obs_mgr,
+ "keyring", "$osd_data/keyring");
+ }
+}
+
+seastar::future<> ConfigProxy::start()
+{
+ // populate values and config to all other shards
+ if (!values) {
+ return seastar::make_ready_future<>();
+ }
+ return container().invoke_on_others([this](auto& proxy) {
+ return values.copy().then([config=local_config.get(),
+ &proxy](auto foreign_values) {
+ proxy.values.reset();
+ proxy.values = std::move(foreign_values);
+ proxy.remote_config = config;
+ return seastar::make_ready_future<>();
+ });
+ });
+}
+
+void ConfigProxy::show_config(ceph::Formatter* f) const {
+ get_config().show_config(*values, f);
+}
+
+seastar::future<> ConfigProxy::parse_config_files(const std::string& conf_files)
+{
+ auto conffile_paths =
+ get_config().get_conffile_paths(*values,
+ conf_files.empty() ? nullptr : conf_files.c_str(),
+ &std::cerr,
+ CODE_ENVIRONMENT_DAEMON);
+ return seastar::do_with(std::move(conffile_paths), [this] (auto& paths) {
+ return seastar::repeat([path=paths.begin(), e=paths.end(), this]() mutable {
+ if (path == e) {
+        // tried all conffiles, none of them worked
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ return crimson::read_file(*path++).then([this](auto&& buf) {
+ return do_change([buf=std::move(buf), this](ConfigValues& values) {
+ if (get_config().parse_buffer(values, obs_mgr,
+ buf.get(), buf.size(),
+ &std::cerr) == 0) {
+ get_config().update_legacy_vals(values);
+ } else {
+ throw std::invalid_argument("parse error");
+ }
+ }).then([] {
+ // this one works!
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ });
+ }).handle_exception_type([] (const std::filesystem::filesystem_error&) {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ }).handle_exception_type([] (const std::invalid_argument&) {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ });
+}
+
+ConfigProxy::ShardedConfig ConfigProxy::sharded_conf;
+}
diff --git a/src/crimson/common/config_proxy.h b/src/crimson/common/config_proxy.h
new file mode 100644
index 000000000..4c0e65507
--- /dev/null
+++ b/src/crimson/common/config_proxy.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/config_obs_mgr.h"
+#include "common/errno.h"
+
+namespace ceph {
+class Formatter;
+}
+
+namespace crimson::common {
+
+// a facade for managing config. each shard has its own copy of ConfigProxy.
+//
+// In seastar-osd, there could be multiple instances of @c ConfigValues in a
+// single process, as we are using a variant of read-copy-update machinery to
+// update the settings at runtime.
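+//
+// A minimal usage sketch (illustrative only; the option name is hypothetical):
+// readers on any shard can call
+//   crimson::common::local_conf().get_val<uint64_t>("some_uint_option");
+// while writers await
+//   crimson::common::local_conf().set_val("some_uint_option", "42");
+// which copies the current ConfigValues on the owner shard, applies the change
+// there, and republishes the new values to every other shard (see do_change()).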
+class ConfigProxy : public seastar::peering_sharded_service<ConfigProxy>
+{
+ using LocalConfigValues = seastar::lw_shared_ptr<ConfigValues>;
+ seastar::foreign_ptr<LocalConfigValues> values;
+
+ md_config_t* remote_config = nullptr;
+ std::unique_ptr<md_config_t> local_config;
+
+ using ConfigObserver = ceph::md_config_obs_impl<ConfigProxy>;
+ ObserverMgr<ConfigObserver> obs_mgr;
+
+ const md_config_t& get_config() const {
+    return remote_config ? *remote_config : *local_config;
+ }
+ md_config_t& get_config() {
+    return remote_config ? *remote_config : *local_config;
+ }
+
+ // apply changes to all shards
+ // @param func a functor which accepts @c "ConfigValues&"
+ template<typename Func>
+ seastar::future<> do_change(Func&& func) {
+ return container().invoke_on(values.get_owner_shard(),
+ [func = std::move(func)](ConfigProxy& owner) {
+ // apply the changes to a copy of the values
+ auto new_values = seastar::make_lw_shared(*owner.values);
+ new_values->changed.clear();
+ func(*new_values);
+
+ // always apply the new settings synchronously on the owner shard, to
+      // avoid races with other do_change() calls in parallel.
+ ObserverMgr<ConfigObserver>::rev_obs_map rev_obs;
+ owner.values.reset(new_values);
+ owner.obs_mgr.for_each_change(owner.values->changed, owner,
+ [&rev_obs](ConfigObserver *obs,
+ const std::string &key) {
+ rev_obs[obs].insert(key);
+ }, nullptr);
+ for (auto& [obs, keys] : rev_obs) {
+ obs->handle_conf_change(owner, keys);
+ }
+
+ return seastar::parallel_for_each(boost::irange(1u, seastar::smp::count),
+ [&owner, new_values] (auto cpu) {
+ return owner.container().invoke_on(cpu,
+ [foreign_values = seastar::make_foreign(new_values)](ConfigProxy& proxy) mutable {
+ proxy.values.reset();
+ proxy.values = std::move(foreign_values);
+
+ ObserverMgr<ConfigObserver>::rev_obs_map rev_obs;
+ proxy.obs_mgr.for_each_change(proxy.values->changed, proxy,
+ [&rev_obs](ConfigObserver *obs, const std::string& key) {
+ rev_obs[obs].insert(key);
+ }, nullptr);
+ for (auto& obs_keys : rev_obs) {
+ obs_keys.first->handle_conf_change(proxy, obs_keys.second);
+ }
+ });
+ }).finally([new_values] {
+ new_values->changed.clear();
+ });
+ });
+ }
+public:
+ ConfigProxy(const EntityName& name, std::string_view cluster);
+ const ConfigValues* operator->() const noexcept {
+ return values.get();
+ }
+ const ConfigValues get_config_values() {
+ return *values.get();
+ }
+ ConfigValues* operator->() noexcept {
+ return values.get();
+ }
+
+ void get_config_bl(uint64_t have_version,
+ ceph::buffer::list *bl,
+ uint64_t *got_version) {
+ get_config().get_config_bl(get_config_values(), have_version,
+ bl, got_version);
+ }
+ void get_defaults_bl(ceph::buffer::list *bl) {
+ get_config().get_defaults_bl(get_config_values(), bl);
+ }
+ seastar::future<> start();
+ // required by sharded<>
+ seastar::future<> stop() {
+ return seastar::make_ready_future<>();
+ }
+ void add_observer(ConfigObserver* obs) {
+ obs_mgr.add_observer(obs);
+ }
+ void remove_observer(ConfigObserver* obs) {
+ obs_mgr.remove_observer(obs);
+ }
+ seastar::future<> rm_val(const std::string& key) {
+ return do_change([key, this](ConfigValues& values) {
+ auto ret = get_config().rm_val(values, key);
+ if (ret < 0) {
+ throw std::invalid_argument(cpp_strerror(ret));
+ }
+ });
+ }
+ seastar::future<> set_val(const std::string& key,
+ const std::string& val) {
+ return do_change([key, val, this](ConfigValues& values) {
+ std::stringstream err;
+ auto ret = get_config().set_val(values, obs_mgr, key, val, &err);
+ if (ret < 0) {
+ throw std::invalid_argument(err.str());
+ }
+ });
+ }
+ int get_val(std::string_view key, std::string *val) const {
+ return get_config().get_val(*values, key, val);
+ }
+ template<typename T>
+ const T get_val(std::string_view key) const {
+ return get_config().template get_val<T>(*values, key);
+ }
+
+ int get_all_sections(std::vector<std::string>& sections) const {
+ return get_config().get_all_sections(sections);
+ }
+
+ int get_val_from_conf_file(const std::vector<std::string>& sections,
+ const std::string& key, std::string& out,
+ bool expand_meta) const {
+ return get_config().get_val_from_conf_file(*values, sections, key,
+ out, expand_meta);
+ }
+
+ unsigned get_osd_pool_default_min_size(uint8_t size) const {
+ return get_config().get_osd_pool_default_min_size(*values, size);
+ }
+
+ seastar::future<>
+ set_mon_vals(const std::map<std::string,std::string,std::less<>>& kv) {
+ return do_change([kv, this](ConfigValues& values) {
+ get_config().set_mon_vals(nullptr, values, obs_mgr, kv, nullptr);
+ });
+ }
+
+ seastar::future<> inject_args(const std::string& s) {
+ return do_change([s, this](ConfigValues& values) {
+ std::stringstream err;
+ if (get_config().injectargs(values, obs_mgr, s, &err)) {
+ throw std::invalid_argument(err.str());
+ }
+ });
+ }
+ void show_config(ceph::Formatter* f) const;
+
+ seastar::future<> parse_argv(std::vector<const char*>& argv) {
+ // we could pass whatever is unparsed to seastar, but seastar::app_template
+ // is used for driving the seastar application, and
+ // crimson::common::ConfigProxy is not available until seastar engine is up
+ // and running, so we have to feed the command line args to app_template
+ // first, then pass them to ConfigProxy.
+ return do_change([&argv, this](ConfigValues& values) {
+ get_config().parse_argv(values,
+ obs_mgr,
+ argv,
+ CONF_CMDLINE);
+ });
+ }
+
+ seastar::future<> parse_env() {
+ return do_change([this](ConfigValues& values) {
+ get_config().parse_env(CEPH_ENTITY_TYPE_OSD,
+ values,
+ obs_mgr);
+ });
+ }
+
+ seastar::future<> parse_config_files(const std::string& conf_files);
+
+ using ShardedConfig = seastar::sharded<ConfigProxy>;
+
+private:
+ static ShardedConfig sharded_conf;
+ friend ConfigProxy& local_conf();
+ friend ShardedConfig& sharded_conf();
+};
+
+inline ConfigProxy& local_conf() {
+ return ConfigProxy::sharded_conf.local();
+}
+
+inline ConfigProxy::ShardedConfig& sharded_conf() {
+ return ConfigProxy::sharded_conf;
+}
+
+template<typename T>
+const T get_conf(const std::string& key) {
+ return local_conf().template get_val<T>(key);
+}
+
+}
diff --git a/src/crimson/common/errorator-loop.h b/src/crimson/common/errorator-loop.h
new file mode 100644
index 000000000..bb3b7fb15
--- /dev/null
+++ b/src/crimson/common/errorator-loop.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/common/errorator.h"
+
+
+namespace crimson {
+template <class... AllowedErrors>
+class parallel_for_each_state final : private seastar::continuation_base<> {
+ using future_t = typename errorator<AllowedErrors...>::template future<>;
+ std::vector<future_t> _incomplete;
+ seastar::promise<> _result;
+ std::exception_ptr _ex;
+private:
+ void wait_for_one() noexcept {
+ while (!_incomplete.empty() && _incomplete.back().available()) {
+ if (_incomplete.back().failed()) {
+ _ex = _incomplete.back().get_exception();
+ }
+ _incomplete.pop_back();
+ }
+ if (!_incomplete.empty()) {
+ seastar::internal::set_callback(std::move(_incomplete.back()),
+ static_cast<continuation_base<>*>(this));
+ _incomplete.pop_back();
+ return;
+ }
+ if (__builtin_expect(bool(_ex), false)) {
+ _result.set_exception(std::move(_ex));
+ } else {
+ _result.set_value();
+ }
+ delete this;
+ }
+ virtual void run_and_dispose() noexcept override {
+ if (_state.failed()) {
+ _ex = std::move(_state).get_exception();
+ }
+ _state = {};
+ wait_for_one();
+ }
+ task* waiting_task() noexcept override { return _result.waiting_task(); }
+public:
+ parallel_for_each_state(size_t n) {
+ _incomplete.reserve(n);
+ }
+ void add_future(future_t&& f) {
+ _incomplete.push_back(std::move(f));
+ }
+ future_t get_future() {
+ auto ret = _result.get_future();
+ wait_for_one();
+ return ret;
+ }
+};
+
+template <typename Iterator, typename Func, typename... AllowedErrors>
+static inline typename errorator<AllowedErrors...>::template future<>
+parallel_for_each(Iterator first, Iterator last, Func&& func) noexcept {
+ parallel_for_each_state<AllowedErrors...>* s = nullptr;
+  // Process all elements, giving each future the following treatment:
+  //   - available and not failed: do nothing
+  //   - failed or not yet available: collect it in s (allocating s if needed);
+  //     its failure, if any, is folded into the final result by the state
+ for (;first != last; ++first) {
+ auto f = seastar::futurize_invoke(std::forward<Func>(func), *first);
+ if (!f.available() || f.failed()) {
+ if (!s) {
+ using itraits = std::iterator_traits<Iterator>;
+ auto n = (seastar::internal::iterator_range_estimate_vector_capacity(
+ first, last, typename itraits::iterator_category()) + 1);
+ s = new parallel_for_each_state<AllowedErrors...>(n);
+ }
+ s->add_future(std::move(f));
+ }
+ }
+  // If any futures were collected, hand off to the parallel_for_each_state,
+  // which resolves once they all complete. Otherwise we can return a ready
+  // future immediately.
+ if (s) {
+ // s->get_future() takes ownership of s (and chains it to one of the futures it contains)
+ // so this isn't a leak
+ return s->get_future();
+ }
+ return seastar::make_ready_future<>();
+}
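+
+// Illustrative use (names are hypothetical): given a range of items and a
+// callable returning an errorated future,
+//   crimson::parallel_for_each(items.begin(), items.end(),
+//                              [](auto& item) { return process(item); });
+// invokes the callable for every element up front and resolves once all of the
+// returned futures have completed, propagating one of the collected failures if
+// any of them failed.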
+
+} // namespace crimson
diff --git a/src/crimson/common/errorator.h b/src/crimson/common/errorator.h
new file mode 100644
index 000000000..c5d63d5b9
--- /dev/null
+++ b/src/crimson/common/errorator.h
@@ -0,0 +1,1358 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <exception>
+#include <system_error>
+
+#include <seastar/core/future-util.hh>
+
+#include "crimson/common/utility.h"
+#include "include/ceph_assert.h"
+
+namespace crimson::interruptible {
+
+template <typename, typename>
+class parallel_for_each_state;
+
+template <typename, typename>
+class interruptible_future_detail;
+
+}
+
+namespace crimson {
+
+// crimson::do_for_each_state is the mirror of seastar::do_for_each_state with FutureT
+template <typename Iterator, typename AsyncAction, typename FutureT>
+class do_for_each_state final : public seastar::continuation_base<> {
+ Iterator _begin;
+ Iterator _end;
+ AsyncAction _action;
+ seastar::promise<> _pr;
+
+public:
+ do_for_each_state(Iterator begin, Iterator end, AsyncAction action,
+ FutureT&& first_unavailable)
+ : _begin(std::move(begin)), _end(std::move(end)), _action(std::move(action)) {
+ seastar::internal::set_callback(std::move(first_unavailable), this);
+ }
+ virtual void run_and_dispose() noexcept override {
+ std::unique_ptr<do_for_each_state> zis(this);
+ if (_state.failed()) {
+ _pr.set_urgent_state(std::move(_state));
+ return;
+ }
+ while (_begin != _end) {
+ auto f = seastar::futurize_invoke(_action, *_begin);
+ ++_begin;
+ if (f.failed()) {
+ f._forward_to(std::move(_pr));
+ return;
+ }
+ if (!f.available() || seastar::need_preempt()) {
+ _state = {};
+ seastar::internal::set_callback(std::move(f), this);
+ zis.release();
+ return;
+ }
+ }
+ _pr.set_value();
+ }
+ task* waiting_task() noexcept override {
+ return _pr.waiting_task();
+ }
+ FutureT get_future() {
+ return _pr.get_future();
+ }
+};
+
+template<typename Iterator, typename AsyncAction,
+ typename FutureT = std::invoke_result_t<AsyncAction, typename Iterator::reference>>
+inline FutureT do_for_each_impl(Iterator begin, Iterator end, AsyncAction action) {
+ while (begin != end) {
+ auto f = seastar::futurize_invoke(action, *begin);
+ ++begin;
+ if (f.failed()) {
+ return f;
+ }
+ if (!f.available() || seastar::need_preempt()) {
+ // s will be freed by run_and_dispose()
+ auto* s = new crimson::do_for_each_state<Iterator, AsyncAction, FutureT>{
+ std::move(begin), std::move(end), std::move(action), std::move(f)};
+ return s->get_future();
+ }
+ }
+ return seastar::make_ready_future<>();
+}
+
+template<typename Iterator, typename AsyncAction>
+inline auto do_for_each(Iterator begin, Iterator end, AsyncAction action) {
+ return ::crimson::do_for_each_impl(begin, end, std::move(action));
+}
+
+template<typename Container, typename AsyncAction>
+inline auto do_for_each(Container& c, AsyncAction action) {
+ return ::crimson::do_for_each(std::begin(c), std::end(c), std::move(action));
+}
+
+template<typename AsyncAction>
+inline auto repeat(AsyncAction action) {
+ using errorator_t =
+ typename ::seastar::futurize_t<std::invoke_result_t<AsyncAction>>::errorator_type;
+
+ while (true) {
+ auto f = ::seastar::futurize_invoke(action);
+ if (f.failed()) {
+ return errorator_t::template make_exception_future2<>(
+ f.get_exception()
+ );
+ } else if (f.available()) {
+ if (auto done = f.get0()) {
+ return errorator_t::template make_ready_future<>();
+ }
+ } else {
+ return std::move(f)._then(
+ [action = std::move(action)] (auto stop) mutable {
+ if (stop == seastar::stop_iteration::yes) {
+ return errorator_t::template make_ready_future<>();
+ }
+ return ::crimson::repeat(
+ std::move(action));
+ });
+ }
+ }
+}
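+
+// Illustrative use (names are hypothetical): the action yields a future of
+// seastar::stop_iteration, e.g.
+//   crimson::repeat([&] {
+//     return do_step().safe_then([](bool done) {
+//       return done ? seastar::stop_iteration::yes : seastar::stop_iteration::no;
+//     });
+//   });
+// which keeps re-invoking the action until it returns stop_iteration::yes or
+// fails.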
+
+// define the interface between error types and errorator
+template <class ConcreteErrorT>
+class error_t {
+ static constexpr const std::type_info& get_exception_ptr_type_info() {
+ return ConcreteErrorT::exception_ptr_type_info();
+ }
+
+ decltype(auto) static from_exception_ptr(std::exception_ptr ep) {
+ return ConcreteErrorT::from_exception_ptr(std::move(ep));
+ }
+
+ template <class... AllowedErrorsT>
+ friend struct errorator;
+
+ template <class ErrorVisitorT, class FuturatorT>
+ friend class maybe_handle_error_t;
+
+protected:
+ std::exception_ptr to_exception_ptr() const {
+ const auto* concrete_error = static_cast<const ConcreteErrorT*>(this);
+ return concrete_error->to_exception_ptr();
+ }
+
+public:
+ template <class Func>
+ static decltype(auto) handle(Func&& func) {
+ return ConcreteErrorT::handle(std::forward<Func>(func));
+ }
+};
+
+// unthrowable_wrapper ensures compilation failure when somebody
+// would like to `throw make_error<...>()` instead of returning.
+// Returning allows for the compile-time verification of the future's
+// AllowedErrorsV and also avoids the burden of throwing.
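+//
+// For instance (illustrative only), an error code is typically expressed as
+//   using enoent_t =
+//     unthrowable_wrapper<std::errc, std::errc::no_such_file_or_directory>;
+// and a call site reports it by returning `enoent_t::make()` from an errorated
+// future rather than throwing.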
+template <class ErrorT, ErrorT ErrorV>
+struct unthrowable_wrapper : error_t<unthrowable_wrapper<ErrorT, ErrorV>> {
+ unthrowable_wrapper(const unthrowable_wrapper&) = delete;
+ [[nodiscard]] static const auto& make() {
+ static constexpr unthrowable_wrapper instance{};
+ return instance;
+ }
+
+ static auto exception_ptr() {
+ return make().to_exception_ptr();
+ }
+
+ template<class Func>
+ static auto handle(Func&& func) {
+ return [
+ func = std::forward<Func>(func)
+ ] (const unthrowable_wrapper& raw_error) mutable -> decltype(auto) {
+ if constexpr (std::is_invocable_v<Func, ErrorT, decltype(raw_error)>) {
+ // check whether the handler wants to take the raw error object which
+ // would be the case if it wants conditionally handle-or-pass-further.
+ return std::invoke(std::forward<Func>(func),
+ ErrorV,
+ std::move(raw_error));
+ } else if constexpr (std::is_invocable_v<Func, ErrorT>) {
+ return std::invoke(std::forward<Func>(func), ErrorV);
+ } else {
+ return std::invoke(std::forward<Func>(func));
+ }
+ };
+ }
+
+ struct pass_further {
+ decltype(auto) operator()(const unthrowable_wrapper& e) {
+ return e;
+ }
+ };
+
+ struct discard {
+ decltype(auto) operator()(const unthrowable_wrapper&) {
+ }
+ };
+
+
+private:
+ // can be used only to initialize the `instance` member
+ explicit unthrowable_wrapper() = default;
+
+ // implement the errorable interface
+ struct throwable_carrier{};
+ static std::exception_ptr carrier_instance;
+
+ static constexpr const std::type_info& exception_ptr_type_info() {
+ return typeid(throwable_carrier);
+ }
+ auto to_exception_ptr() const {
+ // error codes don't need to instantiate `std::exception_ptr` each
+ // time as the code is actually a part of the type itself.
+ // `std::make_exception_ptr()` on modern enough GCCs is quite cheap
+ // (see the Gleb Natapov's patch eradicating throw/catch there),
+ // but using one instance per type boils down the overhead to just
+ // ref-counting.
+ return carrier_instance;
+ }
+ static const auto& from_exception_ptr(std::exception_ptr) {
+ return make();
+ }
+
+ friend class error_t<unthrowable_wrapper<ErrorT, ErrorV>>;
+};
+
+template <class ErrorT, ErrorT ErrorV>
+std::exception_ptr unthrowable_wrapper<ErrorT, ErrorV>::carrier_instance = \
+ std::make_exception_ptr<
+ unthrowable_wrapper<ErrorT, ErrorV>::throwable_carrier>({});
+
+
+template <class ErrorT>
+struct stateful_error_t : error_t<stateful_error_t<ErrorT>> {
+ template <class... Args>
+ explicit stateful_error_t(Args&&... args)
+ : ep(std::make_exception_ptr<ErrorT>(std::forward<Args>(args)...)) {
+ }
+
+ template<class Func>
+ static auto handle(Func&& func) {
+ return [
+ func = std::forward<Func>(func)
+ ] (stateful_error_t<ErrorT>&& e) mutable -> decltype(auto) {
+ if constexpr (std::is_invocable_v<Func>) {
+ return std::invoke(std::forward<Func>(func));
+ }
+ try {
+ std::rethrow_exception(e.ep);
+ } catch (const ErrorT& obj) {
+ if constexpr (std::is_invocable_v<Func, decltype(obj), decltype(e)>) {
+ return std::invoke(std::forward<Func>(func), obj, e);
+ } else if constexpr (std::is_invocable_v<Func, decltype(obj)>) {
+ return std::invoke(std::forward<Func>(func), obj);
+ }
+ }
+ ceph_abort_msg("exception type mismatch -- impossible!");
+ };
+ }
+
+private:
+ std::exception_ptr ep;
+
+ explicit stateful_error_t(std::exception_ptr ep) : ep(std::move(ep)) {}
+
+ static constexpr const std::type_info& exception_ptr_type_info() {
+ return typeid(ErrorT);
+ }
+ auto to_exception_ptr() const {
+ return ep;
+ }
+ static stateful_error_t<ErrorT> from_exception_ptr(std::exception_ptr ep) {
+ return stateful_error_t<ErrorT>(std::move(ep));
+ }
+
+ friend class error_t<stateful_error_t<ErrorT>>;
+};
+
+namespace _impl {
+ template <class T> struct always_false : std::false_type {};
+};
+
+template <class ErrorVisitorT, class FuturatorT>
+class maybe_handle_error_t {
+ const std::type_info& type_info;
+ typename FuturatorT::type result;
+ ErrorVisitorT errfunc;
+
+public:
+ maybe_handle_error_t(ErrorVisitorT&& errfunc, std::exception_ptr ep)
+ : type_info(*ep.__cxa_exception_type()),
+ result(FuturatorT::make_exception_future(std::move(ep))),
+ errfunc(std::forward<ErrorVisitorT>(errfunc)) {
+ }
+
+ template <class ErrorT>
+ void handle() {
+ static_assert(std::is_invocable<ErrorVisitorT, ErrorT>::value,
+ "provided Error Visitor is not exhaustive");
+ // In C++ throwing an exception isn't the sole way to signal
+ // error with it. This approach nicely fits cold, infrequent cases
+ // but when applied to a hot one, it will likely hurt performance.
+ //
+    // An alternative approach is to create `std::exception_ptr` on our
+ // own and place it in the future via `make_exception_future()`.
+ // When it comes to handling, the pointer can be interrogated for
+ // pointee's type with `__cxa_exception_type()` instead of costly
+ // re-throwing (via `std::rethrow_exception()`) and matching with
+    // `catch`. The limitation here is the lack of support for hierarchies
+    // of exceptions. The code below checks for an exact match only, while
+    // `catch` would allow matching against a base class as well.
+ // However, this shouldn't be a big issue for `errorator` as Error
+ // Visitors are already checked for exhaustiveness at compile-time.
+ //
+ // NOTE: `__cxa_exception_type()` is an extension of the language.
+ // It should be available both in GCC and Clang but a fallback
+ // (based on `std::rethrow_exception()` and `catch`) can be made
+ // to handle other platforms if necessary.
+ if (type_info == ErrorT::error_t::get_exception_ptr_type_info()) {
+ // set `state::invalid` in internals of `seastar::future` to not
+ // call `report_failed_future()` during `operator=()`.
+ [[maybe_unused]] auto&& ep = std::move(result).get_exception();
+
+ using return_t = std::invoke_result_t<ErrorVisitorT, ErrorT>;
+ if constexpr (std::is_assignable_v<decltype(result), return_t>) {
+ result = std::invoke(std::forward<ErrorVisitorT>(errfunc),
+ ErrorT::error_t::from_exception_ptr(std::move(ep)));
+ } else if constexpr (std::is_same_v<return_t, void>) {
+        // void denotes explicit discarding; execute only for the sake of
+        // side effects. Typically this boils down to the handler throwing
+        // an exception.
+ std::invoke(std::forward<ErrorVisitorT>(errfunc),
+ ErrorT::error_t::from_exception_ptr(std::move(ep)));
+ } else if constexpr (seastar::Future<decltype(result)>) {
+ // result is seastar::future but return_t is e.g. int. If so,
+ // the else clause cannot be used as seastar::future lacks
+ // errorator_type member.
+ result = seastar::make_ready_future<return_t>(
+ std::invoke(std::forward<ErrorVisitorT>(errfunc),
+ ErrorT::error_t::from_exception_ptr(std::move(ep))));
+ } else {
+ result = FuturatorT::type::errorator_type::template make_ready_future<return_t>(
+ std::invoke(std::forward<ErrorVisitorT>(errfunc),
+ ErrorT::error_t::from_exception_ptr(std::move(ep))));
+ }
+ }
+ }
+
+ auto get_result() && {
+ return std::move(result);
+ }
+};
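+// The comment in `handle()` above mentions that a portable fallback, based
+// on `std::rethrow_exception()` and `catch`, could replace the
+// `__cxa_exception_type()` probe. A minimal illustrative sketch of such a
+// check (`CarriedT` stands for whatever type an error stores in its
+// `std::exception_ptr`; this helper is not part of errorator itself):
+//
+//   template <class CarriedT>
+//   bool holds_exception(const std::exception_ptr& ep) noexcept {
+//     try {
+//       std::rethrow_exception(ep);
+//     } catch (const CarriedT&) {
+//       return true;   // note: unlike the exact-match check above, this
+//     } catch (...) {  // also matches types derived from CarriedT
+//     }
+//     return false;
+//   }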
+
+template <class FuncHead, class... FuncTail>
+static constexpr auto composer(FuncHead&& head, FuncTail&&... tail) {
+ return [
+ head = std::forward<FuncHead>(head),
+ // perfect forwarding in lambda's closure isn't available in C++17
+ // using tuple as workaround; see: https://stackoverflow.com/a/49902823
+ tail = std::make_tuple(std::forward<FuncTail>(tail)...)
+ ] (auto&&... args) mutable -> decltype(auto) {
+ if constexpr (std::is_invocable_v<FuncHead, decltype(args)...>) {
+ return std::invoke(std::forward<FuncHead>(head),
+ std::forward<decltype(args)>(args)...);
+ } else if constexpr (sizeof...(FuncTail) > 0) {
+ using next_composer_t = decltype(composer<FuncTail...>);
+ auto&& next = std::apply<next_composer_t>(composer<FuncTail...>,
+ std::move(tail));
+ return std::invoke(std::move(next),
+ std::forward<decltype(args)>(args)...);
+ } else {
+ static_assert(
+ std::is_invocable_v<FuncHead, decltype(args)...> ||
+ (sizeof...(FuncTail) > 0),
+ "composition is not exhaustive");
+ }
+ };
+}
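+// Illustrative sketch of how the composed callable dispatches (handler
+// names below are hypothetical): the first handler that is invocable with
+// the actual argument wins, otherwise the call falls through to the tail.
+//
+//   auto on_int  = [] (int v)             { /* handle ints */ };
+//   auto on_code = [] (std::error_code e) { /* handle error codes */ };
+//   auto handler = composer(on_int, on_code);
+//   handler(42);                  // invokes on_int
+//   handler(std::error_code{});   // on_int isn't invocable -> on_code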
+
+template <class ValueT>
+struct errorated_future_marker{};
+
+template <class... AllowedErrors>
+class parallel_for_each_state;
+
+template <class T>
+static inline constexpr bool is_error_v = std::is_base_of_v<error_t<T>, T>;
+
+template <typename... AllowedErrors>
+struct errorator;
+
+template <typename Iterator, typename Func, typename... AllowedErrors>
+static inline typename errorator<AllowedErrors...>::template future<>
+parallel_for_each(Iterator first, Iterator last, Func&& func) noexcept;
+
+template <class... AllowedErrors>
+struct errorator {
+
+ static_assert((... && is_error_v<AllowedErrors>),
+ "errorator expects presence of ::is_error in all error types");
+
+ template <class ErrorT>
+ struct contains_once {
+ static constexpr bool value =
+ (0 + ... + std::is_same_v<ErrorT, AllowedErrors>) == 1;
+ };
+ template <class... Errors>
+ struct contains_once<errorator<Errors...>> {
+ static constexpr bool value = (... && contains_once<Errors>::value);
+ };
+ template <class T>
+ static constexpr bool contains_once_v = contains_once<T>::value;
+
+ static_assert((... && contains_once_v<AllowedErrors>),
+ "no error type in errorator can be duplicated");
+
+ struct ready_future_marker{};
+ struct exception_future_marker{};
+
+private:
+ // see the comment for `using future = _future` below.
+ template <class>
+ class [[nodiscard]] _future {};
+ template <class ValueT>
+ class [[nodiscard]] _future<::crimson::errorated_future_marker<ValueT>>
+ : private seastar::future<ValueT> {
+ using base_t = seastar::future<ValueT>;
+ // we need the friendship for the sake of `get_exception() &&` when
+ // `safe_then()` is going to return an errorated future as a result of
+    // chaining. In contrast to `seastar::future`, `errorator<T...>::future`
+ // has this member private.
+ template <class ErrorVisitor, class Futurator>
+ friend class maybe_handle_error_t;
+
+ // any `seastar::futurize` specialization must be able to access the base.
+ // see : `satisfy_with_result_of()` far below.
+ template <typename>
+ friend struct seastar::futurize;
+
+ template <typename T1, typename T2, typename... More>
+ friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more);
+
+ template <class, class = std::void_t<>>
+ struct get_errorator {
+ // generic template for non-errorated things (plain types and
+ // vanilla seastar::future as well).
+ using type = errorator<>;
+ };
+ template <class FutureT>
+ struct get_errorator<FutureT,
+ std::void_t<typename FutureT::errorator_type>> {
+ using type = typename FutureT::errorator_type;
+ };
+ template <class T>
+ using get_errorator_t = typename get_errorator<T>::type;
+
+ template <class ValueFuncErroratorT, class... ErrorVisitorRetsT>
+ struct make_errorator {
+ // NOP. The generic template.
+ };
+ template <class... ValueFuncAllowedErrors,
+ class ErrorVisitorRetsHeadT,
+ class... ErrorVisitorRetsTailT>
+ struct make_errorator<errorator<ValueFuncAllowedErrors...>,
+ ErrorVisitorRetsHeadT,
+ ErrorVisitorRetsTailT...> {
+ private:
+ using step_errorator = errorator<ValueFuncAllowedErrors...>;
+ // add ErrorVisitorRetsHeadT only if 1) it's an error type and
+ // 2) isn't already included in the errorator's error set.
+ // It's enough to negate contains_once_v as any errorator<...>
+ // type is already guaranteed to be free of duplications.
+ using _next_errorator = std::conditional_t<
+ is_error_v<ErrorVisitorRetsHeadT> &&
+ !step_errorator::template contains_once_v<ErrorVisitorRetsHeadT>,
+ typename step_errorator::template extend<ErrorVisitorRetsHeadT>,
+ step_errorator>;
+ using maybe_head_ertr = get_errorator_t<ErrorVisitorRetsHeadT>;
+ using next_errorator =
+ typename _next_errorator::template extend_ertr<maybe_head_ertr>;
+
+ public:
+ using type = typename make_errorator<next_errorator,
+ ErrorVisitorRetsTailT...>::type;
+ };
+ // finish the recursion
+ template <class... ValueFuncAllowedErrors>
+ struct make_errorator<errorator<ValueFuncAllowedErrors...>> {
+ using type = ::crimson::errorator<ValueFuncAllowedErrors...>;
+ };
+ template <class... Args>
+ using make_errorator_t = typename make_errorator<Args...>::type;
+
+ using base_t::base_t;
+
+ template <class Futurator, class Future, class ErrorVisitor>
+ [[gnu::noinline]]
+ static auto _safe_then_handle_errors(Future&& future,
+ ErrorVisitor&& errfunc) {
+ maybe_handle_error_t<ErrorVisitor, Futurator> maybe_handle_error(
+ std::forward<ErrorVisitor>(errfunc),
+ std::move(future).get_exception()
+ );
+ (maybe_handle_error.template handle<AllowedErrors>() , ...);
+ return std::move(maybe_handle_error).get_result();
+ }
+
+ protected:
+ using base_t::get_exception;
+ public:
+ using errorator_type = ::crimson::errorator<AllowedErrors...>;
+ using promise_type = seastar::promise<ValueT>;
+
+ using base_t::available;
+ using base_t::failed;
+ // need this because of the legacy in PG::do_osd_ops().
+ using base_t::handle_exception_type;
+
+ [[gnu::always_inline]]
+ _future(base_t&& base)
+ : base_t(std::move(base)) {
+ }
+
+ base_t to_base() && {
+ return std::move(*this);
+ }
+
+ template <class... A>
+ [[gnu::always_inline]]
+ _future(ready_future_marker, A&&... a)
+ : base_t(::seastar::make_ready_future<ValueT>(std::forward<A>(a)...)) {
+ }
+ [[gnu::always_inline]]
+ _future(exception_future_marker, ::seastar::future_state_base&& state) noexcept
+ : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(state))) {
+ }
+ [[gnu::always_inline]]
+ _future(exception_future_marker, std::exception_ptr&& ep) noexcept
+ : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(ep))) {
+ }
+
+ template <template <class...> class ErroratedFuture,
+ class = std::void_t<
+ typename ErroratedFuture<
+ ::crimson::errorated_future_marker<ValueT>>::errorator_type>>
+ operator ErroratedFuture<errorated_future_marker<ValueT>> () && {
+ using dest_errorator_t = \
+ typename ErroratedFuture<
+ ::crimson::errorated_future_marker<ValueT>>::errorator_type;
+ static_assert(dest_errorator_t::template contains_once_v<errorator_type>,
+                    "conversion is possible only to an equally or more errorated future!");
+ return static_cast<base_t&&>(*this);
+ }
+
+ // initialize future as failed without throwing. `make_exception_future()`
+ // internally uses `std::make_exception_ptr()`. cppreference.com shouldn't
+ // be misinterpreted when it says:
+ //
+ // "This is done as if executing the following code:
+ // try {
+ // throw e;
+ // } catch(...) {
+ // return std::current_exception();
+ // }",
+ //
+ // the "as if" is absolutely crucial because modern GCCs employ optimized
+ // path for it. See:
+ // * https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=cce8e59224e18858749a2324bce583bcfd160d6c,
+ // * https://gcc.gnu.org/ml/gcc-patches/2016-08/msg00373.html.
+ //
+ // This behavior, combined with `__cxa_exception_type()` for inspecting
+ // exception's type, allows for throw/catch-free handling of stateless
+    // exceptions (which is fine for error codes). Stateful exceptions would
+    // actually be a bit harder as `_M_get()` is private, and thus rethrowing is
+ // necessary to get to the state inside. However, it's not unthinkable to
+ // see another extension bringing operator*() to the exception pointer...
+ //
+    // TODO: we don't really need to `make_exception_ptr` each time. It still
+    // allocates memory underneath while it could be replaced with a single
+    // instance per type created at start-up.
+ template <class ErrorT,
+ class DecayedT = std::decay_t<ErrorT>,
+ bool IsError = is_error_v<DecayedT>,
+ class = std::enable_if_t<IsError>>
+ _future(ErrorT&& e)
+ : base_t(
+ seastar::make_exception_future<ValueT>(
+ errorator_type::make_exception_ptr(e))) {
+ static_assert(errorator_type::contains_once_v<DecayedT>,
+ "ErrorT is not enlisted in errorator");
+ }
+
+ template <class ValueFuncT, class ErrorVisitorT>
+ auto safe_then(ValueFuncT&& valfunc, ErrorVisitorT&& errfunc) {
+ static_assert((... && std::is_invocable_v<ErrorVisitorT,
+ AllowedErrors>),
+ "provided Error Visitor is not exhaustive");
+ static_assert(std::is_void_v<ValueT> ? std::is_invocable_v<ValueFuncT>
+ : std::is_invocable_v<ValueFuncT, ValueT>,
+ "Value Func is not invocable with future's value");
+ using value_func_result_t =
+ typename std::conditional_t<std::is_void_v<ValueT>,
+ std::invoke_result<ValueFuncT>,
+ std::invoke_result<ValueFuncT, ValueT>>::type;
+ // recognize whether there can be any error coming from the Value
+ // Function.
+ using value_func_errorator_t = get_errorator_t<value_func_result_t>;
+ // mutate the Value Function's errorator to harvest errors coming
+ // from the Error Visitor. Yes, it's perfectly fine to fail error
+      // handling at one step and delegate an even broader set of issues
+      // to the next continuation.
+ using return_errorator_t = make_errorator_t<
+ value_func_errorator_t,
+ std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>;
+ // OK, now we know about all errors next continuation must take
+ // care about. If Visitor handled everything and the Value Func
+ // doesn't return any, we'll finish with errorator<>::future
+      // which is just vanilla seastar::future; that's it, the next
+      // continuation could finally use `.then()`!
+ using futurator_t = \
+ typename return_errorator_t::template futurize<value_func_result_t>;
+ // `seastar::futurize`, used internally by `then_wrapped()`, would
+ // wrap any non-`seastar::future` type coming from Value Func into
+ // `seastar::future`. As we really don't want to end with things
+ // like `seastar::future<errorator::future<...>>`, we need either:
+      // * convert the errorated future into a plain one in the lambda below
+      //   and back here, or
+      // * specialize `seastar::futurize<T>` to get the proper kind of
+      //   future directly from `::then_wrapped()`.
+      // As C++17 doesn't guarantee copy elision when different types are
+      // involved, and examination of GCC 8.1 assembly confirmed the extra
+      // copying, the switch to the second approach has been made.
+ return this->then_wrapped(
+ [ valfunc = std::forward<ValueFuncT>(valfunc),
+ errfunc = std::forward<ErrorVisitorT>(errfunc)
+ ] (auto&& future) mutable noexcept {
+ if (__builtin_expect(future.failed(), false)) {
+ return _safe_then_handle_errors<futurator_t>(
+ std::move(future), std::forward<ErrorVisitorT>(errfunc));
+ } else {
+ // NOTE: using `seastar::future::get()` here is a bit bloaty
+ // as the method rechecks availability of future's value and,
+ // if it's unavailable, does the `::do_wait()` path (yes, it
+ // targets `seastar::thread`). Actually this is dead code as
+ // `then_wrapped()` executes the lambda only when the future
+ // is available (which means: failed or ready). However, GCC
+ // hasn't optimized it out:
+ //
+ // if (__builtin_expect(future.failed(), false)) {
+ // ea25: 48 83 bd c8 fe ff ff cmpq $0x2,-0x138(%rbp)
+ // ea2c: 02
+ // ea2d: 0f 87 f0 05 00 00 ja f023 <ceph::osd::
+ // ...
+ // /// If get() is called in a \ref seastar::thread context,
+ // /// then it need not be available; instead, the thread will
+ // /// be paused until the future becomes available.
+ // [[gnu::always_inline]]
+ // std::tuple<T...> get() {
+ // if (!_state.available()) {
+ // ea3a: 0f 85 1b 05 00 00 jne ef5b <ceph::osd::
+ // }
+ // ...
+ //
+          // I don't perceive this as a huge issue. Though, it cannot be
+          // claimed errorator has zero overhead on the hot path. The perfect
+          // solution here would be to mark `::get_available_state()`
+          // as `protected` and use a dedicated `get_value()` exactly as
+          // `::then()` already does.
+ return futurator_t::invoke(std::forward<ValueFuncT>(valfunc),
+ std::move(future).get());
+ }
+ });
+ }
+
+ /**
+     * unsafe_get / unsafe_get0
+     *
+     * Only valid within a seastar thread. Ignores errorator protections
+ * and throws any contained exceptions.
+ *
+ * Should really only be used within test code
+ * (see test/crimson/gtest_seastar.h).
+ */
+ auto &&unsafe_get() {
+ return seastar::future<ValueT>::get();
+ }
+ auto unsafe_get0() {
+ return seastar::future<ValueT>::get0();
+ }
+
+ template <class FuncT>
+ _future finally(FuncT &&func) {
+ return this->then_wrapped(
+ [func = std::forward<FuncT>(func)](auto &&result) mutable noexcept {
+ if constexpr (seastar::InvokeReturnsAnyFuture<FuncT>) {
+ return ::seastar::futurize_invoke(std::forward<FuncT>(func)).then_wrapped(
+ [result = std::move(result)](auto&& f_res) mutable {
+ // TODO: f_res.failed()
+ (void)f_res.discard_result();
+ return std::move(result);
+ });
+ } else {
+ try {
+ func();
+ } catch (...) {
+ // TODO: rethrow
+ }
+ return std::move(result);
+ }
+ });
+ }
+
+ _future<::crimson::errorated_future_marker<void>>
+ discard_result() noexcept {
+ return safe_then([](auto&&) {});
+ }
+
+    // taking ErrorFuncHead separately from ErrorFuncTail
+    // to avoid SFINAE
+ template <class ValueFunc,
+ class ErrorFuncHead,
+ class... ErrorFuncTail>
+ auto safe_then(ValueFunc&& value_func,
+ ErrorFuncHead&& error_func_head,
+ ErrorFuncTail&&... error_func_tail) {
+ static_assert(sizeof...(ErrorFuncTail) > 0);
+ return safe_then(
+ std::forward<ValueFunc>(value_func),
+ composer(std::forward<ErrorFuncHead>(error_func_head),
+ std::forward<ErrorFuncTail>(error_func_tail)...));
+ }
+
+ template <class ValueFunc>
+ auto safe_then(ValueFunc&& value_func) {
+ return safe_then(std::forward<ValueFunc>(value_func),
+ errorator_type::pass_further{});
+ }
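+    // Usage sketch (illustrative only; `maybe_read()` is a hypothetical
+    // function returning an errorated future). Handling every error in the
+    // visitor leaves a plain `seastar::future` for the next continuation:
+    //
+    //   using ertr = crimson::errorator<crimson::ct_error::enoent>;
+    //   ertr::future<int> maybe_read();
+    //
+    //   seastar::future<int> read_or_default() {
+    //     return maybe_read().safe_then(
+    //       [] (int v) { return v; },
+    //       ertr::all_same_way([] { return -1; }));
+    //   }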
+
+ template <class ValueFunc,
+ class... ErrorFuncs>
+ auto safe_then_unpack(ValueFunc&& value_func,
+ ErrorFuncs&&... error_funcs) {
+ return safe_then(
+ [value_func=std::move(value_func)] (ValueT&& tuple) mutable {
+ assert_moveable(value_func);
+ return std::apply(std::move(value_func), std::move(tuple));
+ },
+ std::forward<ErrorFuncs>(error_funcs)...
+ );
+ }
+
+ template <class Func>
+ void then(Func&&) = delete;
+
+ template <class ErrorVisitorT>
+ auto handle_error(ErrorVisitorT&& errfunc) {
+ static_assert((... && std::is_invocable_v<ErrorVisitorT,
+ AllowedErrors>),
+ "provided Error Visitor is not exhaustive");
+ using return_errorator_t = make_errorator_t<
+ errorator<>,
+ std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>;
+ using futurator_t = \
+ typename return_errorator_t::template futurize<::seastar::future<ValueT>>;
+ return this->then_wrapped(
+ [ errfunc = std::forward<ErrorVisitorT>(errfunc)
+ ] (auto&& future) mutable noexcept {
+ if (__builtin_expect(future.failed(), false)) {
+ return _safe_then_handle_errors<futurator_t>(
+ std::move(future), std::forward<ErrorVisitorT>(errfunc));
+ } else {
+ return typename futurator_t::type{ std::move(future) };
+ }
+ });
+ }
+
+ template <class ErrorFuncHead,
+ class... ErrorFuncTail>
+ auto handle_error(ErrorFuncHead&& error_func_head,
+ ErrorFuncTail&&... error_func_tail) {
+ static_assert(sizeof...(ErrorFuncTail) > 0);
+ return this->handle_error(
+ composer(std::forward<ErrorFuncHead>(error_func_head),
+ std::forward<ErrorFuncTail>(error_func_tail)...));
+ }
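+    // Usage sketch (illustrative only; `maybe_stat()` is hypothetical).
+    // handle_error() leaves the value path untouched, so it is handy for
+    // adapting an errorated future back to plain seastar code:
+    //
+    //   using ertr = crimson::errorator<crimson::ct_error::enoent,
+    //                                   crimson::ct_error::input_output_error>;
+    //   ertr::future<> maybe_stat();
+    //
+    //   seastar::future<> stat_or_ignore() {
+    //     return maybe_stat().handle_error(
+    //       crimson::ct_error::enoent::handle([] {
+    //         return seastar::now();  // treat a missing object as success
+    //       }),
+    //       crimson::ct_error::assert_all{"unexpected I/O error"});
+    //   }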
+
+ private:
+ // for ::crimson::do_for_each
+ template <class Func>
+ auto _then(Func&& func) {
+ return base_t::then(std::forward<Func>(func));
+ }
+ template <class T>
+ auto _forward_to(T&& pr) {
+ return base_t::forward_to(std::forward<T>(pr));
+ }
+ template<typename Iterator, typename AsyncAction>
+ friend inline auto ::crimson::do_for_each(Iterator begin,
+ Iterator end,
+ AsyncAction action);
+
+ template <typename Iterator, typename AsyncAction, typename FutureT>
+ friend class ::crimson::do_for_each_state;
+
+ template<typename AsyncAction>
+ friend inline auto ::crimson::repeat(AsyncAction action);
+
+ template <typename Result>
+ friend class ::seastar::future;
+
+    // let seastar::do_with_impl up-cast us to seastar::future.
+ template<typename T, typename F>
+ friend inline auto ::seastar::internal::do_with_impl(T&& rvalue, F&& f);
+ template<typename T1, typename T2, typename T3_or_F, typename... More>
+ friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more);
+ template<typename, typename>
+ friend class ::crimson::interruptible::interruptible_future_detail;
+ friend class ::crimson::parallel_for_each_state<AllowedErrors...>;
+ template <typename IC, typename FT>
+ friend class ::crimson::interruptible::parallel_for_each_state;
+ };
+
+ class Enabler {};
+
+ template <typename T>
+ using EnableIf = typename std::enable_if<contains_once_v<std::decay_t<T>>, Enabler>::type;
+
+ template <typename ErrorFunc>
+ struct all_same_way_t {
+ ErrorFunc func;
+ all_same_way_t(ErrorFunc &&error_func)
+ : func(std::forward<ErrorFunc>(error_func)) {}
+
+ template <typename ErrorT, EnableIf<ErrorT>...>
+ decltype(auto) operator()(ErrorT&& e) {
+ using decayed_t = std::decay_t<decltype(e)>;
+ auto&& handler =
+ decayed_t::error_t::handle(std::forward<ErrorFunc>(func));
+ static_assert(std::is_invocable_v<decltype(handler), ErrorT>);
+ return std::invoke(std::move(handler), std::forward<ErrorT>(e));
+ }
+ };
+
+public:
+ // HACK: `errorated_future_marker` and `_future` is just a hack to
+ // specialize `seastar::futurize` for category of class templates:
+ // `future<...>` from distinct errorators. Such tricks are usually
+  // performed based on SFINAE and `std::void_t` to check for the existence
+  // of a trait/member (`future<...>::errorator_type` in our case).
+  // Unfortunately, this technique can't be applied here as `futurize`
+  // lacks the optional parameter. The problem looks awfully similar
+  // to the following SO item: https://stackoverflow.com/a/38860413.
+ template <class ValueT=void>
+ using future = _future<::crimson::errorated_future_marker<ValueT>>;
+
+ // the visitor that forwards handling of all errors to next continuation
+ struct pass_further {
+ template <class ErrorT, EnableIf<ErrorT>...>
+ decltype(auto) operator()(ErrorT&& e) {
+ static_assert(contains_once_v<std::decay_t<ErrorT>>,
+ "passing further disallowed ErrorT");
+ return std::forward<ErrorT>(e);
+ }
+ };
+
+ struct discard_all {
+ template <class ErrorT, EnableIf<ErrorT>...>
+ void operator()(ErrorT&&) {
+ static_assert(contains_once_v<std::decay_t<ErrorT>>,
+ "discarding disallowed ErrorT");
+ }
+ };
+
+ template <typename T>
+ static future<T> make_errorator_future(seastar::future<T>&& fut) {
+ return std::move(fut);
+ }
+
+ // assert_all{ "TODO" };
+ class assert_all {
+ const char* const msg = nullptr;
+ public:
+ template <std::size_t N>
+ assert_all(const char (&msg)[N])
+ : msg(msg) {
+ }
+ assert_all() = default;
+
+ template <class ErrorT, EnableIf<ErrorT>...>
+ void operator()(ErrorT&&) {
+ static_assert(contains_once_v<std::decay_t<ErrorT>>,
+ "discarding disallowed ErrorT");
+ if (msg) {
+ ceph_abort_msg(msg);
+ } else {
+ ceph_abort();
+ }
+ }
+ };
+
+ template <class ErrorFunc>
+ static decltype(auto) all_same_way(ErrorFunc&& error_func) {
+ return all_same_way_t<ErrorFunc>{std::forward<ErrorFunc>(error_func)};
+ };
+
+ // get a new errorator by extending current one with new errors
+ template <class... NewAllowedErrorsT>
+ using extend = errorator<AllowedErrors..., NewAllowedErrorsT...>;
+
+  // get a new errorator by summing and deduplicating the error set of
+  // the errorator `unify<>` is applied on with that of another errorator
+  // provided as a template parameter.
+ template <class OtherErroratorT>
+ struct unify {
+ // 1st: generic NOP template
+ };
+ template <class OtherAllowedErrorsHead,
+ class... OtherAllowedErrorsTail>
+ struct unify<errorator<OtherAllowedErrorsHead,
+ OtherAllowedErrorsTail...>> {
+ private:
+ // 2nd: specialization for errorators with non-empty error set.
+ //
+    // split the error set of the other errorator, passed as a template
+    // param, into head and tail. Mix the error set of this errorator with
+    // the head of the other one only if it isn't already present in the set.
+ using step_errorator = std::conditional_t<
+ contains_once_v<OtherAllowedErrorsHead> == false,
+ errorator<AllowedErrors..., OtherAllowedErrorsHead>,
+ errorator<AllowedErrors...>>;
+ using rest_errorator = errorator<OtherAllowedErrorsTail...>;
+
+ public:
+ using type = typename step_errorator::template unify<rest_errorator>::type;
+ };
+ template <class... EmptyPack>
+ struct unify<errorator<EmptyPack...>> {
+ // 3rd: recursion finisher
+ static_assert(sizeof...(EmptyPack) == 0);
+ using type = errorator<AllowedErrors...>;
+ };
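+  // Illustrative sketch of how `extend` and `unify` differ (the
+  // `ct_error::*` aliases are defined further down in this file):
+  // `extend` simply appends new errors, while `unify` also deduplicates.
+  //
+  //   using a_ertr = errorator<ct_error::enoent, ct_error::invarg>;
+  //   using b_ertr = errorator<ct_error::invarg, ct_error::eexist>;
+  //   using c1 = a_ertr::extend<ct_error::eexist>;  // enoent, invarg, eexist
+  //   using c2 = a_ertr::unify<b_ertr>::type;       // enoent, invarg, eexist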
+
+ // get a new errorator by extending current one with another errorator
+ template <class E>
+ using extend_ertr = typename unify<E>::type;
+
+ template <typename T=void, typename... A>
+ static future<T> make_ready_future(A&&... value) {
+ return future<T>(ready_future_marker(), std::forward<A>(value)...);
+ }
+
+ template <typename T=void>
+ static
+ future<T> make_exception_future2(std::exception_ptr&& ex) noexcept {
+ return future<T>(exception_future_marker(), std::move(ex));
+ }
+ template <typename T=void>
+ static
+ future<T> make_exception_future2(seastar::future_state_base&& state) noexcept {
+ return future<T>(exception_future_marker(), std::move(state));
+ }
+ template <typename T=void, typename Exception>
+ static
+ future<T> make_exception_future2(Exception&& ex) noexcept {
+ return make_exception_future2<T>(std::make_exception_ptr(std::forward<Exception>(ex)));
+ }
+
+ static auto now() {
+ return make_ready_future<>();
+ }
+
+ template <typename Container, typename Func>
+ static inline auto parallel_for_each(Container&& container, Func&& func) noexcept {
+ return crimson::parallel_for_each<decltype(std::begin(container)), Func, AllowedErrors...>(
+ std::begin(container),
+ std::end(container),
+ std::forward<Func>(func));
+ }
+
+ template <typename Iterator, typename Func>
+ static inline errorator<AllowedErrors...>::future<>
+ parallel_for_each(Iterator first, Iterator last, Func&& func) noexcept {
+ return crimson::parallel_for_each<Iterator, Func, AllowedErrors...>(
+ first,
+ last,
+ std::forward<Func>(func));
+ }
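+  // Usage sketch (illustrative only; `ertr` and `process()` are
+  // hypothetical). The returned future carries this errorator's full error
+  // set, so the caller still handles it as usual:
+  //
+  //   ertr::future<> process(int item);
+  //   ertr::future<> process_all(std::vector<int>& items) {
+  //     return ertr::parallel_for_each(items, [] (int item) {
+  //       return process(item);
+  //     });
+  //   }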
+private:
+ template <class T>
+ class futurize {
+ using vanilla_futurize = seastar::futurize<T>;
+
+    // explicit specialization of a nested type is not allowed unless both
+    // the member template and the enclosing template are specialized; see
+    // section [temp.expl.spec] in N4659.
+ template <class Stored, int Dummy = 0>
+ struct stored_to_future {
+ using type = future<Stored>;
+ };
+ template <int Dummy>
+ struct stored_to_future <seastar::internal::monostate, Dummy> {
+ using type = future<>;
+ };
+
+ public:
+ using type =
+ typename stored_to_future<typename vanilla_futurize::value_type>::type;
+
+ template <class Func, class... Args>
+ static type invoke(Func&& func, Args&&... args) {
+ try {
+ return vanilla_futurize::invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func>
+ static type invoke(Func&& func, seastar::internal::monostate) {
+ try {
+ return vanilla_futurize::invoke(std::forward<Func>(func));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ static type make_exception_future(Arg&& arg) {
+ return vanilla_futurize::make_exception_future(std::forward<Arg>(arg));
+ }
+ };
+ template <template <class...> class ErroratedFutureT,
+ class ValueT>
+ class futurize<ErroratedFutureT<::crimson::errorated_future_marker<ValueT>>> {
+ public:
+ using type = ::crimson::errorator<AllowedErrors...>::future<ValueT>;
+
+ template <class Func, class... Args>
+ static type invoke(Func&& func, Args&&... args) {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func>
+ static type invoke(Func&& func, seastar::internal::monostate) {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ static type make_exception_future(Arg&& arg) {
+ return ::crimson::errorator<AllowedErrors...>::make_exception_future2<ValueT>(std::forward<Arg>(arg));
+ }
+ };
+
+ template <typename InterruptCond, typename FutureType>
+ class futurize<
+ ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, FutureType>> {
+ public:
+ using type = ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, typename futurize<FutureType>::type>;
+
+ template <typename Func, typename... Args>
+ static type invoke(Func&& func, Args&&... args) {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...);
+ } catch(...) {
+ return seastar::futurize<
+ ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, FutureType>>::make_exception_future(
+ std::current_exception());
+ }
+ }
+ template <typename Func>
+ static type invoke(Func&& func, seastar::internal::monostate) {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func));
+ } catch(...) {
+ return seastar::futurize<
+ ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, FutureType>>::make_exception_future(
+ std::current_exception());
+ }
+ }
+ template <typename Arg>
+ static type make_exception_future(Arg&& arg) {
+ return ::seastar::futurize<
+ ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, FutureType>>::make_exception_future(
+ std::forward<Arg>(arg));
+ }
+ };
+
+ template <class ErrorT>
+ static std::exception_ptr make_exception_ptr(ErrorT&& e) {
+ // calling via interface class due to encapsulation and friend relations.
+ return e.error_t<std::decay_t<ErrorT>>::to_exception_ptr();
+ }
+
+ // needed because of:
+ // * return_errorator_t::template futurize<...> in `safe_then()`,
+ // * conversion to `std::exception_ptr` in `future::future(ErrorT&&)`.
+ // the friendship with all errorators is an idea from Kefu to fix build
+  // issues on GCC 9. This version likely fixes some access violation bug
+  // we were relying on before.
+ template <class...>
+ friend class errorator;
+ template<typename, typename>
+ friend class ::crimson::interruptible::interruptible_future_detail;
+}; // class errorator, generic template
+
+// no errors? errorator<>::future is plain seastar::future then!
+template <>
+class errorator<> {
+public:
+ template <class ValueT=void>
+ using future = ::seastar::futurize_t<ValueT>;
+
+ template <class T>
+ using futurize = ::seastar::futurize<T>;
+
+ // get a new errorator by extending current one with errors
+ template <class... NewAllowedErrors>
+ using extend = errorator<NewAllowedErrors...>;
+
+ // get a new errorator by extending current one with another errorator
+ template <class E>
+ using extend_ertr = E;
+
+ // errorator with empty error set never contains any error
+ template <class T>
+ static constexpr bool contains_once_v = false;
+}; // class errorator, <> specialization
+
+
+template <class ErroratorOne,
+ class ErroratorTwo,
+ class... FurtherErrators>
+struct compound_errorator {
+private:
+ // generic template. Empty `FurtherErrators` are handled by
+ // the specialization below.
+ static_assert(sizeof...(FurtherErrators) > 0);
+ using step =
+ typename compound_errorator<ErroratorOne, ErroratorTwo>::type;
+
+public:
+ using type =
+ typename compound_errorator<step, FurtherErrators...>::type;
+};
+template <class ErroratorOne,
+ class ErroratorTwo>
+struct compound_errorator<ErroratorOne, ErroratorTwo> {
+ // specialization for empty `FurtherErrators` arg pack
+ using type =
+ typename ErroratorOne::template unify<ErroratorTwo>::type;
+};
+template <class... Args>
+using compound_errorator_t = typename compound_errorator<Args...>::type;
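+// Illustrative sketch: `compound_errorator_t` folds `unify` over any number
+// of errorators, deduplicating the combined error set (the `ct_error::*`
+// aliases are defined just below):
+//
+//   using a_ertr = errorator<ct_error::enoent>;
+//   using b_ertr = errorator<ct_error::enoent, ct_error::invarg>;
+//   using c_ertr = errorator<ct_error::eexist>;
+//   using all_ertr = compound_errorator_t<a_ertr, b_ertr, c_ertr>;
+//   // all_ertr is errorator<ct_error::enoent, ct_error::invarg, ct_error::eexist>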
+
+// this is a conjunction of two nasty features: C++14's variable templates
+// and C++17's inline global variables. The latter is crucial to ensure
+// the variable gets the same address across all translation units.
+template <int ErrorV>
+inline std::error_code ec = std::error_code(ErrorV, std::generic_category());
+
+template <int ErrorV>
+using ct_error_code = unthrowable_wrapper<const std::error_code&, ec<ErrorV>>;
+
+namespace ct_error {
+ using enoent = ct_error_code<static_cast<int>(std::errc::no_such_file_or_directory)>;
+ using enodata = ct_error_code<static_cast<int>(std::errc::no_message_available)>;
+ using invarg = ct_error_code<static_cast<int>(std::errc::invalid_argument)>;
+ using input_output_error = ct_error_code<static_cast<int>(std::errc::io_error)>;
+ using object_corrupted = ct_error_code<static_cast<int>(std::errc::illegal_byte_sequence)>;
+ using permission_denied = ct_error_code<static_cast<int>(std::errc::permission_denied)>;
+ using operation_not_supported =
+ ct_error_code<static_cast<int>(std::errc::operation_not_supported)>;
+ using not_connected = ct_error_code<static_cast<int>(std::errc::not_connected)>;
+ using timed_out = ct_error_code<static_cast<int>(std::errc::timed_out)>;
+ using erange =
+ ct_error_code<static_cast<int>(std::errc::result_out_of_range)>;
+ using ebadf =
+ ct_error_code<static_cast<int>(std::errc::bad_file_descriptor)>;
+ using enospc =
+ ct_error_code<static_cast<int>(std::errc::no_space_on_device)>;
+ using value_too_large = ct_error_code<static_cast<int>(std::errc::value_too_large)>;
+ using eagain =
+ ct_error_code<static_cast<int>(std::errc::resource_unavailable_try_again)>;
+ using file_too_large =
+ ct_error_code<static_cast<int>(std::errc::file_too_large)>;
+ using address_in_use = ct_error_code<static_cast<int>(std::errc::address_in_use)>;
+ using address_not_available = ct_error_code<static_cast<int>(std::errc::address_not_available)>;
+ using ecanceled = ct_error_code<static_cast<int>(std::errc::operation_canceled)>;
+ using einprogress = ct_error_code<static_cast<int>(std::errc::operation_in_progress)>;
+ using enametoolong = ct_error_code<static_cast<int>(std::errc::filename_too_long)>;
+ using eexist = ct_error_code<static_cast<int>(std::errc::file_exists)>;
+ using edquot = ct_error_code<int(122)>;
+ constexpr int cmp_fail_error_value = 4095;
+ using cmp_fail = ct_error_code<int(cmp_fail_error_value)>;
+
+ struct pass_further_all {
+ template <class ErrorT>
+ decltype(auto) operator()(ErrorT&& e) {
+ return std::forward<ErrorT>(e);
+ }
+ };
+
+ struct discard_all {
+ template <class ErrorT>
+ void operator()(ErrorT&&) {
+ }
+ };
+
+ class assert_all {
+ const char* const msg = nullptr;
+ public:
+ template <std::size_t N>
+ assert_all(const char (&msg)[N])
+ : msg(msg) {
+ }
+ assert_all() = default;
+
+ template <class ErrorT>
+ void operator()(ErrorT&&) {
+ if (msg) {
+ ceph_abort(msg);
+ } else {
+ ceph_abort();
+ }
+ }
+ };
+
+ template <class ErrorFunc>
+ static decltype(auto) all_same_way(ErrorFunc&& error_func) {
+ return [
+ error_func = std::forward<ErrorFunc>(error_func)
+ ] (auto&& e) mutable -> decltype(auto) {
+ using decayed_t = std::decay_t<decltype(e)>;
+ auto&& handler =
+ decayed_t::error_t::handle(std::forward<ErrorFunc>(error_func));
+ return std::invoke(std::move(handler), std::forward<decltype(e)>(e));
+ };
+ };
+}
+
+using stateful_errc = stateful_error_t<std::errc>;
+using stateful_errint = stateful_error_t<int>;
+using stateful_ec = stateful_error_t<std::error_code>;
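+// Illustrative sketch (function names are hypothetical): unlike the
+// stateless `ct_error` codes above, a stateful error carries a value that
+// is delivered to the handler after an internal rethrow:
+//
+//   errorator<stateful_errint>::future<> fail() {
+//     return stateful_errint(-5);   // stash the value in the error
+//   }
+//
+//   seastar::future<> call() {
+//     return fail().handle_error(
+//       stateful_errint::handle([] (int ret) {
+//         // ret == -5 here
+//         return seastar::now();
+//       }));
+//   }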
+
+template <typename F>
+struct is_errorated_future {
+ static constexpr bool value = false;
+};
+template <template <class...> class ErroratedFutureT,
+ class ValueT>
+struct is_errorated_future<
+ ErroratedFutureT<::crimson::errorated_future_marker<ValueT>>
+ > {
+ static constexpr bool value = true;
+};
+template <typename T>
+constexpr bool is_errorated_future_v = is_errorated_future<T>::value;
+
+} // namespace crimson
+
+
+// open the `seastar` namespace to specialize `futurize`. This is not
+// pretty for sure. I just hope it's not worse than e.g. specializing
+// `hash` in the `std` namespace. The justification is copy avoidance
+// in `future<...>::safe_then()`. See the comments there for details.
+namespace seastar {
+
+// Container is a placeholder for errorator::_future<> template
+template <template <class> class Container,
+ class Value>
+struct futurize<Container<::crimson::errorated_future_marker<Value>>> {
+ using errorator_type = typename Container<
+ ::crimson::errorated_future_marker<Value>>::errorator_type;
+
+ using type = typename errorator_type::template future<Value>;
+ using value_type = seastar::internal::future_stored_type_t<Value>;
+
+ template<typename Func, typename... FuncArgs>
+ [[gnu::always_inline]]
+ static type apply(Func&& func, std::tuple<FuncArgs...>&& args) noexcept {
+ try {
+ return std::apply(
+ std::forward<Func>(func),
+ std::forward<std::tuple<FuncArgs...>>(args));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template<typename Func, typename... FuncArgs>
+ [[gnu::always_inline]]
+ static inline type invoke(Func&& func, FuncArgs&&... args) noexcept {
+ try {
+ return func(std::forward<FuncArgs>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func>
+ [[gnu::always_inline]]
+ static type invoke(Func&& func, seastar::internal::monostate) noexcept {
+ try {
+ return func();
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ [[gnu::always_inline]]
+ static type make_exception_future(Arg&& arg) {
+ return errorator_type::template make_exception_future2<Value>(std::forward<Arg>(arg));
+ }
+
+private:
+ template<typename PromiseT, typename Func>
+ static void satisfy_with_result_of(PromiseT&& pr, Func&& func) {
+ // this may use the protected variant of `seastar::future::forward_to()`
+ // because:
+    // 1. `seastar::future` established a friendship with all
+    //    specializations of `seastar::futurize`, including this
+    //    one (we're in the `seastar` namespace!) WHILE
+    // 2. any errorated future now declares friendship with any
+    //    `seastar::futurize<...>`.
+ func().forward_to(std::move(pr));
+ }
+ template <typename U>
+ friend class future;
+};
+
+template <template <class> class Container,
+ class Value>
+struct continuation_base_from_future<Container<::crimson::errorated_future_marker<Value>>> {
+ using type = continuation_base<Value>;
+};
+
+} // namespace seastar
diff --git a/src/crimson/common/exception.h b/src/crimson/common/exception.h
new file mode 100644
index 000000000..682fef69b
--- /dev/null
+++ b/src/crimson/common/exception.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "crimson/common/log.h"
+#include "crimson/common/interruptible_future.h"
+
+namespace crimson::common {
+
+class interruption : public std::exception
+{};
+
+class system_shutdown_exception final : public interruption{
+public:
+ const char* what() const noexcept final {
+ return "system shutting down";
+ }
+};
+
+class actingset_changed final : public interruption {
+public:
+ actingset_changed(bool sp) : still_primary(sp) {}
+ const char* what() const noexcept final {
+ return "acting set changed";
+ }
+ bool is_primary() const {
+ return still_primary;
+ }
+private:
+ const bool still_primary;
+};
+
+template<typename Func, typename... Args>
+inline seastar::future<> handle_system_shutdown(Func&& func, Args&&... args)
+{
+ return seastar::futurize_invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...)
+ .handle_exception([](std::exception_ptr eptr) {
+ if (*eptr.__cxa_exception_type() ==
+ typeid(crimson::common::system_shutdown_exception)) {
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "operation skipped, system shutdown");
+ return seastar::now();
+ }
+ std::rethrow_exception(eptr);
+ });
+}
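+// Usage sketch (illustrative only; `do_periodic_work()` is hypothetical):
+// wrap an operation so that a system_shutdown_exception raised inside it is
+// logged and swallowed instead of propagating to the caller:
+//
+//   seastar::future<> tick() {
+//     return crimson::common::handle_system_shutdown([] {
+//       return do_periodic_work();
+//     });
+//   }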
+
+}
diff --git a/src/crimson/common/fatal_signal.cc b/src/crimson/common/fatal_signal.cc
new file mode 100644
index 000000000..f2983769d
--- /dev/null
+++ b/src/crimson/common/fatal_signal.cc
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "fatal_signal.h"
+
+#include <csignal>
+#include <iostream>
+#include <string_view>
+
+#define BOOST_STACKTRACE_USE_ADDR2LINE
+#include <boost/stacktrace.hpp>
+#include <seastar/core/reactor.hh>
+
+#include "common/safe_io.h"
+#include "include/scope_guard.h"
+
+FatalSignal::FatalSignal()
+{
+ install_oneshot_signals_handler<SIGSEGV,
+ SIGABRT,
+ SIGBUS,
+ SIGILL,
+ SIGFPE,
+ SIGXCPU,
+ SIGXFSZ,
+ SIGSYS>();
+}
+
+template <int... SigNums>
+void FatalSignal::install_oneshot_signals_handler()
+{
+ (install_oneshot_signal_handler<SigNums>() , ...);
+}
+
+static void reraise_fatal(const int signum)
+{
+ // use default handler to dump core
+ ::signal(signum, SIG_DFL);
+
+ // normally, we won't get here. if we do, something is very weird.
+ if (::raise(signum)) {
+ std::cerr << "reraise_fatal: failed to re-raise signal " << signum
+ << std::endl;
+ } else {
+ std::cerr << "reraise_fatal: default handler for signal " << signum
+ << " didn't terminate the process?" << std::endl;
+ }
+ std::cerr << std::flush;
+ ::_exit(1);
+}
+
+[[gnu::noinline]] void FatalSignal::signal_entry(
+ const int signum,
+ siginfo_t* const info,
+ void*)
+{
+ if (static std::atomic_bool handled{false}; handled.exchange(true)) {
+ return;
+ }
+ assert(info);
+ FatalSignal::signaled(signum, *info);
+ reraise_fatal(signum);
+}
+
+template <int SigNum>
+void FatalSignal::install_oneshot_signal_handler()
+{
+ struct sigaction sa;
+ // it's a bad idea to use a lambda here. On GCC there are `operator()`
+ // and `_FUN()`. Controlling their inlineability is hard (impossible?).
+ sa.sa_sigaction = signal_entry;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO | SA_RESTART | SA_NODEFER;
+ if constexpr (SigNum == SIGSEGV) {
+ sa.sa_flags |= SA_ONSTACK;
+ }
+ [[maybe_unused]] auto r = ::sigaction(SigNum, &sa, nullptr);
+ assert(r == 0);
+}
+
+
+[[gnu::noinline]] static void print_backtrace(std::string_view cause) {
+ std::cerr << cause;
+ if (seastar::engine_is_ready()) {
+ std::cerr << " on shard " << seastar::this_shard_id();
+ }
+ // nobody wants to see things like `FatalSignal::signaled()` or
+ // `print_backtrace()` in our backtraces. `+ 1` is for the extra
+ // frame created by kernel (signal trampoline, it will take care
+ // about e.g. sigreturn(2) calling; see the man page).
+ constexpr std::size_t FRAMES_TO_SKIP = 3 + 1;
+ std::cerr << ".\nBacktrace:\n";
+ std::cerr << boost::stacktrace::stacktrace(
+ FRAMES_TO_SKIP,
+ static_cast<std::size_t>(-1)/* max depth same as the default one */);
+ std::cerr << std::flush;
+ // TODO: dump crash related meta data to $crash_dir
+ // see handle_fatal_signal()
+}
+
+static void print_segv_info(const siginfo_t& siginfo)
+{
+ std::cerr \
+ << "Dump of siginfo:" << std::endl
+ << " si_signo: " << siginfo.si_signo << std::endl
+ << " si_errno: " << siginfo.si_errno << std::endl
+ << " si_code: " << siginfo.si_code << std::endl
+ << " si_pid: " << siginfo.si_pid << std::endl
+ << " si_uid: " << siginfo.si_uid << std::endl
+ << " si_status: " << siginfo.si_status << std::endl
+ << " si_utime: " << siginfo.si_utime << std::endl
+ << " si_stime: " << siginfo.si_stime << std::endl
+ << " si_int: " << siginfo.si_int << std::endl
+ << " si_ptr: " << siginfo.si_ptr << std::endl
+ << " si_overrun: " << siginfo.si_overrun << std::endl
+ << " si_timerid: " << siginfo.si_timerid << std::endl
+ << " si_addr: " << siginfo.si_addr << std::endl
+ << " si_band: " << siginfo.si_band << std::endl
+ << " si_fd: " << siginfo.si_fd << std::endl
+ << " si_addr_lsb: " << siginfo.si_addr_lsb << std::endl
+ << " si_lower: " << siginfo.si_lower << std::endl
+ << " si_upper: " << siginfo.si_upper << std::endl
+ << " si_pkey: " << siginfo.si_pkey << std::endl
+ << " si_call_addr: " << siginfo.si_call_addr << std::endl
+ << " si_syscall: " << siginfo.si_syscall << std::endl
+ << " si_arch: " << siginfo.si_arch << std::endl;
+ std::cerr << std::flush;
+}
+
+static void print_proc_maps()
+{
+ const int fd = ::open("/proc/self/maps", O_RDONLY);
+ if (fd < 0) {
+ std::cerr << "can't open /proc/self/maps. procfs not mounted?" << std::endl;
+ return;
+ }
+ const auto fd_guard = make_scope_guard([fd] {
+ ::close(fd);
+ });
+ std::cerr << "Content of /proc/self/maps:" << std::endl;
+ while (true) {
+ char chunk[4096] = {0, };
+ const ssize_t r = safe_read(fd, chunk, sizeof(chunk) - 1);
+ if (r < 0) {
+ std::cerr << "error while reading /proc/self/maps: " << r << std::endl;
+ return;
+ } else {
+ std::cerr << chunk << std::flush;
+ if (r < static_cast<ssize_t>(sizeof(chunk) - 1)) {
+ return; // eof
+ }
+ }
+ }
+}
+
+[[gnu::noinline]] void FatalSignal::signaled(const int signum,
+ const siginfo_t& siginfo)
+{
+ switch (signum) {
+ case SIGSEGV:
+ print_backtrace("Segmentation fault");
+ print_segv_info(siginfo);
+ break;
+ case SIGABRT:
+ print_backtrace("Aborting");
+ break;
+ default:
+ print_backtrace(fmt::format("Signal {}", signum));
+ break;
+ }
+ print_proc_maps();
+}
diff --git a/src/crimson/common/fatal_signal.h b/src/crimson/common/fatal_signal.h
new file mode 100644
index 000000000..626017c93
--- /dev/null
+++ b/src/crimson/common/fatal_signal.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <csignal>
+
+class FatalSignal {
+public:
+ FatalSignal();
+
+private:
+ static void signal_entry(int signum, siginfo_t* siginfo, void* p);
+ static void signaled(int signum, const siginfo_t& siginfo);
+
+ template <int... SigNums>
+ void install_oneshot_signals_handler();
+
+ template <int SigNum>
+ void install_oneshot_signal_handler();
+};
diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h
new file mode 100644
index 000000000..676563594
--- /dev/null
+++ b/src/crimson/common/fixed_kv_node_layout.h
@@ -0,0 +1,730 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <iostream>
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "include/byteorder.h"
+
+#include "crimson/common/layout.h"
+
+namespace crimson::common {
+
+template <typename T, bool is_const>
+struct maybe_const_t {
+};
+template<typename T>
+struct maybe_const_t<T, true> {
+ using type = const T*;
+};
+template<typename T>
+struct maybe_const_t<T, false> {
+ using type = T*;
+};
+
+
+/**
+ * FixedKVNodeLayout
+ *
+ * Reusable implementation of a fixed size block mapping
+ * K -> V with internal representations KINT and VINT.
+ *
+ * Uses absl::container_internal::Layout for the actual memory layout.
+ *
+ * The primary interface exposed is centered on the iterator
+ * and related methods.
+ *
+ * Also included are helpers for doing splits and merges as for a btree.
+ */
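+/**
+ * Illustrative sketch (all names below are hypothetical): a concrete node
+ * type is produced by deriving from this layout with the key/value types
+ * and their on-disk (little-endian) representations:
+ *
+ *   struct node_t : FixedKVNodeLayout<
+ *     256,                  // CAPACITY: max entries per node
+ *     node_meta_t,          // Meta: in-memory metadata
+ *     node_meta_le_t,       // MetaInt: on-disk encoding of Meta
+ *     uint64_t, ceph_le64,  // K, KINT: key and its on-disk encoding
+ *     uint64_t, ceph_le64>  // V, VINT: value and its on-disk encoding
+ *   {
+ *     explicit node_t(char *buf) : FixedKVNodeLayout(buf) {}
+ *   };
+ *
+ * Meta must provide split_into(), merge_from() and rebalance(), which are
+ * used by the split/merge helpers below.
+ */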
+template <
+ size_t CAPACITY,
+ typename Meta,
+ typename MetaInt,
+ typename K,
+ typename KINT,
+ typename V,
+ typename VINT,
+ bool VALIDATE_INVARIANTS=true>
+class FixedKVNodeLayout {
+ char *buf = nullptr;
+
+ using L = absl::container_internal::Layout<ceph_le32, MetaInt, KINT, VINT>;
+ static constexpr L layout{1, 1, CAPACITY, CAPACITY};
+
+public:
+ template <bool is_const>
+ struct iter_t {
+ friend class FixedKVNodeLayout;
+ using parent_t = typename maybe_const_t<FixedKVNodeLayout, is_const>::type;
+
+ parent_t node;
+ uint16_t offset = 0;
+
+ iter_t() = default;
+ iter_t(
+ parent_t parent,
+ uint16_t offset) : node(parent), offset(offset) {}
+
+ iter_t(const iter_t &) noexcept = default;
+ iter_t(iter_t &&) noexcept = default;
+ template<bool is_const_ = is_const>
+ iter_t(const iter_t<false>& it, std::enable_if_t<is_const_, int> = 0)
+ : iter_t{it.node, it.offset}
+ {}
+ iter_t &operator=(const iter_t &) = default;
+ iter_t &operator=(iter_t &&) = default;
+
+ // Work nicely with for loops without requiring a nested type.
+ using reference = iter_t&;
+ iter_t &operator*() { return *this; }
+ iter_t *operator->() { return this; }
+
+ iter_t operator++(int) {
+ auto ret = *this;
+ ++offset;
+ return ret;
+ }
+
+ iter_t &operator++() {
+ ++offset;
+ return *this;
+ }
+
+ iter_t operator--(int) {
+ assert(offset > 0);
+ auto ret = *this;
+ --offset;
+ return ret;
+ }
+
+ iter_t &operator--() {
+ assert(offset > 0);
+ --offset;
+ return *this;
+ }
+
+ uint16_t operator-(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return offset - rhs.offset;
+ }
+
+ iter_t operator+(uint16_t off) const {
+ return iter_t(
+ node,
+ offset + off);
+ }
+ iter_t operator-(uint16_t off) const {
+ return iter_t(
+ node,
+ offset - off);
+ }
+
+ friend bool operator==(const iter_t &lhs, const iter_t &rhs) {
+ assert(lhs.node == rhs.node);
+ return lhs.offset == rhs.offset;
+ }
+
+ friend bool operator!=(const iter_t &lhs, const iter_t &rhs) {
+ return !(lhs == rhs);
+ }
+
+ friend bool operator==(const iter_t<is_const> &lhs, const iter_t<!is_const> &rhs) {
+ assert(lhs.node == rhs.node);
+ return lhs.offset == rhs.offset;
+ }
+ friend bool operator!=(const iter_t<is_const> &lhs, const iter_t<!is_const> &rhs) {
+ return !(lhs == rhs);
+ }
+ K get_key() const {
+ return K(node->get_key_ptr()[offset]);
+ }
+
+ K get_next_key_or_max() const {
+ auto next = *this + 1;
+ if (next == node->end())
+ return std::numeric_limits<K>::max();
+ else
+ return next->get_key();
+ }
+
+ void set_val(V val) const {
+ static_assert(!is_const);
+ node->get_val_ptr()[offset] = VINT(val);
+ }
+
+ V get_val() const {
+ return V(node->get_val_ptr()[offset]);
+ };
+
+ bool contains(K addr) const {
+ return (get_key() <= addr) && (get_next_key_or_max() > addr);
+ }
+
+ uint16_t get_offset() const {
+ return offset;
+ }
+
+ private:
+ void set_key(K _lb) const {
+ static_assert(!is_const);
+ KINT lb;
+ lb = _lb;
+ node->get_key_ptr()[offset] = lb;
+ }
+
+ typename maybe_const_t<char, is_const>::type get_key_ptr() const {
+ return reinterpret_cast<
+ typename maybe_const_t<char, is_const>::type>(
+ node->get_key_ptr() + offset);
+ }
+
+ typename maybe_const_t<char, is_const>::type get_val_ptr() const {
+ return reinterpret_cast<
+ typename maybe_const_t<char, is_const>::type>(
+ node->get_val_ptr() + offset);
+ }
+ };
+ using const_iterator = iter_t<true>;
+ using iterator = iter_t<false>;
+
+ struct delta_t {
+ enum class op_t : uint8_t {
+ INSERT,
+ REMOVE,
+ UPDATE,
+ } op;
+ KINT key;
+ VINT val;
+
+ void replay(FixedKVNodeLayout &l) {
+ switch (op) {
+ case op_t::INSERT: {
+ l.insert(l.lower_bound(key), key, val);
+ break;
+ }
+ case op_t::REMOVE: {
+ auto iter = l.find(key);
+ assert(iter != l.end());
+ l.remove(iter);
+ break;
+ }
+ case op_t::UPDATE: {
+ auto iter = l.find(key);
+ assert(iter != l.end());
+ l.update(iter, val);
+ break;
+ }
+ default:
+ assert(0 == "Impossible");
+ }
+ }
+
+ bool operator==(const delta_t &rhs) const {
+ return op == rhs.op &&
+ key == rhs.key &&
+ val == rhs.val;
+ }
+ };
+
+public:
+ class delta_buffer_t {
+ std::vector<delta_t> buffer;
+ public:
+ bool empty() const {
+ return buffer.empty();
+ }
+ void insert(
+ const K &key,
+ const V &val) {
+ KINT k;
+ k = key;
+ buffer.push_back(
+ delta_t{
+ delta_t::op_t::INSERT,
+ k,
+ VINT(val)
+ });
+ }
+ void update(
+ const K &key,
+ const V &val) {
+ KINT k;
+ k = key;
+ buffer.push_back(
+ delta_t{
+ delta_t::op_t::UPDATE,
+ k,
+ VINT(val)
+ });
+ }
+ void remove(const K &key) {
+ KINT k;
+ k = key;
+ buffer.push_back(
+ delta_t{
+ delta_t::op_t::REMOVE,
+ k,
+ VINT()
+ });
+ }
+ void replay(FixedKVNodeLayout &node) {
+ for (auto &i: buffer) {
+ i.replay(node);
+ }
+ }
+ size_t get_bytes() const {
+ return buffer.size() * sizeof(delta_t);
+ }
+ void copy_out(char *out, size_t len) {
+ assert(len == get_bytes());
+ ::memcpy(out, reinterpret_cast<const void *>(buffer.data()), get_bytes());
+ buffer.clear();
+ }
+ void copy_in(const char *out, size_t len) {
+ assert(empty());
+ assert(len % sizeof(delta_t) == 0);
+ buffer = std::vector(
+ reinterpret_cast<const delta_t*>(out),
+ reinterpret_cast<const delta_t*>(out + len));
+ }
+ bool operator==(const delta_buffer_t &rhs) const {
+ return buffer == rhs.buffer;
+ }
+ };
+
+ void journal_insert(
+ const_iterator _iter,
+ const K &key,
+ const V &val,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->insert(
+ key,
+ val);
+ }
+ insert(iter, key, val);
+ }
+
+ void journal_update(
+ const_iterator _iter,
+ const V &val,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->update(iter->get_key(), val);
+ }
+ update(iter, val);
+ }
+
+ void journal_replace(
+ const_iterator _iter,
+ const K &key,
+ const V &val,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ recorder->insert(key, val);
+ }
+ replace(iter, key, val);
+ }
+
+
+ void journal_remove(
+ const_iterator _iter,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ }
+ remove(iter);
+ }
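+  /**
+   * Illustrative sketch: mutations applied through the journal_* helpers
+   * are mirrored into a delta_buffer_t, which can be serialized with
+   * copy_out()/copy_in() and later replayed against another copy of the
+   * node (`node`, `other_node`, `key` and `val` are hypothetical):
+   *
+   *   delta_buffer_t deltas;
+   *   node.journal_insert(node.lower_bound(key), key, val, &deltas);
+   *   // ... persist deltas via get_bytes()/copy_out(), load via copy_in() ...
+   *   deltas.replay(other_node);  // other_node now reflects the insert
+   */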
+
+
+ FixedKVNodeLayout(char *buf) :
+ buf(buf) {}
+
+ virtual ~FixedKVNodeLayout() = default;
+
+ const_iterator begin() const {
+ return const_iterator(
+ this,
+ 0);
+ }
+
+ const_iterator end() const {
+ return const_iterator(
+ this,
+ get_size());
+ }
+
+ iterator begin() {
+ return iterator(
+ this,
+ 0);
+ }
+
+ iterator end() {
+ return iterator(
+ this,
+ get_size());
+ }
+
+ const_iterator iter_idx(uint16_t off) const {
+ return const_iterator(
+ this,
+ off);
+ }
+
+ const_iterator find(K l) const {
+ auto ret = begin();
+ for (; ret != end(); ++ret) {
+ if (ret->get_key() == l)
+ break;
+ }
+ return ret;
+ }
+ iterator find(K l) {
+ const auto &tref = *this;
+ return iterator(this, tref.find(l).offset);
+ }
+
+ const_iterator lower_bound(K l) const {
+ auto it = std::lower_bound(boost::make_counting_iterator<uint16_t>(0),
+ boost::make_counting_iterator<uint16_t>(get_size()),
+ l,
+ [this](uint16_t i, K key) {
+ const_iterator iter(this, i);
+ return iter->get_key() < key;
+ });
+ return const_iterator(this, *it);
+ }
+
+ iterator lower_bound(K l) {
+ const auto &tref = *this;
+ return iterator(this, tref.lower_bound(l).offset);
+ }
+
+ const_iterator upper_bound(K l) const {
+ auto it = std::upper_bound(boost::make_counting_iterator<uint16_t>(0),
+ boost::make_counting_iterator<uint16_t>(get_size()),
+ l,
+ [this](K key, uint16_t i) {
+ const_iterator iter(this, i);
+ return key < iter->get_key();
+ });
+ return const_iterator(this, *it);
+ }
+
+ iterator upper_bound(K l) {
+ const auto &tref = *this;
+ return iterator(this, tref.upper_bound(l).offset);
+ }
+
+ const_iterator get_split_pivot() const {
+ return iter_idx(get_size() / 2);
+ }
+
+ uint16_t get_size() const {
+ return *layout.template Pointer<0>(buf);
+ }
+
+ /**
+ * set_size
+ *
+ * Set size representation to match size
+ */
+ void set_size(uint16_t size) {
+ *layout.template Pointer<0>(buf) = size;
+ }
+
+ /**
+ * get_meta/set_meta
+ *
+ * Enables stashing a templated type within the layout.
+ * Cannot be modified after initial write as it is not represented
+ * in delta_t
+ */
+ Meta get_meta() const {
+ MetaInt &metaint = *layout.template Pointer<1>(buf);
+ return Meta(metaint);
+ }
+ void set_meta(const Meta &meta) {
+ *layout.template Pointer<1>(buf) = MetaInt(meta);
+ }
+
+ constexpr static size_t get_capacity() {
+ return CAPACITY;
+ }
+
+ bool operator==(const FixedKVNodeLayout &rhs) const {
+ if (get_size() != rhs.get_size()) {
+ return false;
+ }
+
+ auto iter = begin();
+ auto iter2 = rhs.begin();
+ while (iter != end()) {
+ if (iter->get_key() != iter2->get_key() ||
+ iter->get_val() != iter2->get_val()) {
+ return false;
+ }
+ iter++;
+ iter2++;
+ }
+ return true;
+ }
+
+ /**
+ * split_into
+ *
+ * Takes *this and splits its contents into left and right.
+ */
+ K split_into(
+ FixedKVNodeLayout &left,
+ FixedKVNodeLayout &right) const {
+ auto piviter = get_split_pivot();
+
+ left.copy_from_foreign(left.begin(), begin(), piviter);
+ left.set_size(piviter - begin());
+
+ right.copy_from_foreign(right.begin(), piviter, end());
+ right.set_size(end() - piviter);
+
+ auto [lmeta, rmeta] = get_meta().split_into(piviter->get_key());
+ left.set_meta(lmeta);
+ right.set_meta(rmeta);
+
+ return piviter->get_key();
+ }
+
+ /**
+ * merge_from
+ *
+ * Takes two nodes and copies their contents into *this.
+ *
+ * precondition: left.size() + right.size() < CAPACITY
+ */
+ void merge_from(
+ const FixedKVNodeLayout &left,
+ const FixedKVNodeLayout &right)
+ {
+ copy_from_foreign(
+ end(),
+ left.begin(),
+ left.end());
+ set_size(left.get_size());
+ copy_from_foreign(
+ end(),
+ right.begin(),
+ right.end());
+ set_size(left.get_size() + right.get_size());
+ set_meta(Meta::merge_from(left.get_meta(), right.get_meta()));
+ }
+
+ /**
+ * balance_into_new_nodes
+ *
+ * Takes the contents of left and right and copies them into
+ * replacement_left and replacement_right such that in the
+ * event that the number of elements is odd the extra goes to
+ * the left side iff prefer_left.
+ */
+ static K balance_into_new_nodes(
+ const FixedKVNodeLayout &left,
+ const FixedKVNodeLayout &right,
+ bool prefer_left,
+ FixedKVNodeLayout &replacement_left,
+ FixedKVNodeLayout &replacement_right)
+ {
+ auto total = left.get_size() + right.get_size();
+ auto pivot_idx = (left.get_size() + right.get_size()) / 2;
+ if (total % 2 && prefer_left) {
+ pivot_idx++;
+ }
+ auto replacement_pivot = pivot_idx >= left.get_size() ?
+ right.iter_idx(pivot_idx - left.get_size())->get_key() :
+ left.iter_idx(pivot_idx)->get_key();
+
+ if (pivot_idx < left.get_size()) {
+ replacement_left.copy_from_foreign(
+ replacement_left.end(),
+ left.begin(),
+ left.iter_idx(pivot_idx));
+ replacement_left.set_size(pivot_idx);
+
+ replacement_right.copy_from_foreign(
+ replacement_right.end(),
+ left.iter_idx(pivot_idx),
+ left.end());
+
+ replacement_right.set_size(left.get_size() - pivot_idx);
+ replacement_right.copy_from_foreign(
+ replacement_right.end(),
+ right.begin(),
+ right.end());
+ replacement_right.set_size(total - pivot_idx);
+ } else {
+ replacement_left.copy_from_foreign(
+ replacement_left.end(),
+ left.begin(),
+ left.end());
+ replacement_left.set_size(left.get_size());
+
+ replacement_left.copy_from_foreign(
+ replacement_left.end(),
+ right.begin(),
+ right.iter_idx(pivot_idx - left.get_size()));
+ replacement_left.set_size(pivot_idx);
+
+ replacement_right.copy_from_foreign(
+ replacement_right.end(),
+ right.iter_idx(pivot_idx - left.get_size()),
+ right.end());
+ replacement_right.set_size(total - pivot_idx);
+ }
+
+ auto [lmeta, rmeta] = Meta::rebalance(
+ left.get_meta(), right.get_meta(), replacement_pivot);
+ replacement_left.set_meta(lmeta);
+ replacement_right.set_meta(rmeta);
+ return replacement_pivot;
+ }
+
+private:
+ void insert(
+ iterator iter,
+ const K &key,
+ const V &val) {
+ if (VALIDATE_INVARIANTS) {
+ if (iter != begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if (iter != end()) {
+ assert(iter->get_key() > key);
+ }
+ assert(get_size() < CAPACITY);
+ }
+ copy_from_local(iter + 1, iter, end());
+ iter->set_key(key);
+ iter->set_val(val);
+ set_size(get_size() + 1);
+ }
+
+ void update(
+ iterator iter,
+ V val) {
+ assert(iter != end());
+ iter->set_val(val);
+ }
+
+ void replace(
+ iterator iter,
+ const K &key,
+ const V &val) {
+ assert(iter != end());
+ if (VALIDATE_INVARIANTS) {
+ if (iter != begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if ((iter + 1) != end()) {
+ assert((iter + 1)->get_key() > key);
+ }
+ }
+ iter->set_key(key);
+ iter->set_val(val);
+ }
+
+ void remove(iterator iter) {
+ assert(iter != end());
+ copy_from_local(iter, iter + 1, end());
+ set_size(get_size() - 1);
+ }
+
+ /**
+ * get_key_ptr
+ *
+ * Get pointer to start of key array
+ */
+ KINT *get_key_ptr() {
+ return layout.template Pointer<2>(buf);
+ }
+ const KINT *get_key_ptr() const {
+ return layout.template Pointer<2>(buf);
+ }
+
+ /**
+ * get_val_ptr
+ *
+ * Get pointer to start of val array
+ */
+ VINT *get_val_ptr() {
+ return layout.template Pointer<3>(buf);
+ }
+ const VINT *get_val_ptr() const {
+ return layout.template Pointer<3>(buf);
+ }
+
+ /**
+ * node_resolve/unresolve_vals
+ *
+ * If the representation for values depends in some way on the
+ * node in which they are located, users may implement
+ * resolve/unresolve to enable copy_from_foreign to handle that
+ * transition.
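+ *
+ * For example (a hypothetical derived node, not one defined here): if values
+ * are stored relative to a per-node base offset, node_resolve_vals() could
+ * rewrite the just-copied values into an absolute form and
+ * node_unresolve_vals() could convert them back into the target node's
+ * relative form, keeping copy_from_foreign() correct across nodes.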
+ */
+ virtual void node_resolve_vals(iterator from, iterator to) const {}
+ virtual void node_unresolve_vals(iterator from, iterator to) const {}
+
+ /**
+ * copy_from_foreign
+ *
+ * Copies entries from [from_src, to_src) to tgt.
+ *
+ * tgt and from_src must be from different nodes.
+ * from_src and to_src must be from the same node.
+ */
+ static void copy_from_foreign(
+ iterator tgt,
+ const_iterator from_src,
+ const_iterator to_src) {
+ assert(tgt->node != from_src->node);
+ assert(to_src->node == from_src->node);
+ memcpy(
+ tgt->get_val_ptr(), from_src->get_val_ptr(),
+ to_src->get_val_ptr() - from_src->get_val_ptr());
+ memcpy(
+ tgt->get_key_ptr(), from_src->get_key_ptr(),
+ to_src->get_key_ptr() - from_src->get_key_ptr());
+ from_src->node->node_resolve_vals(tgt, tgt + (to_src - from_src));
+ tgt->node->node_unresolve_vals(tgt, tgt + (to_src - from_src));
+ }
+
+ /**
+ * copy_from_local
+ *
+ * Copies entries from [from_src, to_src) to tgt.
+ *
+ * tgt, from_src, and to_src must be from the same node.
+ */
+ static void copy_from_local(
+ iterator tgt,
+ iterator from_src,
+ iterator to_src) {
+ assert(tgt->node == from_src->node);
+ assert(to_src->node == from_src->node);
+ memmove(
+ tgt->get_val_ptr(), from_src->get_val_ptr(),
+ to_src->get_val_ptr() - from_src->get_val_ptr());
+ memmove(
+ tgt->get_key_ptr(), from_src->get_key_ptr(),
+ to_src->get_key_ptr() - from_src->get_key_ptr());
+ }
+};
+
+}
diff --git a/src/crimson/common/formatter.cc b/src/crimson/common/formatter.cc
new file mode 100644
index 000000000..ab371ddbf
--- /dev/null
+++ b/src/crimson/common/formatter.cc
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "formatter.h"
+
+#include <fmt/format.h>
+#if FMT_VERSION >= 60000
+#include <fmt/chrono.h>
+#else
+#include <fmt/time.h>
+#endif
+
+
+template <>
+struct fmt::formatter<seastar::lowres_system_clock::time_point> {
+ // ignore the format string
+ template <typename ParseContext>
+ constexpr auto parse(ParseContext &ctx) { return ctx.begin(); }
+
+ template <typename FormatContext>
+ auto format(const seastar::lowres_system_clock::time_point& t,
+ FormatContext& ctx) {
+ std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>(
+ t.time_since_epoch()).count();
+ auto milliseconds = (t.time_since_epoch() %
+ std::chrono::seconds(1)).count();
+ return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}",
+ fmt::localtime(tt), milliseconds);
+ }
+};
+
+namespace std {
+
+ostream& operator<<(ostream& out,
+ const seastar::lowres_system_clock::time_point& t)
+{
+ return out << fmt::format("{}", t);
+}
+
+}
diff --git a/src/crimson/common/formatter.h b/src/crimson/common/formatter.h
new file mode 100644
index 000000000..9b7be428a
--- /dev/null
+++ b/src/crimson/common/formatter.h
@@ -0,0 +1,13 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/lowres_clock.hh>
+
+#include "common/ceph_time.h"
+
+namespace std {
+
+ostream& operator<<(ostream& out,
+ const seastar::lowres_system_clock::time_point& t);
+
+}
diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h
new file mode 100644
index 000000000..559a889a3
--- /dev/null
+++ b/src/crimson/common/gated.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/gate.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "crimson/common/exception.h"
+#include "crimson/common/log.h"
+#include "include/ceph_assert.h"
+
+namespace crimson::common {
+
+class Gated {
+ public:
+ static seastar::logger& gated_logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+ template <typename Func, typename T>
+ inline void dispatch_in_background(const char* what, T& who, Func&& func) {
+ (void) dispatch(what, who, func);
+ }
+ template <typename Func, typename T>
+ inline seastar::future<> dispatch(const char* what, T& who, Func&& func) {
+ return seastar::with_gate(pending_dispatch, std::forward<Func>(func)
+ ).handle_exception([what, &who] (std::exception_ptr eptr) {
+ if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) {
+ gated_logger().debug(
+ "{}, {} skipped, system shutdown", who, what);
+ return;
+ }
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception& e) {
+ gated_logger().error(
+ "{} dispatch() {} caught exception: {}", who, what, e.what());
+ }
+ assert(*eptr.__cxa_exception_type()
+ == typeid(seastar::gate_closed_exception));
+ });
+ }
+
+ seastar::future<> close() {
+ return pending_dispatch.close();
+ }
+ bool is_closed() const {
+ return pending_dispatch.is_closed();
+ }
+ private:
+ seastar::gate pending_dispatch;
+};
+
+}// namespace crimson::common
diff --git a/src/crimson/common/interruptible_future.h b/src/crimson/common/interruptible_future.h
new file mode 100644
index 000000000..c0e2c346c
--- /dev/null
+++ b/src/crimson/common/interruptible_future.h
@@ -0,0 +1,1600 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future-util.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/when_all.hh>
+#include <seastar/core/thread.hh>
+
+#include "crimson/common/log.h"
+#include "crimson/common/errorator.h"
+#ifndef NDEBUG
+#define INTR_FUT_DEBUG(FMT_MSG, ...) crimson::get_logger(ceph_subsys_).trace(FMT_MSG, ##__VA_ARGS__)
+#else
+#define INTR_FUT_DEBUG(FMT_MSG, ...)
+#endif
+
+// The interrupt condition generally works this way:
+//
+// 1. It is created by the call_with_interruption_impl method, and is recorded in the
+// thread-local global variable "::crimson::interruptible::interrupt_cond".
+// 2. Any continuation that's created within the execution of the continuation
+// that calls the call_with_interruption_impl method will capture the "interrupt_cond";
+// when these continuations start to run, they put that captured interruption condition
+// back into "::crimson::interruptible::interrupt_cond" so that further continuations
+// created by them can also capture the interruption condition;
+// 3. At the end of the continuation run, the global "interrupt_cond" is cleared
+// to prevent other continuations that are not supposed to be interrupted from wrongly
+// capturing an interruption condition.
+// With this approach, continuations capture the interrupt condition at their creation,
+// restore it at the beginning of their execution, and clear it at the end of their
+// execution. So the global "interrupt_cond" only holds a valid interrupt condition while
+// the corresponding continuation is actually running, after which it is cleared. Since
+// continuations can't be executed simultaneously, different continuation chains won't
+// be able to interfere with each other.
+//
+// The global "interrupt_cond" works as a signal about whether the continuation
+// is supposed to be interrupted. The reason the global "interrupt_cond" exists
+// is that there may be a scenario like this:
+//
+// Say there's some method PG::func1(), in which the continuations created may
+// or may not be supposed to be interrupted, depending on the situation. If we
+// didn't have a global signal, we would have to add an extra parameter to every
+// method like PG::func1() to indicate whether the current run should create
+// to-be-interrupted continuations or not.
+//
+// interruptor::with_interruption() and helpers can be used by users to wrap a future in
+// the interruption machinery.
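+//
+// A minimal usage sketch of that machinery follows; the condition type and the
+// callables are hypothetical placeholders, shown only to illustrate the flow
+// described above:
+//
+//   using interruptor =
+//     ::crimson::interruptible::interruptor<SomeInterruptCond>;
+//   auto fut = interruptor::with_interruption(
+//     [] {
+//       // continuations created here capture the interrupt condition
+//       return do_interruptible_work();
+//     },
+//     [](std::exception_ptr eptr) {
+//       // reached only if SomeInterruptCond interrupts the chain above
+//       return handle_interruption(std::move(eptr));
+//     },
+//     /* arguments forwarded to SomeInterruptCond's constructor */);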
+
+namespace crimson::os::seastore {
+ class TransactionConflictCondition;
+}
+
+// GCC tries to instantiate
+// seastar::lw_shared_ptr<crimson::os::seastore::TransactionConflictCondition>,
+// but we *may* not have the definition of TransactionConflictCondition at this point.
+// A full specialization of lw_shared_ptr_accessors bypasses the default
+// lw_shared_ptr_accessors implementation, where std::is_base_of<.., T> is used.
+namespace seastar::internal {
+ template<>
+ struct lw_shared_ptr_accessors<::crimson::os::seastore::TransactionConflictCondition, void>
+ : lw_shared_ptr_accessors_no_esft<::crimson::os::seastore::TransactionConflictCondition>
+ {};
+}
+
+SEASTAR_CONCEPT(
+namespace crimson::interruptible {
+ template<typename InterruptCond, typename FutureType>
+ class interruptible_future_detail;
+}
+namespace seastar::impl {
+ template <typename InterruptCond, typename FutureType, typename... Rest>
+ struct is_tuple_of_futures<std::tuple<crimson::interruptible::interruptible_future_detail<InterruptCond, FutureType>, Rest...>>
+ : is_tuple_of_futures<std::tuple<Rest...>> {};
+}
+)
+
+namespace crimson::interruptible {
+
+struct ready_future_marker {};
+struct exception_future_marker {};
+
+template <typename InterruptCond>
+class interruptible_future_builder;
+
+template <typename InterruptCond>
+struct interruptor;
+
+template <typename InterruptCond>
+using InterruptCondRef = seastar::lw_shared_ptr<InterruptCond>;
+
+template <typename InterruptCond>
+struct interrupt_cond_t {
+ InterruptCondRef<InterruptCond> interrupt_cond;
+ uint64_t ref_count = 0;
+ void set(
+ InterruptCondRef<InterruptCond>& ic) {
+ INTR_FUT_DEBUG(
+ "{}: going to set interrupt_cond: {}, ic: {}",
+ __func__,
+ (void*)interrupt_cond.get(),
+ (void*)ic.get());
+ if (!interrupt_cond) {
+ interrupt_cond = ic;
+ }
+ assert(interrupt_cond.get() == ic.get());
+ ref_count++;
+ INTR_FUT_DEBUG(
+ "{}: interrupt_cond: {}, ref_count: {}",
+ __func__,
+ (void*)interrupt_cond.get(),
+ ref_count);
+ }
+ void reset() {
+ if (--ref_count == 0) {
+ INTR_FUT_DEBUG(
+ "{}: clearing interrupt_cond: {},{}",
+ __func__,
+ (void*)interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ interrupt_cond.release();
+ } else {
+ INTR_FUT_DEBUG(
+ "{}: end without clearing interrupt_cond: {},{}, ref_count: {}",
+ __func__,
+ (void*)interrupt_cond.get(),
+ typeid(InterruptCond).name(),
+ ref_count);
+ }
+ }
+};
+
+template <typename InterruptCond>
+thread_local interrupt_cond_t<InterruptCond> interrupt_cond;
+
+extern template thread_local interrupt_cond_t<crimson::os::seastore::TransactionConflictCondition>
+interrupt_cond<crimson::os::seastore::TransactionConflictCondition>;
+
+template <typename InterruptCond, typename FutureType>
+class [[nodiscard]] interruptible_future_detail {};
+
+template <typename FutureType>
+struct is_interruptible_future : public std::false_type {};
+
+template <typename InterruptCond, typename FutureType>
+struct is_interruptible_future<
+ interruptible_future_detail<
+ InterruptCond,
+ FutureType>>
+ : public std::true_type {};
+template <typename FutureType>
+concept IsInterruptibleFuture = is_interruptible_future<FutureType>::value;
+template <typename Func, typename... Args>
+concept InvokeReturnsInterruptibleFuture =
+ IsInterruptibleFuture<std::invoke_result_t<Func, Args...>>;
+
+namespace internal {
+
+template <typename InterruptCond, typename Func, typename... Args>
+auto call_with_interruption_impl(
+ InterruptCondRef<InterruptCond> interrupt_condition,
+ Func&& func, Args&&... args)
+{
+ using futurator_t = seastar::futurize<std::invoke_result_t<Func, Args...>>;
+ // there might be a case like this:
+ // with_interruption([] {
+ // interruptor::do_for_each([] {
+ // ...
+ // return interruptible_errorated_future();
+ // }).safe_then_interruptible([] {
+ // ...
+ // });
+ // })
+ // In this case, as crimson::do_for_each would directly do futurize_invoke
+ // for "call_with_interruption", we have to make sure this invocation does
+ // not erroneously release ::crimson::interruptible::interrupt_cond<InterruptCond>.
+
+ // If there exists an interrupt condition, which means "Func" may not be
+ // permitted to run as a result of the interruption, test it. If it does
+ // need to be interrupted, return an interruption; otherwise, restore the
+ // global "interrupt_cond" with the interruption condition, and go ahead
+ // executing the Func.
+ assert(interrupt_condition);
+ auto fut = interrupt_condition->template may_interrupt<
+ typename futurator_t::type>();
+ INTR_FUT_DEBUG(
+ "call_with_interruption_impl: may_interrupt: {}, "
+ "local interrupt_condition: {}, "
+ "global interrupt_cond: {},{}",
+ (bool)fut,
+ (void*)interrupt_condition.get(),
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ if (fut) {
+ return std::move(*fut);
+ }
+ interrupt_cond<InterruptCond>.set(interrupt_condition);
+
+ auto fut2 = seastar::futurize_invoke(
+ std::forward<Func>(func),
+ std::forward<Args>(args)...);
+ // Clear the global "interrupt_cond" to prevent it from interfering other
+ // continuation chains.
+ interrupt_cond<InterruptCond>.reset();
+ return fut2;
+}
+
+}
+
+template <typename InterruptCond, typename Func, seastar::Future Ret>
+requires (!InterruptCond::template is_interruption_v<Ret>)
+auto call_with_interruption(
+ InterruptCondRef<InterruptCond> interrupt_condition,
+ Func&& func, Ret&& fut)
+{
+ using Result = std::invoke_result_t<Func, Ret>;
+ // if "T" is already an interrupt exception, return it directly;
+ // otherwise, upper layer application may encounter errors executing
+ // the "Func" body.
+ if (fut.failed()) {
+ std::exception_ptr eptr = fut.get_exception();
+ if (interrupt_condition->is_interruption(eptr)) {
+ return seastar::futurize<Result>::make_exception_future(std::move(eptr));
+ }
+ return internal::call_with_interruption_impl(
+ interrupt_condition,
+ std::forward<Func>(func),
+ seastar::futurize<Ret>::make_exception_future(
+ std::move(eptr)));
+ }
+ return internal::call_with_interruption_impl(
+ interrupt_condition,
+ std::forward<Func>(func),
+ std::move(fut));
+}
+
+template <typename InterruptCond, typename Func, typename T>
+requires (InterruptCond::template is_interruption_v<T>)
+auto call_with_interruption(
+ InterruptCondRef<InterruptCond> interrupt_condition,
+ Func&& func, T&& arg)
+{
+ using Result = std::invoke_result_t<Func, T>;
+ // if "T" is already an interrupt exception, return it directly;
+ // otherwise, upper layer application may encounter errors executing
+ // the "Func" body.
+ return seastar::futurize<Result>::make_exception_future(
+ std::get<0>(std::tuple(std::forward<T>(arg))));
+}
+
+template <typename InterruptCond, typename Func, typename T>
+requires (!InterruptCond::template is_interruption_v<T>) && (!seastar::Future<T>)
+auto call_with_interruption(
+ InterruptCondRef<InterruptCond> interrupt_condition,
+ Func&& func, T&& arg)
+{
+ return internal::call_with_interruption_impl(
+ interrupt_condition,
+ std::forward<Func>(func),
+ std::forward<T>(arg));
+}
+
+template <typename InterruptCond, typename Func,
+ typename Result = std::invoke_result_t<Func>>
+auto call_with_interruption(
+ InterruptCondRef<InterruptCond> interrupt_condition,
+ Func&& func)
+{
+ return internal::call_with_interruption_impl(
+ interrupt_condition,
+ std::forward<Func>(func));
+}
+
+template <typename InterruptCond, typename Func, typename... T,
+ typename Result = std::invoke_result_t<Func, T...>>
+Result non_futurized_call_with_interruption(
+ InterruptCondRef<InterruptCond> interrupt_condition,
+ Func&& func, T&&... args)
+{
+ assert(interrupt_condition);
+ auto fut = interrupt_condition->template may_interrupt<seastar::future<>>();
+ INTR_FUT_DEBUG(
+ "non_futurized_call_with_interruption may_interrupt: {}, "
+ "interrupt_condition: {}, interrupt_cond: {},{}",
+ (bool)fut,
+ (void*)interrupt_condition.get(),
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ if (fut) {
+ std::rethrow_exception(fut->get_exception());
+ }
+ interrupt_cond<InterruptCond>.set(interrupt_condition);
+ try {
+ if constexpr (std::is_void_v<Result>) {
+ std::invoke(std::forward<Func>(func), std::forward<T>(args)...);
+
+ // Clear the global "interrupt_cond" to prevent it from interfering other
+ // continuation chains.
+ interrupt_cond<InterruptCond>.reset();
+ return;
+ } else {
+ auto&& err = std::invoke(std::forward<Func>(func), std::forward<T>(args)...);
+ interrupt_cond<InterruptCond>.reset();
+ return std::forward<Result>(err);
+ }
+ } catch (std::exception& e) {
+ // Clear the global "interrupt_cond" to prevent it from interfering other
+ // continuation chains.
+ interrupt_cond<InterruptCond>.reset();
+ throw e;
+ }
+}
+
+template <typename InterruptCond, typename Errorator>
+struct interruptible_errorator;
+
+template <typename T>
+struct parallel_for_each_ret {
+ static_assert(seastar::Future<T>);
+ using type = seastar::future<>;
+};
+
+template <template <typename...> typename ErroratedFuture, typename T>
+struct parallel_for_each_ret<
+ ErroratedFuture<
+ ::crimson::errorated_future_marker<T>>> {
+ using type = ErroratedFuture<::crimson::errorated_future_marker<void>>;
+};
+
+template <typename InterruptCond, typename FutureType>
+class parallel_for_each_state final : private seastar::continuation_base<> {
+ using elem_ret_t = std::conditional_t<
+ IsInterruptibleFuture<FutureType>,
+ typename FutureType::core_type,
+ FutureType>;
+ using future_t = interruptible_future_detail<
+ InterruptCond,
+ typename parallel_for_each_ret<elem_ret_t>::type>;
+ std::vector<future_t> _incomplete;
+ seastar::promise<> _result;
+ std::exception_ptr _ex;
+private:
+ void wait_for_one() noexcept {
+ while (!_incomplete.empty() && _incomplete.back().available()) {
+ if (_incomplete.back().failed()) {
+ _ex = _incomplete.back().get_exception();
+ }
+ _incomplete.pop_back();
+ }
+ if (!_incomplete.empty()) {
+ seastar::internal::set_callback(std::move(_incomplete.back()),
+ static_cast<continuation_base<>*>(this));
+ _incomplete.pop_back();
+ return;
+ }
+ if (__builtin_expect(bool(_ex), false)) {
+ _result.set_exception(std::move(_ex));
+ } else {
+ _result.set_value();
+ }
+ delete this;
+ }
+ virtual void run_and_dispose() noexcept override {
+ if (_state.failed()) {
+ _ex = std::move(_state).get_exception();
+ }
+ _state = {};
+ wait_for_one();
+ }
+ task* waiting_task() noexcept override { return _result.waiting_task(); }
+public:
+ parallel_for_each_state(size_t n) {
+ _incomplete.reserve(n);
+ }
+ void add_future(future_t&& f) {
+ _incomplete.push_back(std::move(f));
+ }
+ future_t get_future() {
+ auto ret = _result.get_future();
+ wait_for_one();
+ return ret;
+ }
+ static future_t now() {
+ return seastar::now();
+ }
+};
+
+template <typename InterruptCond, typename T>
+class [[nodiscard]] interruptible_future_detail<InterruptCond, seastar::future<T>>
+ : private seastar::future<T> {
+public:
+ using core_type = seastar::future<T>;
+ template <typename U>
+ using interrupt_futurize_t =
+ typename interruptor<InterruptCond>::template futurize_t<U>;
+ using core_type::get0;
+ using core_type::core_type;
+ using core_type::get_exception;
+ using core_type::ignore_ready_future;
+
+ [[gnu::always_inline]]
+ interruptible_future_detail(seastar::future<T>&& base)
+ : core_type(std::move(base))
+ {}
+
+ using value_type = typename seastar::future<T>::value_type;
+ using tuple_type = typename seastar::future<T>::tuple_type;
+
+ [[gnu::always_inline]]
+ value_type&& get() {
+ if (core_type::available()) {
+ return core_type::get();
+ } else {
+ // destined to wait!
+ auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond;
+ INTR_FUT_DEBUG(
+ "interruptible_future_detail::get() waiting, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ interrupt_cond<InterruptCond>.reset();
+ auto&& value = core_type::get();
+ interrupt_cond<InterruptCond>.set(interruption_condition);
+ INTR_FUT_DEBUG(
+ "interruptible_future_detail::get() got, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ return std::move(value);
+ }
+ }
+
+ using core_type::available;
+ using core_type::failed;
+
+ template <typename Func,
+ typename Result = interrupt_futurize_t<
+ std::invoke_result_t<Func, seastar::future<T>>>>
+ [[gnu::always_inline]]
+ Result then_wrapped_interruptible(Func&& func) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ return core_type::then_wrapped(
+ [func=std::move(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& fut) mutable {
+ return call_with_interruption(
+ std::move(interrupt_condition),
+ std::forward<Func>(func),
+ std::move(fut));
+ });
+ }
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ auto then_interruptible(Func&& func) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ if constexpr (std::is_void_v<T>) {
+ auto fut = core_type::then(
+ [func=std::move(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ () mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func));
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ } else {
+ auto fut = core_type::then(
+ [func=std::move(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (T&& arg) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::forward<T>(arg));
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+ }
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ auto then_unpack_interruptible(Func&& func) {
+ return then_interruptible([func=std::forward<Func>(func)](T&& tuple) mutable {
+ return std::apply(std::forward<Func>(func), std::move(tuple));
+ });
+ }
+
+ template <typename Func,
+ typename Result = interrupt_futurize_t<
+ std::result_of_t<Func(std::exception_ptr)>>>
+ [[gnu::always_inline]]
+ Result handle_exception_interruptible(Func&& func) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ return core_type::then_wrapped(
+ [func=std::forward<Func>(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& fut) mutable {
+ if (!fut.failed()) {
+ return seastar::make_ready_future<T>(fut.get());
+ } else {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ fut.get_exception());
+ }
+ });
+ }
+
+ template <bool may_interrupt = true, typename Func,
+ typename Result = interrupt_futurize_t<
+ std::result_of_t<Func()>>>
+ [[gnu::always_inline]]
+ Result finally_interruptible(Func&& func) {
+ if constexpr (may_interrupt) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ return core_type::then_wrapped(
+ [func=std::forward<Func>(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& fut) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func));
+ });
+ } else {
+ return core_type::finally(std::forward<Func>(func));
+ }
+ }
+
+ template <typename Func,
+ typename Result = interrupt_futurize_t<
+ std::result_of_t<Func(
+ typename seastar::function_traits<Func>::template arg<0>::type)>>>
+ [[gnu::always_inline]]
+ Result handle_exception_type_interruptible(Func&& func) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ using trait = seastar::function_traits<Func>;
+ static_assert(trait::arity == 1, "func can take only one parameter");
+ using ex_type = typename trait::template arg<0>::type;
+ return core_type::then_wrapped(
+ [func=std::forward<Func>(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& fut) mutable -> Result {
+ if (!fut.failed()) {
+ return seastar::make_ready_future<T>(fut.get());
+ } else {
+ try {
+ std::rethrow_exception(fut.get_exception());
+ } catch (ex_type& ex) {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func), ex);
+ }
+ }
+ });
+ }
+
+
+ using my_type = interruptible_future_detail<InterruptCond, seastar::future<T>>;
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ my_type finally(Func&& func) {
+ return core_type::finally(std::forward<Func>(func));
+ }
+private:
+ template <typename Func>
+ [[gnu::always_inline]]
+ auto handle_interruption(Func&& func) {
+ return core_type::then_wrapped(
+ [func=std::move(func)](auto&& fut) mutable {
+ if (fut.failed()) {
+ std::exception_ptr ex = fut.get_exception();
+ if (InterruptCond::is_interruption(ex)) {
+ return seastar::futurize_invoke(std::move(func), std::move(ex));
+ } else {
+ return seastar::make_exception_future<T>(std::move(ex));
+ }
+ } else {
+ return seastar::make_ready_future<T>(fut.get());
+ }
+ });
+ }
+
+ seastar::future<T> to_future() {
+ return static_cast<core_type&&>(std::move(*this));
+ }
+ // this is only supposed to be invoked by seastar functions
+ template <typename Func,
+ typename Result = interrupt_futurize_t<
+ std::result_of_t<Func(seastar::future<T>)>>>
+ [[gnu::always_inline]]
+ Result then_wrapped(Func&& func) {
+ return core_type::then_wrapped(
+ [func=std::move(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& fut) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::forward<Func>(func),
+ std::move(fut));
+ });
+ }
+ friend interruptor<InterruptCond>;
+ friend class interruptible_future_builder<InterruptCond>;
+ template <typename U>
+ friend struct ::seastar::futurize;
+ template <typename>
+ friend class ::seastar::future;
+ template <typename HeldState, typename Future>
+ friend class seastar::internal::do_with_state;
+ template<typename TX, typename F>
+ friend inline auto ::seastar::internal::do_with_impl(TX&& rvalue, F&& f);
+ template<typename T1, typename T2, typename T3_or_F, typename... More>
+ friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more);
+ template <typename T1, typename T2, typename... More>
+ friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more);
+ template <typename, typename>
+ friend class ::crimson::maybe_handle_error_t;
+ template <typename>
+ friend class ::seastar::internal::extract_values_from_futures_vector;
+ template <typename, typename>
+ friend class interruptible_future_detail;
+ template <typename ResolvedVectorTransform, typename Future>
+ friend inline typename ResolvedVectorTransform::future_type
+ seastar::internal::complete_when_all(
+ std::vector<Future>&& futures,
+ typename std::vector<Future>::iterator pos) noexcept;
+ template <typename>
+ friend class ::seastar::internal::when_all_state_component;
+ template <typename Lock, typename Func>
+ friend inline auto seastar::with_lock(Lock& lock, Func&& f);
+ template <typename IC, typename FT>
+ friend class parallel_for_each_state;
+};
+
+template <typename InterruptCond, typename Errorator>
+struct interruptible_errorator {
+ using base_ertr = Errorator;
+ using intr_cond_t = InterruptCond;
+
+ template <typename ValueT = void>
+ using future = interruptible_future_detail<InterruptCond,
+ typename Errorator::template future<ValueT>>;
+
+ template <class... NewAllowedErrorsT>
+ using extend = interruptible_errorator<
+ InterruptCond,
+ typename Errorator::template extend<NewAllowedErrorsT...>>;
+
+ template <class Ertr>
+ using extend_ertr = interruptible_errorator<
+ InterruptCond,
+ typename Errorator::template extend_ertr<Ertr>>;
+
+ template <typename ValueT = void, typename... A>
+ static interruptible_future_detail<
+ InterruptCond,
+ typename Errorator::template future<ValueT>>
+ make_ready_future(A&&... value) {
+ return interruptible_future_detail<
+ InterruptCond, typename Errorator::template future<ValueT>>(
+ Errorator::template make_ready_future<ValueT>(
+ std::forward<A>(value)...));
+ }
+ static interruptible_future_detail<
+ InterruptCond,
+ typename Errorator::template future<>> now() {
+ return interruptible_future_detail<
+ InterruptCond, typename Errorator::template future<>>(
+ Errorator::now());
+ }
+
+ using pass_further = typename Errorator::pass_further;
+};
+
+template <typename InterruptCond,
+ template <typename...> typename ErroratedFuture,
+ typename T>
+class [[nodiscard]] interruptible_future_detail<
+ InterruptCond,
+ ErroratedFuture<::crimson::errorated_future_marker<T>>>
+ : private ErroratedFuture<::crimson::errorated_future_marker<T>>
+{
+public:
+ using core_type = ErroratedFuture<crimson::errorated_future_marker<T>>;
+ using errorator_type = typename core_type::errorator_type;
+ using interrupt_errorator_type =
+ interruptible_errorator<InterruptCond, errorator_type>;
+ using interrupt_cond_type = InterruptCond;
+
+ template <typename U>
+ using interrupt_futurize_t =
+ typename interruptor<InterruptCond>::template futurize_t<U>;
+
+ using core_type::available;
+ using core_type::failed;
+ using core_type::core_type;
+ using core_type::get_exception;
+
+ using value_type = typename core_type::value_type;
+
+ interruptible_future_detail(seastar::future<T>&& fut)
+ : core_type(std::move(fut))
+ {}
+
+ template <template <typename...> typename ErroratedFuture2,
+ typename... U>
+ [[gnu::always_inline]]
+ interruptible_future_detail(
+ ErroratedFuture2<::crimson::errorated_future_marker<U...>>&& fut)
+ : core_type(std::move(fut)) {}
+
+ template <template <typename...> typename ErroratedFuture2,
+ typename... U>
+ [[gnu::always_inline]]
+ interruptible_future_detail(
+ interruptible_future_detail<InterruptCond,
+ ErroratedFuture2<::crimson::errorated_future_marker<U...>>>&& fut)
+ : core_type(static_cast<typename std::decay_t<decltype(fut)>::core_type&&>(fut)) {
+ using src_errorator_t = \
+ typename ErroratedFuture2<
+ ::crimson::errorated_future_marker<U...>>::errorator_type;
+ static_assert(core_type::errorator_type::template contains_once_v<
+ src_errorator_t>,
+ "conversion is only possible from less-or-eq errorated future!");
+ }
+
+ [[gnu::always_inline]]
+ interruptible_future_detail(
+ interruptible_future_detail<InterruptCond, seastar::future<T>>&& fut)
+ : core_type(static_cast<seastar::future<T>&&>(fut)) {}
+
+ template <class... A>
+ [[gnu::always_inline]]
+ interruptible_future_detail(ready_future_marker, A&&... a)
+ : core_type(::seastar::make_ready_future<typename core_type::value_type>(
+ std::forward<A>(a)...)) {
+ }
+ [[gnu::always_inline]]
+ interruptible_future_detail(exception_future_marker, ::seastar::future_state_base&& state) noexcept
+ : core_type(::seastar::futurize<core_type>::make_exception_future(std::move(state))) {
+ }
+ [[gnu::always_inline]]
+ interruptible_future_detail(exception_future_marker, std::exception_ptr&& ep) noexcept
+ : core_type(::seastar::futurize<core_type>::make_exception_future(std::move(ep))) {
+ }
+
+ template<bool interruptible = true, typename ValueInterruptCondT, typename ErrorVisitorT,
+ std::enable_if_t<!interruptible, int> = 0>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc, ErrorVisitorT&& errfunc) {
+ auto fut = core_type::safe_then(
+ std::forward<ValueInterruptCondT>(valfunc),
+ std::forward<ErrorVisitorT>(errfunc));
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+ template <typename... Args>
+ auto si_then(Args&&... args) {
+ return safe_then_interruptible(std::forward<Args>(args)...);
+ }
+
+
+ template<bool interruptible = true, typename ValueInterruptCondT, typename ErrorVisitorT,
+ typename U = T, std::enable_if_t<!std::is_void_v<U> && interruptible, int> = 0>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc, ErrorVisitorT&& errfunc) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ auto fut = core_type::safe_then(
+ [func=std::move(valfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (T&& args) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::forward<T>(args));
+ }, [func=std::move(errfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& err) mutable -> decltype(auto) {
+ constexpr bool return_void = std::is_void_v<
+ std::invoke_result_t<ErrorVisitorT,
+ std::decay_t<decltype(err)>>>;
+ constexpr bool return_err = ::crimson::is_error_v<
+ std::decay_t<std::invoke_result_t<ErrorVisitorT,
+ std::decay_t<decltype(err)>>>>;
+ if constexpr (return_err || return_void) {
+ return non_futurized_call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::move(err));
+ } else {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::move(err));
+ }
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+ template<bool interruptible = true, typename ValueInterruptCondT, typename ErrorVisitorT,
+ typename U = T, std::enable_if_t<std::is_void_v<U> && interruptible, int> = 0>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc, ErrorVisitorT&& errfunc) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ auto fut = core_type::safe_then(
+ [func=std::move(valfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ () mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func));
+ }, [func=std::move(errfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& err) mutable -> decltype(auto) {
+ constexpr bool return_void = std::is_void_v<
+ std::invoke_result_t<ErrorVisitorT,
+ std::decay_t<decltype(err)>>>;
+ constexpr bool return_err = ::crimson::is_error_v<
+ std::decay_t<std::invoke_result_t<ErrorVisitorT,
+ std::decay_t<decltype(err)>>>>;
+ if constexpr (return_err || return_void) {
+ return non_futurized_call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::move(err));
+ } else {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::move(err));
+ }
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+ template <bool interruptible = true, typename ValueInterruptCondT,
+ typename U = T, std::enable_if_t<std::is_void_v<T> && interruptible, int> = 0>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ auto fut = core_type::safe_then(
+ [func=std::move(valfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ () mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func));
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+ template <typename ValFuncT, typename ErrorFuncT>
+ [[gnu::always_inline]]
+ auto safe_then_unpack_interruptible(ValFuncT&& func, ErrorFuncT&& errfunc) {
+ return safe_then_interruptible([func=std::forward<ValFuncT>(func)](T&& tuple) mutable {
+ return std::apply(std::forward<ValFuncT>(func), std::move(tuple));
+ }, std::forward<ErrorFuncT>(errfunc));
+ }
+
+ template <typename ValFuncT>
+ [[gnu::always_inline]]
+ auto safe_then_unpack_interruptible(ValFuncT&& func) {
+ return safe_then_interruptible([func=std::forward<ValFuncT>(func)](T&& tuple) mutable {
+ return std::apply(std::forward<ValFuncT>(func), std::move(tuple));
+ });
+ }
+
+ template <bool interruptible = true, typename ValueInterruptCondT,
+ typename U = T, std::enable_if_t<!std::is_void_v<T> && interruptible, int> = 0>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ auto fut = core_type::safe_then(
+ [func=std::move(valfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (T&& arg) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(func),
+ std::forward<T>(arg));
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+ template <bool interruptible = true, typename ValueInterruptCondT,
+ std::enable_if_t<!interruptible, int> = 0>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc) {
+ auto fut = core_type::safe_then(std::forward<ValueInterruptCondT>(valfunc));
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+ template <typename ValueInterruptCondT,
+ typename ErrorVisitorHeadT,
+ typename... ErrorVisitorTailT>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible(ValueInterruptCondT&& valfunc,
+ ErrorVisitorHeadT&& err_func_head,
+ ErrorVisitorTailT&&... err_func_tail) {
+ return safe_then_interruptible(
+ std::forward<ValueInterruptCondT>(valfunc),
+ ::crimson::composer(std::forward<ErrorVisitorHeadT>(err_func_head),
+ std::forward<ErrorVisitorTailT>(err_func_tail)...));
+ }
+
+ template <typename ValueInterruptCondT,
+ typename ErrorVisitorHeadT,
+ typename... ErrorVisitorTailT>
+ [[gnu::always_inline]]
+ auto safe_then_interruptible_tuple(ValueInterruptCondT&& valfunc,
+ ErrorVisitorHeadT&& err_func_head,
+ ErrorVisitorTailT&&... err_func_tail) {
+ return safe_then_interruptible(
+ std::forward<ValueInterruptCondT>(valfunc),
+ ::crimson::composer(std::forward<ErrorVisitorHeadT>(err_func_head),
+ std::forward<ErrorVisitorTailT>(err_func_tail)...));
+ }
+
+ template <typename ValFuncT,
+ typename ErrorVisitorHeadT,
+ typename... ErrorVisitorTailT>
+ [[gnu::always_inline]]
+ auto safe_then_unpack_interruptible_tuple(
+ ValFuncT&& valfunc,
+ ErrorVisitorHeadT&& err_func_head,
+ ErrorVisitorTailT&&... err_func_tail) {
+ return safe_then_interruptible_tuple(
+ [valfunc=std::forward<ValFuncT>(valfunc)](T&& tuple) mutable {
+ return std::apply(std::forward<ValFuncT>(valfunc), std::move(tuple));
+ },
+ ::crimson::composer(std::forward<ErrorVisitorHeadT>(err_func_head),
+ std::forward<ErrorVisitorTailT>(err_func_tail)...));
+ }
+
+ template <bool interruptible = true, typename ErrorFunc>
+ auto handle_error_interruptible(ErrorFunc&& errfunc) {
+ if constexpr (interruptible) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ auto fut = core_type::handle_error(
+ [errfunc=std::move(errfunc),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& err) mutable -> decltype(auto) {
+ constexpr bool return_void = std::is_void_v<
+ std::invoke_result_t<ErrorFunc,
+ std::decay_t<decltype(err)>>>;
+ constexpr bool return_err = ::crimson::is_error_v<
+ std::decay_t<std::invoke_result_t<ErrorFunc,
+ std::decay_t<decltype(err)>>>>;
+ if constexpr (return_err || return_void) {
+ return non_futurized_call_with_interruption(
+ interrupt_condition,
+ std::move(errfunc),
+ std::move(err));
+ } else {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(errfunc),
+ std::move(err));
+ }
+ });
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ } else {
+ return core_type::handle_error(std::forward<ErrorFunc>(errfunc));
+ }
+ }
+
+ template <typename ErrorFuncHead,
+ typename... ErrorFuncTail>
+ auto handle_error_interruptible(ErrorFuncHead&& error_func_head,
+ ErrorFuncTail&&... error_func_tail) {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ static_assert(sizeof...(ErrorFuncTail) > 0);
+ return this->handle_error_interruptible(
+ ::crimson::composer(
+ std::forward<ErrorFuncHead>(error_func_head),
+ std::forward<ErrorFuncTail>(error_func_tail)...));
+ }
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ auto finally(Func&& func) {
+ auto fut = core_type::finally(std::forward<Func>(func));
+ return (interrupt_futurize_t<decltype(fut)>)(std::move(fut));
+ }
+
+private:
+ using core_type::_then;
+ template <typename Func>
+ [[gnu::always_inline]]
+ auto handle_interruption(Func&& func) {
+ // see errorator.h safe_then definition
+ using func_result_t =
+ typename std::invoke_result<Func, std::exception_ptr>::type;
+ using func_ertr_t =
+ typename core_type::template get_errorator_t<func_result_t>;
+ using this_ertr_t = typename core_type::errorator_type;
+ using ret_ertr_t = typename this_ertr_t::template extend_ertr<func_ertr_t>;
+ using futurator_t = typename ret_ertr_t::template futurize<func_result_t>;
+ return core_type::then_wrapped(
+ [func=std::move(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (auto&& fut) mutable
+ -> typename futurator_t::type {
+ if (fut.failed()) {
+ std::exception_ptr ex = fut.get_exception();
+ if (InterruptCond::is_interruption(ex)) {
+ return futurator_t::invoke(std::move(func), std::move(ex));
+ } else {
+ return futurator_t::make_exception_future(std::move(ex));
+ }
+ } else {
+ return std::move(fut);
+ }
+ });
+ }
+
+ ErroratedFuture<::crimson::errorated_future_marker<T>>
+ to_future() {
+ return static_cast<core_type&&>(std::move(*this));
+ }
+
+ friend class interruptor<InterruptCond>;
+ friend class interruptible_future_builder<InterruptCond>;
+ template <typename U>
+ friend struct ::seastar::futurize;
+ template <typename>
+ friend class ::seastar::future;
+ template<typename TX, typename F>
+ friend inline auto ::seastar::internal::do_with_impl(TX&& rvalue, F&& f);
+ template<typename T1, typename T2, typename T3_or_F, typename... More>
+ friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more);
+ template <typename T1, typename T2, typename... More>
+ friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more);
+ template <typename HeldState, typename Future>
+ friend class seastar::internal::do_with_state;
+ template <typename, typename>
+ friend class ::crimson::maybe_handle_error_t;
+ template <typename, typename>
+ friend class interruptible_future_detail;
+ template <typename Lock, typename Func>
+ friend inline auto seastar::with_lock(Lock& lock, Func&& f);
+ template <typename IC, typename FT>
+ friend class parallel_for_each_state;
+};
+
+template <typename InterruptCond, typename T = void>
+using interruptible_future =
+ interruptible_future_detail<InterruptCond, seastar::future<T>>;
+
+template <typename InterruptCond, typename Errorator, typename T = void>
+using interruptible_errorated_future =
+ interruptible_future_detail<
+ InterruptCond,
+ typename Errorator::template future<T>>;
+
+template <typename InterruptCond>
+struct interruptor
+{
+public:
+ using condition = InterruptCond;
+
+ template <typename FutureType>
+ [[gnu::always_inline]]
+ static interruptible_future_detail<InterruptCond, FutureType>
+ make_interruptible(FutureType&& fut) {
+ return interruptible_future_detail<InterruptCond, FutureType>(std::move(fut));
+ }
+
+ [[gnu::always_inline]]
+ static interruptible_future_detail<InterruptCond, seastar::future<>> now() {
+ return interruptible_future_detail<
+ InterruptCond,
+ seastar::future<>>(seastar::now());
+ }
+
+ template <typename ValueT = void, typename... A>
+ [[gnu::always_inline]]
+ static interruptible_future_detail<InterruptCond, seastar::future<ValueT>>
+ make_ready_future(A&&... value) {
+ return interruptible_future_detail<InterruptCond, seastar::future<ValueT>>(
+ seastar::make_ready_future<ValueT>(std::forward<A>(value)...));
+ }
+
+ template <typename T>
+ struct futurize {
+ using type = interruptible_future_detail<
+ InterruptCond, typename seastar::futurize<T>::type>;
+ };
+
+ template <typename FutureType>
+ struct futurize<interruptible_future_detail<InterruptCond, FutureType>> {
+ using type = interruptible_future_detail<InterruptCond, FutureType>;
+ };
+
+ template <typename T>
+ using futurize_t = typename futurize<T>::type;
+
+ template <typename Container, typename AsyncAction>
+ [[gnu::always_inline]]
+ static auto do_for_each(Container& c, AsyncAction&& action) {
+ return do_for_each(std::begin(c), std::end(c),
+ std::forward<AsyncAction>(action));
+ }
+
+ template <typename OpFunc, typename OnInterrupt,
+ typename... Params>
+ static inline auto with_interruption_cond(
+ OpFunc&& opfunc, OnInterrupt&& efunc, InterruptCond &&cond, Params&&... params) {
+ INTR_FUT_DEBUG(
+ "with_interruption_cond: interrupt_cond: {}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get());
+ return internal::call_with_interruption_impl(
+ seastar::make_lw_shared<InterruptCond>(std::move(cond)),
+ std::forward<OpFunc>(opfunc),
+ std::forward<Params>(params)...
+ ).template handle_interruption(std::move(efunc));
+ }
+
+ template <typename OpFunc, typename OnInterrupt,
+ typename... InterruptCondParams>
+ static inline auto with_interruption(
+ OpFunc&& opfunc, OnInterrupt&& efunc, InterruptCondParams&&... params) {
+ return with_interruption_cond(
+ std::forward<OpFunc>(opfunc),
+ std::forward<OnInterrupt>(efunc),
+ InterruptCond(std::forward<InterruptCondParams>(params)...));
+ }
+
+ template <typename Error,
+ typename Func,
+ typename... Params>
+ static inline auto with_interruption_to_error(
+ Func &&f, InterruptCond &&cond, Params&&... params) {
+ using func_result_t = std::invoke_result_t<Func, Params...>;
+ using func_ertr_t =
+ typename seastar::template futurize<
+ func_result_t>::core_type::errorator_type;
+ using with_trans_ertr =
+ typename func_ertr_t::template extend_ertr<errorator<Error>>;
+
+ using value_type = typename func_result_t::value_type;
+ using ftype = typename std::conditional_t<
+ std::is_same_v<value_type, seastar::internal::monostate>,
+ typename with_trans_ertr::template future<>,
+ typename with_trans_ertr::template future<value_type>>;
+
+ return with_interruption_cond(
+ std::forward<Func>(f),
+ [](auto e) -> ftype {
+ return Error::make();
+ },
+ std::forward<InterruptCond>(cond),
+ std::forward<Params>(params)...);
+ }
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ static auto wrap_function(Func&& func) {
+ return [func=std::forward<Func>(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]() mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::forward<Func>(func));
+ };
+ }
+
+ template <typename Iterator,
+ InvokeReturnsInterruptibleFuture<typename Iterator::reference> AsyncAction>
+ [[gnu::always_inline]]
+ static auto do_for_each(Iterator begin, Iterator end, AsyncAction&& action) {
+ using Result = std::invoke_result_t<AsyncAction, typename Iterator::reference>;
+ if constexpr (seastar::Future<typename Result::core_type>) {
+ return make_interruptible(
+ ::seastar::do_for_each(begin, end,
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (typename Iterator::reference x) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action),
+ std::forward<decltype(*begin)>(x)).to_future();
+ })
+ );
+ } else {
+ return make_interruptible(
+ ::crimson::do_for_each(begin, end,
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (typename Iterator::reference x) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action),
+ std::forward<decltype(*begin)>(x)).to_future();
+ })
+ );
+ }
+ }
+
+ template <typename Iterator, typename AsyncAction>
+ requires (!InvokeReturnsInterruptibleFuture<AsyncAction, typename Iterator::reference>)
+ [[gnu::always_inline]]
+ static auto do_for_each(Iterator begin, Iterator end, AsyncAction&& action) {
+ if constexpr (seastar::InvokeReturnsAnyFuture<AsyncAction, typename Iterator::reference>) {
+ return make_interruptible(
+ ::seastar::do_for_each(begin, end,
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (typename Iterator::reference x) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action),
+ std::forward<decltype(*begin)>(x));
+ })
+ );
+ } else {
+ return make_interruptible(
+ ::crimson::do_for_each(begin, end,
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (typename Iterator::reference x) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action),
+ std::forward<decltype(*begin)>(x));
+ })
+ );
+ }
+ }
+
+ template <InvokeReturnsInterruptibleFuture AsyncAction>
+ [[gnu::always_inline]]
+ static auto repeat(AsyncAction&& action) {
+ using Result = std::invoke_result_t<AsyncAction>;
+ if constexpr (seastar::Future<typename Result::core_type>) {
+ return make_interruptible(
+ ::seastar::repeat(
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action)).to_future();
+ })
+ );
+ } else {
+ return make_interruptible(
+ ::crimson::repeat(
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]() mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action)).to_future();
+ })
+ );
+ }
+ }
+ template <typename AsyncAction>
+ requires (!InvokeReturnsInterruptibleFuture<AsyncAction>)
+ [[gnu::always_inline]]
+ static auto repeat(AsyncAction&& action) {
+ if constexpr (seastar::InvokeReturnsAnyFuture<AsyncAction>) {
+ return make_interruptible(
+ ::seastar::repeat(
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action));
+ })
+ );
+ } else {
+ return make_interruptible(
+ ::crimson::repeat(
+ [action=std::move(action),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] {
+ return call_with_interruption(
+ interrupt_condition,
+ std::move(action));
+ })
+ );
+ }
+ }
+
+ template <typename Iterator, typename Func>
+ static inline auto parallel_for_each(
+ Iterator begin,
+ Iterator end,
+ Func&& func
+ ) noexcept {
+ using ResultType = std::invoke_result_t<Func, typename Iterator::reference>;
+ parallel_for_each_state<InterruptCond, ResultType>* s = nullptr;
+ auto decorated_func =
+ [func=std::forward<Func>(func),
+ interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]
+ (decltype(*Iterator())&& x) mutable {
+ return call_with_interruption(
+ interrupt_condition,
+ std::forward<Func>(func),
+ std::forward<decltype(*begin)>(x));
+ };
+ // Process all elements, giving each future the following treatment:
+ // - available, not failed: do nothing
+ // - available, failed: collect exception in ex
+ // - not available: collect in s (allocating it if needed)
+ while (begin != end) {
+ auto f = seastar::futurize_invoke(decorated_func, *begin++);
+ if (!f.available() || f.failed()) {
+ if (!s) {
+ using itraits = std::iterator_traits<Iterator>;
+ auto n = (seastar::internal::iterator_range_estimate_vector_capacity(
+ begin, end, typename itraits::iterator_category()) + 1);
+ s = new parallel_for_each_state<InterruptCond, ResultType>(n);
+ }
+ s->add_future(std::move(f));
+ }
+ }
+ // If any futures were not available, hand off to parallel_for_each_state::get_future().
+ // Otherwise we can return a result immediately.
+ if (s) {
+ // s->get_future() takes ownership of s (and chains it to one of the futures it contains)
+ // so this isn't a leak
+ return s->get_future();
+ }
+ return parallel_for_each_state<InterruptCond, ResultType>::now();
+ }
+
+ template <typename Container, typename Func>
+ static inline auto parallel_for_each(Container& container, Func&& func) noexcept {
+ return parallel_for_each(
+ std::begin(container),
+ std::end(container),
+ std::forward<Func>(func));
+ }
+
+ template <typename Iterator, typename Mapper, typename Initial, typename Reduce>
+ static inline interruptible_future<InterruptCond, Initial> map_reduce(
+ Iterator begin, Iterator end, Mapper&& mapper, Initial initial, Reduce&& reduce) {
+ struct state {
+ Initial result;
+ Reduce reduce;
+ };
+ auto s = seastar::make_lw_shared(state{std::move(initial), std::move(reduce)});
+ interruptible_future<InterruptCond> ret = seastar::make_ready_future<>();
+ while (begin != end) {
+ ret = seastar::futurize_invoke(mapper, *begin++).then_wrapped_interruptible(
+ [s = s.get(), ret = std::move(ret)] (auto f) mutable {
+ try {
+ s->result = s->reduce(std::move(s->result), std::move(f.get0()));
+ return std::move(ret);
+ } catch (...) {
+ return std::move(ret).then_wrapped_interruptible([ex = std::current_exception()] (auto f) {
+ f.ignore_ready_future();
+ return seastar::make_exception_future<>(ex);
+ });
+ }
+ });
+ }
+ return ret.then_interruptible([s] {
+ return seastar::make_ready_future<Initial>(std::move(s->result));
+ });
+ }
+ template <typename Range, typename Mapper, typename Initial, typename Reduce>
+ static inline interruptible_future<InterruptCond, Initial> map_reduce(
+ Range&& range, Mapper&& mapper, Initial initial, Reduce&& reduce) {
+ return map_reduce(std::begin(range), std::end(range), std::forward<Mapper>(mapper),
+ std::move(initial), std::move(reduce));
+ }
+
+ template<typename Fut>
+ requires seastar::Future<Fut> || IsInterruptibleFuture<Fut>
+ static auto futurize_invoke_if_func(Fut&& fut) noexcept {
+ return std::forward<Fut>(fut);
+ }
+
+ template<typename Func>
+ requires (!seastar::Future<Func>) && (!IsInterruptibleFuture<Func>)
+ static auto futurize_invoke_if_func(Func&& func) noexcept {
+ return seastar::futurize_invoke(std::forward<Func>(func));
+ }
+
+ template <typename... FutOrFuncs>
+ static inline auto when_all(FutOrFuncs&&... fut_or_funcs) noexcept {
+ return ::seastar::internal::when_all_impl(
+ futurize_invoke_if_func(std::forward<FutOrFuncs>(fut_or_funcs))...);
+ }
+
+ template <typename... FutOrFuncs>
+ static inline auto when_all_succeed(FutOrFuncs&&... fut_or_funcs) noexcept {
+ return ::seastar::internal::when_all_succeed_impl(
+ futurize_invoke_if_func(std::forward<FutOrFuncs>(fut_or_funcs))...);
+ }
+
+ template <typename Func,
+ typename Result = futurize_t<std::invoke_result_t<Func>>>
+ static inline Result async(Func&& func) {
+ auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond;
+ INTR_FUT_DEBUG(
+ "interruptible_future_detail::async() yielding out, "
+ "interrupt_cond {},{} cleared",
+ (void*)interruption_condition.get(),
+ typeid(InterruptCond).name());
+ interrupt_cond<InterruptCond>.reset();
+ auto ret = seastar::async([func=std::forward<Func>(func),
+ interruption_condition] () mutable {
+ return non_futurized_call_with_interruption(
+ interruption_condition, std::forward<Func>(func));
+ });
+ interrupt_cond<InterruptCond>.set(interruption_condition);
+ INTR_FUT_DEBUG(
+ "interruptible_future_detail::async() yield back, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ return ret;
+ }
+
+ template <class FutureT>
+ static decltype(auto) green_get(FutureT&& fut) {
+ if (fut.available()) {
+ return fut.get();
+ } else {
+ // destined to wait!
+ auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond;
+ INTR_FUT_DEBUG(
+ "green_get() waiting, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ interrupt_cond<InterruptCond>.reset();
+ auto&& value = fut.get();
+ interrupt_cond<InterruptCond>.set(interruption_condition);
+ INTR_FUT_DEBUG(
+ "green_get() got, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ return std::move(value);
+ }
+ }
+
+ static void yield() {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond;
+ INTR_FUT_DEBUG(
+ "interruptible_future_detail::yield() yielding out, "
+ "interrupt_cond {},{} cleared",
+ (void*)interruption_condition.get(),
+ typeid(InterruptCond).name());
+ interrupt_cond<InterruptCond>.reset();
+ seastar::thread::yield();
+ interrupt_cond<InterruptCond>.set(interruption_condition);
+ INTR_FUT_DEBUG(
+ "interruptible_future_detail::yield() yield back, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ }
+
+ static void maybe_yield() {
+ ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond);
+ if (seastar::thread::should_yield()) {
+ auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond;
+ INTR_FUT_DEBUG(
+      "interruptible_future_detail::maybe_yield() yielding out, "
+ "interrupt_cond {},{} cleared",
+ (void*)interruption_condition.get(),
+ typeid(InterruptCond).name());
+ interrupt_cond<InterruptCond>.reset();
+ seastar::thread::yield();
+ interrupt_cond<InterruptCond>.set(interruption_condition);
+ INTR_FUT_DEBUG(
+      "interruptible_future_detail::maybe_yield() yield back, interrupt_cond: {},{}",
+ (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(),
+ typeid(InterruptCond).name());
+ }
+ }
+};
+
+} // namespace crimson::interruptible
+
+namespace seastar {
+
+template <typename InterruptCond, typename... T>
+struct futurize<::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, seastar::future<T...>>> {
+ using type = ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond, seastar::future<T...>>;
+
+ using value_type = typename type::value_type;
+ using tuple_type = typename type::tuple_type;
+
+ static type from_tuple(tuple_type&& value) {
+ return type(ready_future_marker(), std::move(value));
+ }
+ static type from_tuple(const tuple_type& value) {
+ return type(ready_future_marker(), value);
+ }
+ static type from_tuple(value_type&& value) {
+ return type(ready_future_marker(), std::move(value));
+ }
+ static type from_tuple(const value_type& value) {
+ return type(ready_future_marker(), value);
+ }
+
+ template <typename Func, typename... FuncArgs>
+ [[gnu::always_inline]]
+ static inline type invoke(Func&& func, FuncArgs&&... args) noexcept {
+ try {
+ return func(std::forward<FuncArgs>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ static type invoke(Func&& func, seastar::internal::monostate) noexcept {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ static inline type make_exception_future(Arg&& arg) noexcept {
+ return seastar::make_exception_future<T...>(std::forward<Arg>(arg));
+ }
+
+ static inline type make_exception_future(future_state_base&& state) noexcept {
+ return seastar::internal::make_exception_future<T...>(std::move(state));
+ }
+
+ template<typename PromiseT, typename Func>
+ static void satisfy_with_result_of(PromiseT&& pr, Func&& func) {
+ func().forward_to(std::move(pr));
+ }
+};
+
+template <typename InterruptCond,
+ template <typename...> typename ErroratedFuture,
+ typename... T>
+struct futurize<
+ ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond,
+ ErroratedFuture<::crimson::errorated_future_marker<T...>>
+ >
+> {
+ using type = ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond,
+ ErroratedFuture<::crimson::errorated_future_marker<T...>>>;
+ using core_type = ErroratedFuture<
+ ::crimson::errorated_future_marker<T...>>;
+ using errorator_type =
+ ::crimson::interruptible::interruptible_errorator<
+ InterruptCond,
+ typename ErroratedFuture<
+ ::crimson::errorated_future_marker<T...>>::errorator_type>;
+
+ template<typename Func, typename... FuncArgs>
+ static inline type invoke(Func&& func, FuncArgs&&... args) noexcept {
+ try {
+ return func(std::forward<FuncArgs>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Func>
+ [[gnu::always_inline]]
+ static type invoke(Func&& func, seastar::internal::monostate) noexcept {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ static inline type make_exception_future(Arg&& arg) noexcept {
+ return core_type::errorator_type::template make_exception_future2<T...>(
+ std::forward<Arg>(arg));
+ }
+
+ template<typename PromiseT, typename Func>
+ static void satisfy_with_result_of(PromiseT&& pr, Func&& func) {
+ func().forward_to(std::move(pr));
+ }
+
+};
+
+template <typename InterruptCond, typename FutureType>
+struct continuation_base_from_future<
+ ::crimson::interruptible::interruptible_future_detail<InterruptCond, FutureType>> {
+ using type = typename seastar::continuation_base_from_future<FutureType>::type;
+};
+
+template <typename InterruptCond, typename FutureType>
+struct is_future<
+ ::crimson::interruptible::interruptible_future_detail<
+ InterruptCond,
+ FutureType>>
+ : std::true_type {};
+} // namespace seastar
diff --git a/src/crimson/common/layout.h b/src/crimson/common/layout.h
new file mode 100644
index 000000000..9d54ecd1d
--- /dev/null
+++ b/src/crimson/common/layout.h
@@ -0,0 +1,737 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// MOTIVATION AND TUTORIAL
+//
+// If you want to put in a single heap allocation N doubles followed by M ints,
+// it's easy if N and M are known at compile time.
+//
+// struct S {
+// double a[N];
+// int b[M];
+// };
+//
+// S* p = new S;
+//
+// But what if N and M are known only in run time? Class template Layout to the
+// rescue! It's a portable generalization of the technique known as struct hack.
+//
+// // This object will tell us everything we need to know about the memory
+// // layout of double[N] followed by int[M]. It's structurally identical to
+// // size_t[2] that stores N and M. It's very cheap to create.
+// const Layout<double, int> layout(N, M);
+//
+// // Allocate enough memory for both arrays. `AllocSize()` tells us how much
+// // memory is needed. We are free to use any allocation function we want as
+// // long as it returns aligned memory.
+// std::unique_ptr<unsigned char[]> p(new unsigned char[layout.AllocSize()]);
+//
+// // Obtain the pointer to the array of doubles.
+// // Equivalent to `reinterpret_cast<double*>(p.get())`.
+// //
+// // We could have written layout.Pointer<0>(p) instead. If all the types are
+// // unique you can use either form, but if some types are repeated you must
+// // use the index form.
+// double* a = layout.Pointer<double>(p.get());
+//
+// // Obtain the pointer to the array of ints.
+// // Equivalent to `reinterpret_cast<int*>(p.get() + N * 8)`.
+//   int* b = layout.Pointer<int>(p.get());
+//
+// If we are unable to specify sizes of all fields, we can pass as many sizes as
+// we can to `Partial()`. In return, it'll allow us to access the fields whose
+// locations and sizes can be computed from the provided information.
+// `Partial()` comes in handy when the array sizes are embedded into the
+// allocation.
+//
+// // size_t[1] containing N, size_t[1] containing M, double[N], int[M].
+// using L = Layout<size_t, size_t, double, int>;
+//
+// unsigned char* Allocate(size_t n, size_t m) {
+// const L layout(1, 1, n, m);
+// unsigned char* p = new unsigned char[layout.AllocSize()];
+// *layout.Pointer<0>(p) = n;
+// *layout.Pointer<1>(p) = m;
+// return p;
+// }
+//
+// void Use(unsigned char* p) {
+// // First, extract N and M.
+// // Specify that the first array has only one element. Using `prefix` we
+// // can access the first two arrays but not more.
+// constexpr auto prefix = L::Partial(1);
+// size_t n = *prefix.Pointer<0>(p);
+// size_t m = *prefix.Pointer<1>(p);
+//
+// // Now we can get pointers to the payload.
+// const L layout(1, 1, n, m);
+// double* a = layout.Pointer<double>(p);
+// int* b = layout.Pointer<int>(p);
+// }
+//
+// The layout we used above combines fixed-size with dynamically-sized fields.
+// This is quite common. Layout is optimized for this use case and generates
+// optimal code. All computations that can be performed at compile time are
+// indeed performed at compile time.
+//
+// Efficiency tip: The order of fields matters. In `Layout<T1, ..., TN>` try to
+// ensure that `alignof(T1) >= ... >= alignof(TN)`. This way you'll have no
+// padding in between arrays.
+//
+// You can manually override the alignment of an array by wrapping the type in
+// `Aligned<T, N>`. `Layout<..., Aligned<T, N>, ...>` has exactly the same API
+// and behavior as `Layout<..., T, ...>` except that the first element of the
+// array of `T` is aligned to `N` (the rest of the elements follow without
+// padding). `N` cannot be less than `alignof(T)`.
+//
+// `AllocSize()` and `Pointer()` are the most basic methods for dealing with
+// memory layouts. Check out the reference or code below to discover more.
+//
+// EXAMPLE
+//
+// // Immutable move-only string with sizeof equal to sizeof(void*). The
+// // string size and the characters are kept in the same heap allocation.
+// class CompactString {
+// public:
+// CompactString(const char* s = "") {
+// const size_t size = strlen(s);
+// // size_t[1] followed by char[size + 1].
+// const L layout(1, size + 1);
+// p_.reset(new unsigned char[layout.AllocSize()]);
+// // If running under ASAN, mark the padding bytes, if any, to catch
+// // memory errors.
+// layout.PoisonPadding(p_.get());
+// // Store the size in the allocation.
+// *layout.Pointer<size_t>(p_.get()) = size;
+// // Store the characters in the allocation.
+// memcpy(layout.Pointer<char>(p_.get()), s, size + 1);
+// }
+//
+// size_t size() const {
+// // Equivalent to reinterpret_cast<size_t&>(*p).
+// return *L::Partial().Pointer<size_t>(p_.get());
+// }
+//
+// const char* c_str() const {
+// // Equivalent to reinterpret_cast<char*>(p.get() + sizeof(size_t)).
+// // The argument in Partial(1) specifies that we have size_t[1] in front
+// // of the characters.
+// return L::Partial(1).Pointer<char>(p_.get());
+// }
+//
+// private:
+// // Our heap allocation contains a size_t followed by an array of chars.
+// using L = Layout<size_t, char>;
+// std::unique_ptr<unsigned char[]> p_;
+// };
+//
+// int main() {
+// CompactString s = "hello";
+// assert(s.size() == 5);
+// assert(strcmp(s.c_str(), "hello") == 0);
+// }
+//
+// DOCUMENTATION
+//
+// The interface exported by this file consists of:
+// - class `Layout<>` and its public members.
+// - The public members of class `internal_layout::LayoutImpl<>`. That class
+// isn't intended to be used directly, and its name and template parameter
+// list are internal implementation details, but the class itself provides
+// most of the functionality in this file. See comments on its members for
+// detailed documentation.
+//
+// `Layout<T1,... Tn>::Partial(count1,..., countm)` (where `m` <= `n`) returns a
+// `LayoutImpl<>` object. `Layout<T1,..., Tn> layout(count1,..., countn)`
+// creates a `Layout` object, which exposes the same functionality by inheriting
+// from `LayoutImpl<>`.
+
+#ifndef ABSL_CONTAINER_INTERNAL_LAYOUT_H_
+#define ABSL_CONTAINER_INTERNAL_LAYOUT_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <typeinfo>
+#include <utility>
+
+#ifdef ADDRESS_SANITIZER
+#include <sanitizer/asan_interface.h>
+#endif
+
+// boost::beast::span stands in for C++20 std::span (see SliceType below)
+#include <boost/beast/core/span.hpp>
+#include <fmt/format.h>
+
+#if defined(__GXX_RTTI)
+#define ABSL_INTERNAL_HAS_CXA_DEMANGLE
+#endif
+
+#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE
+#include <cxxabi.h>
+#endif
+
+namespace absl {
+namespace container_internal {
+
+// A type wrapper that instructs `Layout` to use the specific alignment for the
+// array. `Layout<..., Aligned<T, N>, ...>` has exactly the same API
+// and behavior as `Layout<..., T, ...>` except that the first element of the
+// array of `T` is aligned to `N` (the rest of the elements follow without
+// padding).
+//
+// Requires: `N >= alignof(T)` and `N` is a power of 2.
+template <class T, size_t N>
+struct Aligned;
+
+namespace internal_layout {
+
+template <class T>
+struct NotAligned {};
+
+template <class T, size_t N>
+struct NotAligned<const Aligned<T, N>> {
+ static_assert(sizeof(T) == 0, "Aligned<T, N> cannot be const-qualified");
+};
+
+template <size_t>
+using IntToSize = size_t;
+
+template <class>
+using TypeToSize = size_t;
+
+template <class T>
+struct Type : NotAligned<T> {
+ using type = T;
+};
+
+template <class T, size_t N>
+struct Type<Aligned<T, N>> {
+ using type = T;
+};
+
+template <class T>
+struct SizeOf : NotAligned<T>, std::integral_constant<size_t, sizeof(T)> {};
+
+template <class T, size_t N>
+struct SizeOf<Aligned<T, N>> : std::integral_constant<size_t, sizeof(T)> {};
+
+// Note: workaround for https://gcc.gnu.org/PR88115
+template <class T>
+struct AlignOf : NotAligned<T> {
+ static constexpr size_t value = alignof(T);
+};
+
+template <class T, size_t N>
+struct AlignOf<Aligned<T, N>> {
+ static_assert(N % alignof(T) == 0,
+ "Custom alignment can't be lower than the type's alignment");
+ static constexpr size_t value = N;
+};
+
+// Does `Ts...` contain `T`?
+template <class T, class... Ts>
+using Contains = std::disjunction<std::is_same<T, Ts>...>;
+
+template <class From, class To>
+using CopyConst =
+ typename std::conditional_t<std::is_const_v<From>, const To, To>;
+
+// Note: We're not qualifying this with absl:: because it doesn't compile under
+// MSVC.
+template <class T>
+using SliceType = boost::beast::span<T>;
+
+// This namespace contains no types. It prevents functions defined in it from
+// being found by ADL.
+namespace adl_barrier {
+
+template <class Needle, class... Ts>
+constexpr size_t Find(Needle, Needle, Ts...) {
+ static_assert(!Contains<Needle, Ts...>(), "Duplicate element type");
+ return 0;
+}
+
+template <class Needle, class T, class... Ts>
+constexpr size_t Find(Needle, T, Ts...) {
+ return adl_barrier::Find(Needle(), Ts()...) + 1;
+}
+
+constexpr bool IsPow2(size_t n) { return !(n & (n - 1)); }
+
+// Returns `q * m` for the smallest `q` such that `q * m >= n`.
+// Requires: `m` is a power of two. It's enforced by IsLegalElementType below.
+constexpr size_t Align(size_t n, size_t m) { return (n + m - 1) & ~(m - 1); }
+
+constexpr size_t Min(size_t a, size_t b) { return b < a ? b : a; }
+
+constexpr size_t Max(size_t a) { return a; }
+
+template <class... Ts>
+constexpr size_t Max(size_t a, size_t b, Ts... rest) {
+ return adl_barrier::Max(b < a ? a : b, rest...);
+}
+
+template <class T>
+std::string TypeName() {
+ std::string out;
+ int status = 0;
+ char* demangled = nullptr;
+#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE
+ demangled = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status);
+#endif
+ if (status == 0 && demangled != nullptr) { // Demangling succeeded.
+ out = fmt::format("<{}>", demangled);
+ free(demangled);
+ } else {
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
+ out = fmt::format("<{}>", typeid(T).name());
+#endif
+ }
+ return out;
+}
+
+} // namespace adl_barrier
+
+template <bool C>
+using EnableIf = typename std::enable_if_t<C, int>;
+
+// Can `T` be a template argument of `Layout`?
+template <class T>
+using IsLegalElementType = std::integral_constant<
+ bool, !std::is_reference_v<T> && !std::is_volatile_v<T> &&
+ !std::is_reference_v<typename Type<T>::type> &&
+ !std::is_volatile_v<typename Type<T>::type> &&
+ adl_barrier::IsPow2(AlignOf<T>::value)>;
+
+template <class Elements, class SizeSeq, class OffsetSeq>
+class LayoutImpl;
+
+// Public base class of `Layout` and the result type of `Layout::Partial()`.
+//
+// `Elements...` contains all template arguments of `Layout` that created this
+// instance.
+//
+// `SizeSeq...` is `[0, NumSizes)` where `NumSizes` is the number of arguments
+// passed to `Layout::Partial()` or `Layout::Layout()`.
+//
+// `OffsetSeq...` is `[0, NumOffsets)` where `NumOffsets` is
+// `Min(sizeof...(Elements), NumSizes + 1)` (the number of arrays for which we
+// can compute offsets).
+template <class... Elements, size_t... SizeSeq, size_t... OffsetSeq>
+class LayoutImpl<std::tuple<Elements...>, std::index_sequence<SizeSeq...>,
+ std::index_sequence<OffsetSeq...>> {
+ private:
+ static_assert(sizeof...(Elements) > 0, "At least one field is required");
+ static_assert(std::conjunction_v<IsLegalElementType<Elements>...>,
+ "Invalid element type (see IsLegalElementType)");
+
+ enum {
+ NumTypes = sizeof...(Elements),
+ NumSizes = sizeof...(SizeSeq),
+ NumOffsets = sizeof...(OffsetSeq),
+ };
+
+ // These are guaranteed by `Layout`.
+ static_assert(NumOffsets == adl_barrier::Min(NumTypes, NumSizes + 1),
+ "Internal error");
+ static_assert(NumTypes > 0, "Internal error");
+
+ // Returns the index of `T` in `Elements...`. Results in a compilation error
+ // if `Elements...` doesn't contain exactly one instance of `T`.
+ template <class T>
+ static constexpr size_t ElementIndex() {
+ static_assert(Contains<Type<T>, Type<typename Type<Elements>::type>...>(),
+ "Type not found");
+ return adl_barrier::Find(Type<T>(),
+ Type<typename Type<Elements>::type>()...);
+ }
+
+ template <size_t N>
+ using ElementAlignment =
+ AlignOf<typename std::tuple_element<N, std::tuple<Elements...>>::type>;
+
+ public:
+ // Element types of all arrays packed in a tuple.
+ using ElementTypes = std::tuple<typename Type<Elements>::type...>;
+
+ // Element type of the Nth array.
+ template <size_t N>
+ using ElementType = typename std::tuple_element<N, ElementTypes>::type;
+
+ constexpr explicit LayoutImpl(IntToSize<SizeSeq>... sizes)
+ : size_{sizes...} {}
+
+ // Alignment of the layout, equal to the strictest alignment of all elements.
+ // All pointers passed to the methods of layout must be aligned to this value.
+ static constexpr size_t Alignment() {
+ return adl_barrier::Max(AlignOf<Elements>::value...);
+ }
+
+ // Offset in bytes of the Nth array.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+  //   assert(x.Offset<0>() == 0);   // The ints start from 0.
+  //   assert(x.Offset<1>() == 16);  // The doubles start from 16.
+ //
+ // Requires: `N <= NumSizes && N < sizeof...(Ts)`.
+ template <size_t N, EnableIf<N == 0> = 0>
+ constexpr size_t Offset() const {
+ return 0;
+ }
+
+ template <size_t N, EnableIf<N != 0> = 0>
+ constexpr size_t Offset() const {
+ static_assert(N < NumOffsets, "Index out of bounds");
+ return adl_barrier::Align(
+ Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1],
+ ElementAlignment<N>::value);
+ }
+
+ // Offset in bytes of the array with the specified element type. There must
+ // be exactly one such array and its zero-based index must be at most
+ // `NumSizes`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+  //   assert(x.Offset<int>() == 0);      // The ints start from 0.
+  //   assert(x.Offset<double>() == 16);  // The doubles start from 16.
+ template <class T>
+ constexpr size_t Offset() const {
+ return Offset<ElementIndex<T>()>();
+ }
+
+ // Offsets in bytes of all arrays for which the offsets are known.
+ constexpr std::array<size_t, NumOffsets> Offsets() const {
+ return {{Offset<OffsetSeq>()...}};
+ }
+
+ // The number of elements in the Nth array. This is the Nth argument of
+ // `Layout::Partial()` or `Layout::Layout()` (zero-based).
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // assert(x.Size<0>() == 3);
+ // assert(x.Size<1>() == 4);
+ //
+ // Requires: `N < NumSizes`.
+ template <size_t N>
+ constexpr size_t Size() const {
+ static_assert(N < NumSizes, "Index out of bounds");
+ return size_[N];
+ }
+
+ // The number of elements in the array with the specified element type.
+ // There must be exactly one such array and its zero-based index must be
+ // at most `NumSizes`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // assert(x.Size<int>() == 3);
+ // assert(x.Size<double>() == 4);
+ template <class T>
+ constexpr size_t Size() const {
+ return Size<ElementIndex<T>()>();
+ }
+
+ // The number of elements of all arrays for which they are known.
+ constexpr std::array<size_t, NumSizes> Sizes() const {
+ return {{Size<SizeSeq>()...}};
+ }
+
+ // Pointer to the beginning of the Nth array.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // int* ints = x.Pointer<0>(p);
+ // double* doubles = x.Pointer<1>(p);
+ //
+ // Requires: `N <= NumSizes && N < sizeof...(Ts)`.
+ // Requires: `p` is aligned to `Alignment()`.
+ template <size_t N, class Char>
+ CopyConst<Char, ElementType<N>>* Pointer(Char* p) const {
+ using C = typename std::remove_const<Char>::type;
+ static_assert(
+ std::is_same<C, char>() || std::is_same<C, unsigned char>() ||
+ std::is_same<C, signed char>(),
+ "The argument must be a pointer to [const] [signed|unsigned] char");
+ constexpr size_t alignment = Alignment();
+ (void)alignment;
+ assert(reinterpret_cast<uintptr_t>(p) % alignment == 0);
+ return reinterpret_cast<CopyConst<Char, ElementType<N>>*>(p + Offset<N>());
+ }
+
+ // Pointer to the beginning of the array with the specified element type.
+ // There must be exactly one such array and its zero-based index must be at
+ // most `NumSizes`.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // int* ints = x.Pointer<int>(p);
+ // double* doubles = x.Pointer<double>(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ template <class T, class Char>
+ CopyConst<Char, T>* Pointer(Char* p) const {
+ return Pointer<ElementIndex<T>()>(p);
+ }
+
+ // Pointers to all arrays for which pointers are known.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ //
+ // int* ints;
+ // double* doubles;
+ // std::tie(ints, doubles) = x.Pointers(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ //
+ // Note: We're not using ElementType alias here because it does not compile
+ // under MSVC.
+ template <class Char>
+ std::tuple<CopyConst<
+ Char, typename std::tuple_element<OffsetSeq, ElementTypes>::type>*...>
+ Pointers(Char* p) const {
+ return std::tuple<CopyConst<Char, ElementType<OffsetSeq>>*...>(
+ Pointer<OffsetSeq>(p)...);
+ }
+
+ // The Nth array.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // Span<int> ints = x.Slice<0>(p);
+ // Span<double> doubles = x.Slice<1>(p);
+ //
+ // Requires: `N < NumSizes`.
+ // Requires: `p` is aligned to `Alignment()`.
+ template <size_t N, class Char>
+ SliceType<CopyConst<Char, ElementType<N>>> Slice(Char* p) const {
+ return SliceType<CopyConst<Char, ElementType<N>>>(Pointer<N>(p), Size<N>());
+ }
+
+ // The array with the specified element type. There must be exactly one
+ // such array and its zero-based index must be less than `NumSizes`.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // Span<int> ints = x.Slice<int>(p);
+ // Span<double> doubles = x.Slice<double>(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ template <class T, class Char>
+ SliceType<CopyConst<Char, T>> Slice(Char* p) const {
+ return Slice<ElementIndex<T>()>(p);
+ }
+
+ // All arrays with known sizes.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ //
+ // Span<int> ints;
+ // Span<double> doubles;
+ // std::tie(ints, doubles) = x.Slices(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ //
+ // Note: We're not using ElementType alias here because it does not compile
+ // under MSVC.
+ template <class Char>
+ std::tuple<SliceType<CopyConst<
+ Char, typename std::tuple_element<SizeSeq, ElementTypes>::type>>...>
+ Slices(Char* p) const {
+ // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63875 (fixed
+ // in 6.1).
+ (void)p;
+ return std::tuple<SliceType<CopyConst<Char, ElementType<SizeSeq>>>...>(
+ Slice<SizeSeq>(p)...);
+ }
+
+ // The size of the allocation that fits all arrays.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()]; // 48 bytes
+ //
+ // Requires: `NumSizes == sizeof...(Ts)`.
+ constexpr size_t AllocSize() const {
+ static_assert(NumTypes == NumSizes, "You must specify sizes of all fields");
+ return Offset<NumTypes - 1>() +
+ SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1];
+ }
+
+ // If built with --config=asan, poisons padding bytes (if any) in the
+ // allocation. The pointer must point to a memory block at least
+ // `AllocSize()` bytes in length.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ template <class Char, size_t N = NumOffsets - 1, EnableIf<N == 0> = 0>
+ void PoisonPadding(const Char* p) const {
+ Pointer<0>(p); // verify the requirements on `Char` and `p`
+ }
+
+ template <class Char, size_t N = NumOffsets - 1, EnableIf<N != 0> = 0>
+ void PoisonPadding(const Char* p) const {
+ static_assert(N < NumOffsets, "Index out of bounds");
+ (void)p;
+#ifdef ADDRESS_SANITIZER
+ PoisonPadding<Char, N - 1>(p);
+ // The `if` is an optimization. It doesn't affect the observable behaviour.
+ if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) {
+ size_t start =
+ Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1];
+ ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start);
+ }
+#endif
+ }
+
+ // Human-readable description of the memory layout. Useful for debugging.
+ // Slow.
+ //
+ // // char[5], 3 bytes of padding, int[3], 4 bytes of padding, followed
+ // // by an unknown number of doubles.
+ // auto x = Layout<char, int, double>::Partial(5, 3);
+ // assert(x.DebugString() ==
+ // "@0<char>(1)[5]; @8<int>(4)[3]; @24<double>(8)");
+ //
+ // Each field is in the following format: @offset<type>(sizeof)[size] (<type>
+ // may be missing depending on the target platform). For example,
+ // @8<int>(4)[3] means that at offset 8 we have an array of ints, where each
+ // int is 4 bytes, and we have 3 of those ints. The size of the last field may
+ // be missing (as in the example above). Only fields with known offsets are
+ // described. Type names may differ across platforms: one compiler might
+ // produce "unsigned*" where another produces "unsigned int *".
+ std::string DebugString() const {
+ const auto offsets = Offsets();
+ const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...};
+ const std::string types[] = {
+ adl_barrier::TypeName<ElementType<OffsetSeq>>()...};
+ std::string res = fmt::format("@0{}({})", types[0], sizes[0]);
+ for (size_t i = 0; i != NumOffsets - 1; ++i) {
+      res += fmt::format("[{}]; @{}{}({})", size_[i], offsets[i + 1], types[i + 1], sizes[i + 1]);
+ }
+ // NumSizes is a constant that may be zero. Some compilers cannot see that
+ // inside the if statement "size_[NumSizes - 1]" must be valid.
+ int last = static_cast<int>(NumSizes) - 1;
+ if (NumTypes == NumSizes && last >= 0) {
+ res += fmt::format("[{}]", size_[last]);
+ }
+ return res;
+ }
+
+ private:
+ // Arguments of `Layout::Partial()` or `Layout::Layout()`.
+ size_t size_[NumSizes > 0 ? NumSizes : 1];
+};
+
+template <size_t NumSizes, class... Ts>
+using LayoutType = LayoutImpl<
+ std::tuple<Ts...>, std::make_index_sequence<NumSizes>,
+ std::make_index_sequence<adl_barrier::Min(sizeof...(Ts), NumSizes + 1)>>;
+
+} // namespace internal_layout
+
+// Descriptor of arrays of various types and sizes laid out in memory one after
+// another. See the top of the file for documentation.
+//
+// Check out the public API of internal_layout::LayoutImpl above. The type is
+// internal to the library but its methods are public, and they are inherited
+// by `Layout`.
+template <class... Ts>
+class Layout : public internal_layout::LayoutType<sizeof...(Ts), Ts...> {
+ public:
+ static_assert(sizeof...(Ts) > 0, "At least one field is required");
+ static_assert(
+ std::conjunction_v<internal_layout::IsLegalElementType<Ts>...>,
+ "Invalid element type (see IsLegalElementType)");
+
+ // The result type of `Partial()` with `NumSizes` arguments.
+ template <size_t NumSizes>
+ using PartialType = internal_layout::LayoutType<NumSizes, Ts...>;
+
+ // `Layout` knows the element types of the arrays we want to lay out in
+ // memory but not the number of elements in each array.
+ // `Partial(size1, ..., sizeN)` allows us to specify the latter. The
+ // resulting immutable object can be used to obtain pointers to the
+ // individual arrays.
+ //
+ // It's allowed to pass fewer array sizes than the number of arrays. E.g.,
+  // if all you need is the offset of the second array, you only need to
+ // pass one argument -- the number of elements in the first array.
+ //
+ // // int[3] followed by 4 bytes of padding and an unknown number of
+ // // doubles.
+ // auto x = Layout<int, double>::Partial(3);
+ // // doubles start at byte 16.
+ // assert(x.Offset<1>() == 16);
+ //
+ // If you know the number of elements in all arrays, you can still call
+ // `Partial()` but it's more convenient to use the constructor of `Layout`.
+ //
+ // Layout<int, double> x(3, 5);
+ //
+ // Note: The sizes of the arrays must be specified in number of elements,
+ // not in bytes.
+ //
+ // Requires: `sizeof...(Sizes) <= sizeof...(Ts)`.
+ // Requires: all arguments are convertible to `size_t`.
+ template <class... Sizes>
+ static constexpr PartialType<sizeof...(Sizes)> Partial(Sizes&&... sizes) {
+ static_assert(sizeof...(Sizes) <= sizeof...(Ts));
+ return PartialType<sizeof...(Sizes)>(std::forward<Sizes>(sizes)...);
+ }
+
+ // Creates a layout with the sizes of all arrays specified. If you know
+ // only the sizes of the first N arrays (where N can be zero), you can use
+ // `Partial()` defined above. The constructor is essentially equivalent to
+ // calling `Partial()` and passing in all array sizes; the constructor is
+ // provided as a convenient abbreviation.
+ //
+ // Note: The sizes of the arrays must be specified in number of elements,
+ // not in bytes.
+ constexpr explicit Layout(internal_layout::TypeToSize<Ts>... sizes)
+ : internal_layout::LayoutType<sizeof...(Ts), Ts...>(sizes...) {}
+};
+
+} // namespace container_internal
+} // namespace absl
+
+#endif // ABSL_CONTAINER_INTERNAL_LAYOUT_H_
diff --git a/src/crimson/common/local_shared_foreign_ptr.h b/src/crimson/common/local_shared_foreign_ptr.h
new file mode 100644
index 000000000..c4bd1099a
--- /dev/null
+++ b/src/crimson/common/local_shared_foreign_ptr.h
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/smp.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/sharded.hh>
+
+namespace crimson {
+
+/**
+ * local_shared_foreign_ptr
+ *
+ * See seastar/include/seastar/core/sharded.hh:foreign_ptr
+ *
+ * seastar::foreign_ptr wraps a smart ptr by proxying the copy() and destructor
+ * operations back to the original core. This works well except that copy()
+ * requires a cross-core call. We need a smart_ptr which allows cross-core
+ * caching of (for example) OSDMaps, but we want to avoid the overhead inherent
+ * in incrementing the source smart_ptr on every copy. Thus,
+ * local_shared_foreign_ptr maintains a core-local foreign_ptr back to the
+ * original core instance with core-local ref counting.
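+ *
+ * A minimal usage sketch (OSDMapRef stands in for any copyable smart pointer
+ * type here; the name is illustrative only, not a requirement of this header):
+ *
+ *   // on the owning core, wrap the source smart pointer once
+ *   auto lref = crimson::make_local_shared_foreign(std::move(osdmap_ref));
+ *
+ *   // copies made on the local core only bump a core-local refcount; the
+ *   // wrapped foreign_ptr is released on the owning core once the last
+ *   // local reference goes away
+ *   auto another = lref;
+ *   assert(another.get() == lref.get());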
+ */
+template <typename PtrType>
+class local_shared_foreign_ptr {
+ using element_type = typename std::pointer_traits<PtrType>::element_type;
+ using pointer = element_type*;
+
+ seastar::lw_shared_ptr<seastar::foreign_ptr<PtrType>> ptr;
+
+ /// Wraps a pointer object and remembers the current core.
+ local_shared_foreign_ptr(seastar::foreign_ptr<PtrType> &&fptr)
+ : ptr(fptr ? seastar::make_lw_shared(std::move(fptr)) : nullptr) {
+ assert(!ptr || (ptr && *ptr));
+ }
+
+ template <typename T>
+ friend local_shared_foreign_ptr<T> make_local_shared_foreign(
+ seastar::foreign_ptr<T> &&);
+
+public:
+ /// Constructs a null local_shared_foreign_ptr<>.
+ local_shared_foreign_ptr() = default;
+
+ /// Constructs a null local_shared_foreign_ptr<>.
+ local_shared_foreign_ptr(std::nullptr_t) : local_shared_foreign_ptr() {}
+
+ /// Moves a local_shared_foreign_ptr<> to another object.
+ local_shared_foreign_ptr(local_shared_foreign_ptr&& other) = default;
+
+ /// Copies a local_shared_foreign_ptr<>
+ local_shared_foreign_ptr(const local_shared_foreign_ptr &other) = default;
+
+ /// Releases reference to ptr eventually releasing the contained foreign_ptr
+ ~local_shared_foreign_ptr() = default;
+
+ /// Creates a copy of this foreign ptr. Only works if the stored ptr is copyable.
+ seastar::future<seastar::foreign_ptr<PtrType>> get_foreign() const noexcept {
+ assert(!ptr || (ptr && *ptr));
+ return ptr ? ptr->copy() :
+ seastar::make_ready_future<seastar::foreign_ptr<PtrType>>(nullptr);
+ }
+
+ /// Accesses the wrapped object.
+ element_type& operator*() const noexcept {
+ assert(ptr && *ptr);
+ return **ptr;
+ }
+ /// Accesses the wrapped object.
+ element_type* operator->() const noexcept {
+ assert(ptr && *ptr);
+ return &**ptr;
+ }
+
+ /// Access the raw pointer to the wrapped object.
+ pointer get() const noexcept {
+ assert(!ptr || (ptr && *ptr));
+ return ptr ? ptr->get() : nullptr;
+ }
+
+ /// Return the owner-shard of the contained foreign_ptr.
+ unsigned get_owner_shard() const noexcept {
+ assert(!ptr || (ptr && *ptr));
+ return ptr ? ptr->get_owner_shard() : seastar::this_shard_id();
+ }
+
+ /// Checks whether the wrapped pointer is non-null.
+ operator bool() const noexcept {
+ assert(!ptr || (ptr && *ptr));
+ return static_cast<bool>(ptr);
+ }
+
+ /// Move-assigns a \c local_shared_foreign_ptr<>.
+ local_shared_foreign_ptr& operator=(local_shared_foreign_ptr&& other) noexcept {
+ ptr = std::move(other.ptr);
+ return *this;
+ }
+
+ /// Copy-assigns a \c local_shared_foreign_ptr<>.
+ local_shared_foreign_ptr& operator=(const local_shared_foreign_ptr& other) noexcept {
+ ptr = other.ptr;
+ return *this;
+ }
+
+ /// Reset the containing ptr
+ void reset() noexcept {
+ assert(!ptr || (ptr && *ptr));
+ ptr = nullptr;
+ }
+};
+
+/// Wraps a smart_ptr T in a local_shared_foreign_ptr<>.
+template <typename T>
+local_shared_foreign_ptr<T> make_local_shared_foreign(
+ seastar::foreign_ptr<T> &&ptr) {
+ return local_shared_foreign_ptr<T>(std::move(ptr));
+}
+
+/// Wraps ptr in a local_shared_foreign_ptr<>.
+template <typename T>
+local_shared_foreign_ptr<T> make_local_shared_foreign(T &&ptr) {
+ return make_local_shared_foreign<T>(
+ ptr ? seastar::make_foreign(std::forward<T>(ptr)) : nullptr);
+}
+
+template <typename T, typename U>
+inline bool operator==(const local_shared_foreign_ptr<T> &x,
+ const local_shared_foreign_ptr<U> &y) {
+ return x.get() == y.get();
+}
+
+template <typename T>
+inline bool operator==(const local_shared_foreign_ptr<T> &x, std::nullptr_t) {
+ return x.get() == nullptr;
+}
+
+template <typename T>
+inline bool operator==(std::nullptr_t, const local_shared_foreign_ptr<T>& y) {
+ return nullptr == y.get();
+}
+
+template <typename T, typename U>
+inline bool operator!=(const local_shared_foreign_ptr<T> &x,
+ const local_shared_foreign_ptr<U> &y) {
+ return x.get() != y.get();
+}
+
+template <typename T>
+inline bool operator!=(const local_shared_foreign_ptr<T> &x, std::nullptr_t) {
+ return x.get() != nullptr;
+}
+
+template <typename T>
+inline bool operator!=(std::nullptr_t, const local_shared_foreign_ptr<T>& y) {
+ return nullptr != y.get();
+}
+
+template <typename T, typename U>
+inline bool operator<(const local_shared_foreign_ptr<T> &x,
+ const local_shared_foreign_ptr<U> &y) {
+ return x.get() < y.get();
+}
+
+template <typename T>
+inline bool operator<(const local_shared_foreign_ptr<T> &x, std::nullptr_t) {
+ return x.get() < nullptr;
+}
+
+template <typename T>
+inline bool operator<(std::nullptr_t, const local_shared_foreign_ptr<T>& y) {
+ return nullptr < y.get();
+}
+
+template <typename T, typename U>
+inline bool operator<=(const local_shared_foreign_ptr<T> &x,
+ const local_shared_foreign_ptr<U> &y) {
+ return x.get() <= y.get();
+}
+
+template <typename T>
+inline bool operator<=(const local_shared_foreign_ptr<T> &x, std::nullptr_t) {
+ return x.get() <= nullptr;
+}
+
+template <typename T>
+inline bool operator<=(std::nullptr_t, const local_shared_foreign_ptr<T>& y) {
+ return nullptr <= y.get();
+}
+
+template <typename T, typename U>
+inline bool operator>(const local_shared_foreign_ptr<T> &x,
+ const local_shared_foreign_ptr<U> &y) {
+ return x.get() > y.get();
+}
+
+template <typename T>
+inline bool operator>(const local_shared_foreign_ptr<T> &x, std::nullptr_t) {
+ return x.get() > nullptr;
+}
+
+template <typename T>
+inline bool operator>(std::nullptr_t, const local_shared_foreign_ptr<T>& y) {
+ return nullptr > y.get();
+}
+
+template <typename T, typename U>
+inline bool operator>=(const local_shared_foreign_ptr<T> &x,
+ const local_shared_foreign_ptr<U> &y) {
+ return x.get() >= y.get();
+}
+
+template <typename T>
+inline bool operator>=(const local_shared_foreign_ptr<T> &x, std::nullptr_t) {
+ return x.get() >= nullptr;
+}
+
+template <typename T>
+inline bool operator>=(std::nullptr_t, const local_shared_foreign_ptr<T>& y) {
+ return nullptr >= y.get();
+}
+
+}
+
+namespace std {
+
+template <typename T>
+struct hash<crimson::local_shared_foreign_ptr<T>>
+ : private hash<typename std::pointer_traits<T>::element_type *> {
+ size_t operator()(const crimson::local_shared_foreign_ptr<T>& p) const {
+ return hash<typename std::pointer_traits<T>::element_type *>::operator()(p.get());
+ }
+};
+
+}
+
+namespace seastar {
+
+template<typename T>
+struct is_smart_ptr<crimson::local_shared_foreign_ptr<T>> : std::true_type {};
+
+}
diff --git a/src/crimson/common/log.cc b/src/crimson/common/log.cc
new file mode 100644
index 000000000..cae9f6a7b
--- /dev/null
+++ b/src/crimson/common/log.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "log.h"
+
+static std::array<seastar::logger, ceph_subsys_get_num()> loggers{
+#define SUBSYS(name, log_level, gather_level) \
+ seastar::logger(#name),
+#define DEFAULT_SUBSYS(log_level, gather_level) \
+ seastar::logger("none"),
+ #include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+};
+
+namespace crimson {
+seastar::logger& get_logger(int subsys) {
+ assert(subsys < ceph_subsys_max);
+ return loggers[subsys];
+}
+}
diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h
new file mode 100644
index 000000000..27ff550d8
--- /dev/null
+++ b/src/crimson/common/log.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <fmt/format.h>
+#include <seastar/util/log.hh>
+
+#include "common/subsys_types.h"
+
+namespace crimson {
+seastar::logger& get_logger(int subsys);
+static inline seastar::log_level to_log_level(int level) {
+ if (level < 0) {
+ return seastar::log_level::error;
+ } else if (level < 1) {
+ return seastar::log_level::warn;
+ } else if (level <= 5) {
+ return seastar::log_level::info;
+ } else if (level <= 20) {
+ return seastar::log_level::debug;
+ } else {
+ return seastar::log_level::trace;
+ }
+}
+}
+
+/* Logging convenience macros
+ *
+ * The intention here is to standardize prefixing log lines with the function name
+ * and a context prefix (like the operator<< for the PG). Place
+ *
+ * SET_SUBSYS(osd);
+ *
+ * at the top of the file to declare the log lines within the file as being (in this case)
+ * in the osd subsys. At the beginning of each method/function, add
+ *
+ * LOG_PREFIX(Class::method_name)
+ *
+ * to set the FNAME symbol to Class::method_name. In order to use the log macros
+ * within lambdas, capture FNAME by value.
+ *
+ * Log lines can then be declared using the appropriate macro below.
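+ *
+ * A minimal usage sketch (FooService::start is a hypothetical example, not an
+ * existing crimson class):
+ *
+ *   SET_SUBSYS(osd);
+ *
+ *   seastar::future<> FooService::start() {
+ *     LOG_PREFIX(FooService::start);
+ *     INFO("starting with {} shards", seastar::smp::count);
+ *     return seastar::now();
+ *   }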
+ */
+
+#define SET_SUBSYS(subname_) static constexpr auto SOURCE_SUBSYS = ceph_subsys_##subname_
+#define LOCAL_LOGGER crimson::get_logger(SOURCE_SUBSYS)
+#define LOGGER(subname_) crimson::get_logger(ceph_subsys_##subname_)
+#define LOG_PREFIX(x) constexpr auto FNAME = #x
+
+#define LOG(level_, MSG, ...) \
+ LOCAL_LOGGER.log(level_, "{}: " MSG, FNAME , ##__VA_ARGS__)
+#define SUBLOG(subname_, level_, MSG, ...) \
+ LOGGER(subname_).log(level_, "{}: " MSG, FNAME , ##__VA_ARGS__)
+
+#define TRACE(...) LOG(seastar::log_level::trace, __VA_ARGS__)
+#define SUBTRACE(subname_, ...) SUBLOG(subname_, seastar::log_level::trace, __VA_ARGS__)
+
+#define DEBUG(...) LOG(seastar::log_level::debug, __VA_ARGS__)
+#define SUBDEBUG(subname_, ...) SUBLOG(subname_, seastar::log_level::debug, __VA_ARGS__)
+
+#define INFO(...) LOG(seastar::log_level::info, __VA_ARGS__)
+#define SUBINFO(subname_, ...) SUBLOG(subname_, seastar::log_level::info, __VA_ARGS__)
+
+#define WARN(...) LOG(seastar::log_level::warn, __VA_ARGS__)
+#define SUBWARN(subname_, ...) SUBLOG(subname_, seastar::log_level::warn, __VA_ARGS__)
+
+#define ERROR(...) LOG(seastar::log_level::error, __VA_ARGS__)
+#define SUBERROR(subname_, ...) SUBLOG(subname_, seastar::log_level::error, __VA_ARGS__)
+
+// *DPP macros are intended to take DoutPrefixProvider implementations, but anything with
+// an operator<< will work as a prefix
+
+#define SUBLOGDPP(subname_, level_, MSG, dpp, ...) \
+ LOGGER(subname_).log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__)
+#define SUBTRACEDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::trace, __VA_ARGS__)
+#define SUBDEBUGDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::debug, __VA_ARGS__)
+#define SUBINFODPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::info, __VA_ARGS__)
+#define SUBWARNDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::warn, __VA_ARGS__)
+#define SUBERRORDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::error, __VA_ARGS__)
+
+#define LOGDPP(level_, MSG, dpp, ...) \
+ LOCAL_LOGGER.log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__)
+#define TRACEDPP(...) LOGDPP(seastar::log_level::trace, __VA_ARGS__)
+#define DEBUGDPP(...) LOGDPP(seastar::log_level::debug, __VA_ARGS__)
+#define INFODPP(...) LOGDPP(seastar::log_level::info, __VA_ARGS__)
+#define WARNDPP(...) LOGDPP(seastar::log_level::warn, __VA_ARGS__)
+#define ERRORDPP(...) LOGDPP(seastar::log_level::error, __VA_ARGS__)
diff --git a/src/crimson/common/logclient.cc b/src/crimson/common/logclient.cc
new file mode 100644
index 000000000..d402ecd19
--- /dev/null
+++ b/src/crimson/common/logclient.cc
@@ -0,0 +1,364 @@
+#include "crimson/common/logclient.h"
+#include <fmt/ranges.h>
+#include "include/str_map.h"
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+#include "messages/MMonGetVersion.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/mon/MonClient.h"
+#include "mon/MonMap.h"
+#include "common/Graylog.h"
+
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::string;
+using crimson::common::local_conf;
+
+namespace {
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_monc);
+ }
+}
+
+//TODO: in order to avoid unnecessary map declarations and moving them around,
+//      create a named structure containing the maps and return an optional
+//      of it.
+int parse_log_client_options(CephContext *cct,
+ map<string,string> &log_to_monitors,
+ map<string,string> &log_to_syslog,
+ map<string,string> &log_channels,
+ map<string,string> &log_prios,
+ map<string,string> &log_to_graylog,
+ map<string,string> &log_to_graylog_host,
+ map<string,string> &log_to_graylog_port,
+ uuid_d &fsid,
+ string &host)
+{
+ ostringstream oss;
+
+ int r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_monitors"), oss,
+ &log_to_monitors, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_monitors'", __func__);
+ return r;
+ }
+
+ r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_syslog"), oss,
+ &log_to_syslog, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_syslog'", __func__);
+ return r;
+ }
+
+ r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_syslog_facility"), oss,
+ &log_channels, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_syslog_facility'", __func__);
+ return r;
+ }
+
+ r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_syslog_level"), oss,
+ &log_prios, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_syslog_level'", __func__);
+ return r;
+ }
+
+ r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_graylog"), oss,
+ &log_to_graylog, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_graylog'", __func__);
+ return r;
+ }
+
+ r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_graylog_host"), oss,
+ &log_to_graylog_host, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_graylog_host'", __func__);
+ return r;
+ }
+
+ r = get_conf_str_map_helper(
+ cct->_conf.get_val<string>("clog_to_graylog_port"), oss,
+ &log_to_graylog_port, CLOG_CONFIG_DEFAULT_KEY);
+ if (r < 0) {
+ logger().error("{} error parsing 'clog_to_graylog_port'", __func__);
+ return r;
+ }
+
+ fsid = cct->_conf.get_val<uuid_d>("fsid");
+ host = cct->_conf->host;
+ return 0;
+}
+
+LogChannel::LogChannel(LogClient *lc, const string &channel)
+ : parent(lc), log_channel(channel), log_to_syslog(false),
+ log_to_monitors(false)
+{
+}
+
+LogChannel::LogChannel(LogClient *lc, const string &channel,
+ const string &facility, const string &prio)
+ : parent(lc), log_channel(channel), log_prio(prio),
+ syslog_facility(facility), log_to_syslog(false),
+ log_to_monitors(false)
+{
+}
+
+LogClient::LogClient(crimson::net::Messenger *m,
+ logclient_flag_t flags)
+ : messenger(m), is_mon(flags & FLAG_MON),
+ last_log_sent(0), last_log(0)
+{
+}
+
+void LogChannel::set_log_to_monitors(bool v)
+{
+ if (log_to_monitors != v) {
+ parent->reset();
+ log_to_monitors = v;
+ }
+}
+
+void LogChannel::update_config(map<string,string> &log_to_monitors,
+ map<string,string> &log_to_syslog,
+ map<string,string> &log_channels,
+ map<string,string> &log_prios,
+ map<string,string> &log_to_graylog,
+ map<string,string> &log_to_graylog_host,
+ map<string,string> &log_to_graylog_port,
+ uuid_d &fsid,
+ string &host)
+{
+ logger().debug(
+ "{} log_to_monitors {} log_to_syslog {} log_channels {} log_prios {}",
+ __func__, log_to_monitors, log_to_syslog, log_channels, log_prios);
+ bool to_monitors = (get_str_map_key(log_to_monitors, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY) == "true");
+ bool to_syslog = (get_str_map_key(log_to_syslog, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY) == "true");
+ string syslog_facility = get_str_map_key(log_channels, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY);
+ string prio = get_str_map_key(log_prios, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY);
+ bool to_graylog = (get_str_map_key(log_to_graylog, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY) == "true");
+ string graylog_host = get_str_map_key(log_to_graylog_host, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY);
+ string graylog_port_str = get_str_map_key(log_to_graylog_port, log_channel,
+ &CLOG_CONFIG_DEFAULT_KEY);
+ int graylog_port = atoi(graylog_port_str.c_str());
+
+ set_log_to_monitors(to_monitors);
+ set_log_to_syslog(to_syslog);
+ set_syslog_facility(syslog_facility);
+ set_log_prio(prio);
+
+ if (to_graylog && !graylog) { /* should but isn't */
+ graylog = seastar::make_shared<ceph::logging::Graylog>("clog");
+ } else if (!to_graylog && graylog) { /* shouldn't but is */
+ graylog = nullptr;
+ }
+
+ if (to_graylog && graylog) {
+ graylog->set_fsid(fsid);
+ graylog->set_hostname(host);
+ }
+
+ if (graylog && (!graylog_host.empty()) && (graylog_port != 0)) {
+ graylog->set_destination(graylog_host, graylog_port);
+ }
+
+  logger().debug("{} to_monitors: {} to_syslog: {} "
+                 "syslog_facility: {} prio: {} to_graylog: {} graylog_host: {} "
+                 "graylog_port: {}", __func__, (to_monitors ? "true" : "false"),
+ (to_syslog ? "true" : "false"), syslog_facility, prio,
+ (to_graylog ? "true" : "false"), graylog_host, graylog_port);
+}
+
+void LogChannel::do_log(clog_type prio, std::stringstream& ss)
+{
+ while (!ss.eof()) {
+ string s;
+ getline(ss, s);
+ if (!s.empty()) {
+ do_log(prio, s);
+ }
+ }
+}
+
+void LogChannel::do_log(clog_type prio, const std::string& s)
+{
+ if (CLOG_ERROR == prio) {
+ logger().error("log {} : {}", prio, s);
+ } else {
+ logger().warn("log {} : {}", prio, s);
+ }
+ LogEntry e;
+ e.stamp = ceph_clock_now();
+ e.addrs = parent->get_myaddrs();
+ e.name = parent->get_myname();
+ e.rank = parent->get_myrank();
+ e.prio = prio;
+ e.msg = s;
+ e.channel = get_log_channel();
+
+ // seq and who should be set for syslog/graylog/log_to_mon
+ // log to monitor?
+ if (log_to_monitors) {
+ e.seq = parent->queue(e);
+ } else {
+ e.seq = parent->get_next_seq();
+ }
+
+ // log to syslog?
+ if (do_log_to_syslog()) {
+ logger().warn("{} log to syslog", __func__);
+ e.log_to_syslog(get_log_prio(), get_syslog_facility());
+ }
+
+ // log to graylog?
+ if (do_log_to_graylog()) {
+ logger().warn("{} log to graylog", __func__);
+ graylog->log_log_entry(&e);
+ }
+}
+
+MessageURef LogClient::get_mon_log_message(log_flushing_t flush_flag)
+{
+ if (flush_flag == log_flushing_t::FLUSH) {
+ if (log_queue.empty()) {
+ return {};
+ }
+ // reset session
+ last_log_sent = log_queue.front().seq;
+ }
+ return _get_mon_log_message();
+}
+
+bool LogClient::are_pending() const
+{
+ return last_log > last_log_sent;
+}
+
+MessageURef LogClient::_get_mon_log_message()
+{
+ if (log_queue.empty()) {
+ return {};
+ }
+
+ // only send entries that haven't been sent yet during this mon
+ // session! monclient needs to call reset_session() on mon session
+ // reset for this to work right.
+
+ if (last_log_sent == last_log) {
+ return {};
+ }
+
+ // limit entries per message
+ const int64_t num_unsent = last_log - last_log_sent;
+ int64_t num_to_send;
+ if (local_conf()->mon_client_max_log_entries_per_message > 0) {
+ num_to_send = std::min(num_unsent,
+ local_conf()->mon_client_max_log_entries_per_message);
+ } else {
+ num_to_send = num_unsent;
+ }
+
+ logger().debug("log_queue is {} last_log {} sent {} num {} unsent {}"
+ " sending {}", log_queue.size(), last_log,
+ last_log_sent, log_queue.size(), num_unsent, num_to_send);
+ ceph_assert((unsigned)num_unsent <= log_queue.size());
+ auto log_iter = log_queue.begin();
+ std::deque<LogEntry> out_log_queue; /* will send the logs contained here */
+ while (log_iter->seq <= last_log_sent) {
+ ++log_iter;
+ ceph_assert(log_iter != log_queue.end());
+ }
+ while (num_to_send--) {
+ ceph_assert(log_iter != log_queue.end());
+ out_log_queue.push_back(*log_iter);
+ last_log_sent = log_iter->seq;
+ logger().debug(" will send {}", *log_iter);
+ ++log_iter;
+ }
+
+ return crimson::make_message<MLog>(m_fsid,
+ std::move(out_log_queue));
+}
+
+version_t LogClient::queue(LogEntry &entry)
+{
+ entry.seq = ++last_log;
+ log_queue.push_back(entry);
+
+ return entry.seq;
+}
+
+void LogClient::reset()
+{
+ if (log_queue.size()) {
+ log_queue.clear();
+ }
+ last_log_sent = last_log;
+}
+
+uint64_t LogClient::get_next_seq()
+{
+ return ++last_log;
+}
+
+entity_addrvec_t LogClient::get_myaddrs() const
+{
+ return messenger->get_myaddrs();
+}
+
+entity_name_t LogClient::get_myrank()
+{
+ return messenger->get_myname();
+}
+
+const EntityName& LogClient::get_myname() const
+{
+ return local_conf()->name;
+}
+
+seastar::future<> LogClient::handle_log_ack(Ref<MLogAck> m)
+{
+ logger().debug("handle_log_ack {}", *m);
+
+ version_t last = m->last;
+
+ auto q = log_queue.begin();
+ while (q != log_queue.end()) {
+ const LogEntry &entry(*q);
+ if (entry.seq > last)
+ break;
+ logger().debug(" logged {}", entry);
+ q = log_queue.erase(q);
+ }
+ return seastar::now();
+}
+
+LogChannelRef LogClient::create_channel(const std::string& name) {
+ auto it = channels.find(name);
+ if (it == channels.end()) {
+ it = channels.insert(it,
+ {name, seastar::make_lw_shared<LogChannel>(this, name)});
+ }
+ return it->second;
+}
+
+seastar::future<> LogClient::set_fsid(const uuid_d& fsid) {
+ m_fsid = fsid;
+ return seastar::now();
+}
+
diff --git a/src/crimson/common/logclient.h b/src/crimson/common/logclient.h
new file mode 100644
index 000000000..ab9b25091
--- /dev/null
+++ b/src/crimson/common/logclient.h
@@ -0,0 +1,232 @@
+#ifndef CEPH_LOGCLIENT_H
+#define CEPH_LOGCLIENT_H
+
+#include "common/LogEntry.h"
+#include "common/ostream_temp.h"
+#include "common/ref.h"
+#include "include/health.h"
+#include "crimson/net/Fwd.h"
+
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/timer.hh>
+
+class LogClient;
+class MLog;
+class MLogAck;
+class Message;
+struct uuid_d;
+struct Connection;
+
+class LogChannel;
+
+namespace ceph {
+namespace logging {
+ class Graylog;
+}
+}
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+namespace crimson::net {
+ class Messenger;
+}
+
+enum class log_flushing_t {
+ NO_FLUSH,
+ FLUSH
+};
+
+int parse_log_client_options(CephContext *cct,
+ std::map<std::string,std::string> &log_to_monitors,
+ std::map<std::string,std::string> &log_to_syslog,
+ std::map<std::string,std::string> &log_channels,
+ std::map<std::string,std::string> &log_prios,
+ std::map<std::string,std::string> &log_to_graylog,
+ std::map<std::string,std::string> &log_to_graylog_host,
+ std::map<std::string,std::string> &log_to_graylog_port,
+ uuid_d &fsid,
+ std::string &host);
+
+/** Manage where we output to and at which priority
+ *
+ * Not to be confused with the LogClient, which is the almighty coordinator
+ * of channels. We just deal with the boring part of the logging: send to
+ * syslog, send to file, generate LogEntry and queue it for the LogClient.
+ *
+ * Once the LogEntry is queued, the LogChannel is done with it; the
+ * LogClient deals with sending and handling LogEntries.
+ */
+class LogChannel : public LoggerSinkSet
+{
+public:
+ LogChannel(LogClient *lc, const std::string &channel);
+ LogChannel(LogClient *lc, const std::string &channel,
+ const std::string &facility, const std::string &prio);
+
+ OstreamTemp debug() {
+ return OstreamTemp(CLOG_DEBUG, this);
+ }
+ void debug(std::stringstream &s) final {
+ do_log(CLOG_DEBUG, s);
+ }
+ /**
+ * Convenience function mapping health status to
+ * the appropriate cluster log severity.
+ */
+ OstreamTemp health(health_status_t health) {
+ switch(health) {
+ case HEALTH_OK:
+ return info();
+ case HEALTH_WARN:
+ return warn();
+ case HEALTH_ERR:
+ return error();
+ default:
+ // Invalid health_status_t value
+ ceph_abort();
+ }
+ }
+ OstreamTemp info() final {
+ return OstreamTemp(CLOG_INFO, this);
+ }
+ void info(std::stringstream &s) final {
+ do_log(CLOG_INFO, s);
+ }
+ OstreamTemp warn() final {
+ return OstreamTemp(CLOG_WARN, this);
+ }
+ void warn(std::stringstream &s) final {
+ do_log(CLOG_WARN, s);
+ }
+ OstreamTemp error() final {
+ return OstreamTemp(CLOG_ERROR, this);
+ }
+ void error(std::stringstream &s) final {
+ do_log(CLOG_ERROR, s);
+ }
+ OstreamTemp sec() final {
+ return OstreamTemp(CLOG_SEC, this);
+ }
+ void sec(std::stringstream &s) final {
+ do_log(CLOG_SEC, s);
+ }
+
+ void set_log_to_monitors(bool v);
+ void set_log_to_syslog(bool v) {
+ log_to_syslog = v;
+ }
+ void set_log_channel(const std::string& v) {
+ log_channel = v;
+ }
+ void set_log_prio(const std::string& v) {
+ log_prio = v;
+ }
+ void set_syslog_facility(const std::string& v) {
+ syslog_facility = v;
+ }
+ const std::string& get_log_prio() const { return log_prio; }
+ const std::string& get_log_channel() const { return log_channel; }
+ const std::string& get_syslog_facility() const { return syslog_facility; }
+ bool must_log_to_syslog() const { return log_to_syslog; }
+ /**
+ * Do we want to log to syslog?
+ *
+ * @return true if log_to_syslog is true and both channel and prio
+ * are not empty; false otherwise.
+ */
+ bool do_log_to_syslog() {
+ return must_log_to_syslog() &&
+ !log_prio.empty() && !log_channel.empty();
+ }
+ bool must_log_to_monitors() { return log_to_monitors; }
+
+ bool do_log_to_graylog() {
+ return (graylog != nullptr);
+ }
+
+ using Ref = seastar::lw_shared_ptr<LogChannel>;
+
+ /**
+ * update config values from parsed k/v std::map for each config option
+ *
+ * Pick out the relevant value based on our channel.
+ */
+ void update_config(std::map<std::string,std::string> &log_to_monitors,
+ std::map<std::string,std::string> &log_to_syslog,
+ std::map<std::string,std::string> &log_channels,
+ std::map<std::string,std::string> &log_prios,
+ std::map<std::string,std::string> &log_to_graylog,
+ std::map<std::string,std::string> &log_to_graylog_host,
+ std::map<std::string,std::string> &log_to_graylog_port,
+ uuid_d &fsid,
+ std::string &host);
+
+ void do_log(clog_type prio, std::stringstream& ss) final;
+ void do_log(clog_type prio, const std::string& s) final;
+
+private:
+ LogClient *parent;
+ std::string log_channel;
+ std::string log_prio;
+ std::string syslog_facility;
+ bool log_to_syslog;
+ bool log_to_monitors;
+ seastar::shared_ptr<ceph::logging::Graylog> graylog;
+};
+
+using LogChannelRef = LogChannel::Ref;
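+
+// Usage sketch (illustrative only; `log_client` is a hypothetical, already
+// constructed LogClient instance, not part of the interfaces declared here):
+//
+//   LogChannelRef clog = log_client.create_channel();   // CLOG_CHANNEL_DEFAULT
+//   clog->info() << "osd boot completed";
+//   std::stringstream ss;
+//   ss << "slow requests detected";
+//   clog->do_log(CLOG_WARN, ss);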
+
+class LogClient
+{
+public:
+ enum logclient_flag_t {
+ NO_FLAGS = 0,
+ FLAG_MON = 0x1,
+ };
+
+ LogClient(crimson::net::Messenger *m, logclient_flag_t flags);
+
+ virtual ~LogClient() = default;
+
+ seastar::future<> handle_log_ack(Ref<MLogAck> m);
+ MessageURef get_mon_log_message(log_flushing_t flush_flag);
+ bool are_pending() const;
+
+ LogChannelRef create_channel() {
+ return create_channel(CLOG_CHANNEL_DEFAULT);
+ }
+
+ LogChannelRef create_channel(const std::string& name);
+
+ void destroy_channel(const std::string& name) {
+ channels.erase(name);
+ }
+
+ void shutdown() {
+ channels.clear();
+ }
+
+ uint64_t get_next_seq();
+ entity_addrvec_t get_myaddrs() const;
+ const EntityName& get_myname() const;
+ entity_name_t get_myrank();
+ version_t queue(LogEntry &entry);
+ void reset();
+ seastar::future<> set_fsid(const uuid_d& fsid);
+
+private:
+ MessageURef _get_mon_log_message();
+
+ crimson::net::Messenger *messenger;
+ bool is_mon;
+ version_t last_log_sent;
+ version_t last_log;
+ std::deque<LogEntry> log_queue;
+
+ std::map<std::string, LogChannelRef> channels;
+ uuid_d m_fsid;
+};
+#endif
+
diff --git a/src/crimson/common/operation.cc b/src/crimson/common/operation.cc
new file mode 100644
index 000000000..53399fb9b
--- /dev/null
+++ b/src/crimson/common/operation.cc
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "operation.h"
+
+namespace crimson {
+
+void Operation::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("operation");
+ f->dump_string("type", get_type_name());
+ f->dump_unsigned("id", id);
+ {
+ f->open_object_section("detail");
+ dump_detail(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void Operation::dump_brief(ceph::Formatter* f) const
+{
+ f->open_object_section("operation");
+ f->dump_string("type", get_type_name());
+ f->dump_unsigned("id", id);
+ f->close_section();
+}
+
+std::ostream &operator<<(std::ostream &lhs, const Operation &rhs) {
+ lhs << rhs.get_type_name() << "(id=" << rhs.get_id() << ", detail=";
+ rhs.print(lhs);
+ lhs << ")";
+ return lhs;
+}
+
+void Blocker::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("blocker");
+ f->dump_string("op_type", get_type_name());
+ {
+ f->open_object_section("detail");
+ dump_detail(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+namespace detail {
+void dump_time_event(const char* name,
+ const utime_t& timestamp,
+ ceph::Formatter* f)
+{
+ assert(f);
+ f->open_object_section("time_event");
+ f->dump_string("name", name);
+ f->dump_stream("initiated_at") << timestamp;
+ f->close_section();
+}
+
+void dump_blocking_event(const char* name,
+ const utime_t& timestamp,
+ const Blocker* const blocker,
+ ceph::Formatter* f)
+{
+ assert(f);
+ f->open_object_section("blocking_event");
+ f->dump_string("name", name);
+ f->dump_stream("initiated_at") << timestamp;
+ if (blocker) {
+ blocker->dump(f);
+ }
+ f->close_section();
+}
+} // namespace detail
+} // namespace crimson
diff --git a/src/crimson/common/operation.h b/src/crimson/common/operation.h
new file mode 100644
index 000000000..6df2c99fd
--- /dev/null
+++ b/src/crimson/common/operation.h
@@ -0,0 +1,776 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <set>
+#include <vector>
+#include <boost/core/demangle.hpp>
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/shared_mutex.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/timer.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/future-util.hh>
+
+#include "include/ceph_assert.h"
+#include "include/utime.h"
+#include "common/Clock.h"
+#include "common/Formatter.h"
+#include "crimson/common/interruptible_future.h"
+#include "crimson/common/smp_helpers.h"
+#include "crimson/common/log.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson {
+
+using registry_hook_t = boost::intrusive::list_member_hook<
+ boost::intrusive::link_mode<boost::intrusive::auto_unlink>>;
+
+class Operation;
+class Blocker;
+
+
+namespace detail {
+void dump_time_event(const char* name,
+ const utime_t& timestamp,
+ ceph::Formatter* f);
+void dump_blocking_event(const char* name,
+ const utime_t& timestamp,
+ const Blocker* blocker,
+ ceph::Formatter* f);
+} // namespace detail
+
+/**
+ * Provides an interface for dumping diagnostic information about
+ * why a particular op is not making progress.
+ */
+class Blocker {
+public:
+ void dump(ceph::Formatter *f) const;
+ virtual ~Blocker() = default;
+
+private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+ virtual const char *get_type_name() const = 0;
+};
+
+// the main template. by default an operation has no external
+// event handler (the empty tuple). specializing the template
+// allows defining backends on a per-operation-type basis.
+// NOTE: basically this could be a function, but C++ disallows
+// differentiating the return type among specializations.
+template <class T>
+struct EventBackendRegistry {
+ template <typename...> static constexpr bool always_false = false;
+
+ static std::tuple<> get_backends() {
+ static_assert(always_false<T>, "Registry specialization not found");
+ return {};
+ }
+};
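+
+// For illustration only (MyOp and MyEventBackend are hypothetical): a
+// specialization supplies the extra backends whose handle() is called on
+// every trigger() for that operation type:
+//
+//   struct MyOp;
+//   struct MyEventBackend {
+//     template <class EventT, class... Args>
+//     void handle(EventT&, const MyOp&, const Args&...) {
+//       // e.g. bump a perf counter
+//     }
+//   };
+//   template <>
+//   struct EventBackendRegistry<MyOp> {
+//     static std::tuple<MyEventBackend> get_backends() { return {}; }
+//   };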
+
+template <class T>
+struct Event {
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+ template <class OpT, class... Args>
+ void trigger(OpT&& op, Args&&... args) {
+ that()->internal_backend.handle(*that(),
+ std::forward<OpT>(op),
+ std::forward<Args>(args)...);
+    // let's call `handle()` for the concrete event type on each
+    // of our backends. the order in the registry matters.
+ std::apply([&, //args=std::forward_as_tuple(std::forward<Args>(args)...),
+ this] (auto... backend) {
+ (..., backend.handle(*that(),
+ std::forward<OpT>(op),
+ std::forward<Args>(args)...));
+ }, EventBackendRegistry<std::decay_t<OpT>>::get_backends());
+ }
+};
+
+
+// simplest event type for recording things like beginning or end
+// of TrackableOperation's life.
+template <class T>
+struct TimeEvent : Event<T> {
+ struct Backend {
+    // `T` is passed solely to let implementations discriminate
+    // based on the type of event.
+ virtual void handle(T&, const Operation&) = 0;
+ };
+
+ // for the sake of dumping ops-in-flight.
+ struct InternalBackend final : Backend {
+ void handle(T&, const Operation&) override {
+ timestamp = ceph_clock_now();
+ }
+ utime_t timestamp;
+ } internal_backend;
+
+ void dump(ceph::Formatter *f) const {
+ auto demangled_name = boost::core::demangle(typeid(T).name());
+ detail::dump_time_event(
+ demangled_name.c_str(),
+ internal_backend.timestamp, f);
+ }
+
+ auto get_timestamp() const {
+ return internal_backend.timestamp;
+ }
+};
+
+
+template <typename T>
+class BlockerT : public Blocker {
+public:
+ struct BlockingEvent : Event<typename T::BlockingEvent> {
+ using Blocker = std::decay_t<T>;
+
+ struct Backend {
+      // `T` is passed solely to let implementations discriminate
+      // based on the type of event.
+ virtual void handle(typename T::BlockingEvent&, const Operation&, const T&) = 0;
+ };
+
+ struct InternalBackend : Backend {
+ void handle(typename T::BlockingEvent&,
+ const Operation&,
+ const T& blocker) override {
+ this->timestamp = ceph_clock_now();
+ this->blocker = &blocker;
+ }
+
+ utime_t timestamp;
+ const T* blocker;
+ } internal_backend;
+
+    // we don't want any BlockerT to be aware of, and coupled with,
+    // an operation. to avoid templatizing the entire path from an op to
+    // a blocker, type erasure is used.
+ struct TriggerI {
+ TriggerI(BlockingEvent& event) : event(event) {}
+
+ template <class FutureT>
+ auto maybe_record_blocking(FutureT&& fut, const T& blocker) {
+ if (!fut.available()) {
+        // a full-blown call via the vtable. that's the cost of avoiding
+        // templatization. anyway, most call sites actually have the type
+        // knowledge.
+ record_blocking(blocker);
+ return std::forward<FutureT>(fut).finally(
+ [&event=this->event, &blocker] () mutable {
+            // beware: the trigger instance may already be dead when this
+            // is executed!
+ record_unblocking(event, blocker);
+ });
+ }
+ return std::forward<FutureT>(fut);
+ }
+ virtual ~TriggerI() = default;
+ protected:
+ // it's for the sake of erasing the OpT type
+ virtual void record_blocking(const T& blocker) = 0;
+
+ static void record_unblocking(BlockingEvent& event, const T& blocker) {
+ assert(event.internal_backend.blocker == &blocker);
+ event.internal_backend.blocker = nullptr;
+ }
+
+ BlockingEvent& event;
+ };
+
+ template <class OpT>
+ struct Trigger : TriggerI {
+ Trigger(BlockingEvent& event, const OpT& op) : TriggerI(event), op(op) {}
+
+ template <class FutureT>
+ auto maybe_record_blocking(FutureT&& fut, const T& blocker) {
+ if (!fut.available()) {
+ // no need for the dynamic dispatch! if we're lucky, a compiler
+ // should collapse all these abstractions into a bunch of movs.
+ this->Trigger::record_blocking(blocker);
+ return std::forward<FutureT>(fut).finally(
+ [&event=this->event, &blocker] () mutable {
+ Trigger::record_unblocking(event, blocker);
+ });
+ }
+ return std::forward<FutureT>(fut);
+ }
+
+ const OpT &get_op() { return op; }
+
+ protected:
+ void record_blocking(const T& blocker) override {
+ this->event.trigger(op, blocker);
+ }
+
+ const OpT& op;
+ };
+
+ void dump(ceph::Formatter *f) const {
+ auto demangled_name = boost::core::demangle(typeid(T).name());
+ detail::dump_blocking_event(
+ demangled_name.c_str(),
+ internal_backend.timestamp,
+ internal_backend.blocker,
+ f);
+ }
+ };
+
+ virtual ~BlockerT() = default;
+ template <class TriggerT, class... Args>
+ decltype(auto) track_blocking(TriggerT&& trigger, Args&&... args) {
+ return std::forward<TriggerT>(trigger).maybe_record_blocking(
+ std::forward<Args>(args)..., static_cast<const T&>(*this));
+ }
+
+private:
+ const char *get_type_name() const final {
+ return static_cast<const T*>(this)->type_name;
+ }
+};
+
+template <class T>
+struct AggregateBlockingEvent {
+ struct TriggerI {
+ protected:
+ struct TriggerContainerI {
+ virtual typename T::TriggerI& get_trigger() = 0;
+ virtual ~TriggerContainerI() = default;
+ };
+ using TriggerContainerIRef = std::unique_ptr<TriggerContainerI>;
+ virtual TriggerContainerIRef create_part_trigger() = 0;
+
+ public:
+ template <class FutureT>
+ auto maybe_record_blocking(FutureT&& fut,
+ const typename T::Blocker& blocker) {
+      // AggregateBlockingEvent is supposed to be used on relatively cold
+      // paths (recovery), so we don't need to worry about the overhead of
+      // dynamic polymorphism / dynamic memory allocation.
+ auto tcont = create_part_trigger();
+ return tcont->get_trigger().maybe_record_blocking(
+ std::move(fut), blocker
+ ).finally([tcont=std::move(tcont)] {});
+ }
+
+ virtual ~TriggerI() = default;
+ };
+
+ template <class OpT>
+ struct Trigger final : TriggerI {
+ Trigger(AggregateBlockingEvent& event, const OpT& op)
+ : event(event), op(op) {}
+
+ class TriggerContainer final : public TriggerI::TriggerContainerI {
+ AggregateBlockingEvent& event;
+ typename decltype(event.events)::iterator iter;
+ typename T::template Trigger<OpT> trigger;
+
+ typename T::TriggerI &get_trigger() final {
+ return trigger;
+ }
+
+ public:
+ TriggerContainer(AggregateBlockingEvent& _event, const OpT& op) :
+ event(_event),
+ iter(event.events.emplace(event.events.end())),
+ trigger(*iter, op) {}
+
+ ~TriggerContainer() final {
+ event.events.erase(iter);
+ }
+ };
+
+ protected:
+ typename TriggerI::TriggerContainerIRef create_part_trigger() final {
+ return std::make_unique<TriggerContainer>(event, op);
+ }
+
+ private:
+ AggregateBlockingEvent& event;
+ const OpT& op;
+ };
+
+private:
+ std::list<T> events;
+ template <class OpT>
+ friend class Trigger;
+};
+
+/**
+ * Common base for all crimson-osd operations. Mainly provides
+ * an interface for registering ops in flight and dumping
+ * diagnostic information.
+ */
+class Operation : public boost::intrusive_ref_counter<
+ Operation, boost::thread_unsafe_counter> {
+ public:
+ using id_t = uint64_t;
+ static constexpr id_t NULL_ID = std::numeric_limits<uint64_t>::max();
+ id_t get_id() const {
+ return id;
+ }
+
+ static constexpr bool is_trackable = false;
+
+ virtual unsigned get_type() const = 0;
+ virtual const char *get_type_name() const = 0;
+ virtual void print(std::ostream &) const = 0;
+
+ void dump(ceph::Formatter *f) const;
+ void dump_brief(ceph::Formatter *f) const;
+ virtual ~Operation() = default;
+
+ private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+
+ registry_hook_t registry_hook;
+
+ id_t id = 0;
+ void set_id(id_t in_id) {
+ id = in_id;
+ }
+
+ friend class OperationRegistryI;
+ template <size_t>
+ friend class OperationRegistryT;
+};
+using OperationRef = boost::intrusive_ptr<Operation>;
+
+std::ostream &operator<<(std::ostream &, const Operation &op);
+
+/**
+ * Maintains a set of lists of all active ops.
+ */
+class OperationRegistryI {
+ using op_list_member_option = boost::intrusive::member_hook<
+ Operation,
+ registry_hook_t,
+ &Operation::registry_hook
+ >;
+
+ friend class Operation;
+ seastar::timer<seastar::lowres_clock> shutdown_timer;
+ seastar::promise<> shutdown;
+
+protected:
+ virtual void do_register(Operation *op) = 0;
+ virtual bool registries_empty() const = 0;
+ virtual void do_stop() = 0;
+
+public:
+ using op_list = boost::intrusive::list<
+ Operation,
+ op_list_member_option,
+ boost::intrusive::constant_time_size<false>>;
+
+ template <typename T, typename... Args>
+ auto create_operation(Args&&... args) {
+ boost::intrusive_ptr<T> op = new T(std::forward<Args>(args)...);
+ do_register(&*op);
+ return op;
+ }
+
+ seastar::future<> stop() {
+ crimson::get_logger(ceph_subsys_osd).info("OperationRegistryI::{}", __func__);
+ do_stop();
+ shutdown_timer.set_callback([this] {
+ if (registries_empty()) {
+ shutdown.set_value();
+ shutdown_timer.cancel();
+ }
+ });
+ shutdown_timer.arm_periodic(
+ std::chrono::milliseconds(100/*TODO: use option instead*/));
+ return shutdown.get_future();
+ }
+};
+
+
+template <size_t NUM_REGISTRIES>
+class OperationRegistryT : public OperationRegistryI {
+ Operation::id_t next_id = 0;
+ std::array<
+ op_list,
+ NUM_REGISTRIES
+ > registries;
+
+protected:
+ void do_register(Operation *op) final {
+ const auto op_type = op->get_type();
+ registries[op_type].push_back(*op);
+ op->set_id(++next_id);
+ }
+
+ bool registries_empty() const final {
+ return std::all_of(registries.begin(),
+ registries.end(),
+ [](auto& opl) {
+ return opl.empty();
+ });
+ }
+
+protected:
+ OperationRegistryT(core_id_t core)
+ // Use core to initialize upper 8 bits of counters to ensure that
+ // ids generated by different cores are disjoint
+ : next_id(static_cast<id_t>(core) <<
+ (std::numeric_limits<id_t>::digits - 8))
+ {}
+
+ template <size_t REGISTRY_INDEX>
+ const op_list& get_registry() const {
+ static_assert(
+ REGISTRY_INDEX < std::tuple_size<decltype(registries)>::value);
+ return registries[REGISTRY_INDEX];
+ }
+
+ template <size_t REGISTRY_INDEX>
+ op_list& get_registry() {
+ static_assert(
+ REGISTRY_INDEX < std::tuple_size<decltype(registries)>::value);
+ return registries[REGISTRY_INDEX];
+ }
+
+public:
+ /// Iterate over live ops
+ template <typename F>
+ void for_each_op(F &&f) const {
+ for (const auto &registry: registries) {
+ for (const auto &op: registry) {
+ std::invoke(f, op);
+ }
+ }
+ }
+
+ /// Removes op from registry
+ void remove_from_registry(Operation &op) {
+ const auto op_type = op.get_type();
+ registries[op_type].erase(op_list::s_iterator_to(op));
+ }
+
+ /// Adds op to registry
+ void add_to_registry(Operation &op) {
+ const auto op_type = op.get_type();
+ registries[op_type].push_back(op);
+ }
+};
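+
+// Usage sketch (illustrative only; MyRegistry and MyOp are hypothetical, and
+// MyOp is assumed to implement the Operation interface with get_type() == 0):
+//
+//   struct MyRegistry : OperationRegistryT<1> {
+//     MyRegistry() : OperationRegistryT(seastar::this_shard_id()) {}
+//   };
+//   MyRegistry registry;
+//   auto op = registry.create_operation<MyOp>(/* MyOp ctor args */);
+//   // ... on shutdown, wait for all registered ops to drain:
+//   //   return registry.stop();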
+
+class PipelineExitBarrierI {
+public:
+ using Ref = std::unique_ptr<PipelineExitBarrierI>;
+
+ /// Waits for exit barrier
+ virtual std::optional<seastar::future<>> wait() = 0;
+
+ /// Releases pipeline stage, can only be called after wait
+ virtual void exit() = 0;
+
+ /// Releases pipeline resources without waiting on barrier
+ virtual void cancel() = 0;
+
+ /// Must ensure that resources are released, likely by calling cancel()
+ virtual ~PipelineExitBarrierI() {}
+};
+
+template <class T>
+class PipelineStageIT : public BlockerT<T> {
+ const core_id_t core = seastar::this_shard_id();
+public:
+ core_id_t get_core() const { return core; }
+
+ template <class... Args>
+ decltype(auto) enter(Args&&... args) {
+ return static_cast<T*>(this)->enter(std::forward<Args>(args)...);
+ }
+};
+
+class PipelineHandle {
+ PipelineExitBarrierI::Ref barrier;
+
+ std::optional<seastar::future<>> wait_barrier() {
+ return barrier ? barrier->wait() : std::nullopt;
+ }
+
+public:
+ PipelineHandle() = default;
+
+ PipelineHandle(const PipelineHandle&) = delete;
+ PipelineHandle(PipelineHandle&&) = default;
+ PipelineHandle &operator=(const PipelineHandle&) = delete;
+ PipelineHandle &operator=(PipelineHandle&&) = default;
+
+ /**
+ * Returns a future which unblocks when the handle has entered the passed
+ * OrderedPipelinePhase. If already in a phase, enter will also release
+ * that phase after placing itself in the queue for the next one to preserve
+ * ordering.
+ */
+ template <typename OpT, typename T>
+ seastar::future<>
+ enter(T &stage, typename T::BlockingEvent::template Trigger<OpT>&& t) {
+ ceph_assert(stage.get_core() == seastar::this_shard_id());
+ auto wait_fut = wait_barrier();
+ if (wait_fut.has_value()) {
+ return wait_fut.value().then([this, &stage, t=std::move(t)] () mutable {
+ auto fut = t.maybe_record_blocking(stage.enter(t), stage);
+ exit();
+ return std::move(fut).then(
+ [this, t=std::move(t)](auto &&barrier_ref) mutable {
+ barrier = std::move(barrier_ref);
+ return seastar::now();
+ });
+ });
+ } else {
+ auto fut = t.maybe_record_blocking(stage.enter(t), stage);
+ exit();
+ return std::move(fut).then(
+ [this, t=std::move(t)](auto &&barrier_ref) mutable {
+ barrier = std::move(barrier_ref);
+ return seastar::now();
+ });
+ }
+ }
+
+ /**
+ * Completes pending exit barrier without entering a new one.
+ */
+ seastar::future<> complete() {
+ auto ret = wait_barrier();
+ barrier.reset();
+ return ret ? std::move(ret.value()) : seastar::now();
+ }
+
+ /**
+   * Exits the current phase and skips the exit barrier; should only be used
+   * on op failure. Allowing the handle to be destructed has the same effect.
+ */
+ void exit() {
+ barrier.reset();
+ }
+
+};
+
+/**
+ * Ensures that at most one op may consider itself in the phase at a time.
+ * Ops will see enter() unblock in the order in which they tried to enter
+ * the phase. entering (though not necessarily waiting for the future to
+ * resolve) a new phase prior to exiting the previous one will ensure that
+ * the op ordering is preserved.
+ */
+template <class T>
+class OrderedExclusivePhaseT : public PipelineStageIT<T> {
+ void dump_detail(ceph::Formatter *f) const final {
+ f->dump_unsigned("waiting", waiting);
+ if (held_by != Operation::NULL_ID) {
+ f->dump_unsigned("held_by_operation_id", held_by);
+ }
+ }
+
+ class ExitBarrier final : public PipelineExitBarrierI {
+ OrderedExclusivePhaseT *phase;
+ Operation::id_t op_id;
+ public:
+ ExitBarrier(OrderedExclusivePhaseT *phase, Operation::id_t id)
+ : phase(phase), op_id(id) {}
+
+ std::optional<seastar::future<>> wait() final {
+ return std::nullopt;
+ }
+
+ void exit() final {
+ if (phase) {
+ auto *p = phase;
+ auto id = op_id;
+ phase = nullptr;
+ std::ignore = seastar::smp::submit_to(
+ p->get_core(),
+ [p, id] {
+ p->exit(id);
+ });
+ }
+ }
+
+ void cancel() final {
+ exit();
+ }
+
+ ~ExitBarrier() final {
+ cancel();
+ }
+ };
+
+ void exit(Operation::id_t op_id) {
+ clear_held_by(op_id);
+ mutex.unlock();
+ }
+
+public:
+ template <class TriggerT>
+ seastar::future<PipelineExitBarrierI::Ref> enter(TriggerT& t) {
+ waiting++;
+ return mutex.lock().then([this, op_id=t.get_op().get_id()] {
+ ceph_assert_always(waiting > 0);
+ --waiting;
+ set_held_by(op_id);
+ return PipelineExitBarrierI::Ref(new ExitBarrier{this, op_id});
+ });
+ }
+
+private:
+ void set_held_by(Operation::id_t id) {
+ ceph_assert_always(held_by == Operation::NULL_ID);
+ held_by = id;
+ }
+
+ void clear_held_by(Operation::id_t id) {
+ ceph_assert_always(held_by == id);
+ held_by = Operation::NULL_ID;
+ }
+
+ unsigned waiting = 0;
+ seastar::shared_mutex mutex;
+ Operation::id_t held_by = Operation::NULL_ID;
+};
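+
+// Putting the pieces together (illustrative only; ClientRequest and the
+// member names below are hypothetical):
+//
+//   struct ProcessStage : OrderedExclusivePhaseT<ProcessStage> {
+//     static constexpr auto type_name = "ClientRequest::ProcessStage";
+//   } process_stage;
+//
+//   // inside an op, with a PipelineHandle `handle` and a BlockingEvent
+//   // Trigger<ClientRequest> `trigger` for this stage:
+//   //   return handle.enter<ClientRequest>(process_stage, std::move(trigger));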
+
+/**
+ * Permits multiple ops to inhabit the stage concurrently, but ensures that
+ * they will proceed to the next stage in the order in which they called
+ * enter.
+ */
+template <class T>
+class OrderedConcurrentPhaseT : public PipelineStageIT<T> {
+ using base_t = PipelineStageIT<T>;
+public:
+ struct BlockingEvent : base_t::BlockingEvent {
+ using base_t::BlockingEvent::BlockingEvent;
+
+ struct ExitBarrierEvent : TimeEvent<ExitBarrierEvent> {};
+
+ template <class OpT>
+ struct Trigger : base_t::BlockingEvent::template Trigger<OpT> {
+ using base_t::BlockingEvent::template Trigger<OpT>::Trigger;
+
+ template <class FutureT>
+ decltype(auto) maybe_record_exit_barrier(FutureT&& fut) {
+ if (!fut.available()) {
+ exit_barrier_event.trigger(this->op);
+ }
+ return std::forward<FutureT>(fut);
+ }
+
+ ExitBarrierEvent exit_barrier_event;
+ };
+ };
+
+private:
+ void dump_detail(ceph::Formatter *f) const final {}
+
+ template <class TriggerT>
+ class ExitBarrier final : public PipelineExitBarrierI {
+ OrderedConcurrentPhaseT *phase;
+ std::optional<seastar::future<>> barrier;
+ TriggerT trigger;
+ public:
+ ExitBarrier(
+ OrderedConcurrentPhaseT *phase,
+ seastar::future<> &&barrier,
+ TriggerT& trigger) : phase(phase), barrier(std::move(barrier)), trigger(trigger) {}
+
+ std::optional<seastar::future<>> wait() final {
+ assert(phase);
+ assert(barrier);
+ auto ret = std::move(*barrier);
+ barrier = std::nullopt;
+ return trigger.maybe_record_exit_barrier(std::move(ret));
+ }
+
+ void exit() final {
+ if (barrier) {
+ static_cast<void>(
+ std::move(*barrier).then([phase=this->phase] { phase->mutex.unlock(); }));
+ barrier = std::nullopt;
+ phase = nullptr;
+ }
+ if (phase) {
+ std::ignore = seastar::smp::submit_to(
+ phase->get_core(),
+ [this] {
+ phase->mutex.unlock();
+ phase = nullptr;
+ });
+ }
+ }
+
+ void cancel() final {
+ exit();
+ }
+
+ ~ExitBarrier() final {
+ cancel();
+ }
+ };
+
+public:
+ template <class TriggerT>
+ seastar::future<PipelineExitBarrierI::Ref> enter(TriggerT& t) {
+ return seastar::make_ready_future<PipelineExitBarrierI::Ref>(
+ new ExitBarrier<TriggerT>{this, mutex.lock(), t});
+ }
+
+private:
+ seastar::shared_mutex mutex;
+};
+
+/**
+ * Imposes no ordering or exclusivity at all. Ops enter without constraint and
+ * may exit in any order. Useful mainly for informational purposes between
+ * stages with constraints.
+ */
+template <class T>
+class UnorderedStageT : public PipelineStageIT<T> {
+ void dump_detail(ceph::Formatter *f) const final {}
+
+ class ExitBarrier final : public PipelineExitBarrierI {
+ public:
+ ExitBarrier() = default;
+
+ std::optional<seastar::future<>> wait() final {
+ return std::nullopt;
+ }
+
+ void exit() final {}
+
+ void cancel() final {}
+
+ ~ExitBarrier() final {}
+ };
+
+public:
+ template <class... IgnoreArgs>
+ seastar::future<PipelineExitBarrierI::Ref> enter(IgnoreArgs&&...) {
+ return seastar::make_ready_future<PipelineExitBarrierI::Ref>(
+ new ExitBarrier);
+ }
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::Operation> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/common/perf_counters_collection.cc b/src/crimson/common/perf_counters_collection.cc
new file mode 100644
index 000000000..254d85278
--- /dev/null
+++ b/src/crimson/common/perf_counters_collection.cc
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "perf_counters_collection.h"
+
+namespace crimson::common {
+PerfCountersCollection::PerfCountersCollection()
+{
+ perf_collection = std::make_unique<PerfCountersCollectionImpl>();
+}
+PerfCountersCollection::~PerfCountersCollection()
+{
+ perf_collection->clear();
+}
+
+PerfCountersCollectionImpl* PerfCountersCollection:: get_perf_collection()
+{
+ return perf_collection.get();
+}
+
+void PerfCountersCollection::dump_formatted(ceph::Formatter *f, bool schema,
+ bool dump_labeled,
+ const std::string &logger,
+ const std::string &counter)
+{
+ perf_collection->dump_formatted(f, schema, dump_labeled, logger, counter);
+}
+
+PerfCountersCollection::ShardedPerfCountersCollection PerfCountersCollection::sharded_perf_coll;
+
+void PerfCountersDeleter::operator()(PerfCounters* p) noexcept
+{
+ if (cct) {
+ cct->get_perfcounters_collection()->remove(p);
+ }
+ delete p;
+}
+
+}
+
diff --git a/src/crimson/common/perf_counters_collection.h b/src/crimson/common/perf_counters_collection.h
new file mode 100644
index 000000000..ae0c8670c
--- /dev/null
+++ b/src/crimson/common/perf_counters_collection.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "common/perf_counters.h"
+#include "include/common_fwd.h"
+#include <seastar/core/sharded.hh>
+
+using crimson::common::PerfCountersCollectionImpl;
+namespace crimson::common {
+class PerfCountersCollection: public seastar::sharded<PerfCountersCollection>
+{
+ using ShardedPerfCountersCollection = seastar::sharded<PerfCountersCollection>;
+
+private:
+ std::unique_ptr<PerfCountersCollectionImpl> perf_collection;
+ static ShardedPerfCountersCollection sharded_perf_coll;
+ friend PerfCountersCollection& local_perf_coll();
+ friend ShardedPerfCountersCollection& sharded_perf_coll();
+
+public:
+ PerfCountersCollection();
+ ~PerfCountersCollection();
+ PerfCountersCollectionImpl* get_perf_collection();
+ void dump_formatted(ceph::Formatter *f, bool schema, bool dump_labeled,
+ const std::string &logger = "",
+ const std::string &counter = "");
+};
+
+inline PerfCountersCollection::ShardedPerfCountersCollection& sharded_perf_coll(){
+ return PerfCountersCollection::sharded_perf_coll;
+}
+
+inline PerfCountersCollection& local_perf_coll() {
+ return PerfCountersCollection::sharded_perf_coll.local();
+}
+
+class PerfCountersDeleter {
+ CephContext* cct;
+
+public:
+ PerfCountersDeleter() noexcept : cct(nullptr) {}
+ PerfCountersDeleter(CephContext* cct) noexcept : cct(cct) {}
+ void operator()(PerfCounters* p) noexcept;
+};
+}
+using PerfCountersRef = std::unique_ptr<crimson::common::PerfCounters, crimson::common::PerfCountersDeleter>;
+
diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h
new file mode 100644
index 000000000..186f02a61
--- /dev/null
+++ b/src/crimson/common/shared_lru.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <boost/smart_ptr/weak_ptr.hpp>
+#include "simple_lru.h"
+
+/// SharedLRU does its best to cache objects. It not only tracks the objects
+/// in its LRU cache with strong references, it also tracks objects with
+/// weak_ptr even if the cache does not hold any strong references to them,
+/// so that it can return the objects after they are evicted, as long as
+/// they've ever been cached and have not been destroyed yet.
+template<class K, class V>
+class SharedLRU {
+ using shared_ptr_t = boost::local_shared_ptr<V>;
+ using weak_ptr_t = boost::weak_ptr<V>;
+ using value_type = std::pair<K, shared_ptr_t>;
+
+  // weak_refs is already ordered, and we don't use accessors like
+  // SimpleLRU::lower_bound(), so an unordered SimpleLRU suffices here.
+ SimpleLRU<K, shared_ptr_t, false> cache;
+ std::map<K, std::pair<weak_ptr_t, V*>> weak_refs;
+
+ struct Deleter {
+ SharedLRU<K,V>* cache;
+ const K key;
+ void operator()(V* ptr) {
+ cache->_erase_weak(key);
+ delete ptr;
+ }
+ };
+ void _erase_weak(const K& key) {
+ weak_refs.erase(key);
+ }
+public:
+ SharedLRU(size_t max_size = 20)
+ : cache{max_size}
+ {}
+ ~SharedLRU() {
+ cache.clear();
+ // initially, we were assuming that no pointer obtained from SharedLRU
+ // can outlive the lru itself. However, since going with the interruption
+ // concept for handling shutdowns, this is no longer valid.
+ weak_refs.clear();
+ }
+ /**
+   * Returns a reference to the value for the given key, and performs an
+   * insertion if the key does not already exist
+ */
+ shared_ptr_t operator[](const K& key);
+ /**
+ * Returns true iff there are no live references left to anything that has been
+ * in the cache.
+ */
+ bool empty() const {
+ return weak_refs.empty();
+ }
+ size_t size() const {
+ return cache.size();
+ }
+ size_t capacity() const {
+ return cache.capacity();
+ }
+ /***
+ * Inserts a key if not present, or bumps it to the front of the LRU if
+ * it is, and then gives you a reference to the value. If the key already
+ * existed, you are responsible for deleting the new value you tried to
+ * insert.
+ *
+ * @param key The key to insert
+ * @param value The value that goes with the key
+ * @param existed Set to true if the value was already in the
+ * map, false otherwise
+ * @return A reference to the map's value for the given key
+ */
+ shared_ptr_t insert(const K& key, std::unique_ptr<V> value);
+ // clear all strong reference from the lru.
+ void clear() {
+ cache.clear();
+ }
+ shared_ptr_t find(const K& key);
+ // return the last element that is not greater than key
+ shared_ptr_t lower_bound(const K& key);
+ // return the first element that is greater than key
+ std::optional<value_type> upper_bound(const K& key);
+
+ void erase(const K& key) {
+ cache.erase(key);
+ _erase_weak(key);
+ }
+};
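+
+// Usage sketch (illustrative only):
+//
+//   SharedLRU<int, std::string> cache{2};
+//   auto a = cache.insert(1, std::make_unique<std::string>("one"));
+//   auto b = cache.find(1);       // same object as `a`
+//   cache.clear();                // drops the LRU's strong references
+//   auto c = cache.find(1);       // still found: `a`/`b` keep it alive via weak_refs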
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::insert(const K& key, std::unique_ptr<V> value)
+{
+ shared_ptr_t val;
+ if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+ val = found->second.first.lock();
+ }
+ if (!val) {
+ val.reset(value.release(), Deleter{this, key});
+ weak_refs.emplace(key, std::make_pair(val, val.get()));
+ }
+ cache.insert(key, val);
+ return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::operator[](const K& key)
+{
+ if (auto found = cache.find(key); found) {
+ return *found;
+ }
+ shared_ptr_t val;
+ if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+ val = found->second.first.lock();
+ }
+ if (!val) {
+ val.reset(new V{}, Deleter{this, key});
+ weak_refs.emplace(key, std::make_pair(val, val.get()));
+ }
+ cache.insert(key, val);
+ return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::find(const K& key)
+{
+ if (auto found = cache.find(key); found) {
+ return *found;
+ }
+ shared_ptr_t val;
+ if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+ val = found->second.first.lock();
+ }
+ if (val) {
+ cache.insert(key, val);
+ }
+ return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::lower_bound(const K& key)
+{
+ if (weak_refs.empty()) {
+ return {};
+ }
+ auto found = weak_refs.lower_bound(key);
+ if (found == weak_refs.end()) {
+ --found;
+ }
+ if (auto val = found->second.first.lock(); val) {
+ cache.insert(key, val);
+ return val;
+ } else {
+ return {};
+ }
+}
+
+template<class K, class V>
+std::optional<typename SharedLRU<K,V>::value_type>
+SharedLRU<K,V>::upper_bound(const K& key)
+{
+ for (auto found = weak_refs.upper_bound(key);
+ found != weak_refs.end();
+ ++found) {
+ if (auto val = found->second.first.lock(); val) {
+ return std::make_pair(found->first, val);
+ }
+ }
+ return std::nullopt;
+}
diff --git a/src/crimson/common/simple_lru.h b/src/crimson/common/simple_lru.h
new file mode 100644
index 000000000..1419c4885
--- /dev/null
+++ b/src/crimson/common/simple_lru.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <optional>
+#include <type_traits>
+#include <unordered_map>
+
+template <class Key, class Value, bool Ordered>
+class SimpleLRU {
+ static_assert(std::is_default_constructible_v<Value>);
+ using list_type = std::list<Key>;
+ template<class K, class V>
+ using map_t = std::conditional_t<Ordered,
+ std::map<K, V>,
+ std::unordered_map<K, V>>;
+ using map_type = map_t<Key, std::pair<Value, typename list_type::iterator>>;
+ list_type lru;
+ map_type cache;
+ const size_t max_size;
+
+public:
+ SimpleLRU(size_t size = 20)
+ : cache(size),
+ max_size(size)
+ {}
+ size_t size() const {
+ return cache.size();
+ }
+ size_t capacity() const {
+ return max_size;
+ }
+ using insert_return_type = std::pair<Value, bool>;
+ insert_return_type insert(const Key& key, Value value);
+ std::optional<Value> find(const Key& key);
+ std::optional<std::enable_if<Ordered, Value>> lower_bound(const Key& key);
+ void erase(const Key& key);
+ void clear();
+private:
+ // bump the item to the front of the lru list
+ Value _lru_add(typename map_type::iterator found);
+ // evict the last element of most recently used list
+ void _evict();
+};
+
+template <class Key, class Value, bool Ordered>
+typename SimpleLRU<Key,Value,Ordered>::insert_return_type
+SimpleLRU<Key,Value,Ordered>::insert(const Key& key, Value value)
+{
+ if constexpr(Ordered) {
+ auto found = cache.lower_bound(key);
+ if (found != cache.end() && found->first == key) {
+ // already exists
+ return {found->second.first, true};
+ } else {
+ if (size() >= capacity()) {
+ _evict();
+ }
+ lru.push_front(key);
+ // use lower_bound as hint to save the lookup
+ cache.emplace_hint(found, key, std::make_pair(value, lru.begin()));
+ return {std::move(value), false};
+ }
+ } else {
+ // cache is not ordered
+ auto found = cache.find(key);
+ if (found != cache.end()) {
+ // already exists
+ return {found->second.first, true};
+ } else {
+ if (size() >= capacity()) {
+ _evict();
+ }
+ lru.push_front(key);
+ cache.emplace(key, std::make_pair(value, lru.begin()));
+ return {std::move(value), false};
+ }
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+std::optional<Value> SimpleLRU<Key,Value,Ordered>::find(const Key& key)
+{
+ if (auto found = cache.find(key); found != cache.end()){
+ return _lru_add(found);
+ } else {
+ return {};
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+std::optional<std::enable_if<Ordered, Value>>
+SimpleLRU<Key,Value,Ordered>::lower_bound(const Key& key)
+{
+ if (auto found = cache.lower_bound(key); found != cache.end()) {
+ return _lru_add(found);
+ } else {
+ return {};
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::clear()
+{
+ lru.clear();
+ cache.clear();
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::erase(const Key& key)
+{
+ if (auto found = cache.find(key); found != cache.end()) {
+ lru.erase(found->second.second);
+ cache.erase(found);
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+Value SimpleLRU<Key,Value,Ordered>::_lru_add(
+ typename SimpleLRU<Key,Value,Ordered>::map_type::iterator found)
+{
+ auto& [value, in_lru] = found->second;
+ if (in_lru != lru.begin()){
+ // move item to the front
+ lru.splice(lru.begin(), lru, in_lru);
+ }
+ // the item is already at the front
+ return value;
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::_evict()
+{
+ // evict the last element of most recently used list
+ auto last = --lru.end();
+ cache.erase(*last);
+ lru.erase(last);
+}
diff --git a/src/crimson/common/smp_helpers.h b/src/crimson/common/smp_helpers.h
new file mode 100644
index 000000000..c2b7bd964
--- /dev/null
+++ b/src/crimson/common/smp_helpers.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <limits>
+
+#include <seastar/core/smp.hh>
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/utility.h"
+
+namespace crimson {
+
+using core_id_t = seastar::shard_id;
+static constexpr core_id_t NULL_CORE = std::numeric_limits<core_id_t>::max();
+
+auto submit_to(core_id_t core, auto &&f) {
+ using ret_type = decltype(f());
+ if constexpr (is_errorated_future_v<ret_type>) {
+ auto ret = seastar::smp::submit_to(
+ core,
+ [f=std::move(f)]() mutable {
+ return f().to_base();
+ });
+ return ret_type(std::move(ret));
+ } else {
+ return seastar::smp::submit_to(core, std::move(f));
+ }
+}
+
+template <typename Obj, typename Method, typename... Args>
+auto proxy_method_on_core(
+ core_id_t core, Obj &obj, Method method, Args&&... args) {
+ return crimson::submit_to(
+ core,
+ [&obj, method,
+ arg_tuple=std::make_tuple(std::forward<Args>(args)...)]() mutable {
+ return apply_method_to_tuple(obj, method, std::move(arg_tuple));
+ });
+}
+
+/**
+ * reactor_map_seq
+ *
+ * Invokes f on each reactor sequentially. The caller may assume that
+ * f will not be invoked concurrently on multiple cores.
+ */
+template <typename F>
+auto reactor_map_seq(F &&f) {
+ using ret_type = decltype(f());
+ if constexpr (is_errorated_future_v<ret_type>) {
+ auto ret = crimson::do_for_each(
+ seastar::smp::all_cpus().begin(),
+ seastar::smp::all_cpus().end(),
+ [f=std::move(f)](auto core) mutable {
+ return seastar::smp::submit_to(
+ core,
+ [&f] {
+ return std::invoke(f);
+ });
+ });
+ return ret_type(ret);
+ } else {
+ return seastar::do_for_each(
+ seastar::smp::all_cpus().begin(),
+ seastar::smp::all_cpus().end(),
+ [f=std::move(f)](auto core) mutable {
+ return seastar::smp::submit_to(
+ core,
+ [&f] {
+ return std::invoke(f);
+ });
+ });
+ }
+}
+
+/**
+ * sharded_map_seq
+ *
+ * Invokes f on each shard of t sequentially. Caller may assume that
+ * f will not be invoked concurrently on multiple cores.
+ */
+template <typename T, typename F>
+auto sharded_map_seq(T &t, F &&f) {
+ return reactor_map_seq(
+ [&t, f=std::forward<F>(f)]() mutable {
+ return std::invoke(f, t.local());
+ });
+}
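+
+// Usage sketch (illustrative only; OSDState and flush() are hypothetical):
+//
+//   seastar::future<> flush_all(seastar::sharded<OSDState>& services) {
+//     return sharded_map_seq(services, [](OSDState& local) {
+//       return local.flush();   // invoked on each shard, one shard at a time
+//     });
+//   }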
+
+}
diff --git a/src/crimson/common/throttle.cc b/src/crimson/common/throttle.cc
new file mode 100644
index 000000000..88d1859f3
--- /dev/null
+++ b/src/crimson/common/throttle.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "throttle.h"
+
+namespace crimson::common {
+
+int64_t Throttle::take(int64_t c)
+{
+ if (max == 0u) {
+ return 0;
+ }
+ count += c;
+ return count;
+}
+
+int64_t Throttle::put(int64_t c)
+{
+ if (max == 0u) {
+ return 0;
+ }
+ if (!c) {
+ return count;
+ }
+ on_free_slots.signal();
+ count -= c;
+ return count;
+}
+
+seastar::future<> Throttle::get(size_t c)
+{
+ if (max == 0u) {
+ return seastar::make_ready_future<>();
+ }
+ pending++;
+ return on_free_slots.wait([this, c] {
+ return !_should_wait(c);
+ }).then([this, c] {
+ pending--;
+ count += c;
+ return seastar::make_ready_future<>();
+ });
+}
+
+void Throttle::reset_max(size_t m) {
+ if (max == m) {
+ return;
+ }
+
+ if (m > max) {
+ on_free_slots.signal();
+ }
+ max = m;
+}
+
+bool Throttle::_should_wait(size_t c) const {
+ if (!max) {
+ return false;
+ }
+ return ((c <= max && count + c > max) || // normally stay under max
+ (c >= max && count > max)); // except for large c
+}
+
+} // namespace crimson::common
diff --git a/src/crimson/common/throttle.h b/src/crimson/common/throttle.h
new file mode 100644
index 000000000..2998cb5f8
--- /dev/null
+++ b/src/crimson/common/throttle.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/condition-variable.hh>
+// pull in the seastar::timer<...>::timer definitions. FIX SEASTAR: or is
+// reactor.hh obligatory and should it be included everywhere?
+#include <seastar/core/reactor.hh>
+
+#include "common/ThrottleInterface.h"
+
+namespace crimson::common {
+
+class Throttle final : public ThrottleInterface {
+ size_t max = 0;
+ size_t count = 0;
+ size_t pending = 0;
+ // we cannot change the "count" of seastar::semaphore after it is created,
+ // so use condition_variable instead.
+ seastar::condition_variable on_free_slots;
+public:
+ explicit Throttle(size_t m)
+ : max(m)
+ {}
+ int64_t take(int64_t c = 1) override;
+ int64_t put(int64_t c = 1) override;
+ seastar::future<> get(size_t c);
+ size_t get_current() const {
+ return count;
+ }
+ size_t get_max() const {
+ return max;
+ }
+ size_t get_pending() const {
+ return pending;
+ }
+ void reset_max(size_t m);
+private:
+ bool _should_wait(size_t c) const;
+};
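+
+// Usage sketch (illustrative only; do_io() is hypothetical):
+//
+//   crimson::common::Throttle throttle{64};
+//   seastar::future<> submit_io() {
+//     return throttle.get(1).then([&] {
+//       return do_io().finally([&] { throttle.put(1); });
+//     });
+//   }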
+
+} // namespace crimson::common
diff --git a/src/crimson/common/tmap_helpers.cc b/src/crimson/common/tmap_helpers.cc
new file mode 100644
index 000000000..9c14ebc45
--- /dev/null
+++ b/src/crimson/common/tmap_helpers.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/tmap_helpers.h"
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/rados.h"
+
+namespace detail {
+
+#define decode_or_return(v, bp) \
+ try { \
+ ::decode(v, bp); \
+ } catch (...) { \
+ return -EINVAL; \
+ }
+
+class TMapContents {
+ std::map<std::string, bufferlist> keys;
+ bufferlist header;
+public:
+ TMapContents() = default;
+
+ int decode(bufferlist::const_iterator &bliter) {
+ keys.clear();
+ header.clear();
+ if (bliter.end()) {
+ return 0;
+ }
+ decode_or_return(header, bliter);
+ __u32 num_keys;
+ decode_or_return(num_keys, bliter);
+ for (; num_keys > 0; --num_keys) {
+ std::string key;
+ decode_or_return(key, bliter);
+ decode_or_return(keys[key], bliter);
+ }
+ return 0;
+ }
+
+ bufferlist encode() {
+ bufferlist bl;
+ ::encode(header, bl);
+ ::encode(static_cast<__u32>(keys.size()), bl);
+ for (auto &[k, v]: keys) {
+ ::encode(k, bl);
+ ::encode(v, bl);
+ }
+ return bl;
+ }
+
+ int update(bufferlist::const_iterator in) {
+ while (!in.end()) {
+ __u8 op;
+ decode_or_return(op, in);
+
+ if (op == CEPH_OSD_TMAP_HDR) {
+ decode_or_return(header, in);
+ continue;
+ }
+
+ std::string key;
+ decode_or_return(key, in);
+
+ switch (op) {
+ case CEPH_OSD_TMAP_SET: {
+ decode_or_return(keys[key], in);
+ break;
+ }
+ case CEPH_OSD_TMAP_CREATE: {
+ if (keys.contains(key)) {
+ return -EEXIST;
+ }
+ decode_or_return(keys[key], in);
+ break;
+ }
+ case CEPH_OSD_TMAP_RM: {
+ auto kiter = keys.find(key);
+ if (kiter == keys.end()) {
+ return -ENOENT;
+ }
+ keys.erase(kiter);
+ break;
+ }
+ case CEPH_OSD_TMAP_RMSLOPPY: {
+ keys.erase(key);
+ break;
+ }
+ }
+ }
+ return 0;
+ }
+
+ int put(bufferlist::const_iterator in) {
+ return 0;
+ }
+};
+
+}
+
+namespace crimson::common {
+
+using do_tmap_up_ret = tl::expected<bufferlist, int>;
+do_tmap_up_ret do_tmap_up(bufferlist::const_iterator in, bufferlist contents)
+{
+ detail::TMapContents tmap;
+ auto bliter = contents.cbegin();
+ int r = tmap.decode(bliter);
+ if (r < 0) {
+ return tl::unexpected(r);
+ }
+ r = tmap.update(in);
+ if (r < 0) {
+ return tl::unexpected(r);
+ }
+ return tmap.encode();
+}
+
+using do_tmap_up_ret = tl::expected<bufferlist, int>;
+do_tmap_up_ret do_tmap_put(bufferlist::const_iterator in)
+{
+ detail::TMapContents tmap;
+ int r = tmap.decode(in);
+ if (r < 0) {
+ return tl::unexpected(r);
+ }
+ return tmap.encode();
+}
+
+}
diff --git a/src/crimson/common/tmap_helpers.h b/src/crimson/common/tmap_helpers.h
new file mode 100644
index 000000000..446dbea2a
--- /dev/null
+++ b/src/crimson/common/tmap_helpers.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/expected.hpp"
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+
+namespace crimson::common {
+
+/**
+ * do_tmap_up
+ *
+ * Performs tmap update instructions encoded in buffer referenced by in.
+ *
+ * @param [in] in iterator to buffer containing encoded tmap update operations
+ * @param [in] contents current contents of object
+ * @return buffer containing new object contents,
+ * -EINVAL for decoding errors,
+ * -EEXIST for CEPH_OSD_TMAP_CREATE on a key that exists
+ * -ENOENT for CEPH_OSD_TMAP_RM on a key that does not exist
+ */
+using do_tmap_up_ret = tl::expected<bufferlist, int>;
+do_tmap_up_ret do_tmap_up(bufferlist::const_iterator in, bufferlist contents);
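+
+// Caller-side sketch (illustrative only; update_bl, obj_bl and write_object
+// are hypothetical): the tl::expected return carries either the new object
+// contents or a negative errno.
+//
+//   auto maybe_bl = do_tmap_up(update_bl.cbegin(), std::move(obj_bl));
+//   if (maybe_bl.has_value()) {
+//     write_object(*maybe_bl);
+//   } else {
+//     return maybe_bl.error();   // -EINVAL, -EEXIST or -ENOENT
+//   }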
+
+/**
+ * do_tmap_put
+ *
+ * Validates passed buffer pointed to by in and returns resulting object buffer.
+ *
+ * @param [in] in iterator to buffer containing tmap encoding
+ * @return buffer containing validated tmap encoded by in
+ * -EINVAL for decoding errors,
+ */
+using do_tmap_up_ret = tl::expected<bufferlist, int>;
+do_tmap_up_ret do_tmap_put(bufferlist::const_iterator in);
+
+}
diff --git a/src/crimson/common/tri_mutex.cc b/src/crimson/common/tri_mutex.cc
new file mode 100644
index 000000000..e4b181280
--- /dev/null
+++ b/src/crimson/common/tri_mutex.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tri_mutex.h"
+
+seastar::future<> read_lock::lock()
+{
+ return static_cast<tri_mutex*>(this)->lock_for_read();
+}
+
+void read_lock::unlock()
+{
+ static_cast<tri_mutex*>(this)->unlock_for_read();
+}
+
+seastar::future<> write_lock::lock()
+{
+ return static_cast<tri_mutex*>(this)->lock_for_write(false);
+}
+
+void write_lock::unlock()
+{
+ static_cast<tri_mutex*>(this)->unlock_for_write();
+}
+
+seastar::future<> excl_lock::lock()
+{
+ return static_cast<tri_mutex*>(this)->lock_for_excl();
+}
+
+void excl_lock::unlock()
+{
+ static_cast<tri_mutex*>(this)->unlock_for_excl();
+}
+
+seastar::future<> excl_lock_from_read::lock()
+{
+ static_cast<tri_mutex*>(this)->promote_from_read();
+ return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_read::unlock()
+{
+ static_cast<tri_mutex*>(this)->demote_to_read();
+}
+
+seastar::future<> excl_lock_from_write::lock()
+{
+ static_cast<tri_mutex*>(this)->promote_from_write();
+ return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_write::unlock()
+{
+ static_cast<tri_mutex*>(this)->demote_to_write();
+}
+
+seastar::future<> excl_lock_from_excl::lock()
+{
+ return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_excl::unlock()
+{
+}
+
+tri_mutex::~tri_mutex()
+{
+ assert(!is_acquired());
+}
+
+seastar::future<> tri_mutex::lock_for_read()
+{
+ if (try_lock_for_read()) {
+ return seastar::make_ready_future<>();
+ }
+ waiters.emplace_back(seastar::promise<>(), type_t::read);
+ return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_read() noexcept
+{
+ if (!writers && !exclusively_used && waiters.empty()) {
+ ++readers;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void tri_mutex::unlock_for_read()
+{
+ assert(readers > 0);
+ if (--readers == 0) {
+ wake();
+ }
+}
+
+void tri_mutex::promote_from_read()
+{
+ assert(readers == 1);
+ --readers;
+ exclusively_used = true;
+}
+
+void tri_mutex::demote_to_read()
+{
+ assert(exclusively_used);
+ exclusively_used = false;
+ ++readers;
+}
+
+seastar::future<> tri_mutex::lock_for_write(bool greedy)
+{
+ if (try_lock_for_write(greedy)) {
+ return seastar::make_ready_future<>();
+ }
+ waiters.emplace_back(seastar::promise<>(), type_t::write);
+ return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_write(bool greedy) noexcept
+{
+ if (!readers && !exclusively_used) {
+ if (greedy || waiters.empty()) {
+ ++writers;
+ return true;
+ }
+ }
+ return false;
+}
+
+void tri_mutex::unlock_for_write()
+{
+ assert(writers > 0);
+ if (--writers == 0) {
+ wake();
+ }
+}
+
+void tri_mutex::promote_from_write()
+{
+ assert(writers == 1);
+ --writers;
+ exclusively_used = true;
+}
+
+void tri_mutex::demote_to_write()
+{
+ assert(exclusively_used);
+ exclusively_used = false;
+ ++writers;
+}
+
+// for exclusive users
+seastar::future<> tri_mutex::lock_for_excl()
+{
+ if (try_lock_for_excl()) {
+ return seastar::make_ready_future<>();
+ }
+ waiters.emplace_back(seastar::promise<>(), type_t::exclusive);
+ return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_excl() noexcept
+{
+ if (readers == 0u && writers == 0u && !exclusively_used) {
+ exclusively_used = true;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void tri_mutex::unlock_for_excl()
+{
+ assert(exclusively_used);
+ exclusively_used = false;
+ wake();
+}
+
+bool tri_mutex::is_acquired() const
+{
+ if (readers != 0u) {
+ return true;
+ } else if (writers != 0u) {
+ return true;
+ } else if (exclusively_used) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void tri_mutex::wake()
+{
+ assert(!readers && !writers && !exclusively_used);
+ type_t type = type_t::none;
+ while (!waiters.empty()) {
+ auto& waiter = waiters.front();
+ if (type == type_t::exclusive) {
+ break;
+ } if (type == type_t::none) {
+ type = waiter.type;
+ } else if (type != waiter.type) {
+ // to be woken in the next batch
+ break;
+ }
+ switch (type) {
+ case type_t::read:
+ ++readers;
+ break;
+ case type_t::write:
+ ++writers;
+ break;
+ case type_t::exclusive:
+ exclusively_used = true;
+ break;
+ default:
+ assert(0);
+ }
+ waiter.pr.set_value();
+ waiters.pop_front();
+ }
+}
diff --git a/src/crimson/common/tri_mutex.h b/src/crimson/common/tri_mutex.h
new file mode 100644
index 000000000..0533f3539
--- /dev/null
+++ b/src/crimson/common/tri_mutex.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/circular_buffer.hh>
+
+class read_lock {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+class write_lock {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+class excl_lock {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+// promote from read to excl
+class excl_lock_from_read {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+// promote from write to excl
+class excl_lock_from_write {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+// promote from excl to excl
+class excl_lock_from_excl {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+/// shared/exclusive mutual exclusion
+///
+/// this lock design's use of "reader" and "writer" is entirely and completely
+/// independent of the conventional reader/writer lock usage. Here, what we
+/// mean is that we can pipeline reads, and we can pipeline writes, but we
+/// cannot allow a read while writes are in progress or a write while reads are
+/// in progress. Any rmw operation is therefore exclusive.
+///
+/// tri_mutex is based on seastar::shared_mutex, but instead of two kinds of
+/// waiters, tri_mutex keeps track of three kinds of lock users:
+/// - readers
+/// - writers
+/// - exclusive users
+class tri_mutex : private read_lock,
+ write_lock,
+ excl_lock,
+ excl_lock_from_read,
+ excl_lock_from_write,
+ excl_lock_from_excl
+{
+public:
+ tri_mutex() = default;
+ ~tri_mutex();
+
+ read_lock& for_read() {
+ return *this;
+ }
+ write_lock& for_write() {
+ return *this;
+ }
+ excl_lock& for_excl() {
+ return *this;
+ }
+ excl_lock_from_read& excl_from_read() {
+ return *this;
+ }
+ excl_lock_from_write& excl_from_write() {
+ return *this;
+ }
+ excl_lock_from_excl& excl_from_excl() {
+ return *this;
+ }
+
+ // for shared readers
+ seastar::future<> lock_for_read();
+ bool try_lock_for_read() noexcept;
+ void unlock_for_read();
+ void promote_from_read();
+ void demote_to_read();
+ unsigned get_readers() const {
+ return readers;
+ }
+
+ // for shared writers
+ seastar::future<> lock_for_write(bool greedy);
+ bool try_lock_for_write(bool greedy) noexcept;
+ void unlock_for_write();
+ void promote_from_write();
+ void demote_to_write();
+ unsigned get_writers() const {
+ return writers;
+ }
+
+ // for exclusive users
+ seastar::future<> lock_for_excl();
+ bool try_lock_for_excl() noexcept;
+ void unlock_for_excl();
+ bool is_excl_acquired() const {
+ return exclusively_used;
+ }
+
+ bool is_acquired() const;
+
+  /// pass the provided exception to any pending waiters
+ template<typename Exception>
+ void abort(Exception ex) {
+ while (!waiters.empty()) {
+ auto& waiter = waiters.front();
+ waiter.pr.set_exception(std::make_exception_ptr(ex));
+ waiters.pop_front();
+ }
+ }
+
+private:
+ void wake();
+ unsigned readers = 0;
+ unsigned writers = 0;
+ bool exclusively_used = false;
+ enum class type_t : uint8_t {
+ read,
+ write,
+ exclusive,
+ none,
+ };
+ struct waiter_t {
+ waiter_t(seastar::promise<>&& pr, type_t type)
+ : pr(std::move(pr)), type(type)
+ {}
+ seastar::promise<> pr;
+ type_t type;
+ };
+ seastar::circular_buffer<waiter_t> waiters;
+ friend class read_lock;
+ friend class write_lock;
+ friend class excl_lock;
+ friend class excl_lock_from_read;
+ friend class excl_lock_from_write;
+ friend class excl_lock_from_excl;
+};
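+
+// Usage sketch (illustrative only; error handling omitted):
+//
+//   tri_mutex lock;
+//   seastar::future<> read_something() {
+//     return lock.lock_for_read().then([&lock] {
+//       // ... access the protected state ...
+//       lock.unlock_for_read();
+//     });
+//   }
+//   // for_read()/for_write()/for_excl() expose the same functionality
+//   // through generic lock()/unlock() views.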
diff --git a/src/crimson/common/type_helpers.h b/src/crimson/common/type_helpers.h
new file mode 100644
index 000000000..4c606581f
--- /dev/null
+++ b/src/crimson/common/type_helpers.h
@@ -0,0 +1,8 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "boost/intrusive_ptr.hpp"
+
+template<typename T> using Ref = boost::intrusive_ptr<T>;
diff --git a/src/crimson/common/utility.h b/src/crimson/common/utility.h
new file mode 100644
index 000000000..86b308155
--- /dev/null
+++ b/src/crimson/common/utility.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <type_traits>
+
+namespace _impl {
+ template <class T> struct always_false : std::false_type {};
+};
+
+template <class T>
+void assert_moveable(T& t) {
+ // non-const lvalue: moving out is allowed
+}
+template <class T>
+void assert_moveable(const T& t) {
+ static_assert(_impl::always_false<T>::value, "unable to move-out from T");
+}
+
+namespace internal {
+
+template <typename Obj, typename Method, typename ArgTuple, size_t... I>
+static auto _apply_method_to_tuple(
+ Obj &obj, Method method, ArgTuple &&tuple,
+ std::index_sequence<I...>) {
+ return (obj.*method)(std::get<I>(std::forward<ArgTuple>(tuple))...);
+}
+
+}
+
+template <typename Obj, typename Method, typename ArgTuple>
+auto apply_method_to_tuple(Obj &obj, Method method, ArgTuple &&tuple) {
+ constexpr auto tuple_size = std::tuple_size_v<ArgTuple>;
+ return internal::_apply_method_to_tuple(
+ obj, method, std::forward<ArgTuple>(tuple),
+ std::make_index_sequence<tuple_size>());
+}
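As an illustrative sketch (not part of the patch, assuming <tuple> is included), `apply_method_to_tuple` invokes a member function with arguments unpacked from a tuple; `Adder` is a hypothetical type used only for the example:

    struct Adder {
      int add(int a, int b) { return a + b; }
    };

    Adder adder;
    // the tuple must be passed as an rvalue so that ArgTuple deduces to a
    // non-reference type for std::tuple_size_v
    int sum = apply_method_to_tuple(adder, &Adder::add, std::make_tuple(1, 2));
    // sum == 3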
diff --git a/src/crimson/crush/CrushLocation.cc b/src/crimson/crush/CrushLocation.cc
new file mode 100644
index 000000000..d45264000
--- /dev/null
+++ b/src/crimson/crush/CrushLocation.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "CrushLocation.h"
+
+#include <vector>
+#include <boost/algorithm/string/trim.hpp>
+#include <seastar/util/process.hh>
+#include <seastar/util/later.hh>
+
+#include "crush/CrushWrapper.h"
+#include "crimson/common/log.h"
+#include "crimson/common/config_proxy.h"
+
+static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_crush);
+}
+
+using namespace crimson::common;
+
+namespace crimson::crush {
+
+seastar::future<> CrushLocation::update_from_conf()
+{
+ auto crush_location = local_conf().get_val<std::string>("crush_location");
+ if (crush_location.length()) {
+ _parse(crush_location);
+ }
+
+ return seastar::now();
+}
+
+void CrushLocation::_parse(const std::string& s)
+{
+ std::multimap<std::string, std::string> new_crush_location;
+ std::vector<std::string> lvec;
+ get_str_vec(s, ";, \t", lvec);
+ int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location);
+ if (r < 0) {
+ logger().error("CrushWrapper::parse_loc_multimap error, keeping original\
+ crush_location {}", *this);
+ return;
+ }
+
+ loc.swap(new_crush_location);
+ logger().info("{}: crush_location is {}", __func__, *this);
+ return;
+}
+
+seastar::future<> CrushLocation::update_from_hook()
+{
+ auto crush_location_hook = local_conf().get_val<std::string>("crush_location_hook");
+ if (crush_location_hook.length() == 0)
+ return seastar::now();
+
+ return seastar::file_exists(
+ crush_location_hook
+ ).then([this] (bool result) {
+ if (!result) {
+ std::stringstream errmsg;
+ errmsg << "the user define crush location hook: "
+ << local_conf().get_val<std::string>("crush_location_hook")
+ << " is not exists.";
+ logger().error("{}", errmsg.str());
+ throw std::runtime_error(errmsg.str());
+ }
+
+ return seastar::file_accessible(
+ local_conf().get_val<std::string>("crush_location_hook"),
+ seastar::access_flags::execute
+ ).then([this] (bool result) {
+ if (!result) {
+ std::stringstream errmsg;
+ errmsg << "the user define crush location hook: "
+ << local_conf().get_val<std::string>("crush_location_hook")
+ << " is not executable.";
+ logger().error("{}", errmsg.str());
+ throw std::runtime_error(errmsg.str());
+ }
+
+ seastar::experimental::spawn_parameters params = {
+ .argv = {
+ local_conf().get_val<std::string>("crush_location_hook"),
+ "--cluster",
+ local_conf()->cluster,
+ "--id",
+ local_conf()->name.get_id(),
+ "--type",
+ local_conf()->name.get_type_str()
+ }
+ };
+ return seastar::experimental::spawn_process(
+ local_conf().get_val<std::string>("crush_location_hook"),
+ std::move(params)
+ ).then([this] (auto process) {
+ auto stdout = process.stdout();
+ return do_with(
+ std::move(process),
+ std::move(stdout),
+ [this](auto& process, auto& stdout)
+ {
+ return stdout.read().then([] (seastar::temporary_buffer<char> buf) {
+ auto out = std::string(buf.get(), buf.size());
+ boost::algorithm::trim_if(out, boost::algorithm::is_any_of(" \n\r\t"));
+ return seastar::make_ready_future<std::string>(std::move(out));
+ }).then([&process, this] (auto out) {
+ return process.wait(
+ ).then([out = std::move(out), this] (auto wstatus) {
+ auto* exit_signal = std::get_if<seastar::experimental::process::wait_signaled>(&wstatus);
+ if (exit_signal != nullptr) {
+ std::stringstream errmsg;
+ errmsg << "the user define crush location hook: "
+ << local_conf().get_val<std::string>("crush_location_hook")
+ << " terminated, terminated signal is "
+ << exit_signal->terminating_signal;
+ logger().error("{}", errmsg.str());
+ throw std::runtime_error(errmsg.str());
+ }
+
+ auto* exit_status = std::get_if<seastar::experimental::process::wait_exited>(&wstatus);
+ if (exit_status->exit_code != 0) {
+ std::stringstream errmsg;
+ errmsg << "the user define crush location hook: "
+ << local_conf().get_val<std::string>("crush_location_hook")
+ << " execute failed, exit_code is " << exit_status->exit_code;
+ logger().error("{}", errmsg.str());
+ throw std::runtime_error(errmsg.str());
+ } else {
+ _parse(out);
+ }
+ return seastar::now();
+ });
+ });
+ });
+ });
+ });
+ });
+}
+
+seastar::future<> CrushLocation::init_on_startup()
+{
+ if (local_conf().get_val<std::string>("crush_location").length()) {
+ return update_from_conf();
+ }
+ if (local_conf().get_val<std::string>("crush_location_hook").length()) {
+ return update_from_hook();
+ }
+
+ // start with a sane default
+ char hostname[HOST_NAME_MAX + 1];
+ int r = gethostname(hostname, sizeof(hostname));
+ if (r < 0)
+ strcpy(hostname, "unknown_host");
+ // use short hostname
+ for (unsigned i=0; hostname[i]; ++i) {
+ if (hostname[i] == '.') {
+ hostname[i] = '\0';
+ break;
+ }
+ }
+
+ loc.clear();
+ loc.insert(std::make_pair<std::string, std::string>("host", hostname));
+ loc.insert(std::make_pair<std::string, std::string>("root", "default"));
+ return seastar::now();
+}
+
+std::multimap<std::string,std::string> CrushLocation::get_location() const
+{
+ return loc;
+}
+
+std::ostream& operator<<(std::ostream& os, const CrushLocation& loc)
+{
+ bool first = true;
+ for (auto& [type, pos] : loc.get_location()) {
+ if (first) {
+ first = false;
+ } else {
+ os << ", ";
+ }
+ os << '"' << type << '=' << pos << '"';
+ }
+ return os;
+}
+
+}
diff --git a/src/crimson/crush/CrushLocation.h b/src/crimson/crush/CrushLocation.h
new file mode 100644
index 000000000..9dc954672
--- /dev/null
+++ b/src/crimson/crush/CrushLocation.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <iosfwd>
+#include <map>
+#include <string>
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+
+#include <seastar/core/seastar.hh>
+
+namespace crimson::crush {
+
+class CrushLocation {
+public:
+ explicit CrushLocation() {
+ }
+
+ seastar::future<> update_from_conf(); ///< refresh from config
+ seastar::future<> init_on_startup();
+ seastar::future<> update_from_hook(); ///< call hook, if present
+
+ std::multimap<std::string, std::string> get_location() const;
+
+private:
+ void _parse(const std::string& s);
+ std::multimap<std::string, std::string> loc;
+};
+
+std::ostream& operator<<(std::ostream& os, const CrushLocation& loc);
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::crush::CrushLocation> : fmt::ostream_formatter {};
+#endif
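As an illustrative sketch (not part of the patch), the crush_location option parsed above is a list of type=value pairs separated by spaces, tabs, commas or semicolons, for example "root=default rack=r42 host=node3"; the exact bucket types depend on the CRUSH map. Resolving and inspecting the location could look like this (must run inside the seastar reactor):

    // falls back to {host=<short hostname>, root=default} when neither the
    // crush_location option nor the hook is configured
    seastar::future<> log_crush_location() {
      return seastar::do_with(crimson::crush::CrushLocation{}, [](auto& loc) {
        return loc.init_on_startup().then([&loc] {
          for (const auto& [type, pos] : loc.get_location()) {
            fmt::print("crush location: {}={}\n", type, pos);
          }
        });
      });
    }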
diff --git a/src/crimson/mgr/client.cc b/src/crimson/mgr/client.cc
new file mode 100644
index 000000000..169915c9e
--- /dev/null
+++ b/src/crimson/mgr/client.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "client.h"
+
+#include <seastar/core/sleep.hh>
+
+#include "crimson/common/log.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "messages/MMgrConfigure.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrOpen.h"
+
+namespace {
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_mgrc);
+ }
+}
+
+using crimson::common::local_conf;
+
+namespace crimson::mgr
+{
+
+Client::Client(crimson::net::Messenger& msgr,
+ WithStats& with_stats)
+ : msgr{msgr},
+ with_stats{with_stats},
+ report_timer{[this] {report();}}
+{}
+
+seastar::future<> Client::start()
+{
+ return seastar::now();
+}
+
+seastar::future<> Client::stop()
+{
+ logger().info("{}", __func__);
+ report_timer.cancel();
+ auto fut = gate.close();
+ if (conn) {
+ conn->mark_down();
+ }
+ return fut;
+}
+
+std::optional<seastar::future<>>
+Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ switch(m->get_type()) {
+ case MSG_MGR_MAP:
+ return handle_mgr_map(conn, boost::static_pointer_cast<MMgrMap>(m));
+ case MSG_MGR_CONFIGURE:
+ return handle_mgr_conf(conn, boost::static_pointer_cast<MMgrConfigure>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Client::ms_handle_connect(
+ crimson::net::ConnectionRef c,
+ seastar::shard_id prv_shard)
+{
+ ceph_assert_always(prv_shard == seastar::this_shard_id());
+ gate.dispatch_in_background(__func__, *this, [this, c] {
+ if (conn == c) {
+ // ask for the mgrconfigure message
+ auto m = crimson::make_message<MMgrOpen>();
+ m->daemon_name = local_conf()->name.get_id();
+ local_conf().get_config_bl(0, &m->config_bl, &last_config_bl_version);
+ local_conf().get_defaults_bl(&m->config_defaults_bl);
+ return conn->send(std::move(m));
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+void Client::ms_handle_reset(crimson::net::ConnectionRef c, bool /* is_replace */)
+{
+ gate.dispatch_in_background(__func__, *this, [this, c] {
+ if (conn == c) {
+ report_timer.cancel();
+ return reconnect();
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<> Client::reconnect()
+{
+ if (conn) {
+ conn->mark_down();
+ conn = {};
+ }
+ if (!mgrmap.get_available()) {
+ logger().warn("No active mgr available yet");
+ return seastar::now();
+ }
+ auto retry_interval = std::chrono::duration<double>(
+ local_conf().get_val<double>("mgr_connect_retry_interval"));
+ auto a_while = std::chrono::duration_cast<seastar::steady_clock_type::duration>(
+ retry_interval);
+ return seastar::sleep(a_while).then([this] {
+ auto peer = mgrmap.get_active_addrs().pick_addr(msgr.get_myaddr().get_type());
+ if (peer == entity_addr_t{}) {
+ // crimson msgr only uses the first bound addr
+ logger().error("mgr.{} does not have an addr compatible with me",
+ mgrmap.get_active_name());
+ return;
+ }
+ conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MGR);
+ });
+}
+
+seastar::future<> Client::handle_mgr_map(crimson::net::ConnectionRef,
+ Ref<MMgrMap> m)
+{
+ mgrmap = m->get_map();
+ if (!conn) {
+ return reconnect();
+ } else if (conn->get_peer_addr() !=
+ mgrmap.get_active_addrs().legacy_addr()) {
+ return reconnect();
+ } else {
+ return seastar::now();
+ }
+}
+
+seastar::future<> Client::handle_mgr_conf(crimson::net::ConnectionRef,
+ Ref<MMgrConfigure> m)
+{
+ logger().info("{} {}", __func__, *m);
+
+ auto report_period = std::chrono::seconds{m->stats_period};
+ if (report_period.count()) {
+ if (report_timer.armed()) {
+ report_timer.rearm(report_timer.get_timeout(), report_period);
+ } else {
+ report_timer.arm_periodic(report_period);
+ }
+ } else {
+ report_timer.cancel();
+ }
+ return seastar::now();
+}
+
+void Client::report()
+{
+ gate.dispatch_in_background(__func__, *this, [this] {
+ if (!conn) {
+ logger().warn("report: no conn available; raport skipped");
+ return seastar::now();
+ }
+ return with_stats.get_stats(
+ ).then([this](auto &&pg_stats) {
+ return conn->send(std::move(pg_stats));
+ });
+ });
+}
+
+void Client::print(std::ostream& out) const
+{
+ out << "mgrc ";
+}
+
+}
diff --git a/src/crimson/mgr/client.h b/src/crimson/mgr/client.h
new file mode 100644
index 000000000..501949768
--- /dev/null
+++ b/src/crimson/mgr/client.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/timer.hh>
+
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+#include "mon/MgrMap.h"
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+namespace crimson::net {
+ class Messenger;
+}
+
+class MMgrMap;
+class MMgrConfigure;
+
+namespace crimson::mgr
+{
+
+// implement WithStats if you want to report stats to mgr periodically
+class WithStats {
+public:
+ virtual seastar::future<MessageURef> get_stats() const = 0;
+ virtual ~WithStats() {}
+};
+
+class Client : public crimson::net::Dispatcher {
+public:
+ Client(crimson::net::Messenger& msgr,
+ WithStats& with_stats);
+ seastar::future<> start();
+ seastar::future<> stop();
+ void report();
+
+private:
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef conn, Ref<Message> m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final;
+ void ms_handle_connect(crimson::net::ConnectionRef conn, seastar::shard_id) final;
+ seastar::future<> handle_mgr_map(crimson::net::ConnectionRef conn,
+ Ref<MMgrMap> m);
+ seastar::future<> handle_mgr_conf(crimson::net::ConnectionRef conn,
+ Ref<MMgrConfigure> m);
+ seastar::future<> reconnect();
+
+ void print(std::ostream&) const;
+ friend std::ostream& operator<<(std::ostream& out, const Client& client);
+private:
+ MgrMap mgrmap;
+ crimson::net::Messenger& msgr;
+ WithStats& with_stats;
+ crimson::net::ConnectionRef conn;
+ seastar::timer<seastar::lowres_clock> report_timer;
+ crimson::common::Gated gate;
+ uint64_t last_config_bl_version = 0;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Client& client) {
+ client.print(out);
+ return out;
+}
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::mgr::Client> : fmt::ostream_formatter {};
+#endif
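As an illustrative sketch (not part of the patch), a daemon wires a stats provider and the mgr client together roughly as follows; `OSDStats`, the MPGStats payload, and `osd_messenger` are assumptions of the example, not names from this change:

    class OSDStats final : public crimson::mgr::WithStats {
      seastar::future<MessageURef> get_stats() const override {
        auto m = crimson::make_message<MPGStats>();
        // ... fill in per-PG and per-OSD statistics ...
        return seastar::make_ready_future<MessageURef>(std::move(m));
      }
    };

    OSDStats stats;
    crimson::mgr::Client mgrc{osd_messenger, stats};
    // the client must also be registered as a dispatcher with the messenger
    // so that MMgrMap/MMgrConfigure reach ms_dispatch(); the periodic
    // report() timer is only armed once MMgrConfigure arrives
    std::ignore = mgrc.start();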
diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc
new file mode 100644
index 000000000..7be09915a
--- /dev/null
+++ b/src/crimson/mon/MonClient.cc
@@ -0,0 +1,1162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MonClient.h"
+
+#include <random>
+#include <fmt/ranges.h>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/util/log.hh>
+
+#include "auth/AuthClientHandler.h"
+#include "auth/RotatingKeyRing.h"
+
+#include "common/hostname.h"
+
+#include "crimson/auth/KeyRing.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/log.h"
+#include "crimson/common/logclient.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Errors.h"
+#include "crimson/net/Messenger.h"
+
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MConfig.h"
+#include "messages/MLogAck.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonMap.h"
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+
+using std::string;
+using std::tuple;
+using std::vector;
+
+namespace {
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_monc);
+ }
+}
+
+namespace crimson::mon {
+
+using crimson::common::local_conf;
+
+class Connection : public seastar::enable_shared_from_this<Connection> {
+public:
+ Connection(const AuthRegistry& auth_registry,
+ crimson::net::ConnectionRef conn,
+ KeyRing* keyring);
+ enum class auth_result_t {
+ success = 0,
+ failure,
+ canceled
+ };
+ seastar::future<> handle_auth_reply(Ref<MAuthReply> m);
+ // v2
+ seastar::future<auth_result_t> authenticate_v2();
+ auth::AuthClient::auth_request_t
+ get_auth_request(const EntityName& name,
+ uint32_t want_keys);
+ using secret_t = string;
+ tuple<CryptoKey, secret_t, bufferlist>
+ handle_auth_reply_more(const ceph::buffer::list& bl);
+ int handle_auth_bad_method(uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes);
+ tuple<CryptoKey, secret_t, int>
+ handle_auth_done(uint64_t new_global_id,
+ const ceph::buffer::list& bl);
+ void close();
+ bool is_my_peer(const entity_addr_t& addr) const;
+ AuthAuthorizer* get_authorizer(entity_type_t peer) const;
+ KeyStore& get_keys();
+ seastar::future<> renew_tickets();
+ seastar::future<> renew_rotating_keyring();
+
+ crimson::net::ConnectionRef get_conn();
+
+private:
+ std::unique_ptr<AuthClientHandler> create_auth(crimson::auth::method_t,
+ uint64_t global_id,
+ const EntityName& name,
+ uint32_t want_keys);
+ enum class request_t {
+ rotating,
+ general,
+ };
+ seastar::future<std::optional<auth_result_t>> do_auth_single(request_t);
+ seastar::future<auth_result_t> do_auth(request_t);
+
+private:
+ bool closed = false;
+ seastar::shared_promise<Ref<MAuthReply>> auth_reply;
+ // v2
+ using clock_t = seastar::lowres_system_clock;
+ clock_t::time_point auth_start;
+ crimson::auth::method_t auth_method = 0;
+ std::optional<seastar::promise<auth_result_t>> auth_done;
+ const AuthRegistry& auth_registry;
+ crimson::net::ConnectionRef conn;
+ std::unique_ptr<AuthClientHandler> auth;
+ std::unique_ptr<RotatingKeyRing> rotating_keyring;
+ uint64_t global_id = 0;
+ clock_t::time_point last_rotating_renew_sent;
+};
+
+Connection::Connection(const AuthRegistry& auth_registry,
+ crimson::net::ConnectionRef conn,
+ KeyRing* keyring)
+ : auth_registry{auth_registry},
+ conn{conn},
+ rotating_keyring{
+ std::make_unique<RotatingKeyRing>(nullptr,
+ CEPH_ENTITY_TYPE_OSD,
+ keyring)}
+{}
+
+seastar::future<> Connection::handle_auth_reply(Ref<MAuthReply> m)
+{
+ logger().info("{}", __func__);
+ ceph_assert(m);
+ auth_reply.set_value(m);
+ auth_reply = {};
+ return seastar::now();
+}
+
+seastar::future<> Connection::renew_tickets()
+{
+ if (auth->need_tickets()) {
+ logger().info("{}: retrieving new tickets", __func__);
+ return do_auth(request_t::general).then([](const auth_result_t r) {
+ if (r == auth_result_t::failure) {
+ logger().info("renew_tickets: ignoring failed auth reply");
+ }
+ });
+ } else {
+ logger().debug("{}: don't need new tickets", __func__);
+ return seastar::now();
+ }
+}
+
+seastar::future<> Connection::renew_rotating_keyring()
+{
+ auto now = clock_t::now();
+ auto ttl = std::chrono::seconds{
+ static_cast<long>(crimson::common::local_conf()->auth_service_ticket_ttl)};
+ auto cutoff = utime_t{now - std::min(std::chrono::seconds{30}, ttl / 4)};
+ if (!rotating_keyring->need_new_secrets(cutoff)) {
+ logger().debug("renew_rotating_keyring secrets are up-to-date "
+ "(they expire after {})", cutoff);
+ return seastar::now();
+ } else {
+ logger().info("renew_rotating_keyring renewing rotating keys "
+ " (they expired before {})", cutoff);
+ }
+ if ((now > last_rotating_renew_sent) &&
+ (now - last_rotating_renew_sent < std::chrono::seconds{1})) {
+ logger().info("renew_rotating_keyring called too often (last: {})",
+ utime_t{last_rotating_renew_sent});
+ return seastar::now();
+ }
+ last_rotating_renew_sent = now;
+ return do_auth(request_t::rotating).then([](const auth_result_t r) {
+ if (r == auth_result_t::failure) {
+ logger().info("renew_rotating_keyring: ignoring failed auth reply");
+ }
+ });
+}
+
+AuthAuthorizer* Connection::get_authorizer(entity_type_t peer) const
+{
+ if (auth) {
+ return auth->build_authorizer(peer);
+ } else {
+ return nullptr;
+ }
+}
+
+KeyStore& Connection::get_keys() {
+ return *rotating_keyring;
+}
+
+std::unique_ptr<AuthClientHandler>
+Connection::create_auth(crimson::auth::method_t protocol,
+ uint64_t global_id,
+ const EntityName& name,
+ uint32_t want_keys)
+{
+ static crimson::common::CephContext cct;
+ std::unique_ptr<AuthClientHandler> auth;
+ auth.reset(AuthClientHandler::create(&cct,
+ protocol,
+ rotating_keyring.get()));
+ if (!auth) {
+ logger().error("no handler for protocol {}", protocol);
+ throw std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure));
+ }
+ auth->init(name);
+ auth->set_want_keys(want_keys);
+ auth->set_global_id(global_id);
+ return auth;
+}
+
+seastar::future<std::optional<Connection::auth_result_t>>
+Connection::do_auth_single(Connection::request_t what)
+{
+ auto m = crimson::make_message<MAuth>();
+ m->protocol = auth->get_protocol();
+ auth->prepare_build_request();
+ switch (what) {
+ case request_t::rotating:
+ auth->build_rotating_request(m->auth_payload);
+ break;
+ case request_t::general:
+ if (int ret = auth->build_request(m->auth_payload); ret) {
+ logger().error("missing/bad key for '{}'", local_conf()->name);
+ throw std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure));
+ }
+ break;
+ default:
+ assert(0);
+ }
+ logger().info("sending {}", *m);
+ return conn->send(std::move(m)).then([this] {
+ logger().info("waiting");
+ return auth_reply.get_shared_future();
+ }).then([this, life_extender=shared_from_this()] (Ref<MAuthReply> m) {
+ if (!m) {
+ ceph_assert(closed);
+ logger().info("do_auth_single: connection closed");
+ return std::make_optional(auth_result_t::canceled);
+ }
+ logger().info("do_auth_single: {} returns {}: {}",
+ *conn, *m, m->result);
+ auto p = m->result_bl.cbegin();
+ auto ret = auth->handle_response(m->result, p,
+ nullptr, nullptr);
+ std::optional<Connection::auth_result_t> auth_result;
+ switch (ret) {
+ case -EAGAIN:
+ auth_result = std::nullopt;
+ break;
+ case 0:
+ auth_result = auth_result_t::success;
+ break;
+ default:
+ auth_result = auth_result_t::failure;
+ logger().error(
+ "do_auth_single: got error {} on mon {}",
+ ret, conn->get_peer_addr());
+ break;
+ }
+ return auth_result;
+ });
+}
+
+seastar::future<Connection::auth_result_t>
+Connection::do_auth(Connection::request_t what) {
+ return seastar::repeat_until_value(
+ [this, life_extender=shared_from_this(), what]() {
+ return do_auth_single(what);
+ });
+}
+
+seastar::future<Connection::auth_result_t> Connection::authenticate_v2()
+{
+ auth_start = seastar::lowres_system_clock::now();
+ return conn->send(crimson::make_message<MMonGetMap>()).then([this] {
+ auth_done.emplace();
+ return auth_done->get_future();
+ });
+}
+
+auth::AuthClient::auth_request_t
+Connection::get_auth_request(const EntityName& entity_name,
+ uint32_t want_keys)
+{
+ // choose method
+ auth_method = [&] {
+ std::vector<crimson::auth::method_t> methods;
+ auth_registry.get_supported_methods(conn->get_peer_type(), &methods);
+ if (methods.empty()) {
+ logger().info("get_auth_request no methods is supported");
+ throw crimson::auth::error("no methods is supported");
+ }
+ return methods.front();
+ }();
+
+ std::vector<uint32_t> modes;
+ auth_registry.get_supported_modes(conn->get_peer_type(), auth_method,
+ &modes);
+ logger().info("method {} preferred_modes {}", auth_method, modes);
+ if (modes.empty()) {
+ throw crimson::auth::error("no modes is supported");
+ }
+ auth = create_auth(auth_method, global_id, entity_name, want_keys);
+
+ using ceph::encode;
+ bufferlist bl;
+ // initial request includes some boilerplate...
+ encode((char)AUTH_MODE_MON, bl);
+ encode(entity_name, bl);
+ encode(global_id, bl);
+ // and (maybe) some method-specific initial payload
+ auth->build_initial_request(&bl);
+ return {auth_method, modes, bl};
+}
+
+tuple<CryptoKey, Connection::secret_t, bufferlist>
+Connection::handle_auth_reply_more(const ceph::buffer::list& payload)
+{
+ CryptoKey session_key;
+ secret_t connection_secret;
+ bufferlist reply;
+ auto p = payload.cbegin();
+ int r = auth->handle_response(0, p, &session_key, &connection_secret);
+ if (r == -EAGAIN) {
+ auth->prepare_build_request();
+ auth->build_request(reply);
+ logger().info(" responding with {} bytes", reply.length());
+ return {session_key, connection_secret, reply};
+ } else if (r < 0) {
+ logger().error(" handle_response returned {}", r);
+ throw crimson::auth::error("unable to build auth");
+ } else {
+ logger().info("authenticated!");
+ std::terminate();
+ }
+}
+
+tuple<CryptoKey, Connection::secret_t, int>
+Connection::handle_auth_done(uint64_t new_global_id,
+ const ceph::buffer::list& payload)
+{
+ global_id = new_global_id;
+ auth->set_global_id(global_id);
+ auto p = payload.begin();
+ CryptoKey session_key;
+ secret_t connection_secret;
+ int r = auth->handle_response(0, p, &session_key, &connection_secret);
+ conn->set_last_keepalive_ack(auth_start);
+ if (auth_done) {
+ auth_done->set_value(auth_result_t::success);
+ auth_done.reset();
+ }
+ return {session_key, connection_secret, r};
+}
+
+int Connection::handle_auth_bad_method(uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes)
+{
+ logger().info("old_auth_method {} result {} allowed_methods {}",
+ old_auth_method, cpp_strerror(result), allowed_methods);
+ std::vector<uint32_t> auth_supported;
+ auth_registry.get_supported_methods(conn->get_peer_type(), &auth_supported);
+ auto p = std::find(auth_supported.begin(), auth_supported.end(),
+ old_auth_method);
+ assert(p != auth_supported.end());
+ p = std::find_first_of(std::next(p), auth_supported.end(),
+ allowed_methods.begin(), allowed_methods.end());
+ if (p == auth_supported.end()) {
+ logger().error("server allowed_methods {} but i only support {}",
+ allowed_methods, auth_supported);
+ assert(auth_done);
+ auth_done->set_exception(std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure)));
+ return -EACCES;
+ }
+ auth_method = *p;
+ logger().info("will try {} next", auth_method);
+ return 0;
+}
+
+void Connection::close()
+{
+ logger().info("{}", __func__);
+ auth_reply.set_value(Ref<MAuthReply>(nullptr));
+ auth_reply = {};
+ if (auth_done) {
+ auth_done->set_value(auth_result_t::canceled);
+ auth_done.reset();
+ }
+ if (conn && !std::exchange(closed, true)) {
+ conn->mark_down();
+ }
+}
+
+bool Connection::is_my_peer(const entity_addr_t& addr) const
+{
+ ceph_assert(conn);
+ return conn->get_peer_addr() == addr;
+}
+
+crimson::net::ConnectionRef Connection::get_conn() {
+ return conn;
+}
+
+Client::mon_command_t::mon_command_t(MURef<MMonCommand> req)
+ : req(std::move(req))
+{}
+
+Client::Client(crimson::net::Messenger& messenger,
+ crimson::common::AuthHandler& auth_handler)
+ // currently, crimson is OSD-only
+ : want_keys{CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD |
+ CEPH_ENTITY_TYPE_MGR},
+ timer{[this] { tick(); }},
+ msgr{messenger},
+ log_client{nullptr},
+ auth_registry{&cct},
+ auth_handler{auth_handler}
+{}
+
+Client::Client(Client&&) = default;
+Client::~Client() = default;
+
+seastar::future<> Client::start() {
+ entity_name = crimson::common::local_conf()->name;
+ auth_registry.refresh_config();
+ return load_keyring().then([this] {
+ return monmap.build_initial(crimson::common::local_conf(), false);
+ }).then([this] {
+ return authenticate();
+ }).then([this] {
+ auto interval =
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(
+ std::chrono::duration<double>(
+ local_conf().get_val<double>("mon_client_ping_interval")));
+ timer.arm_periodic(interval);
+ });
+}
+
+seastar::future<> Client::load_keyring()
+{
+ if (!auth_registry.is_supported_method(msgr.get_mytype(), CEPH_AUTH_CEPHX)) {
+ return seastar::now();
+ } else {
+ return crimson::auth::load_from_keyring(&keyring).then([](KeyRing* keyring) {
+ return crimson::auth::load_from_keyfile(keyring);
+ }).then([](KeyRing* keyring) {
+ return crimson::auth::load_from_key(keyring);
+ }).then([](KeyRing*) {
+ return seastar::now();
+ });
+ }
+}
+
+void Client::tick()
+{
+ gate.dispatch_in_background(__func__, *this, [this] {
+ if (active_con) {
+ return seastar::when_all_succeed(wait_for_send_log(),
+ active_con->get_conn()->send_keepalive(),
+ active_con->renew_tickets(),
+ active_con->renew_rotating_keyring()).discard_result();
+ } else {
+ assert(is_hunting());
+ logger().info("{} continuing the hunt", __func__);
+ return authenticate();
+ }
+ });
+}
+
+seastar::future<> Client::wait_for_send_log() {
+ utime_t now = ceph_clock_now();
+ if (now > last_send_log + cct._conf->mon_client_log_interval) {
+ last_send_log = now;
+ return send_log(log_flushing_t::NO_FLUSH);
+ }
+ return seastar::now();
+}
+
+seastar::future<> Client::send_log(log_flushing_t flush_flag) {
+ if (log_client) {
+ if (auto lm = log_client->get_mon_log_message(flush_flag); lm) {
+ return send_message(std::move(lm));
+ }
+ more_log_pending = log_client->are_pending();
+ }
+ return seastar::now();
+}
+
+bool Client::is_hunting() const {
+ return !active_con;
+}
+
+std::optional<seastar::future<>>
+Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ // we only care about these message types
+ switch (m->get_type()) {
+ case CEPH_MSG_MON_MAP:
+ return handle_monmap(*conn, boost::static_pointer_cast<MMonMap>(m));
+ case CEPH_MSG_AUTH_REPLY:
+ return handle_auth_reply(
+ *conn, boost::static_pointer_cast<MAuthReply>(m));
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ return handle_subscribe_ack(
+ boost::static_pointer_cast<MMonSubscribeAck>(m));
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ return handle_get_version_reply(
+ boost::static_pointer_cast<MMonGetVersionReply>(m));
+ case MSG_MON_COMMAND_ACK:
+ return handle_mon_command_ack(
+ boost::static_pointer_cast<MMonCommandAck>(m));
+ case MSG_LOGACK:
+ return handle_log_ack(
+ boost::static_pointer_cast<MLogAck>(m));
+ case MSG_CONFIG:
+ return handle_config(
+ boost::static_pointer_cast<MConfig>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Client::ms_handle_reset(crimson::net::ConnectionRef conn, bool /* is_replace */)
+{
+ gate.dispatch_in_background(__func__, *this, [this, conn] {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found != pending_conns.end()) {
+ logger().warn("pending conn reset by {}", conn->get_peer_addr());
+ (*found)->close();
+ pending_conns.erase(found);
+ return seastar::now();
+ } else if (active_con && active_con->is_my_peer(conn->get_peer_addr())) {
+ logger().warn("active conn reset {}", conn->get_peer_addr());
+ return reopen_session(-1).then([this](bool opened) {
+ if (opened) {
+ return on_session_opened();
+ } else {
+ return seastar::now();
+ }
+ });
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+Client::get_supported_auth_methods(int peer_type)
+{
+ std::vector<uint32_t> methods;
+ std::vector<uint32_t> modes;
+ auth_registry.get_supported_methods(peer_type, &methods, &modes);
+ return {methods, modes};
+}
+
+uint32_t Client::pick_con_mode(int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes)
+{
+ return auth_registry.pick_mode(peer_type, auth_method, preferred_modes);
+}
+
+AuthAuthorizeHandler* Client::get_auth_authorize_handler(int peer_type,
+ int auth_method)
+{
+ return auth_registry.get_handler(peer_type, auth_method);
+}
+
+
+int Client::handle_auth_request(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const ceph::bufferlist& payload,
+ uint64_t *p_peer_global_id,
+ ceph::bufferlist *reply)
+{
+ if (payload.length() == 0) {
+ return -EACCES;
+ }
+ auth_meta.auth_mode = payload[0];
+ if (auth_meta.auth_mode < AUTH_MODE_AUTHORIZER ||
+ auth_meta.auth_mode > AUTH_MODE_AUTHORIZER_MAX) {
+ return -EACCES;
+ }
+ AuthAuthorizeHandler* ah = get_auth_authorize_handler(conn.get_peer_type(),
+ auth_method);
+ if (!ah) {
+ logger().error("no AuthAuthorizeHandler found for auth method: {}",
+ auth_method);
+ return -EOPNOTSUPP;
+ }
+ auto authorizer_challenge = &auth_meta.authorizer_challenge;
+ if (auth_meta.skip_authorizer_challenge) {
+ logger().info("skipping challenge on {}", conn);
+ authorizer_challenge = nullptr;
+ }
+ if (!active_con) {
+ logger().info("auth request during inactivity period");
+ // let's instruct the client to come back later
+ return -EBUSY;
+ }
+ bool was_challenge = (bool)auth_meta.authorizer_challenge;
+ EntityName name;
+ AuthCapsInfo caps_info;
+ bool is_valid = ah->verify_authorizer(
+ &cct,
+ active_con->get_keys(),
+ payload,
+ auth_meta.get_connection_secret_length(),
+ reply,
+ &name,
+ p_peer_global_id,
+ &caps_info,
+ &auth_meta.session_key,
+ &auth_meta.connection_secret,
+ authorizer_challenge);
+ if (is_valid) {
+ auth_handler.handle_authentication(name, caps_info);
+ return 1;
+ }
+ if (!more && !was_challenge && auth_meta.authorizer_challenge) {
+ logger().info("added challenge on {}", conn);
+ return 0;
+ } else {
+ logger().info("bad authorizer on {}", conn);
+ return -EACCES;
+ }
+}
+
+auth::AuthClient::auth_request_t
+Client::get_auth_request(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta)
+{
+ logger().info("get_auth_request(conn={}, auth_method={})",
+ conn, auth_meta.auth_method);
+ // connection to mon?
+ if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn.get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found == pending_conns.end()) {
+ throw crimson::auth::error{"unknown connection"};
+ }
+ return (*found)->get_auth_request(entity_name, want_keys);
+ } else {
+ // generate authorizer
+ if (!active_con) {
+ logger().error(" but no auth handler is set up");
+ throw crimson::auth::error("no auth available");
+ }
+ auto authorizer = active_con->get_authorizer(conn.get_peer_type());
+ if (!authorizer) {
+ logger().error("failed to build_authorizer for type {}",
+ ceph_entity_type_name(conn.get_peer_type()));
+ throw crimson::auth::error("unable to build auth");
+ }
+ auth_meta.authorizer.reset(authorizer);
+ auth_meta.auth_method = authorizer->protocol;
+ vector<uint32_t> modes;
+ auth_registry.get_supported_modes(conn.get_peer_type(),
+ auth_meta.auth_method,
+ &modes);
+ return {authorizer->protocol, modes, authorizer->bl};
+ }
+}
+
+ceph::bufferlist Client::handle_auth_reply_more(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ const bufferlist& bl)
+{
+ if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn.get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found == pending_conns.end()) {
+ throw crimson::auth::error{"unknown connection"};
+ }
+ bufferlist reply;
+ tie(auth_meta.session_key, auth_meta.connection_secret, reply) =
+ (*found)->handle_auth_reply_more(bl);
+ return reply;
+ } else {
+ // authorizer challenges
+ if (!active_con || !auth_meta.authorizer) {
+ logger().error("no authorizer?");
+ throw crimson::auth::error("no auth available");
+ }
+ auth_meta.authorizer->add_challenge(&cct, bl);
+ return auth_meta.authorizer->bl;
+ }
+}
+
+int Client::handle_auth_done(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint64_t global_id,
+ uint32_t /*con_mode*/,
+ const bufferlist& bl)
+{
+ if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn.get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found == pending_conns.end()) {
+ return -ENOENT;
+ }
+ int r = 0;
+ tie(auth_meta.session_key, auth_meta.connection_secret, r) =
+ (*found)->handle_auth_done(global_id, bl);
+ return r;
+ } else {
+ // verify authorizer reply
+ auto p = bl.begin();
+ if (!auth_meta.authorizer->verify_reply(p, &auth_meta.connection_secret)) {
+ logger().error("failed verifying authorizer reply");
+ return -EACCES;
+ }
+ auth_meta.session_key = auth_meta.authorizer->session_key;
+ return 0;
+ }
+}
+
+ // Handle server's indication that the previous auth attempt failed
+int Client::handle_auth_bad_method(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes)
+{
+ if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn.get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found != pending_conns.end()) {
+ return (*found)->handle_auth_bad_method(
+ old_auth_method, result,
+ allowed_methods, allowed_modes);
+ } else {
+ return -ENOENT;
+ }
+ } else {
+ // huh...
+ logger().info("hmm, they didn't like {} result {}",
+ old_auth_method, cpp_strerror(result));
+ return -EACCES;
+ }
+}
+
+seastar::future<> Client::handle_monmap(crimson::net::Connection &conn,
+ Ref<MMonMap> m)
+{
+ monmap.decode(m->monmapbl);
+ const auto peer_addr = conn.get_peer_addr();
+ auto cur_mon = monmap.get_name(peer_addr);
+ logger().info("got monmap {}, mon.{}, is now rank {}",
+ monmap.epoch, cur_mon, monmap.get_rank(cur_mon));
+ sub.got("monmap", monmap.get_epoch());
+
+ if (monmap.get_addr_name(peer_addr, cur_mon)) {
+ if (active_con) {
+ logger().info("handle_monmap: renewing tickets");
+ return seastar::when_all_succeed(
+ active_con->renew_tickets(),
+ active_con->renew_rotating_keyring()).then_unpack([] {
+ logger().info("handle_mon_map: renewed tickets");
+ });
+ } else {
+ return seastar::now();
+ }
+ } else {
+ logger().warn("mon.{} went away", cur_mon);
+ return reopen_session(-1).then([this](bool opened) {
+ if (opened) {
+ return on_session_opened();
+ } else {
+ return seastar::now();
+ }
+ });
+ }
+}
+
+seastar::future<> Client::handle_auth_reply(crimson::net::Connection &conn,
+ Ref<MAuthReply> m)
+{
+ logger().info("handle_auth_reply {} returns {}: {}",
+ conn, *m, m->result);
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn.get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found != pending_conns.end()) {
+ return (*found)->handle_auth_reply(m);
+ } else if (active_con) {
+ return active_con->handle_auth_reply(m).then([this] {
+ return seastar::when_all_succeed(
+ active_con->renew_rotating_keyring(),
+ active_con->renew_tickets()).discard_result();
+ });
+ } else {
+ logger().error("unknown auth reply from {}", conn.get_peer_addr());
+ return seastar::now();
+ }
+}
+
+seastar::future<> Client::handle_subscribe_ack(Ref<MMonSubscribeAck> m)
+{
+ sub.acked(m->interval);
+ return seastar::now();
+}
+
+Client::get_version_t Client::get_version(const std::string& map)
+{
+ auto m = crimson::make_message<MMonGetVersion>();
+ auto tid = ++last_version_req_id;
+ m->handle = tid;
+ m->what = map;
+ auto& req = version_reqs[tid];
+ return send_message(std::move(m)).then([&req] {
+ return req.get_future();
+ });
+}
+
+seastar::future<>
+Client::handle_get_version_reply(Ref<MMonGetVersionReply> m)
+{
+ if (auto found = version_reqs.find(m->handle);
+ found != version_reqs.end()) {
+ auto& result = found->second;
+ logger().trace("{}: {} returns {}",
+ __func__, m->handle, m->version);
+ result.set_value(std::make_tuple(m->version, m->oldest_version));
+ version_reqs.erase(found);
+ } else {
+ logger().warn("{}: version request with handle {} not found",
+ __func__, m->handle);
+ }
+ return seastar::now();
+}
+
+seastar::future<> Client::handle_mon_command_ack(Ref<MMonCommandAck> m)
+{
+ const auto tid = m->get_tid();
+ if (auto found = std::find_if(mon_commands.begin(),
+ mon_commands.end(),
+ [tid](auto& cmd) {
+ return cmd.req->get_tid() == tid;
+ });
+ found != mon_commands.end()) {
+ auto& command = *found;
+ logger().trace("{} {}", __func__, tid);
+ command.result.set_value(std::make_tuple(m->r, m->rs, std::move(m->get_data())));
+ mon_commands.erase(found);
+ } else {
+ logger().warn("{} {} not found", __func__, tid);
+ }
+ return seastar::now();
+}
+
+seastar::future<> Client::handle_log_ack(Ref<MLogAck> m)
+{
+ if (log_client) {
+ return log_client->handle_log_ack(m).then([this] {
+ if (more_log_pending) {
+ return send_log(log_flushing_t::NO_FLUSH);
+ } else {
+ return seastar::now();
+ }
+ });
+ }
+ return seastar::now();
+}
+
+seastar::future<> Client::handle_config(Ref<MConfig> m)
+{
+ return crimson::common::local_conf().set_mon_vals(m->config).then([this] {
+ if (config_updated) {
+ config_updated->set_value();
+ }
+ });
+}
+
+std::vector<unsigned> Client::get_random_mons(unsigned n) const
+{
+ uint16_t min_priority = std::numeric_limits<uint16_t>::max();
+ for (const auto& m : monmap.mon_info) {
+ if (m.second.priority < min_priority) {
+ min_priority = m.second.priority;
+ }
+ }
+ vector<unsigned> ranks;
+ for (auto [name, info] : monmap.mon_info) {
+ if (info.priority == min_priority) {
+ ranks.push_back(monmap.get_rank(name));
+ }
+ }
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(ranks.begin(), ranks.end(), rng);
+ if (n == 0 || n > ranks.size()) {
+ return ranks;
+ } else {
+ return {ranks.begin(), ranks.begin() + n};
+ }
+}
+
+seastar::future<> Client::authenticate()
+{
+ return reopen_session(-1).then([this](bool opened) {
+ if (opened) {
+ return on_session_opened();
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<> Client::stop()
+{
+ logger().info("{}", __func__);
+ auto fut = gate.close();
+ timer.cancel();
+ ready_to_send = false;
+ for (auto& pending_con : pending_conns) {
+ pending_con->close();
+ }
+ if (active_con) {
+ active_con->close();
+ }
+ return fut;
+}
+
+static entity_addr_t choose_client_addr(
+ const entity_addrvec_t& my_addrs,
+ const entity_addrvec_t& client_addrs)
+{
+ // here is where we decide which of the addrs to connect to. always prefer
+ // the first one, if we support it.
+ for (const auto& a : client_addrs.v) {
+ if (a.is_msgr2()) {
+ // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before
+ // trying it? for now, just pick whichever is listed first.
+ return a;
+ }
+ }
+ return entity_addr_t{};
+}
+
+seastar::future<bool> Client::reopen_session(int rank)
+{
+ logger().info("{} to mon.{}", __func__, rank);
+ ready_to_send = false;
+ if (active_con) {
+ active_con->close();
+ active_con = nullptr;
+ ceph_assert(pending_conns.empty());
+ } else {
+ for (auto& pending_con : pending_conns) {
+ pending_con->close();
+ }
+ pending_conns.clear();
+ }
+ vector<unsigned> mons;
+ if (rank >= 0) {
+ mons.push_back(rank);
+ } else {
+ const auto parallel =
+ crimson::common::local_conf().get_val<uint64_t>("mon_client_hunt_parallel");
+ mons = get_random_mons(parallel);
+ }
+ pending_conns.reserve(mons.size());
+ return seastar::parallel_for_each(mons, [this](auto rank) {
+ auto peer = choose_client_addr(msgr.get_myaddrs(),
+ monmap.get_addrs(rank));
+ if (peer == entity_addr_t{}) {
+ // crimson msgr only uses the first bound addr
+ logger().warn("mon.{} does not have an addr compatible with me", rank);
+ return seastar::now();
+ }
+ logger().info("connecting to mon.{}", rank);
+ auto conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MON);
+ auto& mc = pending_conns.emplace_back(
+ seastar::make_shared<Connection>(auth_registry, conn, &keyring));
+ assert(conn->get_peer_addr().is_msgr2());
+ return mc->authenticate_v2().then([peer, this](auto result) {
+ if (result == Connection::auth_result_t::success) {
+ _finish_auth(peer);
+ }
+ logger().debug("reopen_session mon connection attempts complete");
+ }).handle_exception([](auto ep) {
+ logger().error("mon connections failed with ep {}", ep);
+ return seastar::make_exception_future(ep);
+ });
+ }).then([this] {
+ if (active_con) {
+ return true;
+ } else {
+ logger().warn("cannot establish the active_con with any mon");
+ return false;
+ }
+ });
+}
+
+void Client::_finish_auth(const entity_addr_t& peer)
+{
+ if (!is_hunting()) {
+ return;
+ }
+ logger().info("found mon.{}", monmap.get_name(peer));
+
+ auto found = std::find_if(
+ pending_conns.begin(), pending_conns.end(),
+ [peer](auto& conn) {
+ return conn->is_my_peer(peer);
+ });
+ if (found == pending_conns.end()) {
+ // Happens if another connection has won the race
+ ceph_assert(active_con && pending_conns.empty());
+ logger().info("no pending connection for mon.{}, peer {}",
+ monmap.get_name(peer), peer);
+ return;
+ }
+
+ ceph_assert(!active_con && !pending_conns.empty());
+ // It's too early to toggle the `ready_to_send` flag. It will
+ // be set after finishing the MAuth exchange and draining out
+ // the `pending_messages` queue.
+ active_con = std::move(*found);
+ *found = nullptr;
+ for (auto& conn : pending_conns) {
+ if (conn) {
+ conn->close();
+ }
+ }
+ pending_conns.clear();
+}
+
+Client::command_result_t
+Client::run_command(std::string&& cmd,
+ bufferlist&& bl)
+{
+ auto m = crimson::make_message<MMonCommand>(monmap.fsid);
+ auto tid = ++last_mon_command_id;
+ m->set_tid(tid);
+ m->cmd = {std::move(cmd)};
+ m->set_data(std::move(bl));
+ auto& command = mon_commands.emplace_back(crimson::make_message<MMonCommand>(*m));
+ return send_message(std::move(m)).then([&result=command.result] {
+ return result.get_future();
+ });
+}
+
+seastar::future<> Client::send_message(MessageURef m)
+{
+ if (active_con && ready_to_send) {
+ assert(pending_messages.empty());
+ return active_con->get_conn()->send(std::move(m));
+ } else {
+ auto& delayed = pending_messages.emplace_back(std::move(m));
+ return delayed.pr.get_future();
+ }
+}
+
+seastar::future<> Client::on_session_opened()
+{
+ return active_con->renew_rotating_keyring().then([this] {
+ if (!active_con) {
+ // the connection can be closed even in the middle of the opening sequence
+ logger().info("on_session_opened {}: connection closed", __LINE__);
+ return seastar::now();
+ }
+ for (auto& m : pending_messages) {
+ (void) active_con->get_conn()->send(std::move(m.msg));
+ m.pr.set_value();
+ }
+ pending_messages.clear();
+ ready_to_send = true;
+ return sub.reload() ? renew_subs() : seastar::now();
+ }).then([this] {
+ if (!active_con) {
+ logger().info("on_session_opened {}: connection closed", __LINE__);
+ return seastar::now();
+ }
+ return seastar::parallel_for_each(mon_commands,
+ [this](auto &command) {
+ return send_message(crimson::make_message<MMonCommand>(*command.req));
+ });
+ });
+}
+
+bool Client::sub_want(const std::string& what, version_t start, unsigned flags)
+{
+ return sub.want(what, start, flags);
+}
+
+void Client::sub_got(const std::string& what, version_t have)
+{
+ sub.got(what, have);
+}
+
+void Client::sub_unwant(const std::string& what)
+{
+ sub.unwant(what);
+}
+
+bool Client::sub_want_increment(const std::string& what,
+ version_t start,
+ unsigned flags)
+{
+ return sub.inc_want(what, start, flags);
+}
+
+seastar::future<> Client::renew_subs()
+{
+ if (!sub.have_new()) {
+ logger().warn("{} - empty", __func__);
+ return seastar::now();
+ }
+ logger().trace("{}", __func__);
+
+ auto m = crimson::make_message<MMonSubscribe>();
+ m->what = sub.get_subs();
+ m->hostname = ceph_get_short_hostname();
+ return send_message(std::move(m)).then([this] {
+ sub.renewed();
+ });
+}
+
+seastar::future<> Client::wait_for_config()
+{
+ assert(!config_updated);
+ config_updated = seastar::promise<>();
+ return config_updated->get_future();
+}
+
+void Client::print(std::ostream& out) const
+{
+ out << "mon." << entity_name;
+}
+
+} // namespace crimson::mon
diff --git a/src/crimson/mon/MonClient.h b/src/crimson/mon/MonClient.h
new file mode 100644
index 000000000..1228ecd0b
--- /dev/null
+++ b/src/crimson/mon/MonClient.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/timer.hh>
+
+#include "auth/AuthRegistry.h"
+#include "auth/KeyRing.h"
+#include "common/ceph_context.h"
+
+#include "crimson/auth/AuthClient.h"
+#include "crimson/auth/AuthServer.h"
+#include "crimson/common/auth_handler.h"
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+
+#include "mon/MonMap.h"
+
+#include "mon/MonSub.h"
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+namespace crimson::net {
+ class Messenger;
+}
+
+class LogClient;
+
+struct AuthAuthorizeHandler;
+class MAuthReply;
+struct MMonMap;
+struct MMonSubscribeAck;
+struct MMonGetVersionReply;
+struct MMonCommand;
+struct MMonCommandAck;
+struct MLogAck;
+struct MConfig;
+
+enum class log_flushing_t;
+
+namespace crimson::mon {
+
+class Connection;
+
+class Client : public crimson::net::Dispatcher,
+ public crimson::auth::AuthClient,
+ public crimson::auth::AuthServer
+{
+ EntityName entity_name;
+ KeyRing keyring;
+ const uint32_t want_keys;
+
+ MonMap monmap;
+ bool ready_to_send = false;
+ seastar::shared_ptr<Connection> active_con;
+ std::vector<seastar::shared_ptr<Connection>> pending_conns;
+ seastar::timer<seastar::lowres_clock> timer;
+
+ crimson::net::Messenger& msgr;
+
+ LogClient *log_client;
+ bool more_log_pending = false;
+ utime_t last_send_log;
+
+ seastar::future<> send_log(log_flushing_t flush_flag);
+ seastar::future<> wait_for_send_log();
+
+ // commands
+ using get_version_t = seastar::future<std::tuple<version_t, version_t>>;
+
+ ceph_tid_t last_version_req_id = 0;
+ std::map<ceph_tid_t, typename get_version_t::promise_type> version_reqs;
+
+ ceph_tid_t last_mon_command_id = 0;
+ using command_result_t =
+ seastar::future<std::tuple<std::int32_t, std::string, ceph::bufferlist>>;
+ struct mon_command_t {
+ MURef<MMonCommand> req;
+ typename command_result_t::promise_type result;
+ mon_command_t(MURef<MMonCommand> req);
+ };
+ std::vector<mon_command_t> mon_commands;
+
+ MonSub sub;
+
+public:
+ Client(crimson::net::Messenger&, crimson::common::AuthHandler&);
+ Client(Client&&);
+ ~Client();
+ seastar::future<> start();
+ seastar::future<> stop();
+
+ void set_log_client(LogClient *clog) {
+ log_client = clog;
+ }
+
+ const uuid_d& get_fsid() const {
+ return monmap.fsid;
+ }
+ get_version_t get_version(const std::string& map);
+ command_result_t run_command(std::string&& cmd,
+ bufferlist&& bl);
+ seastar::future<> send_message(MessageURef);
+ bool sub_want(const std::string& what, version_t start, unsigned flags);
+ void sub_got(const std::string& what, version_t have);
+ void sub_unwant(const std::string& what);
+ bool sub_want_increment(const std::string& what, version_t start, unsigned flags);
+ seastar::future<> renew_subs();
+ seastar::future<> wait_for_config();
+
+ void print(std::ostream&) const;
+private:
+ // AuthServer methods
+ std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+ get_supported_auth_methods(int peer_type) final;
+ uint32_t pick_con_mode(int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes) final;
+ AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type,
+ int auth_method) final;
+ int handle_auth_request(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const ceph::bufferlist& payload,
+ uint64_t *p_peer_global_id,
+ ceph::bufferlist *reply) final;
+
+ crimson::common::CephContext cct; // for auth_registry
+ AuthRegistry auth_registry;
+ crimson::common::AuthHandler& auth_handler;
+
+ // AuthClient methods
+ crimson::auth::AuthClient::auth_request_t
+ get_auth_request(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta) final;
+
+ // Handle server's request to continue the handshake
+ ceph::bufferlist handle_auth_reply_more(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ const bufferlist& bl) final;
+
+ // Handle server's indication that authentication succeeded
+ int handle_auth_done(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl) final;
+
+ // Handle server's indication that the previous auth attempt failed
+ int handle_auth_bad_method(crimson::net::Connection &conn,
+ AuthConnectionMeta &auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) final;
+
+private:
+ void tick();
+
+ std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef conn,
+ MessageRef m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override;
+
+ seastar::future<> handle_monmap(crimson::net::Connection &conn,
+ Ref<MMonMap> m);
+ seastar::future<> handle_auth_reply(crimson::net::Connection &conn,
+ Ref<MAuthReply> m);
+ seastar::future<> handle_subscribe_ack(Ref<MMonSubscribeAck> m);
+ seastar::future<> handle_get_version_reply(Ref<MMonGetVersionReply> m);
+ seastar::future<> handle_mon_command_ack(Ref<MMonCommandAck> m);
+ seastar::future<> handle_log_ack(Ref<MLogAck> m);
+ seastar::future<> handle_config(Ref<MConfig> m);
+
+ seastar::future<> on_session_opened();
+private:
+ seastar::future<> load_keyring();
+ seastar::future<> authenticate();
+
+ bool is_hunting() const;
+ // @param rank rank of the monitor to connect to; if negative, try to
+ // connect to all monitors in the monmap until one of them
+ // is connected
+ // @return true if a connection to a monitor is established
+ seastar::future<bool> reopen_session(int rank);
+ std::vector<unsigned> get_random_mons(unsigned n) const;
+ seastar::future<> _add_conn(unsigned rank, uint64_t global_id);
+ void _finish_auth(const entity_addr_t& peer);
+ crimson::common::Gated gate;
+
+ // messages that are waiting for the active_con to be available
+ struct pending_msg_t {
+ pending_msg_t(MessageURef m) : msg(std::move(m)) {}
+ MessageURef msg;
+ seastar::promise<> pr;
+ };
+ std::deque<pending_msg_t> pending_messages;
+ std::optional<seastar::promise<>> config_updated;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Client& client) {
+ client.print(out);
+ return out;
+}
+
+} // namespace crimson::mon
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::mon::Client> : fmt::ostream_formatter {};
+#endif
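As an illustrative sketch (not part of the patch), issuing a monitor command and consuming its reply could look like the following; `monc` is assumed to be an already-started crimson::mon::Client:

    std::ignore = monc.run_command(
      R"({"prefix": "status", "format": "json"})", ceph::bufferlist{}
    ).then([](auto&& reply) {
      // the tuple is <return code, status string, output payload>
      auto&& [rc, outs, outbl] = std::move(reply);
      // outbl holds the json-encoded cluster status when rc == 0
    });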
diff --git a/src/crimson/net/Connection.h b/src/crimson/net/Connection.h
new file mode 100644
index 000000000..7141e20f4
--- /dev/null
+++ b/src/crimson/net/Connection.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <queue>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "Fwd.h"
+
+namespace crimson::net {
+
+using seq_num_t = uint64_t;
+
+/**
+ * Connection
+ *
+ * Abstraction for messenger connections.
+ *
+ * Except when otherwise specified, methods must be invoked from the core on which
+ * the connection originates.
+ */
+class Connection : public seastar::enable_shared_from_this<Connection> {
+ public:
+ using clock_t = seastar::lowres_system_clock;
+
+ Connection() {}
+
+ virtual ~Connection() {}
+
+ /**
+ * get_shard_id
+ *
+ * The shard id where the Connection is dispatching events and handling I/O.
+ *
+ * May be changed with the accept/connect events.
+ */
+ virtual const seastar::shard_id get_shard_id() const = 0;
+
+ virtual const entity_name_t &get_peer_name() const = 0;
+
+ entity_type_t get_peer_type() const { return get_peer_name().type(); }
+ int64_t get_peer_id() const { return get_peer_name().num(); }
+ bool peer_is_mon() const { return get_peer_name().is_mon(); }
+ bool peer_is_mgr() const { return get_peer_name().is_mgr(); }
+ bool peer_is_mds() const { return get_peer_name().is_mds(); }
+ bool peer_is_osd() const { return get_peer_name().is_osd(); }
+ bool peer_is_client() const { return get_peer_name().is_client(); }
+
+ virtual const entity_addr_t &get_peer_addr() const = 0;
+
+ const entity_addrvec_t get_peer_addrs() const {
+ return entity_addrvec_t(get_peer_addr());
+ }
+
+ virtual const entity_addr_t &get_peer_socket_addr() const = 0;
+
+ virtual uint64_t get_features() const = 0;
+
+ bool has_feature(uint64_t f) const {
+ return get_features() & f;
+ }
+
+ /// true if the handshake has completed and no errors have been encountered
+ virtual bool is_connected() const = 0;
+
+ /**
+ * send
+ *
+ * Send a message over a connection that has completed its handshake.
+ *
+   * May be invoked from any core, but the returned futures must be chained
+   * to preserve ordering.
+ */
+ virtual seastar::future<> send(MessageURef msg) = 0;
+
+ /**
+ * send_keepalive
+ *
+ * Send a keepalive message over a connection that has completed its
+ * handshake.
+ *
+   * May be invoked from any core, but the returned futures must be chained
+   * to preserve ordering.
+ */
+ virtual seastar::future<> send_keepalive() = 0;
+
+ virtual clock_t::time_point get_last_keepalive() const = 0;
+
+ virtual clock_t::time_point get_last_keepalive_ack() const = 0;
+
+ // workaround for the monitor client
+ virtual void set_last_keepalive_ack(clock_t::time_point when) = 0;
+
+  // close the connection and cancel any pending futures from read/send,
+  // without dispatching any reset event
+ virtual void mark_down() = 0;
+
+ struct user_private_t {
+ virtual ~user_private_t() = default;
+ };
+
+ virtual bool has_user_private() const = 0;
+
+ virtual user_private_t &get_user_private() = 0;
+
+ virtual void set_user_private(std::unique_ptr<user_private_t>) = 0;
+
+ virtual void print(std::ostream& out) const = 0;
+
+#ifdef UNIT_TESTS_BUILT
+ virtual bool is_protocol_ready() const = 0;
+
+ virtual bool is_protocol_standby() const = 0;
+
+ virtual bool is_protocol_closed() const = 0;
+
+ virtual bool is_protocol_closed_clean() const = 0;
+
+ virtual bool peer_wins() const = 0;
+#endif
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Connection& conn) {
+ out << "[";
+ conn.print(out);
+ out << "]";
+ return out;
+}
+
+} // namespace crimson::net
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::net::Connection> : fmt::ostream_formatter {};
+#endif
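+
+// Illustrative usage sketch (editorial addition, not part of the upstream
+// header): sending over an established connection and attaching per-connection
+// user data. The SessionState type, the send_ping() helper and the
+// crimson::make_message<MPing>() call are assumptions for the example only.
+//
+//   struct SessionState : crimson::net::Connection::user_private_t {
+//     uint64_t msgs_sent = 0;
+//   };
+//
+//   seastar::future<> send_ping(crimson::net::ConnectionRef conn) {
+//     if (!conn->has_user_private()) {
+//       conn->set_user_private(std::make_unique<SessionState>());
+//     }
+//     auto &s = static_cast<SessionState&>(conn->get_user_private());
+//     ++s.msgs_sent;
+//     return conn->send(crimson::make_message<MPing>());
+//   }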
diff --git a/src/crimson/net/Dispatcher.h b/src/crimson/net/Dispatcher.h
new file mode 100644
index 000000000..9eea0a858
--- /dev/null
+++ b/src/crimson/net/Dispatcher.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "Fwd.h"
+
+class AuthAuthorizer;
+
+namespace crimson::net {
+
+class Dispatcher {
+ public:
+ virtual ~Dispatcher() {}
+
+  // Dispatchers are chained following the chain-of-responsibility pattern.
+  // If a dispatcher claims the message, it returns a valid future to prevent
+  // the remaining dispatchers from processing it; the returned future is also
+  // used to throttle the connection if the dispatcher is too busy.
+ virtual std::optional<seastar::future<>> ms_dispatch(ConnectionRef, MessageRef) = 0;
+
+  // The connection is moving to new_shard as part of accept/connect.
+  // Users should not operate on the connection from this shard thereafter.
+ virtual void ms_handle_shard_change(
+ ConnectionRef conn,
+ seastar::shard_id new_shard,
+ bool is_accept_or_connect) {}
+
+  // The connection is accepted or recovered (lossless); all the follow-up
+  // events and messages will be dispatched to this shard.
+ //
+ // is_replace=true means the accepted connection has replaced
+ // another connecting connection with the same peer_addr, which currently only
+ // happens under lossy policy when both sides wish to connect to each other.
+ virtual void ms_handle_accept(ConnectionRef conn, seastar::shard_id prv_shard, bool is_replace) {}
+
+  // The connection is (re)connected; all the follow-up events and messages
+  // will be dispatched to this shard.
+ virtual void ms_handle_connect(ConnectionRef conn, seastar::shard_id prv_shard) {}
+
+ // a reset event is dispatched when the connection is closed unexpectedly.
+ //
+ // is_replace=true means the reset connection is going to be replaced by
+ // another accepting connection with the same peer_addr, which currently only
+ // happens under lossy policy when both sides wish to connect to each other.
+ virtual void ms_handle_reset(ConnectionRef conn, bool is_replace) {}
+
+ virtual void ms_handle_remote_reset(ConnectionRef conn) {}
+};
+
+} // namespace crimson::net
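+
+// Illustrative sketch (editorial addition, not part of the upstream header):
+// a minimal dispatcher that claims ping messages and leaves everything else to
+// the next dispatcher in the chain; PingDispatcher is a hypothetical name used
+// only for this example.
+//
+//   class PingDispatcher final : public crimson::net::Dispatcher {
+//     std::optional<seastar::future<>> ms_dispatch(
+//         crimson::net::ConnectionRef, MessageRef m) final {
+//       if (m->get_type() == CEPH_MSG_PING) {
+//         return seastar::now();  // claimed: the rest of the chain is skipped
+//       }
+//       return std::nullopt;      // not claimed: try the next dispatcher
+//     }
+//   };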
diff --git a/src/crimson/net/Errors.cc b/src/crimson/net/Errors.cc
new file mode 100644
index 000000000..d07c090db
--- /dev/null
+++ b/src/crimson/net/Errors.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Errors.h"
+
+namespace crimson::net {
+
+const std::error_category& net_category()
+{
+ struct category : public std::error_category {
+ const char* name() const noexcept override {
+ return "crimson::net";
+ }
+
+ std::string message(int ev) const override {
+ switch (static_cast<error>(ev)) {
+ case error::success:
+ return "success";
+ case error::bad_connect_banner:
+ return "bad connect banner";
+ case error::bad_peer_address:
+ return "bad peer address";
+ case error::negotiation_failure:
+ return "negotiation failure";
+ case error::read_eof:
+ return "read eof";
+ case error::corrupted_message:
+ return "corrupted message";
+ case error::protocol_aborted:
+ return "protocol aborted";
+ default:
+ return "unknown";
+ }
+ }
+ };
+ static category instance;
+ return instance;
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Errors.h b/src/crimson/net/Errors.h
new file mode 100644
index 000000000..3a17a103a
--- /dev/null
+++ b/src/crimson/net/Errors.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <system_error>
+
+namespace crimson::net {
+
+/// net error codes
+enum class error {
+ success = 0,
+ bad_connect_banner,
+ bad_peer_address,
+ negotiation_failure,
+ read_eof,
+ corrupted_message,
+ protocol_aborted,
+};
+
+/// net error category
+const std::error_category& net_category();
+
+inline std::error_code make_error_code(error e)
+{
+ return {static_cast<int>(e), net_category()};
+}
+
+inline std::error_condition make_error_condition(error e)
+{
+ return {static_cast<int>(e), net_category()};
+}
+
+} // namespace crimson::net
+
+namespace std {
+
+/// enables implicit conversion to std::error_condition
+template <>
+struct is_error_condition_enum<crimson::net::error> : public true_type {};
+
+} // namespace std
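+
+// Illustrative sketch (editorial addition, not part of the upstream header):
+// raising and matching one of these codes; is_eof() is a hypothetical helper
+// used only for this example.
+//
+//   [[noreturn]] void raise_eof() {
+//     throw std::system_error(make_error_code(crimson::net::error::read_eof));
+//   }
+//
+//   bool is_eof(const std::system_error& e) {
+//     // the is_error_condition_enum specialization above enables this comparison
+//     return e.code() == crimson::net::error::read_eof;
+//   }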
diff --git a/src/crimson/net/FrameAssemblerV2.cc b/src/crimson/net/FrameAssemblerV2.cc
new file mode 100644
index 000000000..273a6350d
--- /dev/null
+++ b/src/crimson/net/FrameAssemblerV2.cc
@@ -0,0 +1,461 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "FrameAssemblerV2.h"
+
+#include "Errors.h"
+#include "SocketConnection.h"
+
+using ceph::msgr::v2::FrameAssembler;
+using ceph::msgr::v2::FrameError;
+using ceph::msgr::v2::preamble_block_t;
+using ceph::msgr::v2::segment_t;
+using ceph::msgr::v2::Tag;
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+} // namespace anonymous
+
+namespace crimson::net {
+
+FrameAssemblerV2::FrameAssemblerV2(SocketConnection &_conn)
+ : conn{_conn}, sid{seastar::this_shard_id()}
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+}
+
+FrameAssemblerV2::~FrameAssemblerV2()
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ assert(seastar::this_shard_id() == sid);
+ if (has_socket()) {
+ std::ignore = move_socket();
+ }
+}
+
+#ifdef UNIT_TESTS_BUILT
+// should be consistent with intercept() in ProtocolV2.cc
+seastar::future<> FrameAssemblerV2::intercept_frames(
+ std::vector<Breakpoint> bps,
+ bp_type_t type)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ if (!conn.interceptor) {
+ return seastar::now();
+ }
+ return conn.interceptor->intercept(conn, bps
+ ).then([this, type](bp_action_t action) {
+ return seastar::smp::submit_to(
+ socket->get_shard_id(),
+ [this, type, action] {
+ socket->set_trap(type, action, &conn.interceptor->blocker);
+ });
+ });
+}
+#endif
+
+void FrameAssemblerV2::set_is_rev1(bool _is_rev1)
+{
+ assert(seastar::this_shard_id() == sid);
+ is_rev1 = _is_rev1;
+ tx_frame_asm.set_is_rev1(_is_rev1);
+ rx_frame_asm.set_is_rev1(_is_rev1);
+}
+
+void FrameAssemblerV2::create_session_stream_handlers(
+ const AuthConnectionMeta &auth_meta,
+ bool crossed)
+{
+ assert(seastar::this_shard_id() == sid);
+ session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+ nullptr, auth_meta, is_rev1, crossed);
+}
+
+void FrameAssemblerV2::reset_handlers()
+{
+ assert(seastar::this_shard_id() == sid);
+ session_stream_handlers = { nullptr, nullptr };
+ session_comp_handlers = { nullptr, nullptr };
+}
+
+FrameAssemblerV2::mover_t
+FrameAssemblerV2::to_replace()
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(is_socket_valid());
+
+ clear();
+
+ return mover_t{
+ move_socket(),
+ std::move(session_stream_handlers),
+ std::move(session_comp_handlers)};
+}
+
+seastar::future<> FrameAssemblerV2::replace_by(FrameAssemblerV2::mover_t &&mover)
+{
+ assert(seastar::this_shard_id() == sid);
+
+ clear();
+
+ session_stream_handlers = std::move(mover.session_stream_handlers);
+ session_comp_handlers = std::move(mover.session_comp_handlers);
+ if (has_socket()) {
+ return replace_shutdown_socket(std::move(mover.socket));
+ } else {
+ set_socket(std::move(mover.socket));
+ return seastar::now();
+ }
+}
+
+void FrameAssemblerV2::start_recording()
+{
+ assert(seastar::this_shard_id() == sid);
+ record_io = true;
+ rxbuf.clear();
+ txbuf.clear();
+}
+
+FrameAssemblerV2::record_bufs_t
+FrameAssemblerV2::stop_recording()
+{
+ assert(seastar::this_shard_id() == sid);
+ ceph_assert_always(record_io == true);
+ record_io = false;
+ return record_bufs_t{std::move(rxbuf), std::move(txbuf)};
+}
+
+bool FrameAssemblerV2::has_socket() const
+{
+ assert((socket && conn.socket) || (!socket && !conn.socket));
+ return bool(socket);
+}
+
+bool FrameAssemblerV2::is_socket_valid() const
+{
+ assert(seastar::this_shard_id() == sid);
+#ifndef NDEBUG
+ if (has_socket() && socket->get_shard_id() == sid) {
+ assert(socket->is_shutdown() == is_socket_shutdown);
+ }
+#endif
+ return has_socket() && !is_socket_shutdown;
+}
+
+seastar::shard_id
+FrameAssemblerV2::get_socket_shard_id() const
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(is_socket_valid());
+ return socket->get_shard_id();
+}
+
+SocketFRef FrameAssemblerV2::move_socket()
+{
+ assert(has_socket());
+ conn.set_socket(nullptr);
+ return std::move(socket);
+}
+
+void FrameAssemblerV2::set_socket(SocketFRef &&new_socket)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(!has_socket());
+ assert(new_socket);
+ socket = std::move(new_socket);
+ conn.set_socket(socket.get());
+ is_socket_shutdown = false;
+ assert(is_socket_valid());
+}
+
+void FrameAssemblerV2::learn_socket_ephemeral_port_as_connector(uint16_t port)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+  // Note: may be called from a core other than the socket core
+ socket->learn_ephemeral_port_as_connector(port);
+}
+
+template <bool may_cross_core>
+void FrameAssemblerV2::shutdown_socket(crimson::common::Gated *gate)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(is_socket_valid());
+ is_socket_shutdown = true;
+ if constexpr (may_cross_core) {
+ assert(conn.get_messenger_shard_id() == sid);
+ assert(gate);
+ gate->dispatch_in_background("shutdown_socket", conn, [this] {
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this] {
+ socket->shutdown();
+ });
+ });
+ } else {
+ assert(socket->get_shard_id() == sid);
+ assert(!gate);
+ socket->shutdown();
+ }
+}
+template void FrameAssemblerV2::shutdown_socket<true>(crimson::common::Gated *);
+template void FrameAssemblerV2::shutdown_socket<false>(crimson::common::Gated *);
+
+seastar::future<> FrameAssemblerV2::replace_shutdown_socket(SocketFRef &&new_socket)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ assert(!is_socket_valid());
+ auto old_socket = move_socket();
+ auto old_socket_shard_id = old_socket->get_shard_id();
+ set_socket(std::move(new_socket));
+ return seastar::smp::submit_to(
+ old_socket_shard_id,
+ [old_socket = std::move(old_socket)]() mutable {
+ return old_socket->close(
+ ).then([sock = std::move(old_socket)] {});
+ });
+}
+
+seastar::future<> FrameAssemblerV2::close_shutdown_socket()
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ assert(!is_socket_valid());
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this] {
+ return socket->close();
+ });
+}
+
+template <bool may_cross_core>
+seastar::future<ceph::bufferptr>
+FrameAssemblerV2::read_exactly(std::size_t bytes)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ if constexpr (may_cross_core) {
+ assert(conn.get_messenger_shard_id() == sid);
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this, bytes] {
+ return socket->read_exactly(bytes);
+ }).then([this](auto bptr) {
+ if (record_io) {
+ rxbuf.append(bptr);
+ }
+ return bptr;
+ });
+ } else {
+ assert(socket->get_shard_id() == sid);
+ return socket->read_exactly(bytes);
+ }
+}
+template seastar::future<ceph::bufferptr> FrameAssemblerV2::read_exactly<true>(std::size_t);
+template seastar::future<ceph::bufferptr> FrameAssemblerV2::read_exactly<false>(std::size_t);
+
+template <bool may_cross_core>
+seastar::future<ceph::bufferlist>
+FrameAssemblerV2::read(std::size_t bytes)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ if constexpr (may_cross_core) {
+ assert(conn.get_messenger_shard_id() == sid);
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this, bytes] {
+ return socket->read(bytes);
+ }).then([this](auto buf) {
+ if (record_io) {
+ rxbuf.append(buf);
+ }
+ return buf;
+ });
+ } else {
+ assert(socket->get_shard_id() == sid);
+ return socket->read(bytes);
+ }
+}
+template seastar::future<ceph::bufferlist> FrameAssemblerV2::read<true>(std::size_t);
+template seastar::future<ceph::bufferlist> FrameAssemblerV2::read<false>(std::size_t);
+
+template <bool may_cross_core>
+seastar::future<>
+FrameAssemblerV2::write(ceph::bufferlist buf)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ if constexpr (may_cross_core) {
+ assert(conn.get_messenger_shard_id() == sid);
+ if (record_io) {
+ txbuf.append(buf);
+ }
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this, buf = std::move(buf)]() mutable {
+ return socket->write(std::move(buf));
+ });
+ } else {
+ assert(socket->get_shard_id() == sid);
+ return socket->write(std::move(buf));
+ }
+}
+template seastar::future<> FrameAssemblerV2::write<true>(ceph::bufferlist);
+template seastar::future<> FrameAssemblerV2::write<false>(ceph::bufferlist);
+
+template <bool may_cross_core>
+seastar::future<>
+FrameAssemblerV2::flush()
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ if constexpr (may_cross_core) {
+ assert(conn.get_messenger_shard_id() == sid);
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this] {
+ return socket->flush();
+ });
+ } else {
+ assert(socket->get_shard_id() == sid);
+ return socket->flush();
+ }
+}
+template seastar::future<> FrameAssemblerV2::flush<true>();
+template seastar::future<> FrameAssemblerV2::flush<false>();
+
+template <bool may_cross_core>
+seastar::future<>
+FrameAssemblerV2::write_flush(ceph::bufferlist buf)
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(has_socket());
+ if constexpr (may_cross_core) {
+ assert(conn.get_messenger_shard_id() == sid);
+ if (unlikely(record_io)) {
+ txbuf.append(buf);
+ }
+ return seastar::smp::submit_to(
+ socket->get_shard_id(), [this, buf = std::move(buf)]() mutable {
+ return socket->write_flush(std::move(buf));
+ });
+ } else {
+ assert(socket->get_shard_id() == sid);
+ return socket->write_flush(std::move(buf));
+ }
+}
+template seastar::future<> FrameAssemblerV2::write_flush<true>(ceph::bufferlist);
+template seastar::future<> FrameAssemblerV2::write_flush<false>(ceph::bufferlist);
+
+template <bool may_cross_core>
+seastar::future<FrameAssemblerV2::read_main_t>
+FrameAssemblerV2::read_main_preamble()
+{
+ assert(seastar::this_shard_id() == sid);
+ rx_preamble.clear();
+ return read_exactly<may_cross_core>(
+ rx_frame_asm.get_preamble_onwire_len()
+ ).then([this](auto bptr) {
+ rx_preamble.append(std::move(bptr));
+ Tag tag;
+ try {
+ tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+ } catch (FrameError& e) {
+ logger().warn("{} read_main_preamble: {}", conn, e.what());
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+ }
+#ifdef UNIT_TESTS_BUILT
+ return intercept_frame(tag, false
+ ).then([this, tag] {
+ return read_main_t{tag, &rx_frame_asm};
+ });
+#else
+ return read_main_t{tag, &rx_frame_asm};
+#endif
+ });
+}
+template seastar::future<FrameAssemblerV2::read_main_t> FrameAssemblerV2::read_main_preamble<true>();
+template seastar::future<FrameAssemblerV2::read_main_t> FrameAssemblerV2::read_main_preamble<false>();
+
+template <bool may_cross_core>
+seastar::future<FrameAssemblerV2::read_payload_t*>
+FrameAssemblerV2::read_frame_payload()
+{
+ assert(seastar::this_shard_id() == sid);
+ rx_segments_data.clear();
+ return seastar::do_until(
+ [this] {
+ return rx_frame_asm.get_num_segments() == rx_segments_data.size();
+ },
+ [this] {
+ // TODO: create aligned and contiguous buffer from socket
+ const size_t seg_idx = rx_segments_data.size();
+ if (uint16_t alignment = rx_frame_asm.get_segment_align(seg_idx);
+ alignment != segment_t::DEFAULT_ALIGNMENT) {
+ logger().trace("{} cannot allocate {} aligned buffer at segment desc index {}",
+ conn, alignment, rx_segments_data.size());
+ }
+ uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx);
+ // TODO: create aligned and contiguous buffer from socket
+ return read_exactly<may_cross_core>(onwire_len
+ ).then([this](auto bptr) {
+ logger().trace("{} RECV({}) frame segment[{}]",
+ conn, bptr.length(), rx_segments_data.size());
+ bufferlist segment;
+ segment.append(std::move(bptr));
+ rx_segments_data.emplace_back(std::move(segment));
+ });
+ }
+ ).then([this] {
+ return read_exactly<may_cross_core>(rx_frame_asm.get_epilogue_onwire_len());
+ }).then([this](auto bptr) {
+ logger().trace("{} RECV({}) frame epilogue", conn, bptr.length());
+ bool ok = false;
+ try {
+ bufferlist rx_epilogue;
+ rx_epilogue.append(std::move(bptr));
+ ok = rx_frame_asm.disassemble_segments(rx_preamble, rx_segments_data.data(), rx_epilogue);
+ } catch (FrameError& e) {
+ logger().error("read_frame_payload: {} {}", conn, e.what());
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+ } catch (ceph::crypto::onwire::MsgAuthError&) {
+ logger().error("read_frame_payload: {} bad auth tag", conn);
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+ }
+    // There is a mechanism that allows the transmitter to start sending a
+    // message and abort after putting the entire data field on the wire. This
+    // will be used by the kernel client to avoid unnecessary buffering.
+ if (!ok) {
+ ceph_abort("TODO");
+ }
+ return &rx_segments_data;
+ });
+}
+template seastar::future<FrameAssemblerV2::read_payload_t*> FrameAssemblerV2::read_frame_payload<true>();
+template seastar::future<FrameAssemblerV2::read_payload_t*> FrameAssemblerV2::read_frame_payload<false>();
+
+void FrameAssemblerV2::log_main_preamble(const ceph::bufferlist &bl)
+{
+ const auto main_preamble =
+ reinterpret_cast<const preamble_block_t*>(bl.front().c_str());
+ logger().trace("{} SEND({}) frame: tag={}, num_segments={}, crc={}",
+ conn, bl.length(), (int)main_preamble->tag,
+ (int)main_preamble->num_segments, main_preamble->crc);
+}
+
+FrameAssemblerV2Ref FrameAssemblerV2::create(SocketConnection &conn)
+{
+ return std::make_unique<FrameAssemblerV2>(conn);
+}
+
+void FrameAssemblerV2::clear()
+{
+ record_io = false;
+ rxbuf.clear();
+ txbuf.clear();
+ rx_preamble.clear();
+ rx_segments_data.clear();
+}
+
+} // namespace crimson::net
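+
+// Illustrative sketch (editorial addition, not part of the upstream source):
+// the typical calling pattern from the protocol code -- read the preamble,
+// check the tag, then read and decode the frame payload. handle_keepalive() is
+// a hypothetical helper returning seastar::future<>, and namespace qualifiers
+// are elided for brevity.
+//
+//   seastar::future<> read_one_frame(FrameAssemblerV2 &fa) {
+//     return fa.read_main_preamble(
+//     ).then([&fa](FrameAssemblerV2::read_main_t ret) {
+//       if (ret.tag != Tag::KEEPALIVE2) {
+//         throw std::system_error(
+//             make_error_code(crimson::net::error::protocol_aborted));
+//       }
+//       return fa.read_frame_payload(
+//       ).then([](FrameAssemblerV2::read_payload_t *payload) {
+//         auto keepalive = KeepAliveFrame::Decode(payload->back());
+//         return handle_keepalive(keepalive);
+//       });
+//     });
+//   }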
diff --git a/src/crimson/net/FrameAssemblerV2.h b/src/crimson/net/FrameAssemblerV2.h
new file mode 100644
index 000000000..9c89c144e
--- /dev/null
+++ b/src/crimson/net/FrameAssemblerV2.h
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "msg/async/frames_v2.h"
+#include "msg/async/crypto_onwire.h"
+#include "msg/async/compression_onwire.h"
+
+#include "crimson/common/gated.h"
+#include "crimson/net/Socket.h"
+
+#ifdef UNIT_TESTS_BUILT
+#include "Interceptor.h"
+#endif
+
+namespace crimson::net {
+
+class SocketConnection;
+class FrameAssemblerV2;
+using FrameAssemblerV2Ref = std::unique_ptr<FrameAssemblerV2>;
+
+class FrameAssemblerV2 {
+public:
+ FrameAssemblerV2(SocketConnection &conn);
+
+ ~FrameAssemblerV2();
+
+ FrameAssemblerV2(const FrameAssemblerV2 &) = delete;
+
+ FrameAssemblerV2(FrameAssemblerV2 &&) = delete;
+
+ void set_shard_id(seastar::shard_id _sid) {
+ assert(seastar::this_shard_id() == sid);
+ clear();
+ sid = _sid;
+ }
+
+ seastar::shard_id get_shard_id() const {
+ return sid;
+ }
+
+ void set_is_rev1(bool is_rev1);
+
+ void create_session_stream_handlers(
+ const AuthConnectionMeta &auth_meta,
+ bool crossed);
+
+ void reset_handlers();
+
+ /*
+ * replacing
+ */
+
+ struct mover_t {
+ SocketFRef socket;
+ ceph::crypto::onwire::rxtx_t session_stream_handlers;
+ ceph::compression::onwire::rxtx_t session_comp_handlers;
+ };
+
+ mover_t to_replace();
+
+ seastar::future<> replace_by(mover_t &&);
+
+ /*
+ * auth signature interfaces
+ */
+
+ void start_recording();
+
+ struct record_bufs_t {
+ ceph::bufferlist rxbuf;
+ ceph::bufferlist txbuf;
+ };
+ record_bufs_t stop_recording();
+
+ /*
+   * socket maintenance interfaces
+ */
+
+  // the socket exists and is not shut down
+ bool is_socket_valid() const;
+
+ seastar::shard_id get_socket_shard_id() const;
+
+ void set_socket(SocketFRef &&);
+
+ void learn_socket_ephemeral_port_as_connector(uint16_t port);
+
+ // if may_cross_core == true, gate is required for cross-core shutdown
+ template <bool may_cross_core>
+ void shutdown_socket(crimson::common::Gated *gate);
+
+ seastar::future<> replace_shutdown_socket(SocketFRef &&);
+
+ seastar::future<> close_shutdown_socket();
+
+ /*
+ * socket read and write interfaces
+ */
+
+ template <bool may_cross_core = true>
+ seastar::future<ceph::bufferptr> read_exactly(std::size_t bytes);
+
+ template <bool may_cross_core = true>
+ seastar::future<ceph::bufferlist> read(std::size_t bytes);
+
+ template <bool may_cross_core = true>
+ seastar::future<> write(ceph::bufferlist);
+
+ template <bool may_cross_core = true>
+ seastar::future<> flush();
+
+ template <bool may_cross_core = true>
+ seastar::future<> write_flush(ceph::bufferlist);
+
+ /*
+ * frame read and write interfaces
+ */
+
+ /// may throw negotiation_failure as fault
+ struct read_main_t {
+ ceph::msgr::v2::Tag tag;
+ const ceph::msgr::v2::FrameAssembler *rx_frame_asm;
+ };
+ template <bool may_cross_core = true>
+ seastar::future<read_main_t> read_main_preamble();
+
+ /// may throw negotiation_failure as fault
+ using read_payload_t = ceph::msgr::v2::segment_bls_t;
+ // FIXME: read_payload_t cannot be no-throw move constructible
+ template <bool may_cross_core = true>
+ seastar::future<read_payload_t*> read_frame_payload();
+
+ template <class F>
+ ceph::bufferlist get_buffer(F &tx_frame) {
+ assert(seastar::this_shard_id() == sid);
+ auto bl = tx_frame.get_buffer(tx_frame_asm);
+ log_main_preamble(bl);
+ return bl;
+ }
+
+ template <class F, bool may_cross_core = true>
+ seastar::future<> write_flush_frame(F &tx_frame) {
+ assert(seastar::this_shard_id() == sid);
+ auto bl = get_buffer(tx_frame);
+#ifdef UNIT_TESTS_BUILT
+ return intercept_frame(F::tag, true
+ ).then([this, bl=std::move(bl)]() mutable {
+ return write_flush<may_cross_core>(std::move(bl));
+ });
+#else
+ return write_flush<may_cross_core>(std::move(bl));
+#endif
+ }
+
+ static FrameAssemblerV2Ref create(SocketConnection &conn);
+
+#ifdef UNIT_TESTS_BUILT
+ seastar::future<> intercept_frames(
+ std::vector<ceph::msgr::v2::Tag> tags,
+ bool is_write) {
+ auto type = is_write ? bp_type_t::WRITE : bp_type_t::READ;
+ std::vector<Breakpoint> bps;
+ for (auto &tag : tags) {
+ bps.emplace_back(Breakpoint{tag, type});
+ }
+ return intercept_frames(bps, type);
+ }
+
+ seastar::future<> intercept_frame(
+ ceph::msgr::v2::Tag tag,
+ bool is_write) {
+ auto type = is_write ? bp_type_t::WRITE : bp_type_t::READ;
+ std::vector<Breakpoint> bps;
+ bps.emplace_back(Breakpoint{tag, type});
+ return intercept_frames(bps, type);
+ }
+
+ seastar::future<> intercept_frame(
+ custom_bp_t bp,
+ bool is_write) {
+ auto type = is_write ? bp_type_t::WRITE : bp_type_t::READ;
+ std::vector<Breakpoint> bps;
+ bps.emplace_back(Breakpoint{bp});
+ return intercept_frames(bps, type);
+ }
+#endif
+
+private:
+#ifdef UNIT_TESTS_BUILT
+ seastar::future<> intercept_frames(
+ std::vector<Breakpoint> bps,
+ bp_type_t type);
+#endif
+
+ bool has_socket() const;
+
+ SocketFRef move_socket();
+
+ void clear();
+
+ void log_main_preamble(const ceph::bufferlist &bl);
+
+ SocketConnection &conn;
+
+ SocketFRef socket;
+
+  // checking Socket::is_shutdown() synchronously is impossible when sid
+  // differs from the socket's shard id.
+ bool is_socket_shutdown = false;
+
+  // the current working shard, which can be the messenger or the socket shard.
+  // if it is the messenger shard, interfaces should be called with may_cross_core = true.
+ seastar::shard_id sid;
+
+ /*
+ * auth signature
+ *
+ * only in the messenger core
+ */
+
+ bool record_io = false;
+
+ ceph::bufferlist rxbuf;
+
+ ceph::bufferlist txbuf;
+
+ /*
+ * frame data and handlers
+ */
+
+ ceph::crypto::onwire::rxtx_t session_stream_handlers = { nullptr, nullptr };
+
+ // TODO
+ ceph::compression::onwire::rxtx_t session_comp_handlers = { nullptr, nullptr };
+
+ bool is_rev1 = false;
+
+ ceph::msgr::v2::FrameAssembler tx_frame_asm{
+ &session_stream_handlers, is_rev1, common::local_conf()->ms_crc_data,
+ &session_comp_handlers};
+
+ ceph::msgr::v2::FrameAssembler rx_frame_asm{
+ &session_stream_handlers, is_rev1, common::local_conf()->ms_crc_data,
+ &session_comp_handlers};
+
+  // used in the messenger core during handshake and in the socket core once
+  // open; must be cleared before switching cores.
+
+ ceph::bufferlist rx_preamble;
+
+ read_payload_t rx_segments_data;
+};
+
+} // namespace crimson::net
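+
+// Illustrative sketch (editorial addition, not part of the upstream header):
+// handing the socket and the onwire handlers over from an existing assembler
+// to the replacing one during connection replacement; the existing/replacing
+// names are hypothetical and namespace qualifiers are elided.
+//
+//   seastar::future<> hand_over(FrameAssemblerV2 &existing,
+//                               FrameAssemblerV2 &replacing) {
+//     auto mover = existing.to_replace();  // detaches the socket and handlers
+//     return replacing.replace_by(std::move(mover));
+//   }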
diff --git a/src/crimson/net/Fwd.h b/src/crimson/net/Fwd.h
new file mode 100644
index 000000000..2b1595141
--- /dev/null
+++ b/src/crimson/net/Fwd.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/container/small_vector.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sharded.hh>
+
+#include "msg/Connection.h"
+#include "msg/MessageRef.h"
+#include "msg/msg_types.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/local_shared_foreign_ptr.h"
+
+class AuthConnectionMeta;
+
+namespace crimson::net {
+
+using msgr_tag_t = uint8_t;
+using stop_t = seastar::stop_iteration;
+
+class Connection;
+using ConnectionLRef = seastar::shared_ptr<Connection>;
+using ConnectionFRef = seastar::foreign_ptr<ConnectionLRef>;
+using ConnectionRef = ::crimson::local_shared_foreign_ptr<ConnectionLRef>;
+
+class Dispatcher;
+class ChainedDispatchers;
+constexpr std::size_t NUM_DISPATCHERS = 4u;
+using dispatchers_t = boost::container::small_vector<Dispatcher*, NUM_DISPATCHERS>;
+
+class Messenger;
+using MessengerRef = seastar::shared_ptr<Messenger>;
+
+using MessageFRef = seastar::foreign_ptr<MessageURef>;
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Interceptor.h b/src/crimson/net/Interceptor.h
new file mode 100644
index 000000000..35b74e243
--- /dev/null
+++ b/src/crimson/net/Interceptor.h
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/sleep.hh>
+
+#include "Fwd.h"
+#include "msg/async/frames_v2.h"
+
+namespace crimson::net {
+
+enum class custom_bp_t : uint8_t {
+ BANNER_WRITE = 0,
+ BANNER_READ,
+ BANNER_PAYLOAD_READ,
+ SOCKET_CONNECTING,
+ SOCKET_ACCEPTED
+};
+inline const char* get_bp_name(custom_bp_t bp) {
+ uint8_t index = static_cast<uint8_t>(bp);
+ static const char *const bp_names[] = {"BANNER_WRITE",
+ "BANNER_READ",
+ "BANNER_PAYLOAD_READ",
+ "SOCKET_CONNECTING",
+ "SOCKET_ACCEPTED"};
+ assert(index < std::size(bp_names));
+ return bp_names[index];
+}
+
+enum class bp_type_t {
+ READ = 0,
+ WRITE
+};
+
+enum class bp_action_t {
+ CONTINUE = 0,
+ FAULT,
+ BLOCK,
+ STALL
+};
+
+class socket_blocker {
+ std::optional<seastar::abort_source> p_blocked;
+ std::optional<seastar::abort_source> p_unblocked;
+ const seastar::shard_id primary_sid;
+
+ public:
+ socket_blocker() : primary_sid{seastar::this_shard_id()} {}
+
+ seastar::future<> wait_blocked() {
+ ceph_assert(seastar::this_shard_id() == primary_sid);
+ ceph_assert(!p_blocked);
+ if (p_unblocked) {
+ return seastar::make_ready_future<>();
+ } else {
+ p_blocked = seastar::abort_source();
+ return seastar::sleep_abortable(
+ std::chrono::seconds(10), *p_blocked
+ ).then([] {
+ throw std::runtime_error(
+ "Timeout (10s) in socket_blocker::wait_blocked()");
+ }).handle_exception_type([] (const seastar::sleep_aborted& e) {
+ // wait done!
+ });
+ }
+ }
+
+ seastar::future<> block() {
+ return seastar::smp::submit_to(primary_sid, [this] {
+ if (p_blocked) {
+ p_blocked->request_abort();
+ p_blocked = std::nullopt;
+ }
+ ceph_assert(!p_unblocked);
+ p_unblocked = seastar::abort_source();
+ return seastar::sleep_abortable(
+ std::chrono::seconds(10), *p_unblocked
+ ).then([] {
+ ceph_abort("Timeout (10s) in socket_blocker::block()");
+ }).handle_exception_type([] (const seastar::sleep_aborted& e) {
+ // wait done!
+ });
+ });
+ }
+
+ void unblock() {
+ ceph_assert(seastar::this_shard_id() == primary_sid);
+ ceph_assert(!p_blocked);
+ ceph_assert(p_unblocked);
+ p_unblocked->request_abort();
+ p_unblocked = std::nullopt;
+ }
+};
+
+struct tag_bp_t {
+ ceph::msgr::v2::Tag tag;
+ bp_type_t type;
+ bool operator==(const tag_bp_t& x) const {
+ return tag == x.tag && type == x.type;
+ }
+ bool operator!=(const tag_bp_t& x) const { return !operator==(x); }
+ bool operator<(const tag_bp_t& x) const {
+ return std::tie(tag, type) < std::tie(x.tag, x.type);
+ }
+};
+
+struct Breakpoint {
+ using var_t = std::variant<custom_bp_t, tag_bp_t>;
+ var_t bp;
+ Breakpoint(custom_bp_t bp) : bp(bp) { }
+ Breakpoint(ceph::msgr::v2::Tag tag, bp_type_t type)
+ : bp(tag_bp_t{tag, type}) { }
+ bool operator==(const Breakpoint& x) const { return bp == x.bp; }
+ bool operator!=(const Breakpoint& x) const { return !operator==(x); }
+ bool operator==(const custom_bp_t& x) const { return bp == var_t(x); }
+ bool operator!=(const custom_bp_t& x) const { return !operator==(x); }
+ bool operator==(const tag_bp_t& x) const { return bp == var_t(x); }
+ bool operator!=(const tag_bp_t& x) const { return !operator==(x); }
+ bool operator<(const Breakpoint& x) const { return bp < x.bp; }
+};
+
+struct Interceptor {
+ socket_blocker blocker;
+ virtual ~Interceptor() {}
+ virtual void register_conn(ConnectionRef) = 0;
+ virtual void register_conn_ready(ConnectionRef) = 0;
+ virtual void register_conn_closed(ConnectionRef) = 0;
+ virtual void register_conn_replaced(ConnectionRef) = 0;
+
+ virtual seastar::future<bp_action_t>
+ intercept(Connection&, std::vector<Breakpoint> bp) = 0;
+};
+
+} // namespace crimson::net
+
+template<>
+struct fmt::formatter<crimson::net::bp_action_t> : fmt::formatter<std::string_view> {
+ template <typename FormatContext>
+ auto format(const crimson::net::bp_action_t& action, FormatContext& ctx) const {
+ static const char *const action_names[] = {"CONTINUE",
+ "FAULT",
+ "BLOCK",
+ "STALL"};
+ assert(static_cast<size_t>(action) < std::size(action_names));
+ return formatter<std::string_view>::format(action_names[static_cast<size_t>(action)], ctx);
+ }
+};
+
+template<>
+struct fmt::formatter<crimson::net::Breakpoint> : fmt::formatter<std::string_view> {
+ template <typename FormatContext>
+ auto format(const crimson::net::Breakpoint& bp, FormatContext& ctx) const {
+ if (auto custom_bp = std::get_if<crimson::net::custom_bp_t>(&bp.bp)) {
+ return formatter<std::string_view>::format(crimson::net::get_bp_name(*custom_bp), ctx);
+ }
+ auto tag_bp = std::get<crimson::net::tag_bp_t>(bp.bp);
+ static const char *const tag_names[] = {"NONE",
+ "HELLO",
+ "AUTH_REQUEST",
+ "AUTH_BAD_METHOD",
+ "AUTH_REPLY_MORE",
+ "AUTH_REQUEST_MORE",
+ "AUTH_DONE",
+ "AUTH_SIGNATURE",
+ "CLIENT_IDENT",
+ "SERVER_IDENT",
+ "IDENT_MISSING_FEATURES",
+ "SESSION_RECONNECT",
+ "SESSION_RESET",
+ "SESSION_RETRY",
+ "SESSION_RETRY_GLOBAL",
+ "SESSION_RECONNECT_OK",
+ "WAIT",
+ "MESSAGE",
+ "KEEPALIVE2",
+ "KEEPALIVE2_ACK",
+ "ACK"};
+ assert(static_cast<size_t>(tag_bp.tag) < std::size(tag_names));
+ return fmt::format_to(ctx.out(), "{}_{}",
+ tag_names[static_cast<size_t>(tag_bp.tag)],
+ tag_bp.type == crimson::net::bp_type_t::WRITE ? "WRITE" : "READ");
+ }
+};
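+
+// Illustrative sketch (editorial addition, not part of the upstream header):
+// a test-only interceptor that injects a FAULT on the first MESSAGE write
+// breakpoint and lets everything else continue. FaultOnce is a hypothetical
+// name and namespace qualifiers are elided.
+//
+//   struct FaultOnce final : crimson::net::Interceptor {
+//     bool injected = false;
+//     void register_conn(ConnectionRef) override {}
+//     void register_conn_ready(ConnectionRef) override {}
+//     void register_conn_closed(ConnectionRef) override {}
+//     void register_conn_replaced(ConnectionRef) override {}
+//     seastar::future<bp_action_t>
+//     intercept(Connection&, std::vector<Breakpoint> bps) override {
+//       for (auto &bp : bps) {
+//         if (!injected && bp == tag_bp_t{ceph::msgr::v2::Tag::MESSAGE,
+//                                         bp_type_t::WRITE}) {
+//           injected = true;
+//           return seastar::make_ready_future<bp_action_t>(bp_action_t::FAULT);
+//         }
+//       }
+//       return seastar::make_ready_future<bp_action_t>(bp_action_t::CONTINUE);
+//     }
+//   };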
diff --git a/src/crimson/net/Messenger.cc b/src/crimson/net/Messenger.cc
new file mode 100644
index 000000000..1af198589
--- /dev/null
+++ b/src/crimson/net/Messenger.cc
@@ -0,0 +1,19 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Messenger.h"
+#include "SocketMessenger.h"
+
+namespace crimson::net {
+
+MessengerRef
+Messenger::create(const entity_name_t& name,
+ const std::string& lname,
+ uint64_t nonce,
+ bool dispatch_only_on_this_shard)
+{
+ return seastar::make_shared<SocketMessenger>(
+ name, lname, nonce, dispatch_only_on_this_shard);
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Messenger.h b/src/crimson/net/Messenger.h
new file mode 100644
index 000000000..74df062d8
--- /dev/null
+++ b/src/crimson/net/Messenger.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "Fwd.h"
+#include "crimson/common/throttle.h"
+#include "msg/Message.h"
+#include "msg/Policy.h"
+
+class AuthAuthorizer;
+
+namespace crimson::auth {
+class AuthClient;
+class AuthServer;
+}
+
+namespace crimson::net {
+
+#ifdef UNIT_TESTS_BUILT
+class Interceptor;
+#endif
+
+using Throttle = crimson::common::Throttle;
+using SocketPolicy = ceph::net::Policy<Throttle>;
+
+class Messenger {
+public:
+ Messenger() {}
+
+ virtual ~Messenger() {}
+
+ virtual const entity_name_t& get_myname() const = 0;
+
+ entity_type_t get_mytype() const { return get_myname().type(); }
+
+ virtual const entity_addrvec_t &get_myaddrs() const = 0;
+
+ entity_addr_t get_myaddr() const { return get_myaddrs().front(); }
+
+ virtual void set_myaddrs(const entity_addrvec_t& addrs) = 0;
+
+ virtual bool set_addr_unknowns(const entity_addrvec_t &addrs) = 0;
+
+ virtual void set_auth_client(crimson::auth::AuthClient *) = 0;
+
+ virtual void set_auth_server(crimson::auth::AuthServer *) = 0;
+
+ using bind_ertr = crimson::errorator<
+ crimson::ct_error::address_in_use, // The address (range) is already bound
+ crimson::ct_error::address_not_available
+ >;
+ /// bind to the given address
+ virtual bind_ertr::future<> bind(const entity_addrvec_t& addr) = 0;
+
+ /// start the messenger
+ virtual seastar::future<> start(const dispatchers_t&) = 0;
+
+ /// either return an existing connection to the peer,
+ /// or a new pending connection
+ virtual ConnectionRef
+ connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) = 0;
+
+ ConnectionRef
+ connect(const entity_addr_t& peer_addr,
+ const entity_type_t& peer_type) {
+ return connect(peer_addr, entity_name_t(peer_type, -1));
+ }
+
+ virtual bool owns_connection(Connection &) const = 0;
+
+ // wait for messenger shutdown
+ virtual seastar::future<> wait() = 0;
+
+ // stop dispatching events and messages
+ virtual void stop() = 0;
+
+ virtual bool is_started() const = 0;
+
+ // free internal resources before destruction, must be called after stopped,
+ // and must be called if is bound.
+ virtual seastar::future<> shutdown() = 0;
+
+ virtual void print(std::ostream& out) const = 0;
+
+ virtual SocketPolicy get_policy(entity_type_t peer_type) const = 0;
+
+ virtual SocketPolicy get_default_policy() const = 0;
+
+ virtual void set_default_policy(const SocketPolicy& p) = 0;
+
+ virtual void set_policy(entity_type_t peer_type, const SocketPolicy& p) = 0;
+
+ virtual void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) = 0;
+
+ static MessengerRef
+ create(const entity_name_t& name,
+ const std::string& lname,
+ uint64_t nonce,
+ bool dispatch_only_on_this_shard);
+
+#ifdef UNIT_TESTS_BUILT
+ virtual void set_interceptor(Interceptor *) = 0;
+#endif
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Messenger& msgr) {
+ out << "[";
+ msgr.print(out);
+ out << "]";
+ return out;
+}
+
+} // namespace crimson::net
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::net::Messenger> : fmt::ostream_formatter {};
+#endif
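+
+// Illustrative sketch (editorial addition, not part of the upstream header):
+// connecting to an OSD and sending a message. The ping_osd name, the chosen
+// policy and the crimson::make_message<MPing>() call are assumptions for the
+// example only.
+//
+//   seastar::future<> ping_osd(crimson::net::MessengerRef msgr,
+//                              const entity_addr_t &osd_addr) {
+//     msgr->set_policy(CEPH_ENTITY_TYPE_OSD,
+//                      crimson::net::SocketPolicy::lossy_client(0));
+//     auto conn = msgr->connect(osd_addr, CEPH_ENTITY_TYPE_OSD);
+//     return conn->send(crimson::make_message<MPing>());
+//   }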
diff --git a/src/crimson/net/ProtocolV2.cc b/src/crimson/net/ProtocolV2.cc
new file mode 100644
index 000000000..55b669384
--- /dev/null
+++ b/src/crimson/net/ProtocolV2.cc
@@ -0,0 +1,2348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV2.h"
+
+#include <fmt/format.h>
+#include <fmt/ranges.h>
+#include "include/msgr.h"
+#include "include/random.h"
+#include "msg/msg_fmt.h"
+
+#include "crimson/auth/AuthClient.h"
+#include "crimson/auth/AuthServer.h"
+#include "crimson/common/formatter.h"
+#include "crimson/common/log.h"
+
+#include "Errors.h"
+#include "SocketMessenger.h"
+
+using namespace ceph::msgr::v2;
+using crimson::common::local_conf;
+
+namespace {
+
+// TODO: CEPH_MSGR2_FEATURE_COMPRESSION
+const uint64_t CRIMSON_MSGR2_SUPPORTED_FEATURES =
+ (CEPH_MSGR2_FEATURE_REVISION_1 |
+ // CEPH_MSGR2_FEATURE_COMPRESSION |
+ UINT64_C(0));
+
+// Log levels in the V2 protocol:
+// * error level: errors that cause the connection to terminate:
+//   - fatal errors;
+//   - bugs;
+// * warn level: something unusual that indicates a connection fault or replacement:
+//   - unstable network;
+//   - incompatible peer;
+//   - auth failure;
+//   - connection race;
+//   - connection reset;
+// * info level: something very important in the connection lifecycle,
+//   which doesn't happen very frequently;
+// * debug level: important logs for debugging, including:
+//   - all the messages sent/received (-->/<==);
+//   - all the frames exchanged (WRITE/GOT);
+//   - important fields updated (UPDATE);
+//   - connection state transitions (TRIGGER);
+// * trace level: trivial logs showing:
+//   - the exact bytes being sent/received (SEND/RECV(bytes));
+//   - detailed information of sub-frames;
+//   - integrity checks;
+//   - etc.
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+[[noreturn]] void abort_in_fault() {
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+}
+
+[[noreturn]] void abort_protocol() {
+ throw std::system_error(make_error_code(crimson::net::error::protocol_aborted));
+}
+
+#define ABORT_IN_CLOSE(is_dispatch_reset) { \
+ do_close(is_dispatch_reset); \
+ abort_protocol(); \
+}
+
+inline void expect_tag(const Tag& expected,
+ const Tag& actual,
+ crimson::net::SocketConnection& conn,
+ const char *where) {
+ if (actual != expected) {
+ logger().warn("{} {} received wrong tag: {}, expected {}",
+ conn, where,
+ static_cast<uint32_t>(actual),
+ static_cast<uint32_t>(expected));
+ abort_in_fault();
+ }
+}
+
+inline void unexpected_tag(const Tag& unexpected,
+ crimson::net::SocketConnection& conn,
+ const char *where) {
+ logger().warn("{} {} received unexpected tag: {}",
+ conn, where, static_cast<uint32_t>(unexpected));
+ abort_in_fault();
+}
+
+inline uint64_t generate_client_cookie() {
+ return ceph::util::generate_random_number<uint64_t>(
+ 1, std::numeric_limits<uint64_t>::max());
+}
+
+} // namespace anonymous
+
+namespace crimson::net {
+
+seastar::future<> ProtocolV2::Timer::backoff(double seconds)
+{
+ logger().warn("{} waiting {} seconds ...", conn, seconds);
+ cancel();
+ last_dur_ = seconds;
+ as = seastar::abort_source();
+ auto dur = std::chrono::duration_cast<seastar::lowres_clock::duration>(
+ std::chrono::duration<double>(seconds));
+ return seastar::sleep_abortable(dur, *as
+ ).handle_exception_type([this] (const seastar::sleep_aborted& e) {
+ logger().debug("{} wait aborted", conn);
+ abort_protocol();
+ });
+}
+
+ProtocolV2::ProtocolV2(SocketConnection& conn,
+ IOHandler &io_handler)
+ : conn{conn},
+ messenger{conn.messenger},
+ io_handler{io_handler},
+ frame_assembler{FrameAssemblerV2::create(conn)},
+ auth_meta{seastar::make_lw_shared<AuthConnectionMeta>()},
+ protocol_timer{conn}
+{
+ io_states = io_handler.get_states();
+}
+
+ProtocolV2::~ProtocolV2() {}
+
+void ProtocolV2::start_connect(const entity_addr_t& _peer_addr,
+ const entity_name_t& _peer_name)
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ ceph_assert(state == state_t::NONE);
+ ceph_assert(!gate.is_closed());
+ conn.peer_addr = _peer_addr;
+ conn.target_addr = _peer_addr;
+ conn.set_peer_name(_peer_name);
+ conn.policy = messenger.get_policy(_peer_name.type());
+ client_cookie = generate_client_cookie();
+ logger().info("{} ProtocolV2::start_connect(): peer_addr={}, peer_name={}, cc={}"
+ " policy(lossy={}, server={}, standby={}, resetcheck={})",
+ conn, _peer_addr, _peer_name, client_cookie,
+ conn.policy.lossy, conn.policy.server,
+ conn.policy.standby, conn.policy.resetcheck);
+ messenger.register_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ execute_connecting();
+}
+
+void ProtocolV2::start_accept(SocketFRef&& new_socket,
+ const entity_addr_t& _peer_addr)
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ ceph_assert(state == state_t::NONE);
+ // until we know better
+ conn.target_addr = _peer_addr;
+ frame_assembler->set_socket(std::move(new_socket));
+ has_socket = true;
+ is_socket_valid = true;
+ logger().info("{} ProtocolV2::start_accept(): target_addr={}", conn, _peer_addr);
+ messenger.accept_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+
+ auto cc_seq = crosscore.prepare_submit();
+ gate.dispatch_in_background("set_accepted_sid", conn, [this, cc_seq] {
+ return io_handler.set_accepted_sid(
+ cc_seq,
+ frame_assembler->get_socket_shard_id(),
+ seastar::make_foreign(conn.shared_from_this()));
+ });
+
+ execute_accepting();
+}
+
+void ProtocolV2::trigger_state_phase1(state_t new_state)
+{
+ ceph_assert_always(!gate.is_closed());
+ if (new_state == state) {
+ logger().error("{} is not allowed to re-trigger state {}",
+ conn, get_state_name(state));
+ ceph_abort();
+ }
+ if (state == state_t::CLOSING) {
+ logger().error("{} CLOSING is not allowed to trigger state {}",
+ conn, get_state_name(new_state));
+ ceph_abort();
+ }
+ logger().debug("{} TRIGGER {}, was {}",
+ conn, get_state_name(new_state), get_state_name(state));
+
+ if (state == state_t::READY) {
+ // from READY
+ ceph_assert_always(!need_exit_io);
+ ceph_assert_always(!pr_exit_io.has_value());
+ need_exit_io = true;
+ pr_exit_io = seastar::shared_promise<>();
+ }
+
+ if (new_state == state_t::STANDBY && !conn.policy.server) {
+ need_notify_out = true;
+ } else {
+ need_notify_out = false;
+ }
+
+ state = new_state;
+}
+
+void ProtocolV2::trigger_state_phase2(
+ state_t new_state, io_state_t new_io_state)
+{
+ ceph_assert_always(new_state == state);
+ ceph_assert_always(!gate.is_closed());
+ ceph_assert_always(!pr_switch_io_shard.has_value());
+
+ FrameAssemblerV2Ref fa;
+ if (new_state == state_t::READY) {
+ assert(new_io_state == io_state_t::open);
+ assert(io_handler.get_shard_id() ==
+ frame_assembler->get_socket_shard_id());
+ frame_assembler->set_shard_id(io_handler.get_shard_id());
+ fa = std::move(frame_assembler);
+ } else {
+ assert(new_io_state != io_state_t::open);
+ }
+
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::set_io_state(): new_state={}, new_io_state={}, "
+ "fa={}, set_notify_out={}",
+ conn, cc_seq, get_state_name(new_state), new_io_state,
+ fa ? fmt::format("(sid={})", fa->get_shard_id()) : "N/A",
+ need_notify_out);
+ gate.dispatch_in_background(
+ "set_io_state", conn,
+ [this, cc_seq, new_io_state, fa=std::move(fa)]() mutable {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(),
+ [this, cc_seq, new_io_state,
+ fa=std::move(fa), set_notify_out=need_notify_out]() mutable {
+ return io_handler.set_io_state(
+ cc_seq, new_io_state, std::move(fa), set_notify_out);
+ });
+ });
+
+ if (need_exit_io) {
+ // from READY
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::wait_io_exit_dispatching() ...",
+ conn, cc_seq);
+ assert(pr_exit_io.has_value());
+ assert(new_io_state != io_state_t::open);
+ need_exit_io = false;
+ gate.dispatch_in_background("exit_io", conn, [this, cc_seq] {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(), [this, cc_seq] {
+ return io_handler.wait_io_exit_dispatching(cc_seq);
+ }).then([this, cc_seq](auto ret) {
+ logger().debug("{} finish {} IOHandler::wait_io_exit_dispatching(), {}",
+ conn, cc_seq, ret.io_states);
+ frame_assembler = std::move(ret.frame_assembler);
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ ceph_assert_always(
+ seastar::this_shard_id() == frame_assembler->get_shard_id());
+ ceph_assert_always(!frame_assembler->is_socket_valid());
+ assert(!need_exit_io);
+ io_states = ret.io_states;
+ pr_exit_io->set_value();
+ pr_exit_io = std::nullopt;
+ });
+ });
+ }
+}
+
+void ProtocolV2::fault(
+ state_t expected_state,
+ const char *where,
+ std::exception_ptr eptr)
+{
+ assert(expected_state == state_t::CONNECTING ||
+ expected_state == state_t::ESTABLISHING ||
+ expected_state == state_t::REPLACING ||
+ expected_state == state_t::READY);
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+
+ if (state != expected_state) {
+ logger().info("{} protocol {} {} is aborted at inconsistent {} -- {}",
+ conn,
+ get_state_name(expected_state),
+ where,
+ get_state_name(state),
+ e_what);
+#ifndef NDEBUG
+ if (expected_state == state_t::REPLACING) {
+ assert(state == state_t::CLOSING);
+ } else if (expected_state == state_t::READY) {
+ assert(state == state_t::CLOSING ||
+ state == state_t::REPLACING ||
+ state == state_t::CONNECTING ||
+ state == state_t::STANDBY);
+ } else {
+ assert(state == state_t::CLOSING ||
+ state == state_t::REPLACING);
+ }
+#endif
+ return;
+ }
+ assert(state == expected_state);
+
+ if (state != state_t::CONNECTING && conn.policy.lossy) {
+ // socket will be shutdown in do_close()
+ logger().info("{} protocol {} {} fault on lossy channel, going to CLOSING -- {}",
+ conn, get_state_name(state), where, e_what);
+ do_close(true);
+ return;
+ }
+
+ if (likely(has_socket)) {
+ if (likely(is_socket_valid)) {
+ ceph_assert_always(state != state_t::READY);
+ frame_assembler->shutdown_socket<true>(&gate);
+ is_socket_valid = false;
+ } else {
+ ceph_assert_always(state != state_t::ESTABLISHING);
+ }
+ } else { // !has_socket
+ ceph_assert_always(state == state_t::CONNECTING);
+ assert(!is_socket_valid);
+ }
+
+ if (conn.policy.server ||
+ (conn.policy.standby && !io_states.is_out_queued_or_sent())) {
+ if (conn.policy.server) {
+ logger().info("{} protocol {} {} fault as server, going to STANDBY {} -- {}",
+ conn,
+ get_state_name(state),
+ where,
+ io_states,
+ e_what);
+ } else {
+ logger().info("{} protocol {} {} fault with nothing to send, going to STANDBY {} -- {}",
+ conn,
+ get_state_name(state),
+ where,
+ io_states,
+ e_what);
+ }
+ execute_standby();
+ } else if (state == state_t::CONNECTING ||
+ state == state_t::REPLACING) {
+ logger().info("{} protocol {} {} fault, going to WAIT {} -- {}",
+ conn,
+ get_state_name(state),
+ where,
+ io_states,
+ e_what);
+ execute_wait(false);
+ } else {
+ assert(state == state_t::READY ||
+ state == state_t::ESTABLISHING);
+ logger().info("{} protocol {} {} fault, going to CONNECTING {} -- {}",
+ conn,
+ get_state_name(state),
+ where,
+ io_states,
+ e_what);
+ execute_connecting();
+ }
+}
+
+void ProtocolV2::reset_session(bool full)
+{
+ server_cookie = 0;
+ connect_seq = 0;
+ if (full) {
+ client_cookie = generate_client_cookie();
+ peer_global_seq = 0;
+ }
+
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::reset_session({})",
+ conn, cc_seq, full);
+ io_states.reset_session(full);
+ gate.dispatch_in_background(
+ "reset_session", conn, [this, cc_seq, full] {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(), [this, cc_seq, full] {
+ return io_handler.reset_session(cc_seq, full);
+ });
+ });
+ // user can make changes
+}
+
+seastar::future<std::tuple<entity_type_t, entity_addr_t>>
+ProtocolV2::banner_exchange(bool is_connect)
+{
+ // 1. prepare and send banner
+ bufferlist banner_payload;
+ encode((uint64_t)CRIMSON_MSGR2_SUPPORTED_FEATURES, banner_payload, 0);
+ encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, banner_payload, 0);
+
+ bufferlist bl;
+ bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX));
+ auto len_payload = static_cast<uint16_t>(banner_payload.length());
+ encode(len_payload, bl, 0);
+ bl.claim_append(banner_payload);
+ logger().debug("{} SEND({}) banner: len_payload={}, supported={}, "
+ "required={}, banner=\"{}\"",
+ conn, bl.length(), len_payload,
+ CRIMSON_MSGR2_SUPPORTED_FEATURES,
+ CEPH_MSGR2_REQUIRED_FEATURES,
+ CEPH_BANNER_V2_PREFIX);
+#ifdef UNIT_TESTS_BUILT
+ return frame_assembler->intercept_frame(custom_bp_t::BANNER_WRITE, true
+ ).then([this, bl=std::move(bl)]() mutable {
+ return frame_assembler->write_flush(std::move(bl));
+ }
+#else
+ return frame_assembler->write_flush(std::move(bl)
+#endif
+ ).then([this] {
+ // 2. read peer banner
+ unsigned banner_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(ceph_le16);
+#ifdef UNIT_TESTS_BUILT
+ return frame_assembler->intercept_frame(custom_bp_t::BANNER_READ, false
+ ).then([this, banner_len] {
+ return frame_assembler->read_exactly(banner_len);
+ });
+#else
+ return frame_assembler->read_exactly(banner_len);
+#endif
+ }).then([this](auto bptr) {
+ // 3. process peer banner and read banner_payload
+ unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+ logger().debug("{} RECV({}) banner: \"{}\"",
+ conn, bptr.length(),
+ std::string(bptr.c_str(), banner_prefix_len));
+
+ if (memcmp(bptr.c_str(), CEPH_BANNER_V2_PREFIX, banner_prefix_len) != 0) {
+ if (memcmp(bptr.c_str(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0) {
+ logger().warn("{} peer is using V1 protocol", conn);
+ } else {
+ logger().warn("{} peer sent bad banner", conn);
+ }
+ abort_in_fault();
+ }
+
+ bptr.set_offset(bptr.offset() + banner_prefix_len);
+ bptr.set_length(bptr.length() - banner_prefix_len);
+ assert(bptr.length() == sizeof(ceph_le16));
+
+ uint16_t payload_len;
+ bufferlist buf;
+ buf.append(std::move(bptr));
+ auto ti = buf.cbegin();
+ try {
+ decode(payload_len, ti);
+ } catch (const buffer::error &e) {
+ logger().warn("{} decode banner payload len failed", conn);
+ abort_in_fault();
+ }
+ logger().debug("{} GOT banner: payload_len={}", conn, payload_len);
+#ifdef UNIT_TESTS_BUILT
+ return frame_assembler->intercept_frame(
+ custom_bp_t::BANNER_PAYLOAD_READ, false
+ ).then([this, payload_len] {
+ return frame_assembler->read(payload_len);
+ });
+#else
+ return frame_assembler->read(payload_len);
+#endif
+ }).then([this, is_connect] (bufferlist bl) {
+ // 4. process peer banner_payload and send HelloFrame
+ auto p = bl.cbegin();
+ uint64_t _peer_supported_features;
+ uint64_t _peer_required_features;
+ try {
+ decode(_peer_supported_features, p);
+ decode(_peer_required_features, p);
+ } catch (const buffer::error &e) {
+ logger().warn("{} decode banner payload failed", conn);
+ abort_in_fault();
+ }
+ logger().debug("{} RECV({}) banner features: supported={} required={}",
+ conn, bl.length(),
+ _peer_supported_features, _peer_required_features);
+
+ // Check feature bit compatibility
+ uint64_t supported_features = CRIMSON_MSGR2_SUPPORTED_FEATURES;
+ uint64_t required_features = CEPH_MSGR2_REQUIRED_FEATURES;
+ if ((required_features & _peer_supported_features) != required_features) {
+ logger().error("{} peer does not support all required features"
+ " required={} peer_supported={}",
+ conn, required_features, _peer_supported_features);
+ ABORT_IN_CLOSE(is_connect);
+ }
+ if ((supported_features & _peer_required_features) != _peer_required_features) {
+ logger().error("{} we do not support all peer required features"
+ " peer_required={} supported={}",
+ conn, _peer_required_features, supported_features);
+ ABORT_IN_CLOSE(is_connect);
+ }
+ peer_supported_features = _peer_supported_features;
+ bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+ frame_assembler->set_is_rev1(is_rev1);
+
+ auto hello = HelloFrame::Encode(messenger.get_mytype(),
+ conn.target_addr);
+ logger().debug("{} WRITE HelloFrame: my_type={}, peer_addr={}",
+ conn, ceph_entity_type_name(messenger.get_mytype()),
+ conn.target_addr);
+ return frame_assembler->write_flush_frame(hello);
+ }).then([this] {
+ //5. read peer HelloFrame
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ expect_tag(Tag::HELLO, ret.tag, conn, "read_hello_frame");
+ return frame_assembler->read_frame_payload();
+ }).then([this](auto payload) {
+ // 6. process peer HelloFrame
+ auto hello = HelloFrame::Decode(payload->back());
+ logger().debug("{} GOT HelloFrame: my_type={} peer_addr={}",
+ conn, ceph_entity_type_name(hello.entity_type()),
+ hello.peer_addr());
+ return seastar::make_ready_future<std::tuple<entity_type_t, entity_addr_t>>(
+ std::make_tuple(hello.entity_type(), hello.peer_addr()));
+ });
+}
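+
+// Illustrative sketch (editorial addition, not part of the upstream source):
+// the on-wire layout produced by step 1 above, assuming the current 16-byte
+// payload of two little-endian 64-bit feature words:
+//
+//   "ceph v2\n"                   // CEPH_BANNER_V2_PREFIX, 8 bytes
+//   le16 payload_len = 16
+//   le64 supported_features       // CRIMSON_MSGR2_SUPPORTED_FEATURES
+//   le64 required_features        // CEPH_MSGR2_REQUIRED_FEATURES
+//
+// Each side then reads the peer's prefix plus the le16 length, reads
+// payload_len more bytes, and verifies in both directions that every required
+// feature bit is present in the peer's supported set (steps 3-4 above).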
+
+// CONNECTING state
+
+seastar::future<> ProtocolV2::handle_auth_reply()
+{
+ return frame_assembler->read_main_preamble(
+ ).then([this](auto ret) {
+ switch (ret.tag) {
+ case Tag::AUTH_BAD_METHOD:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_auth_bad_method() logic
+ auto bad_method = AuthBadMethodFrame::Decode(payload->back());
+ logger().warn("{} GOT AuthBadMethodFrame: method={} result={}, "
+ "allowed_methods={}, allowed_modes={}",
+ conn, bad_method.method(), cpp_strerror(bad_method.result()),
+ bad_method.allowed_methods(), bad_method.allowed_modes());
+ ceph_assert(messenger.get_auth_client());
+ int r = messenger.get_auth_client()->handle_auth_bad_method(
+ conn, *auth_meta,
+ bad_method.method(), bad_method.result(),
+ bad_method.allowed_methods(), bad_method.allowed_modes());
+ if (r < 0) {
+ logger().warn("{} auth_client handle_auth_bad_method returned {}",
+ conn, r);
+ abort_in_fault();
+ }
+ return client_auth(bad_method.allowed_methods());
+ });
+ case Tag::AUTH_REPLY_MORE:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_auth_reply_more() logic
+ auto auth_more = AuthReplyMoreFrame::Decode(payload->back());
+ logger().debug("{} GOT AuthReplyMoreFrame: payload_len={}",
+ conn, auth_more.auth_payload().length());
+ ceph_assert(messenger.get_auth_client());
+ // let execute_connecting() take care of the thrown exception
+ auto reply = messenger.get_auth_client()->handle_auth_reply_more(
+ conn, *auth_meta, auth_more.auth_payload());
+ auto more_reply = AuthRequestMoreFrame::Encode(reply);
+ logger().debug("{} WRITE AuthRequestMoreFrame: payload_len={}",
+ conn, reply.length());
+ return frame_assembler->write_flush_frame(more_reply);
+ }).then([this] {
+ return handle_auth_reply();
+ });
+ case Tag::AUTH_DONE:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_auth_done() logic
+ auto auth_done = AuthDoneFrame::Decode(payload->back());
+ logger().debug("{} GOT AuthDoneFrame: gid={}, con_mode={}, payload_len={}",
+ conn, auth_done.global_id(),
+ ceph_con_mode_name(auth_done.con_mode()),
+ auth_done.auth_payload().length());
+ ceph_assert(messenger.get_auth_client());
+ int r = messenger.get_auth_client()->handle_auth_done(
+ conn,
+ *auth_meta,
+ auth_done.global_id(),
+ auth_done.con_mode(),
+ auth_done.auth_payload());
+ if (r < 0) {
+ logger().warn("{} auth_client handle_auth_done returned {}", conn, r);
+ abort_in_fault();
+ }
+ auth_meta->con_mode = auth_done.con_mode();
+ frame_assembler->create_session_stream_handlers(*auth_meta, false);
+ return finish_auth();
+ });
+ default: {
+ unexpected_tag(ret.tag, conn, "handle_auth_reply");
+ return seastar::now();
+ }
+ }
+ });
+}
+
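+// Builds the initial AuthRequestFrame from the auth client's get_auth_request()
+// and then waits for the server's response in handle_auth_reply().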
+seastar::future<> ProtocolV2::client_auth(std::vector<uint32_t> &allowed_methods)
+{
+ // send_auth_request() logic
+ ceph_assert(messenger.get_auth_client());
+
+ try {
+ auto [auth_method, preferred_modes, bl] =
+ messenger.get_auth_client()->get_auth_request(conn, *auth_meta);
+ auth_meta->auth_method = auth_method;
+ auto frame = AuthRequestFrame::Encode(auth_method, preferred_modes, bl);
+ logger().debug("{} WRITE AuthRequestFrame: method={},"
+ " preferred_modes={}, payload_len={}",
+ conn, auth_method, preferred_modes, bl.length());
+ return frame_assembler->write_flush_frame(frame
+ ).then([this] {
+ return handle_auth_reply();
+ });
+ } catch (const crimson::auth::error& e) {
+ logger().error("{} get_initial_auth_request returned {}", conn, e.what());
+ ABORT_IN_CLOSE(true);
+ return seastar::now();
+ }
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::process_wait()
+{
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_wait() logic
+ logger().debug("{} GOT WaitFrame", conn);
+ WaitFrame::Decode(payload->back());
+ return next_step_t::wait;
+ });
+}
+
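+// Sends ClientIdentFrame to establish a fresh session, then handles the reply:
+// IDENT_MISSING_FEATURES faults, WAIT backs off via process_wait(), and
+// SERVER_IDENT validates the peer's identity and addresses, updates the
+// negotiated features and policy, and reports next_step_t::ready.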
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::client_connect()
+{
+ // send_client_ident() logic
+ uint64_t flags = 0;
+ if (conn.policy.lossy) {
+ flags |= CEPH_MSG_CONNECT_LOSSY;
+ }
+
+ auto client_ident = ClientIdentFrame::Encode(
+ messenger.get_myaddrs(),
+ conn.target_addr,
+ messenger.get_myname().num(),
+ global_seq,
+ conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required, flags,
+ client_cookie);
+
+ logger().debug("{} WRITE ClientIdentFrame: addrs={}, target={}, gid={},"
+ " gs={}, features_supported={}, features_required={},"
+ " flags={}, cookie={}",
+ conn, messenger.get_myaddrs(), conn.target_addr,
+ messenger.get_myname().num(), global_seq,
+ conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required,
+ flags, client_cookie);
+ return frame_assembler->write_flush_frame(client_ident
+ ).then([this] {
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ switch (ret.tag) {
+ case Tag::IDENT_MISSING_FEATURES:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_ident_missing_features() logic
+ auto ident_missing = IdentMissingFeaturesFrame::Decode(payload->back());
+ logger().warn("{} GOT IdentMissingFeaturesFrame: features={}"
+ " (client does not support all server features)",
+ conn, ident_missing.features());
+ abort_in_fault();
+ return next_step_t::none;
+ });
+ case Tag::WAIT:
+ return process_wait();
+ case Tag::SERVER_IDENT:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} at receiving SERVER_IDENT",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+
+ // handle_server_ident() logic
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::requeue_out_sent()",
+ conn, cc_seq);
+ io_states.requeue_out_sent();
+ gate.dispatch_in_background(
+ "requeue_out_sent", conn, [this, cc_seq] {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(), [this, cc_seq] {
+ return io_handler.requeue_out_sent(cc_seq);
+ });
+ });
+
+ auto server_ident = ServerIdentFrame::Decode(payload->back());
+ logger().debug("{} GOT ServerIdentFrame:"
+ " addrs={}, gid={}, gs={},"
+ " features_supported={}, features_required={},"
+ " flags={}, cookie={}",
+ conn,
+ server_ident.addrs(), server_ident.gid(),
+ server_ident.global_seq(),
+ server_ident.supported_features(),
+ server_ident.required_features(),
+ server_ident.flags(), server_ident.cookie());
+
+ // is this who we intended to talk to?
+ // be a bit forgiving here, since we may be connecting based on addresses parsed out
+ // of mon_host or something.
+ if (!server_ident.addrs().contains(conn.target_addr)) {
+ logger().warn("{} peer identifies as {}, does not include {}",
+ conn, server_ident.addrs(), conn.target_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+
+ server_cookie = server_ident.cookie();
+
+ // TODO: change peer_addr to entity_addrvec_t
+ if (server_ident.addrs().front() != conn.peer_addr) {
+ logger().warn("{} peer advertises as {}, does not match {}",
+ conn, server_ident.addrs(), conn.peer_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (conn.get_peer_id() != entity_name_t::NEW &&
+ conn.get_peer_id() != server_ident.gid()) {
+ logger().error("{} connection peer id ({}) does not match "
+ "what it should be ({}) during connecting, close",
+ conn, server_ident.gid(), conn.get_peer_id());
+ ABORT_IN_CLOSE(true);
+ }
+ conn.set_peer_id(server_ident.gid());
+ conn.set_features(server_ident.supported_features() &
+ conn.policy.features_supported);
+ logger().debug("{} UPDATE: features={}", conn, conn.get_features());
+ peer_global_seq = server_ident.global_seq();
+
+ bool lossy = server_ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+ if (lossy != conn.policy.lossy) {
+ logger().warn("{} UPDATE Policy(lossy={}) from server flags", conn, lossy);
+ conn.policy.lossy = lossy;
+ }
+ if (lossy && (connect_seq != 0 || server_cookie != 0)) {
+ logger().warn("{} UPDATE cs=0({}) sc=0({}) for lossy policy",
+ conn, connect_seq, server_cookie);
+ connect_seq = 0;
+ server_cookie = 0;
+ }
+
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ });
+ default: {
+ unexpected_tag(ret.tag, conn, "post_client_connect");
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+ }
+ }
+ });
+}
+
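+// Sends ReconnectFrame to resume an existing session, then handles the reply:
+// SESSION_RETRY_GLOBAL and SESSION_RETRY bump gs/cs and retry, SESSION_RESET
+// falls back to client_connect(), WAIT backs off, and SESSION_RECONNECT_OK
+// requeues the unacknowledged messages and reports next_step_t::ready.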
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::client_reconnect()
+{
+ // send_reconnect() logic
+ auto reconnect = ReconnectFrame::Encode(messenger.get_myaddrs(),
+ client_cookie,
+ server_cookie,
+ global_seq,
+ connect_seq,
+ io_states.in_seq);
+ logger().debug("{} WRITE ReconnectFrame: addrs={}, client_cookie={},"
+ " server_cookie={}, gs={}, cs={}, in_seq={}",
+ conn, messenger.get_myaddrs(),
+ client_cookie, server_cookie,
+ global_seq, connect_seq, io_states.in_seq);
+ return frame_assembler->write_flush_frame(reconnect).then([this] {
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ switch (ret.tag) {
+ case Tag::SESSION_RETRY_GLOBAL:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_session_retry_global() logic
+ auto retry = RetryGlobalFrame::Decode(payload->back());
+ logger().warn("{} GOT RetryGlobalFrame: gs={}",
+ conn, retry.global_seq());
+ global_seq = messenger.get_global_seq(retry.global_seq());
+ logger().warn("{} UPDATE: gs={} for retry global", conn, global_seq);
+ return client_reconnect();
+ });
+ case Tag::SESSION_RETRY:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_session_retry() logic
+ auto retry = RetryFrame::Decode(payload->back());
+ logger().warn("{} GOT RetryFrame: cs={}",
+ conn, retry.connect_seq());
+ connect_seq = retry.connect_seq() + 1;
+ logger().warn("{} UPDATE: cs={}", conn, connect_seq);
+ return client_reconnect();
+ });
+ case Tag::SESSION_RESET:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} before reset_session()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ // handle_session_reset() logic
+ auto reset = ResetFrame::Decode(payload->back());
+ logger().warn("{} GOT ResetFrame: full={}", conn, reset.full());
+
+ reset_session(reset.full());
+ // user can make changes
+
+ return client_connect();
+ });
+ case Tag::WAIT:
+ return process_wait();
+ case Tag::SESSION_RECONNECT_OK:
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} at receiving RECONNECT_OK",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+
+ // handle_reconnect_ok() logic
+ auto reconnect_ok = ReconnectOkFrame::Decode(payload->back());
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} GOT ReconnectOkFrame: msg_seq={}, "
+ "send {} IOHandler::requeue_out_sent_up_to()",
+ conn, reconnect_ok.msg_seq(), cc_seq);
+
+ io_states.requeue_out_sent_up_to();
+ auto msg_seq = reconnect_ok.msg_seq();
+ gate.dispatch_in_background(
+ "requeue_out_reconnecting", conn, [this, cc_seq, msg_seq] {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(), [this, cc_seq, msg_seq] {
+ return io_handler.requeue_out_sent_up_to(cc_seq, msg_seq);
+ });
+ });
+
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ });
+ default: {
+ unexpected_tag(ret.tag, conn, "post_client_reconnect");
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+ }
+ }
+ });
+}
+
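+// Drives the CONNECTING state: connects the socket, performs the banner
+// exchange and authentication, runs client_connect() for a fresh session
+// (server_cookie == 0) or client_reconnect() otherwise, and finally switches
+// the io_handler to the socket's shard before entering READY.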
+void ProtocolV2::execute_connecting()
+{
+ ceph_assert_always(!is_socket_valid);
+ trigger_state(state_t::CONNECTING, io_state_t::delay);
+ gated_execute("execute_connecting", conn, [this] {
+ global_seq = messenger.get_global_seq();
+ assert(client_cookie != 0);
+ if (!conn.policy.lossy && server_cookie != 0) {
+ ++connect_seq;
+ logger().debug("{} UPDATE: gs={}, cs={} for reconnect",
+ conn, global_seq, connect_seq);
+ } else { // conn.policy.lossy || server_cookie == 0
+ assert(connect_seq == 0);
+ assert(server_cookie == 0);
+ logger().debug("{} UPDATE: gs={} for connect", conn, global_seq);
+ }
+ return wait_exit_io().then([this] {
+#ifdef UNIT_TESTS_BUILT
+ // process custom_bp_t::SOCKET_CONNECTING
+ // supports CONTINUE/FAULT/BLOCK
+ if (!conn.interceptor) {
+ return seastar::now();
+ }
+ return conn.interceptor->intercept(
+ conn, {Breakpoint{custom_bp_t::SOCKET_CONNECTING}}
+ ).then([this](bp_action_t action) {
+ switch (action) {
+ case bp_action_t::CONTINUE:
+ return seastar::now();
+ case bp_action_t::FAULT:
+ logger().info("[Test] got FAULT");
+ abort_in_fault();
+ case bp_action_t::BLOCK:
+ logger().info("[Test] got BLOCK");
+ return conn.interceptor->blocker.block();
+ default:
+ ceph_abort("unexpected action from trap");
+ return seastar::now();
+ }
+ });
+ }).then([this] {
+#endif
+ ceph_assert_always(frame_assembler);
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} before Socket::connect()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ return Socket::connect(conn.peer_addr);
+ }).then([this](SocketRef _new_socket) {
+ logger().debug("{} socket connected", conn);
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} during Socket::connect()",
+ conn, get_state_name(state));
+ return _new_socket->close().then([sock=std::move(_new_socket)] {
+ abort_protocol();
+ });
+ }
+ SocketFRef new_socket = seastar::make_foreign(std::move(_new_socket));
+ if (!has_socket) {
+ frame_assembler->set_socket(std::move(new_socket));
+ has_socket = true;
+ } else {
+ gate.dispatch_in_background(
+ "replace_socket_connecting",
+ conn,
+ [this, new_socket=std::move(new_socket)]() mutable {
+ return frame_assembler->replace_shutdown_socket(std::move(new_socket));
+ }
+ );
+ }
+ is_socket_valid = true;
+ return seastar::now();
+ }).then([this] {
+ auth_meta = seastar::make_lw_shared<AuthConnectionMeta>();
+ frame_assembler->reset_handlers();
+ frame_assembler->start_recording();
+ return banner_exchange(true);
+ }).then([this] (auto&& ret) {
+ auto [_peer_type, _my_addr_from_peer] = std::move(ret);
+ if (conn.get_peer_type() != _peer_type) {
+ logger().warn("{} connection peer type does not match what peer advertises {} != {}",
+ conn, ceph_entity_type_name(conn.get_peer_type()),
+ ceph_entity_type_name(_peer_type));
+ ABORT_IN_CLOSE(true);
+ }
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} during banner_exchange(), abort",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ frame_assembler->learn_socket_ephemeral_port_as_connector(
+ _my_addr_from_peer.get_port());
+ if (unlikely(_my_addr_from_peer.is_legacy())) {
+ logger().warn("{} peer sent a legacy address for me: {}",
+ conn, _my_addr_from_peer);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ _my_addr_from_peer.set_type(entity_addr_t::TYPE_MSGR2);
+ messenger.learned_addr(_my_addr_from_peer, conn);
+ return client_auth();
+ }).then([this] {
+ if (server_cookie == 0) {
+ ceph_assert(connect_seq == 0);
+ return client_connect();
+ } else {
+ ceph_assert(connect_seq > 0);
+ return client_reconnect();
+ }
+ }).then([this] (next_step_t next) {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} at the end of execute_connecting()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ switch (next) {
+ case next_step_t::ready: {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} before dispatch_connect(), abort",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+
+ auto cc_seq = crosscore.prepare_submit();
+ // there are 2 hops with dispatch_connect()
+ crosscore.prepare_submit();
+ logger().info("{} connected: gs={}, pgs={}, cs={}, "
+ "client_cookie={}, server_cookie={}, {}, new_sid={}, "
+ "send {} IOHandler::dispatch_connect()",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie, io_states,
+ frame_assembler->get_socket_shard_id(), cc_seq);
+
+ // set io_handler to a new shard
+ auto new_io_shard = frame_assembler->get_socket_shard_id();
+ ConnectionFRef conn_fref = seastar::make_foreign(
+ conn.shared_from_this());
+ ceph_assert_always(!pr_switch_io_shard.has_value());
+ pr_switch_io_shard = seastar::shared_promise<>();
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(),
+ [this, cc_seq, new_io_shard,
+ conn_fref=std::move(conn_fref)]() mutable {
+ return io_handler.dispatch_connect(
+ cc_seq, new_io_shard, std::move(conn_fref));
+ }).then([this, new_io_shard] {
+ ceph_assert_always(io_handler.get_shard_id() == new_io_shard);
+ pr_switch_io_shard->set_value();
+ pr_switch_io_shard = std::nullopt;
+ // user can make changes
+
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} after dispatch_connect(), abort",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ execute_ready();
+ });
+ }
+ case next_step_t::wait: {
+ logger().info("{} execute_connecting(): going to WAIT(max-backoff)", conn);
+ ceph_assert_always(is_socket_valid);
+ frame_assembler->shutdown_socket<true>(&gate);
+ is_socket_valid = false;
+ execute_wait(true);
+ return seastar::now();
+ }
+ default: {
+ ceph_abort("impossible next step");
+ }
+ }
+ }).handle_exception([this](std::exception_ptr eptr) {
+ fault(state_t::CONNECTING, "execute_connecting", eptr);
+ });
+ });
+}
+
+// ACCEPTING state
+
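+// Replies with an AuthBadMethodFrame listing the methods and modes the auth
+// server supports for this peer type, then restarts server_auth().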
+seastar::future<> ProtocolV2::_auth_bad_method(int r)
+{
+ // _auth_bad_method() logic
+ ceph_assert(r < 0);
+ auto [allowed_methods, allowed_modes] =
+ messenger.get_auth_server()->get_supported_auth_methods(conn.get_peer_type());
+ auto bad_method = AuthBadMethodFrame::Encode(
+ auth_meta->auth_method, r, allowed_methods, allowed_modes);
+ logger().warn("{} WRITE AuthBadMethodFrame: method={}, result={}, "
+ "allowed_methods={}, allowed_modes={})",
+ conn, auth_meta->auth_method, cpp_strerror(r),
+ allowed_methods, allowed_modes);
+ return frame_assembler->write_flush_frame(bad_method
+ ).then([this] {
+ return server_auth();
+ });
+}
+
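+// Feeds the client's auth payload to the auth server: a result of 1 sends
+// AuthDoneFrame and installs the session stream handlers, 0 sends
+// AuthReplyMoreFrame and waits for AUTH_REQUEST_MORE, -EBUSY faults, and any
+// other negative result is reported back via _auth_bad_method().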
+seastar::future<> ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more)
+{
+ // _handle_auth_request() logic
+ ceph_assert(messenger.get_auth_server());
+ bufferlist reply;
+ int r = messenger.get_auth_server()->handle_auth_request(
+ conn,
+ *auth_meta,
+ more,
+ auth_meta->auth_method,
+ auth_payload,
+ &conn.peer_global_id,
+ &reply);
+ switch (r) {
+ // successful
+ case 1: {
+ auto auth_done = AuthDoneFrame::Encode(
+ conn.peer_global_id, auth_meta->con_mode, reply);
+ logger().debug("{} WRITE AuthDoneFrame: gid={}, con_mode={}, payload_len={}",
+ conn, conn.peer_global_id,
+ ceph_con_mode_name(auth_meta->con_mode), reply.length());
+ return frame_assembler->write_flush_frame(auth_done
+ ).then([this] {
+ ceph_assert(auth_meta);
+ frame_assembler->create_session_stream_handlers(*auth_meta, true);
+ return finish_auth();
+ });
+ }
+ // auth more
+ case 0: {
+ auto more = AuthReplyMoreFrame::Encode(reply);
+ logger().debug("{} WRITE AuthReplyMoreFrame: payload_len={}",
+ conn, reply.length());
+ return frame_assembler->write_flush_frame(more
+ ).then([this] {
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ expect_tag(Tag::AUTH_REQUEST_MORE, ret.tag, conn, "read_auth_request_more");
+ return frame_assembler->read_frame_payload();
+ }).then([this](auto payload) {
+ auto auth_more = AuthRequestMoreFrame::Decode(payload->back());
+ logger().debug("{} GOT AuthRequestMoreFrame: payload_len={}",
+ conn, auth_more.auth_payload().length());
+ return _handle_auth_request(auth_more.auth_payload(), true);
+ });
+ }
+ case -EBUSY: {
+ logger().warn("{} auth_server handle_auth_request returned -EBUSY", conn);
+ abort_in_fault();
+ return seastar::now();
+ }
+ default: {
+ logger().warn("{} auth_server handle_auth_request returned {}", conn, r);
+ return _auth_bad_method(r);
+ }
+ }
+}
+
+seastar::future<> ProtocolV2::server_auth()
+{
+ return frame_assembler->read_main_preamble(
+ ).then([this](auto ret) {
+ expect_tag(Tag::AUTH_REQUEST, ret.tag, conn, "read_auth_request");
+ return frame_assembler->read_frame_payload();
+ }).then([this](auto payload) {
+ // handle_auth_request() logic
+ auto request = AuthRequestFrame::Decode(payload->back());
+ logger().debug("{} GOT AuthRequestFrame: method={}, preferred_modes={},"
+ " payload_len={}",
+ conn, request.method(), request.preferred_modes(),
+ request.auth_payload().length());
+ auth_meta->auth_method = request.method();
+ auth_meta->con_mode = messenger.get_auth_server()->pick_con_mode(
+ conn.get_peer_type(), auth_meta->auth_method,
+ request.preferred_modes());
+ if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) {
+ logger().warn("{} auth_server pick_con_mode returned mode CEPH_CON_MODE_UNKNOWN", conn);
+ return _auth_bad_method(-EOPNOTSUPP);
+ }
+ return _handle_auth_request(request.auth_payload(), false);
+ });
+}
+
+bool ProtocolV2::validate_peer_name(const entity_name_t& peer_name) const
+{
+ auto my_peer_name = conn.get_peer_name();
+ if (my_peer_name.type() != peer_name.type()) {
+ return false;
+ }
+ if (my_peer_name.num() != entity_name_t::NEW &&
+ peer_name.num() != entity_name_t::NEW &&
+ my_peer_name.num() != peer_name.num()) {
+ return false;
+ }
+ return true;
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_wait()
+{
+ auto wait = WaitFrame::Encode();
+ logger().debug("{} WRITE WaitFrame", conn);
+ return frame_assembler->write_flush_frame(wait
+ ).then([] {
+ return next_step_t::wait;
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::reuse_connection(
+ ProtocolV2* existing_proto, bool do_reset,
+ bool reconnect, uint64_t conn_seq, uint64_t msg_seq)
+{
+ if (unlikely(state != state_t::ACCEPTING)) {
+ logger().debug("{} triggered {} before trigger_replacing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+
+ existing_proto->trigger_replacing(reconnect,
+ do_reset,
+ frame_assembler->to_replace(),
+ std::move(auth_meta),
+ peer_global_seq,
+ client_cookie,
+ conn.get_peer_name(),
+ conn.get_features(),
+ peer_supported_features,
+ conn_seq,
+ msg_seq);
+ ceph_assert_always(has_socket && is_socket_valid);
+ is_socket_valid = false;
+ has_socket = false;
+#ifdef UNIT_TESTS_BUILT
+ if (conn.interceptor) {
+ conn.interceptor->register_conn_replaced(
+ conn.get_local_shared_foreign_from_this());
+ }
+#endif
+ // close this connection because all the necessary information is delivered
+ // to the existing connection, and jump to error handling code to abort the
+ // current state.
+ ABORT_IN_CLOSE(false);
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+}
+
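+// Arbitrates between this accepting connection and an existing one to the same
+// peer: send WAIT if the existing one is being replaced, fault if this one is
+// stale, replace a lossy existing connection, reuse the existing connection
+// when the session cookies allow it, and resolve connection races via
+// peer_wins().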
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::handle_existing_connection(SocketConnectionRef existing_conn)
+{
+ // handle_existing_connection() logic
+ ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>(
+ existing_conn->protocol.get());
+ ceph_assert(existing_proto);
+ logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) connecting,"
+ " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie,
+ fmt::ptr(existing_conn.get()), get_state_name(existing_proto->state),
+ existing_proto->global_seq,
+ existing_proto->peer_global_seq,
+ existing_proto->connect_seq,
+ existing_proto->client_cookie,
+ existing_proto->server_cookie);
+
+ if (!validate_peer_name(existing_conn->get_peer_name())) {
+ logger().error("{} server_connect: my peer_name doesn't match"
+ " the existing connection {}, abort", conn, fmt::ptr(existing_conn.get()));
+ abort_in_fault();
+ }
+
+ if (existing_proto->state == state_t::REPLACING) {
+ logger().warn("{} server_connect: racing replace happened while"
+ " replacing existing connection {}, send wait.",
+ conn, *existing_conn);
+ return send_wait();
+ }
+
+ if (existing_proto->peer_global_seq > peer_global_seq) {
+ logger().warn("{} server_connect:"
+ " this is a stale connection, because peer_global_seq({})"
+ " < existing->peer_global_seq({}), close this connection"
+ " in favor of existing connection {}",
+ conn, peer_global_seq,
+ existing_proto->peer_global_seq, *existing_conn);
+ abort_in_fault();
+ }
+
+ if (existing_conn->policy.lossy) {
+ // existing connection can be thrown out in favor of this one
+ logger().warn("{} server_connect:"
+ " existing connection {} is a lossy channel. Close existing in favor of"
+ " this connection", conn, *existing_conn);
+ if (unlikely(state != state_t::ACCEPTING)) {
+ logger().debug("{} triggered {} before execute_establishing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ execute_establishing(existing_conn);
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ }
+
+ if (existing_proto->server_cookie != 0) {
+ if (existing_proto->client_cookie != client_cookie) {
+ // Found previous session
+ // peer has reset and we're going to reuse the existing connection
+ // by replacing the socket
+ logger().warn("{} server_connect:"
+ " found new session (cs={})"
+ " when existing {} {} is with stale session (cs={}, ss={}),"
+ " peer must have reset",
+ conn,
+ client_cookie,
+ get_state_name(existing_proto->state),
+ *existing_conn,
+ existing_proto->client_cookie,
+ existing_proto->server_cookie);
+ return reuse_connection(existing_proto, conn.policy.resetcheck);
+ } else {
+ // session establishment interrupted between client_ident and server_ident,
+ // continuing...
+ logger().warn("{} server_connect: found client session with existing {} {}"
+ " matched (cs={}, ss={}), continuing session establishment",
+ conn,
+ get_state_name(existing_proto->state),
+ *existing_conn,
+ client_cookie,
+ existing_proto->server_cookie);
+ return reuse_connection(existing_proto);
+ }
+ } else {
+ // Looks like a connection race: server and client are both connecting to
+ // each other at the same time.
+ if (existing_proto->client_cookie != client_cookie) {
+ if (existing_conn->peer_wins()) {
+ // acceptor (this connection, the peer) wins
+ logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)"
+ " and win, reusing existing {} {}",
+ conn,
+ client_cookie,
+ existing_proto->client_cookie,
+ get_state_name(existing_proto->state),
+ *existing_conn);
+ return reuse_connection(existing_proto);
+ } else {
+ // acceptor (this connection, the peer) loses
+ logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)"
+ " and lose to existing {}, ask client to wait",
+ conn, client_cookie, existing_proto->client_cookie, *existing_conn);
+ return existing_conn->send_keepalive().then([this] {
+ return send_wait();
+ });
+ }
+ } else {
+ logger().warn("{} server_connect: found client session with existing {} {}"
+ " matched (cs={}, ss={}), continuing session establishment",
+ conn,
+ get_state_name(existing_proto->state),
+ *existing_conn,
+ client_cookie,
+ existing_proto->server_cookie);
+ return reuse_connection(existing_proto);
+ }
+ }
+}
+
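+// Processes the client's ClientIdentFrame: validates its addresses and target,
+// checks the required features and peer id, and then either establishes a new
+// session or defers to handle_existing_connection() when a connection to the
+// same peer already exists.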
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::server_connect()
+{
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_client_ident() logic
+ auto client_ident = ClientIdentFrame::Decode(payload->back());
+ logger().debug("{} GOT ClientIdentFrame: addrs={}, target={},"
+ " gid={}, gs={}, features_supported={},"
+ " features_required={}, flags={}, cookie={}",
+ conn, client_ident.addrs(), client_ident.target_addr(),
+ client_ident.gid(), client_ident.global_seq(),
+ client_ident.supported_features(),
+ client_ident.required_features(),
+ client_ident.flags(), client_ident.cookie());
+
+ if (client_ident.addrs().empty() ||
+ client_ident.addrs().front() == entity_addr_t()) {
+ logger().warn("{} oops, client_ident.addrs() is empty", conn);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (!messenger.get_myaddrs().contains(client_ident.target_addr())) {
+ logger().warn("{} peer is trying to reach {} which is not us ({})",
+ conn, client_ident.target_addr(), messenger.get_myaddrs());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ conn.peer_addr = client_ident.addrs().front();
+ logger().debug("{} UPDATE: peer_addr={}", conn, conn.peer_addr);
+ conn.target_addr = conn.peer_addr;
+ if (!conn.policy.lossy && !conn.policy.server && conn.target_addr.get_port() <= 0) {
+ logger().warn("{} we don't know how to reconnect to peer {}",
+ conn, conn.target_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+
+ if (conn.get_peer_id() != entity_name_t::NEW &&
+ conn.get_peer_id() != client_ident.gid()) {
+ logger().error("{} client_ident peer_id ({}) does not match"
+ " what it should be ({}) during accepting, abort",
+ conn, client_ident.gid(), conn.get_peer_id());
+ abort_in_fault();
+ }
+ conn.set_peer_id(client_ident.gid());
+ client_cookie = client_ident.cookie();
+
+ uint64_t feat_missing =
+ (conn.policy.features_required | msgr2_required) &
+ ~(uint64_t)client_ident.supported_features();
+ if (feat_missing) {
+ auto ident_missing_features = IdentMissingFeaturesFrame::Encode(feat_missing);
+ logger().warn("{} WRITE IdentMissingFeaturesFrame: features={} (peer missing)",
+ conn, feat_missing);
+ return frame_assembler->write_flush_frame(ident_missing_features
+ ).then([] {
+ return next_step_t::wait;
+ });
+ }
+ conn.set_features(client_ident.supported_features() &
+ conn.policy.features_supported);
+ logger().debug("{} UPDATE: features={}", conn, conn.get_features());
+
+ peer_global_seq = client_ident.global_seq();
+
+ bool lossy = client_ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+ if (lossy != conn.policy.lossy) {
+ logger().warn("{} my lossy policy {} doesn't match client {}, ignore",
+ conn, conn.policy.lossy, lossy);
+ }
+
+ // Looks good so far, let's check if there is already an existing connection
+ // to this peer.
+
+ SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr);
+
+ if (existing_conn) {
+ return handle_existing_connection(existing_conn);
+ } else {
+ if (unlikely(state != state_t::ACCEPTING)) {
+ logger().debug("{} triggered {} before execute_establishing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ execute_establishing(nullptr);
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ }
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::read_reconnect()
+{
+ return frame_assembler->read_main_preamble(
+ ).then([this](auto ret) {
+ expect_tag(Tag::SESSION_RECONNECT, ret.tag, conn, "read_session_reconnect");
+ return server_reconnect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_retry(uint64_t connect_seq)
+{
+ auto retry = RetryFrame::Encode(connect_seq);
+ logger().warn("{} WRITE RetryFrame: cs={}", conn, connect_seq);
+ return frame_assembler->write_flush_frame(retry
+ ).then([this] {
+ return read_reconnect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_retry_global(uint64_t global_seq)
+{
+ auto retry = RetryGlobalFrame::Encode(global_seq);
+ logger().warn("{} WRITE RetryGlobalFrame: gs={}", conn, global_seq);
+ return frame_assembler->write_flush_frame(retry
+ ).then([this] {
+ return read_reconnect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_reset(bool full)
+{
+ auto reset = ResetFrame::Encode(full);
+ logger().warn("{} WRITE ResetFrame: full={}", conn, full);
+ return frame_assembler->write_flush_frame(reset
+ ).then([this] {
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ expect_tag(Tag::CLIENT_IDENT, ret.tag, conn, "post_send_reset");
+ return server_connect();
+ });
+}
+
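+// Processes the client's ReconnectFrame: validates the peer address, then
+// consults any existing connection to decide between RESET (no or mismatched
+// session), RETRY/RETRY_GLOBAL (stale sequence numbers), WAIT (lost reconnect
+// race), or reusing the existing connection to resume the session.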
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::server_reconnect()
+{
+ return frame_assembler->read_frame_payload(
+ ).then([this](auto payload) {
+ // handle_reconnect() logic
+ auto reconnect = ReconnectFrame::Decode(payload->back());
+
+ logger().debug("{} GOT ReconnectFrame: addrs={}, client_cookie={},"
+ " server_cookie={}, gs={}, cs={}, msg_seq={}",
+ conn, reconnect.addrs(),
+ reconnect.client_cookie(), reconnect.server_cookie(),
+ reconnect.global_seq(), reconnect.connect_seq(),
+ reconnect.msg_seq());
+
+ // can peer_addrs be changed on-the-fly?
+ // TODO: change peer_addr to entity_addrvec_t
+ entity_addr_t paddr = reconnect.addrs().front();
+ if (paddr.is_msgr2() || paddr.is_any()) {
+ // good
+ } else {
+ logger().warn("{} peer's address {} is not v2", conn, paddr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (conn.peer_addr == entity_addr_t()) {
+ conn.peer_addr = paddr;
+ } else if (conn.peer_addr != paddr) {
+ logger().error("{} peer identifies as {}, while conn.peer_addr={},"
+ " reconnect failed",
+ conn, paddr, conn.peer_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ peer_global_seq = reconnect.global_seq();
+
+ SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr);
+
+ if (!existing_conn) {
+ // there is no existing connection therefore cannot reconnect to previous
+ // session
+ logger().warn("{} server_reconnect: no existing connection from address {},"
+ " reseting client", conn, conn.peer_addr);
+ return send_reset(true);
+ }
+
+ ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>(
+ existing_conn->protocol.get());
+ ceph_assert(existing_proto);
+ logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) re-connecting,"
+ " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})",
+ conn, global_seq, peer_global_seq, reconnect.connect_seq(),
+ reconnect.client_cookie(), reconnect.server_cookie(),
+ fmt::ptr(existing_conn.get()),
+ get_state_name(existing_proto->state),
+ existing_proto->global_seq,
+ existing_proto->peer_global_seq,
+ existing_proto->connect_seq,
+ existing_proto->client_cookie,
+ existing_proto->server_cookie);
+
+ if (!validate_peer_name(existing_conn->get_peer_name())) {
+ logger().error("{} server_reconnect: my peer_name doesn't match"
+ " the existing connection {}, abort", conn, fmt::ptr(existing_conn.get()));
+ abort_in_fault();
+ }
+
+ if (existing_proto->state == state_t::REPLACING) {
+ logger().warn("{} server_reconnect: racing replace happened while "
+ " replacing existing connection {}, retry global.",
+ conn, *existing_conn);
+ return send_retry_global(existing_proto->peer_global_seq);
+ }
+
+ if (existing_proto->client_cookie != reconnect.client_cookie()) {
+ logger().warn("{} server_reconnect:"
+ " client_cookie mismatch with existing connection {},"
+ " cc={} rcc={}. I must have reset, reseting client.",
+ conn, *existing_conn,
+ existing_proto->client_cookie, reconnect.client_cookie());
+ return send_reset(conn.policy.resetcheck);
+ } else if (existing_proto->server_cookie == 0) {
+ // this happens when:
+ // - a connects to b
+ // - a sends client_ident
+ // - b gets client_ident, sends server_ident and sets cookie X
+ // - connection fault
+ // - b reconnects to a with cookie X, connect_seq=1
+ // - a has cookie==0
+ logger().warn("{} server_reconnect: I was a client (cc={}) and didn't received the"
+ " server_ident with existing connection {}."
+ " Asking peer to resume session establishment",
+ conn, existing_proto->client_cookie, *existing_conn);
+ return send_reset(false);
+ }
+
+ if (existing_proto->peer_global_seq > reconnect.global_seq()) {
+ logger().warn("{} server_reconnect: stale global_seq: exist_pgs({}) > peer_gs({}),"
+ " with existing connection {},"
+ " ask client to retry global",
+ conn, existing_proto->peer_global_seq,
+ reconnect.global_seq(), *existing_conn);
+ return send_retry_global(existing_proto->peer_global_seq);
+ }
+
+ if (existing_proto->connect_seq > reconnect.connect_seq()) {
+ logger().warn("{} server_reconnect: stale peer connect_seq peer_cs({}) < exist_cs({}),"
+ " with existing connection {}, ask client to retry",
+ conn, reconnect.connect_seq(),
+ existing_proto->connect_seq, *existing_conn);
+ return send_retry(existing_proto->connect_seq);
+ } else if (existing_proto->connect_seq == reconnect.connect_seq()) {
+ // reconnect race: both peers are sending reconnect messages
+ if (existing_conn->peer_wins()) {
+ // acceptor (this connection, the peer) wins
+ logger().warn("{} server_reconnect: reconnect race detected (cs={})"
+ " and win, reusing existing {} {}",
+ conn,
+ reconnect.connect_seq(),
+ get_state_name(existing_proto->state),
+ *existing_conn);
+ return reuse_connection(
+ existing_proto, false,
+ true, reconnect.connect_seq(), reconnect.msg_seq());
+ } else {
+ // acceptor (this connection, the peer) loses
+ logger().warn("{} server_reconnect: reconnect race detected (cs={})"
+ " and lose to existing {}, ask client to wait",
+ conn, reconnect.connect_seq(), *existing_conn);
+ return send_wait();
+ }
+ } else { // existing_proto->connect_seq < reconnect.connect_seq()
+ logger().warn("{} server_reconnect: stale exsiting connect_seq exist_cs({}) < peer_cs({}),"
+ " reusing existing {} {}",
+ conn,
+ existing_proto->connect_seq,
+ reconnect.connect_seq(),
+ get_state_name(existing_proto->state),
+ *existing_conn);
+ return reuse_connection(
+ existing_proto, false,
+ true, reconnect.connect_seq(), reconnect.msg_seq());
+ }
+ });
+}
+
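+// Drives the ACCEPTING state: performs the banner exchange and server-side
+// authentication, then dispatches CLIENT_IDENT to server_connect() and
+// SESSION_RECONNECT to server_reconnect(); any fault here closes the
+// connection.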
+void ProtocolV2::execute_accepting()
+{
+ assert(is_socket_valid);
+ trigger_state(state_t::ACCEPTING, io_state_t::none);
+ gate.dispatch_in_background("execute_accepting", conn, [this] {
+ return seastar::futurize_invoke([this] {
+#ifdef UNIT_TESTS_BUILT
+ if (conn.interceptor) {
+ // only notify socket accepted
+ gate.dispatch_in_background(
+ "test_intercept_socket_accepted", conn, [this] {
+ return conn.interceptor->intercept(
+ conn, {Breakpoint{custom_bp_t::SOCKET_ACCEPTED}}
+ ).then([](bp_action_t action) {
+ ceph_assert(action == bp_action_t::CONTINUE);
+ });
+ });
+ }
+#endif
+ auth_meta = seastar::make_lw_shared<AuthConnectionMeta>();
+ frame_assembler->reset_handlers();
+ frame_assembler->start_recording();
+ return banner_exchange(false);
+ }).then([this] (auto&& ret) {
+ auto [_peer_type, _my_addr_from_peer] = std::move(ret);
+ ceph_assert(conn.get_peer_type() == 0);
+ conn.set_peer_type(_peer_type);
+
+ conn.policy = messenger.get_policy(_peer_type);
+ logger().info("{} UPDATE: peer_type={},"
+ " policy(lossy={} server={} standby={} resetcheck={})",
+ conn, ceph_entity_type_name(_peer_type),
+ conn.policy.lossy, conn.policy.server,
+ conn.policy.standby, conn.policy.resetcheck);
+ if (!messenger.get_myaddr().is_blank_ip() &&
+ (messenger.get_myaddr().get_port() != _my_addr_from_peer.get_port() ||
+ messenger.get_myaddr().get_nonce() != _my_addr_from_peer.get_nonce())) {
+ logger().warn("{} my_addr_from_peer {} port/nonce doesn't match myaddr {}",
+ conn, _my_addr_from_peer, messenger.get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ messenger.learned_addr(_my_addr_from_peer, conn);
+ return server_auth();
+ }).then([this] {
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ switch (ret.tag) {
+ case Tag::CLIENT_IDENT:
+ return server_connect();
+ case Tag::SESSION_RECONNECT:
+ return server_reconnect();
+ default: {
+ unexpected_tag(ret.tag, conn, "post_server_auth");
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+ }
+ }
+ }).then([this] (next_step_t next) {
+ switch (next) {
+ case next_step_t::ready:
+ assert(state != state_t::ACCEPTING);
+ break;
+ case next_step_t::wait:
+ if (unlikely(state != state_t::ACCEPTING)) {
+ logger().debug("{} triggered {} at the end of execute_accepting()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} execute_accepting(): going to SERVER_WAIT", conn);
+ execute_server_wait();
+ break;
+ default:
+ ceph_abort("impossible next step");
+ }
+ }).handle_exception([this](std::exception_ptr eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+ logger().info("{} execute_accepting(): fault at {}, going to CLOSING -- {}",
+ conn, get_state_name(state), e_what);
+ do_close(false);
+ });
+ });
+}
+
+// CONNECTING or ACCEPTING state
+
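+// Exchanges AuthSignatureFrame with the peer: each side signs the recorded
+// pre-auth traffic with an HMAC-SHA256 keyed by the session key, and the
+// connection faults if the peer's signature does not match the local one.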
+seastar::future<> ProtocolV2::finish_auth()
+{
+ ceph_assert(auth_meta);
+
+ auto records = frame_assembler->stop_recording();
+ const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() :
+ auth_meta->session_key.hmac_sha256(nullptr, records.rxbuf);
+ auto sig_frame = AuthSignatureFrame::Encode(sig);
+ logger().debug("{} WRITE AuthSignatureFrame: signature={}", conn, sig);
+ return frame_assembler->write_flush_frame(sig_frame
+ ).then([this] {
+ return frame_assembler->read_main_preamble();
+ }).then([this](auto ret) {
+ expect_tag(Tag::AUTH_SIGNATURE, ret.tag, conn, "post_finish_auth");
+ return frame_assembler->read_frame_payload();
+ }).then([this, txbuf=std::move(records.txbuf)](auto payload) {
+ // handle_auth_signature() logic
+ auto sig_frame = AuthSignatureFrame::Decode(payload->back());
+ logger().debug("{} GOT AuthSignatureFrame: signature={}", conn, sig_frame.signature());
+
+ const auto actual_tx_sig = auth_meta->session_key.empty() ?
+ sha256_digest_t() : auth_meta->session_key.hmac_sha256(nullptr, txbuf);
+ if (sig_frame.signature() != actual_tx_sig) {
+ logger().warn("{} pre-auth signature mismatch actual_tx_sig={}"
+ " sig_frame.signature()={}",
+ conn, actual_tx_sig, sig_frame.signature());
+ abort_in_fault();
+ }
+ });
+}
+
+// ESTABLISHING
+
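+// Drives the ESTABLISHING state on the server side: registers this connection
+// (closing the existing one if this is a replace), switches the io_handler to
+// the socket's shard via dispatch_accept(), sends ServerIdentFrame, and then
+// enters READY.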
+void ProtocolV2::execute_establishing(SocketConnectionRef existing_conn) {
+ auto accept_me = [this] {
+ messenger.register_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ messenger.unaccept_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ };
+
+ ceph_assert_always(is_socket_valid);
+ trigger_state(state_t::ESTABLISHING, io_state_t::delay);
+ bool is_replace;
+ if (existing_conn) {
+ logger().info("{} start establishing: gs={}, pgs={}, cs={}, "
+ "client_cookie={}, server_cookie={}, {}, new_sid={}, "
+ "close existing {}",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie,
+ io_states, frame_assembler->get_socket_shard_id(),
+ *existing_conn);
+ is_replace = true;
+ ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>(
+ existing_conn->protocol.get());
+ existing_proto->do_close(
+ true, // is_dispatch_reset
+ std::move(accept_me));
+ if (unlikely(state != state_t::ESTABLISHING)) {
+ logger().warn("{} triggered {} during execute_establishing(), "
+ "the accept event will not be delivered!",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ } else {
+ logger().info("{} start establishing: gs={}, pgs={}, cs={}, "
+ "client_cookie={}, server_cookie={}, {}, new_sid={}, "
+ "no existing",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie, io_states,
+ frame_assembler->get_socket_shard_id());
+ is_replace = false;
+ accept_me();
+ }
+
+ gated_execute("execute_establishing", conn, [this, is_replace] {
+ ceph_assert_always(state == state_t::ESTABLISHING);
+
+ // set io_handler to a new shard
+ auto cc_seq = crosscore.prepare_submit();
+ // there are 2 hops with dispatch_accept()
+ crosscore.prepare_submit();
+ auto new_io_shard = frame_assembler->get_socket_shard_id();
+ logger().debug("{} send {} IOHandler::dispatch_accept({})",
+ conn, cc_seq, new_io_shard);
+ ConnectionFRef conn_fref = seastar::make_foreign(
+ conn.shared_from_this());
+ ceph_assert_always(!pr_switch_io_shard.has_value());
+ pr_switch_io_shard = seastar::shared_promise<>();
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(),
+ [this, cc_seq, new_io_shard, is_replace,
+ conn_fref=std::move(conn_fref)]() mutable {
+ return io_handler.dispatch_accept(
+ cc_seq, new_io_shard, std::move(conn_fref), is_replace);
+ }).then([this, new_io_shard] {
+ ceph_assert_always(io_handler.get_shard_id() == new_io_shard);
+ pr_switch_io_shard->set_value();
+ pr_switch_io_shard = std::nullopt;
+ // user can make changes
+
+ if (unlikely(state != state_t::ESTABLISHING)) {
+ logger().debug("{} triggered {} after dispatch_accept() during execute_establishing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+
+ return send_server_ident();
+ }).then([this] {
+ if (unlikely(state != state_t::ESTABLISHING)) {
+ logger().debug("{} triggered {} at the end of execute_establishing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} established, going to ready", conn);
+ execute_ready();
+ }).handle_exception([this](std::exception_ptr eptr) {
+ fault(state_t::ESTABLISHING, "execute_establishing", eptr);
+ });
+ });
+}
+
+// ESTABLISHING or REPLACING state
+
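+// Resets the peer's message state, generates a non-zero server_cookie for
+// lossless policies, and writes a ServerIdentFrame carrying our addresses,
+// gid, global_seq, features and flags.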
+seastar::future<>
+ProtocolV2::send_server_ident()
+{
+ ceph_assert_always(state == state_t::ESTABLISHING ||
+ state == state_t::REPLACING);
+ // send_server_ident() logic
+
+ // referred to async-conn v2: not assign gs to global_seq
+ global_seq = messenger.get_global_seq();
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} UPDATE: gs={} for server ident, "
+ "send {} IOHandler::reset_peer_state()",
+ conn, global_seq, cc_seq);
+
+ // this is required for the case when this connection is being replaced
+ io_states.reset_peer_state();
+ gate.dispatch_in_background(
+ "reset_peer_state", conn, [this, cc_seq] {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(), [this, cc_seq] {
+ return io_handler.reset_peer_state(cc_seq);
+ });
+ });
+
+ if (!conn.policy.lossy) {
+ server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+ }
+
+ uint64_t flags = 0;
+ if (conn.policy.lossy) {
+ flags = flags | CEPH_MSG_CONNECT_LOSSY;
+ }
+
+ auto server_ident = ServerIdentFrame::Encode(
+ messenger.get_myaddrs(),
+ messenger.get_myname().num(),
+ global_seq,
+ conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required,
+ flags,
+ server_cookie);
+
+ logger().debug("{} WRITE ServerIdentFrame: addrs={}, gid={},"
+ " gs={}, features_supported={}, features_required={},"
+ " flags={}, cookie={}",
+ conn, messenger.get_myaddrs(), messenger.get_myname().num(),
+ global_seq, conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required,
+ flags, server_cookie);
+
+ return frame_assembler->write_flush_frame(server_ident);
+}
+
+// REPLACING state
+
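+// Called on the existing connection when an accepting connection takes it
+// over: waits for in-flight IO to exit, moves the io_handler to the new
+// socket's shard, adopts the new socket and auth state, then either
+// acknowledges the reconnect with ReconnectOkFrame or resends ServerIdentFrame
+// before returning to READY.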
+void ProtocolV2::trigger_replacing(bool reconnect,
+ bool do_reset,
+ FrameAssemblerV2::mover_t &&mover,
+ AuthConnectionMetaRef&& new_auth_meta,
+ uint64_t new_peer_global_seq,
+ uint64_t new_client_cookie,
+ entity_name_t new_peer_name,
+ uint64_t new_conn_features,
+ uint64_t new_peer_supported_features,
+ uint64_t new_connect_seq,
+ uint64_t new_msg_seq)
+{
+ ceph_assert_always(state >= state_t::ESTABLISHING);
+ ceph_assert_always(state <= state_t::WAIT);
+ ceph_assert_always(has_socket || state == state_t::CONNECTING);
+ // mover.socket shouldn't be shutdown
+
+ logger().info("{} start replacing ({}): pgs was {}, cs was {}, "
+ "client_cookie was {}, {}, new_sid={}",
+ conn, reconnect ? "reconnected" : "connected",
+ peer_global_seq, connect_seq, client_cookie,
+ io_states, mover.socket->get_shard_id());
+ if (is_socket_valid) {
+ frame_assembler->shutdown_socket<true>(&gate);
+ is_socket_valid = false;
+ }
+ trigger_state_phase1(state_t::REPLACING);
+ gate.dispatch_in_background(
+ "trigger_replacing",
+ conn,
+ [this,
+ reconnect,
+ do_reset,
+ mover = std::move(mover),
+ new_auth_meta = std::move(new_auth_meta),
+ new_client_cookie, new_peer_name,
+ new_conn_features, new_peer_supported_features,
+ new_peer_global_seq,
+ new_connect_seq, new_msg_seq] () mutable {
+ ceph_assert_always(state == state_t::REPLACING);
+ auto new_io_shard = mover.socket->get_shard_id();
+ // state may become CLOSING below, but we cannot abort the chain until
+ // mover.socket is correctly handled (closed or replaced).
+
+ // this is preemptive
+ return wait_switch_io_shard(
+ ).then([this] {
+ if (unlikely(state != state_t::REPLACING)) {
+ ceph_assert_always(state == state_t::CLOSING);
+ return seastar::now();
+ }
+
+ trigger_state_phase2(state_t::REPLACING, io_state_t::delay);
+ return wait_exit_io();
+ }).then([this] {
+ if (unlikely(state != state_t::REPLACING)) {
+ ceph_assert_always(state == state_t::CLOSING);
+ return seastar::now();
+ }
+
+ ceph_assert_always(frame_assembler);
+ protocol_timer.cancel();
+ auto done = std::move(execution_done);
+ execution_done = seastar::now();
+ return done;
+ }).then([this, new_io_shard] {
+ if (unlikely(state != state_t::REPLACING)) {
+ ceph_assert_always(state == state_t::CLOSING);
+ return seastar::now();
+ }
+
+ // set io_handler to a new shard
+ // we should prevent parallel core-switching attempts
+ auto cc_seq = crosscore.prepare_submit();
+ // there are 2 hops with dispatch_accept()
+ crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::dispatch_accept({})",
+ conn, cc_seq, new_io_shard);
+ ConnectionFRef conn_fref = seastar::make_foreign(
+ conn.shared_from_this());
+ ceph_assert_always(!pr_switch_io_shard.has_value());
+ pr_switch_io_shard = seastar::shared_promise<>();
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(),
+ [this, cc_seq, new_io_shard,
+ conn_fref=std::move(conn_fref)]() mutable {
+ return io_handler.dispatch_accept(
+ cc_seq, new_io_shard, std::move(conn_fref), false);
+ }).then([this, new_io_shard] {
+ ceph_assert_always(io_handler.get_shard_id() == new_io_shard);
+ pr_switch_io_shard->set_value();
+ pr_switch_io_shard = std::nullopt;
+ // user can make changes
+ });
+ }).then([this,
+ reconnect,
+ do_reset,
+ mover = std::move(mover),
+ new_auth_meta = std::move(new_auth_meta),
+ new_client_cookie, new_peer_name,
+ new_conn_features, new_peer_supported_features,
+ new_peer_global_seq,
+ new_connect_seq, new_msg_seq] () mutable {
+ if (state == state_t::REPLACING && do_reset) {
+ reset_session(true);
+ // user can make changes
+ }
+
+ if (unlikely(state != state_t::REPLACING)) {
+ logger().debug("{} triggered {} in the middle of trigger_replacing(), abort",
+ conn, get_state_name(state));
+ ceph_assert_always(state == state_t::CLOSING);
+ return mover.socket->close(
+ ).then([sock = std::move(mover.socket)] {
+ abort_protocol();
+ });
+ }
+
+ auth_meta = std::move(new_auth_meta);
+ peer_global_seq = new_peer_global_seq;
+ gate.dispatch_in_background(
+ "replace_frame_assembler",
+ conn,
+ [this, mover=std::move(mover)]() mutable {
+ return frame_assembler->replace_by(std::move(mover));
+ }
+ );
+ is_socket_valid = true;
+ has_socket = true;
+
+ if (reconnect) {
+ connect_seq = new_connect_seq;
+ // send_reconnect_ok() logic
+
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::requeue_out_sent_up_to({})",
+ conn, cc_seq, new_msg_seq);
+ io_states.requeue_out_sent_up_to();
+ gate.dispatch_in_background(
+ "requeue_out_replacing", conn, [this, cc_seq, new_msg_seq] {
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(), [this, cc_seq, new_msg_seq] {
+ return io_handler.requeue_out_sent_up_to(cc_seq, new_msg_seq);
+ });
+ });
+
+ auto reconnect_ok = ReconnectOkFrame::Encode(io_states.in_seq);
+ logger().debug("{} WRITE ReconnectOkFrame: msg_seq={}", conn, io_states.in_seq);
+ return frame_assembler->write_flush_frame(reconnect_ok);
+ } else {
+ client_cookie = new_client_cookie;
+ assert(conn.get_peer_type() == new_peer_name.type());
+ if (conn.get_peer_id() == entity_name_t::NEW) {
+ conn.set_peer_id(new_peer_name.num());
+ }
+ conn.set_features(new_conn_features);
+ peer_supported_features = new_peer_supported_features;
+ bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+ frame_assembler->set_is_rev1(is_rev1);
+ return send_server_ident();
+ }
+ }).then([this, reconnect] {
+ if (unlikely(state != state_t::REPLACING)) {
+ logger().debug("{} triggered {} at the end of trigger_replacing(), abort",
+ conn, get_state_name(state));
+ ceph_assert_always(state == state_t::CLOSING);
+ abort_protocol();
+ }
+ logger().info("{} replaced ({}), going to ready: "
+ "gs={}, pgs={}, cs={}, "
+ "client_cookie={}, server_cookie={}, {}",
+ conn, reconnect ? "reconnected" : "connected",
+ global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie, io_states);
+ execute_ready();
+ }).handle_exception([this](std::exception_ptr eptr) {
+ fault(state_t::REPLACING, "trigger_replacing", eptr);
+ });
+ });
+}
+
+// READY state
+
+seastar::future<> ProtocolV2::notify_out_fault(
+ crosscore_t::seq_t cc_seq,
+ const char *where,
+ std::exception_ptr eptr,
+ io_handler_state _io_states)
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} notify_out_fault(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq, where, eptr, _io_states] {
+ return notify_out_fault(cc_seq, where, eptr, _io_states);
+ });
+ }
+
+ io_states = _io_states;
+ logger().debug("{} got {} notify_out_fault(): io_states={}",
+ conn, cc_seq, io_states);
+ fault(state_t::READY, where, eptr);
+ return seastar::now();
+}
+
+void ProtocolV2::execute_ready()
+{
+ assert(conn.policy.lossy || (client_cookie != 0 && server_cookie != 0));
+ protocol_timer.cancel();
+ ceph_assert_always(is_socket_valid);
+ // I'm not responsible for shutting down the socket at READY
+ is_socket_valid = false;
+ trigger_state(state_t::READY, io_state_t::open);
+#ifdef UNIT_TESTS_BUILT
+ if (conn.interceptor) {
+ // FIXME: doesn't support cross-core
+ conn.interceptor->register_conn_ready(
+ conn.get_local_shared_foreign_from_this());
+ }
+#endif
+}
+
+// STANDBY state
+
+void ProtocolV2::execute_standby()
+{
+ ceph_assert_always(!is_socket_valid);
+ trigger_state(state_t::STANDBY, io_state_t::delay);
+}
+
+seastar::future<> ProtocolV2::notify_out(
+ crosscore_t::seq_t cc_seq)
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} notify_out(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq] {
+ return notify_out(cc_seq);
+ });
+ }
+
+ logger().debug("{} got {} notify_out(): at {}",
+ conn, cc_seq, get_state_name(state));
+ io_states.is_out_queued = true;
+ if (unlikely(state == state_t::STANDBY && !conn.policy.server)) {
+ logger().info("{} notify_out(): at {}, going to CONNECTING",
+ conn, get_state_name(state));
+ execute_connecting();
+ }
+ return seastar::now();
+}
+
+// WAIT state
+
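+// Backs off before reconnecting: uses ms_max_backoff when max_backoff is set,
+// otherwise doubles the previous backoff (starting from ms_initial_backoff and
+// capped at ms_max_backoff), then re-enters CONNECTING.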
+void ProtocolV2::execute_wait(bool max_backoff)
+{
+ ceph_assert_always(!is_socket_valid);
+ trigger_state(state_t::WAIT, io_state_t::delay);
+ gated_execute("execute_wait", conn, [this, max_backoff] {
+ double backoff = protocol_timer.last_dur();
+ if (max_backoff) {
+ backoff = local_conf().get_val<double>("ms_max_backoff");
+ } else if (backoff > 0) {
+ backoff = std::min(local_conf().get_val<double>("ms_max_backoff"), 2 * backoff);
+ } else {
+ backoff = local_conf().get_val<double>("ms_initial_backoff");
+ }
+ return protocol_timer.backoff(backoff).then([this] {
+ if (unlikely(state != state_t::WAIT)) {
+ logger().debug("{} triggered {} at the end of execute_wait()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} execute_wait(): going to CONNECTING", conn);
+ execute_connecting();
+ }).handle_exception([this](std::exception_ptr eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+ logger().info("{} execute_wait(): protocol aborted at {} -- {}",
+ conn, get_state_name(state), e_what);
+ assert(state == state_t::REPLACING ||
+ state == state_t::CLOSING);
+ });
+ });
+}
+
+// SERVER_WAIT state
+
+void ProtocolV2::execute_server_wait()
+{
+ ceph_assert_always(is_socket_valid);
+ trigger_state(state_t::SERVER_WAIT, io_state_t::none);
+ gated_execute("execute_server_wait", conn, [this] {
+ return frame_assembler->read_exactly(1
+ ).then([this](auto bptr) {
+ logger().warn("{} SERVER_WAIT got read, abort", conn);
+ abort_in_fault();
+ }).handle_exception([this](std::exception_ptr eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+ logger().info("{} execute_server_wait(): fault at {}, going to CLOSING -- {}",
+ conn, get_state_name(state), e_what);
+ do_close(false);
+ });
+ });
+}
+
+// CLOSING state
+
+seastar::future<> ProtocolV2::notify_mark_down(
+ crosscore_t::seq_t cc_seq)
+{
+ assert(seastar::this_shard_id() == conn.get_messenger_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} notify_mark_down(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq] {
+ return notify_mark_down(cc_seq);
+ });
+ }
+
+ logger().debug("{} got {} notify_mark_down()",
+ conn, cc_seq);
+ do_close(false);
+ return seastar::now();
+}
+
+seastar::future<> ProtocolV2::close_clean_yielded()
+{
+ // yield() so that do_close() can be called *after* close_clean_yielded() is
+ // applied to all connections in a container using
+ // seastar::parallel_for_each(). otherwise, we could erase a connection in
+ // the container while seastar::parallel_for_each() is still iterating over it.
+ // that'd lead to a segfault.
+ return seastar::yield(
+ ).then([this] {
+ do_close(false);
+ return pr_closed_clean.get_shared_future();
+
+    // the connection may already be unreferenced from the messenger,
+    // so we need to hold an additional reference here.
+  }).finally([conn_ref = conn.shared_from_this()] {});
+}
+
+void ProtocolV2::do_close(
+ bool is_dispatch_reset,
+ std::optional<std::function<void()>> f_accept_new)
+{
+ if (state == state_t::CLOSING) {
+ // already closing
+ return;
+ }
+
+ bool is_replace = f_accept_new ? true : false;
+ logger().info("{} closing: reset {}, replace {}", conn,
+ is_dispatch_reset ? "yes" : "no",
+ is_replace ? "yes" : "no");
+
+ /*
+ * atomic operations
+ */
+
+ ceph_assert_always(!gate.is_closed());
+
+ // messenger registrations, must before user events
+ messenger.closing_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ if (state == state_t::ACCEPTING || state == state_t::SERVER_WAIT) {
+ messenger.unaccept_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ } else if (state >= state_t::ESTABLISHING && state < state_t::CLOSING) {
+ messenger.unregister_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ } else {
+ // cannot happen
+ ceph_assert(false);
+ }
+ if (f_accept_new) {
+    // the replacing connection must be registered after the replaced
+    // connection is unregistered.
+ (*f_accept_new)();
+ }
+
+ protocol_timer.cancel();
+ if (is_socket_valid) {
+ frame_assembler->shutdown_socket<true>(&gate);
+ is_socket_valid = false;
+ }
+
+ trigger_state_phase1(state_t::CLOSING);
+ gate.dispatch_in_background(
+ "close_io", conn, [this, is_dispatch_reset, is_replace] {
+ // this is preemptive
+ return wait_switch_io_shard(
+ ).then([this, is_dispatch_reset, is_replace] {
+ trigger_state_phase2(state_t::CLOSING, io_state_t::drop);
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} IOHandler::close_io(reset={}, replace={})",
+ conn, cc_seq, is_dispatch_reset, is_replace);
+
+ std::ignore = gate.close(
+ ).then([this] {
+ ceph_assert_always(!need_exit_io);
+ ceph_assert_always(!pr_exit_io.has_value());
+ if (has_socket) {
+ ceph_assert_always(frame_assembler);
+ return frame_assembler->close_shutdown_socket();
+ } else {
+ return seastar::now();
+ }
+ }).then([this] {
+ logger().debug("{} closed!", conn);
+ messenger.closed_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ pr_closed_clean.set_value();
+#ifdef UNIT_TESTS_BUILT
+ closed_clean = true;
+ if (conn.interceptor) {
+ conn.interceptor->register_conn_closed(
+ conn.get_local_shared_foreign_from_this());
+ }
+#endif
+        // the connection is unreferenced from the messenger,
+        // so we need to hold an additional reference here.
+ }).handle_exception([conn_ref = conn.shared_from_this(), this] (auto eptr) {
+ logger().error("{} closing got unexpected exception {}",
+ conn, eptr);
+ ceph_abort();
+ });
+
+ return seastar::smp::submit_to(
+ io_handler.get_shard_id(),
+ [this, cc_seq, is_dispatch_reset, is_replace] {
+ return io_handler.close_io(cc_seq, is_dispatch_reset, is_replace);
+ });
+ // user can make changes
+ });
+ });
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/ProtocolV2.h b/src/crimson/net/ProtocolV2.h
new file mode 100644
index 000000000..dd7a1e703
--- /dev/null
+++ b/src/crimson/net/ProtocolV2.h
@@ -0,0 +1,328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "io_handler.h"
+
+namespace crimson::net {
+
+class ProtocolV2 final : public HandshakeListener {
+ using AuthConnectionMetaRef = seastar::lw_shared_ptr<AuthConnectionMeta>;
+
+public:
+ ProtocolV2(SocketConnection &,
+ IOHandler &);
+
+ ~ProtocolV2() final;
+
+ ProtocolV2(const ProtocolV2 &) = delete;
+ ProtocolV2(ProtocolV2 &&) = delete;
+ ProtocolV2 &operator=(const ProtocolV2 &) = delete;
+ ProtocolV2 &operator=(ProtocolV2 &&) = delete;
+
+/**
+ * as HandshakeListener
+ */
+private:
+ seastar::future<> notify_out(
+ crosscore_t::seq_t cc_seq) final;
+
+ seastar::future<> notify_out_fault(
+ crosscore_t::seq_t cc_seq,
+ const char *where,
+ std::exception_ptr,
+ io_handler_state) final;
+
+ seastar::future<> notify_mark_down(
+ crosscore_t::seq_t cc_seq) final;
+
+/*
+ * as ProtocolV2 to be called by SocketConnection
+ */
+public:
+ void start_connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name);
+
+ void start_accept(SocketFRef&& socket,
+ const entity_addr_t& peer_addr);
+
+ seastar::future<> close_clean_yielded();
+
+#ifdef UNIT_TESTS_BUILT
+ bool is_ready() const {
+ return state == state_t::READY;
+ }
+
+ bool is_standby() const {
+ return state == state_t::STANDBY;
+ }
+
+ bool is_closed_clean() const {
+ return closed_clean;
+ }
+
+ bool is_closed() const {
+ return state == state_t::CLOSING;
+ }
+
+#endif
+private:
+ using io_state_t = IOHandler::io_state_t;
+
+ seastar::future<> wait_switch_io_shard() {
+ if (pr_switch_io_shard.has_value()) {
+ return pr_switch_io_shard->get_shared_future();
+ } else {
+ return seastar::now();
+ }
+ }
+
+ seastar::future<> wait_exit_io() {
+ if (pr_exit_io.has_value()) {
+ return pr_exit_io->get_shared_future();
+ } else {
+ assert(!need_exit_io);
+ return seastar::now();
+ }
+ }
+
+ enum class state_t {
+ NONE = 0,
+ ACCEPTING,
+ SERVER_WAIT,
+ ESTABLISHING,
+ CONNECTING,
+ READY,
+ STANDBY,
+ WAIT,
+ REPLACING,
+ CLOSING
+ };
+
+ static const char *get_state_name(state_t state) {
+ const char *const statenames[] = {"NONE",
+ "ACCEPTING",
+ "SERVER_WAIT",
+ "ESTABLISHING",
+ "CONNECTING",
+ "READY",
+ "STANDBY",
+ "WAIT",
+ "REPLACING",
+ "CLOSING"};
+ return statenames[static_cast<int>(state)];
+ }
+
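+  // State transitions are split into two phases: phase1 does the
+  // messenger-core bookkeeping, and phase2 additionally applies the new
+  // io_state. trigger_state() asserts that no IO-shard switch is pending and
+  // runs both phases back to back, while do_close() uses the split form so it
+  // can wait for an in-flight IO-shard switch in between (see ProtocolV2.cc).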
+ void trigger_state_phase1(state_t new_state);
+
+ void trigger_state_phase2(state_t new_state, io_state_t new_io_state);
+
+ void trigger_state(state_t new_state, io_state_t new_io_state) {
+ ceph_assert_always(!pr_switch_io_shard.has_value());
+ trigger_state_phase1(new_state);
+ trigger_state_phase2(new_state, new_io_state);
+ }
+
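+  // Run func in the background under the connection gate, tracking its
+  // completion in execution_done. If a previous execution has not finished
+  // yet, its unready future is handed off to the gate
+  // ("gated_execute_abandon") instead of being discarded, and a fresh promise
+  // is installed for the new execution. A minimal usage sketch
+  // (do_something_async() is hypothetical):
+  //   gated_execute("execute_foo", conn, [this] {
+  //     return do_something_async();
+  //   });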
+ template <typename Func, typename T>
+ void gated_execute(const char *what, T &who, Func &&func) {
+ gate.dispatch_in_background(what, who, [this, &who, &func] {
+ if (!execution_done.available()) {
+ // discard the unready future
+ gate.dispatch_in_background(
+ "gated_execute_abandon",
+ who,
+ [fut=std::move(execution_done)]() mutable {
+ return std::move(fut);
+ }
+ );
+ }
+ seastar::promise<> pr;
+ execution_done = pr.get_future();
+ return seastar::futurize_invoke(std::forward<Func>(func)
+ ).finally([pr=std::move(pr)]() mutable {
+ pr.set_value();
+ });
+ });
+ }
+
+ void fault(state_t expected_state,
+ const char *where,
+ std::exception_ptr eptr);
+
+ void reset_session(bool is_full);
+ seastar::future<std::tuple<entity_type_t, entity_addr_t>>
+ banner_exchange(bool is_connect);
+
+ enum class next_step_t {
+ ready,
+ wait,
+ none, // protocol should have been aborted or failed
+ };
+
+ // CONNECTING (client)
+ seastar::future<> handle_auth_reply();
+ inline seastar::future<> client_auth() {
+ std::vector<uint32_t> empty;
+ return client_auth(empty);
+ }
+ seastar::future<> client_auth(std::vector<uint32_t> &allowed_methods);
+
+ seastar::future<next_step_t> process_wait();
+ seastar::future<next_step_t> client_connect();
+ seastar::future<next_step_t> client_reconnect();
+ void execute_connecting();
+
+ // ACCEPTING (server)
+ seastar::future<> _auth_bad_method(int r);
+ seastar::future<> _handle_auth_request(bufferlist& auth_payload, bool more);
+ seastar::future<> server_auth();
+
+ bool validate_peer_name(const entity_name_t& peer_name) const;
+ seastar::future<next_step_t> send_wait();
+ seastar::future<next_step_t> reuse_connection(ProtocolV2* existing_proto,
+ bool do_reset=false,
+ bool reconnect=false,
+ uint64_t conn_seq=0,
+ uint64_t msg_seq=0);
+
+ seastar::future<next_step_t> handle_existing_connection(SocketConnectionRef existing_conn);
+ seastar::future<next_step_t> server_connect();
+
+ seastar::future<next_step_t> read_reconnect();
+ seastar::future<next_step_t> send_retry(uint64_t connect_seq);
+ seastar::future<next_step_t> send_retry_global(uint64_t global_seq);
+ seastar::future<next_step_t> send_reset(bool full);
+ seastar::future<next_step_t> server_reconnect();
+
+ void execute_accepting();
+
+ // CONNECTING/ACCEPTING
+ seastar::future<> finish_auth();
+
+ // ESTABLISHING
+ void execute_establishing(SocketConnectionRef existing_conn);
+
+ // ESTABLISHING/REPLACING (server)
+ seastar::future<> send_server_ident();
+
+ // REPLACING (server)
+ void trigger_replacing(bool reconnect,
+ bool do_reset,
+ FrameAssemblerV2::mover_t &&mover,
+ AuthConnectionMetaRef&& new_auth_meta,
+ uint64_t new_peer_global_seq,
+ // !reconnect
+ uint64_t new_client_cookie,
+ entity_name_t new_peer_name,
+ uint64_t new_conn_features,
+ uint64_t new_peer_supported_features,
+ // reconnect
+ uint64_t new_connect_seq,
+ uint64_t new_msg_seq);
+
+ // READY
+ void execute_ready();
+
+ // STANDBY
+ void execute_standby();
+
+ // WAIT
+ void execute_wait(bool max_backoff);
+
+ // SERVER_WAIT
+ void execute_server_wait();
+
+ // CLOSING
+ // reentrant
+ void do_close(bool is_dispatch_reset,
+ std::optional<std::function<void()>> f_accept_new=std::nullopt);
+
+private:
+ SocketConnection &conn;
+
+ SocketMessenger &messenger;
+
+ IOHandler &io_handler;
+
+ // asynchronously populated from io_handler
+ io_handler_state io_states;
+
+ crosscore_t crosscore;
+
+ bool has_socket = false;
+
+ // the socket exists and it is not shutdown
+ bool is_socket_valid = false;
+
+ FrameAssemblerV2Ref frame_assembler;
+
+ bool need_notify_out = false;
+
+ std::optional<seastar::shared_promise<>> pr_switch_io_shard;
+
+ bool need_exit_io = false;
+
+ std::optional<seastar::shared_promise<>> pr_exit_io;
+
+ AuthConnectionMetaRef auth_meta;
+
+ crimson::common::Gated gate;
+
+ seastar::shared_promise<> pr_closed_clean;
+
+#ifdef UNIT_TESTS_BUILT
+ bool closed_clean = false;
+
+#endif
+ state_t state = state_t::NONE;
+
+ uint64_t peer_supported_features = 0;
+
+ uint64_t client_cookie = 0;
+ uint64_t server_cookie = 0;
+ uint64_t global_seq = 0;
+ uint64_t peer_global_seq = 0;
+ uint64_t connect_seq = 0;
+
+ seastar::future<> execution_done = seastar::now();
+
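+  // Backoff timer used by execute_wait(): backoff(seconds) is expected to
+  // sleep for roughly the given duration (abortable via the embedded
+  // abort_source) and record it as last_dur_; cancel() aborts any pending
+  // sleep and resets the recorded duration to zero.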
+ class Timer {
+ double last_dur_ = 0.0;
+ const SocketConnection& conn;
+ std::optional<seastar::abort_source> as;
+ public:
+ Timer(SocketConnection& conn) : conn(conn) {}
+ double last_dur() const { return last_dur_; }
+ seastar::future<> backoff(double seconds);
+ void cancel() {
+ last_dur_ = 0.0;
+ if (as) {
+ as->request_abort();
+ as = std::nullopt;
+ }
+ }
+ };
+ Timer protocol_timer;
+};
+
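+// Factory wiring a connection's two halves together: the IOHandler (exposed
+// as ConnectionHandler for I/O-core use) and the ProtocolV2 state machine,
+// with the protocol registered as the IOHandler's handshake listener.
+// SocketConnection's constructor consumes the returned pair.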
+struct create_handlers_ret {
+ std::unique_ptr<ConnectionHandler> io_handler;
+ std::unique_ptr<ProtocolV2> protocol;
+};
+inline create_handlers_ret create_handlers(ChainedDispatchers &dispatchers, SocketConnection &conn) {
+ std::unique_ptr<ConnectionHandler> io_handler = std::make_unique<IOHandler>(dispatchers, conn);
+ IOHandler &io_handler_concrete = static_cast<IOHandler&>(*io_handler);
+ auto protocol = std::make_unique<ProtocolV2>(conn, io_handler_concrete);
+ io_handler_concrete.set_handshake_listener(*protocol);
+ return {std::move(io_handler), std::move(protocol)};
+}
+
+} // namespace crimson::net
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::net::ProtocolV2> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/net/Socket.cc b/src/crimson/net/Socket.cc
new file mode 100644
index 000000000..95b1e2250
--- /dev/null
+++ b/src/crimson/net/Socket.cc
@@ -0,0 +1,523 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Socket.h"
+
+#include <seastar/core/sleep.hh>
+#include <seastar/core/when_all.hh>
+#include <seastar/net/packet.hh>
+
+#include "crimson/common/log.h"
+#include "Errors.h"
+
+using crimson::common::local_conf;
+
+namespace crimson::net {
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+using tmp_buf = seastar::temporary_buffer<char>;
+using packet = seastar::net::packet;
+
+// an input_stream consumer that reads buffer segments into a bufferlist up to
+// the given number of remaining bytes
+struct bufferlist_consumer {
+ bufferlist& bl;
+ size_t& remaining;
+
+ bufferlist_consumer(bufferlist& bl, size_t& remaining)
+ : bl(bl), remaining(remaining) {}
+
+ using consumption_result_type = typename seastar::input_stream<char>::consumption_result_type;
+
+ // consume some or all of a buffer segment
+ seastar::future<consumption_result_type> operator()(tmp_buf&& data) {
+ if (remaining >= data.size()) {
+ // consume the whole buffer
+ remaining -= data.size();
+ bl.append(buffer::create(std::move(data)));
+ if (remaining > 0) {
+        // return continue_consuming to request more segments
+ return seastar::make_ready_future<consumption_result_type>(
+ seastar::continue_consuming{});
+ } else {
+        // return an empty buffer to signal that we're done
+ return seastar::make_ready_future<consumption_result_type>(
+ consumption_result_type::stop_consuming_type({}));
+ }
+ }
+ if (remaining > 0) {
+ // consume the front
+ bl.append(buffer::create(data.share(0, remaining)));
+ data.trim_front(remaining);
+ remaining = 0;
+ }
+ // give the rest back to signal that we're done
+ return seastar::make_ready_future<consumption_result_type>(
+ consumption_result_type::stop_consuming_type{std::move(data)});
+ };
+};
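+
+// A minimal usage sketch (as in Socket::read() below): the consumer is driven
+// by input_stream::consume() and stops once `remaining` bytes have been
+// appended to `bl`:
+//   size_t remaining = bytes;
+//   bufferlist bl;
+//   in.consume(bufferlist_consumer{bl, remaining});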
+
+seastar::future<> inject_delay()
+{
+ if (float delay_period = local_conf()->ms_inject_internal_delays;
+ delay_period) {
+ logger().debug("Socket::inject_delay: sleep for {}", delay_period);
+ return seastar::sleep(
+ std::chrono::milliseconds((int)(delay_period * 1000.0)));
+ }
+ return seastar::now();
+}
+
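+// Throws a negotiation_failure with probability of roughly
+// 1/ms_inject_socket_failures: a random value in [1, RAND_MAX] is drawn and
+// the failure fires when it is divisible by the configured value.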
+void inject_failure()
+{
+ if (local_conf()->ms_inject_socket_failures) {
+ uint64_t rand =
+ ceph::util::generate_random_number<uint64_t>(1, RAND_MAX);
+ if (rand % local_conf()->ms_inject_socket_failures == 0) {
+ logger().warn("Socket::inject_failure: injecting socket failure");
+ throw std::system_error(make_error_code(
+ error::negotiation_failure));
+ }
+ }
+}
+
+} // anonymous namespace
+
+Socket::Socket(
+ seastar::connected_socket &&_socket,
+ side_t _side,
+ uint16_t e_port,
+ construct_tag)
+ : sid{seastar::this_shard_id()},
+ socket(std::move(_socket)),
+ in(socket.input()),
+    // the default buffer size of 8192 is too small and may hurt our write
+    // performance. see seastar::net::connected_socket::output()
+ out(socket.output(65536)),
+ socket_is_shutdown(false),
+ side(_side),
+ ephemeral_port(e_port)
+{
+ if (local_conf()->ms_tcp_nodelay) {
+ socket.set_nodelay(true);
+ }
+}
+
+Socket::~Socket()
+{
+ assert(seastar::this_shard_id() == sid);
+#ifndef NDEBUG
+ assert(closed);
+#endif
+}
+
+seastar::future<bufferlist>
+Socket::read(size_t bytes)
+{
+ assert(seastar::this_shard_id() == sid);
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_read).then([bytes, this] {
+#endif
+ if (bytes == 0) {
+ return seastar::make_ready_future<bufferlist>();
+ }
+ r.buffer.clear();
+ r.remaining = bytes;
+ return in.consume(bufferlist_consumer{r.buffer, r.remaining}).then([this] {
+ if (r.remaining) { // throw on short reads
+ throw std::system_error(make_error_code(error::read_eof));
+ }
+ inject_failure();
+ return inject_delay().then([this] {
+ return seastar::make_ready_future<bufferlist>(std::move(r.buffer));
+ });
+ });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this](auto buf) {
+ return try_trap_post(next_trap_read
+ ).then([buf = std::move(buf)]() mutable {
+ return std::move(buf);
+ });
+ });
+#endif
+}
+
+seastar::future<bufferptr>
+Socket::read_exactly(size_t bytes) {
+ assert(seastar::this_shard_id() == sid);
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_read).then([bytes, this] {
+#endif
+ if (bytes == 0) {
+ return seastar::make_ready_future<bufferptr>();
+ }
+ return in.read_exactly(bytes).then([bytes](auto buf) {
+ bufferptr ptr(buffer::create(buf.share()));
+ if (ptr.length() < bytes) {
+ throw std::system_error(make_error_code(error::read_eof));
+ }
+ inject_failure();
+ return inject_delay(
+ ).then([ptr = std::move(ptr)]() mutable {
+ return seastar::make_ready_future<bufferptr>(std::move(ptr));
+ });
+ });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this](auto ptr) {
+ return try_trap_post(next_trap_read
+ ).then([ptr = std::move(ptr)]() mutable {
+ return std::move(ptr);
+ });
+ });
+#endif
+}
+
+seastar::future<>
+Socket::write(bufferlist buf)
+{
+ assert(seastar::this_shard_id() == sid);
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_write
+ ).then([buf = std::move(buf), this]() mutable {
+#endif
+ inject_failure();
+ return inject_delay(
+ ).then([buf = std::move(buf), this]() mutable {
+ packet p(std::move(buf));
+ return out.write(std::move(p));
+ });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this] {
+ return try_trap_post(next_trap_write);
+ });
+#endif
+}
+
+seastar::future<>
+Socket::flush()
+{
+ assert(seastar::this_shard_id() == sid);
+ inject_failure();
+ return inject_delay().then([this] {
+ return out.flush();
+ });
+}
+
+seastar::future<>
+Socket::write_flush(bufferlist buf)
+{
+ assert(seastar::this_shard_id() == sid);
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_write
+ ).then([buf = std::move(buf), this]() mutable {
+#endif
+ inject_failure();
+ return inject_delay(
+ ).then([buf = std::move(buf), this]() mutable {
+ packet p(std::move(buf));
+ return out.write(std::move(p)
+ ).then([this] {
+ return out.flush();
+ });
+ });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this] {
+ return try_trap_post(next_trap_write);
+ });
+#endif
+}
+
+void Socket::shutdown()
+{
+ assert(seastar::this_shard_id() == sid);
+ socket_is_shutdown = true;
+ socket.shutdown_input();
+ socket.shutdown_output();
+}
+
+static inline seastar::future<>
+close_and_handle_errors(seastar::output_stream<char>& out)
+{
+ return out.close().handle_exception_type([](const std::system_error& e) {
+ if (e.code() != std::errc::broken_pipe &&
+ e.code() != std::errc::connection_reset) {
+ logger().error("Socket::close(): unexpected error {}", e.what());
+ ceph_abort();
+ }
+ // can happen when out is already shutdown, ignore
+ });
+}
+
+seastar::future<>
+Socket::close()
+{
+ assert(seastar::this_shard_id() == sid);
+#ifndef NDEBUG
+ ceph_assert_always(!closed);
+ closed = true;
+#endif
+ return seastar::when_all_succeed(
+ inject_delay(),
+ in.close(),
+ close_and_handle_errors(out)
+ ).then_unpack([] {
+ return seastar::make_ready_future<>();
+ }).handle_exception([](auto eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+ logger().error("Socket::close(): unexpected exception {}", e_what);
+ ceph_abort();
+ });
+}
+
+seastar::future<SocketRef>
+Socket::connect(const entity_addr_t &peer_addr)
+{
+ inject_failure();
+ return inject_delay(
+ ).then([peer_addr] {
+ return seastar::connect(peer_addr.in4_addr());
+ }).then([peer_addr](seastar::connected_socket socket) {
+ auto ret = std::make_unique<Socket>(
+ std::move(socket), side_t::connector, 0, construct_tag{});
+ logger().debug("Socket::connect(): connected to {}, socket {}",
+ peer_addr, fmt::ptr(ret));
+ return ret;
+ });
+}
+
+#ifdef UNIT_TESTS_BUILT
+void Socket::set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_) {
+ assert(seastar::this_shard_id() == sid);
+ blocker = blocker_;
+ if (type == bp_type_t::READ) {
+ ceph_assert_always(next_trap_read == bp_action_t::CONTINUE);
+ next_trap_read = action;
+ } else { // type == bp_type_t::WRITE
+ if (next_trap_write == bp_action_t::CONTINUE) {
+ next_trap_write = action;
+ } else if (next_trap_write == bp_action_t::FAULT) {
+ // do_sweep_messages() may combine multiple write events into one socket write
+ ceph_assert_always(action == bp_action_t::FAULT || action == bp_action_t::CONTINUE);
+ } else {
+ ceph_abort();
+ }
+ }
+}
+
+seastar::future<>
+Socket::try_trap_pre(bp_action_t& trap) {
+ auto action = trap;
+ trap = bp_action_t::CONTINUE;
+ switch (action) {
+ case bp_action_t::CONTINUE:
+ break;
+ case bp_action_t::FAULT:
+ logger().info("[Test] got FAULT");
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ case bp_action_t::BLOCK:
+ logger().info("[Test] got BLOCK");
+ return blocker->block();
+ case bp_action_t::STALL:
+ trap = action;
+ break;
+ default:
+ ceph_abort("unexpected action from trap");
+ }
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<>
+Socket::try_trap_post(bp_action_t& trap) {
+ auto action = trap;
+ trap = bp_action_t::CONTINUE;
+ switch (action) {
+ case bp_action_t::CONTINUE:
+ break;
+ case bp_action_t::STALL:
+ logger().info("[Test] got STALL and block");
+ force_shutdown();
+ return blocker->block();
+ default:
+ ceph_abort("unexpected action from trap");
+ }
+ return seastar::make_ready_future<>();
+}
+#endif
+
+ShardedServerSocket::ShardedServerSocket(
+ seastar::shard_id sid,
+ bool dispatch_only_on_primary_sid,
+ construct_tag)
+ : primary_sid{sid}, dispatch_only_on_primary_sid{dispatch_only_on_primary_sid}
+{
+}
+
+ShardedServerSocket::~ShardedServerSocket()
+{
+ assert(!listener);
+  // detect whether the user has called shutdown_destroy() properly
+ ceph_assert_always(!service);
+}
+
+listen_ertr::future<>
+ShardedServerSocket::listen(entity_addr_t addr)
+{
+ ceph_assert_always(seastar::this_shard_id() == primary_sid);
+ logger().debug("ShardedServerSocket({})::listen()...", addr);
+ return this->container().invoke_on_all([addr](auto& ss) {
+ ss.listen_addr = addr;
+ seastar::socket_address s_addr(addr.in4_addr());
+ seastar::listen_options lo;
+ lo.reuse_address = true;
+ if (ss.dispatch_only_on_primary_sid) {
+ lo.set_fixed_cpu(ss.primary_sid);
+ }
+ ss.listener = seastar::listen(s_addr, lo);
+ }).then([] {
+ return listen_ertr::now();
+ }).handle_exception_type(
+ [addr](const std::system_error& e) -> listen_ertr::future<> {
+ if (e.code() == std::errc::address_in_use) {
+ logger().debug("ShardedServerSocket({})::listen(): address in use", addr);
+ return crimson::ct_error::address_in_use::make();
+ } else if (e.code() == std::errc::address_not_available) {
+ logger().debug("ShardedServerSocket({})::listen(): address not available",
+ addr);
+ return crimson::ct_error::address_not_available::make();
+ }
+ logger().error("ShardedServerSocket({})::listen(): "
+ "got unexpeted error {}", addr, e.what());
+ ceph_abort();
+ });
+}
+
+seastar::future<>
+ShardedServerSocket::accept(accept_func_t &&_fn_accept)
+{
+ ceph_assert_always(seastar::this_shard_id() == primary_sid);
+ logger().debug("ShardedServerSocket({})::accept()...", listen_addr);
+ return this->container().invoke_on_all([_fn_accept](auto &ss) {
+ assert(ss.listener);
+ ss.fn_accept = _fn_accept;
+    // gate the accept loop:
+    // ShardedServerSocket::shutdown_destroy() will drain the continuations in
+    // the gate, so ignore the returned future
+ std::ignore = seastar::with_gate(ss.shutdown_gate, [&ss] {
+ return seastar::keep_doing([&ss] {
+ return ss.listener->accept(
+ ).then([&ss](seastar::accept_result accept_result) {
+#ifndef NDEBUG
+ if (ss.dispatch_only_on_primary_sid) {
+ // see seastar::listen_options::set_fixed_cpu()
+ ceph_assert_always(seastar::this_shard_id() == ss.primary_sid);
+ }
+#endif
+ auto [socket, paddr] = std::move(accept_result);
+ entity_addr_t peer_addr;
+ peer_addr.set_sockaddr(&paddr.as_posix_sockaddr());
+ peer_addr.set_type(ss.listen_addr.get_type());
+ SocketRef _socket = std::make_unique<Socket>(
+ std::move(socket), Socket::side_t::acceptor,
+ peer_addr.get_port(), Socket::construct_tag{});
+ logger().debug("ShardedServerSocket({})::accept(): accepted peer {}, "
+ "socket {}, dispatch_only_on_primary_sid = {}",
+ ss.listen_addr, peer_addr, fmt::ptr(_socket),
+ ss.dispatch_only_on_primary_sid);
+ std::ignore = seastar::with_gate(
+ ss.shutdown_gate,
+ [socket=std::move(_socket), peer_addr, &ss]() mutable {
+ return ss.fn_accept(std::move(socket), peer_addr
+ ).handle_exception([&ss, peer_addr](auto eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+ logger().error("ShardedServerSocket({})::accept(): "
+ "fn_accept(s, {}) got unexpected exception {}",
+ ss.listen_addr, peer_addr, e_what);
+ ceph_abort();
+ });
+ });
+ });
+ }).handle_exception_type([&ss](const std::system_error& e) {
+ if (e.code() == std::errc::connection_aborted ||
+ e.code() == std::errc::invalid_argument) {
+ logger().debug("ShardedServerSocket({})::accept(): stopped ({})",
+ ss.listen_addr, e.what());
+ } else {
+ throw;
+ }
+ }).handle_exception([&ss](auto eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+ logger().error("ShardedServerSocket({})::accept(): "
+ "got unexpected exception {}", ss.listen_addr, e_what);
+ ceph_abort();
+ });
+ });
+ });
+}
+
+seastar::future<>
+ShardedServerSocket::shutdown_destroy()
+{
+ assert(seastar::this_shard_id() == primary_sid);
+ logger().debug("ShardedServerSocket({})::shutdown_destroy()...", listen_addr);
+ // shutdown shards
+ return this->container().invoke_on_all([](auto& ss) {
+ if (ss.listener) {
+ ss.listener->abort_accept();
+ }
+ return ss.shutdown_gate.close();
+ }).then([this] {
+ // destroy shards
+ return this->container().invoke_on_all([](auto& ss) {
+ assert(ss.shutdown_gate.is_closed());
+ ss.listen_addr = entity_addr_t();
+ ss.listener.reset();
+ });
+ }).then([this] {
+ // stop the sharded service: we should only construct/stop shards on #0
+ return this->container().invoke_on(0, [](auto& ss) {
+ assert(ss.service);
+ return ss.service->stop().finally([cleanup = std::move(ss.service)] {});
+ });
+ });
+}
+
+seastar::future<ShardedServerSocket*>
+ShardedServerSocket::create(bool dispatch_only_on_this_shard)
+{
+ auto primary_sid = seastar::this_shard_id();
+ // start the sharded service: we should only construct/stop shards on #0
+ return seastar::smp::submit_to(0, [primary_sid, dispatch_only_on_this_shard] {
+ auto service = std::make_unique<sharded_service_t>();
+ return service->start(
+ primary_sid, dispatch_only_on_this_shard, construct_tag{}
+ ).then([service = std::move(service)]() mutable {
+ auto p_shard = service.get();
+ p_shard->local().service = std::move(service);
+ return p_shard;
+ });
+ }).then([](auto p_shard) {
+ return &p_shard->local();
+ });
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Socket.h b/src/crimson/net/Socket.h
new file mode 100644
index 000000000..478f2d630
--- /dev/null
+++ b/src/crimson/net/Socket.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+
+#include "include/buffer.h"
+
+#include "crimson/common/log.h"
+#include "Errors.h"
+#include "Fwd.h"
+
+#ifdef UNIT_TESTS_BUILT
+#include "Interceptor.h"
+#endif
+
+namespace crimson::net {
+
+class Socket;
+using SocketRef = std::unique_ptr<Socket>;
+using SocketFRef = seastar::foreign_ptr<SocketRef>;
+
+class Socket {
+ struct construct_tag {};
+
+public:
+  // on the acceptor side, ephemeral_port is the port the peer connected from;
+  // on the connector side, ephemeral_port is the local port I connected from
+ enum class side_t {
+ acceptor,
+ connector
+ };
+ Socket(seastar::connected_socket &&, side_t, uint16_t e_port, construct_tag);
+
+ ~Socket();
+
+ Socket(Socket&& o) = delete;
+
+ seastar::shard_id get_shard_id() const {
+ return sid;
+ }
+
+ side_t get_side() const {
+ return side;
+ }
+
+ uint16_t get_ephemeral_port() const {
+ return ephemeral_port;
+ }
+
+ seastar::socket_address get_local_address() const {
+ return socket.local_address();
+ }
+
+ bool is_shutdown() const {
+ assert(seastar::this_shard_id() == sid);
+ return socket_is_shutdown;
+ }
+
+ // learn my ephemeral_port as connector.
+  // unfortunately, there's no way to identify which port I'm using as
+  // connector with the current seastar interface.
+ void learn_ephemeral_port_as_connector(uint16_t port) {
+ assert(side == side_t::connector &&
+ (ephemeral_port == 0 || ephemeral_port == port));
+ ephemeral_port = port;
+ }
+
+ /// read the requested number of bytes into a bufferlist
+ seastar::future<bufferlist> read(size_t bytes);
+
+ seastar::future<bufferptr> read_exactly(size_t bytes);
+
+ seastar::future<> write(bufferlist);
+
+ seastar::future<> flush();
+
+ seastar::future<> write_flush(bufferlist);
+
+ // preemptively disable further reads or writes, can only be shutdown once.
+ void shutdown();
+
+ /// Socket can only be closed once.
+ seastar::future<> close();
+
+ static seastar::future<SocketRef>
+ connect(const entity_addr_t& peer_addr);
+
+ /*
+ * test interfaces
+ */
+
+ // shutdown for tests
+ void force_shutdown() {
+ assert(seastar::this_shard_id() == sid);
+ socket.shutdown_input();
+ socket.shutdown_output();
+ }
+
+ // shutdown input_stream only, for tests
+ void force_shutdown_in() {
+ assert(seastar::this_shard_id() == sid);
+ socket.shutdown_input();
+ }
+
+ // shutdown output_stream only, for tests
+ void force_shutdown_out() {
+ assert(seastar::this_shard_id() == sid);
+ socket.shutdown_output();
+ }
+
+private:
+ const seastar::shard_id sid;
+ seastar::connected_socket socket;
+ seastar::input_stream<char> in;
+ seastar::output_stream<char> out;
+ bool socket_is_shutdown;
+ side_t side;
+ uint16_t ephemeral_port;
+
+#ifndef NDEBUG
+ bool closed = false;
+#endif
+
+ /// buffer state for read()
+ struct {
+ bufferlist buffer;
+ size_t remaining;
+ } r;
+
+#ifdef UNIT_TESTS_BUILT
+public:
+ void set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_);
+
+private:
+ seastar::future<> try_trap_pre(bp_action_t& trap);
+
+ seastar::future<> try_trap_post(bp_action_t& trap);
+
+ bp_action_t next_trap_read = bp_action_t::CONTINUE;
+ bp_action_t next_trap_write = bp_action_t::CONTINUE;
+ socket_blocker* blocker = nullptr;
+
+#endif
+ friend class ShardedServerSocket;
+};
+
+using listen_ertr = crimson::errorator<
+ crimson::ct_error::address_in_use, // The address is already bound
+ crimson::ct_error::address_not_available // https://techoverflow.net/2021/08/06/how-i-fixed-python-oserror-errno-99-cannot-assign-requested-address/
+ >;
+
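+/**
+ * ShardedServerSocket
+ *
+ * A listening socket replicated across all reactor shards via
+ * seastar::sharded<>: listen() binds on every shard, accept() installs the
+ * per-shard accept loop, and shutdown_destroy() aborts the listeners and
+ * drains the per-shard gates. When dispatch_only_on_primary_sid is set,
+ * accepted sockets are pinned to the creating shard (see
+ * seastar::listen_options::set_fixed_cpu()).
+ */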
+class ShardedServerSocket
+ : public seastar::peering_sharded_service<ShardedServerSocket> {
+ struct construct_tag {};
+
+public:
+ ShardedServerSocket(
+ seastar::shard_id sid,
+ bool dispatch_only_on_primary_sid,
+ construct_tag);
+
+ ~ShardedServerSocket();
+
+ ShardedServerSocket(ShardedServerSocket&&) = delete;
+ ShardedServerSocket(const ShardedServerSocket&) = delete;
+ ShardedServerSocket& operator=(ShardedServerSocket&&) = delete;
+ ShardedServerSocket& operator=(const ShardedServerSocket&) = delete;
+
+ bool is_fixed_shard_dispatching() const {
+ return dispatch_only_on_primary_sid;
+ }
+
+ listen_ertr::future<> listen(entity_addr_t addr);
+
+ using accept_func_t =
+ std::function<seastar::future<>(SocketRef, entity_addr_t)>;
+ seastar::future<> accept(accept_func_t &&_fn_accept);
+
+ seastar::future<> shutdown_destroy();
+
+ static seastar::future<ShardedServerSocket*> create(
+ bool dispatch_only_on_this_shard);
+
+private:
+ const seastar::shard_id primary_sid;
+ /// XXX: Remove once all infrastructure uses multi-core messenger
+ const bool dispatch_only_on_primary_sid;
+ entity_addr_t listen_addr;
+ std::optional<seastar::server_socket> listener;
+ seastar::gate shutdown_gate;
+ accept_func_t fn_accept;
+
+ using sharded_service_t = seastar::sharded<ShardedServerSocket>;
+ std::unique_ptr<sharded_service_t> service;
+};
+
+} // namespace crimson::net
diff --git a/src/crimson/net/SocketConnection.cc b/src/crimson/net/SocketConnection.cc
new file mode 100644
index 000000000..57e5c12c1
--- /dev/null
+++ b/src/crimson/net/SocketConnection.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SocketConnection.h"
+
+#include "ProtocolV2.h"
+#include "SocketMessenger.h"
+
+#ifdef UNIT_TESTS_BUILT
+#include "Interceptor.h"
+#endif
+
+using std::ostream;
+using crimson::common::local_conf;
+
+namespace crimson::net {
+
+SocketConnection::SocketConnection(SocketMessenger& messenger,
+ ChainedDispatchers& dispatchers)
+ : msgr_sid{messenger.get_shard_id()}, messenger(messenger)
+{
+ auto ret = create_handlers(dispatchers, *this);
+ io_handler = std::move(ret.io_handler);
+ protocol = std::move(ret.protocol);
+#ifdef UNIT_TESTS_BUILT
+ if (messenger.interceptor) {
+ interceptor = messenger.interceptor;
+ interceptor->register_conn(this->get_local_shared_foreign_from_this());
+ }
+#endif
+}
+
+SocketConnection::~SocketConnection() {}
+
+bool SocketConnection::is_connected() const
+{
+ return io_handler->is_connected();
+}
+
+#ifdef UNIT_TESTS_BUILT
+bool SocketConnection::is_protocol_ready() const
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return protocol->is_ready();
+}
+
+bool SocketConnection::is_protocol_standby() const {
+ assert(seastar::this_shard_id() == msgr_sid);
+ return protocol->is_standby();
+}
+
+bool SocketConnection::is_protocol_closed() const
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return protocol->is_closed();
+}
+
+bool SocketConnection::is_protocol_closed_clean() const
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return protocol->is_closed_clean();
+}
+
+#endif
+bool SocketConnection::peer_wins() const
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return (messenger.get_myaddr() > peer_addr || policy.server);
+}
+
+seastar::future<> SocketConnection::send(MessageURef _msg)
+{
+ // may be invoked from any core
+ MessageFRef msg = seastar::make_foreign(std::move(_msg));
+ return io_handler->send(std::move(msg));
+}
+
+seastar::future<> SocketConnection::send_keepalive()
+{
+ // may be invoked from any core
+ return io_handler->send_keepalive();
+}
+
+SocketConnection::clock_t::time_point
+SocketConnection::get_last_keepalive() const
+{
+ return io_handler->get_last_keepalive();
+}
+
+SocketConnection::clock_t::time_point
+SocketConnection::get_last_keepalive_ack() const
+{
+ return io_handler->get_last_keepalive_ack();
+}
+
+void SocketConnection::set_last_keepalive_ack(clock_t::time_point when)
+{
+ io_handler->set_last_keepalive_ack(when);
+}
+
+void SocketConnection::mark_down()
+{
+ io_handler->mark_down();
+}
+
+void
+SocketConnection::start_connect(const entity_addr_t& _peer_addr,
+ const entity_name_t& _peer_name)
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ protocol->start_connect(_peer_addr, _peer_name);
+}
+
+void
+SocketConnection::start_accept(SocketFRef&& sock,
+ const entity_addr_t& _peer_addr)
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ protocol->start_accept(std::move(sock), _peer_addr);
+}
+
+seastar::future<>
+SocketConnection::close_clean_yielded()
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return protocol->close_clean_yielded();
+}
+
+seastar::socket_address SocketConnection::get_local_address() const {
+ assert(seastar::this_shard_id() == msgr_sid);
+ return socket->get_local_address();
+}
+
+ConnectionRef
+SocketConnection::get_local_shared_foreign_from_this()
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return make_local_shared_foreign(
+ seastar::make_foreign(shared_from_this()));
+}
+
+SocketMessenger &
+SocketConnection::get_messenger() const
+{
+ assert(seastar::this_shard_id() == msgr_sid);
+ return messenger;
+}
+
+seastar::shard_id
+SocketConnection::get_messenger_shard_id() const
+{
+ return msgr_sid;
+}
+
+void SocketConnection::set_peer_type(entity_type_t peer_type) {
+ assert(seastar::this_shard_id() == msgr_sid);
+ // it is not allowed to assign an unknown value when the current
+ // value is known
+ assert(!(peer_type == 0 &&
+ peer_name.type() != 0));
+ // it is not allowed to assign a different known value when the
+ // current value is also known.
+ assert(!(peer_type != 0 &&
+ peer_name.type() != 0 &&
+ peer_type != peer_name.type()));
+ peer_name._type = peer_type;
+}
+
+void SocketConnection::set_peer_id(int64_t peer_id) {
+ assert(seastar::this_shard_id() == msgr_sid);
+ // it is not allowed to assign an unknown value when the current
+ // value is known
+ assert(!(peer_id == entity_name_t::NEW &&
+ peer_name.num() != entity_name_t::NEW));
+ // it is not allowed to assign a different known value when the
+ // current value is also known.
+ assert(!(peer_id != entity_name_t::NEW &&
+ peer_name.num() != entity_name_t::NEW &&
+ peer_id != peer_name.num()));
+ peer_name._num = peer_id;
+}
+
+void SocketConnection::set_features(uint64_t f) {
+ assert(seastar::this_shard_id() == msgr_sid);
+ features = f;
+}
+
+void SocketConnection::set_socket(Socket *s) {
+ assert(seastar::this_shard_id() == msgr_sid);
+ socket = s;
+}
+
+void SocketConnection::print(ostream& out) const {
+ out << (void*)this << " ";
+ messenger.print(out);
+ if (seastar::this_shard_id() != msgr_sid) {
+ out << " >> " << get_peer_name() << " " << peer_addr;
+ } else if (!socket) {
+ out << " >> " << get_peer_name() << " " << peer_addr;
+ } else if (socket->get_side() == Socket::side_t::acceptor) {
+ out << " >> " << get_peer_name() << " " << peer_addr
+ << "@" << socket->get_ephemeral_port();
+ } else { // socket->get_side() == Socket::side_t::connector
+ out << "@" << socket->get_ephemeral_port()
+ << " >> " << get_peer_name() << " " << peer_addr;
+ }
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/SocketConnection.h b/src/crimson/net/SocketConnection.h
new file mode 100644
index 000000000..823d6c574
--- /dev/null
+++ b/src/crimson/net/SocketConnection.h
@@ -0,0 +1,236 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <seastar/core/sharded.hh>
+
+#include "msg/Policy.h"
+#include "crimson/common/throttle.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Socket.h"
+
+namespace crimson::net {
+
+class ProtocolV2;
+class SocketMessenger;
+class SocketConnection;
+using SocketConnectionRef = seastar::shared_ptr<SocketConnection>;
+
+#ifdef UNIT_TESTS_BUILT
+class Interceptor;
+#endif
+
+/**
+ * ConnectionHandler
+ *
+ * The interface class to implement Connection, called by SocketConnection.
+ *
+ * The operations must be done in get_shard_id().
+ */
+class ConnectionHandler {
+public:
+ using clock_t = seastar::lowres_system_clock;
+
+ virtual ~ConnectionHandler() = default;
+
+ ConnectionHandler(const ConnectionHandler &) = delete;
+ ConnectionHandler(ConnectionHandler &&) = delete;
+ ConnectionHandler &operator=(const ConnectionHandler &) = delete;
+ ConnectionHandler &operator=(ConnectionHandler &&) = delete;
+
+ virtual seastar::shard_id get_shard_id() const = 0;
+
+ virtual bool is_connected() const = 0;
+
+ virtual seastar::future<> send(MessageFRef) = 0;
+
+ virtual seastar::future<> send_keepalive() = 0;
+
+ virtual clock_t::time_point get_last_keepalive() const = 0;
+
+ virtual clock_t::time_point get_last_keepalive_ack() const = 0;
+
+ virtual void set_last_keepalive_ack(clock_t::time_point) = 0;
+
+ virtual void mark_down() = 0;
+
+protected:
+ ConnectionHandler() = default;
+};
+
+class SocketConnection : public Connection {
+ /*
+ * Connection interfaces, public to users
+ * Working in ConnectionHandler::get_shard_id()
+ */
+ public:
+ SocketConnection(SocketMessenger& messenger,
+ ChainedDispatchers& dispatchers);
+
+ ~SocketConnection() override;
+
+ const seastar::shard_id get_shard_id() const override {
+ return io_handler->get_shard_id();
+ }
+
+ const entity_name_t &get_peer_name() const override {
+ return peer_name;
+ }
+
+ const entity_addr_t &get_peer_addr() const override {
+ return peer_addr;
+ }
+
+ const entity_addr_t &get_peer_socket_addr() const override {
+ return target_addr;
+ }
+
+ uint64_t get_features() const override {
+ return features;
+ }
+
+ bool is_connected() const override;
+
+ seastar::future<> send(MessageURef msg) override;
+
+ seastar::future<> send_keepalive() override;
+
+ clock_t::time_point get_last_keepalive() const override;
+
+ clock_t::time_point get_last_keepalive_ack() const override;
+
+ void set_last_keepalive_ack(clock_t::time_point when) override;
+
+ void mark_down() override;
+
+ bool has_user_private() const override {
+ return user_private != nullptr;
+ }
+
+ user_private_t &get_user_private() override {
+ assert(has_user_private());
+ return *user_private;
+ }
+
+ void set_user_private(std::unique_ptr<user_private_t> new_user_private) override {
+ assert(!has_user_private());
+ user_private = std::move(new_user_private);
+ }
+
+ void print(std::ostream& out) const override;
+
+ /*
+ * Public to SocketMessenger
+ * Working in SocketMessenger::get_shard_id();
+ */
+ public:
+ /// start a handshake from the client's perspective,
+ /// only call when SocketConnection first construct
+ void start_connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name);
+
+ /// start a handshake from the server's perspective,
+ /// only call when SocketConnection first construct
+ void start_accept(SocketFRef&& socket,
+ const entity_addr_t& peer_addr);
+
+ seastar::future<> close_clean_yielded();
+
+ seastar::socket_address get_local_address() const;
+
+ seastar::shard_id get_messenger_shard_id() const;
+
+ SocketMessenger &get_messenger() const;
+
+ ConnectionRef get_local_shared_foreign_from_this();
+
+private:
+ void set_peer_type(entity_type_t peer_type);
+
+ void set_peer_id(int64_t peer_id);
+
+ void set_peer_name(entity_name_t name) {
+ set_peer_type(name.type());
+ set_peer_id(name.num());
+ }
+
+ void set_features(uint64_t f);
+
+ void set_socket(Socket *s);
+
+#ifdef UNIT_TESTS_BUILT
+ bool is_protocol_ready() const override;
+
+ bool is_protocol_standby() const override;
+
+ bool is_protocol_closed_clean() const override;
+
+ bool is_protocol_closed() const override;
+
+  // peer wins if my addr > peer addr, or if our policy is server
+ bool peer_wins() const override;
+
+ Interceptor *interceptor = nullptr;
+#else
+  // peer wins if my addr > peer addr, or if our policy is server
+ bool peer_wins() const;
+#endif
+
+private:
+ const seastar::shard_id msgr_sid;
+
+ /*
+   * Core owner is the messenger core; access from the I/O core may be allowed.
+ */
+ SocketMessenger& messenger;
+
+ std::unique_ptr<ProtocolV2> protocol;
+
+ Socket *socket = nullptr;
+
+ entity_name_t peer_name = {0, entity_name_t::NEW};
+
+ entity_addr_t peer_addr;
+
+ // which of the peer_addrs we're connecting to (as client)
+ // or should reconnect to (as peer)
+ entity_addr_t target_addr;
+
+ uint64_t features = 0;
+
+ ceph::net::Policy<crimson::common::Throttle> policy;
+
+ uint64_t peer_global_id = 0;
+
+ /*
+ * Core owner is I/O core (mutable).
+ */
+ std::unique_ptr<ConnectionHandler> io_handler;
+
+ /*
+ * Core owner is up to the connection user.
+ */
+ std::unique_ptr<user_private_t> user_private;
+
+ friend class IOHandler;
+ friend class ProtocolV2;
+ friend class FrameAssemblerV2;
+};
+
+} // namespace crimson::net
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::net::SocketConnection> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/net/SocketMessenger.cc b/src/crimson/net/SocketMessenger.cc
new file mode 100644
index 000000000..382d08f98
--- /dev/null
+++ b/src/crimson/net/SocketMessenger.cc
@@ -0,0 +1,485 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SocketMessenger.h"
+
+#include <seastar/core/sleep.hh>
+
+#include <tuple>
+#include <boost/functional/hash.hpp>
+#include <fmt/os.h>
+
+#include "auth/Auth.h"
+#include "Errors.h"
+#include "Socket.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+ }
+}
+
+namespace crimson::net {
+
+SocketMessenger::SocketMessenger(const entity_name_t& myname,
+ const std::string& logic_name,
+ uint32_t nonce,
+ bool dispatch_only_on_this_shard)
+ : sid{seastar::this_shard_id()},
+ logic_name{logic_name},
+ nonce{nonce},
+ dispatch_only_on_sid{dispatch_only_on_this_shard},
+ my_name{myname}
+{}
+
+SocketMessenger::~SocketMessenger()
+{
+ logger().debug("~SocketMessenger: {}", logic_name);
+ ceph_assert_always(seastar::this_shard_id() == sid);
+ ceph_assert(!listener);
+}
+
+bool SocketMessenger::set_addr_unknowns(const entity_addrvec_t &addrs)
+{
+ assert(seastar::this_shard_id() == sid);
+ bool ret = false;
+
+ entity_addrvec_t newaddrs = my_addrs;
+ for (auto& a : newaddrs.v) {
+ if (a.is_blank_ip()) {
+ int type = a.get_type();
+ int port = a.get_port();
+ uint32_t nonce = a.get_nonce();
+ for (auto& b : addrs.v) {
+ if (a.get_family() == b.get_family()) {
+ logger().debug(" assuming my addr {} matches provided addr {}", a, b);
+ a = b;
+ a.set_nonce(nonce);
+ a.set_type(type);
+ a.set_port(port);
+ ret = true;
+ break;
+ }
+ }
+ }
+ }
+ my_addrs = newaddrs;
+ return ret;
+}
+
+void SocketMessenger::set_myaddrs(const entity_addrvec_t& addrs)
+{
+ assert(seastar::this_shard_id() == sid);
+ my_addrs = addrs;
+ for (auto& addr : my_addrs.v) {
+ addr.nonce = nonce;
+ }
+}
+
+crimson::net::listen_ertr::future<>
+SocketMessenger::do_listen(const entity_addrvec_t& addrs)
+{
+ ceph_assert(addrs.front().get_family() == AF_INET);
+ set_myaddrs(addrs);
+ return seastar::futurize_invoke([this] {
+ if (!listener) {
+ return ShardedServerSocket::create(dispatch_only_on_sid
+ ).then([this] (auto _listener) {
+ listener = _listener;
+ });
+ } else {
+ return seastar::now();
+ }
+ }).then([this] () -> listen_ertr::future<> {
+ const entity_addr_t listen_addr = get_myaddr();
+ logger().debug("{} do_listen: try listen {}...", *this, listen_addr);
+ if (!listener) {
+ logger().warn("{} do_listen: listener doesn't exist", *this);
+ return listen_ertr::now();
+ }
+ return listener->listen(listen_addr);
+ });
+}
+
+SocketMessenger::bind_ertr::future<>
+SocketMessenger::try_bind(const entity_addrvec_t& addrs,
+ uint32_t min_port, uint32_t max_port)
+{
+  // the classical OSD iterates over the addrvec and tries to listen on each
+  // addr. crimson doesn't need to follow suit, as there is consensus that we
+  // only need to worry about msgr protocol v2.
+ assert(addrs.size() == 1);
+ auto addr = addrs.msgr2_addr();
+ if (addr.get_port() != 0) {
+ return do_listen(addrs).safe_then([this] {
+ logger().info("{} try_bind: done", *this);
+ });
+ }
+ ceph_assert(min_port <= max_port);
+ return seastar::do_with(uint32_t(min_port),
+ [this, max_port, addr] (auto& port) {
+ return seastar::repeat_until_value([this, max_port, addr, &port] {
+ auto to_bind = addr;
+ to_bind.set_port(port);
+ return do_listen(entity_addrvec_t{to_bind}
+ ).safe_then([this] () -> seastar::future<std::optional<std::error_code>> {
+ logger().info("{} try_bind: done", *this);
+ return seastar::make_ready_future<std::optional<std::error_code>>(
+ std::make_optional<std::error_code>(std::error_code{/* success! */}));
+ }, listen_ertr::all_same_way([this, max_port, &port]
+ (const std::error_code& e) mutable
+ -> seastar::future<std::optional<std::error_code>> {
+ logger().trace("{} try_bind: {} got error {}", *this, port, e);
+ if (port == max_port) {
+ return seastar::make_ready_future<std::optional<std::error_code>>(
+ std::make_optional<std::error_code>(e));
+ }
+ ++port;
+ return seastar::make_ready_future<std::optional<std::error_code>>(
+ std::optional<std::error_code>{std::nullopt});
+ }));
+ }).then([] (const std::error_code e) -> bind_ertr::future<> {
+ if (!e) {
+ return bind_ertr::now(); // success!
+ } else if (e == std::errc::address_in_use) {
+ return crimson::ct_error::address_in_use::make();
+ } else if (e == std::errc::address_not_available) {
+ return crimson::ct_error::address_not_available::make();
+ }
+ ceph_abort();
+ });
+ });
+}
+
+SocketMessenger::bind_ertr::future<>
+SocketMessenger::bind(const entity_addrvec_t& addrs)
+{
+ assert(seastar::this_shard_id() == sid);
+ using crimson::common::local_conf;
+ return seastar::do_with(int64_t{local_conf()->ms_bind_retry_count},
+ [this, addrs] (auto& count) {
+ return seastar::repeat_until_value([this, addrs, &count] {
+ assert(count >= 0);
+ return try_bind(addrs,
+ local_conf()->ms_bind_port_min,
+ local_conf()->ms_bind_port_max)
+ .safe_then([this] {
+ logger().info("{} try_bind: done", *this);
+ return seastar::make_ready_future<std::optional<std::error_code>>(
+ std::make_optional<std::error_code>(std::error_code{/* success! */}));
+ }, bind_ertr::all_same_way([this, &count] (const std::error_code error) {
+ if (count-- > 0) {
+ logger().info("{} was unable to bind. Trying again in {} seconds",
+ *this, local_conf()->ms_bind_retry_delay);
+ return seastar::sleep(
+ std::chrono::seconds(local_conf()->ms_bind_retry_delay)
+ ).then([] {
+ // one more time, please
+ return seastar::make_ready_future<std::optional<std::error_code>>(
+ std::optional<std::error_code>{std::nullopt});
+ });
+ } else {
+ logger().info("{} was unable to bind after {} attempts: {}",
+ *this, local_conf()->ms_bind_retry_count, error);
+ return seastar::make_ready_future<std::optional<std::error_code>>(
+ std::make_optional<std::error_code>(error));
+ }
+ }));
+ }).then([] (const std::error_code error) -> bind_ertr::future<> {
+ if (!error) {
+ return bind_ertr::now(); // success!
+ } else if (error == std::errc::address_in_use) {
+ return crimson::ct_error::address_in_use::make();
+ } else if (error == std::errc::address_not_available) {
+ return crimson::ct_error::address_not_available::make();
+ }
+ ceph_abort();
+ });
+ });
+}
+
+seastar::future<> SocketMessenger::accept(
+ SocketFRef &&socket, const entity_addr_t &peer_addr)
+{
+ assert(seastar::this_shard_id() == sid);
+ SocketConnectionRef conn =
+ seastar::make_shared<SocketConnection>(*this, dispatchers);
+ conn->start_accept(std::move(socket), peer_addr);
+ return seastar::now();
+}
+
+seastar::future<> SocketMessenger::start(
+ const dispatchers_t& _dispatchers) {
+ assert(seastar::this_shard_id() == sid);
+
+ dispatchers.assign(_dispatchers);
+ if (listener) {
+ // make sure we have already bound to a valid address
+ ceph_assert(get_myaddr().is_msgr2());
+ ceph_assert(get_myaddr().get_port() > 0);
+
+ return listener->accept([this](SocketRef _socket, entity_addr_t peer_addr) {
+ assert(get_myaddr().is_msgr2());
+ SocketFRef socket = seastar::make_foreign(std::move(_socket));
+ if (listener->is_fixed_shard_dispatching()) {
+ return accept(std::move(socket), peer_addr);
+ } else {
+ return seastar::smp::submit_to(sid,
+ [this, peer_addr, socket = std::move(socket)]() mutable {
+ return accept(std::move(socket), peer_addr);
+ });
+ }
+ });
+ }
+ return seastar::now();
+}
+
+crimson::net::ConnectionRef
+SocketMessenger::connect(const entity_addr_t& peer_addr, const entity_name_t& peer_name)
+{
+ assert(seastar::this_shard_id() == sid);
+
+ // make sure we connect to a valid peer_addr
+ if (!peer_addr.is_msgr2()) {
+ ceph_abort_msg("ProtocolV1 is no longer supported");
+ }
+ ceph_assert(peer_addr.get_port() > 0);
+
+ if (auto found = lookup_conn(peer_addr); found) {
+ logger().debug("{} connect to existing", *found);
+ return found->get_local_shared_foreign_from_this();
+ }
+ SocketConnectionRef conn =
+ seastar::make_shared<SocketConnection>(*this, dispatchers);
+ conn->start_connect(peer_addr, peer_name);
+ return conn->get_local_shared_foreign_from_this();
+}
+
+seastar::future<> SocketMessenger::shutdown()
+{
+ assert(seastar::this_shard_id() == sid);
+ return seastar::futurize_invoke([this] {
+ assert(dispatchers.empty());
+ if (listener) {
+ auto d_listener = listener;
+ listener = nullptr;
+ return d_listener->shutdown_destroy();
+ } else {
+ return seastar::now();
+ }
+ // close all connections
+ }).then([this] {
+ return seastar::parallel_for_each(accepting_conns, [] (auto conn) {
+ return conn->close_clean_yielded();
+ });
+ }).then([this] {
+ ceph_assert(accepting_conns.empty());
+ return seastar::parallel_for_each(connections, [] (auto conn) {
+ return conn.second->close_clean_yielded();
+ });
+ }).then([this] {
+ return seastar::parallel_for_each(closing_conns, [] (auto conn) {
+ return conn->close_clean_yielded();
+ });
+ }).then([this] {
+ ceph_assert(connections.empty());
+ shutdown_promise.set_value();
+ });
+}
+
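+// Pick the address this messenger should consider its own: with
+// ms_learn_addr_from_peer enabled, trust the address the peer reports for us;
+// otherwise fall back to the local address of the socket.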
+static entity_addr_t choose_addr(
+ const entity_addr_t &peer_addr_for_me,
+ const SocketConnection& conn)
+{
+ using crimson::common::local_conf;
+ // XXX: a syscall is here
+ if (const auto local_addr = conn.get_local_address();
+ local_conf()->ms_learn_addr_from_peer) {
+ logger().info("{} peer {} says I am {} (socket says {})",
+ conn, conn.get_peer_socket_addr(), peer_addr_for_me,
+ local_addr);
+ return peer_addr_for_me;
+ } else {
+ const auto local_addr_for_me = conn.get_local_address();
+ logger().info("{} socket to {} says I am {} (peer says {})",
+ conn, conn.get_peer_socket_addr(),
+ local_addr, peer_addr_for_me);
+ entity_addr_t addr;
+ addr.set_sockaddr(&local_addr_for_me.as_posix_sockaddr());
+ return addr;
+ }
+}
+
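+// Called when a handshake reveals which address the peer used to reach us.
+// When the address still needs to be learned, it is derived from
+// choose_addr(), either for an unbound messenger or to fill in a blank IP on
+// an already-bound one; otherwise the reported address is only validated
+// against myaddr, and a bad_peer_address error is raised on any
+// type/family/host mismatch.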
+void SocketMessenger::learned_addr(
+ const entity_addr_t &peer_addr_for_me,
+ const SocketConnection& conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ if (!need_addr) {
+ if ((!get_myaddr().is_any() &&
+ get_myaddr().get_type() != peer_addr_for_me.get_type()) ||
+ get_myaddr().get_family() != peer_addr_for_me.get_family() ||
+ !get_myaddr().is_same_host(peer_addr_for_me)) {
+ logger().warn("{} peer_addr_for_me {} type/family/IP doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ return;
+ }
+
+ if (get_myaddr().get_type() == entity_addr_t::TYPE_NONE) {
+ // Not bound
+ auto addr = choose_addr(peer_addr_for_me, conn);
+ addr.set_type(entity_addr_t::TYPE_ANY);
+ addr.set_port(0);
+ need_addr = false;
+ set_myaddrs(entity_addrvec_t{addr});
+ logger().info("{} learned myaddr={} (unbound)", conn, get_myaddr());
+ } else {
+ // Already bound
+ if (!get_myaddr().is_any() &&
+ get_myaddr().get_type() != peer_addr_for_me.get_type()) {
+ logger().warn("{} peer_addr_for_me {} type doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (get_myaddr().get_family() != peer_addr_for_me.get_family()) {
+ logger().warn("{} peer_addr_for_me {} family doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (get_myaddr().is_blank_ip()) {
+ auto addr = choose_addr(peer_addr_for_me, conn);
+ addr.set_type(get_myaddr().get_type());
+ addr.set_port(get_myaddr().get_port());
+ need_addr = false;
+ set_myaddrs(entity_addrvec_t{addr});
+ logger().info("{} learned myaddr={} (blank IP)", conn, get_myaddr());
+ } else if (!get_myaddr().is_same_host(peer_addr_for_me)) {
+ logger().warn("{} peer_addr_for_me {} IP doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ } else {
+ need_addr = false;
+ }
+ }
+}
+
+SocketPolicy SocketMessenger::get_policy(entity_type_t peer_type) const
+{
+ assert(seastar::this_shard_id() == sid);
+ return policy_set.get(peer_type);
+}
+
+SocketPolicy SocketMessenger::get_default_policy() const
+{
+ assert(seastar::this_shard_id() == sid);
+ return policy_set.get_default();
+}
+
+void SocketMessenger::set_default_policy(const SocketPolicy& p)
+{
+ assert(seastar::this_shard_id() == sid);
+ policy_set.set_default(p);
+}
+
+void SocketMessenger::set_policy(entity_type_t peer_type,
+ const SocketPolicy& p)
+{
+ assert(seastar::this_shard_id() == sid);
+ policy_set.set(peer_type, p);
+}
+
+void SocketMessenger::set_policy_throttler(entity_type_t peer_type,
+ Throttle* throttle)
+{
+ assert(seastar::this_shard_id() == sid);
+  // only the byte throttler is used in the OSD
+ policy_set.set_throttlers(peer_type, throttle, nullptr);
+}
+
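
Illustrative sketch (not part of this patch): the policy setters above are per-shard accessors into policy_set, and a server-style caller would normally configure them before start(). The helper name, the chosen peer types, and the feature arguments below are placeholders; SocketPolicy and crimson::common::Throttle are assumed to be the aliases used by crimson/net.

  void configure_policies_example(crimson::net::SocketMessenger &msgr,
                                  crimson::common::Throttle &byte_throttle)
  {
    // default: a stateless server that keeps no session state across resets
    msgr.set_default_policy(crimson::net::SocketPolicy::stateless_server(0));
    // per-peer-type overrides
    msgr.set_policy(CEPH_ENTITY_TYPE_MON,
                    crimson::net::SocketPolicy::lossy_client(0));
    msgr.set_policy(CEPH_ENTITY_TYPE_OSD,
                    crimson::net::SocketPolicy::lossless_peer(0));
    // only the byte throttler is honored (see set_policy_throttler() above)
    msgr.set_policy_throttler(CEPH_ENTITY_TYPE_CLIENT, &byte_throttle);
  }
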
+crimson::net::SocketConnectionRef SocketMessenger::lookup_conn(const entity_addr_t& addr)
+{
+ assert(seastar::this_shard_id() == sid);
+ if (auto found = connections.find(addr);
+ found != connections.end()) {
+ return found->second;
+ } else {
+ return nullptr;
+ }
+}
+
+void SocketMessenger::accept_conn(SocketConnectionRef conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ accepting_conns.insert(conn);
+}
+
+void SocketMessenger::unaccept_conn(SocketConnectionRef conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ accepting_conns.erase(conn);
+}
+
+void SocketMessenger::register_conn(SocketConnectionRef conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ auto [i, added] = connections.emplace(conn->get_peer_addr(), conn);
+ std::ignore = i;
+ ceph_assert(added);
+}
+
+void SocketMessenger::unregister_conn(SocketConnectionRef conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ ceph_assert(conn);
+ auto found = connections.find(conn->get_peer_addr());
+ ceph_assert(found != connections.end());
+ ceph_assert(found->second == conn);
+ connections.erase(found);
+}
+
+void SocketMessenger::closing_conn(SocketConnectionRef conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ closing_conns.push_back(conn);
+}
+
+void SocketMessenger::closed_conn(SocketConnectionRef conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ for (auto it = closing_conns.begin();
+ it != closing_conns.end();) {
+ if (*it == conn) {
+ it = closing_conns.erase(it);
+ } else {
+ it++;
+ }
+ }
+}
+
+uint32_t SocketMessenger::get_global_seq(uint32_t old)
+{
+ assert(seastar::this_shard_id() == sid);
+ if (old > global_seq) {
+ global_seq = old;
+ }
+ return ++global_seq;
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/SocketMessenger.h b/src/crimson/net/SocketMessenger.h
new file mode 100644
index 000000000..e4ac63184
--- /dev/null
+++ b/src/crimson/net/SocketMessenger.h
@@ -0,0 +1,192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "crimson/net/chained_dispatchers.h"
+#include "Messenger.h"
+#include "Socket.h"
+#include "SocketConnection.h"
+
+namespace crimson::net {
+
+class ShardedServerSocket;
+
+class SocketMessenger final : public Messenger {
+// Messenger public interfaces
+public:
+ SocketMessenger(const entity_name_t& myname,
+ const std::string& logic_name,
+ uint32_t nonce,
+ bool dispatch_only_on_this_shard);
+
+ ~SocketMessenger() override;
+
+ const entity_name_t &get_myname() const override {
+ return my_name;
+ }
+
+ const entity_addrvec_t &get_myaddrs() const override {
+ return my_addrs;
+ }
+
+ void set_myaddrs(const entity_addrvec_t& addr) override;
+
+ bool set_addr_unknowns(const entity_addrvec_t &addr) override;
+
+ void set_auth_client(crimson::auth::AuthClient *ac) override {
+ assert(seastar::this_shard_id() == sid);
+ auth_client = ac;
+ }
+
+ void set_auth_server(crimson::auth::AuthServer *as) override {
+ assert(seastar::this_shard_id() == sid);
+ auth_server = as;
+ }
+
+ bind_ertr::future<> bind(const entity_addrvec_t& addr) override;
+
+ seastar::future<> start(const dispatchers_t& dispatchers) override;
+
+ ConnectionRef connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) override;
+
+ bool owns_connection(Connection &conn) const override {
+ assert(seastar::this_shard_id() == sid);
+ return this == &static_cast<SocketConnection&>(conn).get_messenger();
+ }
+
+ // can only wait once
+ seastar::future<> wait() override {
+ assert(seastar::this_shard_id() == sid);
+ return shutdown_promise.get_future();
+ }
+
+ void stop() override {
+ assert(seastar::this_shard_id() == sid);
+ dispatchers.clear();
+ }
+
+ bool is_started() const override {
+ assert(seastar::this_shard_id() == sid);
+ return !dispatchers.empty();
+ }
+
+ seastar::future<> shutdown() override;
+
+ void print(std::ostream& out) const override {
+ out << get_myname()
+ << "(" << logic_name
+ << ") " << get_myaddr();
+ }
+
+ SocketPolicy get_policy(entity_type_t peer_type) const override;
+
+ SocketPolicy get_default_policy() const override;
+
+ void set_default_policy(const SocketPolicy& p) override;
+
+ void set_policy(entity_type_t peer_type, const SocketPolicy& p) override;
+
+ void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) override;
+
+// SocketMessenger public interfaces
+public:
+ crimson::auth::AuthClient* get_auth_client() const {
+ assert(seastar::this_shard_id() == sid);
+ return auth_client;
+ }
+
+ crimson::auth::AuthServer* get_auth_server() const {
+ assert(seastar::this_shard_id() == sid);
+ return auth_server;
+ }
+
+ uint32_t get_global_seq(uint32_t old=0);
+
+ void learned_addr(const entity_addr_t &peer_addr_for_me,
+ const SocketConnection& conn);
+
+ SocketConnectionRef lookup_conn(const entity_addr_t& addr);
+
+ void accept_conn(SocketConnectionRef);
+
+ void unaccept_conn(SocketConnectionRef);
+
+ void register_conn(SocketConnectionRef);
+
+ void unregister_conn(SocketConnectionRef);
+
+ void closing_conn(SocketConnectionRef);
+
+ void closed_conn(SocketConnectionRef);
+
+ seastar::shard_id get_shard_id() const {
+ return sid;
+ }
+
+#ifdef UNIT_TESTS_BUILT
+ void set_interceptor(Interceptor *i) override {
+ interceptor = i;
+ }
+
+ Interceptor *interceptor = nullptr;
+#endif
+
+private:
+ seastar::future<> accept(SocketFRef &&, const entity_addr_t &);
+
+ listen_ertr::future<> do_listen(const entity_addrvec_t& addr);
+
+ /// try to bind to the first unused port of the given address
+ bind_ertr::future<> try_bind(const entity_addrvec_t& addr,
+ uint32_t min_port, uint32_t max_port);
+
+ const seastar::shard_id sid;
+ // Distinguish messengers with meaningful names for debugging
+ const std::string logic_name;
+ const uint32_t nonce;
+ const bool dispatch_only_on_sid;
+
+ entity_name_t my_name;
+ entity_addrvec_t my_addrs;
+ crimson::auth::AuthClient* auth_client = nullptr;
+ crimson::auth::AuthServer* auth_server = nullptr;
+
+ ShardedServerSocket *listener = nullptr;
+ ChainedDispatchers dispatchers;
+ std::map<entity_addr_t, SocketConnectionRef> connections;
+ std::set<SocketConnectionRef> accepting_conns;
+ std::vector<SocketConnectionRef> closing_conns;
+ ceph::net::PolicySet<Throttle> policy_set;
+ // indicates we haven't learned our addr; set to false once we have.
+ bool need_addr = true;
+ uint32_t global_seq = 0;
+ bool started = false;
+ seastar::promise<> shutdown_promise;
+};
+
+} // namespace crimson::net
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::net::SocketMessenger> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/net/chained_dispatchers.cc b/src/crimson/net/chained_dispatchers.cc
new file mode 100644
index 000000000..1e4af3baa
--- /dev/null
+++ b/src/crimson/net/chained_dispatchers.cc
@@ -0,0 +1,114 @@
+#include "crimson/common/log.h"
+#include "crimson/net/chained_dispatchers.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Dispatcher.h"
+#include "msg/Message.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+ }
+}
+
+namespace crimson::net {
+
+seastar::future<>
+ChainedDispatchers::ms_dispatch(ConnectionRef conn,
+ MessageRef m) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ auto dispatched = dispatcher->ms_dispatch(conn, m);
+ if (dispatched.has_value()) {
+ return std::move(*dispatched
+ ).handle_exception([conn] (std::exception_ptr eptr) {
+ logger().error("{} got unexpected exception in ms_dispatch() throttling {}",
+ *conn, eptr);
+ ceph_abort();
+ });
+ }
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_dispatch() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+ if (!dispatchers.empty()) {
+ logger().error("ms_dispatch unhandled message {}", *m);
+ }
+ return seastar::now();
+}
+
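
Illustrative sketch (not part of this patch): ms_dispatch() above walks the chain and stops at the first dispatcher whose ms_dispatch() returns an engaged optional, so a Dispatcher signals "handled" by returning a future (which also throttles further reads) and "not mine" by returning std::nullopt. The message-type check below is a made-up example.

  class PingOnlyDispatcher final : public crimson::net::Dispatcher {
    std::optional<seastar::future<>> ms_dispatch(
        crimson::net::ConnectionRef conn, MessageRef m) final
    {
      if (m->get_type() != CEPH_MSG_PING) {
        return std::nullopt;  // unhandled: let the next dispatcher try
      }
      // an engaged future claims the message and delays further reads
      // on this connection until it resolves
      return seastar::now();
    }
  };
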
+void
+ChainedDispatchers::ms_handle_shard_change(
+ ConnectionRef conn,
+ seastar::shard_id new_shard,
+ bool ac) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_shard_change(conn, new_shard, ac);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_shard_change() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_accept(
+ ConnectionRef conn,
+ seastar::shard_id prv_shard,
+ bool is_replace) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_accept(conn, prv_shard, is_replace);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_accept() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_connect(
+ ConnectionRef conn,
+ seastar::shard_id prv_shard) {
+ try {
+ for(auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_connect(conn, prv_shard);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_connect() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_reset(ConnectionRef conn, bool is_replace) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_reset(conn, is_replace);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_reset() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_remote_reset(ConnectionRef conn) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_remote_reset(conn);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_remote_reset() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+}
diff --git a/src/crimson/net/chained_dispatchers.h b/src/crimson/net/chained_dispatchers.h
new file mode 100644
index 000000000..ec085864f
--- /dev/null
+++ b/src/crimson/net/chained_dispatchers.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/smp.hh>
+
+#include "Fwd.h"
+#include "crimson/common/log.h"
+
+namespace crimson::net {
+
+class Dispatcher;
+
+class ChainedDispatchers {
+public:
+ void assign(const dispatchers_t& _dispatchers) {
+ assert(empty());
+ assert(!_dispatchers.empty());
+ dispatchers = _dispatchers;
+ }
+ void clear() {
+ dispatchers.clear();
+ }
+ bool empty() const {
+ return dispatchers.empty();
+ }
+ seastar::future<> ms_dispatch(ConnectionRef, MessageRef);
+ void ms_handle_shard_change(ConnectionRef, seastar::shard_id, bool);
+ void ms_handle_accept(ConnectionRef conn, seastar::shard_id, bool is_replace);
+ void ms_handle_connect(ConnectionRef conn, seastar::shard_id);
+ void ms_handle_reset(ConnectionRef conn, bool is_replace);
+ void ms_handle_remote_reset(ConnectionRef conn);
+
+ private:
+ dispatchers_t dispatchers;
+};
+
+}
diff --git a/src/crimson/net/io_handler.cc b/src/crimson/net/io_handler.cc
new file mode 100644
index 000000000..c414c48e1
--- /dev/null
+++ b/src/crimson/net/io_handler.cc
@@ -0,0 +1,1287 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "io_handler.h"
+
+#include "auth/Auth.h"
+
+#include "crimson/common/formatter.h"
+#include "crimson/common/log.h"
+#include "crimson/net/Errors.h"
+#include "crimson/net/chained_dispatchers.h"
+#include "crimson/net/SocketMessenger.h"
+#include "msg/Message.h"
+#include "msg/msg_fmt.h"
+
+using namespace ceph::msgr::v2;
+using crimson::common::local_conf;
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+[[noreturn]] void abort_in_fault() {
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+}
+
+[[noreturn]] void abort_protocol() {
+ throw std::system_error(make_error_code(crimson::net::error::protocol_aborted));
+}
+
+std::size_t get_msg_size(const FrameAssembler &rx_frame_asm)
+{
+ ceph_assert(rx_frame_asm.get_num_segments() > 0);
+ size_t sum = 0;
+ // we don't include SegmentIndex::Msg::HEADER.
+ for (size_t idx = 1; idx < rx_frame_asm.get_num_segments(); idx++) {
+ sum += rx_frame_asm.get_segment_logical_len(idx);
+ }
+ return sum;
+}
+
+} // anonymous namespace
+
+namespace crimson::net {
+
+IOHandler::IOHandler(ChainedDispatchers &dispatchers,
+ SocketConnection &conn)
+ : shard_states(shard_states_t::create(
+ seastar::this_shard_id(), io_state_t::none)),
+ dispatchers(dispatchers),
+ conn(conn),
+ conn_ref(conn.get_local_shared_foreign_from_this())
+{}
+
+IOHandler::~IOHandler()
+{
+ // close_io() must be finished
+ ceph_assert_always(maybe_prv_shard_states == nullptr);
+ // should be true in the corresponding shard
+ // ceph_assert_always(shard_states->assert_closed_and_exit());
+ assert(!conn_ref);
+}
+
+#ifdef UNIT_TESTS_BUILT
+IOHandler::sweep_ret
+#else
+ceph::bufferlist
+#endif
+IOHandler::sweep_out_pending_msgs_to_sent(
+ bool require_keepalive,
+ std::optional<utime_t> maybe_keepalive_ack,
+ bool require_ack)
+{
+ std::size_t num_msgs = out_pending_msgs.size();
+ ceph::bufferlist bl;
+
+#ifdef UNIT_TESTS_BUILT
+ std::vector<Tag> tags;
+#endif
+
+ if (unlikely(require_keepalive)) {
+ auto keepalive_frame = KeepAliveFrame::Encode();
+ bl.append(frame_assembler->get_buffer(keepalive_frame));
+#ifdef UNIT_TESTS_BUILT
+ auto tag = KeepAliveFrame::tag;
+ tags.push_back(tag);
+#endif
+ }
+
+ if (unlikely(maybe_keepalive_ack.has_value())) {
+ auto keepalive_ack_frame = KeepAliveFrameAck::Encode(*maybe_keepalive_ack);
+ bl.append(frame_assembler->get_buffer(keepalive_ack_frame));
+#ifdef UNIT_TESTS_BUILT
+ auto tag = KeepAliveFrameAck::tag;
+ tags.push_back(tag);
+#endif
+ }
+
+ if (require_ack && num_msgs == 0u) {
+ auto ack_frame = AckFrame::Encode(in_seq);
+ bl.append(frame_assembler->get_buffer(ack_frame));
+#ifdef UNIT_TESTS_BUILT
+ auto tag = AckFrame::tag;
+ tags.push_back(tag);
+#endif
+ }
+
+ std::for_each(
+ out_pending_msgs.begin(),
+ out_pending_msgs.begin()+num_msgs,
+ [this, &bl
+#ifdef UNIT_TESTS_BUILT
+ , &tags
+#endif
+ ](const MessageFRef& msg) {
+ // set the source entity name
+ msg->get_header().src = conn.messenger.get_myname();
+
+ msg->encode(conn.features, 0);
+
+ ceph_assert(!msg->get_seq() && "message already has seq");
+ msg->set_seq(++out_seq);
+
+ ceph_msg_header &header = msg->get_header();
+ ceph_msg_footer &footer = msg->get_footer();
+
+ ceph_msg_header2 header2{header.seq, header.tid,
+ header.type, header.priority,
+ header.version,
+ ceph_le32(0), header.data_off,
+ ceph_le64(in_seq),
+ footer.flags, header.compat_version,
+ header.reserved};
+
+ auto message = MessageFrame::Encode(header2,
+ msg->get_payload(), msg->get_middle(), msg->get_data());
+ logger().debug("{} --> #{} === {} ({})",
+ conn, msg->get_seq(), *msg, msg->get_type());
+ bl.append(frame_assembler->get_buffer(message));
+#ifdef UNIT_TESTS_BUILT
+ auto tag = MessageFrame::tag;
+ tags.push_back(tag);
+#endif
+ });
+
+ if (!conn.policy.lossy) {
+ out_sent_msgs.insert(
+ out_sent_msgs.end(),
+ std::make_move_iterator(out_pending_msgs.begin()),
+ std::make_move_iterator(out_pending_msgs.end()));
+ }
+ out_pending_msgs.clear();
+
+#ifdef UNIT_TESTS_BUILT
+ return sweep_ret{std::move(bl), tags};
+#else
+ return bl;
+#endif
+}
+
+seastar::future<> IOHandler::send(MessageFRef msg)
+{
+ // sid may be changed on-the-fly during the submission
+ if (seastar::this_shard_id() == get_shard_id()) {
+ return do_send(std::move(msg));
+ } else {
+ logger().trace("{} send() is directed to {} -- {}",
+ conn, get_shard_id(), *msg);
+ return seastar::smp::submit_to(
+ get_shard_id(), [this, msg=std::move(msg)]() mutable {
+ return send_redirected(std::move(msg));
+ });
+ }
+}
+
+seastar::future<> IOHandler::send_redirected(MessageFRef msg)
+{
+ // sid may be changed on-the-fly during the submission
+ if (seastar::this_shard_id() == get_shard_id()) {
+ return do_send(std::move(msg));
+ } else {
+ logger().debug("{} send() is redirected to {} -- {}",
+ conn, get_shard_id(), *msg);
+ return seastar::smp::submit_to(
+ get_shard_id(), [this, msg=std::move(msg)]() mutable {
+ return send_redirected(std::move(msg));
+ });
+ }
+}
+
+seastar::future<> IOHandler::do_send(MessageFRef msg)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ logger().trace("{} do_send() got message -- {}", conn, *msg);
+ if (get_io_state() != io_state_t::drop) {
+ out_pending_msgs.push_back(std::move(msg));
+ notify_out_dispatch();
+ }
+ return seastar::now();
+}
+
+seastar::future<> IOHandler::send_keepalive()
+{
+ // sid may be changed on-the-fly during the submission
+ if (seastar::this_shard_id() == get_shard_id()) {
+ return do_send_keepalive();
+ } else {
+ logger().trace("{} send_keepalive() is directed to {}", conn, get_shard_id());
+ return seastar::smp::submit_to(
+ get_shard_id(), [this] {
+ return send_keepalive_redirected();
+ });
+ }
+}
+
+seastar::future<> IOHandler::send_keepalive_redirected()
+{
+ // sid may be changed on-the-fly during the submission
+ if (seastar::this_shard_id() == get_shard_id()) {
+ return do_send_keepalive();
+ } else {
+ logger().debug("{} send_keepalive() is redirected to {}", conn, get_shard_id());
+ return seastar::smp::submit_to(
+ get_shard_id(), [this] {
+ return send_keepalive_redirected();
+ });
+ }
+}
+
+seastar::future<> IOHandler::do_send_keepalive()
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ logger().trace("{} do_send_keeplive(): need_keepalive={}", conn, need_keepalive);
+ if (!need_keepalive) {
+ need_keepalive = true;
+ notify_out_dispatch();
+ }
+ return seastar::now();
+}
+
+void IOHandler::mark_down()
+{
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ ceph_assert_always(get_io_state() != io_state_t::none);
+ need_dispatch_reset = false;
+ if (get_io_state() == io_state_t::drop) {
+ return;
+ }
+
+ auto cc_seq = crosscore.prepare_submit();
+ logger().info("{} mark_down() at {}, send {} notify_mark_down()",
+ conn, io_stat_printer{*this}, cc_seq);
+ do_set_io_state(io_state_t::drop);
+ shard_states->dispatch_in_background(
+ "notify_mark_down", conn, [this, cc_seq] {
+ return seastar::smp::submit_to(
+ conn.get_messenger_shard_id(), [this, cc_seq] {
+ return handshake_listener->notify_mark_down(cc_seq);
+ });
+ });
+}
+
+void IOHandler::print_io_stat(std::ostream &out) const
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ out << "io_stat("
+ << "io_state=" << fmt::format("{}", get_io_state())
+ << ", in_seq=" << in_seq
+ << ", out_seq=" << out_seq
+ << ", out_pending_msgs_size=" << out_pending_msgs.size()
+ << ", out_sent_msgs_size=" << out_sent_msgs.size()
+ << ", need_ack=" << (ack_left > 0)
+ << ", need_keepalive=" << need_keepalive
+ << ", need_keepalive_ack=" << bool(next_keepalive_ack)
+ << ")";
+}
+
+void IOHandler::assign_frame_assembler(FrameAssemblerV2Ref fa)
+{
+ assert(fa != nullptr);
+ ceph_assert_always(frame_assembler == nullptr);
+ frame_assembler = std::move(fa);
+ ceph_assert_always(
+ frame_assembler->get_shard_id() == get_shard_id());
+ // should have been set through dispatch_accept/connect()
+ ceph_assert_always(
+ frame_assembler->get_socket_shard_id() == get_shard_id());
+ ceph_assert_always(frame_assembler->is_socket_valid());
+}
+
+void IOHandler::do_set_io_state(
+ io_state_t new_state,
+ std::optional<crosscore_t::seq_t> cc_seq,
+ FrameAssemblerV2Ref fa,
+ bool set_notify_out)
+{
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ auto prv_state = get_io_state();
+ logger().debug("{} got {}do_set_io_state(): prv_state={}, new_state={}, "
+ "fa={}, set_notify_out={}, at {}",
+ conn,
+ cc_seq.has_value() ? fmt::format("{} ", *cc_seq) : "",
+ prv_state, new_state,
+ fa ? "present" : "N/A", set_notify_out,
+ io_stat_printer{*this});
+ ceph_assert_always(!(
+ (new_state == io_state_t::none && prv_state != io_state_t::none) ||
+ (new_state == io_state_t::open && prv_state == io_state_t::open)
+ ));
+
+ if (prv_state == io_state_t::drop) {
+ // only possible due to a racing mark_down() from user
+ if (new_state == io_state_t::open) {
+ assign_frame_assembler(std::move(fa));
+ frame_assembler->shutdown_socket<false>(nullptr);
+ } else {
+ assert(fa == nullptr);
+ }
+ return;
+ }
+
+ bool dispatch_in = false;
+ if (new_state == io_state_t::open) {
+ // to open
+ ceph_assert_always(protocol_is_connected == true);
+ assign_frame_assembler(std::move(fa));
+ dispatch_in = true;
+ } else if (prv_state == io_state_t::open) {
+ // from open
+ ceph_assert_always(protocol_is_connected == true);
+ protocol_is_connected = false;
+ assert(fa == nullptr);
+ ceph_assert_always(frame_assembler->is_socket_valid());
+ frame_assembler->shutdown_socket<false>(nullptr);
+ } else {
+ assert(fa == nullptr);
+ }
+
+ if (new_state == io_state_t::delay) {
+ need_notify_out = set_notify_out;
+ if (need_notify_out) {
+ maybe_notify_out_dispatch();
+ }
+ } else {
+ assert(set_notify_out == false);
+ need_notify_out = false;
+ }
+
+ // FIXME: simplify and drop the prv_state == new_state case
+ if (prv_state != new_state) {
+ shard_states->set_io_state(new_state);
+ }
+
+ /*
+ * not atomic below
+ */
+
+ if (dispatch_in) {
+ do_in_dispatch();
+ }
+}
+
+seastar::future<> IOHandler::set_io_state(
+ crosscore_t::seq_t cc_seq,
+ io_state_t new_state,
+ FrameAssemblerV2Ref fa,
+ bool set_notify_out)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} set_io_state(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq, new_state,
+ fa=std::move(fa), set_notify_out]() mutable {
+ return set_io_state(cc_seq, new_state, std::move(fa), set_notify_out);
+ });
+ }
+
+ do_set_io_state(new_state, cc_seq, std::move(fa), set_notify_out);
+ return seastar::now();
+}
+
+seastar::future<IOHandler::exit_dispatching_ret>
+IOHandler::wait_io_exit_dispatching(
+ crosscore_t::seq_t cc_seq)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} wait_io_exit_dispatching(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq] {
+ return wait_io_exit_dispatching(cc_seq);
+ });
+ }
+
+ logger().debug("{} got {} wait_io_exit_dispatching()",
+ conn, cc_seq);
+ ceph_assert_always(get_io_state() != io_state_t::open);
+ ceph_assert_always(frame_assembler != nullptr);
+ ceph_assert_always(!frame_assembler->is_socket_valid());
+ return seastar::futurize_invoke([this] {
+ // cannot be running in parallel with to_new_sid()
+ if (maybe_dropped_sid.has_value()) {
+ ceph_assert_always(get_io_state() == io_state_t::drop);
+ assert(shard_states->assert_closed_and_exit());
+ auto prv_sid = *maybe_dropped_sid;
+ return seastar::smp::submit_to(prv_sid, [this] {
+ logger().debug("{} got wait_io_exit_dispatching from prv_sid", conn);
+ assert(maybe_prv_shard_states != nullptr);
+ return maybe_prv_shard_states->wait_io_exit_dispatching();
+ });
+ } else {
+ return shard_states->wait_io_exit_dispatching();
+ }
+ }).then([this] {
+ logger().debug("{} finish wait_io_exit_dispatching at {}",
+ conn, io_stat_printer{*this});
+ ceph_assert_always(frame_assembler != nullptr);
+ ceph_assert_always(!frame_assembler->is_socket_valid());
+ frame_assembler->set_shard_id(conn.get_messenger_shard_id());
+ return exit_dispatching_ret{
+ std::move(frame_assembler),
+ get_states()};
+ });
+}
+
+seastar::future<> IOHandler::reset_session(
+ crosscore_t::seq_t cc_seq,
+ bool full)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} reset_session(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq, full] {
+ return reset_session(cc_seq, full);
+ });
+ }
+
+ logger().debug("{} got {} reset_session({})",
+ conn, cc_seq, full);
+ assert(get_io_state() != io_state_t::open);
+ reset_in();
+ if (full) {
+ reset_out();
+ dispatch_remote_reset();
+ }
+ return seastar::now();
+}
+
+seastar::future<> IOHandler::reset_peer_state(
+ crosscore_t::seq_t cc_seq)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} reset_peer_state(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq] {
+ return reset_peer_state(cc_seq);
+ });
+ }
+
+ logger().debug("{} got {} reset_peer_state()",
+ conn, cc_seq);
+ assert(get_io_state() != io_state_t::open);
+ reset_in();
+ do_requeue_out_sent_up_to(0);
+ discard_out_sent();
+ return seastar::now();
+}
+
+seastar::future<> IOHandler::requeue_out_sent(
+ crosscore_t::seq_t cc_seq)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} requeue_out_sent(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq] {
+ return requeue_out_sent(cc_seq);
+ });
+ }
+
+ logger().debug("{} got {} requeue_out_sent()",
+ conn, cc_seq);
+ do_requeue_out_sent();
+ return seastar::now();
+}
+
+void IOHandler::do_requeue_out_sent()
+{
+ assert(get_io_state() != io_state_t::open);
+ if (out_sent_msgs.empty()) {
+ return;
+ }
+
+ out_seq -= out_sent_msgs.size();
+ logger().debug("{} requeue {} items, revert out_seq to {}",
+ conn, out_sent_msgs.size(), out_seq);
+ for (MessageFRef& msg : out_sent_msgs) {
+ msg->clear_payload();
+ msg->set_seq(0);
+ }
+ out_pending_msgs.insert(
+ out_pending_msgs.begin(),
+ std::make_move_iterator(out_sent_msgs.begin()),
+ std::make_move_iterator(out_sent_msgs.end()));
+ out_sent_msgs.clear();
+ maybe_notify_out_dispatch();
+}
+
+seastar::future<> IOHandler::requeue_out_sent_up_to(
+ crosscore_t::seq_t cc_seq,
+ seq_num_t msg_seq)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} requeue_out_sent_up_to(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq, msg_seq] {
+ return requeue_out_sent_up_to(cc_seq, msg_seq);
+ });
+ }
+
+ logger().debug("{} got {} requeue_out_sent_up_to({})",
+ conn, cc_seq, msg_seq);
+ do_requeue_out_sent_up_to(msg_seq);
+ return seastar::now();
+}
+
+void IOHandler::do_requeue_out_sent_up_to(seq_num_t seq)
+{
+ assert(get_io_state() != io_state_t::open);
+ if (out_sent_msgs.empty() && out_pending_msgs.empty()) {
+ logger().debug("{} nothing to requeue, reset out_seq from {} to seq {}",
+ conn, out_seq, seq);
+ out_seq = seq;
+ return;
+ }
+ logger().debug("{} discarding sent msgs by seq {} (sent_len={}, out_seq={})",
+ conn, seq, out_sent_msgs.size(), out_seq);
+ while (!out_sent_msgs.empty()) {
+ auto cur_seq = out_sent_msgs.front()->get_seq();
+ if (cur_seq == 0 || cur_seq > seq) {
+ break;
+ } else {
+ out_sent_msgs.pop_front();
+ }
+ }
+ do_requeue_out_sent();
+}
+
+void IOHandler::reset_in()
+{
+ assert(get_io_state() != io_state_t::open);
+ in_seq = 0;
+}
+
+void IOHandler::reset_out()
+{
+ assert(get_io_state() != io_state_t::open);
+ discard_out_sent();
+ out_pending_msgs.clear();
+ need_keepalive = false;
+ next_keepalive_ack = std::nullopt;
+ ack_left = 0;
+}
+
+void IOHandler::discard_out_sent()
+{
+ assert(get_io_state() != io_state_t::open);
+ out_seq = 0;
+ out_sent_msgs.clear();
+}
+
+seastar::future<>
+IOHandler::dispatch_accept(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id new_sid,
+ ConnectionFRef conn_fref,
+ bool is_replace)
+{
+ return to_new_sid(cc_seq, new_sid, std::move(conn_fref), is_replace);
+}
+
+seastar::future<>
+IOHandler::dispatch_connect(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id new_sid,
+ ConnectionFRef conn_fref)
+{
+ return to_new_sid(cc_seq, new_sid, std::move(conn_fref), std::nullopt);
+}
+
+seastar::future<>
+IOHandler::cleanup_prv_shard(seastar::shard_id prv_sid)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ return seastar::smp::submit_to(prv_sid, [this] {
+ logger().debug("{} got cleanup_prv_shard()", conn);
+ assert(maybe_prv_shard_states != nullptr);
+ auto ref_prv_states = std::move(maybe_prv_shard_states);
+ auto &prv_states = *ref_prv_states;
+ return prv_states.close(
+ ).then([ref_prv_states=std::move(ref_prv_states)] {
+ ceph_assert_always(ref_prv_states->assert_closed_and_exit());
+ });
+ }).then([this] {
+ ceph_assert_always(maybe_prv_shard_states == nullptr);
+ });
+}
+
+seastar::future<>
+IOHandler::to_new_sid(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id new_sid,
+ ConnectionFRef conn_fref,
+ std::optional<bool> is_replace)
+{
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} to_new_sid(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq, new_sid, is_replace,
+ conn_fref=std::move(conn_fref)]() mutable {
+ return to_new_sid(cc_seq, new_sid, std::move(conn_fref), is_replace);
+ });
+ }
+
+ bool is_accept_or_connect = is_replace.has_value();
+ logger().debug("{} got {} to_new_sid_1(new_sid={}, {}) at {}",
+ conn, cc_seq, new_sid,
+ fmt::format("{}",
+ is_accept_or_connect ?
+ (*is_replace ? "accept(replace)" : "accept(!replace)") :
+ "connect"),
+ io_stat_printer{*this});
+ auto next_cc_seq = ++cc_seq;
+
+ if (get_io_state() != io_state_t::drop) {
+ ceph_assert_always(conn_ref);
+ if (new_sid != seastar::this_shard_id()) {
+ dispatchers.ms_handle_shard_change(conn_ref, new_sid, is_accept_or_connect);
+ // user can make changes
+ }
+ } else {
+ // it is possible that both io_handler and protocolv2 are
+ // trying to close each other from different cores simultaneously.
+ assert(!protocol_is_connected);
+ }
+
+ if (get_io_state() != io_state_t::drop) {
+ if (is_accept_or_connect) {
+ // protocol_is_connected can stay true here if the replacement is
+ // happening to an already connected connection.
+ } else {
+ ceph_assert_always(protocol_is_connected == false);
+ }
+ protocol_is_connected = true;
+ } else {
+ assert(!protocol_is_connected);
+ }
+
+ bool is_dropped = false;
+ if (get_io_state() == io_state_t::drop) {
+ is_dropped = true;
+ }
+ ceph_assert_always(get_io_state() != io_state_t::open);
+
+ // apply the switching atomically
+ ceph_assert_always(conn_ref);
+ conn_ref.reset();
+ auto prv_sid = get_shard_id();
+ ceph_assert_always(maybe_prv_shard_states == nullptr);
+ maybe_prv_shard_states = std::move(shard_states);
+ shard_states = shard_states_t::create_from_previous(
+ *maybe_prv_shard_states, new_sid);
+ assert(new_sid == get_shard_id());
+
+ return seastar::smp::submit_to(new_sid,
+ [this, next_cc_seq, is_dropped, prv_sid, is_replace, conn_fref=std::move(conn_fref)]() mutable {
+ logger().debug("{} got {} to_new_sid_2(prv_sid={}, is_dropped={}, {}) at {}",
+ conn, next_cc_seq, prv_sid, is_dropped,
+ fmt::format("{}",
+ is_replace.has_value() ?
+ (*is_replace ? "accept(replace)" : "accept(!replace)") :
+ "connect"),
+ io_stat_printer{*this});
+
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ ceph_assert_always(get_io_state() != io_state_t::open);
+ ceph_assert_always(!maybe_dropped_sid.has_value());
+ ceph_assert_always(crosscore.proceed_or_wait(next_cc_seq));
+
+ if (is_dropped) {
+ ceph_assert_always(get_io_state() == io_state_t::drop);
+ ceph_assert_always(shard_states->assert_closed_and_exit());
+ maybe_dropped_sid = prv_sid;
+ // cleanup_prv_shard() will be done in a follow-up close_io()
+ } else {
+ // possible at io_state_t::drop
+
+ // the previous shard is not cleaned up yet,
+ // but close_io() is only responsible for cleaning up the current shard,
+ // so clean up the previous shard here.
+ shard_states->dispatch_in_background(
+ "cleanup_prv_sid", conn, [this, prv_sid] {
+ return cleanup_prv_shard(prv_sid);
+ });
+ maybe_notify_out_dispatch();
+ }
+
+ ceph_assert_always(!conn_ref);
+ // assign even if already dropping
+ conn_ref = make_local_shared_foreign(std::move(conn_fref));
+
+ if (get_io_state() != io_state_t::drop) {
+ if (is_replace.has_value()) {
+ dispatchers.ms_handle_accept(conn_ref, prv_sid, *is_replace);
+ } else {
+ dispatchers.ms_handle_connect(conn_ref, prv_sid);
+ }
+ // user can make changes
+ }
+ });
+}
+
+seastar::future<> IOHandler::set_accepted_sid(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id sid,
+ ConnectionFRef conn_fref)
+{
+ assert(seastar::this_shard_id() == get_shard_id());
+ assert(get_io_state() == io_state_t::none);
+ ceph_assert_always(conn_ref);
+ conn_ref.reset();
+ assert(maybe_prv_shard_states == nullptr);
+ shard_states.reset();
+ shard_states = shard_states_t::create(sid, io_state_t::none);
+ return seastar::smp::submit_to(sid,
+ [this, cc_seq, conn_fref=std::move(conn_fref)]() mutable {
+ // must be the first to proceed
+ ceph_assert_always(crosscore.proceed_or_wait(cc_seq));
+
+ logger().debug("{} set accepted sid", conn);
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ ceph_assert_always(get_io_state() == io_state_t::none);
+ assert(maybe_prv_shard_states == nullptr);
+ ceph_assert_always(!conn_ref);
+ conn_ref = make_local_shared_foreign(std::move(conn_fref));
+ });
+}
+
+void IOHandler::dispatch_reset(bool is_replace)
+{
+ ceph_assert_always(get_io_state() == io_state_t::drop);
+ if (!need_dispatch_reset) {
+ return;
+ }
+ need_dispatch_reset = false;
+ ceph_assert_always(conn_ref);
+
+ dispatchers.ms_handle_reset(conn_ref, is_replace);
+ // user can make changes
+}
+
+void IOHandler::dispatch_remote_reset()
+{
+ if (get_io_state() == io_state_t::drop) {
+ return;
+ }
+ ceph_assert_always(conn_ref);
+
+ dispatchers.ms_handle_remote_reset(conn_ref);
+ // user can make changes
+}
+
+void IOHandler::ack_out_sent(seq_num_t seq)
+{
+ if (conn.policy.lossy) { // lossy connections don't keep sent messages
+ return;
+ }
+ while (!out_sent_msgs.empty() &&
+ out_sent_msgs.front()->get_seq() <= seq) {
+ logger().trace("{} got ack seq {} >= {}, pop {}",
+ conn, seq, out_sent_msgs.front()->get_seq(),
+ *out_sent_msgs.front());
+ out_sent_msgs.pop_front();
+ }
+}
+
+seastar::future<>
+IOHandler::do_out_dispatch(shard_states_t &ctx)
+{
+ return seastar::repeat([this, &ctx] {
+ switch (ctx.get_io_state()) {
+ case io_state_t::open: {
+ if (unlikely(!is_out_queued())) {
+ // try exit open dispatching
+ return frame_assembler->flush<false>(
+ ).then([this, &ctx] {
+ if (ctx.get_io_state() != io_state_t::open || is_out_queued()) {
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ }
+ // still nothing pending to send after flush,
+ // open dispatching can ONLY stop now
+ ctx.exit_out_dispatching("exit-open", conn);
+ return seastar::make_ready_future<stop_t>(stop_t::yes);
+ });
+ }
+
+ auto require_keepalive = need_keepalive;
+ need_keepalive = false;
+ auto maybe_keepalive_ack = next_keepalive_ack;
+ next_keepalive_ack = std::nullopt;
+ auto to_ack = ack_left;
+ assert(to_ack == 0 || in_seq > 0);
+ ack_left = 0;
+#ifdef UNIT_TESTS_BUILT
+ auto ret = sweep_out_pending_msgs_to_sent(
+ require_keepalive, maybe_keepalive_ack, to_ack > 0);
+ return frame_assembler->intercept_frames(ret.tags, true
+ ).then([this, bl=std::move(ret.bl)]() mutable {
+ return frame_assembler->write<false>(std::move(bl));
+ }
+#else
+ auto bl = sweep_out_pending_msgs_to_sent(
+ require_keepalive, maybe_keepalive_ack, to_ack > 0);
+ return frame_assembler->write<false>(std::move(bl)
+#endif
+ ).then([this, &ctx] {
+ if (ctx.get_io_state() != io_state_t::open) {
+ return frame_assembler->flush<false>(
+ ).then([] {
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ });
+ }
+
+ // FIXME: may leak a flush if state is changed after return and before
+ // the next repeat body.
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ });
+ }
+ case io_state_t::delay:
+ // delay out dispatching until open
+ ctx.notify_out_dispatching_stopped("delay...", conn);
+ return ctx.wait_state_change(
+ ).then([] { return stop_t::no; });
+ case io_state_t::drop:
+ ctx.exit_out_dispatching("dropped", conn);
+ return seastar::make_ready_future<stop_t>(stop_t::yes);
+ case io_state_t::switched:
+ ctx.exit_out_dispatching("switched", conn);
+ return seastar::make_ready_future<stop_t>(stop_t::yes);
+ default:
+ ceph_abort("impossible");
+ }
+ }).handle_exception_type([this, &ctx](const std::system_error& e) {
+ auto io_state = ctx.get_io_state();
+ if (e.code() != std::errc::broken_pipe &&
+ e.code() != std::errc::connection_reset &&
+ e.code() != error::negotiation_failure) {
+ logger().error("{} do_out_dispatch(): unexpected error at {} -- {}",
+ conn, io_state, e.what());
+ ceph_abort();
+ }
+
+ if (io_state == io_state_t::open) {
+ auto cc_seq = crosscore.prepare_submit();
+ logger().info("{} do_out_dispatch(): fault at {}, {}, going to delay -- {}, "
+ "send {} notify_out_fault()",
+ conn, io_state, io_stat_printer{*this}, e.what(), cc_seq);
+ std::exception_ptr eptr;
+ try {
+ throw e;
+ } catch(...) {
+ eptr = std::current_exception();
+ }
+ do_set_io_state(io_state_t::delay);
+ shard_states->dispatch_in_background(
+ "notify_out_fault(out)", conn, [this, cc_seq, eptr] {
+ auto states = get_states();
+ return seastar::smp::submit_to(
+ conn.get_messenger_shard_id(), [this, cc_seq, eptr, states] {
+ return handshake_listener->notify_out_fault(
+ cc_seq, "do_out_dispatch", eptr, states);
+ });
+ });
+ } else {
+ if (io_state != io_state_t::switched) {
+ logger().info("{} do_out_dispatch(): fault at {}, {} -- {}",
+ conn, io_state, io_stat_printer{*this}, e.what());
+ } else {
+ logger().info("{} do_out_dispatch(): fault at {} -- {}",
+ conn, io_state, e.what());
+ }
+ }
+
+ return do_out_dispatch(ctx);
+ });
+}
+
+void IOHandler::maybe_notify_out_dispatch()
+{
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ if (is_out_queued()) {
+ notify_out_dispatch();
+ }
+}
+
+void IOHandler::notify_out_dispatch()
+{
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ assert(is_out_queued());
+ if (need_notify_out) {
+ auto cc_seq = crosscore.prepare_submit();
+ logger().debug("{} send {} notify_out()",
+ conn, cc_seq);
+ shard_states->dispatch_in_background(
+ "notify_out", conn, [this, cc_seq] {
+ return seastar::smp::submit_to(
+ conn.get_messenger_shard_id(), [this, cc_seq] {
+ return handshake_listener->notify_out(cc_seq);
+ });
+ });
+ }
+ if (shard_states->try_enter_out_dispatching()) {
+ shard_states->dispatch_in_background(
+ "do_out_dispatch", conn, [this] {
+ return do_out_dispatch(*shard_states);
+ });
+ }
+}
+
+seastar::future<>
+IOHandler::read_message(
+ shard_states_t &ctx,
+ utime_t throttle_stamp,
+ std::size_t msg_size)
+{
+ return frame_assembler->read_frame_payload<false>(
+ ).then([this, throttle_stamp, msg_size, &ctx](auto payload) {
+ if (unlikely(ctx.get_io_state() != io_state_t::open)) {
+ logger().debug("{} triggered {} during read_message()",
+ conn, ctx.get_io_state());
+ abort_protocol();
+ }
+
+ utime_t recv_stamp{seastar::lowres_system_clock::now()};
+
+ // we need to get the size before std::moving segments data
+ auto msg_frame = MessageFrame::Decode(*payload);
+ // XXX: paranoid copy just to avoid oops
+ ceph_msg_header2 current_header = msg_frame.header();
+
+ logger().trace("{} got {} + {} + {} byte message,"
+ " envelope type={} src={} off={} seq={}",
+ conn,
+ msg_frame.front_len(),
+ msg_frame.middle_len(),
+ msg_frame.data_len(),
+ current_header.type,
+ conn.get_peer_name(),
+ current_header.data_off,
+ current_header.seq);
+
+ ceph_msg_header header{current_header.seq,
+ current_header.tid,
+ current_header.type,
+ current_header.priority,
+ current_header.version,
+ ceph_le32(msg_frame.front_len()),
+ ceph_le32(msg_frame.middle_len()),
+ ceph_le32(msg_frame.data_len()),
+ current_header.data_off,
+ conn.get_peer_name(),
+ current_header.compat_version,
+ current_header.reserved,
+ ceph_le32(0)};
+ ceph_msg_footer footer{ceph_le32(0), ceph_le32(0),
+ ceph_le32(0), ceph_le64(0), current_header.flags};
+
+ Message *message = decode_message(nullptr, 0, header, footer,
+ msg_frame.front(), msg_frame.middle(), msg_frame.data(), nullptr);
+ if (!message) {
+ logger().warn("{} decode message failed", conn);
+ abort_in_fault();
+ }
+
+ // store reservation size in message, so we don't get confused
+ // by messages entering the dispatch queue through other paths.
+ message->set_dispatch_throttle_size(msg_size);
+
+ message->set_throttle_stamp(throttle_stamp);
+ message->set_recv_stamp(recv_stamp);
+ message->set_recv_complete_stamp(utime_t{seastar::lowres_system_clock::now()});
+
+ // check received seq#. if it is old, drop the message.
+ // note that incoming messages may skip ahead. this is convenient for the
+ // client side queueing because messages can't be renumbered, but the (kernel)
+ // client will occasionally pull a message out of the sent queue to send
+ // elsewhere. in that case it doesn't matter if we "got" it or not.
+ uint64_t cur_seq = in_seq;
+ if (message->get_seq() <= cur_seq) {
+ logger().error("{} got old message {} <= {} {}, discarding",
+ conn, message->get_seq(), cur_seq, *message);
+ if (HAVE_FEATURE(conn.features, RECONNECT_SEQ) &&
+ local_conf()->ms_die_on_old_message) {
+ ceph_assert(0 == "old msgs despite reconnect_seq feature");
+ }
+ return seastar::now();
+ } else if (message->get_seq() > cur_seq + 1) {
+ logger().error("{} missed message? skipped from seq {} to {}",
+ conn, cur_seq, message->get_seq());
+ if (local_conf()->ms_die_on_skipped_message) {
+ ceph_assert(0 == "skipped incoming seq");
+ }
+ }
+
+ // note last received message.
+ in_seq = message->get_seq();
+ if (conn.policy.lossy) {
+ logger().debug("{} <== #{} === {} ({})",
+ conn,
+ message->get_seq(),
+ *message,
+ message->get_type());
+ } else {
+ logger().debug("{} <== #{},{} === {} ({})",
+ conn,
+ message->get_seq(),
+ current_header.ack_seq,
+ *message,
+ message->get_type());
+ }
+
+ // notify ack
+ if (!conn.policy.lossy) {
+ ++ack_left;
+ notify_out_dispatch();
+ }
+
+ ack_out_sent(current_header.ack_seq);
+
+ // TODO: change MessageRef with seastar::shared_ptr
+ auto msg_ref = MessageRef{message, false};
+ assert(ctx.get_io_state() == io_state_t::open);
+ assert(get_io_state() == io_state_t::open);
+ ceph_assert_always(conn_ref);
+
+ // throttle the reading process by the returned future
+ return dispatchers.ms_dispatch(conn_ref, std::move(msg_ref));
+ // user can make changes
+ });
+}
+
+void IOHandler::do_in_dispatch()
+{
+ shard_states->enter_in_dispatching();
+ shard_states->dispatch_in_background(
+ "do_in_dispatch", conn, [this, &ctx=*shard_states] {
+ return seastar::keep_doing([this, &ctx] {
+ return frame_assembler->read_main_preamble<false>(
+ ).then([this, &ctx](auto ret) {
+ switch (ret.tag) {
+ case Tag::MESSAGE: {
+ size_t msg_size = get_msg_size(*ret.rx_frame_asm);
+ return seastar::futurize_invoke([this] {
+ // throttle_message() logic
+ if (!conn.policy.throttler_messages) {
+ return seastar::now();
+ }
+ // TODO: message throttler
+ ceph_abort("TODO");
+ return seastar::now();
+ }).then([this, msg_size] {
+ // throttle_bytes() logic
+ if (!conn.policy.throttler_bytes) {
+ return seastar::now();
+ }
+ if (!msg_size) {
+ return seastar::now();
+ }
+ logger().trace("{} wants {} bytes from policy throttler {}/{}",
+ conn, msg_size,
+ conn.policy.throttler_bytes->get_current(),
+ conn.policy.throttler_bytes->get_max());
+ return conn.policy.throttler_bytes->get(msg_size);
+ }).then([this, msg_size, &ctx] {
+ // TODO: throttle_dispatch_queue() logic
+ utime_t throttle_stamp{seastar::lowres_system_clock::now()};
+ return read_message(ctx, throttle_stamp, msg_size);
+ });
+ }
+ case Tag::ACK:
+ return frame_assembler->read_frame_payload<false>(
+ ).then([this](auto payload) {
+ // handle_message_ack() logic
+ auto ack = AckFrame::Decode(payload->back());
+ logger().debug("{} GOT AckFrame: seq={}", conn, ack.seq());
+ ack_out_sent(ack.seq());
+ });
+ case Tag::KEEPALIVE2:
+ return frame_assembler->read_frame_payload<false>(
+ ).then([this](auto payload) {
+ // handle_keepalive2() logic
+ auto keepalive_frame = KeepAliveFrame::Decode(payload->back());
+ logger().debug("{} GOT KeepAliveFrame: timestamp={}",
+ conn, keepalive_frame.timestamp());
+ // notify keepalive ack
+ next_keepalive_ack = keepalive_frame.timestamp();
+ if (seastar::this_shard_id() == get_shard_id()) {
+ notify_out_dispatch();
+ }
+
+ last_keepalive = seastar::lowres_system_clock::now();
+ });
+ case Tag::KEEPALIVE2_ACK:
+ return frame_assembler->read_frame_payload<false>(
+ ).then([this](auto payload) {
+ // handle_keepalive2_ack() logic
+ auto keepalive_ack_frame = KeepAliveFrameAck::Decode(payload->back());
+ auto _last_keepalive_ack =
+ seastar::lowres_system_clock::time_point{keepalive_ack_frame.timestamp()};
+ set_last_keepalive_ack(_last_keepalive_ack);
+ logger().debug("{} GOT KeepAliveFrameAck: timestamp={}",
+ conn, _last_keepalive_ack);
+ });
+ default: {
+ logger().warn("{} do_in_dispatch() received unexpected tag: {}",
+ conn, static_cast<uint32_t>(ret.tag));
+ abort_in_fault();
+ }
+ }
+ });
+ }).handle_exception([this, &ctx](std::exception_ptr eptr) {
+ const char *e_what;
+ try {
+ std::rethrow_exception(eptr);
+ } catch (std::exception &e) {
+ e_what = e.what();
+ }
+
+ auto io_state = ctx.get_io_state();
+ if (io_state == io_state_t::open) {
+ auto cc_seq = crosscore.prepare_submit();
+ logger().info("{} do_in_dispatch(): fault at {}, {}, going to delay -- {}, "
+ "send {} notify_out_fault()",
+ conn, io_state, io_stat_printer{*this}, e_what, cc_seq);
+ do_set_io_state(io_state_t::delay);
+ shard_states->dispatch_in_background(
+ "notify_out_fault(in)", conn, [this, cc_seq, eptr] {
+ auto states = get_states();
+ return seastar::smp::submit_to(
+ conn.get_messenger_shard_id(), [this, cc_seq, eptr, states] {
+ return handshake_listener->notify_out_fault(
+ cc_seq, "do_in_dispatch", eptr, states);
+ });
+ });
+ } else {
+ if (io_state != io_state_t::switched) {
+ logger().info("{} do_in_dispatch(): fault at {}, {} -- {}",
+ conn, io_state, io_stat_printer{*this}, e_what);
+ } else {
+ logger().info("{} do_in_dispatch(): fault at {} -- {}",
+ conn, io_state, e_what);
+ }
+ }
+ }).finally([&ctx] {
+ ctx.exit_in_dispatching();
+ });
+ });
+}
+
+seastar::future<>
+IOHandler::close_io(
+ crosscore_t::seq_t cc_seq,
+ bool is_dispatch_reset,
+ bool is_replace)
+{
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ if (!crosscore.proceed_or_wait(cc_seq)) {
+ logger().debug("{} got {} close_io(), wait at {}",
+ conn, cc_seq, crosscore.get_in_seq());
+ return crosscore.wait(cc_seq
+ ).then([this, cc_seq, is_dispatch_reset, is_replace] {
+ return close_io(cc_seq, is_dispatch_reset, is_replace);
+ });
+ }
+
+ logger().debug("{} got {} close_io(reset={}, replace={})",
+ conn, cc_seq, is_dispatch_reset, is_replace);
+ ceph_assert_always(get_io_state() == io_state_t::drop);
+
+ if (is_dispatch_reset) {
+ dispatch_reset(is_replace);
+ }
+
+ ceph_assert_always(conn_ref);
+ conn_ref.reset();
+
+ // cannot be running in parallel with to_new_sid()
+ if (maybe_dropped_sid.has_value()) {
+ assert(shard_states->assert_closed_and_exit());
+ auto prv_sid = *maybe_dropped_sid;
+ return cleanup_prv_shard(prv_sid);
+ } else {
+ return shard_states->close(
+ ).then([this] {
+ assert(shard_states->assert_closed_and_exit());
+ });
+ }
+}
+
+/*
+ * IOHandler::shard_states_t
+ */
+
+void
+IOHandler::shard_states_t::notify_out_dispatching_stopped(
+ const char *what, SocketConnection &conn)
+{
+ assert(seastar::this_shard_id() == sid);
+ if (unlikely(out_exit_dispatching.has_value())) {
+ out_exit_dispatching->set_value();
+ out_exit_dispatching = std::nullopt;
+ logger().info("{} do_out_dispatch: stop({}) at {}, set out_exit_dispatching",
+ conn, what, io_state);
+ } else {
+ if (unlikely(io_state != io_state_t::open)) {
+ logger().info("{} do_out_dispatch: stop({}) at {}, no out_exit_dispatching",
+ conn, what, io_state);
+ }
+ }
+}
+
+seastar::future<>
+IOHandler::shard_states_t::wait_io_exit_dispatching()
+{
+ assert(seastar::this_shard_id() == sid);
+ assert(io_state != io_state_t::open);
+ assert(!gate.is_closed());
+ return seastar::when_all(
+ [this] {
+ if (out_exit_dispatching) {
+ return out_exit_dispatching->get_future();
+ } else {
+ return seastar::now();
+ }
+ }(),
+ [this] {
+ if (in_exit_dispatching) {
+ return in_exit_dispatching->get_future();
+ } else {
+ return seastar::now();
+ }
+ }()
+ ).discard_result();
+}
+
+IOHandler::shard_states_ref_t
+IOHandler::shard_states_t::create_from_previous(
+ shard_states_t &prv_states,
+ seastar::shard_id new_sid)
+{
+ auto io_state = prv_states.io_state;
+ assert(io_state != io_state_t::open);
+ auto ret = shard_states_t::create(new_sid, io_state);
+ if (io_state == io_state_t::drop) {
+ // the new gate should never be used
+ auto fut = ret->gate.close();
+ ceph_assert_always(fut.available());
+ }
+ prv_states.set_io_state(io_state_t::switched);
+ return ret;
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/io_handler.h b/src/crimson/net/io_handler.h
new file mode 100644
index 000000000..f53c2ba64
--- /dev/null
+++ b/src/crimson/net/io_handler.h
@@ -0,0 +1,623 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+
+#include <seastar/core/shared_future.hh>
+#include <seastar/util/later.hh>
+
+#include "crimson/common/gated.h"
+#include "Fwd.h"
+#include "SocketConnection.h"
+#include "FrameAssemblerV2.h"
+
+namespace crimson::net {
+
+/**
+ * crosscore_t
+ *
+ * To preserve the event order across cores.
+ */
+class crosscore_t {
+public:
+ using seq_t = uint64_t;
+
+ crosscore_t() = default;
+ ~crosscore_t() = default;
+
+ seq_t get_in_seq() const {
+ return in_seq;
+ }
+
+ seq_t prepare_submit() {
+ ++out_seq;
+ return out_seq;
+ }
+
+ bool proceed_or_wait(seq_t seq) {
+ if (seq == in_seq + 1) {
+ ++in_seq;
+ if (unlikely(in_pr_wait.has_value())) {
+ in_pr_wait->set_value();
+ in_pr_wait = std::nullopt;
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ seastar::future<> wait(seq_t seq) {
+ assert(seq != in_seq + 1);
+ if (!in_pr_wait.has_value()) {
+ in_pr_wait = seastar::shared_promise<>();
+ }
+ return in_pr_wait->get_shared_future();
+ }
+
+private:
+ seq_t out_seq = 0;
+ seq_t in_seq = 0;
+ std::optional<seastar::shared_promise<>> in_pr_wait;
+};
+
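
Illustrative sketch (not part of this patch): the pattern used throughout IOHandler below is that the submitting side calls prepare_submit() before a cross-core submit_to(), and the receiving side either proceeds or re-waits until the sequence is in order. The wrapping struct and do_event() are hypothetical.

  struct ordered_receiver_t {
    crosscore_t crosscore;

    // receiver side: apply events strictly in submission order
    seastar::future<> handle(crosscore_t::seq_t cc_seq) {
      if (!crosscore.proceed_or_wait(cc_seq)) {
        return crosscore.wait(cc_seq).then([this, cc_seq] {
          return handle(cc_seq);  // retry once it is our turn
        });
      }
      return do_event();  // hypothetical ordered operation
    }

    seastar::future<> do_event() { return seastar::now(); }
  };

  // sender side, before hopping cores:
  //   auto cc_seq = crosscore.prepare_submit();
  //   return seastar::smp::submit_to(
  //       target_sid, [&recv, cc_seq] { return recv.handle(cc_seq); });
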
+/**
+ * io_handler_state
+ *
+ * A snapshot of the IOHandler states that needs to be propagated to
+ * ProtocolV2 asynchronously.
+ */
+struct io_handler_state {
+ seq_num_t in_seq;
+ bool is_out_queued;
+ bool has_out_sent;
+
+ bool is_out_queued_or_sent() const {
+ return is_out_queued || has_out_sent;
+ }
+
+ /*
+ * should be consistent with the corresponding interfaces in IOHandler
+ */
+
+ void reset_session(bool full) {
+ in_seq = 0;
+ if (full) {
+ is_out_queued = false;
+ has_out_sent = false;
+ }
+ }
+
+ void reset_peer_state() {
+ in_seq = 0;
+ is_out_queued = is_out_queued_or_sent();
+ has_out_sent = false;
+ }
+
+ void requeue_out_sent_up_to() {
+ // noop since the information is insufficient
+ }
+
+ void requeue_out_sent() {
+ if (has_out_sent) {
+ has_out_sent = false;
+ is_out_queued = true;
+ }
+ }
+};
+
+/**
+ * HandshakeListener
+ *
+ * The interface class for IOHandler to notify the ProtocolV2.
+ *
+ * The notifications may be cross-core and must be sent to
+ * SocketConnection::get_messenger_shard_id()
+ */
+class HandshakeListener {
+public:
+ virtual ~HandshakeListener() = default;
+
+ HandshakeListener(const HandshakeListener&) = delete;
+ HandshakeListener(HandshakeListener &&) = delete;
+ HandshakeListener &operator=(const HandshakeListener &) = delete;
+ HandshakeListener &operator=(HandshakeListener &&) = delete;
+
+ virtual seastar::future<> notify_out(
+ crosscore_t::seq_t cc_seq) = 0;
+
+ virtual seastar::future<> notify_out_fault(
+ crosscore_t::seq_t cc_seq,
+ const char *where,
+ std::exception_ptr,
+ io_handler_state) = 0;
+
+ virtual seastar::future<> notify_mark_down(
+ crosscore_t::seq_t cc_seq) = 0;
+
+protected:
+ HandshakeListener() = default;
+};
+
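
Illustrative skeleton (not part of this patch): per the comment above, the concrete listener is the ProtocolV2 handshake side; an implementation has roughly this shape, and since each callback may arrive cross-core it would typically be ordered with its own crosscore_t as sketched earlier.

  class ExampleHandshakeListener final : public HandshakeListener {
    seastar::future<> notify_out(crosscore_t::seq_t cc_seq) final {
      // output is pending while IO is delayed; try to (re)establish
      return seastar::now();
    }
    seastar::future<> notify_out_fault(crosscore_t::seq_t cc_seq,
                                       const char *where,
                                       std::exception_ptr eptr,
                                       io_handler_state states) final {
      // the IO path faulted at `where`; reconnect using the snapshot `states`
      return seastar::now();
    }
    seastar::future<> notify_mark_down(crosscore_t::seq_t cc_seq) final {
      // the user marked the connection down; close the protocol side
      return seastar::now();
    }
  };
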
+/**
+ * IOHandler
+ *
+ * Implements the message read and write paths after the handshake, and is
+ * also responsible for dispatching events. It is supposed to run on the same
+ * core as the underlying socket and the FrameAssemblerV2 class.
+ */
+class IOHandler final : public ConnectionHandler {
+public:
+ IOHandler(ChainedDispatchers &,
+ SocketConnection &);
+
+ ~IOHandler() final;
+
+ IOHandler(const IOHandler &) = delete;
+ IOHandler(IOHandler &&) = delete;
+ IOHandler &operator=(const IOHandler &) = delete;
+ IOHandler &operator=(IOHandler &&) = delete;
+
+/*
+ * as ConnectionHandler
+ */
+public:
+ seastar::shard_id get_shard_id() const final {
+ return shard_states->get_shard_id();
+ }
+
+ bool is_connected() const final {
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ return protocol_is_connected;
+ }
+
+ seastar::future<> send(MessageFRef msg) final;
+
+ seastar::future<> send_keepalive() final;
+
+ clock_t::time_point get_last_keepalive() const final {
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ return last_keepalive;
+ }
+
+ clock_t::time_point get_last_keepalive_ack() const final {
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ return last_keepalive_ack;
+ }
+
+ void set_last_keepalive_ack(clock_t::time_point when) final {
+ ceph_assert_always(seastar::this_shard_id() == get_shard_id());
+ last_keepalive_ack = when;
+ }
+
+ void mark_down() final;
+
+/*
+ * as IOHandler to be called by ProtocolV2 handshake
+ *
+ * The calls may be cross-core and asynchronous
+ */
+public:
+ /*
+ * should not be called cross-core
+ */
+
+ void set_handshake_listener(HandshakeListener &hl) {
+ assert(seastar::this_shard_id() == get_shard_id());
+ ceph_assert_always(handshake_listener == nullptr);
+ handshake_listener = &hl;
+ }
+
+ io_handler_state get_states() const {
+ // might be called from prv_sid during wait_io_exit_dispatching()
+ return {in_seq, is_out_queued(), has_out_sent()};
+ }
+
+ struct io_stat_printer {
+ const IOHandler &io_handler;
+ };
+ void print_io_stat(std::ostream &out) const;
+
+ seastar::future<> set_accepted_sid(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id sid,
+ ConnectionFRef conn_fref);
+
+ /*
+ * may be called cross-core
+ */
+
+ seastar::future<> close_io(
+ crosscore_t::seq_t cc_seq,
+ bool is_dispatch_reset,
+ bool is_replace);
+
+ /**
+ * io_state_t
+ *
+ * The io_state is changed with the protocol state, to control the
+ * io behavior accordingly.
+ */
+ enum class io_state_t : uint8_t {
+ none, // no IO is possible as the connection is not available to the user yet.
+ delay, // IO is delayed until open.
+ open, // Dispatch In and Out concurrently.
+ drop, // Drop IO as the connection is closed.
+ switched // IO is switched to a different core
+ // (is moved to maybe_prv_shard_states)
+ };
+ friend class fmt::formatter<io_state_t>;
+
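
Read together with the assertions in do_set_io_state(), the enum above implies roughly this lifecycle (an illustrative summary, not normative):

  //   none ---(handshake established)-----------> open
  //   open ---(fault)---> delay ---(ready)------> open
  //   any active state ---(mark_down()/close)---> drop      (terminal)
  //   non-open state ---(moved to a new core)---> switched  (left on the old core)
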
+ seastar::future<> set_io_state(
+ crosscore_t::seq_t cc_seq,
+ io_state_t new_state,
+ FrameAssemblerV2Ref fa,
+ bool set_notify_out);
+
+ struct exit_dispatching_ret {
+ FrameAssemblerV2Ref frame_assembler;
+ io_handler_state io_states;
+ };
+ seastar::future<exit_dispatching_ret>
+ wait_io_exit_dispatching(
+ crosscore_t::seq_t cc_seq);
+
+ seastar::future<> reset_session(
+ crosscore_t::seq_t cc_seq,
+ bool full);
+
+ seastar::future<> reset_peer_state(
+ crosscore_t::seq_t cc_seq);
+
+ seastar::future<> requeue_out_sent_up_to(
+ crosscore_t::seq_t cc_seq,
+ seq_num_t msg_seq);
+
+ seastar::future<> requeue_out_sent(
+ crosscore_t::seq_t cc_seq);
+
+ seastar::future<> dispatch_accept(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id new_sid,
+ ConnectionFRef,
+ bool is_replace);
+
+ seastar::future<> dispatch_connect(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id new_sid,
+ ConnectionFRef);
+
+ private:
+ class shard_states_t;
+ using shard_states_ref_t = std::unique_ptr<shard_states_t>;
+
+ class shard_states_t {
+ public:
+ shard_states_t(seastar::shard_id _sid, io_state_t state)
+ : sid{_sid}, io_state{state} {}
+
+ seastar::shard_id get_shard_id() const {
+ return sid;
+ }
+
+ io_state_t get_io_state() const {
+ assert(seastar::this_shard_id() == sid);
+ return io_state;
+ }
+
+ void set_io_state(io_state_t new_state) {
+ assert(seastar::this_shard_id() == sid);
+ assert(io_state != new_state);
+ pr_io_state_changed.set_value();
+ pr_io_state_changed = seastar::promise<>();
+ if (io_state == io_state_t::open) {
+ // from open
+ if (out_dispatching) {
+ ceph_assert_always(!out_exit_dispatching.has_value());
+ out_exit_dispatching = seastar::promise<>();
+ }
+ }
+ io_state = new_state;
+ }
+
+ seastar::future<> wait_state_change() {
+ assert(seastar::this_shard_id() == sid);
+ return pr_io_state_changed.get_future();
+ }
+
+ template <typename Func>
+ void dispatch_in_background(
+ const char *what, SocketConnection &who, Func &&func) {
+ assert(seastar::this_shard_id() == sid);
+ ceph_assert_always(!gate.is_closed());
+ gate.dispatch_in_background(what, who, std::move(func));
+ }
+
+ void enter_in_dispatching() {
+ assert(seastar::this_shard_id() == sid);
+ assert(io_state == io_state_t::open);
+ ceph_assert_always(!in_exit_dispatching.has_value());
+ in_exit_dispatching = seastar::promise<>();
+ }
+
+ void exit_in_dispatching() {
+ assert(seastar::this_shard_id() == sid);
+ assert(io_state != io_state_t::open);
+ ceph_assert_always(in_exit_dispatching.has_value());
+ in_exit_dispatching->set_value();
+ in_exit_dispatching = std::nullopt;
+ }
+
+ bool try_enter_out_dispatching() {
+ assert(seastar::this_shard_id() == sid);
+ if (out_dispatching) {
+ // already dispatching out
+ return false;
+ }
+ switch (io_state) {
+ case io_state_t::open:
+ [[fallthrough]];
+ case io_state_t::delay:
+ out_dispatching = true;
+ return true;
+ case io_state_t::drop:
+ [[fallthrough]];
+ case io_state_t::switched:
+ // do not dispatch out
+ return false;
+ default:
+ ceph_abort("impossible");
+ }
+ }
+
+ void notify_out_dispatching_stopped(
+ const char *what, SocketConnection &conn);
+
+ void exit_out_dispatching(
+ const char *what, SocketConnection &conn) {
+ assert(seastar::this_shard_id() == sid);
+ ceph_assert_always(out_dispatching);
+ out_dispatching = false;
+ notify_out_dispatching_stopped(what, conn);
+ }
+
+ seastar::future<> wait_io_exit_dispatching();
+
+ seastar::future<> close() {
+ assert(seastar::this_shard_id() == sid);
+ assert(!gate.is_closed());
+ return gate.close();
+ }
+
+ bool assert_closed_and_exit() const {
+ assert(seastar::this_shard_id() == sid);
+ if (gate.is_closed()) {
+ ceph_assert_always(io_state == io_state_t::drop ||
+ io_state == io_state_t::switched);
+ ceph_assert_always(!out_dispatching);
+ ceph_assert_always(!out_exit_dispatching);
+ ceph_assert_always(!in_exit_dispatching);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ static shard_states_ref_t create(
+ seastar::shard_id sid, io_state_t state) {
+ return std::make_unique<shard_states_t>(sid, state);
+ }
+
+ static shard_states_ref_t create_from_previous(
+ shard_states_t &prv_states, seastar::shard_id new_sid);
+
+ private:
+ const seastar::shard_id sid;
+ io_state_t io_state;
+
+ crimson::common::Gated gate;
+ seastar::promise<> pr_io_state_changed;
+ bool out_dispatching = false;
+ std::optional<seastar::promise<>> out_exit_dispatching;
+ std::optional<seastar::promise<>> in_exit_dispatching;
+ };
+
+ void do_set_io_state(
+ io_state_t new_state,
+ std::optional<crosscore_t::seq_t> cc_seq = std::nullopt,
+ FrameAssemblerV2Ref fa = nullptr,
+ bool set_notify_out = false);
+
+ io_state_t get_io_state() const {
+ return shard_states->get_io_state();
+ }
+
+ void do_requeue_out_sent();
+
+ void do_requeue_out_sent_up_to(seq_num_t seq);
+
+ void assign_frame_assembler(FrameAssemblerV2Ref);
+
+ seastar::future<> send_redirected(MessageFRef msg);
+
+ seastar::future<> do_send(MessageFRef msg);
+
+ seastar::future<> send_keepalive_redirected();
+
+ seastar::future<> do_send_keepalive();
+
+ seastar::future<> to_new_sid(
+ crosscore_t::seq_t cc_seq,
+ seastar::shard_id new_sid,
+ ConnectionFRef,
+ std::optional<bool> is_replace);
+
+ void dispatch_reset(bool is_replace);
+
+ void dispatch_remote_reset();
+
+ bool is_out_queued() const {
+ return (!out_pending_msgs.empty() ||
+ ack_left > 0 ||
+ need_keepalive ||
+ next_keepalive_ack.has_value());
+ }
+
+ bool has_out_sent() const {
+ return !out_sent_msgs.empty();
+ }
+
+ void reset_in();
+
+ void reset_out();
+
+ void discard_out_sent();
+
+ seastar::future<> do_out_dispatch(shard_states_t &ctx);
+
+#ifdef UNIT_TESTS_BUILT
+ struct sweep_ret {
+ ceph::bufferlist bl;
+ std::vector<ceph::msgr::v2::Tag> tags;
+ };
+ sweep_ret
+#else
+ ceph::bufferlist
+#endif
+ sweep_out_pending_msgs_to_sent(
+ bool require_keepalive,
+ std::optional<utime_t> maybe_keepalive_ack,
+ bool require_ack);
+
+ void maybe_notify_out_dispatch();
+
+ void notify_out_dispatch();
+
+ void ack_out_sent(seq_num_t seq);
+
+ seastar::future<> read_message(
+ shard_states_t &ctx,
+ utime_t throttle_stamp,
+ std::size_t msg_size);
+
+ void do_in_dispatch();
+
+ seastar::future<> cleanup_prv_shard(seastar::shard_id prv_sid);
+
+private:
+ shard_states_ref_t shard_states;
+
+ crosscore_t crosscore;
+
+  // a drop happened in the previous sid
+ std::optional<seastar::shard_id> maybe_dropped_sid;
+
+ // the remaining states in the previous sid for cleanup, see to_new_sid()
+ shard_states_ref_t maybe_prv_shard_states;
+
+ ChainedDispatchers &dispatchers;
+
+ SocketConnection &conn;
+
+ // core local reference for dispatching, valid until reset/close
+ ConnectionRef conn_ref;
+
+ HandshakeListener *handshake_listener = nullptr;
+
+ FrameAssemblerV2Ref frame_assembler;
+
+ bool protocol_is_connected = false;
+
+ bool need_dispatch_reset = true;
+
+ /*
+ * out states for writing
+ */
+
+ /// the seq num of the last transmitted message
+ seq_num_t out_seq = 0;
+
+ // messages to be resent after connection gets reset
+ std::deque<MessageFRef> out_pending_msgs;
+
+ // messages sent, but not yet acked by peer
+ std::deque<MessageFRef> out_sent_msgs;
+
+ bool need_keepalive = false;
+
+ std::optional<utime_t> next_keepalive_ack = std::nullopt;
+
+ uint64_t ack_left = 0;
+
+ bool need_notify_out = false;
+
+ /*
+ * in states for reading
+ */
+
+ /// the seq num of the last received message
+ seq_num_t in_seq = 0;
+
+ clock_t::time_point last_keepalive;
+
+ clock_t::time_point last_keepalive_ack;
+};
+
+inline std::ostream& operator<<(
+ std::ostream& out, IOHandler::io_stat_printer stat) {
+ stat.io_handler.print_io_stat(out);
+ return out;
+}
+
+} // namespace crimson::net
+
+template <>
+struct fmt::formatter<crimson::net::io_handler_state> {
+ constexpr auto parse(format_parse_context& ctx) {
+ return ctx.begin();
+ }
+
+ template <typename FormatContext>
+ auto format(crimson::net::io_handler_state state, FormatContext& ctx) {
+ return fmt::format_to(
+ ctx.out(),
+ "io(in_seq={}, is_out_queued={}, has_out_sent={})",
+ state.in_seq,
+ state.is_out_queued,
+ state.has_out_sent);
+ }
+};
+
+template <>
+struct fmt::formatter<crimson::net::IOHandler::io_state_t>
+ : fmt::formatter<std::string_view> {
+ template <typename FormatContext>
+ auto format(crimson::net::IOHandler::io_state_t state, FormatContext& ctx) {
+ using enum crimson::net::IOHandler::io_state_t;
+ std::string_view name;
+ switch (state) {
+ case none:
+ name = "none";
+ break;
+ case delay:
+ name = "delay";
+ break;
+ case open:
+ name = "open";
+ break;
+ case drop:
+ name = "drop";
+ break;
+ case switched:
+ name = "switched";
+ break;
+ }
+ return formatter<string_view>::format(name, ctx);
+ }
+};
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::net::IOHandler::io_stat_printer> : fmt::ostream_formatter {};
+#endif
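The formatter specializations above let the IO state be interpolated into log messages by name, and io_stat_printer routes through operator<< (via ostream_formatter on fmt >= 9). A minimal usage sketch, not taken from the tree, with an assumed include path for the header above:

    #include <fmt/format.h>
    #include "crimson/net/io_handler.h"  // assumed path of the header above

    // Sketch: the io_state_t formatter renders the enum by its state name.
    std::string describe(crimson::net::IOHandler::io_state_t st) {
      return fmt::format("io_state={}", st);  // e.g. "io_state=open"
    }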
diff --git a/src/crimson/os/CMakeLists.txt b/src/crimson/os/CMakeLists.txt
new file mode 100644
index 000000000..5054cabf4
--- /dev/null
+++ b/src/crimson/os/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_library(crimson-os STATIC
+ futurized_store.cc
+ ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc)
+add_subdirectory(cyanstore)
+
+if(WITH_BLUESTORE)
+ add_subdirectory(alienstore)
+ set(alienstore_lib crimson-alienstore)
+endif()
+
+add_subdirectory(seastore)
+target_link_libraries(crimson-os
+ crimson-cyanstore
+ ${alienstore_lib}
+ crimson-seastore
+ crimson)
diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt
new file mode 100644
index 000000000..c881f4fbc
--- /dev/null
+++ b/src/crimson/os/alienstore/CMakeLists.txt
@@ -0,0 +1,86 @@
+include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rocksdb/include")
+
+add_library(alien::cflags INTERFACE IMPORTED)
+set_target_properties(alien::cflags PROPERTIES
+ INTERFACE_COMPILE_DEFINITIONS "WITH_SEASTAR;WITH_ALIEN"
+ INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>)
+
+set(crimson_alien_common_srcs
+ ${PROJECT_SOURCE_DIR}/src/common/admin_socket.cc
+ ${PROJECT_SOURCE_DIR}/src/common/url_escape.cc
+ ${PROJECT_SOURCE_DIR}/src/common/blkdev.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Finisher.cc
+ ${PROJECT_SOURCE_DIR}/src/common/HeartbeatMap.cc
+ ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters_collection.cc
+ ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc
+ ${PROJECT_SOURCE_DIR}/src/common/SubProcess.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Throttle.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Timer.cc
+ ${PROJECT_SOURCE_DIR}/src/common/TrackedOp.cc
+ ${PROJECT_SOURCE_DIR}/src/common/WorkQueue.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc
+ ${PROJECT_SOURCE_DIR}/src/global/global_context.cc
+ ${PROJECT_SOURCE_DIR}/src/log/Log.cc
+ $<TARGET_OBJECTS:compressor_objs>
+ $<TARGET_OBJECTS:common_prioritycache_obj>)
+if(WITH_CEPH_DEBUG_MUTEX)
+ list(APPEND crimson_alien_common_srcs
+ ${PROJECT_SOURCE_DIR}/src/common/lockdep.cc
+ ${PROJECT_SOURCE_DIR}/src/common/mutex_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/condition_variable_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/shared_mutex_debug.cc)
+endif()
+add_library(crimson-alien-common STATIC
+ ${crimson_alien_common_srcs})
+
+target_link_libraries(crimson-alien-common
+ crimson-common
+ alien::cflags)
+
+set(alien_store_srcs
+ alien_store.cc
+ thread_pool.cc
+ alien_log.cc
+ ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BtreeAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/simple_bitmap.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/memstore/MemStore.cc)
+if(WITH_ZBD)
+ list(APPEND alien_store_srcs
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedFreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedAllocator.cc)
+endif()
+add_library(crimson-alienstore STATIC
+ ${alien_store_srcs})
+if(WITH_LTTNG)
+ add_dependencies(crimson-alienstore bluestore-tp)
+endif()
+target_link_libraries(crimson-alienstore
+ PRIVATE
+ alien::cflags
+ fmt::fmt
+ kv
+ heap_profiler
+ crimson-alien-common
+ ${BLKID_LIBRARIES}
+ ${UDEV_LIBRARIES}
+ crimson
+ blk)
diff --git a/src/crimson/os/alienstore/alien_collection.h b/src/crimson/os/alienstore/alien_collection.h
new file mode 100644
index 000000000..17a930e77
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_collection.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "os/ObjectStore.h"
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "alien_store.h"
+
+namespace crimson::os {
+
+class AlienCollection final : public FuturizedCollection {
+public:
+ AlienCollection(ObjectStore::CollectionHandle ch)
+ : FuturizedCollection(ch->cid),
+ collection(ch) {}
+
+ ~AlienCollection() {}
+
+ template <typename Func, typename Result = std::invoke_result_t<Func>>
+ seastar::futurize_t<Result> with_lock(Func&& func) {
+ // newer versions of Seastar provide two variants of `with_lock`
+ // - generic, friendly towards throwing move constructors of Func,
+ // - specialized for `noexcept`.
+ // unfortunately, the former has a limitation: the return value
+ // of `Func` must be compatible with `current_exception_as_future()`
+ // which boils down to returning `seastar::future<void>`.
+ static_assert(std::is_nothrow_move_constructible_v<Func>);
+ return seastar::with_lock(mutex, std::forward<Func>(func));
+ }
+
+private:
+ ObjectStore::CollectionHandle collection;
+ seastar::shared_mutex mutex;
+ friend AlienStore;
+};
+}
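Because of the static_assert, callers of with_lock() must pass a functor whose move constructor is noexcept; a capture-by-value lambda qualifies. A hedged sketch of a caller (hypothetical function name, assuming the header above is included):

    // Sketch: serialize work through the collection's shared_mutex.
    seastar::future<> with_collection_lock(crimson::os::AlienCollection& coll) {
      return coll.with_lock([] {
        // plain lambdas are nothrow-move-constructible, satisfying the
        // static_assert inside with_lock(); must return seastar::future<>.
        return seastar::make_ready_future<>();
      });
    }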
diff --git a/src/crimson/os/alienstore/alien_log.cc b/src/crimson/os/alienstore/alien_log.cc
new file mode 100644
index 000000000..b371af897
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_log.cc
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "alien_log.h"
+#include "log/SubsystemMap.h"
+#include <seastar/core/alien.hh>
+#include "crimson/common/log.h"
+
+namespace ceph::logging {
+CnLog::CnLog(const SubsystemMap *s, seastar::alien::instance& inst, unsigned shard)
+ :Log(s)
+ ,inst(inst)
+ ,shard(shard) {
+}
+
+CnLog::~CnLog() {
+}
+
+void CnLog::_flush(EntryVector& q, bool crash) {
+ seastar::alien::submit_to(inst, shard, [&q] {
+ for (auto& it : q) {
+ crimson::get_logger(it.m_subsys).log(
+ crimson::to_log_level(it.m_prio),
+ "{}",
+ it.strv());
+ }
+ return seastar::make_ready_future<>();
+ }).wait();
+ q.clear();
+ return;
+}
+
+} //namespace ceph::logging
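_flush() relies on seastar::alien::submit_to() to hop from the alien logging thread onto the owning reactor and then blocks on the returned std::future until the entries are written. A minimal sketch of that pattern, with hypothetical names and assuming it runs on a non-reactor thread:

    #include <seastar/core/alien.hh>
    #include <seastar/core/future.hh>

    // Sketch: run a closure on reactor `shard` from an alien thread and
    // block the calling thread until it completes, as _flush() does above.
    void run_on_reactor(seastar::alien::instance& inst, unsigned shard) {
      seastar::alien::submit_to(inst, shard, [] {
        return seastar::make_ready_future<>();  // executed on the reactor
      }).wait();  // std::future<void>::wait() blocks the alien thread
    }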
diff --git a/src/crimson/os/alienstore/alien_log.h b/src/crimson/os/alienstore/alien_log.h
new file mode 100644
index 000000000..0f966d9ab
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_log.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef ALIEN_LOG_H
+#define ALIEN_LOG_H
+
+#include "log/Log.h"
+
+namespace ceph {
+namespace logging {
+class SubsystemMap;
+}
+}
+
+namespace seastar::alien {
+ class instance;
+}
+namespace ceph::logging
+{
+class CnLog : public ceph::logging::Log
+{
+ seastar::alien::instance& inst;
+ unsigned shard;
+ void _flush(EntryVector& q, bool crash) override;
+public:
+ CnLog(const SubsystemMap *s, seastar::alien::instance& inst, unsigned shard);
+ ~CnLog() override;
+};
+}
+
+#endif
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
new file mode 100644
index 000000000..61f23de97
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -0,0 +1,620 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "alien_collection.h"
+#include "alien_store.h"
+#include "alien_log.h"
+
+#include <algorithm>
+#include <iterator>
+#include <map>
+#include <string_view>
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/alien.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/resource.hh>
+
+#include "common/ceph_context.h"
+#include "global/global_context.h"
+#include "include/Context.h"
+#include "os/ObjectStore.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/log.h"
+#include "crimson/os/futurized_store.h"
+
+using std::map;
+using std::set;
+using std::string;
+
+namespace {
+
+seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_alienstore);
+}
+
+class OnCommit final: public Context
+{
+ const int cpuid;
+ seastar::alien::instance &alien;
+ seastar::promise<> &alien_done;
+public:
+ OnCommit(
+ int id,
+ seastar::promise<> &done,
+ seastar::alien::instance &alien,
+ ceph::os::Transaction& txn)
+ : cpuid(id),
+ alien(alien),
+ alien_done(done) {
+ }
+
+ void finish(int) final {
+ return seastar::alien::submit_to(alien, cpuid, [this] {
+ alien_done.set_value();
+ return seastar::make_ready_future<>();
+ }).wait();
+ }
+};
+}
+
+namespace crimson::os {
+
+using crimson::common::get_conf;
+
+AlienStore::AlienStore(const std::string& type,
+ const std::string& path,
+ const ConfigValues& values)
+ : type(type),
+ path{path},
+ values(values)
+{
+}
+
+AlienStore::~AlienStore()
+{
+}
+
+seastar::future<> AlienStore::start()
+{
+ cct = std::make_unique<CephContext>(
+ CEPH_ENTITY_TYPE_OSD,
+ CephContext::create_options { CODE_ENVIRONMENT_UTILITY, 0,
+ [](const ceph::logging::SubsystemMap* subsys_map) {
+ return new ceph::logging::CnLog(subsys_map, seastar::engine().alien(), seastar::this_shard_id());
+ }
+ }
+ );
+ g_ceph_context = cct.get();
+ cct->_conf.set_config_values(values);
+ cct->_log->start();
+
+ store = ObjectStore::create(cct.get(), type, path);
+ if (!store) {
+ ceph_abort_msgf("unsupported objectstore type: %s", type.c_str());
+ }
+ auto cpu_cores = seastar::resource::parse_cpuset(
+ get_conf<std::string>("crimson_alien_thread_cpu_cores"));
+ // cores except the first "N_CORES_FOR_SEASTAR" ones will
+ // be used for alien threads scheduling:
+ // [0, N_CORES_FOR_SEASTAR) are reserved for seastar reactors
+ // [N_CORES_FOR_SEASTAR, ..] are assigned to alien threads.
+ if (!cpu_cores.has_value()) {
+ seastar::resource::cpuset cpuset;
+ std::copy(boost::counting_iterator<unsigned>(N_CORES_FOR_SEASTAR),
+ boost::counting_iterator<unsigned>(sysconf(_SC_NPROCESSORS_ONLN)),
+ std::inserter(cpuset, cpuset.end()));
+ if (cpuset.empty()) {
+ logger().error("{}: unable to get nproc: {}", __func__, errno);
+ } else {
+ cpu_cores = cpuset;
+ }
+ }
+ const auto num_threads =
+ get_conf<uint64_t>("crimson_alien_op_num_threads");
+ tp = std::make_unique<crimson::os::ThreadPool>(num_threads, 128, cpu_cores);
+ return tp->start();
+}
+
+seastar::future<> AlienStore::stop()
+{
+ if (!tp) {
+ // not really started yet
+ return seastar::now();
+ }
+ return tp->submit([this] {
+ for (auto [cid, ch]: coll_map) {
+ static_cast<AlienCollection*>(ch.get())->collection.reset();
+ }
+ store.reset();
+ cct.reset();
+ g_ceph_context = nullptr;
+
+ }).then([this] {
+ return tp->stop();
+ });
+}
+
+AlienStore::mount_ertr::future<> AlienStore::mount()
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return tp->submit([this] {
+ return store->mount();
+ }).then([] (const int r) -> mount_ertr::future<> {
+ if (r != 0) {
+ return crimson::stateful_ec{
+ std::error_code(-r, std::generic_category()) };
+ } else {
+ return mount_ertr::now();
+ }
+ });
+}
+
+seastar::future<> AlienStore::umount()
+{
+ logger().info("{}", __func__);
+ if (!tp) {
+ // not really started yet
+ return seastar::now();
+ }
+ return op_gate.close().then([this] {
+ return tp->submit([this] {
+ return store->umount();
+ });
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+AlienStore::mkfs_ertr::future<> AlienStore::mkfs(uuid_d osd_fsid)
+{
+ logger().debug("{}", __func__);
+ store->set_fsid(osd_fsid);
+ assert(tp);
+ return tp->submit([this] {
+ return store->mkfs();
+ }).then([] (int r) -> mkfs_ertr::future<> {
+ if (r != 0) {
+ return crimson::stateful_ec{
+ std::error_code(-r, std::generic_category()) };
+ } else {
+ return mkfs_ertr::now();
+ }
+ });
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+AlienStore::list_objects(CollectionRef ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(std::vector<ghobject_t>(), ghobject_t(),
+ [=, this] (auto &objects, auto &next) {
+ objects.reserve(limit);
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()),
+ [=, this, &objects, &next] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->collection_list(c->collection, start, end,
+ store->get_ideal_list_max(),
+ &objects, &next);
+ }).then([&objects, &next] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<
+ std::tuple<std::vector<ghobject_t>, ghobject_t>>(
+ std::move(objects), std::move(next));
+ });
+ });
+}
+
+seastar::future<CollectionRef> AlienStore::create_new_collection(const coll_t& cid)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return tp->submit([this, cid] {
+ return store->create_new_collection(cid);
+ }).then([this, cid] (ObjectStore::CollectionHandle c) {
+ CollectionRef ch;
+ auto cp = coll_map.find(c->cid);
+ if (cp == coll_map.end()) {
+ ch = new AlienCollection(c);
+ coll_map[c->cid] = ch;
+ } else {
+ ch = cp->second;
+ auto ach = static_cast<AlienCollection*>(ch.get());
+ if (ach->collection != c) {
+ ach->collection = c;
+ }
+ }
+ return seastar::make_ready_future<CollectionRef>(ch);
+ });
+
+}
+
+seastar::future<CollectionRef> AlienStore::open_collection(const coll_t& cid)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return tp->submit([this, cid] {
+ return store->open_collection(cid);
+ }).then([this] (ObjectStore::CollectionHandle c) {
+ if (!c) {
+ return seastar::make_ready_future<CollectionRef>();
+ }
+ CollectionRef ch;
+ auto cp = coll_map.find(c->cid);
+ if (cp == coll_map.end()){
+ ch = new AlienCollection(c);
+ coll_map[c->cid] = ch;
+ } else {
+ ch = cp->second;
+ auto ach = static_cast<AlienCollection*>(ch.get());
+ if (ach->collection != c){
+ ach->collection = c;
+ }
+ }
+ return seastar::make_ready_future<CollectionRef>(ch);
+ });
+}
+
+seastar::future<std::vector<coll_core_t>> AlienStore::list_collections()
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+
+ return do_with_op_gate(std::vector<coll_t>{}, [this] (auto &ls) {
+ return tp->submit([this, &ls] {
+ return store->list_collections(ls);
+ }).then([&ls] (int r) -> seastar::future<std::vector<coll_core_t>> {
+ assert(r == 0);
+ std::vector<coll_core_t> ret;
+ ret.resize(ls.size());
+ std::transform(
+ ls.begin(), ls.end(), ret.begin(),
+ [](auto p) { return std::make_pair(p, NULL_CORE); });
+ return seastar::make_ready_future<std::vector<coll_core_t>>(std::move(ret));
+ });
+ });
+}
+
+AlienStore::read_errorator::future<ceph::bufferlist>
+AlienStore::read(CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(ceph::bufferlist{}, [=, this] (auto &bl) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &bl] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->read(c->collection, oid, offset, len, bl, op_flags);
+ }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r == -EIO) {
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<ceph::bufferlist>(
+ std::move(bl));
+ }
+ });
+ });
+}
+
+AlienStore::read_errorator::future<ceph::bufferlist>
+AlienStore::readv(CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(ceph::bufferlist{},
+ [this, ch, oid, &m, op_flags](auto& bl) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()),
+ [this, ch, oid, &m, op_flags, &bl] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->readv(c->collection, oid, m, bl, op_flags);
+ }).then([&bl](int r) -> read_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r == -EIO) {
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<ceph::bufferlist>(
+ std::move(bl));
+ }
+ });
+ });
+}
+
+AlienStore::get_attr_errorator::future<ceph::bufferlist>
+AlienStore::get_attr(CollectionRef ch,
+ const ghobject_t& oid,
+ std::string_view name) const
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(ceph::bufferlist{}, std::string{name},
+ [=, this] (auto &value, const auto& name) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &value, &name] {
+ // XXX: `name` isn't a `std::string_view` anymore! it had to be converted
+ // to `std::string` for the sake of extending life-time not only of
+ // a _ptr-to-data_ but _data_ as well. Otherwise we would run into a use-
+ // after-free issue.
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->getattr(c->collection, oid, name.c_str(), value);
+ }).then([oid, &value](int r) -> get_attr_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r == -ENODATA) {
+ return crimson::ct_error::enodata::make();
+ } else {
+ return get_attr_errorator::make_ready_future<ceph::bufferlist>(
+ std::move(value));
+ }
+ });
+ });
+}
+
+AlienStore::get_attrs_ertr::future<AlienStore::attrs_t>
+AlienStore::get_attrs(CollectionRef ch,
+ const ghobject_t& oid)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(attrs_t{}, [=, this] (auto &aset) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &aset] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ const auto r = store->getattrs(c->collection, oid, aset);
+ return r;
+ }).then([&aset] (int r) -> get_attrs_ertr::future<attrs_t> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else {
+ return get_attrs_ertr::make_ready_future<attrs_t>(std::move(aset));
+ }
+ });
+ });
+}
+
+auto AlienStore::omap_get_values(CollectionRef ch,
+ const ghobject_t& oid,
+ const set<string>& keys)
+ -> read_errorator::future<omap_values_t>
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->omap_get_values(c->collection, oid, keys,
+ reinterpret_cast<map<string, bufferlist>*>(&values));
+ }).then([&values] (int r) -> read_errorator::future<omap_values_t> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else {
+ assert(r == 0);
+ return read_errorator::make_ready_future<omap_values_t>(
+ std::move(values));
+ }
+ });
+ });
+}
+
+auto AlienStore::omap_get_values(CollectionRef ch,
+ const ghobject_t &oid,
+ const std::optional<string> &start)
+ -> read_errorator::future<std::tuple<bool, omap_values_t>>
+{
+ logger().debug("{} with_start", __func__);
+ assert(tp);
+ return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->omap_get_values(c->collection, oid, start,
+ reinterpret_cast<map<string, bufferlist>*>(&values));
+ }).then([&values] (int r)
+ -> read_errorator::future<std::tuple<bool, omap_values_t>> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r < 0){
+ logger().error("omap_get_values(start): {}", r);
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<std::tuple<bool, omap_values_t>>(
+ true, std::move(values));
+ }
+ });
+ });
+}
+
+seastar::future<> AlienStore::do_transaction_no_callbacks(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn)
+{
+ logger().debug("{}", __func__);
+ auto id = seastar::this_shard_id();
+ auto done = seastar::promise<>();
+ return do_with_op_gate(
+ std::move(txn),
+ std::move(done),
+ [this, ch, id] (auto &txn, auto &done) {
+ AlienCollection* alien_coll = static_cast<AlienCollection*>(ch.get());
+    // moving `ch` is crucial for buildability on newer Seastar versions.
+ return alien_coll->with_lock([this, ch=std::move(ch), id, &txn, &done] {
+ assert(tp);
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()),
+ [this, ch, id, &txn, &done, &alien=seastar::engine().alien()] {
+ txn.register_on_commit(new OnCommit(id, done, alien, txn));
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->queue_transaction(c->collection, std::move(txn));
+ });
+ }).then([&done] (int r) {
+ assert(r == 0);
+ return done.get_future();
+ });
+ });
+}
+
+seastar::future<> AlienStore::inject_data_error(const ghobject_t& o)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return seastar::with_gate(op_gate, [=, this] {
+ return tp->submit([o, this] {
+ return store->inject_data_error(o);
+ });
+ });
+}
+
+seastar::future<> AlienStore::inject_mdata_error(const ghobject_t& o)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return seastar::with_gate(op_gate, [=, this] {
+ return tp->submit([=, this] {
+ return store->inject_mdata_error(o);
+ });
+ });
+}
+
+seastar::future<> AlienStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return seastar::with_gate(op_gate, [=, this] {
+ return tp->submit([=, this] {
+ return store->write_meta(key, value);
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<>();
+ });
+ });
+}
+
+seastar::future<std::tuple<int, std::string>>
+AlienStore::read_meta(const std::string& key)
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return seastar::with_gate(op_gate, [this, key] {
+ return tp->submit([this, key] {
+ std::string value;
+ int r = store->read_meta(key, &value);
+ if (r > 0) {
+ value.resize(r);
+ boost::algorithm::trim_right_if(value,
+ [] (unsigned char c) {return isspace(c);});
+ } else {
+ value.clear();
+ }
+ return std::make_pair(r, value);
+ }).then([] (auto entry) {
+ return seastar::make_ready_future<std::tuple<int, std::string>>(
+ std::move(entry));
+ });
+ });
+}
+
+uuid_d AlienStore::get_fsid() const
+{
+ logger().debug("{}", __func__);
+ return store->get_fsid();
+}
+
+seastar::future<store_statfs_t> AlienStore::stat() const
+{
+ logger().info("{}", __func__);
+ assert(tp);
+ return do_with_op_gate(store_statfs_t{}, [this] (store_statfs_t &st) {
+ return tp->submit([this, &st] {
+ return store->statfs(&st, nullptr);
+ }).then([&st] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<store_statfs_t>(std::move(st));
+ });
+ });
+}
+
+unsigned AlienStore::get_max_attr_name_length() const
+{
+ logger().info("{}", __func__);
+ return 256;
+}
+
+seastar::future<struct stat> AlienStore::stat(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ assert(tp);
+ return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [this, ch, oid, &st] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ store->stat(c->collection, oid, &st);
+ return st;
+ });
+ });
+}
+
+auto AlienStore::omap_get_header(CollectionRef ch,
+ const ghobject_t& oid)
+ -> get_attr_errorator::future<ceph::bufferlist>
+{
+ assert(tp);
+ return do_with_op_gate(ceph::bufferlist(), [=, this](auto& bl) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &bl] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->omap_get_header(c->collection, oid, &bl);
+ }).then([&bl](int r) -> get_attr_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r < 0) {
+ logger().error("omap_get_header: {}", r);
+ ceph_assert(0 == "impossible");
+ } else {
+ return get_attr_errorator::make_ready_future<ceph::bufferlist>(
+ std::move(bl));
+ }
+ });
+ });
+}
+
+AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ assert(tp);
+ return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &destmap] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->fiemap(c->collection, oid, off, len, destmap);
+ }).then([&destmap](int r)
+ -> read_errorator::future<std::map<uint64_t, uint64_t>> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else {
+ return read_errorator::make_ready_future<std::map<uint64_t, uint64_t>>(
+ std::move(destmap));
+ }
+ });
+ });
+}
+
+}
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
new file mode 100644
index 000000000..79c19b298
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_mutex.hh>
+
+#include "common/ceph_context.h"
+#include "os/ObjectStore.h"
+#include "osd/osd_types.h"
+
+#include "crimson/os/alienstore/thread_pool.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+using coll_core_t = FuturizedStore::coll_core_t;
+class AlienStore final : public FuturizedStore,
+ public FuturizedStore::Shard {
+public:
+ AlienStore(const std::string& type,
+ const std::string& path,
+ const ConfigValues& values);
+ ~AlienStore() final;
+
+ seastar::future<> start() final;
+ seastar::future<> stop() final;
+ mount_ertr::future<> mount() final;
+ seastar::future<> umount() final;
+
+ mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
+ read_errorator::future<ceph::bufferlist> read(CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+ read_errorator::future<ceph::bufferlist> readv(CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+
+
+ get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+ seastar::future<std::vector<coll_core_t>> list_collections() final;
+
+ seastar::future<> do_transaction_no_callbacks(
+ CollectionRef c,
+ ceph::os::Transaction&& txn) final;
+
+ // error injection
+ seastar::future<> inject_data_error(const ghobject_t& o) final;
+ seastar::future<> inject_mdata_error(const ghobject_t& o) final;
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+ seastar::future<std::tuple<int, std::string>> read_meta(
+ const std::string& key) final;
+ uuid_d get_fsid() const final;
+ seastar::future<store_statfs_t> stat() const final;
+ unsigned get_max_attr_name_length() const final;
+ seastar::future<struct stat> stat(
+ CollectionRef,
+ const ghobject_t&) final;
+ get_attr_errorator::future<ceph::bufferlist> omap_get_header(
+ CollectionRef,
+ const ghobject_t&) final;
+ read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef,
+ const ghobject_t&,
+ uint64_t off,
+ uint64_t len) final;
+
+ FuturizedStore::Shard& get_sharded_store() final {
+ return *this;
+ }
+
+private:
+ template <class... Args>
+ auto do_with_op_gate(Args&&... args) const {
+ return seastar::with_gate(op_gate,
+ // perfect forwarding in lambda's closure isn't available in C++17
+ // using tuple as workaround; see: https://stackoverflow.com/a/49902823
+ [args = std::make_tuple(std::forward<Args>(args)...)] () mutable {
+ return std::apply([] (auto&&... args) {
+ return seastar::do_with(std::forward<decltype(args)>(args)...);
+ }, std::move(args));
+ });
+ }
+
+ // number of cores that are PREVENTED from being scheduled
+ // to run alien store threads.
+ static constexpr int N_CORES_FOR_SEASTAR = 3;
+ mutable std::unique_ptr<crimson::os::ThreadPool> tp;
+ const std::string type;
+ const std::string path;
+ const ConfigValues values;
+ uint64_t used_bytes = 0;
+ std::unique_ptr<ObjectStore> store;
+ std::unique_ptr<CephContext> cct;
+ mutable seastar::gate op_gate;
+ std::unordered_map<coll_t, CollectionRef> coll_map;
+};
+}
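do_with_op_gate() materializes its argument pack into a std::tuple because C++17 lambdas cannot perfect-forward a pack into their closure; std::apply unpacks it again inside the gated continuation. A standalone sketch of the same workaround, with hypothetical names:

    #include <tuple>
    #include <utility>

    // Sketch: capture an argument pack in C++17 by packing it into a tuple,
    // then unpacking it with std::apply when the deferred call runs.
    template <class Fn, class... Args>
    auto defer_call(Fn fn, Args&&... args) {
      return [fn, args = std::make_tuple(std::forward<Args>(args)...)]() mutable {
        return std::apply(fn, std::move(args));
      };
    }

    // usage: auto f = defer_call([](int a, int b) { return a + b; }, 1, 2); f();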
diff --git a/src/crimson/os/alienstore/semaphore.h b/src/crimson/os/alienstore/semaphore.h
new file mode 100644
index 000000000..8cba02ab1
--- /dev/null
+++ b/src/crimson/os/alienstore/semaphore.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+#pragma once
+
+#include <semaphore.h>
+#include <ctime>
+#include <cerrno>
+#include <exception>
+#include <chrono>
+
+namespace crimson {
+
+// an implementation of std::counting_semaphore<> in C++17 using the POSIX
+// semaphore.
+//
+// LeastMaxValue is ignored, as we don't have different backends optimized
+// for different LeastMaxValues
+template<unsigned LeastMaxValue = 64>
+class counting_semaphore {
+ using clock_t = std::chrono::system_clock;
+public:
+ explicit counting_semaphore(unsigned count) noexcept {
+ sem_init(&sem, 0, count);
+ }
+
+ counting_semaphore(const counting_semaphore&) = delete;
+ counting_semaphore& operator=(const counting_semaphore&) = delete;
+
+ ~counting_semaphore() {
+ sem_destroy(&sem);
+ }
+
+ void acquire() noexcept {
+ for (;;) {
+ int err = sem_wait(&sem);
+ if (err != 0) {
+ if (errno == EINTR) {
+ continue;
+ } else {
+ std::terminate();
+ }
+ } else {
+ break;
+ }
+ }
+ }
+
+ void release(unsigned update = 1) {
+ for (; update != 0; --update) {
+ int err = sem_post(&sem);
+ if (err != 0) {
+ std::terminate();
+ }
+ }
+ }
+
+ template<typename Clock, typename Duration>
+ bool try_acquire_until(const std::chrono::time_point<Clock, Duration>& abs_time) noexcept {
+ auto s = std::chrono::time_point_cast<std::chrono::seconds>(abs_time);
+ auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(abs_time - s);
+ struct timespec ts = {
+ static_cast<std::time_t>(s.time_since_epoch().count()),
+ static_cast<long>(ns.count())
+ };
+ for (;;) {
+ if (int err = sem_timedwait(&sem, &ts); err) {
+ if (errno == EINTR) {
+ continue;
+ } else if (errno == ETIMEDOUT || errno == EINVAL) {
+ return false;
+ } else {
+ std::terminate();
+ }
+ } else {
+ break;
+ }
+ }
+ return true;
+ }
+
+ template<typename Rep, typename Period>
+ bool try_acquire_for(const std::chrono::duration<Rep, Period>& rel_time) {
+ return try_acquire_until(clock_t::now() + rel_time);
+ }
+
+private:
+ sem_t sem;
+};
+
+}
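A brief usage sketch of the fallback semaphore above (hypothetical producer/consumer, mirroring how ShardedWorkQueue pairs release() with try_acquire_for() in thread_pool.h):

    #include <chrono>

    // Sketch: one thread signals a unit of work, another waits up to 10 ms.
    crimson::counting_semaphore<64> slots{0};

    void produce() {
      slots.release();  // make one unit available
    }

    bool consume_with_timeout() {
      using namespace std::chrono_literals;
      return slots.try_acquire_for(10ms);  // false if nothing arrived in time
    }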
diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc
new file mode 100644
index 000000000..5cf9590e6
--- /dev/null
+++ b/src/crimson/os/alienstore/thread_pool.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "thread_pool.h"
+
+#include <chrono>
+#include <pthread.h>
+
+#include "include/ceph_assert.h"
+#include "crimson/common/config_proxy.h"
+
+using crimson::common::local_conf;
+
+namespace crimson::os {
+
+ThreadPool::ThreadPool(size_t n_threads,
+ size_t queue_sz,
+ const std::optional<seastar::resource::cpuset>& cpus)
+ : n_threads(n_threads),
+ queue_size{round_up_to(queue_sz, seastar::smp::count)},
+ pending_queues(n_threads)
+{
+ auto queue_max_wait = std::chrono::seconds(local_conf()->threadpool_empty_queue_max_wait);
+ for (size_t i = 0; i < n_threads; i++) {
+ threads.emplace_back([this, cpus, queue_max_wait, i] {
+ if (cpus.has_value()) {
+ pin(*cpus);
+ }
+ block_sighup();
+ (void) pthread_setname_np(pthread_self(), "alien-store-tp");
+ loop(queue_max_wait, i);
+ });
+ }
+}
+
+ThreadPool::~ThreadPool()
+{
+ for (auto& thread : threads) {
+ thread.join();
+ }
+}
+
+void ThreadPool::pin(const seastar::resource::cpuset& cpus)
+{
+ cpu_set_t cs;
+ CPU_ZERO(&cs);
+ for (auto cpu : cpus) {
+ CPU_SET(cpu, &cs);
+ }
+ [[maybe_unused]] auto r = pthread_setaffinity_np(pthread_self(),
+ sizeof(cs), &cs);
+ ceph_assert(r == 0);
+}
+
+void ThreadPool::block_sighup()
+{
+ sigset_t sigs;
+ sigemptyset(&sigs);
+  // alien threads must ignore SIGHUP. It's necessary as in
+  // `crimson/osd/main.cc` we set a handler using Seastar's
+  // signal handling infrastructure which assumes the `_backend`
+ // of `seastar::engine()` is not null. Grep `reactor.cc` for
+ // `sigaction` or just visit `reactor::signals::handle_signal()`.
+ sigaddset(&sigs, SIGHUP);
+ pthread_sigmask(SIG_BLOCK, &sigs, nullptr);
+}
+
+void ThreadPool::loop(std::chrono::milliseconds queue_max_wait, size_t shard)
+{
+ auto& pending = pending_queues[shard];
+ for (;;) {
+ WorkItem* work_item = nullptr;
+ work_item = pending.pop_front(queue_max_wait);
+ if (work_item) {
+ work_item->process();
+ } else if (is_stopping()) {
+ break;
+ }
+ }
+}
+
+seastar::future<> ThreadPool::start()
+{
+ auto slots_per_shard = queue_size / seastar::smp::count;
+ return submit_queue.start(slots_per_shard);
+}
+
+seastar::future<> ThreadPool::stop()
+{
+ return submit_queue.stop().then([this] {
+ stopping = true;
+ for (auto& q : pending_queues) {
+ q.stop();
+ }
+ });
+}
+
+} // namespace crimson::os
diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h
new file mode 100644
index 000000000..78e18219a
--- /dev/null
+++ b/src/crimson/os/alienstore/thread_pool.h
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <tuple>
+#include <type_traits>
+#include <boost/lockfree/queue.hpp>
+#include <boost/optional.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/resource.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/sharded.hh>
+
+#if __cplusplus > 201703L
+#include <semaphore>
+namespace crimson {
+ using std::counting_semaphore;
+}
+#else
+#include "semaphore.h"
+#endif
+
+namespace crimson::os {
+
+struct WorkItem {
+ virtual ~WorkItem() {}
+ virtual void process() = 0;
+};
+
+template<typename Func>
+struct Task final : WorkItem {
+ using T = std::invoke_result_t<Func>;
+ using future_stored_type_t =
+ std::conditional_t<std::is_void_v<T>,
+ seastar::internal::future_stored_type_t<>,
+ seastar::internal::future_stored_type_t<T>>;
+ using futurator_t = seastar::futurize<T>;
+public:
+ explicit Task(Func&& f)
+ : func(std::move(f))
+ {}
+ void process() override {
+ try {
+ if constexpr (std::is_void_v<T>) {
+ func();
+ state.set();
+ } else {
+ state.set(func());
+ }
+ } catch (...) {
+ state.set_exception(std::current_exception());
+ }
+ on_done.write_side().signal(1);
+ }
+ typename futurator_t::type get_future() {
+ return on_done.wait().then([this](size_t) {
+ if (state.failed()) {
+ return futurator_t::make_exception_future(state.get_exception());
+ } else {
+ return futurator_t::from_tuple(state.get_value());
+ }
+ });
+ }
+private:
+ Func func;
+ seastar::future_state<future_stored_type_t> state;
+ seastar::readable_eventfd on_done;
+};
+
+struct SubmitQueue {
+ seastar::semaphore free_slots;
+ seastar::gate pending_tasks;
+ explicit SubmitQueue(size_t num_free_slots)
+ : free_slots(num_free_slots)
+ {}
+ seastar::future<> stop() {
+ return pending_tasks.close();
+ }
+};
+
+struct ShardedWorkQueue {
+public:
+ WorkItem* pop_front(std::chrono::milliseconds& queue_max_wait) {
+ if (sem.try_acquire_for(queue_max_wait)) {
+ if (!is_stopping()) {
+ WorkItem* work_item = nullptr;
+ [[maybe_unused]] bool popped = pending.pop(work_item);
+ assert(popped);
+ return work_item;
+ }
+ }
+ return nullptr;
+ }
+ void stop() {
+ stopping = true;
+ sem.release();
+ }
+ void push_back(WorkItem* work_item) {
+ [[maybe_unused]] bool pushed = pending.push(work_item);
+ assert(pushed);
+ sem.release();
+ }
+private:
+ bool is_stopping() const {
+ return stopping;
+ }
+ std::atomic<bool> stopping = false;
+ static constexpr unsigned QUEUE_SIZE = 128;
+ crimson::counting_semaphore<QUEUE_SIZE> sem{0};
+ boost::lockfree::queue<WorkItem*> pending{QUEUE_SIZE};
+};
+
+/// an engine for scheduling non-seastar tasks from seastar fibers
+class ThreadPool {
+public:
+ /**
+   * @param n_threads the number of threads in this thread pool.
+   * @param queue_sz the depth of the pending queue. Before a task is
+   * scheduled, it waits in this queue. The number is rounded up to a
+   * multiple of the number of reactor cores.
+   * @param cpus the set of CPU cores to which the threads of this pool
+   * are pinned; an unset value means no pinning.
+ * @note each @c Task has its own crimson::thread::Condition, which possesses
+ * an fd, so we should keep the size of queue under a reasonable limit.
+ */
+ ThreadPool(size_t n_threads, size_t queue_sz, const std::optional<seastar::resource::cpuset>& cpus);
+ ~ThreadPool();
+ seastar::future<> start();
+ seastar::future<> stop();
+ size_t size() {
+ return n_threads;
+ }
+ template<typename Func, typename...Args>
+ auto submit(int shard, Func&& func, Args&&... args) {
+ auto packaged = [func=std::move(func),
+ args=std::forward_as_tuple(args...)] {
+ return std::apply(std::move(func), std::move(args));
+ };
+ return seastar::with_gate(submit_queue.local().pending_tasks,
+ [packaged=std::move(packaged), shard, this] {
+ return local_free_slots().wait()
+ .then([packaged=std::move(packaged), shard, this] {
+ auto task = new Task{std::move(packaged)};
+ auto fut = task->get_future();
+ pending_queues[shard].push_back(task);
+ return fut.finally([task, this] {
+ local_free_slots().signal();
+ delete task;
+ });
+ });
+ });
+ }
+
+ template<typename Func>
+ auto submit(Func&& func) {
+ return submit(::rand() % n_threads, std::forward<Func>(func));
+ }
+
+private:
+ void loop(std::chrono::milliseconds queue_max_wait, size_t shard);
+ bool is_stopping() const {
+ return stopping.load(std::memory_order_relaxed);
+ }
+ static void pin(const seastar::resource::cpuset& cpus);
+ static void block_sighup();
+ seastar::semaphore& local_free_slots() {
+ return submit_queue.local().free_slots;
+ }
+ ThreadPool(const ThreadPool&) = delete;
+ ThreadPool& operator=(const ThreadPool&) = delete;
+
+private:
+ size_t n_threads;
+ std::atomic<bool> stopping = false;
+ std::vector<std::thread> threads;
+ seastar::sharded<SubmitQueue> submit_queue;
+ const size_t queue_size;
+ std::vector<ShardedWorkQueue> pending_queues;
+};
+
+} // namespace crimson::os
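A hedged sketch of submitting blocking work from a seastar fiber through the pool, in the spirit of how AlienStore wraps synchronous ObjectStore calls; the function name is hypothetical and the pool is assumed to be started already:

    // Sketch: offload a blocking computation to an alien thread and get the
    // result back as a seastar future on the calling reactor.
    seastar::future<int> blocking_add(crimson::os::ThreadPool& tp, int a, int b) {
      return tp.submit([a, b] {
        // runs on an alien (non-reactor) thread; blocking is allowed here
        return a + b;
      });
    }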
diff --git a/src/crimson/os/cyanstore/CMakeLists.txt b/src/crimson/os/cyanstore/CMakeLists.txt
new file mode 100644
index 000000000..65f2b5498
--- /dev/null
+++ b/src/crimson/os/cyanstore/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_library(crimson-cyanstore STATIC
+ cyan_store.cc
+ cyan_collection.cc
+ cyan_object.cc)
+target_link_libraries(crimson-cyanstore
+ crimson
+ crimson-os)
diff --git a/src/crimson/os/cyanstore/cyan_collection.cc b/src/crimson/os/cyanstore/cyan_collection.cc
new file mode 100644
index 000000000..9a814f978
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_collection.cc
@@ -0,0 +1,78 @@
+#include "cyan_collection.h"
+
+#include "cyan_object.h"
+
+using std::make_pair;
+
+namespace crimson::os
+{
+
+Collection::Collection(const coll_t& c)
+ : FuturizedCollection{c}
+{}
+
+Collection::~Collection() = default;
+
+Collection::ObjectRef Collection::create_object() const
+{
+ return new crimson::os::Object;
+}
+
+Collection::ObjectRef Collection::get_object(ghobject_t oid)
+{
+ auto o = object_hash.find(oid);
+ if (o == object_hash.end())
+ return ObjectRef();
+ return o->second;
+}
+
+Collection::ObjectRef Collection::get_or_create_object(ghobject_t oid)
+{
+ auto result = object_hash.emplace(oid, ObjectRef{});
+ if (result.second)
+ object_map[oid] = result.first->second = create_object();
+ return result.first->second;
+}
+
+uint64_t Collection::used_bytes() const
+{
+ uint64_t result = 0;
+ for (auto& obj : object_map) {
+ result += obj.second->get_size();
+ }
+ return result;
+}
+
+void Collection::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(xattr, bl);
+ encode(use_page_set, bl);
+ uint32_t s = object_map.size();
+ encode(s, bl);
+ for (auto& [oid, obj] : object_map) {
+ encode(oid, bl);
+ obj->encode(bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void Collection::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(xattr, p);
+ decode(use_page_set, p);
+ uint32_t s;
+ decode(s, p);
+ while (s--) {
+ ghobject_t k;
+ decode(k, p);
+ auto o = create_object();
+ o->decode(p);
+ object_map.insert(make_pair(k, o));
+ object_hash.insert(make_pair(k, o));
+ }
+ DECODE_FINISH(p);
+}
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_collection.h b/src/crimson/os/cyanstore/cyan_collection.h
new file mode 100644
index 000000000..068e427d8
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_collection.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+
+#include "crimson/os/futurized_collection.h"
+
+namespace crimson::os {
+
+class Object;
+/**
+ * a collection also orders transactions
+ *
+ * Any transactions queued under a given collection will be applied in
+ * sequence. Transactions queued under different collections may run
+ * in parallel.
+ *
+ * ObjectStore users may get collection handles with open_collection() (or,
+ * for bootstrapping a new collection, create_new_collection()).
+ */
+struct Collection final : public FuturizedCollection {
+ using ObjectRef = boost::intrusive_ptr<Object>;
+ int bits = 0;
+ // always use bufferlist object for testing
+ bool use_page_set = false;
+ std::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup
+ std::map<ghobject_t, ObjectRef> object_map; ///< for iteration
+ std::map<std::string,bufferptr> xattr;
+ bool exists = true;
+
+ Collection(const coll_t& c);
+ ~Collection() final;
+
+ ObjectRef create_object() const;
+ ObjectRef get_object(ghobject_t oid);
+ ObjectRef get_or_create_object(ghobject_t oid);
+ uint64_t used_bytes() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+};
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_object.cc b/src/crimson/os/cyanstore/cyan_object.cc
new file mode 100644
index 000000000..34bc13b7f
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_object.cc
@@ -0,0 +1,89 @@
+#include "cyan_object.h"
+#include "include/encoding.h"
+
+namespace crimson::os {
+
+size_t Object::get_size() const {
+ return data.length();
+}
+
+ceph::bufferlist Object::read(uint64_t offset, uint64_t len)
+{
+ bufferlist ret;
+ ret.substr_of(data, offset, len);
+ return ret;
+}
+
+int Object::write(uint64_t offset, const bufferlist &src)
+{
+ unsigned len = src.length();
+ // before
+ bufferlist newdata;
+ if (get_size() >= offset) {
+ newdata.substr_of(data, 0, offset);
+ } else {
+ if (get_size()) {
+ newdata.substr_of(data, 0, get_size());
+ }
+ newdata.append_zero(offset - get_size());
+ }
+
+ newdata.append(src);
+
+ // after
+ if (get_size() > offset + len) {
+ bufferlist tail;
+ tail.substr_of(data, offset + len, get_size() - (offset + len));
+ newdata.append(tail);
+ }
+
+ data = std::move(newdata);
+ return 0;
+}
+
+int Object::clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff)
+{
+ bufferlist bl;
+ if (srcoff == dstoff && len == src->get_size()) {
+ data = src->data;
+ return 0;
+ }
+ bl.substr_of(src->data, srcoff, len);
+ return write(dstoff, bl);
+
+}
+
+int Object::truncate(uint64_t size)
+{
+ if (get_size() > size) {
+ bufferlist bl;
+ bl.substr_of(data, 0, size);
+ data = std::move(bl);
+ } else if (get_size() == size) {
+ // do nothing
+ } else {
+ data.append_zero(size - get_size());
+ }
+ return 0;
+}
+
+void Object::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ encode(xattr, bl);
+ encode(omap_header, bl);
+ encode(omap, bl);
+ ENCODE_FINISH(bl);
+}
+
+void Object::decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(data, p);
+ decode(xattr, p);
+ decode(omap_header, p);
+ decode(omap, p);
+ DECODE_FINISH(p);
+}
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_object.h b/src/crimson/os/cyanstore/cyan_object.h
new file mode 100644
index 000000000..624f9513a
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_object.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <cstddef>
+#include <map>
+#include <string>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+
+namespace crimson::os {
+
+struct Object : public boost::intrusive_ref_counter<
+ Object,
+ boost::thread_unsafe_counter>
+{
+ using bufferlist = ceph::bufferlist;
+
+ bufferlist data;
+ // use transparent comparator for better performance, see
+ // https://en.cppreference.com/w/cpp/utility/functional/less_void
+ std::map<std::string,bufferlist,std::less<>> xattr;
+ bufferlist omap_header;
+ std::map<std::string,bufferlist> omap;
+
+ typedef boost::intrusive_ptr<Object> Ref;
+
+ Object() = default;
+
+ // interface for object data
+ size_t get_size() const;
+ ceph::bufferlist read(uint64_t offset, uint64_t len);
+ int write(uint64_t offset, const bufferlist &bl);
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff);
+ int truncate(uint64_t offset);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+};
+using ObjectRef = boost::intrusive_ptr<Object>;
+
+}
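Object::write() zero-fills any gap between the current size and the write offset before splicing in the new data. A small sketch of the resulting layout, assuming an initially empty object:

    // Sketch: write "abc" at offset 4 into an empty in-memory object.
    crimson::os::Object obj;
    ceph::bufferlist bl;
    bl.append("abc", 3);
    obj.write(4, bl);
    // obj.get_size() == 7: bytes 0..3 are zero-filled, bytes 4..6 hold "abc".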
diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc
new file mode 100644
index 000000000..f2a6018e3
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_store.cc
@@ -0,0 +1,952 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cyan_store.h"
+
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/safe_io.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "cyan_collection.h"
+#include "cyan_object.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_cyanstore);
+ }
+}
+
+using std::string;
+using crimson::common::local_conf;
+
+namespace crimson::os {
+
+using ObjectRef = boost::intrusive_ptr<Object>;
+
+CyanStore::CyanStore(const std::string& path)
+ : path{path}
+{}
+
+CyanStore::~CyanStore() = default;
+
+template <const char* MsgV>
+struct singleton_ec : std::error_code {
+ singleton_ec()
+ : error_code(42, this_error_category{}) {
+ };
+private:
+ struct this_error_category : std::error_category {
+ const char* name() const noexcept final {
+    // XXX: we could concatenate with MsgV at compile-time but the benefit
+    // isn't worth the burden.
+ return "singleton_ec";
+ }
+ std::string message([[maybe_unused]] const int ev) const final {
+ assert(ev == 42);
+ return MsgV;
+ }
+ };
+};
+
+seastar::future<store_statfs_t> CyanStore::stat() const
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ logger().debug("{}", __func__);
+ return shard_stores.map_reduce0(
+ [](const CyanStore::Shard &local_store) {
+ return local_store.get_used_bytes();
+ },
+ (uint64_t)0,
+ std::plus<uint64_t>()
+ ).then([](uint64_t used_bytes) {
+ store_statfs_t st;
+ st.total = crimson::common::local_conf().get_val<Option::size_t>("memstore_device_bytes");
+ st.available = st.total - used_bytes;
+ return seastar::make_ready_future<store_statfs_t>(std::move(st));
+ });
+}
+
+
+CyanStore::mkfs_ertr::future<> CyanStore::mkfs(uuid_d new_osd_fsid)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ static const char read_meta_errmsg[]{"read_meta"};
+ static const char parse_fsid_errmsg[]{"failed to parse fsid"};
+ static const char match_ofsid_errmsg[]{"unmatched osd_fsid"};
+ return read_meta("fsid").then([=, this](auto&& ret) -> mkfs_ertr::future<> {
+ auto& [r, fsid_str] = ret;
+ if (r == -ENOENT) {
+ if (new_osd_fsid.is_zero()) {
+ osd_fsid.generate_random();
+ } else {
+ osd_fsid = new_osd_fsid;
+ }
+ return write_meta("fsid", fmt::format("{}", osd_fsid));
+ } else if (r < 0) {
+ return crimson::stateful_ec{ singleton_ec<read_meta_errmsg>() };
+ } else {
+ logger().info("mkfs already has fsid {}", fsid_str);
+ if (!osd_fsid.parse(fsid_str.c_str())) {
+ return crimson::stateful_ec{ singleton_ec<parse_fsid_errmsg>() };
+ } else if (osd_fsid != new_osd_fsid) {
+ logger().error("on-disk fsid {} != provided {}", osd_fsid, new_osd_fsid);
+ return crimson::stateful_ec{ singleton_ec<match_ofsid_errmsg>() };
+ } else {
+ return mkfs_ertr::now();
+ }
+ }
+ }).safe_then([this]{
+ return write_meta("type", "memstore");
+ }).safe_then([this] {
+ return shard_stores.invoke_on_all(
+ [](auto &local_store) {
+ return local_store.mkfs();
+ });
+ });
+}
+
+seastar::future<> CyanStore::Shard::mkfs()
+{
+ std::string fn =
+ path + "/collections" + std::to_string(seastar::this_shard_id());
+ ceph::bufferlist bl;
+ std::set<coll_t> collections;
+ ceph::encode(collections, bl);
+ return crimson::write_file(std::move(bl), fn);
+}
+
+using coll_core_t = FuturizedStore::coll_core_t;
+seastar::future<std::vector<coll_core_t>>
+CyanStore::list_collections()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return seastar::do_with(std::vector<coll_core_t>{}, [this](auto &collections) {
+ return shard_stores.map([](auto &local_store) {
+ return local_store.list_collections();
+ }).then([&collections](std::vector<std::vector<coll_core_t>> results) {
+ for (auto& colls : results) {
+ collections.insert(collections.end(), colls.begin(), colls.end());
+ }
+ return seastar::make_ready_future<std::vector<coll_core_t>>(
+ std::move(collections));
+ });
+ });
+}
+
+CyanStore::mount_ertr::future<> CyanStore::Shard::mount()
+{
+ static const char read_file_errmsg[]{"read_file"};
+ ceph::bufferlist bl;
+ std::string fn =
+ path + "/collections" + std::to_string(seastar::this_shard_id());
+ std::string err;
+ if (int r = bl.read_file(fn.c_str(), &err); r < 0) {
+ return crimson::stateful_ec{ singleton_ec<read_file_errmsg>() };
+ }
+
+ std::set<coll_t> collections;
+ auto p = bl.cbegin();
+ ceph::decode(collections, p);
+
+ for (auto& coll : collections) {
+ std::string fn = fmt::format("{}/{}{}", path, coll,
+ std::to_string(seastar::this_shard_id()));
+ ceph::bufferlist cbl;
+ if (int r = cbl.read_file(fn.c_str(), &err); r < 0) {
+ return crimson::stateful_ec{ singleton_ec<read_file_errmsg>() };
+ }
+ boost::intrusive_ptr<Collection> c{new Collection{coll}};
+ auto p = cbl.cbegin();
+ c->decode(p);
+ coll_map[coll] = c;
+ used_bytes += c->used_bytes();
+ }
+ return mount_ertr::now();
+}
+
+seastar::future<> CyanStore::Shard::umount()
+{
+ return seastar::do_with(std::set<coll_t>{}, [this](auto& collections) {
+ return seastar::do_for_each(coll_map, [&collections, this](auto& coll) {
+ auto& [col, ch] = coll;
+ collections.insert(col);
+ ceph::bufferlist bl;
+ ceph_assert(ch);
+ ch->encode(bl);
+ std::string fn = fmt::format("{}/{}{}", path, col,
+ std::to_string(seastar::this_shard_id()));
+ return crimson::write_file(std::move(bl), fn);
+ }).then([&collections, this] {
+ ceph::bufferlist bl;
+ ceph::encode(collections, bl);
+ std::string fn = fmt::format("{}/collections{}",
+ path, std::to_string(seastar::this_shard_id()));
+ return crimson::write_file(std::move(bl), fn);
+ });
+ });
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+CyanStore::Shard::list_objects(
+ CollectionRef ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {} {} {}",
+ __func__, c->get_cid(), start, end, limit);
+ std::vector<ghobject_t> objects;
+ objects.reserve(limit);
+ ghobject_t next = ghobject_t::get_max();
+ for (const auto& [oid, obj] :
+ boost::make_iterator_range(c->object_map.lower_bound(start),
+ c->object_map.end())) {
+ std::ignore = obj;
+ if (oid >= end || objects.size() >= limit) {
+ next = oid;
+ break;
+ }
+ objects.push_back(oid);
+ }
+ return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>(
+ std::make_tuple(std::move(objects), next));
+}
+
+seastar::future<CollectionRef>
+CyanStore::Shard::create_new_collection(const coll_t& cid)
+{
+ auto c = new Collection{cid};
+ new_coll_map[cid] = c;
+ return seastar::make_ready_future<CollectionRef>(c);
+}
+
+seastar::future<CollectionRef>
+CyanStore::Shard::open_collection(const coll_t& cid)
+{
+ return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
+}
+
+seastar::future<std::vector<coll_core_t>>
+CyanStore::Shard::list_collections()
+{
+ std::vector<coll_core_t> collections;
+ for (auto& coll : coll_map) {
+ collections.push_back(std::make_pair(coll.first, seastar::this_shard_id()));
+ }
+ return seastar::make_ready_future<std::vector<coll_core_t>>(std::move(collections));
+}
+
+CyanStore::Shard::read_errorator::future<ceph::bufferlist>
+CyanStore::Shard::read(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {} {}~{}",
+ __func__, c->get_cid(), oid, offset, len);
+ if (!c->exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ if (offset >= o->get_size())
+ return read_errorator::make_ready_future<ceph::bufferlist>();
+ size_t l = len;
+ if (l == 0 && offset == 0) // note: len == 0 means read the entire object
+ l = o->get_size();
+ else if (offset + l > o->get_size())
+ l = o->get_size() - offset;
+ return read_errorator::make_ready_future<ceph::bufferlist>(o->read(offset, l));
+}
+
+CyanStore::Shard::read_errorator::future<ceph::bufferlist>
+CyanStore::Shard::readv(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ return seastar::do_with(ceph::bufferlist{},
+ [this, ch, oid, &m, op_flags](auto& bl) {
+ return crimson::do_for_each(m,
+ [this, ch, oid, op_flags, &bl](auto& p) {
+ return read(ch, oid, p.first, p.second, op_flags)
+ .safe_then([&bl](auto ret) {
+ bl.claim_append(ret);
+ });
+ }).safe_then([&bl] {
+ return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+ });
+ });
+}
+
+CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
+CyanStore::Shard::get_attr(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ std::string_view name) const
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ if (auto found = o->xattr.find(name); found != o->xattr.end()) {
+ return get_attr_errorator::make_ready_future<ceph::bufferlist>(found->second);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t>
+CyanStore::Shard::get_attrs(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ return get_attrs_ertr::make_ready_future<attrs_t>(o->xattr);
+}
+
+auto CyanStore::Shard::omap_get_values(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ const omap_keys_t& keys)
+ -> read_errorator::future<omap_values_t>
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}", __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ omap_values_t values;
+ for (auto& key : keys) {
+ if (auto found = o->omap.find(key); found != o->omap.end()) {
+ values.insert(*found);
+ }
+ }
+ return seastar::make_ready_future<omap_values_t>(std::move(values));
+}
+
+auto CyanStore::Shard::omap_get_values(
+ CollectionRef ch,
+ const ghobject_t &oid,
+ const std::optional<string> &start)
+ -> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>>
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}", __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ omap_values_t values;
+ for (auto i = start ? o->omap.upper_bound(*start) : o->omap.begin();
+ i != o->omap.end();
+ ++i) {
+ values.insert(*i);
+ }
+ return seastar::make_ready_future<std::tuple<bool, omap_values_t>>(
+ std::make_tuple(true, std::move(values)));
+}
+
+auto CyanStore::Shard::omap_get_header(
+ CollectionRef ch,
+ const ghobject_t& oid)
+ -> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
+{
+ auto c = static_cast<Collection*>(ch.get());
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+
+ return get_attr_errorator::make_ready_future<ceph::bufferlist>(
+ o->omap_header);
+}
+
+seastar::future<> CyanStore::Shard::do_transaction_no_callbacks(
+ CollectionRef ch,
+ ceph::os::Transaction&& t)
+{
+ using ceph::os::Transaction;
+ int r = 0;
+ try {
+ auto i = t.begin();
+ while (i.have_op()) {
+ r = 0;
+ switch (auto op = i.decode_op(); op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ }
+ break;
+ case Transaction::OP_TOUCH:
+ case Transaction::OP_CREATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _touch(cid, oid);
+ }
+ break;
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ }
+ break;
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(cid, oid, off, len);
+ }
+ break;
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ r = _truncate(cid, oid, off);
+ }
+ break;
+ case Transaction::OP_CLONE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ r = _clone(cid, oid, noid);
+ }
+ break;
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ std::map<std::string, bufferlist> to_set;
+ to_set.emplace(name, std::move(bl));
+ r = _setattrs(cid, oid, std::move(to_set));
+ }
+ break;
+ case Transaction::OP_SETATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::map<std::string, bufferlist> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(cid, oid, std::move(aset));
+ }
+ break;
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ r = _rm_attr(cid, oid, name);
+ }
+ break;
+ case Transaction::OP_RMATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _rm_attrs(cid, oid);
+ }
+ break;
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(cid, op->split_bits);
+ }
+ break;
+ case Transaction::OP_SETALLOCHINT:
+ {
+ r = 0;
+ }
+ break;
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _omap_clear(cid, oid);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::map<std::string, ceph::bufferlist> aset;
+ i.decode_attrset(aset);
+ r = _omap_set_values(cid, oid, std::move(aset));
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_set_header(cid, oid, bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ omap_keys_t keys;
+ i.decode_keyset(keys);
+ r = _omap_rmkeys(cid, oid, keys);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkeyrange(cid, oid, first, last);
+ }
+ break;
+ case Transaction::OP_COLL_HINT:
+ {
+ ceph::bufferlist hint;
+ i.decode_bl(hint);
+ // ignored
+ break;
+ }
+ default:
+ logger().error("bad op {}", static_cast<unsigned>(op->op));
+ abort();
+ }
+ if (r < 0) {
+ break;
+ }
+ }
+ } catch (std::exception &e) {
+ logger().error("{} got exception {}", __func__, e);
+ r = -EINVAL;
+ }
+ if (r < 0) {
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ ceph_assert(r == 0);
+ }
+ return seastar::now();
+}
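// [annotation, not in the upstream source] Error policy of the op loop above:
// every OP_* handler returns 0 or a negative errno; the only error that is
// deliberately swallowed is -ENOENT from OP_REMOVE. Any other failure breaks
// out of the loop, dumps the whole transaction as JSON through JSONFormatter,
// and then trips ceph_assert(r == 0) -- i.e. CyanStore treats a failed
// transaction op as a fatal bug rather than a condition to report back to the
// caller.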
+
+int CyanStore::Shard::_remove(const coll_t& cid, const ghobject_t& oid)
+{
+ logger().debug("{} cid={} oid={}",
+ __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ auto i = c->object_hash.find(oid);
+ if (i == c->object_hash.end())
+ return -ENOENT;
+ used_bytes -= i->second->get_size();
+ c->object_hash.erase(i);
+ c->object_map.erase(oid);
+ return 0;
+}
+
+int CyanStore::Shard::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+ logger().debug("{} cid={} oid={}",
+ __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ c->get_or_create_object(oid);
+ return 0;
+}
+
+int CyanStore::Shard::_write(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ const ceph::bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ logger().debug("{} {} {} {} ~ {}",
+ __func__, cid, oid, offset, len);
+ assert(len == bl.length());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ if (len > 0 && !local_conf()->memstore_debug_omit_block_device_write) {
+ const ssize_t old_size = o->get_size();
+ o->write(offset, bl);
+ used_bytes += (o->get_size() - old_size);
+ }
+
+ return 0;
+}
+
+int CyanStore::Shard::_zero(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len)
+{
+ logger().debug("{} {} {} {} ~ {}",
+ __func__, cid, oid, offset, len);
+
+ ceph::buffer::list bl;
+ bl.append_zero(len);
+ return _write(cid, oid, offset, len, bl, 0);
+}
+
+int CyanStore::Shard::_omap_clear(
+ const coll_t& cid,
+ const ghobject_t& oid)
+{
+ logger().debug("{} {} {}", __func__, cid, oid);
+
+ auto c = _get_collection(cid);
+ if (!c) {
+ return -ENOENT;
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return -ENOENT;
+ }
+ o->omap.clear();
+ o->omap_header.clear();
+ return 0;
+}
+
+int CyanStore::Shard::_omap_set_values(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ std::map<std::string, ceph::bufferlist> &&aset)
+{
+ logger().debug(
+ "{} {} {} {} keys",
+ __func__, cid, oid, aset.size());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ for (auto&& [key, val]: aset) {
+ o->omap.insert_or_assign(std::move(key), std::move(val));
+ }
+ return 0;
+}
+
+int CyanStore::Shard::_omap_set_header(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const ceph::bufferlist &header)
+{
+ logger().debug(
+ "{} {} {} {} bytes",
+ __func__, cid, oid, header.length());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ o->omap_header = header;
+ return 0;
+}
+
+int CyanStore::Shard::_omap_rmkeys(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const omap_keys_t& aset)
+{
+ logger().debug(
+ "{} {} {} {} keys",
+ __func__, cid, oid, aset.size());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ for (auto &i: aset) {
+ o->omap.erase(i);
+ }
+ return 0;
+}
+
+int CyanStore::Shard::_omap_rmkeyrange(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const std::string &first,
+ const std::string &last)
+{
+ logger().debug(
+ "{} {} {} first={} last={}",
+ __func__, cid, oid, first, last);
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ for (auto i = o->omap.lower_bound(first);
+ i != o->omap.end() && i->first < last;
+ o->omap.erase(i++));
+ return 0;
+}
+
+int CyanStore::Shard::_truncate(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ uint64_t size)
+{
+ logger().debug("{} cid={} oid={} size={}",
+ __func__, cid, oid, size);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ if (local_conf()->memstore_debug_omit_block_device_write)
+ return 0;
+ const ssize_t old_size = o->get_size();
+ int r = o->truncate(size);
+ used_bytes += (o->get_size() - old_size);
+ return r;
+}
+
+int CyanStore::Shard::_clone(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const ghobject_t& noid)
+{
+ logger().debug("{} cid={} oid={} noid={}",
+ __func__, cid, oid, noid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef oo = c->get_object(oid);
+ if (!oo)
+ return -ENOENT;
+ if (local_conf()->memstore_debug_omit_block_device_write)
+ return 0;
+ ObjectRef no = c->get_or_create_object(noid);
+ used_bytes += ((ssize_t)oo->get_size() - (ssize_t)no->get_size());
+ no->clone(oo.get(), 0, oo->get_size(), 0);
+
+ no->omap_header = oo->omap_header;
+ no->omap = oo->omap;
+ no->xattr = oo->xattr;
+ return 0;
+}
+
+int CyanStore::Shard::_setattrs(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ std::map<std::string,bufferlist>&& aset)
+{
+ logger().debug("{} cid={} oid={}",
+ __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ for (auto&& [key, val]: aset) {
+ o->xattr.insert_or_assign(std::move(key), std::move(val));
+ }
+ return 0;
+}
+
+int CyanStore::Shard::_rm_attr(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ std::string_view name)
+{
+ logger().debug("{} cid={} oid={} name={}", __func__, cid, oid, name);
+ auto c = _get_collection(cid);
+ if (!c) {
+ return -ENOENT;
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return -ENOENT;
+ }
+ auto i = o->xattr.find(name);
+ if (i == o->xattr.end()) {
+ return -ENODATA;
+ }
+ o->xattr.erase(i);
+ return 0;
+}
+
+int CyanStore::Shard::_rm_attrs(
+ const coll_t& cid,
+ const ghobject_t& oid)
+{
+ logger().debug("{} cid={} oid={}", __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c) {
+ return -ENOENT;
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return -ENOENT;
+ }
+ o->xattr.clear();
+ return 0;
+}
+
+int CyanStore::Shard::_create_collection(const coll_t& cid, int bits)
+{
+ auto result = coll_map.try_emplace(cid);
+ if (!result.second)
+ return -EEXIST;
+ auto p = new_coll_map.find(cid);
+ assert(p != new_coll_map.end());
+ result.first->second = p->second;
+ result.first->second->bits = bits;
+ new_coll_map.erase(p);
+ return 0;
+}
+
+boost::intrusive_ptr<Collection>
+CyanStore::Shard::_get_collection(const coll_t& cid)
+{
+ auto cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return {};
+ return cp->second;
+}
+
+seastar::future<> CyanStore::write_meta(
+ const std::string& key,
+ const std::string& value)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ std::string v = value;
+ v += "\n";
+ if (int r = safe_write_file(path.c_str(), key.c_str(),
+ v.c_str(), v.length(), 0600);
+ r < 0) {
+ throw std::runtime_error{fmt::format("unable to write_meta({})", key)};
+ }
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<std::tuple<int, std::string>>
+CyanStore::read_meta(const std::string& key)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ std::string fsid(4096, '\0');
+ int r = safe_read_file(path.c_str(), key.c_str(), fsid.data(), fsid.size());
+ if (r > 0) {
+ fsid.resize(r);
+ // drop trailing newlines
+ boost::algorithm::trim_right_if(fsid,
+ [](unsigned char c) {return isspace(c);});
+ } else {
+ fsid.clear();
+ }
+ return seastar::make_ready_future<std::tuple<int, std::string>>(
+ std::make_tuple(r, fsid));
+}
+
+uuid_d CyanStore::get_fsid() const
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return osd_fsid;
+}
+
+unsigned CyanStore::Shard::get_max_attr_name_length() const
+{
+  // arbitrary limit, chosen to match MemStore.
+ return 256;
+}
+
+CyanStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>>
+CyanStore::Shard::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ auto c = static_cast<Collection*>(ch.get());
+
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+ std::map<uint64_t, uint64_t> m{{0, o->get_size()}};
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(std::move(m));
+}
+
+seastar::future<struct stat>
+CyanStore::Shard::stat(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ auto o = c->get_object(oid);
+ if (!o) {
+ throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+ struct stat st;
+ st.st_size = o->get_size();
+ return seastar::make_ready_future<struct stat>(std::move(st));
+}
+
+}
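A small sketch (editorial, not part of the diff) restating the length handling in CyanStore::Shard::read() above; the helper name is hypothetical:

    #include <cstdint>

    // len == 0 with offset == 0 means "read the whole object"; otherwise the
    // request is clipped to the object's current size.
    uint64_t clamp_read_len(uint64_t object_size, uint64_t offset, uint64_t len) {
      if (offset >= object_size)
        return 0;                      // read() returns an empty bufferlist
      if (len == 0 && offset == 0)
        return object_size;            // whole-object read
      if (offset + len > object_size)
        return object_size - offset;   // clip to the end of the object
      return len;
    }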
diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h
new file mode 100644
index 000000000..307f9ec32
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_store.h
@@ -0,0 +1,219 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <typeinfo>
+#include <vector>
+
+#include <optional>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "osd/osd_types.h"
+#include "include/uuid.h"
+
+#include "crimson/os/cyanstore/cyan_object.h"
+#include "crimson/os/cyanstore/cyan_collection.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class CyanStore final : public FuturizedStore {
+ class Shard : public FuturizedStore::Shard {
+ public:
+ Shard(std::string path)
+ :path(path){}
+
+ seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+
+ read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+
+ get_attr_errorator::future<ceph::bufferlist> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final;
+
+ get_attr_errorator::future<ceph::bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+ list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+
+ seastar::future<> do_transaction_no_callbacks(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn) final;
+
+ read_errorator::future<std::map<uint64_t, uint64_t>>
+ fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) final;
+
+ unsigned get_max_attr_name_length() const final;
+
+ public:
+ // only exposed to CyanStore
+ mount_ertr::future<> mount();
+
+ seastar::future<> umount();
+
+ seastar::future<> mkfs();
+
+ mkfs_ertr::future<> mkcoll(uuid_d new_osd_fsid);
+
+ using coll_core_t = FuturizedStore::coll_core_t;
+ seastar::future<std::vector<coll_core_t>> list_collections();
+
+ uint64_t get_used_bytes() const { return used_bytes; }
+
+ private:
+ int _remove(const coll_t& cid, const ghobject_t& oid);
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags);
+ int _zero(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len);
+ int _omap_clear(
+ const coll_t& cid,
+ const ghobject_t& oid);
+ int _omap_set_values(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ std::map<std::string, ceph::bufferlist> &&aset);
+ int _omap_set_header(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const ceph::bufferlist &header);
+ int _omap_rmkeys(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const omap_keys_t& aset);
+ int _omap_rmkeyrange(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const std::string &first,
+ const std::string &last);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _clone(const coll_t& cid, const ghobject_t& oid,
+ const ghobject_t& noid);
+ int _setattrs(const coll_t& cid, const ghobject_t& oid,
+ std::map<std::string,bufferlist>&& aset);
+ int _rm_attr(const coll_t& cid, const ghobject_t& oid,
+ std::string_view name);
+ int _rm_attrs(const coll_t& cid, const ghobject_t& oid);
+ int _create_collection(const coll_t& cid, int bits);
+ boost::intrusive_ptr<Collection> _get_collection(const coll_t& cid);
+
+ private:
+ uint64_t used_bytes = 0;
+ const std::string path;
+ std::unordered_map<coll_t, boost::intrusive_ptr<Collection>> coll_map;
+ std::map<coll_t, boost::intrusive_ptr<Collection>> new_coll_map;
+ };
+
+public:
+ CyanStore(const std::string& path);
+ ~CyanStore() final;
+
+ seastar::future<> start() final {
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.start(path);
+ }
+
+ seastar::future<> stop() final {
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.stop();
+ }
+
+ mount_ertr::future<> mount() final {
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.invoke_on_all(
+ [](auto &local_store) {
+ return local_store.mount().handle_error(
+ crimson::stateful_ec::handle([](const auto& ec) {
+ crimson::get_logger(ceph_subsys_cyanstore).error(
+ "error mounting cyanstore: ({}) {}",
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ });
+ }
+
+ seastar::future<> umount() final {
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.invoke_on_all(
+ [](auto &local_store) {
+ return local_store.umount();
+ });
+ }
+
+ mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
+
+ seastar::future<store_statfs_t> stat() const final;
+
+ uuid_d get_fsid() const final;
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+
+  FuturizedStore::Shard& get_sharded_store() final {
+ return shard_stores.local();
+ }
+
+ seastar::future<std::tuple<int, std::string>>
+ read_meta(const std::string& key) final;
+
+ seastar::future<std::vector<coll_core_t>> list_collections() final;
+
+private:
+ seastar::sharded<CyanStore::Shard> shard_stores;
+ const std::string path;
+ uuid_d osd_fsid;
+};
+}
diff --git a/src/crimson/os/futurized_collection.h b/src/crimson/os/futurized_collection.h
new file mode 100644
index 000000000..7b460dffe
--- /dev/null
+++ b/src/crimson/os/futurized_collection.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "osd/osd_types.h"
+
+namespace crimson::os {
+class FuturizedStore;
+
+class FuturizedCollection
+ : public boost::intrusive_ref_counter<FuturizedCollection,
+ boost::thread_safe_counter>
+{
+public:
+ FuturizedCollection(const coll_t& cid)
+ : cid{cid} {}
+ virtual ~FuturizedCollection() {}
+ virtual seastar::future<> flush() {
+ return seastar::make_ready_future<>();
+ }
+ virtual seastar::future<bool> flush_commit() {
+ return seastar::make_ready_future<bool>(true);
+ }
+ const coll_t& get_cid() const {
+ return cid;
+ }
+private:
+ const coll_t cid;
+};
+
+using CollectionRef = boost::intrusive_ptr<FuturizedCollection>;
+}
diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc
new file mode 100644
index 000000000..bc47ec78f
--- /dev/null
+++ b/src/crimson/os/futurized_store.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "futurized_store.h"
+#include "cyanstore/cyan_store.h"
+#ifdef WITH_BLUESTORE
+#include "alienstore/alien_store.h"
+#endif
+#include "seastore/seastore.h"
+
+namespace crimson::os {
+
+std::unique_ptr<FuturizedStore>
+FuturizedStore::create(const std::string& type,
+ const std::string& data,
+ const ConfigValues& values)
+{
+ if (type == "cyanstore") {
+ using crimson::os::CyanStore;
+ return std::make_unique<CyanStore>(data);
+ } else if (type == "seastore") {
+ return crimson::os::seastore::make_seastore(
+ data);
+ } else {
+ using crimson::os::AlienStore;
+#ifdef WITH_BLUESTORE
+ // use AlienStore as a fallback. It adapts e.g. BlueStore.
+ return std::make_unique<AlienStore>(type, data, values);
+#else
+ ceph_abort_msgf("unsupported objectstore type: %s", type.c_str());
+ return {};
+#endif
+ }
+}
+
+}
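Illustrative use of the factory above (editorial, not part of the diff; the data path and the ConfigValues instance are hypothetical caller-provided values):

    // "cyanstore" and "seastore" are matched explicitly; any other type string
    // falls through to AlienStore when the tree is built WITH_BLUESTORE.
    std::unique_ptr<crimson::os::FuturizedStore> store =
      crimson::os::FuturizedStore::create("cyanstore",
                                          "/var/lib/ceph/osd/ceph-0",
                                          config_values);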
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
new file mode 100644
index 000000000..783cd7485
--- /dev/null
+++ b/src/crimson/os/futurized_store.h
@@ -0,0 +1,195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <optional>
+#include <vector>
+
+#include <seastar/core/future.hh>
+
+#include "os/Transaction.h"
+#include "crimson/common/smp_helpers.h"
+#include "crimson/common/smp_helpers.h"
+#include "crimson/osd/exceptions.h"
+#include "include/buffer_fwd.h"
+#include "include/uuid.h"
+#include "osd/osd_types.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class FuturizedCollection;
+
+class FuturizedStore {
+public:
+ class Shard {
+ public:
+ Shard() = default;
+ virtual ~Shard() = default;
+ // no copying
+ explicit Shard(const Shard& o) = delete;
+ const Shard& operator=(const Shard& o) = delete;
+
+ using CollectionRef = boost::intrusive_ptr<FuturizedCollection>;
+ using read_errorator = crimson::errorator<crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ virtual read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) = 0;
+
+ virtual read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) = 0;
+
+ using get_attr_errorator = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::enodata>;
+ virtual get_attr_errorator::future<ceph::bufferlist> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const = 0;
+
+ using get_attrs_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>;
+ virtual get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ virtual seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ using omap_values_t = std::map<std::string, ceph::bufferlist, std::less<>>;
+ using omap_keys_t = std::set<std::string>;
+ virtual read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) = 0;
+
+ virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) = 0; ///< @return <done, values> values.empty() only if done
+
+ virtual get_attr_errorator::future<bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const = 0;
+
+ virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0;
+
+ virtual seastar::future<CollectionRef> open_collection(const coll_t& cid) = 0;
+
+ protected:
+ virtual seastar::future<> do_transaction_no_callbacks(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn) = 0;
+
+ public:
+ seastar::future<> do_transaction(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn) {
+ std::unique_ptr<Context> on_commit(
+ ceph::os::Transaction::collect_all_contexts(txn));
+ return do_transaction_no_callbacks(
+ std::move(ch), std::move(txn)
+ ).then([on_commit=std::move(on_commit)]() mutable {
+ auto c = on_commit.release();
+ if (c) c->complete(0);
+ return seastar::now();
+ });
+ }
+
+
+ /**
+ * flush
+ *
+ * Flushes outstanding transactions on ch, returned future resolves
+ * after any previously submitted transactions on ch have committed.
+ *
+ * @param ch [in] collection on which to flush
+ */
+ virtual seastar::future<> flush(CollectionRef ch) {
+ return do_transaction(ch, ceph::os::Transaction{});
+ }
+
+ // error injection
+ virtual seastar::future<> inject_data_error(const ghobject_t& o) {
+ return seastar::now();
+ }
+
+ virtual seastar::future<> inject_mdata_error(const ghobject_t& o) {
+ return seastar::now();
+ }
+
+ virtual read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) = 0;
+
+ virtual unsigned get_max_attr_name_length() const = 0;
+ };
+
+public:
+ static std::unique_ptr<FuturizedStore> create(const std::string& type,
+ const std::string& data,
+ const ConfigValues& values);
+ FuturizedStore()
+ : primary_core(seastar::this_shard_id())
+ {}
+
+ virtual ~FuturizedStore() = default;
+
+ // no copying
+ explicit FuturizedStore(const FuturizedStore& o) = delete;
+ const FuturizedStore& operator=(const FuturizedStore& o) = delete;
+
+ virtual seastar::future<> start() = 0;
+
+ virtual seastar::future<> stop() = 0;
+
+ using mount_ertr = crimson::errorator<crimson::stateful_ec>;
+ virtual mount_ertr::future<> mount() = 0;
+
+ virtual seastar::future<> umount() = 0;
+
+ using mkfs_ertr = crimson::errorator<crimson::stateful_ec>;
+ virtual mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) = 0;
+
+ virtual seastar::future<store_statfs_t> stat() const = 0;
+
+ virtual uuid_d get_fsid() const = 0;
+
+ virtual seastar::future<> write_meta(const std::string& key,
+ const std::string& value) = 0;
+  // called on a shard to get that shard's FuturizedStore::Shard
+ virtual Shard& get_sharded_store() = 0;
+
+ virtual seastar::future<std::tuple<int, std::string>> read_meta(
+ const std::string& key) = 0;
+
+ using coll_core_t = std::pair<coll_t, core_id_t>;
+ virtual seastar::future<std::vector<coll_core_t>> list_collections() = 0;
+
+protected:
+ const core_id_t primary_core;
+};
+}
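A sketch (editorial, not part of the diff) of the commit-callback contract that Shard::do_transaction() implements above: contexts attached to the Transaction are harvested via collect_all_contexts() before dispatch and completed with 0 once the backend's do_transaction_no_callbacks() future resolves. cid, oid, coll, shard and on_commit below are hypothetical caller-supplied values.

    ceph::os::Transaction t;
    t.touch(cid, oid);                         // queue an op
    t.register_on_commit(on_commit);           // Context* to run after commit
    shard.do_transaction(coll, std::move(t));  // on_commit->complete(0) runs once committed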
diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
new file mode 100644
index 000000000..4bdbab8c4
--- /dev/null
+++ b/src/crimson/os/seastore/CMakeLists.txt
@@ -0,0 +1,79 @@
+set(crimson_seastore_srcs
+ cached_extent.cc
+ seastore_types.cc
+ segment_manager.cc
+ segment_manager/ephemeral.cc
+ segment_manager/block.cc
+ transaction_manager.cc
+ transaction.cc
+ cache.cc
+ root_block.cc
+ lba_manager.cc
+ async_cleaner.cc
+ backref_manager.cc
+ backref/backref_tree_node.cc
+ backref/btree_backref_manager.cc
+ lba_manager/btree/btree_lba_manager.cc
+ lba_manager/btree/lba_btree_node.cc
+ omap_manager.cc
+ omap_manager/btree/btree_omap_manager.cc
+ omap_manager/btree/omap_btree_node_impl.cc
+ btree/btree_range_pin.cc
+ btree/fixed_kv_node.cc
+ onode.cc
+ onode_manager/staged-fltree/node.cc
+ onode_manager/staged-fltree/node_extent_manager.cc
+ onode_manager/staged-fltree/node_extent_manager/seastore.cc
+ onode_manager/staged-fltree/node_impl.cc
+ onode_manager/staged-fltree/stages/item_iterator_stage.cc
+ onode_manager/staged-fltree/stages/key_layout.cc
+ onode_manager/staged-fltree/stages/node_stage_layout.cc
+ onode_manager/staged-fltree/stages/node_stage.cc
+ onode_manager/staged-fltree/stages/sub_items_stage.cc
+ onode_manager/staged-fltree/super.cc
+ onode_manager/staged-fltree/value.cc
+ onode_manager/staged-fltree/fltree_onode_manager.cc
+ collection_manager.cc
+ collection_manager/flat_collection_manager.cc
+ collection_manager/collection_flat_node.cc
+ extent_placement_manager.cc
+ object_data_handler.cc
+ seastore.cc
+ random_block_manager.cc
+ random_block_manager/block_rb_manager.cc
+ random_block_manager/rbm_device.cc
+ random_block_manager/nvme_block_device.cc
+ random_block_manager/avlallocator.cc
+ journal/segmented_journal.cc
+ journal/segment_allocator.cc
+ journal/record_submitter.cc
+ journal/circular_journal_space.cc
+ journal.cc
+ device.cc
+ segment_manager_group.cc
+ record_scanner.cc
+ journal/circular_bounded_journal.cc
+ ../../../test/crimson/seastore/test_block.cc
+ ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc
+ )
+
+CMAKE_DEPENDENT_OPTION(WITH_ZNS "enable Linux ZNS support" OFF
+ "CMAKE_SYSTEM_NAME STREQUAL Linux" OFF)
+if(WITH_ZNS)
+ find_package(LinuxZNS REQUIRED)
+ list(APPEND crimson_seastore_srcs
+ segment_manager/zbd.cc)
+endif()
+
+add_library(crimson-seastore STATIC
+ ${crimson_seastore_srcs})
+
+target_link_libraries(crimson-seastore
+ crimson)
+if(WITH_ZNS)
+ target_link_libraries(crimson-seastore
+ Linux::ZNS)
+endif()
+
+set_target_properties(crimson-seastore PROPERTIES
+ JOB_POOL_COMPILE heavy_compile_job_pool)
diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc
new file mode 100644
index 000000000..d7e398f5f
--- /dev/null
+++ b/src/crimson/os/seastore/async_cleaner.cc
@@ -0,0 +1,1817 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/chrono.h>
+#include <seastar/core/metrics.hh>
+
+#include "crimson/os/seastore/logging.h"
+
+#include "crimson/os/seastore/async_cleaner.h"
+#include "crimson/os/seastore/backref_manager.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+SET_SUBSYS(seastore_cleaner);
+
+namespace {
+
+enum class gc_formula_t {
+ GREEDY,
+ BENEFIT,
+ COST_BENEFIT,
+};
+constexpr auto gc_formula = gc_formula_t::COST_BENEFIT;
+
+}
+
+namespace crimson::os::seastore {
+
+void segment_info_t::set_open(
+ segment_seq_t _seq, segment_type_t _type,
+ data_category_t _category, rewrite_gen_t _generation)
+{
+ ceph_assert(_seq != NULL_SEG_SEQ);
+ ceph_assert(_type != segment_type_t::NULL_SEG);
+ ceph_assert(_category != data_category_t::NUM);
+ ceph_assert(is_rewrite_generation(_generation));
+ state = Segment::segment_state_t::OPEN;
+ seq = _seq;
+ type = _type;
+ category = _category;
+ generation = _generation;
+ written_to = 0;
+}
+
+void segment_info_t::set_empty()
+{
+ state = Segment::segment_state_t::EMPTY;
+ seq = NULL_SEG_SEQ;
+ type = segment_type_t::NULL_SEG;
+ category = data_category_t::NUM;
+ generation = NULL_GENERATION;
+ modify_time = NULL_TIME;
+ num_extents = 0;
+ written_to = 0;
+}
+
+void segment_info_t::set_closed()
+{
+ state = Segment::segment_state_t::CLOSED;
+ // the rest of information is unchanged
+}
+
+void segment_info_t::init_closed(
+ segment_seq_t _seq, segment_type_t _type,
+ data_category_t _category, rewrite_gen_t _generation,
+ segment_off_t seg_size)
+{
+ ceph_assert(_seq != NULL_SEG_SEQ);
+ ceph_assert(_type != segment_type_t::NULL_SEG);
+ ceph_assert(_category != data_category_t::NUM);
+ ceph_assert(is_rewrite_generation(_generation));
+ state = Segment::segment_state_t::CLOSED;
+ seq = _seq;
+ type = _type;
+ category = _category;
+ generation = _generation;
+ written_to = seg_size;
+}
+
+std::ostream& operator<<(std::ostream &out, const segment_info_t &info)
+{
+ out << "seg_info_t("
+ << "state=" << info.state
+ << ", " << info.id;
+ if (info.is_empty()) {
+ // pass
+ } else { // open or closed
+ out << " " << info.type
+ << " " << segment_seq_printer_t{info.seq}
+ << " " << info.category
+ << " " << rewrite_gen_printer_t{info.generation}
+ << ", modify_time=" << sea_time_point_printer_t{info.modify_time}
+ << ", num_extents=" << info.num_extents
+ << ", written_to=" << info.written_to;
+ }
+ return out << ")";
+}
+
+void segments_info_t::reset()
+{
+ segments.clear();
+
+ segment_size = 0;
+
+ journal_segment_id = NULL_SEG_ID;
+ num_in_journal_open = 0;
+ num_type_journal = 0;
+ num_type_ool = 0;
+
+ num_open = 0;
+ num_empty = 0;
+ num_closed = 0;
+
+ count_open_journal = 0;
+ count_open_ool = 0;
+ count_release_journal = 0;
+ count_release_ool = 0;
+ count_close_journal = 0;
+ count_close_ool = 0;
+
+ total_bytes = 0;
+ avail_bytes_in_open = 0;
+
+ modify_times.clear();
+}
+
+void segments_info_t::add_segment_manager(
+ SegmentManager &segment_manager)
+{
+ LOG_PREFIX(segments_info_t::add_segment_manager);
+ device_id_t d_id = segment_manager.get_device_id();
+ auto ssize = segment_manager.get_segment_size();
+ auto nsegments = segment_manager.get_num_segments();
+ auto sm_size = segment_manager.get_available_size();
+ INFO("adding segment manager {}, size={}, ssize={}, segments={}",
+ device_id_printer_t{d_id}, sm_size, ssize, nsegments);
+ ceph_assert(ssize > 0);
+ ceph_assert(nsegments > 0);
+ ceph_assert(sm_size > 0);
+
+  // add_device() also validates that the device is not registered twice
+ segments.add_device(d_id, nsegments, segment_info_t{});
+
+  // assume all segment managers are configured with the same segment size.
+ if (segment_size == 0) {
+ ceph_assert(ssize > 0);
+ segment_size = ssize;
+ } else {
+ ceph_assert(segment_size == ssize);
+ }
+
+ // NOTE: by default the segments are empty
+ num_empty += nsegments;
+
+ total_bytes += sm_size;
+}
+
+void segments_info_t::init_closed(
+ segment_id_t segment, segment_seq_t seq, segment_type_t type,
+ data_category_t category, rewrite_gen_t generation)
+{
+ LOG_PREFIX(segments_info_t::init_closed);
+ auto& segment_info = segments[segment];
+ DEBUG("initiating {} {} {} {} {}, {}, "
+ "num_segments(empty={}, opened={}, closed={})",
+ segment, type, segment_seq_printer_t{seq},
+ category, rewrite_gen_printer_t{generation},
+ segment_info, num_empty, num_open, num_closed);
+ ceph_assert(segment_info.is_empty());
+ ceph_assert(num_empty > 0);
+ --num_empty;
+ ++num_closed;
+ if (type == segment_type_t::JOURNAL) {
+ // init_closed won't initialize journal_segment_id
+ ceph_assert(get_submitted_journal_head() == JOURNAL_SEQ_NULL);
+ ++num_type_journal;
+ } else {
+ ++num_type_ool;
+ }
+ // do not increment count_close_*;
+
+ if (segment_info.modify_time != NULL_TIME) {
+ modify_times.insert(segment_info.modify_time);
+ } else {
+ ceph_assert(segment_info.num_extents == 0);
+ }
+
+ segment_info.init_closed(
+ seq, type, category, generation, get_segment_size());
+}
+
+void segments_info_t::mark_open(
+ segment_id_t segment, segment_seq_t seq, segment_type_t type,
+ data_category_t category, rewrite_gen_t generation)
+{
+ LOG_PREFIX(segments_info_t::mark_open);
+ auto& segment_info = segments[segment];
+ INFO("opening {} {} {} {} {}, {}, "
+ "num_segments(empty={}, opened={}, closed={})",
+ segment, type, segment_seq_printer_t{seq},
+ category, rewrite_gen_printer_t{generation},
+ segment_info, num_empty, num_open, num_closed);
+ ceph_assert(segment_info.is_empty());
+ ceph_assert(num_empty > 0);
+ --num_empty;
+ ++num_open;
+ if (type == segment_type_t::JOURNAL) {
+ if (journal_segment_id != NULL_SEG_ID) {
+ auto& last_journal_segment = segments[journal_segment_id];
+ ceph_assert(last_journal_segment.is_closed());
+ ceph_assert(last_journal_segment.type == segment_type_t::JOURNAL);
+ ceph_assert(last_journal_segment.seq + 1 == seq);
+ }
+ journal_segment_id = segment;
+
+ ++num_in_journal_open;
+ ++num_type_journal;
+ ++count_open_journal;
+ } else {
+ ++num_type_ool;
+ ++count_open_ool;
+ }
+ avail_bytes_in_open += get_segment_size();
+
+ segment_info.set_open(seq, type, category, generation);
+}
+
+void segments_info_t::mark_empty(
+ segment_id_t segment)
+{
+ LOG_PREFIX(segments_info_t::mark_empty);
+ auto& segment_info = segments[segment];
+ INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})",
+ segment, segment_info,
+ num_empty, num_open, num_closed);
+ ceph_assert(segment_info.is_closed());
+ auto type = segment_info.type;
+ assert(type != segment_type_t::NULL_SEG);
+ ceph_assert(num_closed > 0);
+ --num_closed;
+ ++num_empty;
+ if (type == segment_type_t::JOURNAL) {
+ ceph_assert(num_type_journal > 0);
+ --num_type_journal;
+ ++count_release_journal;
+ } else {
+ ceph_assert(num_type_ool > 0);
+ --num_type_ool;
+ ++count_release_ool;
+ }
+
+ if (segment_info.modify_time != NULL_TIME) {
+ auto to_erase = modify_times.find(segment_info.modify_time);
+ ceph_assert(to_erase != modify_times.end());
+ modify_times.erase(to_erase);
+ } else {
+ ceph_assert(segment_info.num_extents == 0);
+ }
+
+ segment_info.set_empty();
+}
+
+void segments_info_t::mark_closed(
+ segment_id_t segment)
+{
+ LOG_PREFIX(segments_info_t::mark_closed);
+ auto& segment_info = segments[segment];
+ INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})",
+ segment, segment_info,
+ num_empty, num_open, num_closed);
+ ceph_assert(segment_info.is_open());
+ ceph_assert(num_open > 0);
+ --num_open;
+ ++num_closed;
+ if (segment_info.type == segment_type_t::JOURNAL) {
+ ceph_assert(num_in_journal_open > 0);
+ --num_in_journal_open;
+ ++count_close_journal;
+ } else {
+ ++count_close_ool;
+ }
+ ceph_assert(get_segment_size() >= segment_info.written_to);
+ auto seg_avail_bytes = get_segment_size() - segment_info.written_to;
+ ceph_assert(avail_bytes_in_open >= (std::size_t)seg_avail_bytes);
+ avail_bytes_in_open -= seg_avail_bytes;
+
+ if (segment_info.modify_time != NULL_TIME) {
+ modify_times.insert(segment_info.modify_time);
+ } else {
+ ceph_assert(segment_info.num_extents == 0);
+ }
+
+ segment_info.set_closed();
+}
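// [annotation, not in the upstream source] Taken together, init_closed,
// mark_open, mark_empty and mark_closed above implement a per-segment state
// machine: EMPTY --mark_open--> OPEN --mark_closed--> CLOSED --mark_empty-->
// EMPTY, with init_closed() registering segments that are found already closed.
// The num_empty/num_open/num_closed counters track how many segments sit in
// each state, and the asserts keep them consistent with the transitions.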
+
+void segments_info_t::update_written_to(
+ segment_type_t type,
+ paddr_t offset)
+{
+ LOG_PREFIX(segments_info_t::update_written_to);
+ auto& saddr = offset.as_seg_paddr();
+ auto& segment_info = segments[saddr.get_segment_id()];
+ if (!segment_info.is_open()) {
+ ERROR("segment is not open, not updating, type={}, offset={}, {}",
+ type, offset, segment_info);
+ ceph_abort();
+ }
+
+ auto new_written_to = saddr.get_segment_off();
+ ceph_assert(new_written_to <= get_segment_size());
+ if (segment_info.written_to > new_written_to) {
+ ERROR("written_to should not decrease! type={}, offset={}, {}",
+ type, offset, segment_info);
+ ceph_abort();
+ }
+
+ DEBUG("type={}, offset={}, {}", type, offset, segment_info);
+ ceph_assert(type == segment_info.type);
+ auto avail_deduction = new_written_to - segment_info.written_to;
+ ceph_assert(avail_bytes_in_open >= (std::size_t)avail_deduction);
+ avail_bytes_in_open -= avail_deduction;
+ segment_info.written_to = new_written_to;
+}
+
+std::ostream &operator<<(std::ostream &os, const segments_info_t &infos)
+{
+ return os << "segments("
+ << "empty=" << infos.get_num_empty()
+ << ", open=" << infos.get_num_open()
+ << ", closed=" << infos.get_num_closed()
+ << ", type_journal=" << infos.get_num_type_journal()
+ << ", type_ool=" << infos.get_num_type_ool()
+ << ", total=" << infos.get_total_bytes() << "B"
+ << ", available=" << infos.get_available_bytes() << "B"
+ << ", unavailable=" << infos.get_unavailable_bytes() << "B"
+ << ", available_ratio=" << infos.get_available_ratio()
+ << ", submitted_head=" << infos.get_submitted_journal_head()
+ << ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()}
+ << ")";
+}
+
+void JournalTrimmerImpl::config_t::validate() const
+{
+ ceph_assert(max_journal_bytes <= DEVICE_OFF_MAX);
+ ceph_assert(max_journal_bytes > target_journal_dirty_bytes);
+ ceph_assert(max_journal_bytes > target_journal_alloc_bytes);
+ ceph_assert(rewrite_dirty_bytes_per_cycle > 0);
+ ceph_assert(rewrite_backref_bytes_per_cycle > 0);
+}
+
+JournalTrimmerImpl::config_t
+JournalTrimmerImpl::config_t::get_default(
+ std::size_t roll_size, journal_type_t type)
+{
+ assert(roll_size);
+ std::size_t target_dirty_bytes = 0;
+ std::size_t target_alloc_bytes = 0;
+ std::size_t max_journal_bytes = 0;
+ if (type == journal_type_t::SEGMENTED) {
+ target_dirty_bytes = 12 * roll_size;
+ target_alloc_bytes = 2 * roll_size;
+ max_journal_bytes = 16 * roll_size;
+ } else {
+ assert(type == journal_type_t::RANDOM_BLOCK);
+ target_dirty_bytes = roll_size / 4;
+ target_alloc_bytes = roll_size / 4;
+ max_journal_bytes = roll_size / 2;
+ }
+ return config_t{
+ target_dirty_bytes,
+ target_alloc_bytes,
+ max_journal_bytes,
+ 1<<17,// rewrite_dirty_bytes_per_cycle
+ 1<<24 // rewrite_backref_bytes_per_cycle
+ };
+}
+
+JournalTrimmerImpl::config_t
+JournalTrimmerImpl::config_t::get_test(
+ std::size_t roll_size, journal_type_t type)
+{
+ assert(roll_size);
+ std::size_t target_dirty_bytes = 0;
+ std::size_t target_alloc_bytes = 0;
+ std::size_t max_journal_bytes = 0;
+ if (type == journal_type_t::SEGMENTED) {
+ target_dirty_bytes = 2 * roll_size;
+ target_alloc_bytes = 2 * roll_size;
+ max_journal_bytes = 4 * roll_size;
+ } else {
+ assert(type == journal_type_t::RANDOM_BLOCK);
+ target_dirty_bytes = roll_size / 4;
+ target_alloc_bytes = roll_size / 4;
+ max_journal_bytes = roll_size / 2;
+ }
+ return config_t{
+ target_dirty_bytes,
+ target_alloc_bytes,
+ max_journal_bytes,
+ 1<<17,// rewrite_dirty_bytes_per_cycle
+ 1<<24 // rewrite_backref_bytes_per_cycle
+ };
+}
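// [annotation, not in the upstream source] Worked example of the sizing above,
// assuming a hypothetical roll_size of 64 MiB per journal segment with
// journal_type_t::SEGMENTED: get_default() yields
// target_journal_dirty_bytes = 12 * 64 MiB = 768 MiB,
// target_journal_alloc_bytes = 2 * 64 MiB = 128 MiB and
// max_journal_bytes = 16 * 64 MiB = 1 GiB, while the per-cycle rewrite budgets
// are 1<<17 = 128 KiB (dirty) and 1<<24 = 16 MiB (backref).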
+
+JournalTrimmerImpl::JournalTrimmerImpl(
+ BackrefManager &backref_manager,
+ config_t config,
+ journal_type_t type,
+ device_off_t roll_start,
+ device_off_t roll_size)
+ : backref_manager(backref_manager),
+ config(config),
+ journal_type(type),
+ roll_start(roll_start),
+ roll_size(roll_size),
+ reserved_usage(0)
+{
+ config.validate();
+ ceph_assert(roll_start >= 0);
+ ceph_assert(roll_size > 0);
+ register_metrics();
+}
+
+void JournalTrimmerImpl::set_journal_head(journal_seq_t head)
+{
+ LOG_PREFIX(JournalTrimmerImpl::set_journal_head);
+
+ ceph_assert(head != JOURNAL_SEQ_NULL);
+ ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
+ head >= journal_head);
+ ceph_assert(journal_alloc_tail == JOURNAL_SEQ_NULL ||
+ head >= journal_alloc_tail);
+ ceph_assert(journal_dirty_tail == JOURNAL_SEQ_NULL ||
+ head >= journal_dirty_tail);
+
+ std::swap(journal_head, head);
+ if (journal_head.segment_seq == head.segment_seq) {
+ DEBUG("journal_head {} => {}, {}",
+ head, journal_head, stat_printer_t{*this, false});
+ } else {
+ INFO("journal_head {} => {}, {}",
+ head, journal_head, stat_printer_t{*this, false});
+ }
+ background_callback->maybe_wake_background();
+}
+
+void JournalTrimmerImpl::update_journal_tails(
+ journal_seq_t dirty_tail,
+ journal_seq_t alloc_tail)
+{
+ LOG_PREFIX(JournalTrimmerImpl::update_journal_tails);
+
+ if (dirty_tail != JOURNAL_SEQ_NULL) {
+ ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
+ journal_head >= dirty_tail);
+ if (journal_dirty_tail != JOURNAL_SEQ_NULL &&
+ journal_dirty_tail > dirty_tail) {
+ ERROR("journal_dirty_tail {} => {} is backwards!",
+ journal_dirty_tail, dirty_tail);
+ ceph_abort();
+ }
+ std::swap(journal_dirty_tail, dirty_tail);
+ if (journal_dirty_tail.segment_seq == dirty_tail.segment_seq) {
+ DEBUG("journal_dirty_tail {} => {}, {}",
+ dirty_tail, journal_dirty_tail, stat_printer_t{*this, false});
+ } else {
+ INFO("journal_dirty_tail {} => {}, {}",
+ dirty_tail, journal_dirty_tail, stat_printer_t{*this, false});
+ }
+ }
+
+ if (alloc_tail != JOURNAL_SEQ_NULL) {
+ ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
+ journal_head >= alloc_tail);
+ if (journal_alloc_tail != JOURNAL_SEQ_NULL &&
+ journal_alloc_tail > alloc_tail) {
+ ERROR("journal_alloc_tail {} => {} is backwards!",
+ journal_alloc_tail, alloc_tail);
+ ceph_abort();
+ }
+ std::swap(journal_alloc_tail, alloc_tail);
+ if (journal_alloc_tail.segment_seq == alloc_tail.segment_seq) {
+ DEBUG("journal_alloc_tail {} => {}, {}",
+ alloc_tail, journal_alloc_tail, stat_printer_t{*this, false});
+ } else {
+ INFO("journal_alloc_tail {} => {}, {}",
+ alloc_tail, journal_alloc_tail, stat_printer_t{*this, false});
+ }
+ }
+
+ background_callback->maybe_wake_background();
+ background_callback->maybe_wake_blocked_io();
+}
+
+journal_seq_t JournalTrimmerImpl::get_tail_limit() const
+{
+ assert(background_callback->is_ready());
+ auto ret = journal_head.add_offset(
+ journal_type,
+ -static_cast<device_off_t>(config.max_journal_bytes),
+ roll_start,
+ roll_size);
+ return ret;
+}
+
+journal_seq_t JournalTrimmerImpl::get_dirty_tail_target() const
+{
+ assert(background_callback->is_ready());
+ auto ret = journal_head.add_offset(
+ journal_type,
+ -static_cast<device_off_t>(config.target_journal_dirty_bytes),
+ roll_start,
+ roll_size);
+ return ret;
+}
+
+journal_seq_t JournalTrimmerImpl::get_alloc_tail_target() const
+{
+ assert(background_callback->is_ready());
+ auto ret = journal_head.add_offset(
+ journal_type,
+ -static_cast<device_off_t>(config.target_journal_alloc_bytes),
+ roll_start,
+ roll_size);
+ return ret;
+}
+
+std::size_t JournalTrimmerImpl::get_dirty_journal_size() const
+{
+ if (!background_callback->is_ready()) {
+ return 0;
+ }
+ auto ret = journal_head.relative_to(
+ journal_type,
+ journal_dirty_tail,
+ roll_start,
+ roll_size);
+ ceph_assert(ret >= 0);
+ return static_cast<std::size_t>(ret);
+}
+
+std::size_t JournalTrimmerImpl::get_alloc_journal_size() const
+{
+ if (!background_callback->is_ready()) {
+ return 0;
+ }
+ auto ret = journal_head.relative_to(
+ journal_type,
+ journal_alloc_tail,
+ roll_start,
+ roll_size);
+ ceph_assert(ret >= 0);
+ return static_cast<std::size_t>(ret);
+}
+
+seastar::future<> JournalTrimmerImpl::trim() {
+ return seastar::when_all(
+ [this] {
+ if (should_trim_alloc()) {
+ return trim_alloc(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "encountered invalid error in trim_alloc"
+ }
+ );
+ } else {
+ return seastar::now();
+ }
+ },
+ [this] {
+ if (should_trim_dirty()) {
+ return trim_dirty(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "encountered invalid error in trim_dirty"
+ }
+ );
+ } else {
+ return seastar::now();
+ }
+ }
+ ).discard_result();
+}
+
+JournalTrimmerImpl::trim_ertr::future<>
+JournalTrimmerImpl::trim_alloc()
+{
+ LOG_PREFIX(JournalTrimmerImpl::trim_alloc);
+ assert(background_callback->is_ready());
+ return repeat_eagain([this, FNAME] {
+ return extent_callback->with_transaction_intr(
+ Transaction::src_t::TRIM_ALLOC,
+ "trim_alloc",
+ [this, FNAME](auto &t)
+ {
+ auto target = get_alloc_tail_target();
+ DEBUGT("start, alloc_tail={}, target={}",
+ t, journal_alloc_tail, target);
+ return backref_manager.merge_cached_backrefs(
+ t,
+ target,
+ config.rewrite_backref_bytes_per_cycle
+ ).si_then([this, FNAME, &t](auto trim_alloc_to)
+ -> ExtentCallbackInterface::submit_transaction_direct_iertr::future<>
+ {
+ DEBUGT("trim_alloc_to={}", t, trim_alloc_to);
+ if (trim_alloc_to != JOURNAL_SEQ_NULL) {
+ return extent_callback->submit_transaction_direct(
+ t, std::make_optional<journal_seq_t>(trim_alloc_to));
+ }
+ return seastar::now();
+ });
+ });
+ }).safe_then([this, FNAME] {
+ DEBUG("finish, alloc_tail={}", journal_alloc_tail);
+ });
+}
+
+JournalTrimmerImpl::trim_ertr::future<>
+JournalTrimmerImpl::trim_dirty()
+{
+ LOG_PREFIX(JournalTrimmerImpl::trim_dirty);
+ assert(background_callback->is_ready());
+ return repeat_eagain([this, FNAME] {
+ return extent_callback->with_transaction_intr(
+ Transaction::src_t::TRIM_DIRTY,
+ "trim_dirty",
+ [this, FNAME](auto &t)
+ {
+ auto target = get_dirty_tail_target();
+ DEBUGT("start, dirty_tail={}, target={}",
+ t, journal_dirty_tail, target);
+ return extent_callback->get_next_dirty_extents(
+ t,
+ target,
+ config.rewrite_dirty_bytes_per_cycle
+ ).si_then([this, FNAME, &t](auto dirty_list) {
+ DEBUGT("rewrite {} dirty extents", t, dirty_list.size());
+ return seastar::do_with(
+ std::move(dirty_list),
+ [this, &t](auto &dirty_list)
+ {
+ return trans_intr::do_for_each(
+ dirty_list,
+ [this, &t](auto &e) {
+ return extent_callback->rewrite_extent(
+ t, e, INIT_GENERATION, NULL_TIME);
+ });
+ });
+ }).si_then([this, &t] {
+ return extent_callback->submit_transaction_direct(t);
+ });
+ });
+ }).safe_then([this, FNAME] {
+ DEBUG("finish, dirty_tail={}", journal_dirty_tail);
+ });
+}
+
+void JournalTrimmerImpl::register_metrics()
+{
+ namespace sm = seastar::metrics;
+ metrics.add_group("journal_trimmer", {
+ sm::make_counter("dirty_journal_bytes",
+ [this] { return get_dirty_journal_size(); },
+ sm::description("the size of the journal for dirty extents")),
+ sm::make_counter("alloc_journal_bytes",
+ [this] { return get_alloc_journal_size(); },
+ sm::description("the size of the journal for alloc info"))
+ });
+}
+
+std::ostream &operator<<(
+ std::ostream &os, const JournalTrimmerImpl::stat_printer_t &stats)
+{
+ os << "JournalTrimmer(";
+ if (stats.trimmer.background_callback->is_ready()) {
+ os << "should_block_io_on_trim=" << stats.trimmer.should_block_io_on_trim()
+ << ", should_(trim_dirty=" << stats.trimmer.should_trim_dirty()
+ << ", trim_alloc=" << stats.trimmer.should_trim_alloc() << ")";
+ } else {
+ os << "not-ready";
+ }
+ if (stats.detailed) {
+ os << ", journal_head=" << stats.trimmer.get_journal_head()
+ << ", alloc_tail=" << stats.trimmer.get_alloc_tail()
+ << ", dirty_tail=" << stats.trimmer.get_dirty_tail();
+ if (stats.trimmer.background_callback->is_ready()) {
+ os << ", alloc_tail_target=" << stats.trimmer.get_alloc_tail_target()
+ << ", dirty_tail_target=" << stats.trimmer.get_dirty_tail_target()
+ << ", tail_limit=" << stats.trimmer.get_tail_limit();
+ }
+ }
+ os << ")";
+ return os;
+}
+
+bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const
+{
+ LOG_PREFIX(SpaceTrackerSimple::equals);
+ const auto &other = static_cast<const SpaceTrackerSimple&>(_other);
+
+ if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) {
+ ERROR("different segment counts, bug in test");
+ assert(0 == "segment counts should match");
+ return false;
+ }
+
+ bool all_match = true;
+ for (auto i = live_bytes_by_segment.begin(), j = other.live_bytes_by_segment.begin();
+ i != live_bytes_by_segment.end(); ++i, ++j) {
+ if (i->second.live_bytes != j->second.live_bytes) {
+ all_match = false;
+ DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}",
+ i->first, i->second.live_bytes, j->second.live_bytes);
+ }
+ }
+ return all_match;
+}
+
+int64_t SpaceTrackerDetailed::SegmentMap::allocate(
+ device_segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size)
+{
+ LOG_PREFIX(SegmentMap::allocate);
+ assert(offset % block_size == 0);
+ assert(len % block_size == 0);
+
+ const auto b = (offset / block_size);
+ const auto e = (offset + len) / block_size;
+
+ bool error = false;
+ for (auto i = b; i < e; ++i) {
+ if (bitmap[i]) {
+ if (!error) {
+ ERROR("found allocated in {}, {} ~ {}", segment, offset, len);
+ error = true;
+ }
+ DEBUG("block {} allocated", i * block_size);
+ }
+ bitmap[i] = true;
+ }
+ return update_usage(len);
+}
+
+int64_t SpaceTrackerDetailed::SegmentMap::release(
+ device_segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size)
+{
+ LOG_PREFIX(SegmentMap::release);
+ assert(offset % block_size == 0);
+ assert(len % block_size == 0);
+
+ const auto b = (offset / block_size);
+ const auto e = (offset + len) / block_size;
+
+ bool error = false;
+ for (auto i = b; i < e; ++i) {
+ if (!bitmap[i]) {
+ if (!error) {
+ ERROR("found unallocated in {}, {} ~ {}", segment, offset, len);
+ error = true;
+ }
+ DEBUG("block {} unallocated", i * block_size);
+ }
+ bitmap[i] = false;
+ }
+ return update_usage(-(int64_t)len);
+}
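+
+// Minimal sketch (illustrative only, not part of the original change) of the
+// per-block accounting used by allocate()/release() above: one bit per block
+// in [offset, offset + len) is flipped, and a block already in the requested
+// state is reported, which is how double-allocations and double-frees show up
+// in the logs.
+//
+//   std::vector<bool> bitmap(segment_size / block_size, false);
+//   auto set_range = [&](size_t off, size_t len, bool val) {
+//     for (size_t i = off / block_size; i < (off + len) / block_size; ++i) {
+//       if (bitmap[i] == val) { /* the real tracker logs ERROR/DEBUG here */ }
+//       bitmap[i] = val;
+//     }
+//   };
+//   set_range(0, 8192, true);   // allocate two blocks, assuming 4K block_size
+//   set_range(0, 4096, false);  // release the first block again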
+
+bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const
+{
+ LOG_PREFIX(SpaceTrackerDetailed::equals);
+ const auto &other = static_cast<const SpaceTrackerDetailed&>(_other);
+
+ if (other.segment_usage.size() != segment_usage.size()) {
+ ERROR("different segment counts, bug in test");
+ assert(0 == "segment counts should match");
+ return false;
+ }
+
+ bool all_match = true;
+ for (auto i = segment_usage.begin(), j = other.segment_usage.begin();
+ i != segment_usage.end(); ++i, ++j) {
+ if (i->second.get_usage() != j->second.get_usage()) {
+ all_match = false;
+ ERROR("segment_id {} live bytes mismatch *this: {}, other: {}",
+ i->first, i->second.get_usage(), j->second.get_usage());
+ }
+ }
+ return all_match;
+}
+
+void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const
+{
+ LOG_PREFIX(SegmentMap::dump_usage);
+ INFO("dump start");
+ for (unsigned i = 0; i < bitmap.size(); ++i) {
+ if (bitmap[i]) {
+ LOCAL_LOGGER.info(" {} still live", i * block_size);
+ }
+ }
+}
+
+void SpaceTrackerDetailed::dump_usage(segment_id_t id) const
+{
+ LOG_PREFIX(SpaceTrackerDetailed::dump_usage);
+ INFO("{}", id);
+ segment_usage[id].dump_usage(
+ block_size_by_segment_manager[id.device_id()]);
+}
+
+void SpaceTrackerSimple::dump_usage(segment_id_t id) const
+{
+ LOG_PREFIX(SpaceTrackerSimple::dump_usage);
+ INFO("id: {}, live_bytes: {}",
+ id, live_bytes_by_segment[id].live_bytes);
+}
+
+std::ostream &operator<<(
+ std::ostream &os, const AsyncCleaner::stat_printer_t &stats)
+{
+ stats.cleaner.print(os, stats.detailed);
+ return os;
+}
+
+SegmentCleaner::SegmentCleaner(
+ config_t config,
+ SegmentManagerGroupRef&& sm_group,
+ BackrefManager &backref_manager,
+ SegmentSeqAllocator &segment_seq_allocator,
+ bool detailed,
+ bool is_cold)
+ : detailed(detailed),
+ is_cold(is_cold),
+ config(config),
+ sm_group(std::move(sm_group)),
+ backref_manager(backref_manager),
+ ool_segment_seq_allocator(segment_seq_allocator)
+{
+ config.validate();
+}
+
+void SegmentCleaner::register_metrics()
+{
+ namespace sm = seastar::metrics;
+ stats.segment_util.buckets.resize(UTIL_BUCKETS);
+ std::size_t i;
+ for (i = 0; i < UTIL_BUCKETS; ++i) {
+ stats.segment_util.buckets[i].upper_bound = ((double)(i + 1)) / 10;
+ stats.segment_util.buckets[i].count = 0;
+ }
+ // NOTE: by default the segments are empty
+ i = get_bucket_index(UTIL_STATE_EMPTY);
+ stats.segment_util.buckets[i].count = segments.get_num_segments();
+
+ std::string prefix;
+ if (is_cold) {
+ prefix.append("cold_");
+ }
+ prefix.append("segment_cleaner");
+
+ metrics.add_group(prefix, {
+ sm::make_counter("segments_number",
+ [this] { return segments.get_num_segments(); },
+ sm::description("the number of segments")),
+ sm::make_counter("segment_size",
+ [this] { return segments.get_segment_size(); },
+ sm::description("the bytes of a segment")),
+ sm::make_counter("segments_in_journal",
+ [this] { return get_segments_in_journal(); },
+ sm::description("the number of segments in journal")),
+ sm::make_counter("segments_type_journal",
+ [this] { return segments.get_num_type_journal(); },
+ sm::description("the number of segments typed journal")),
+ sm::make_counter("segments_type_ool",
+ [this] { return segments.get_num_type_ool(); },
+ sm::description("the number of segments typed out-of-line")),
+ sm::make_counter("segments_open",
+ [this] { return segments.get_num_open(); },
+ sm::description("the number of open segments")),
+ sm::make_counter("segments_empty",
+ [this] { return segments.get_num_empty(); },
+ sm::description("the number of empty segments")),
+ sm::make_counter("segments_closed",
+ [this] { return segments.get_num_closed(); },
+ sm::description("the number of closed segments")),
+
+ sm::make_counter("segments_count_open_journal",
+ [this] { return segments.get_count_open_journal(); },
+ sm::description("the count of open journal segment operations")),
+ sm::make_counter("segments_count_open_ool",
+ [this] { return segments.get_count_open_ool(); },
+ sm::description("the count of open ool segment operations")),
+ sm::make_counter("segments_count_release_journal",
+ [this] { return segments.get_count_release_journal(); },
+ sm::description("the count of release journal segment operations")),
+ sm::make_counter("segments_count_release_ool",
+ [this] { return segments.get_count_release_ool(); },
+ sm::description("the count of release ool segment operations")),
+ sm::make_counter("segments_count_close_journal",
+ [this] { return segments.get_count_close_journal(); },
+ sm::description("the count of close journal segment operations")),
+ sm::make_counter("segments_count_close_ool",
+ [this] { return segments.get_count_close_ool(); },
+ sm::description("the count of close ool segment operations")),
+
+ sm::make_counter("total_bytes",
+ [this] { return segments.get_total_bytes(); },
+ sm::description("the size of the space")),
+ sm::make_counter("available_bytes",
+ [this] { return segments.get_available_bytes(); },
+ sm::description("the size of the space is available")),
+ sm::make_counter("unavailable_unreclaimable_bytes",
+ [this] { return get_unavailable_unreclaimable_bytes(); },
+ sm::description("the size of the space is unavailable and unreclaimable")),
+ sm::make_counter("unavailable_reclaimable_bytes",
+ [this] { return get_unavailable_reclaimable_bytes(); },
+ sm::description("the size of the space is unavailable and reclaimable")),
+ sm::make_counter("used_bytes", stats.used_bytes,
+ sm::description("the size of the space occupied by live extents")),
+ sm::make_counter("unavailable_unused_bytes",
+ [this] { return get_unavailable_unused_bytes(); },
+ sm::description("the size of the space is unavailable and not alive")),
+
+ sm::make_counter("projected_count", stats.projected_count,
+ sm::description("the number of projected usage reservations")),
+ sm::make_counter("projected_used_bytes_sum", stats.projected_used_bytes_sum,
+ sm::description("the sum of the projected usage in bytes")),
+
+ sm::make_counter("reclaimed_bytes", stats.reclaimed_bytes,
+ sm::description("rewritten bytes due to reclaim")),
+ sm::make_counter("reclaimed_segment_bytes", stats.reclaimed_segment_bytes,
+ sm::description("rewritten bytes due to reclaim")),
+ sm::make_counter("closed_journal_used_bytes", stats.closed_journal_used_bytes,
+ sm::description("used bytes when close a journal segment")),
+ sm::make_counter("closed_journal_total_bytes", stats.closed_journal_total_bytes,
+ sm::description("total bytes of closed journal segments")),
+ sm::make_counter("closed_ool_used_bytes", stats.closed_ool_used_bytes,
+ sm::description("used bytes when close a ool segment")),
+ sm::make_counter("closed_ool_total_bytes", stats.closed_ool_total_bytes,
+ sm::description("total bytes of closed ool segments")),
+
+ sm::make_gauge("available_ratio",
+ [this] { return segments.get_available_ratio(); },
+ sm::description("ratio of available space to total space")),
+ sm::make_gauge("reclaim_ratio",
+ [this] { return get_reclaim_ratio(); },
+ sm::description("ratio of reclaimable space to unavailable space")),
+
+ sm::make_histogram("segment_utilization_distribution",
+ [this]() -> seastar::metrics::histogram& {
+ return stats.segment_util;
+ },
+ sm::description("utilization distribution of all segments"))
+ });
+}
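+
+// Illustrative note (not part of the original change): the histogram above
+// tracks the utilization distribution of all segments with bucket upper
+// bounds 0.1, 0.2, ..., 1.0 (assuming UTIL_BUCKETS is 10, matching the
+// `(i + 1) / 10` bound set here). A mapping consistent with those bounds is:
+//
+//   std::size_t bucket_for(double util) {          // e.g. util == 0.37
+//     return static_cast<std::size_t>(util * 10);  // -> bucket 3, bound 0.4
+//   }
+//
+// and adjust_segment_util() is used throughout this file to move a segment's
+// count between buckets as its utilization changes.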
+
+segment_id_t SegmentCleaner::allocate_segment(
+ segment_seq_t seq,
+ segment_type_t type,
+ data_category_t category,
+ rewrite_gen_t generation)
+{
+ LOG_PREFIX(SegmentCleaner::allocate_segment);
+ assert(seq != NULL_SEG_SEQ);
+ ceph_assert(type == segment_type_t::OOL ||
+ trimmer != nullptr); // segment_type_t::JOURNAL
+ for (auto it = segments.begin();
+ it != segments.end();
+ ++it) {
+ auto seg_id = it->first;
+ auto& segment_info = it->second;
+ if (segment_info.is_empty()) {
+ auto old_usage = calc_utilization(seg_id);
+ segments.mark_open(seg_id, seq, type, category, generation);
+ background_callback->maybe_wake_background();
+ auto new_usage = calc_utilization(seg_id);
+ adjust_segment_util(old_usage, new_usage);
+ INFO("opened, {}", stat_printer_t{*this, false});
+ return seg_id;
+ }
+ }
+ ERROR("out of space with {} {} {} {}",
+ type, segment_seq_printer_t{seq}, category,
+ rewrite_gen_printer_t{generation});
+ ceph_abort("seastore device size setting is too small");
+ return NULL_SEG_ID;
+}
+
+void SegmentCleaner::close_segment(segment_id_t segment)
+{
+ LOG_PREFIX(SegmentCleaner::close_segment);
+ auto old_usage = calc_utilization(segment);
+ segments.mark_closed(segment);
+ auto &seg_info = segments[segment];
+ if (seg_info.type == segment_type_t::JOURNAL) {
+ stats.closed_journal_used_bytes += space_tracker->get_usage(segment);
+ stats.closed_journal_total_bytes += segments.get_segment_size();
+ } else {
+ stats.closed_ool_used_bytes += space_tracker->get_usage(segment);
+ stats.closed_ool_total_bytes += segments.get_segment_size();
+ }
+ auto new_usage = calc_utilization(segment);
+ adjust_segment_util(old_usage, new_usage);
+ INFO("closed, {} -- {}", stat_printer_t{*this, false}, seg_info);
+}
+
+double SegmentCleaner::calc_gc_benefit_cost(
+ segment_id_t id,
+ const sea_time_point &now_time,
+ const sea_time_point &bound_time) const
+{
+ double util = calc_utilization(id);
+ ceph_assert(util >= 0 && util < 1);
+ if constexpr (gc_formula == gc_formula_t::GREEDY) {
+ return 1 - util;
+ }
+
+ if constexpr (gc_formula == gc_formula_t::COST_BENEFIT) {
+ if (util == 0) {
+ return std::numeric_limits<double>::max();
+ }
+ auto modify_time = segments[id].modify_time;
+ double age_segment = modify_time.time_since_epoch().count();
+ double age_now = now_time.time_since_epoch().count();
+ if (likely(age_now > age_segment)) {
+ return (1 - util) * (age_now - age_segment) / (2 * util);
+ } else {
+ // time is wrong
+ return (1 - util) / (2 * util);
+ }
+ }
+
+ assert(gc_formula == gc_formula_t::BENEFIT);
+ auto modify_time = segments[id].modify_time;
+ double age_factor = 0.5; // middle value if age is invalid
+ if (likely(bound_time != NULL_TIME &&
+ modify_time != NULL_TIME &&
+ now_time > modify_time)) {
+ assert(modify_time >= bound_time);
+ double age_bound = bound_time.time_since_epoch().count();
+ double age_now = now_time.time_since_epoch().count();
+ double age_segment = modify_time.time_since_epoch().count();
+ age_factor = (age_now - age_segment) / (age_now - age_bound);
+ }
+ return ((1 - 2 * age_factor) * util * util +
+ (2 * age_factor - 2) * util + 1);
+}
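+
+// Worked example (illustrative, not part of the original change) for the three
+// policies above, taking a segment with utilization u = 0.25:
+//   GREEDY:        score = 1 - u                   = 0.75
+//   COST_BENEFIT:  score = (1 - u) * age / (2 * u) = 1.5 * age
+//                  (an older segment with the same utilization scores higher)
+//   BENEFIT:       score = (1 - 2a) * u^2 + (2a - 2) * u + 1, with the age
+//                  factor a in [0, 1]; a = 1 gives 1 - u^2 = 0.9375 and
+//                  a = 0 gives (1 - u)^2 = 0.5625, so colder segments again
+//                  win ties.
+// get_next_reclaim_segment() below picks the closed segment outside the
+// current journal with the highest score.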
+
+SegmentCleaner::do_reclaim_space_ret
+SegmentCleaner::do_reclaim_space(
+ const std::vector<CachedExtentRef> &backref_extents,
+ const backref_pin_list_t &pin_list,
+ std::size_t &reclaimed,
+ std::size_t &runs)
+{
+ return repeat_eagain([this, &backref_extents,
+ &pin_list, &reclaimed, &runs] {
+ reclaimed = 0;
+ runs++;
+ auto src = Transaction::src_t::CLEANER_MAIN;
+ if (is_cold) {
+ src = Transaction::src_t::CLEANER_COLD;
+ }
+ return extent_callback->with_transaction_intr(
+ src,
+ "clean_reclaim_space",
+ [this, &backref_extents, &pin_list, &reclaimed](auto &t)
+ {
+ return seastar::do_with(
+ std::vector<CachedExtentRef>(backref_extents),
+ [this, &t, &reclaimed, &pin_list](auto &extents)
+ {
+ LOG_PREFIX(SegmentCleaner::do_reclaim_space);
+ // calculate live extents
+ auto cached_backref_entries =
+ backref_manager.get_cached_backref_entries_in_range(
+ reclaim_state->start_pos, reclaim_state->end_pos);
+ backref_entry_query_set_t backref_entries;
+ for (auto &pin : pin_list) {
+ backref_entries.emplace(
+ pin->get_key(),
+ pin->get_val(),
+ pin->get_length(),
+ pin->get_type(),
+ JOURNAL_SEQ_NULL);
+ }
+ for (auto &cached_backref : cached_backref_entries) {
+ if (cached_backref.laddr == L_ADDR_NULL) {
+ auto it = backref_entries.find(cached_backref.paddr);
+ assert(it->len == cached_backref.len);
+ backref_entries.erase(it);
+ } else {
+ backref_entries.emplace(cached_backref);
+ }
+ }
+ // retrieve live extents
+ DEBUGT("start, backref_entries={}, backref_extents={}",
+ t, backref_entries.size(), extents.size());
+ return seastar::do_with(
+ std::move(backref_entries),
+ [this, &extents, &t](auto &backref_entries) {
+ return trans_intr::parallel_for_each(
+ backref_entries,
+ [this, &extents, &t](auto &ent)
+ {
+ LOG_PREFIX(SegmentCleaner::do_reclaim_space);
+ TRACET("getting extent of type {} at {}~{}",
+ t,
+ ent.type,
+ ent.paddr,
+ ent.len);
+ return extent_callback->get_extents_if_live(
+ t, ent.type, ent.paddr, ent.laddr, ent.len
+ ).si_then([FNAME, &extents, &ent, &t](auto list) {
+ if (list.empty()) {
+ TRACET("addr {} dead, skipping", t, ent.paddr);
+ } else {
+ for (auto &e : list) {
+ extents.emplace_back(std::move(e));
+ }
+ }
+ });
+ });
+ }).si_then([FNAME, &extents, this, &reclaimed, &t] {
+ DEBUGT("reclaim {} extents", t, extents.size());
+ // rewrite live extents
+ auto modify_time = segments[reclaim_state->get_segment_id()].modify_time;
+ return trans_intr::do_for_each(
+ extents,
+ [this, modify_time, &t, &reclaimed](auto ext)
+ {
+ reclaimed += ext->get_length();
+ return extent_callback->rewrite_extent(
+ t, ext, reclaim_state->target_generation, modify_time);
+ });
+ });
+ }).si_then([this, &t] {
+ return extent_callback->submit_transaction_direct(t);
+ });
+ });
+ });
+}
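+
+// Illustrative summary (not part of the original change) of the reclaim pass
+// above: persisted backref pins and not-yet-merged cached backref entries for
+// the range are combined into one query set, where a cached entry with
+// laddr == L_ADDR_NULL cancels the corresponding persisted mapping (the
+// mapping was removed after the last backref merge):
+//
+//   for (auto &cached : cached_backref_entries) {
+//     if (cached.laddr == L_ADDR_NULL) {
+//       backref_entries.erase(backref_entries.find(cached.paddr));
+//     } else {
+//       backref_entries.emplace(cached);
+//     }
+//   }
+//
+// Only the extents that are still live are then rewritten to the target
+// generation and the transaction is submitted.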
+
+SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
+{
+ LOG_PREFIX(SegmentCleaner::clean_space);
+ assert(background_callback->is_ready());
+ ceph_assert(can_clean_space());
+ if (!reclaim_state) {
+ segment_id_t seg_id = get_next_reclaim_segment();
+ auto &segment_info = segments[seg_id];
+ INFO("reclaim {} {} start, usage={}, time_bound={}",
+ seg_id, segment_info,
+ space_tracker->calc_utilization(seg_id),
+ sea_time_point_printer_t{segments.get_time_bound()});
+ ceph_assert(segment_info.is_closed());
+ reclaim_state = reclaim_state_t::create(
+ seg_id, segment_info.generation, segments.get_segment_size());
+ }
+ reclaim_state->advance(config.reclaim_bytes_per_cycle);
+
+ DEBUG("reclaiming {} {}~{}",
+ rewrite_gen_printer_t{reclaim_state->generation},
+ reclaim_state->start_pos,
+ reclaim_state->end_pos);
+ double pavail_ratio = get_projected_available_ratio();
+ sea_time_point start = seastar::lowres_system_clock::now();
+
+  // The backref tree doesn't support reads concurrent with updates across
+  // parallel transactions, so concurrent transactions between trim and
+  // reclaim are not allowed right now.
+ return seastar::do_with(
+ std::pair<std::vector<CachedExtentRef>, backref_pin_list_t>(),
+ [this](auto &weak_read_ret) {
+ return repeat_eagain([this, &weak_read_ret] {
+ return extent_callback->with_transaction_intr(
+ Transaction::src_t::READ,
+ "retrieve_from_backref_tree",
+ [this, &weak_read_ret](auto &t) {
+ return backref_manager.get_mappings(
+ t,
+ reclaim_state->start_pos,
+ reclaim_state->end_pos
+ ).si_then([this, &t, &weak_read_ret](auto pin_list) {
+ if (!pin_list.empty()) {
+ auto it = pin_list.begin();
+ auto &first_pin = *it;
+ if (first_pin->get_key() < reclaim_state->start_pos) {
+              // BackrefManager::get_mappings may include an entry before
+ // reclaim_state->start_pos, which is semantically inconsistent
+ // with the requirements of the cleaner
+ pin_list.erase(it);
+ }
+ }
+ return backref_manager.retrieve_backref_extents_in_range(
+ t,
+ reclaim_state->start_pos,
+ reclaim_state->end_pos
+ ).si_then([pin_list=std::move(pin_list),
+ &weak_read_ret](auto extents) mutable {
+ weak_read_ret = std::make_pair(std::move(extents), std::move(pin_list));
+ });
+ });
+ });
+ }).safe_then([&weak_read_ret] {
+ return std::move(weak_read_ret);
+ });
+ }).safe_then([this, FNAME, pavail_ratio, start](auto weak_read_ret) {
+ return seastar::do_with(
+ std::move(weak_read_ret.first),
+ std::move(weak_read_ret.second),
+ (size_t)0,
+ (size_t)0,
+ [this, FNAME, pavail_ratio, start](
+ auto &backref_extents, auto &pin_list, auto &reclaimed, auto &runs)
+ {
+ return do_reclaim_space(
+ backref_extents,
+ pin_list,
+ reclaimed,
+ runs
+ ).safe_then([this, FNAME, pavail_ratio, start, &reclaimed, &runs] {
+ stats.reclaiming_bytes += reclaimed;
+ auto d = seastar::lowres_system_clock::now() - start;
+ DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}",
+ d, pavail_ratio, runs);
+ if (reclaim_state->is_complete()) {
+ auto segment_to_release = reclaim_state->get_segment_id();
+ INFO("reclaim {} finish, reclaimed alive/total={}",
+ segment_to_release,
+ stats.reclaiming_bytes/(double)segments.get_segment_size());
+ stats.reclaimed_bytes += stats.reclaiming_bytes;
+ stats.reclaimed_segment_bytes += segments.get_segment_size();
+ stats.reclaiming_bytes = 0;
+ reclaim_state.reset();
+ return sm_group->release_segment(segment_to_release
+ ).handle_error(
+ clean_space_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "SegmentCleaner::clean_space encountered invalid error in release_segment"
+ }
+ ).safe_then([this, FNAME, segment_to_release] {
+ auto old_usage = calc_utilization(segment_to_release);
+ if(unlikely(old_usage != 0)) {
+ space_tracker->dump_usage(segment_to_release);
+ ERROR("segment {} old_usage {} != 0",
+ segment_to_release, old_usage);
+ ceph_abort();
+ }
+ segments.mark_empty(segment_to_release);
+ auto new_usage = calc_utilization(segment_to_release);
+ adjust_segment_util(old_usage, new_usage);
+ INFO("released {}, {}",
+ segment_to_release, stat_printer_t{*this, false});
+ background_callback->maybe_wake_blocked_io();
+ });
+ } else {
+ return clean_space_ertr::now();
+ }
+ });
+ });
+ });
+}
+
+SegmentCleaner::mount_ret SegmentCleaner::mount()
+{
+ LOG_PREFIX(SegmentCleaner::mount);
+ const auto& sms = sm_group->get_segment_managers();
+ INFO("{} segment managers", sms.size());
+
+ assert(background_callback->get_state() == state_t::MOUNT);
+
+ space_tracker.reset(
+ detailed ?
+ (SpaceTrackerI*)new SpaceTrackerDetailed(
+ sms) :
+ (SpaceTrackerI*)new SpaceTrackerSimple(
+ sms));
+
+ segments.reset();
+ for (auto sm : sms) {
+ segments.add_segment_manager(*sm);
+ }
+ segments.assign_ids();
+
+ stats = {};
+ metrics.clear();
+ register_metrics();
+
+ INFO("{} segments", segments.get_num_segments());
+ return crimson::do_for_each(
+ segments.begin(),
+ segments.end(),
+ [this, FNAME](auto& it)
+ {
+ auto segment_id = it.first;
+ return sm_group->read_segment_header(
+ segment_id
+ ).safe_then([segment_id, this, FNAME](auto header) {
+ DEBUG("segment_id={} -- {}", segment_id, header);
+ auto s_type = header.get_type();
+ if (s_type == segment_type_t::NULL_SEG) {
+ ERROR("got null segment, segment_id={} -- {}", segment_id, header);
+ ceph_abort();
+ }
+ return sm_group->read_segment_tail(
+ segment_id
+ ).safe_then([this, FNAME, segment_id, header](auto tail)
+ -> scan_extents_ertr::future<> {
+ if (tail.segment_nonce != header.segment_nonce) {
+ return scan_no_tail_segment(header, segment_id);
+ }
+ ceph_assert(header.get_type() == tail.get_type());
+
+ sea_time_point modify_time = mod_to_timepoint(tail.modify_time);
+ std::size_t num_extents = tail.num_extents;
+ if ((modify_time == NULL_TIME && num_extents == 0) ||
+ (modify_time != NULL_TIME && num_extents != 0)) {
+ segments.update_modify_time(segment_id, modify_time, num_extents);
+ } else {
+ ERROR("illegal modify time {}", tail);
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ init_mark_segment_closed(
+ segment_id,
+ header.segment_seq,
+ header.type,
+ header.category,
+ header.generation);
+ return seastar::now();
+ }).handle_error(
+ crimson::ct_error::enodata::handle(
+ [this, header, segment_id](auto) {
+ return scan_no_tail_segment(header, segment_id);
+ }),
+ crimson::ct_error::pass_further_all{}
+ );
+ }).handle_error(
+ crimson::ct_error::enoent::handle([](auto) {
+ return mount_ertr::now();
+ }),
+ crimson::ct_error::enodata::handle([](auto) {
+ return mount_ertr::now();
+ }),
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error"}
+ );
+ }).safe_then([this, FNAME] {
+ INFO("done, {}", segments);
+ });
+}
+
+SegmentCleaner::scan_extents_ret SegmentCleaner::scan_no_tail_segment(
+ const segment_header_t &segment_header,
+ segment_id_t segment_id)
+{
+ LOG_PREFIX(SegmentCleaner::scan_no_tail_segment);
+ INFO("scan {} {}", segment_id, segment_header);
+ return seastar::do_with(
+ scan_valid_records_cursor({
+ segments[segment_id].seq,
+ paddr_t::make_seg_paddr(segment_id, 0)
+ }),
+ SegmentManagerGroup::found_record_handler_t(
+ [this, segment_id, segment_header, FNAME](
+ record_locator_t locator,
+ const record_group_header_t &record_group_header,
+ const bufferlist& mdbuf
+ ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<>
+ {
+ DEBUG("{} {}, decoding {} records",
+ segment_id, segment_header.get_type(), record_group_header.records);
+
+ auto maybe_headers = try_decode_record_headers(
+ record_group_header, mdbuf);
+ if (!maybe_headers) {
+        // This should be impossible; we did check the crc on the mdbuf
+ ERROR("unable to decode record headers for record group {}",
+ locator.record_block_base);
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ for (auto &record_header : *maybe_headers) {
+ auto modify_time = mod_to_timepoint(record_header.modify_time);
+ if (record_header.extents == 0 || modify_time != NULL_TIME) {
+ segments.update_modify_time(
+ segment_id, modify_time, record_header.extents);
+ } else {
+ ERROR("illegal modify time {}", record_header);
+ return crimson::ct_error::input_output_error::make();
+ }
+ }
+ return seastar::now();
+ }),
+ [this, segment_header](auto &cursor, auto &handler)
+ {
+ return sm_group->scan_valid_records(
+ cursor,
+ segment_header.segment_nonce,
+ segments.get_segment_size(),
+ handler).discard_result();
+ }).safe_then([this, segment_id, segment_header] {
+ init_mark_segment_closed(
+ segment_id,
+ segment_header.segment_seq,
+ segment_header.type,
+ segment_header.category,
+ segment_header.generation);
+ });
+}
+
+bool SegmentCleaner::check_usage()
+{
+ SpaceTrackerIRef tracker(space_tracker->make_empty());
+ extent_callback->with_transaction_weak(
+ "check_usage",
+ [this, &tracker](auto &t) {
+ return backref_manager.scan_mapped_space(
+ t,
+ [&tracker](
+ paddr_t paddr,
+ paddr_t backref_key,
+ extent_len_t len,
+ extent_types_t type,
+ laddr_t laddr)
+ {
+ if (paddr.get_addr_type() == paddr_types_t::SEGMENT) {
+ if (is_backref_node(type)) {
+ assert(laddr == L_ADDR_NULL);
+ assert(backref_key != P_ADDR_NULL);
+ tracker->allocate(
+ paddr.as_seg_paddr().get_segment_id(),
+ paddr.as_seg_paddr().get_segment_off(),
+ len);
+ } else if (laddr == L_ADDR_NULL) {
+ assert(backref_key == P_ADDR_NULL);
+ tracker->release(
+ paddr.as_seg_paddr().get_segment_id(),
+ paddr.as_seg_paddr().get_segment_off(),
+ len);
+ } else {
+ assert(backref_key == P_ADDR_NULL);
+ tracker->allocate(
+ paddr.as_seg_paddr().get_segment_id(),
+ paddr.as_seg_paddr().get_segment_off(),
+ len);
+ }
+ }
+ });
+ }).unsafe_get0();
+ return space_tracker->equals(*tracker);
+}
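+
+// Illustrative note (not part of the original change): check_usage() rebuilds
+// an empty tracker from a full backref scan under a weak (read-only)
+// transaction and compares it with the incrementally maintained one, e.g. a
+// hypothetical caller might do:
+//
+//   ceph_assert(cleaner.check_usage());  // detects accounting drift
+//
+// so any divergence between the live space tracker and the backref tree shows
+// up as a mismatch reported by SpaceTracker*::equals().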
+
+void SegmentCleaner::mark_space_used(
+ paddr_t addr,
+ extent_len_t len)
+{
+ LOG_PREFIX(SegmentCleaner::mark_space_used);
+ assert(background_callback->get_state() >= state_t::SCAN_SPACE);
+ assert(len);
+ // TODO: drop
+ if (addr.get_addr_type() != paddr_types_t::SEGMENT) {
+ return;
+ }
+
+ auto& seg_addr = addr.as_seg_paddr();
+ stats.used_bytes += len;
+ auto old_usage = calc_utilization(seg_addr.get_segment_id());
+ [[maybe_unused]] auto ret = space_tracker->allocate(
+ seg_addr.get_segment_id(),
+ seg_addr.get_segment_off(),
+ len);
+ auto new_usage = calc_utilization(seg_addr.get_segment_id());
+ adjust_segment_util(old_usage, new_usage);
+
+ background_callback->maybe_wake_background();
+ assert(ret > 0);
+ DEBUG("segment {} new len: {}~{}, live_bytes: {}",
+ seg_addr.get_segment_id(),
+ addr,
+ len,
+ space_tracker->get_usage(seg_addr.get_segment_id()));
+}
+
+void SegmentCleaner::mark_space_free(
+ paddr_t addr,
+ extent_len_t len)
+{
+ LOG_PREFIX(SegmentCleaner::mark_space_free);
+ assert(background_callback->get_state() >= state_t::SCAN_SPACE);
+ assert(len);
+ // TODO: drop
+ if (addr.get_addr_type() != paddr_types_t::SEGMENT) {
+ return;
+ }
+
+ ceph_assert(stats.used_bytes >= len);
+ stats.used_bytes -= len;
+ auto& seg_addr = addr.as_seg_paddr();
+
+ DEBUG("segment {} free len: {}~{}",
+ seg_addr.get_segment_id(), addr, len);
+ auto old_usage = calc_utilization(seg_addr.get_segment_id());
+ [[maybe_unused]] auto ret = space_tracker->release(
+ seg_addr.get_segment_id(),
+ seg_addr.get_segment_off(),
+ len);
+ auto new_usage = calc_utilization(seg_addr.get_segment_id());
+ adjust_segment_util(old_usage, new_usage);
+ background_callback->maybe_wake_blocked_io();
+ assert(ret >= 0);
+ DEBUG("segment {} free len: {}~{}, live_bytes: {}",
+ seg_addr.get_segment_id(),
+ addr,
+ len,
+ space_tracker->get_usage(seg_addr.get_segment_id()));
+}
+
+segment_id_t SegmentCleaner::get_next_reclaim_segment() const
+{
+ LOG_PREFIX(SegmentCleaner::get_next_reclaim_segment);
+ segment_id_t id = NULL_SEG_ID;
+ double max_benefit_cost = 0;
+ sea_time_point now_time;
+ if constexpr (gc_formula != gc_formula_t::GREEDY) {
+ now_time = seastar::lowres_system_clock::now();
+ } else {
+ now_time = NULL_TIME;
+ }
+ sea_time_point bound_time;
+ if constexpr (gc_formula == gc_formula_t::BENEFIT) {
+ bound_time = segments.get_time_bound();
+ if (bound_time == NULL_TIME) {
+ WARN("BENEFIT -- bound_time is NULL_TIME");
+ }
+ } else {
+ bound_time = NULL_TIME;
+ }
+ for (auto& [_id, segment_info] : segments) {
+ if (segment_info.is_closed() &&
+ (trimmer == nullptr ||
+ !segment_info.is_in_journal(trimmer->get_journal_tail()))) {
+ double benefit_cost = calc_gc_benefit_cost(_id, now_time, bound_time);
+ if (benefit_cost > max_benefit_cost) {
+ id = _id;
+ max_benefit_cost = benefit_cost;
+ }
+ }
+ }
+ if (id != NULL_SEG_ID) {
+ DEBUG("segment {}, benefit_cost {}",
+ id, max_benefit_cost);
+ return id;
+ } else {
+ ceph_assert(get_segments_reclaimable() == 0);
+ // see should_clean_space()
+ ceph_abort("impossible!");
+ return NULL_SEG_ID;
+ }
+}
+
+bool SegmentCleaner::try_reserve_projected_usage(std::size_t projected_usage)
+{
+ assert(background_callback->is_ready());
+ stats.projected_used_bytes += projected_usage;
+ if (should_block_io_on_clean()) {
+ stats.projected_used_bytes -= projected_usage;
+ return false;
+ } else {
+ ++stats.projected_count;
+ stats.projected_used_bytes_sum += stats.projected_used_bytes;
+ return true;
+ }
+}
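+
+// Usage sketch (illustrative, not part of the original change): reservations
+// are optimistic; the usage is added first and rolled back if it would push
+// the cleaner into the IO-blocking state. A hypothetical caller would pair it
+// with release_projected_usage():
+//
+//   if (!cleaner.try_reserve_projected_usage(projected)) {
+//     /* block the IO until the cleaner frees space, then retry */
+//   }
+//   /* ... submit the transaction ... */
+//   cleaner.release_projected_usage(projected);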
+
+void SegmentCleaner::release_projected_usage(std::size_t projected_usage)
+{
+ assert(background_callback->is_ready());
+ ceph_assert(stats.projected_used_bytes >= projected_usage);
+ stats.projected_used_bytes -= projected_usage;
+ background_callback->maybe_wake_blocked_io();
+}
+
+void SegmentCleaner::print(std::ostream &os, bool is_detailed) const
+{
+ os << "SegmentCleaner(";
+ if (background_callback->is_ready()) {
+ os << "should_block_io_on_clean=" << should_block_io_on_clean()
+ << ", should_clean=" << should_clean_space();
+ } else {
+ os << "not-ready";
+ }
+ os << ", projected_avail_ratio=" << get_projected_available_ratio()
+ << ", reclaim_ratio=" << get_reclaim_ratio()
+ << ", alive_ratio=" << get_alive_ratio();
+ if (is_detailed) {
+ os << ", unavailable_unreclaimable="
+ << get_unavailable_unreclaimable_bytes() << "B"
+ << ", unavailable_reclaimble="
+ << get_unavailable_reclaimable_bytes() << "B"
+ << ", alive=" << stats.used_bytes << "B"
+ << ", " << segments;
+ }
+ os << ")";
+}
+
+RBMCleaner::RBMCleaner(
+ RBMDeviceGroupRef&& rb_group,
+ BackrefManager &backref_manager,
+ bool detailed)
+ : detailed(detailed),
+ rb_group(std::move(rb_group)),
+ backref_manager(backref_manager)
+{}
+
+void RBMCleaner::print(std::ostream &os, bool is_detailed) const
+{
+ // TODO
+ return;
+}
+
+void RBMCleaner::mark_space_used(
+ paddr_t addr,
+ extent_len_t len)
+{
+ LOG_PREFIX(RBMCleaner::mark_space_used);
+ assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK);
+ auto rbms = rb_group->get_rb_managers();
+ for (auto rbm : rbms) {
+ if (addr.get_device_id() == rbm->get_device_id()) {
+ if (rbm->get_start() <= addr) {
+ INFO("allocate addr: {} len: {}", addr, len);
+ stats.used_bytes += len;
+ rbm->mark_space_used(addr, len);
+ }
+ return;
+ }
+ }
+}
+
+void RBMCleaner::mark_space_free(
+ paddr_t addr,
+ extent_len_t len)
+{
+ LOG_PREFIX(RBMCleaner::mark_space_free);
+ assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK);
+ auto rbms = rb_group->get_rb_managers();
+ for (auto rbm : rbms) {
+ if (addr.get_device_id() == rbm->get_device_id()) {
+ if (rbm->get_start() <= addr) {
+ INFO("free addr: {} len: {}", addr, len);
+ ceph_assert(stats.used_bytes >= len);
+ stats.used_bytes -= len;
+ rbm->mark_space_free(addr, len);
+ }
+ return;
+ }
+ }
+}
+
+void RBMCleaner::commit_space_used(paddr_t addr, extent_len_t len)
+{
+ auto rbms = rb_group->get_rb_managers();
+ for (auto rbm : rbms) {
+ if (addr.get_device_id() == rbm->get_device_id()) {
+ if (rbm->get_start() <= addr) {
+ rbm->complete_allocation(addr, len);
+ }
+ return;
+ }
+ }
+}
+
+bool RBMCleaner::try_reserve_projected_usage(std::size_t projected_usage)
+{
+ assert(background_callback->is_ready());
+ stats.projected_used_bytes += projected_usage;
+ return true;
+}
+
+void RBMCleaner::release_projected_usage(std::size_t projected_usage)
+{
+ assert(background_callback->is_ready());
+ ceph_assert(stats.projected_used_bytes >= projected_usage);
+ stats.projected_used_bytes -= projected_usage;
+ background_callback->maybe_wake_blocked_io();
+}
+
+RBMCleaner::clean_space_ret RBMCleaner::clean_space()
+{
+ // TODO
+ return clean_space_ertr::now();
+}
+
+RBMCleaner::mount_ret RBMCleaner::mount()
+{
+ stats = {};
+ register_metrics();
+ return seastar::do_with(
+ rb_group->get_rb_managers(),
+ [](auto &rbs) {
+ return crimson::do_for_each(
+ rbs.begin(),
+ rbs.end(),
+ [](auto& it) {
+ return it->open(
+ ).handle_error(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all{
+ "Invalid error when opening RBM"}
+ );
+ });
+ });
+}
+
+bool RBMCleaner::check_usage()
+{
+ assert(detailed);
+ const auto& rbms = rb_group->get_rb_managers();
+ RBMSpaceTracker tracker(rbms);
+ extent_callback->with_transaction_weak(
+ "check_usage",
+ [this, &tracker, &rbms](auto &t) {
+ return backref_manager.scan_mapped_space(
+ t,
+ [&tracker, &rbms](
+ paddr_t paddr,
+ paddr_t backref_key,
+ extent_len_t len,
+ extent_types_t type,
+ laddr_t laddr)
+ {
+ for (auto rbm : rbms) {
+ if (rbm->get_device_id() == paddr.get_device_id()) {
+ if (is_backref_node(type)) {
+ assert(laddr == L_ADDR_NULL);
+ assert(backref_key != P_ADDR_NULL);
+ tracker.allocate(
+ paddr,
+ len);
+ } else if (laddr == L_ADDR_NULL) {
+ assert(backref_key == P_ADDR_NULL);
+ tracker.release(
+ paddr,
+ len);
+ } else {
+ assert(backref_key == P_ADDR_NULL);
+ tracker.allocate(
+ paddr,
+ len);
+ }
+ }
+ }
+ });
+ }).unsafe_get0();
+ return equals(tracker);
+}
+
+bool RBMCleaner::equals(const RBMSpaceTracker &_other) const
+{
+  LOG_PREFIX(RBMCleaner::equals);
+ const auto &other = static_cast<const RBMSpaceTracker&>(_other);
+ auto rbs = rb_group->get_rb_managers();
+ //TODO: multiple rbm allocator
+ auto rbm = rbs[0];
+ assert(rbm);
+
+ if (rbm->get_device()->get_available_size() / rbm->get_block_size()
+ != other.block_usage.size()) {
+ assert(0 == "block counts should match");
+ return false;
+ }
+ bool all_match = true;
+ for (auto i = other.block_usage.begin();
+ i != other.block_usage.end(); ++i) {
+ if (i->first < rbm->get_start().as_blk_paddr().get_device_off()) {
+ continue;
+ }
+ auto addr = i->first;
+ auto state = rbm->get_extent_state(
+ convert_abs_addr_to_paddr(addr, rbm->get_device_id()),
+ rbm->get_block_size());
+ if ((i->second.used && state == rbm_extent_state_t::ALLOCATED) ||
+ (!i->second.used && (state == rbm_extent_state_t::FREE ||
+ state == rbm_extent_state_t::RESERVED))) {
+ // pass
+ } else {
+ all_match = false;
+ ERROR("block addr {} mismatch other used: {}",
+ addr, i->second.used);
+ }
+ }
+ return all_match;
+}
+
+void RBMCleaner::register_metrics()
+{
+ namespace sm = seastar::metrics;
+
+ metrics.add_group("rbm_cleaner", {
+ sm::make_counter("total_bytes",
+ [this] { return get_total_bytes(); },
+ sm::description("the size of the space")),
+ sm::make_counter("available_bytes",
+ [this] { return get_total_bytes() - get_journal_bytes() - stats.used_bytes; },
+ sm::description("the size of the space is available")),
+ sm::make_counter("used_bytes", stats.used_bytes,
+ sm::description("the size of the space occupied by live extents")),
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h
new file mode 100644
index 000000000..fb8e03bb4
--- /dev/null
+++ b/src/crimson/os/seastore/async_cleaner.h
@@ -0,0 +1,1761 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+#include <seastar/core/metrics_types.hh>
+
+#include "common/ceph_time.h"
+
+#include "osd/osd_types.h"
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/segment_manager_group.h"
+#include "crimson/os/seastore/randomblock_manager_group.h"
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/segment_seq_allocator.h"
+
+namespace crimson::os::seastore {
+
+/*
+ * segment_info_t
+ *
+ * Maintains the tracked information for a segment.
+ * It is read-only outside segments_info_t.
+ */
+struct segment_info_t {
+ segment_id_t id = NULL_SEG_ID;
+
+  // segment_info_t is initialized as set_empty()
+ Segment::segment_state_t state = Segment::segment_state_t::EMPTY;
+
+ // Will be non-null for any segments in the current journal
+ segment_seq_t seq = NULL_SEG_SEQ;
+
+ segment_type_t type = segment_type_t::NULL_SEG;
+
+ data_category_t category = data_category_t::NUM;
+
+ rewrite_gen_t generation = NULL_GENERATION;
+
+ sea_time_point modify_time = NULL_TIME;
+
+ std::size_t num_extents = 0;
+
+ segment_off_t written_to = 0;
+
+ bool is_in_journal(journal_seq_t tail_committed) const {
+ return type == segment_type_t::JOURNAL &&
+ tail_committed.segment_seq <= seq;
+ }
+
+ bool is_empty() const {
+ return state == Segment::segment_state_t::EMPTY;
+ }
+
+ bool is_closed() const {
+ return state == Segment::segment_state_t::CLOSED;
+ }
+
+ bool is_open() const {
+ return state == Segment::segment_state_t::OPEN;
+ }
+
+ void init_closed(segment_seq_t, segment_type_t,
+ data_category_t, rewrite_gen_t,
+ segment_off_t);
+
+ void set_open(segment_seq_t, segment_type_t,
+ data_category_t, rewrite_gen_t);
+
+ void set_empty();
+
+ void set_closed();
+
+ void update_modify_time(sea_time_point _modify_time, std::size_t _num_extents) {
+ ceph_assert(!is_closed());
+ assert(_modify_time != NULL_TIME);
+ assert(_num_extents != 0);
+ if (modify_time == NULL_TIME) {
+ modify_time = _modify_time;
+ num_extents = _num_extents;
+ } else {
+ modify_time = get_average_time(
+ modify_time, num_extents, _modify_time, _num_extents);
+ num_extents += _num_extents;
+ }
+ }
+};
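+
+// Illustrative note (not part of the original change): update_modify_time()
+// keeps a running, extent-count-weighted average of the segment's modify time.
+// For example, merging 3 extents recorded at t=100s with 1 extent at t=200s is
+// expected to yield
+//
+//   (100s * 3 + 200s * 1) / (3 + 1) = 125s, with num_extents == 4,
+//
+// assuming get_average_time() computes the usual count-weighted mean.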
+
+std::ostream& operator<<(std::ostream&, const segment_info_t&);
+
+/*
+ * segments_info_t
+ *
+ * Keep track of all segments and related information.
+ */
+class segments_info_t {
+public:
+ segments_info_t() {
+ reset();
+ }
+
+ const segment_info_t& operator[](segment_id_t id) const {
+ return segments[id];
+ }
+
+ auto begin() const {
+ return segments.begin();
+ }
+
+ auto end() const {
+ return segments.end();
+ }
+
+ std::size_t get_num_segments() const {
+ assert(segments.size() > 0);
+ return segments.size();
+ }
+ segment_off_t get_segment_size() const {
+ assert(segment_size > 0);
+ return segment_size;
+ }
+ std::size_t get_num_in_journal_open() const {
+ return num_in_journal_open;
+ }
+ std::size_t get_num_type_journal() const {
+ return num_type_journal;
+ }
+ std::size_t get_num_type_ool() const {
+ return num_type_ool;
+ }
+ std::size_t get_num_open() const {
+ return num_open;
+ }
+ std::size_t get_num_empty() const {
+ return num_empty;
+ }
+ std::size_t get_num_closed() const {
+ return num_closed;
+ }
+ std::size_t get_count_open_journal() const {
+ return count_open_journal;
+ }
+ std::size_t get_count_open_ool() const {
+ return count_open_ool;
+ }
+ std::size_t get_count_release_journal() const {
+ return count_release_journal;
+ }
+ std::size_t get_count_release_ool() const {
+ return count_release_ool;
+ }
+ std::size_t get_count_close_journal() const {
+ return count_close_journal;
+ }
+ std::size_t get_count_close_ool() const {
+ return count_close_ool;
+ }
+
+ std::size_t get_total_bytes() const {
+ return total_bytes;
+ }
+ /// the available space that is writable, including in open segments
+ std::size_t get_available_bytes() const {
+ return num_empty * get_segment_size() + avail_bytes_in_open;
+ }
+ /// the unavailable space that is not writable
+ std::size_t get_unavailable_bytes() const {
+ assert(total_bytes >= get_available_bytes());
+ return total_bytes - get_available_bytes();
+ }
+ std::size_t get_available_bytes_in_open() const {
+ return avail_bytes_in_open;
+ }
+ double get_available_ratio() const {
+ return (double)get_available_bytes() / (double)total_bytes;
+ }
+
+ journal_seq_t get_submitted_journal_head() const {
+ if (unlikely(journal_segment_id == NULL_SEG_ID)) {
+ return JOURNAL_SEQ_NULL;
+ }
+ auto &segment_info = segments[journal_segment_id];
+ assert(!segment_info.is_empty());
+ assert(segment_info.type == segment_type_t::JOURNAL);
+ assert(segment_info.seq != NULL_SEG_SEQ);
+ return journal_seq_t{
+ segment_info.seq,
+ paddr_t::make_seg_paddr(
+ journal_segment_id,
+ segment_info.written_to)
+ };
+ }
+
+ sea_time_point get_time_bound() const {
+ if (!modify_times.empty()) {
+ return *modify_times.begin();
+ } else {
+ return NULL_TIME;
+ }
+ }
+
+ void reset();
+
+ void add_segment_manager(SegmentManager &segment_manager);
+
+ void assign_ids() {
+ for (auto &item : segments) {
+ item.second.id = item.first;
+ }
+ }
+
+  // initialize non-empty segments; the others are empty by default
+ void init_closed(segment_id_t, segment_seq_t, segment_type_t,
+ data_category_t, rewrite_gen_t);
+
+ void mark_open(segment_id_t, segment_seq_t, segment_type_t,
+ data_category_t, rewrite_gen_t);
+
+ void mark_empty(segment_id_t);
+
+ void mark_closed(segment_id_t);
+
+ void update_written_to(segment_type_t, paddr_t);
+
+ void update_modify_time(
+ segment_id_t id, sea_time_point tp, std::size_t num) {
+ if (num == 0) {
+ return;
+ }
+
+ assert(tp != NULL_TIME);
+ segments[id].update_modify_time(tp, num);
+ }
+
+private:
+ // See reset() for member initialization
+ segment_map_t<segment_info_t> segments;
+
+ segment_off_t segment_size;
+
+ segment_id_t journal_segment_id;
+ std::size_t num_in_journal_open;
+ std::size_t num_type_journal;
+ std::size_t num_type_ool;
+
+ std::size_t num_open;
+ std::size_t num_empty;
+ std::size_t num_closed;
+
+ std::size_t count_open_journal;
+ std::size_t count_open_ool;
+ std::size_t count_release_journal;
+ std::size_t count_release_ool;
+ std::size_t count_close_journal;
+ std::size_t count_close_ool;
+
+ std::size_t total_bytes;
+ std::size_t avail_bytes_in_open;
+
+ std::multiset<sea_time_point> modify_times;
+};
+
+std::ostream &operator<<(std::ostream &, const segments_info_t &);
+
+/**
+ * Callback interface for querying extents and operating on transactions.
+ */
+class ExtentCallbackInterface {
+public:
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using base_iertr = trans_iertr<base_ertr>;
+
+ virtual ~ExtentCallbackInterface() = default;
+
+ /// Creates empty transaction
+ /// weak transaction should be type READ
+ virtual TransactionRef create_transaction(
+ Transaction::src_t, const char *name, bool is_weak=false) = 0;
+
+ /// Creates empty transaction with interruptible context
+ template <typename Func>
+ auto with_transaction_intr(
+ Transaction::src_t src,
+ const char* name,
+ Func &&f) {
+ return do_with_transaction_intr<Func, false>(
+ src, name, std::forward<Func>(f));
+ }
+
+ template <typename Func>
+ auto with_transaction_weak(
+ const char* name,
+ Func &&f) {
+ return do_with_transaction_intr<Func, true>(
+ Transaction::src_t::READ, name, std::forward<Func>(f)
+ ).handle_error(
+ crimson::ct_error::eagain::handle([] {
+ ceph_assert(0 == "eagain impossible");
+ }),
+ crimson::ct_error::pass_further_all{}
+ );
+ }
+
+ /// See Cache::get_next_dirty_extents
+ using get_next_dirty_extents_iertr = base_iertr;
+ using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future<
+ std::vector<CachedExtentRef>>;
+ virtual get_next_dirty_extents_ret get_next_dirty_extents(
+ Transaction &t, ///< [in] current transaction
+ journal_seq_t bound,///< [in] return extents with dirty_from < bound
+ size_t max_bytes ///< [in] return up to max_bytes of extents
+ ) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * Updates t with operations moving the passed extents to a new
+ * segment. extent may be invalid, implementation must correctly
+ * handle finding the current instance if it is still alive and
+ * otherwise ignore it.
+ */
+ using rewrite_extent_iertr = base_iertr;
+ using rewrite_extent_ret = rewrite_extent_iertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent,
+ rewrite_gen_t target_generation,
+ sea_time_point modify_time) = 0;
+
+ /**
+   * get_extents_if_live
+   *
+   * Returns the extents at the specified location if still referenced by
+   * lba_manager and not removed by t.
+ *
+ * See TransactionManager::get_extent_if_live and
+ * LBAManager::get_physical_extent_if_live.
+ */
+ using get_extents_if_live_iertr = base_iertr;
+ using get_extents_if_live_ret = get_extents_if_live_iertr::future<
+ std::list<CachedExtentRef>>;
+ virtual get_extents_if_live_ret get_extents_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ extent_len_t len) = 0;
+
+ /**
+ * submit_transaction_direct
+ *
+ * Submits transaction without any space throttling.
+ */
+ using submit_transaction_direct_iertr = base_iertr;
+ using submit_transaction_direct_ret =
+ submit_transaction_direct_iertr::future<>;
+ virtual submit_transaction_direct_ret submit_transaction_direct(
+ Transaction &t,
+ std::optional<journal_seq_t> seq_to_trim = std::nullopt) = 0;
+
+private:
+ template <typename Func, bool IsWeak>
+ auto do_with_transaction_intr(
+ Transaction::src_t src,
+ const char* name,
+ Func &&f) {
+ return seastar::do_with(
+ create_transaction(src, name, IsWeak),
+ [f=std::forward<Func>(f)](auto &ref_t) mutable {
+ return with_trans_intr(
+ *ref_t,
+ [f=std::forward<Func>(f)](auto& t) mutable {
+ return f(t);
+ }
+ );
+ }
+ );
+ }
+};
+
+/**
+ * Callback interface to wake up background works
+ */
+struct BackgroundListener {
+ enum class state_t {
+ STOP,
+ MOUNT,
+ SCAN_SPACE,
+ RUNNING,
+ HALT,
+ };
+
+ virtual ~BackgroundListener() = default;
+ virtual void maybe_wake_background() = 0;
+ virtual void maybe_wake_blocked_io() = 0;
+ virtual state_t get_state() const = 0;
+
+ bool is_ready() const {
+ return get_state() >= state_t::RUNNING;
+ }
+};
+
+/**
+ * Callback interface for Journal
+ */
+class JournalTrimmer {
+public:
+ // get the committed journal head
+ virtual journal_seq_t get_journal_head() const = 0;
+
+ // set the committed journal head
+ virtual void set_journal_head(journal_seq_t) = 0;
+
+ // get the committed journal dirty tail
+ virtual journal_seq_t get_dirty_tail() const = 0;
+
+ // get the committed journal alloc tail
+ virtual journal_seq_t get_alloc_tail() const = 0;
+
+ // set the committed journal tails
+ virtual void update_journal_tails(
+ journal_seq_t dirty_tail, journal_seq_t alloc_tail) = 0;
+
+  // try to reserve the projected usage in the journal;
+  // returns whether the reservation is successful.
+  // if the reservation succeeds, the user should call
+  // release_inline_usage to restore the usage.
+ virtual bool try_reserve_inline_usage(std::size_t usage) = 0;
+
+ // release the projected usage in journal
+ virtual void release_inline_usage(std::size_t usage) = 0;
+
+ virtual ~JournalTrimmer() {}
+
+ journal_seq_t get_journal_tail() const {
+ return std::min(get_alloc_tail(), get_dirty_tail());
+ }
+
+ virtual std::size_t get_trim_size_per_cycle() const = 0;
+
+ bool check_is_ready() const {
+ return (get_journal_head() != JOURNAL_SEQ_NULL &&
+ get_dirty_tail() != JOURNAL_SEQ_NULL &&
+ get_alloc_tail() != JOURNAL_SEQ_NULL);
+ }
+
+ std::size_t get_num_rolls() const {
+ if (!check_is_ready()) {
+ return 0;
+ }
+ assert(get_journal_head().segment_seq >=
+ get_journal_tail().segment_seq);
+ return get_journal_head().segment_seq + 1 -
+ get_journal_tail().segment_seq;
+ }
+};
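+
+// Illustrative note (not part of the original change): get_journal_tail() is
+// the older (smaller-sequence) of the two tails, and get_num_rolls() counts
+// the journal segments spanned by [tail, head]. For example, with
+// head.segment_seq == 7, dirty_tail.segment_seq == 5 and
+// alloc_tail.segment_seq == 3, the tail is the alloc tail and
+// get_num_rolls() == 7 + 1 - 3 == 5.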
+
+class BackrefManager;
+class JournalTrimmerImpl;
+using JournalTrimmerImplRef = std::unique_ptr<JournalTrimmerImpl>;
+
+/**
+ * Journal trimming implementation
+ */
+class JournalTrimmerImpl : public JournalTrimmer {
+public:
+ struct config_t {
+    /// Minimum number of dirty journal bytes at which to stop trimming dirty extents.
+ std::size_t target_journal_dirty_bytes = 0;
+    /// Minimum number of allocation-info journal bytes at which to stop trimming
+    /// allocations (the corresponding backrefs remain unmerged)
+ std::size_t target_journal_alloc_bytes = 0;
+    /// Maximum number of journal bytes; beyond this, user transactions are blocked.
+ std::size_t max_journal_bytes = 0;
+ /// Number of bytes to rewrite dirty per cycle
+ std::size_t rewrite_dirty_bytes_per_cycle = 0;
+ /// Number of bytes to rewrite backref per cycle
+ std::size_t rewrite_backref_bytes_per_cycle = 0;
+
+ void validate() const;
+
+ static config_t get_default(
+ std::size_t roll_size, journal_type_t type);
+
+ static config_t get_test(
+ std::size_t roll_size, journal_type_t type);
+ };
+
+ JournalTrimmerImpl(
+ BackrefManager &backref_manager,
+ config_t config,
+ journal_type_t type,
+ device_off_t roll_start,
+ device_off_t roll_size);
+
+ ~JournalTrimmerImpl() = default;
+
+ /*
+ * JournalTrimmer interfaces
+ */
+
+ journal_seq_t get_journal_head() const final {
+ return journal_head;
+ }
+
+ void set_journal_head(journal_seq_t) final;
+
+ journal_seq_t get_dirty_tail() const final {
+ return journal_dirty_tail;
+ }
+
+ journal_seq_t get_alloc_tail() const final {
+ return journal_alloc_tail;
+ }
+
+ void update_journal_tails(
+ journal_seq_t dirty_tail, journal_seq_t alloc_tail) final;
+
+ std::size_t get_trim_size_per_cycle() const final {
+ return config.rewrite_backref_bytes_per_cycle +
+ config.rewrite_dirty_bytes_per_cycle;
+ }
+
+ journal_type_t get_journal_type() const {
+ return journal_type;
+ }
+
+ void set_extent_callback(ExtentCallbackInterface *cb) {
+ extent_callback = cb;
+ }
+
+ void set_background_callback(BackgroundListener *cb) {
+ background_callback = cb;
+ }
+
+ void reset() {
+ journal_head = JOURNAL_SEQ_NULL;
+ journal_dirty_tail = JOURNAL_SEQ_NULL;
+ journal_alloc_tail = JOURNAL_SEQ_NULL;
+ }
+
+ bool should_trim() const {
+ return should_trim_alloc() || should_trim_dirty();
+ }
+
+ bool should_block_io_on_trim() const {
+ return get_tail_limit() >
+ get_journal_tail().add_offset(
+ journal_type, reserved_usage, roll_start, roll_size);
+ }
+
+ bool try_reserve_inline_usage(std::size_t usage) final {
+ reserved_usage += usage;
+ if (should_block_io_on_trim()) {
+ reserved_usage -= usage;
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ void release_inline_usage(std::size_t usage) final {
+ ceph_assert(reserved_usage >= usage);
+ reserved_usage -= usage;
+ }
+
+ seastar::future<> trim();
+
+ static JournalTrimmerImplRef create(
+ BackrefManager &backref_manager,
+ config_t config,
+ journal_type_t type,
+ device_off_t roll_start,
+ device_off_t roll_size) {
+ return std::make_unique<JournalTrimmerImpl>(
+ backref_manager, config, type, roll_start, roll_size);
+ }
+
+ struct stat_printer_t {
+ const JournalTrimmerImpl &trimmer;
+ bool detailed = false;
+ };
+ friend std::ostream &operator<<(std::ostream &, const stat_printer_t &);
+
+private:
+ bool should_trim_dirty() const {
+ return get_dirty_tail_target() > journal_dirty_tail;
+ }
+
+ bool should_trim_alloc() const {
+ return get_alloc_tail_target() > journal_alloc_tail;
+ }
+
+ using trim_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ trim_ertr::future<> trim_dirty();
+
+ trim_ertr::future<> trim_alloc();
+
+ journal_seq_t get_tail_limit() const;
+ journal_seq_t get_dirty_tail_target() const;
+ journal_seq_t get_alloc_tail_target() const;
+ std::size_t get_dirty_journal_size() const;
+ std::size_t get_alloc_journal_size() const;
+ void register_metrics();
+
+ ExtentCallbackInterface *extent_callback = nullptr;
+ BackgroundListener *background_callback = nullptr;
+ BackrefManager &backref_manager;
+
+ config_t config;
+ journal_type_t journal_type;
+ device_off_t roll_start;
+ device_off_t roll_size;
+
+ journal_seq_t journal_head;
+ journal_seq_t journal_dirty_tail;
+ journal_seq_t journal_alloc_tail;
+
+ std::size_t reserved_usage;
+
+ seastar::metrics::metric_group metrics;
+};
+
+std::ostream &operator<<(
+ std::ostream &, const JournalTrimmerImpl::stat_printer_t &);
+
+/**
+ * Callback interface for managing available segments
+ */
+class SegmentProvider {
+public:
+ virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0;
+
+ virtual segment_id_t allocate_segment(
+ segment_seq_t, segment_type_t, data_category_t, rewrite_gen_t) = 0;
+
+ virtual void close_segment(segment_id_t) = 0;
+
+ // set the submitted segment writes in order
+ virtual void update_segment_avail_bytes(segment_type_t, paddr_t) = 0;
+
+ virtual void update_modify_time(
+ segment_id_t, sea_time_point, std::size_t) = 0;
+
+ virtual SegmentManagerGroup* get_segment_manager_group() = 0;
+
+ virtual ~SegmentProvider() {}
+};
+
+class SpaceTrackerI {
+public:
+ virtual int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) = 0;
+
+ virtual int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) = 0;
+
+ virtual int64_t get_usage(
+ segment_id_t segment) const = 0;
+
+ virtual bool equals(const SpaceTrackerI &other) const = 0;
+
+ virtual std::unique_ptr<SpaceTrackerI> make_empty() const = 0;
+
+ virtual void dump_usage(segment_id_t) const = 0;
+
+ virtual double calc_utilization(segment_id_t segment) const = 0;
+
+ virtual void reset() = 0;
+
+ virtual ~SpaceTrackerI() = default;
+};
+using SpaceTrackerIRef = std::unique_ptr<SpaceTrackerI>;
+
+class SpaceTrackerSimple : public SpaceTrackerI {
+ struct segment_bytes_t {
+ int64_t live_bytes = 0;
+ segment_off_t total_bytes = 0;
+ };
+ // Tracks live space for each segment
+ segment_map_t<segment_bytes_t> live_bytes_by_segment;
+
+ int64_t update_usage(segment_id_t segment, int64_t delta) {
+ live_bytes_by_segment[segment].live_bytes += delta;
+ assert(live_bytes_by_segment[segment].live_bytes >= 0);
+ return live_bytes_by_segment[segment].live_bytes;
+ }
+public:
+ SpaceTrackerSimple(const SpaceTrackerSimple &) = default;
+ SpaceTrackerSimple(const std::vector<SegmentManager*> &sms) {
+ for (auto sm : sms) {
+ live_bytes_by_segment.add_device(
+ sm->get_device_id(),
+ sm->get_num_segments(),
+ {0, sm->get_segment_size()});
+ }
+ }
+
+ int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ return update_usage(segment, len);
+ }
+
+ int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ return update_usage(segment, -(int64_t)len);
+ }
+
+ int64_t get_usage(segment_id_t segment) const final {
+ return live_bytes_by_segment[segment].live_bytes;
+ }
+
+ double calc_utilization(segment_id_t segment) const final {
+ auto& seg_bytes = live_bytes_by_segment[segment];
+ return (double)seg_bytes.live_bytes / (double)seg_bytes.total_bytes;
+ }
+
+ void dump_usage(segment_id_t) const final;
+
+ void reset() final {
+ for (auto &i : live_bytes_by_segment) {
+ i.second = {0, 0};
+ }
+ }
+
+ SpaceTrackerIRef make_empty() const final {
+ auto ret = SpaceTrackerIRef(new SpaceTrackerSimple(*this));
+ ret->reset();
+ return ret;
+ }
+
+ bool equals(const SpaceTrackerI &other) const;
+};
+
+class SpaceTrackerDetailed : public SpaceTrackerI {
+ class SegmentMap {
+ int64_t used = 0;
+ segment_off_t total_bytes = 0;
+ std::vector<bool> bitmap;
+
+ public:
+ SegmentMap(
+ size_t blocks,
+ segment_off_t total_bytes)
+ : total_bytes(total_bytes),
+ bitmap(blocks, false) {}
+
+ int64_t update_usage(int64_t delta) {
+ used += delta;
+ return used;
+ }
+
+ int64_t allocate(
+ device_segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size);
+
+ int64_t release(
+ device_segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size);
+
+ int64_t get_usage() const {
+ return used;
+ }
+
+ void dump_usage(extent_len_t block_size) const;
+
+ double calc_utilization() const {
+ return (double)used / (double)total_bytes;
+ }
+
+ void reset() {
+ used = 0;
+ for (auto &&i: bitmap) {
+ i = false;
+ }
+ }
+ };
+
+ // Tracks live space for each segment
+ segment_map_t<SegmentMap> segment_usage;
+ std::vector<size_t> block_size_by_segment_manager;
+
+public:
+ SpaceTrackerDetailed(const SpaceTrackerDetailed &) = default;
+ SpaceTrackerDetailed(const std::vector<SegmentManager*> &sms)
+ {
+ block_size_by_segment_manager.resize(DEVICE_ID_MAX, 0);
+ for (auto sm : sms) {
+ segment_usage.add_device(
+ sm->get_device_id(),
+ sm->get_num_segments(),
+ SegmentMap(
+ sm->get_segment_size() / sm->get_block_size(),
+ sm->get_segment_size()));
+ block_size_by_segment_manager[sm->get_device_id()] = sm->get_block_size();
+ }
+ }
+
+ int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ return segment_usage[segment].allocate(
+ segment.device_segment_id(),
+ offset,
+ len,
+ block_size_by_segment_manager[segment.device_id()]);
+ }
+
+ int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ return segment_usage[segment].release(
+ segment.device_segment_id(),
+ offset,
+ len,
+ block_size_by_segment_manager[segment.device_id()]);
+ }
+
+ int64_t get_usage(segment_id_t segment) const final {
+ return segment_usage[segment].get_usage();
+ }
+
+ double calc_utilization(segment_id_t segment) const final {
+ return segment_usage[segment].calc_utilization();
+ }
+
+ void dump_usage(segment_id_t seg) const final;
+
+ void reset() final {
+ for (auto &i: segment_usage) {
+ i.second.reset();
+ }
+ }
+
+ SpaceTrackerIRef make_empty() const final {
+ auto ret = SpaceTrackerIRef(new SpaceTrackerDetailed(*this));
+ ret->reset();
+ return ret;
+ }
+
+ bool equals(const SpaceTrackerI &other) const;
+};
+
+template <typename T>
+class block_map_t {
+public:
+ block_map_t() {
+ device_to_blocks.resize(DEVICE_ID_MAX_VALID);
+ device_block_size.resize(DEVICE_ID_MAX_VALID);
+ }
+ void add_device(device_id_t device, std::size_t blocks, const T& init,
+ size_t block_size) {
+ ceph_assert(device <= DEVICE_ID_MAX_VALID);
+ ceph_assert(device_to_blocks[device].size() == 0);
+ ceph_assert(blocks > 0);
+ device_to_blocks[device].resize(blocks, init);
+ total_blocks += blocks;
+ device_block_size[device] = block_size;
+ }
+ void clear() {
+ device_to_blocks.clear();
+ device_to_blocks.resize(DEVICE_ID_MAX_VALID);
+ total_blocks = 0;
+ }
+
+ T& operator[](paddr_t block) {
+ ceph_assert(device_to_blocks[block.get_device_id()].size() != 0);
+ auto &blk = block.as_blk_paddr();
+ auto block_id = get_block_id(block.get_device_id(), blk.get_device_off());
+ return device_to_blocks[block.get_device_id()][block_id];
+ }
+ const T& operator[](paddr_t block) const {
+ ceph_assert(device_to_blocks[block.get_device_id()].size() != 0);
+ auto &blk = block.as_blk_paddr();
+ auto block_id = get_block_id(block.get_device_id(), blk.get_device_off());
+ return device_to_blocks[block.get_device_id()][block_id];
+ }
+
+ auto begin() {
+ return iterator<false>::lower_bound(*this, 0, 0);
+ }
+ auto begin() const {
+ return iterator<true>::lower_bound(*this, 0, 0);
+ }
+
+ auto end() {
+ return iterator<false>::end_iterator(*this);
+ }
+ auto end() const {
+ return iterator<true>::end_iterator(*this);
+ }
+
+ size_t size() const {
+ return total_blocks;
+ }
+
+ uint64_t get_block_size(device_id_t device_id) {
+ return device_block_size[device_id];
+ }
+
+ uint32_t get_block_id(device_id_t device_id, device_off_t blk_off) const {
+ auto block_size = device_block_size[device_id];
+ return blk_off == 0 ? 0 : blk_off/block_size;
+ }
+
+ template <bool is_const = false>
+ class iterator {
+ /// points at set being iterated over
+ std::conditional_t<
+ is_const,
+ const block_map_t &,
+ block_map_t &> parent;
+
+ /// points at current device, or DEVICE_ID_MAX_VALID if is_end()
+ device_id_t device_id;
+
+    /// block offset at which we are pointing, 0 if is_end()
+ device_off_t blk_off;
+
+ /// holds referent for operator* and operator-> when !is_end()
+ std::optional<
+ std::pair<
+ const device_off_t,
+ std::conditional_t<is_const, const T&, T&>
+ >> current;
+
+ bool is_end() const {
+ return device_id == DEVICE_ID_MAX_VALID;
+ }
+
+ uint32_t get_block_id() {
+ return parent.get_block_id(device_id, blk_off);
+ }
+
+ void find_valid() {
+ assert(!is_end());
+ auto &device_vec = parent.device_to_blocks[device_id];
+ if (device_vec.size() == 0 ||
+ get_block_id() == device_vec.size()) {
+      while (++device_id < DEVICE_ID_MAX_VALID &&
+             parent.device_to_blocks[device_id].size() == 0);
+ blk_off = 0;
+ }
+ if (is_end()) {
+ current = std::nullopt;
+ } else {
+ current.emplace(
+ blk_off,
+ parent.device_to_blocks[device_id][get_block_id()]
+ );
+ }
+ }
+
+ iterator(
+ decltype(parent) &parent,
+ device_id_t device_id,
+ device_off_t device_block_off)
+ : parent(parent), device_id(device_id),
+ blk_off(device_block_off) {}
+
+ public:
+ static iterator lower_bound(
+ decltype(parent) &parent,
+ device_id_t device_id,
+ device_off_t block_off) {
+ if (device_id == DEVICE_ID_MAX_VALID) {
+ return end_iterator(parent);
+ } else {
+ auto ret = iterator{parent, device_id, block_off};
+ ret.find_valid();
+ return ret;
+ }
+ }
+
+ static iterator end_iterator(
+ decltype(parent) &parent) {
+ return iterator{parent, DEVICE_ID_MAX_VALID, 0};
+ }
+
+ iterator<is_const>& operator++() {
+ assert(!is_end());
+ auto block_size = parent.device_block_size[device_id];
+ blk_off += block_size;
+ find_valid();
+ return *this;
+ }
+
+ bool operator==(iterator<is_const> rit) {
+ return (device_id == rit.device_id &&
+ blk_off == rit.blk_off);
+ }
+
+ bool operator!=(iterator<is_const> rit) {
+ return !(*this == rit);
+ }
+ template <bool c = is_const, std::enable_if_t<c, int> = 0>
+ const std::pair<const device_off_t, const T&> *operator->() {
+ assert(!is_end());
+ return &*current;
+ }
+ template <bool c = is_const, std::enable_if_t<!c, int> = 0>
+ std::pair<const device_off_t, T&> *operator->() {
+ assert(!is_end());
+ return &*current;
+ }
+ template <bool c = is_const, std::enable_if_t<c, int> = 0>
+ const std::pair<const device_off_t, const T&> &operator*() {
+ assert(!is_end());
+ return *current;
+ }
+ template <bool c = is_const, std::enable_if_t<!c, int> = 0>
+ std::pair<const device_off_t, T&> &operator*() {
+ assert(!is_end());
+ return *current;
+ }
+ };
+ std::vector<std::vector<T>> device_to_blocks;
+ std::vector<size_t> device_block_size;
+ size_t total_blocks = 0;
+};
+
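+// A minimal sketch (illustrative, not part of the interface above) of how
+// block_map_t is used: add_device() registers a device's block count and
+// block size, and operator[] indexes the entry at device_off / block_size.
+// The value type and the device_id/num_blocks/block_size/blk_addr names below
+// are assumptions for illustration only.
+//
+//   struct blk_t { bool used = false; };
+//   block_map_t<blk_t> m;
+//   m.add_device(device_id, num_blocks, blk_t{}, block_size);
+//   m[blk_addr].used = true;  // blk_addr is an absolute BLOCK paddr_t
+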
+class RBMSpaceTracker {
+ struct random_block_t {
+ bool used = false;
+ void allocate() {
+ used = true;
+ }
+ void release() {
+ used = false;
+ }
+ };
+ block_map_t<random_block_t> block_usage;
+
+public:
+ RBMSpaceTracker(const RBMSpaceTracker &) = default;
+ RBMSpaceTracker(const std::vector<RandomBlockManager*> &rbms) {
+ for (auto rbm : rbms) {
+ block_usage.add_device(
+ rbm->get_device_id(),
+ rbm->get_device()->get_available_size() / rbm->get_block_size(),
+ {false},
+ rbm->get_block_size());
+ }
+ }
+
+ void allocate(
+ paddr_t addr,
+ extent_len_t len) {
+ paddr_t cursor = addr;
+ paddr_t end = addr.add_offset(len);
+ do {
+ block_usage[cursor].allocate();
+ cursor = cursor.add_offset(
+ block_usage.get_block_size(addr.get_device_id()));
+ } while (cursor < end);
+ }
+
+ void release(
+ paddr_t addr,
+ extent_len_t len) {
+ paddr_t cursor = addr;
+ paddr_t end = addr.add_offset(len);
+ do {
+ block_usage[cursor].release();
+ cursor = cursor.add_offset(
+ block_usage.get_block_size(addr.get_device_id()));
+ } while (cursor < end);
+ }
+
+ void reset() {
+ for (auto &i : block_usage) {
+ i.second = {false};
+ }
+ }
+
+ std::unique_ptr<RBMSpaceTracker> make_empty() const {
+ auto ret = std::make_unique<RBMSpaceTracker>(*this);
+ ret->reset();
+ return ret;
+ }
+ friend class RBMCleaner;
+};
+using RBMSpaceTrackerRef = std::unique_ptr<RBMSpaceTracker>;
+
+/*
+ * AsyncCleaner
+ *
+ * Interface for ExtentPlacementManager::BackgroundProcess
+ * to do background cleaning.
+ */
+class AsyncCleaner {
+public:
+ using state_t = BackgroundListener::state_t;
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+ virtual void set_background_callback(BackgroundListener *) = 0;
+
+ virtual void set_extent_callback(ExtentCallbackInterface *) = 0;
+
+ virtual store_statfs_t get_stat() const = 0;
+
+ virtual void print(std::ostream &, bool is_detailed) const = 0;
+
+ virtual bool check_usage_is_empty() const = 0;
+
+ using mount_ertr = base_ertr;
+ using mount_ret = mount_ertr::future<>;
+ virtual mount_ret mount() = 0;
+
+ virtual void mark_space_used(paddr_t, extent_len_t) = 0;
+
+ virtual void mark_space_free(paddr_t, extent_len_t) = 0;
+
+ virtual void commit_space_used(paddr_t, extent_len_t) = 0;
+
+  // try to reserve the projected usage in the cleaner;
+  // returns whether the reservation succeeded.
+  // On success, the caller must later call release_projected_usage
+  // to restore the projected usage (see the sketch after this interface).
+ virtual bool try_reserve_projected_usage(std::size_t) = 0;
+
+ virtual void release_projected_usage(std::size_t) = 0;
+
+ virtual bool should_block_io_on_clean() const = 0;
+
+ virtual bool can_clean_space() const = 0;
+
+ virtual bool should_clean_space() const = 0;
+
+ using clean_space_ertr = base_ertr;
+ using clean_space_ret = clean_space_ertr::future<>;
+ virtual clean_space_ret clean_space() = 0;
+
+ virtual const std::set<device_id_t>& get_device_ids() const = 0;
+
+ virtual std::size_t get_reclaim_size_per_cycle() const = 0;
+
+ // test only
+ virtual bool check_usage() = 0;
+
+ struct stat_printer_t {
+ const AsyncCleaner &cleaner;
+ bool detailed = false;
+ };
+
+ virtual ~AsyncCleaner() {}
+};
+
+using AsyncCleanerRef = std::unique_ptr<AsyncCleaner>;
+
+std::ostream &operator<<(
+ std::ostream &, const AsyncCleaner::stat_printer_t &);
+
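+// A minimal sketch (not an actual caller) of the projected-usage protocol on
+// AsyncCleaner; `cleaner` and `projected_bytes` are illustrative names:
+//
+//   if (cleaner.try_reserve_projected_usage(projected_bytes)) {
+//     // ... submit the transaction ...
+//     cleaner.release_projected_usage(projected_bytes);
+//   } else {
+//     // back off and let background cleaning make progress
+//   }
+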
+class SegmentCleaner;
+using SegmentCleanerRef = std::unique_ptr<SegmentCleaner>;
+
+class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
+public:
+ /// Config
+ struct config_t {
+ /// Ratio of maximum available space to disable reclaiming.
+ double available_ratio_gc_max = 0;
+ /// Ratio of minimum available space to force reclaiming.
+ double available_ratio_hard_limit = 0;
+ /// Ratio of minimum reclaimable space to stop reclaiming.
+ double reclaim_ratio_gc_threshold = 0;
+ /// Number of bytes to reclaim per cycle
+ std::size_t reclaim_bytes_per_cycle = 0;
+
+ void validate() const {
+ ceph_assert(available_ratio_gc_max > available_ratio_hard_limit);
+ ceph_assert(reclaim_bytes_per_cycle > 0);
+ }
+
+ static config_t get_default() {
+ return config_t{
+ .15, // available_ratio_gc_max
+ .1, // available_ratio_hard_limit
+ .1, // reclaim_ratio_gc_threshold
+ 1<<20 // reclaim_bytes_per_cycle
+ };
+ }
+
+ static config_t get_test() {
+ return config_t{
+ .99, // available_ratio_gc_max
+ .2, // available_ratio_hard_limit
+ .6, // reclaim_ratio_gc_threshold
+ 1<<20 // reclaim_bytes_per_cycle
+ };
+ }
+ };
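+
+  // With get_default(), for example, cleaning is forced once the available
+  // ratio drops below 10%, and is triggered below 15% available when more
+  // than 10% of the unavailable space is no longer live (see
+  // should_clean_space() below); the numbers are simply the defaults above.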
+
+ SegmentCleaner(
+ config_t config,
+ SegmentManagerGroupRef&& sm_group,
+ BackrefManager &backref_manager,
+ SegmentSeqAllocator &segment_seq_allocator,
+ bool detailed,
+ bool is_cold);
+
+ void set_journal_trimmer(JournalTrimmer &_trimmer) {
+ trimmer = &_trimmer;
+ }
+
+ static SegmentCleanerRef create(
+ config_t config,
+ SegmentManagerGroupRef&& sm_group,
+ BackrefManager &backref_manager,
+ SegmentSeqAllocator &ool_seq_allocator,
+ bool detailed,
+ bool is_cold = false) {
+ return std::make_unique<SegmentCleaner>(
+ config, std::move(sm_group), backref_manager,
+ ool_seq_allocator, detailed, is_cold);
+ }
+
+ /*
+ * SegmentProvider interfaces
+ */
+
+ const segment_info_t& get_seg_info(segment_id_t id) const final {
+ return segments[id];
+ }
+
+ segment_id_t allocate_segment(
+ segment_seq_t, segment_type_t, data_category_t, rewrite_gen_t) final;
+
+ void close_segment(segment_id_t segment) final;
+
+ void update_segment_avail_bytes(segment_type_t type, paddr_t offset) final {
+ assert(type == segment_type_t::OOL ||
+ trimmer != nullptr); // segment_type_t::JOURNAL
+ segments.update_written_to(type, offset);
+ background_callback->maybe_wake_background();
+ }
+
+ void update_modify_time(
+ segment_id_t id, sea_time_point tp, std::size_t num_extents) final {
+ ceph_assert(num_extents == 0 || tp != NULL_TIME);
+ segments.update_modify_time(id, tp, num_extents);
+ }
+
+ SegmentManagerGroup* get_segment_manager_group() final {
+ return sm_group.get();
+ }
+
+ /*
+ * AsyncCleaner interfaces
+ */
+
+ void set_background_callback(BackgroundListener *cb) final {
+ background_callback = cb;
+ }
+
+ void set_extent_callback(ExtentCallbackInterface *cb) final {
+ extent_callback = cb;
+ }
+
+ store_statfs_t get_stat() const final {
+ store_statfs_t st;
+ st.total = segments.get_total_bytes();
+ st.available = segments.get_total_bytes() - stats.used_bytes;
+ st.allocated = stats.used_bytes;
+ st.data_stored = stats.used_bytes;
+
+ // TODO add per extent type counters for omap_allocated and
+ // internal metadata
+ return st;
+ }
+
+ void print(std::ostream &, bool is_detailed) const final;
+
+ bool check_usage_is_empty() const final {
+ return space_tracker->equals(*space_tracker->make_empty());
+ }
+
+ mount_ret mount() final;
+
+ void mark_space_used(paddr_t, extent_len_t) final;
+
+ void mark_space_free(paddr_t, extent_len_t) final;
+
+ void commit_space_used(paddr_t addr, extent_len_t len) final {
+ mark_space_used(addr, len);
+ }
+
+ bool try_reserve_projected_usage(std::size_t) final;
+
+ void release_projected_usage(size_t) final;
+
+ bool should_block_io_on_clean() const final {
+ assert(background_callback->is_ready());
+ if (get_segments_reclaimable() == 0) {
+ return false;
+ }
+ auto aratio = get_projected_available_ratio();
+ return aratio < config.available_ratio_hard_limit;
+ }
+
+ bool can_clean_space() const final {
+ assert(background_callback->is_ready());
+ return get_segments_reclaimable() > 0;
+ }
+
+ bool should_clean_space() const final {
+ assert(background_callback->is_ready());
+ if (get_segments_reclaimable() == 0) {
+ return false;
+ }
+ auto aratio = segments.get_available_ratio();
+ auto rratio = get_reclaim_ratio();
+ return (
+ (aratio < config.available_ratio_hard_limit) ||
+ ((aratio < config.available_ratio_gc_max) &&
+ (rratio > config.reclaim_ratio_gc_threshold))
+ );
+ }
+
+ clean_space_ret clean_space() final;
+
+ const std::set<device_id_t>& get_device_ids() const final {
+ return sm_group->get_device_ids();
+ }
+
+ std::size_t get_reclaim_size_per_cycle() const final {
+ return config.reclaim_bytes_per_cycle;
+ }
+
+ // Testing interfaces
+
+ bool check_usage() final;
+
+private:
+ /*
+ * 10 buckets for the number of closed segments by usage
+ * 2 extra buckets for the number of open and empty segments
+ */
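+  // e.g. (illustrative) a closed segment at utilization 0.37 lands in
+  // bucket 3, while UTIL_STATE_OPEN and UTIL_STATE_EMPTY below map to the
+  // two extra buckets 10 and 11 via get_bucket_index().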
+ static constexpr double UTIL_STATE_OPEN = 1.05;
+ static constexpr double UTIL_STATE_EMPTY = 1.15;
+ static constexpr std::size_t UTIL_BUCKETS = 12;
+ static std::size_t get_bucket_index(double util) {
+ auto index = std::floor(util * 10);
+ assert(index < UTIL_BUCKETS);
+ return index;
+ }
+ double calc_utilization(segment_id_t id) const {
+ auto& info = segments[id];
+ if (info.is_open()) {
+ return UTIL_STATE_OPEN;
+ } else if (info.is_empty()) {
+ return UTIL_STATE_EMPTY;
+ } else {
+ auto ret = space_tracker->calc_utilization(id);
+ assert(ret >= 0 && ret < 1);
+ return ret;
+ }
+ }
+
+ // journal status helpers
+
+ double calc_gc_benefit_cost(
+ segment_id_t id,
+ const sea_time_point &now_time,
+ const sea_time_point &bound_time) const;
+
+ segment_id_t get_next_reclaim_segment() const;
+
+ struct reclaim_state_t {
+ rewrite_gen_t generation;
+ rewrite_gen_t target_generation;
+ segment_off_t segment_size;
+ paddr_t start_pos;
+ paddr_t end_pos;
+
+ static reclaim_state_t create(
+ segment_id_t segment_id,
+ rewrite_gen_t generation,
+ segment_off_t segment_size) {
+ ceph_assert(is_rewrite_generation(generation));
+
+ rewrite_gen_t target_gen;
+ if (generation < MIN_REWRITE_GENERATION) {
+ target_gen = MIN_REWRITE_GENERATION;
+ } else {
+      // allow target_gen to exceed MAX_REWRITE_GENERATION so that EPM
+      // remains aware of the original generation when making decisions.
+ target_gen = generation + 1;
+ }
+
+ assert(is_target_rewrite_generation(target_gen));
+ return {generation,
+ target_gen,
+ segment_size,
+ P_ADDR_NULL,
+ paddr_t::make_seg_paddr(segment_id, 0)};
+ }
+
+ segment_id_t get_segment_id() const {
+ return end_pos.as_seg_paddr().get_segment_id();
+ }
+
+ bool is_complete() const {
+ return end_pos.as_seg_paddr().get_segment_off() >= segment_size;
+ }
+
+ void advance(std::size_t bytes) {
+ assert(!is_complete());
+ start_pos = end_pos;
+ auto &end_seg_paddr = end_pos.as_seg_paddr();
+ auto next_off = end_seg_paddr.get_segment_off() + bytes;
+ if (next_off > (std::size_t)segment_size) {
+ end_seg_paddr.set_segment_off(segment_size);
+ } else {
+ end_seg_paddr.set_segment_off(next_off);
+ }
+ }
+ };
+ std::optional<reclaim_state_t> reclaim_state;
+
+ using do_reclaim_space_ertr = base_ertr;
+ using do_reclaim_space_ret = do_reclaim_space_ertr::future<>;
+ do_reclaim_space_ret do_reclaim_space(
+ const std::vector<CachedExtentRef> &backref_extents,
+ const backref_pin_list_t &pin_list,
+ std::size_t &reclaimed,
+ std::size_t &runs);
+
+ /*
+ * Segments calculations
+ */
+ std::size_t get_segments_in_journal() const {
+ if (trimmer != nullptr) {
+ return trimmer->get_num_rolls();
+ } else {
+ return 0;
+ }
+ }
+ std::size_t get_segments_in_journal_closed() const {
+ auto in_journal = get_segments_in_journal();
+ auto in_journal_open = segments.get_num_in_journal_open();
+ if (in_journal >= in_journal_open) {
+ return in_journal - in_journal_open;
+ } else {
+ return 0;
+ }
+ }
+ std::size_t get_segments_reclaimable() const {
+ assert(segments.get_num_closed() >= get_segments_in_journal_closed());
+ return segments.get_num_closed() - get_segments_in_journal_closed();
+ }
+
+ /*
+ * Space calculations
+ */
+ /// the unavailable space that is not reclaimable yet
+ std::size_t get_unavailable_unreclaimable_bytes() const {
+ auto ret = (segments.get_num_open() + get_segments_in_journal_closed()) *
+ segments.get_segment_size();
+ assert(ret >= segments.get_available_bytes_in_open());
+ return ret - segments.get_available_bytes_in_open();
+ }
+ /// the unavailable space that can be reclaimed
+ std::size_t get_unavailable_reclaimable_bytes() const {
+ auto ret = get_segments_reclaimable() * segments.get_segment_size();
+ ceph_assert(ret + get_unavailable_unreclaimable_bytes() == segments.get_unavailable_bytes());
+ return ret;
+ }
+ /// the unavailable space that is not alive
+ std::size_t get_unavailable_unused_bytes() const {
+ assert(segments.get_unavailable_bytes() > stats.used_bytes);
+ return segments.get_unavailable_bytes() - stats.used_bytes;
+ }
+ double get_reclaim_ratio() const {
+ if (segments.get_unavailable_bytes() == 0) return 0;
+ return (double)get_unavailable_unused_bytes() / (double)segments.get_unavailable_bytes();
+ }
+ double get_alive_ratio() const {
+ return stats.used_bytes / (double)segments.get_total_bytes();
+ }
+
+ /*
+ * Space calculations (projected)
+ */
+ std::size_t get_projected_available_bytes() const {
+ return (segments.get_available_bytes() > stats.projected_used_bytes) ?
+ segments.get_available_bytes() - stats.projected_used_bytes:
+ 0;
+ }
+ double get_projected_available_ratio() const {
+ return (double)get_projected_available_bytes() /
+ (double)segments.get_total_bytes();
+ }
+
+ using scan_extents_ertr = SegmentManagerGroup::scan_valid_records_ertr;
+ using scan_extents_ret = scan_extents_ertr::future<>;
+ scan_extents_ret scan_no_tail_segment(
+ const segment_header_t& header,
+ segment_id_t segment_id);
+
+ void adjust_segment_util(double old_usage, double new_usage) {
+ auto old_index = get_bucket_index(old_usage);
+ auto new_index = get_bucket_index(new_usage);
+ assert(stats.segment_util.buckets[old_index].count > 0);
+ stats.segment_util.buckets[old_index].count--;
+ stats.segment_util.buckets[new_index].count++;
+ }
+
+ void init_mark_segment_closed(
+ segment_id_t segment,
+ segment_seq_t seq,
+ segment_type_t s_type,
+ data_category_t category,
+ rewrite_gen_t generation) {
+ assert(background_callback->get_state() == state_t::MOUNT);
+ ceph_assert(s_type == segment_type_t::OOL ||
+ trimmer != nullptr); // segment_type_t::JOURNAL
+ auto old_usage = calc_utilization(segment);
+ segments.init_closed(segment, seq, s_type, category, generation);
+ auto new_usage = calc_utilization(segment);
+ adjust_segment_util(old_usage, new_usage);
+ if (s_type == segment_type_t::OOL) {
+ ool_segment_seq_allocator.set_next_segment_seq(seq);
+ }
+ }
+
+ const bool detailed;
+ const bool is_cold;
+ const config_t config;
+
+ SegmentManagerGroupRef sm_group;
+ BackrefManager &backref_manager;
+
+ SpaceTrackerIRef space_tracker;
+ segments_info_t segments;
+
+ struct {
+ /**
+ * used_bytes
+ *
+ * Bytes occupied by live extents
+ */
+ uint64_t used_bytes = 0;
+
+ /**
+ * projected_used_bytes
+ *
+ * Sum of projected bytes used by each transaction between throttle
+ * acquisition and commit completion. See try_reserve_projected_usage()
+ */
+ uint64_t projected_used_bytes = 0;
+ uint64_t projected_count = 0;
+ uint64_t projected_used_bytes_sum = 0;
+
+ uint64_t closed_journal_used_bytes = 0;
+ uint64_t closed_journal_total_bytes = 0;
+ uint64_t closed_ool_used_bytes = 0;
+ uint64_t closed_ool_total_bytes = 0;
+
+ uint64_t reclaiming_bytes = 0;
+ uint64_t reclaimed_bytes = 0;
+ uint64_t reclaimed_segment_bytes = 0;
+
+ seastar::metrics::histogram segment_util;
+ } stats;
+ seastar::metrics::metric_group metrics;
+ void register_metrics();
+
+ // optional, set if this cleaner is assigned to SegmentedJournal
+ JournalTrimmer *trimmer = nullptr;
+
+ ExtentCallbackInterface *extent_callback = nullptr;
+
+ BackgroundListener *background_callback = nullptr;
+
+ // TODO: drop once paddr->journal_seq_t is introduced
+ SegmentSeqAllocator &ool_segment_seq_allocator;
+};
+
+class RBMCleaner;
+using RBMCleanerRef = std::unique_ptr<RBMCleaner>;
+
+class RBMCleaner : public AsyncCleaner {
+public:
+ RBMCleaner(
+ RBMDeviceGroupRef&& rb_group,
+ BackrefManager &backref_manager,
+ bool detailed);
+
+ static RBMCleanerRef create(
+ RBMDeviceGroupRef&& rb_group,
+ BackrefManager &backref_manager,
+ bool detailed) {
+ return std::make_unique<RBMCleaner>(
+ std::move(rb_group), backref_manager, detailed);
+ }
+
+ RBMDeviceGroup* get_rb_group() {
+ return rb_group.get();
+ }
+
+ /*
+ * AsyncCleaner interfaces
+ */
+
+ void set_background_callback(BackgroundListener *cb) final {
+ background_callback = cb;
+ }
+
+ void set_extent_callback(ExtentCallbackInterface *cb) final {
+ extent_callback = cb;
+ }
+
+ store_statfs_t get_stat() const final {
+ store_statfs_t st;
+ st.total = get_total_bytes();
+ st.available = get_total_bytes() - get_journal_bytes() - stats.used_bytes;
+ st.allocated = get_journal_bytes() + stats.used_bytes;
+ st.data_stored = get_journal_bytes() + stats.used_bytes;
+ return st;
+ }
+
+ void print(std::ostream &, bool is_detailed) const final;
+
+ mount_ret mount() final;
+
+ void mark_space_used(paddr_t, extent_len_t) final;
+
+ void mark_space_free(paddr_t, extent_len_t) final;
+
+ void commit_space_used(paddr_t, extent_len_t) final;
+
+ bool try_reserve_projected_usage(std::size_t) final;
+
+ void release_projected_usage(size_t) final;
+
+ bool should_block_io_on_clean() const final {
+ return false;
+ }
+
+ bool can_clean_space() const final {
+ return false;
+ }
+
+ bool should_clean_space() const final {
+ return false;
+ }
+
+ clean_space_ret clean_space() final;
+
+ const std::set<device_id_t>& get_device_ids() const final {
+ return rb_group->get_device_ids();
+ }
+
+ std::size_t get_reclaim_size_per_cycle() const final {
+ return 0;
+ }
+
+ RandomBlockManager* get_rbm(paddr_t paddr) {
+ auto rbs = rb_group->get_rb_managers();
+ for (auto p : rbs) {
+ if (p->get_device_id() == paddr.get_device_id()) {
+ return p;
+ }
+ }
+ return nullptr;
+ }
+
+ paddr_t alloc_paddr(extent_len_t length) {
+ // TODO: implement allocation strategy (dirty metadata and multiple devices)
+ auto rbs = rb_group->get_rb_managers();
+ auto paddr = rbs[0]->alloc_extent(length);
+ stats.used_bytes += length;
+ return paddr;
+ }
+
+ size_t get_total_bytes() const {
+ auto rbs = rb_group->get_rb_managers();
+ size_t total = 0;
+ for (auto p : rbs) {
+ total += p->get_device()->get_available_size();
+ }
+ return total;
+ }
+
+ size_t get_journal_bytes() const {
+ auto rbs = rb_group->get_rb_managers();
+ size_t total = 0;
+ for (auto p : rbs) {
+ total += p->get_journal_size();
+ }
+ return total;
+ }
+
+ // Testing interfaces
+
+ bool check_usage() final;
+
+ bool check_usage_is_empty() const final {
+ // TODO
+ return true;
+ }
+
+private:
+ bool equals(const RBMSpaceTracker &other) const;
+
+ const bool detailed;
+ RBMDeviceGroupRef rb_group;
+ BackrefManager &backref_manager;
+
+ struct {
+ /**
+ * used_bytes
+ *
+ * Bytes occupied by live extents
+ */
+ uint64_t used_bytes = 0;
+
+ /**
+ * projected_used_bytes
+ *
+ * Sum of projected bytes used by each transaction between throttle
+     * acquisition and commit completion. See try_reserve_projected_usage()
+ */
+ uint64_t projected_used_bytes = 0;
+ } stats;
+ seastar::metrics::metric_group metrics;
+ void register_metrics();
+
+ ExtentCallbackInterface *extent_callback = nullptr;
+ BackgroundListener *background_callback = nullptr;
+};
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::segment_info_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::segments_info_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::AsyncCleaner::stat_printer_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::JournalTrimmerImpl::stat_printer_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/backref/backref_tree_node.cc b/src/crimson/os/seastore/backref/backref_tree_node.cc
new file mode 100644
index 000000000..513c29994
--- /dev/null
+++ b/src/crimson/os/seastore/backref/backref_tree_node.cc
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/backref/backref_tree_node.h"
+
+namespace crimson::os::seastore::backref {
+
+std::ostream& operator<<(std::ostream &out, const backref_map_val_t& val) {
+ return out << "backref_map_val_t("
+ << val.laddr
+ << "~" << val.len << ")";
+}
+
+} // namespace crimson::os::seastore::backref
diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h
new file mode 100644
index 000000000..c3ff52520
--- /dev/null
+++ b/src/crimson/os/seastore/backref/backref_tree_node.h
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/btree/fixed_kv_node.h"
+
+namespace crimson::os::seastore::backref {
+
+using backref_node_meta_t = fixed_kv_node_meta_t<paddr_t>;
+using backref_node_meta_le_t = fixed_kv_node_meta_le_t<paddr_t>;
+
+constexpr size_t INTERNAL_NODE_CAPACITY = 254;
+constexpr size_t LEAF_NODE_CAPACITY = 169;
+
+using BackrefNode = FixedKVNode<paddr_t>;
+
+struct backref_map_val_t {
+ extent_len_t len = 0; ///< length of extents
+ laddr_t laddr = 0; ///< logical address of extents
+ extent_types_t type = extent_types_t::ROOT;
+
+ backref_map_val_t() = default;
+ backref_map_val_t(
+ extent_len_t len,
+ laddr_t laddr,
+ extent_types_t type)
+ : len(len), laddr(laddr), type(type) {}
+
+ bool operator==(const backref_map_val_t& rhs) const noexcept {
+ return len == rhs.len && laddr == rhs.laddr;
+ }
+};
+
+std::ostream& operator<<(std::ostream &out, const backref_map_val_t& val);
+
+struct backref_map_val_le_t {
+ extent_len_le_t len = init_extent_len_le(0);
+ laddr_le_t laddr = laddr_le_t(0);
+ extent_types_le_t type = 0;
+
+ backref_map_val_le_t() = default;
+ backref_map_val_le_t(const backref_map_val_le_t &) = default;
+ explicit backref_map_val_le_t(const backref_map_val_t &val)
+ : len(init_extent_len_le(val.len)),
+ laddr(val.laddr),
+ type(extent_types_le_t(val.type)) {}
+
+ operator backref_map_val_t() const {
+ return backref_map_val_t{len, laddr, (extent_types_t)type};
+ }
+};
+
+class BackrefInternalNode
+ : public FixedKVInternalNode<
+ INTERNAL_NODE_CAPACITY,
+ paddr_t, paddr_le_t,
+ BACKREF_NODE_SIZE,
+ BackrefInternalNode> {
+public:
+ template <typename... T>
+ BackrefInternalNode(T&&... t) :
+ FixedKVInternalNode(std::forward<T>(t)...) {}
+
+ static constexpr extent_types_t TYPE = extent_types_t::BACKREF_INTERNAL;
+
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+};
+using BackrefInternalNodeRef = BackrefInternalNode::Ref;
+
+class BackrefLeafNode
+ : public FixedKVLeafNode<
+ LEAF_NODE_CAPACITY,
+ paddr_t, paddr_le_t,
+ backref_map_val_t, backref_map_val_le_t,
+ BACKREF_NODE_SIZE,
+ BackrefLeafNode,
+ false> {
+public:
+ template <typename... T>
+ BackrefLeafNode(T&&... t) :
+ FixedKVLeafNode(std::forward<T>(t)...) {}
+
+ static constexpr extent_types_t TYPE = extent_types_t::BACKREF_LEAF;
+
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ const_iterator insert(
+ const_iterator iter,
+ paddr_t key,
+ backref_map_val_t val,
+ LogicalCachedExtent*) final {
+ journal_insert(
+ iter,
+ key,
+ val,
+ maybe_get_delta_buffer());
+ return iter;
+ }
+
+ void update(
+ const_iterator iter,
+ backref_map_val_t val,
+ LogicalCachedExtent*) final {
+ return journal_update(
+ iter,
+ val,
+ maybe_get_delta_buffer());
+ }
+
+ void remove(const_iterator iter) final {
+ return journal_remove(
+ iter,
+ maybe_get_delta_buffer());
+ }
+
+ // backref leaf nodes don't have to resolve relative addresses
+ void resolve_relative_addrs(paddr_t base) final {}
+
+ void node_resolve_vals(iterator from, iterator to) const final {}
+
+ void node_unresolve_vals(iterator from, iterator to) const final {}
+};
+using BackrefLeafNodeRef = BackrefLeafNode::Ref;
+
+} // namespace crimson::os::seastore::backref
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::backref::backref_map_val_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::backref::BackrefInternalNode> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::backref::BackrefLeafNode> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::backref::backref_node_meta_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc
new file mode 100644
index 000000000..30ff45540
--- /dev/null
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc
@@ -0,0 +1,609 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/backref/btree_backref_manager.h"
+
+SET_SUBSYS(seastore_backref);
+
+namespace crimson::os::seastore {
+
+template<>
+Transaction::tree_stats_t& get_tree_stats<
+ crimson::os::seastore::backref::BackrefBtree>(Transaction &t) {
+ return t.get_backref_tree_stats();
+}
+
+template<>
+phy_tree_root_t& get_phy_tree_root<
+ crimson::os::seastore::backref::BackrefBtree>(root_t &r) {
+ return r.backref_root;
+}
+
+template<>
+const get_phy_tree_root_node_ret get_phy_tree_root_node<
+ crimson::os::seastore::backref::BackrefBtree>(
+ const RootBlockRef &root_block, op_context_t<paddr_t> c) {
+ auto backref_root = root_block->backref_root_node;
+ if (backref_root) {
+ ceph_assert(backref_root->is_initial_pending()
+ == root_block->is_pending());
+ return {true,
+ trans_intr::make_interruptible(
+ c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
+ } else if (root_block->is_pending()) {
+ auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
+ backref_root = prior.backref_root_node;
+ if (backref_root) {
+ return {true,
+ trans_intr::make_interruptible(
+ c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
+ } else {
+ return {false,
+ trans_intr::make_interruptible(
+ Cache::get_extent_ertr::make_ready_future<
+ CachedExtentRef>())};
+ }
+ } else {
+ return {false,
+ trans_intr::make_interruptible(
+ Cache::get_extent_ertr::make_ready_future<
+ CachedExtentRef>())};
+ }
+}
+
+template <typename ROOT>
+void link_phy_tree_root_node(RootBlockRef &root_block, ROOT* backref_root) {
+ root_block->backref_root_node = backref_root;
+ ceph_assert(backref_root != nullptr);
+ backref_root->root_block = root_block;
+}
+
+template void link_phy_tree_root_node(
+ RootBlockRef &root_block, backref::BackrefInternalNode* backref_root);
+template void link_phy_tree_root_node(
+ RootBlockRef &root_block, backref::BackrefLeafNode* backref_root);
+template void link_phy_tree_root_node(
+ RootBlockRef &root_block, backref::BackrefNode* backref_root);
+
+template <>
+void unlink_phy_tree_root_node<paddr_t>(RootBlockRef &root_block) {
+ root_block->backref_root_node = nullptr;
+}
+
+}
+
+namespace crimson::os::seastore::backref {
+
+BtreeBackrefManager::mkfs_ret
+BtreeBackrefManager::mkfs(
+ Transaction &t)
+{
+ LOG_PREFIX(BtreeBackrefManager::mkfs);
+ INFOT("start", t);
+ return cache.get_root(t).si_then([this, &t](auto croot) {
+ assert(croot->is_mutation_pending());
+ croot->get_root().backref_root = BackrefBtree::mkfs(croot, get_context(t));
+ return mkfs_iertr::now();
+ }).handle_error_interruptible(
+ mkfs_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in BtreeBackrefManager::mkfs"
+ }
+ );
+}
+
+BtreeBackrefManager::get_mapping_ret
+BtreeBackrefManager::get_mapping(
+ Transaction &t,
+ paddr_t offset)
+{
+ LOG_PREFIX(BtreeBackrefManager::get_mapping);
+ TRACET("{}", t, offset);
+ auto c = get_context(t);
+ return with_btree_ret<BackrefBtree, BackrefMappingRef>(
+ cache,
+ c,
+ [c, offset](auto &btree) {
+ return btree.lower_bound(
+ c, offset
+ ).si_then([offset, c](auto iter) -> get_mapping_ret {
+ LOG_PREFIX(BtreeBackrefManager::get_mapping);
+ if (iter.is_end() || iter.get_key() != offset) {
+ ERRORT("{} doesn't exist", c.trans, offset);
+ return crimson::ct_error::enoent::make();
+ } else {
+ TRACET("{} got {}, {}",
+ c.trans, offset, iter.get_key(), iter.get_val());
+ return get_mapping_ret(
+ interruptible::ready_future_marker{},
+ iter.get_pin(c));
+ }
+ });
+ });
+}
+
+BtreeBackrefManager::get_mappings_ret
+BtreeBackrefManager::get_mappings(
+ Transaction &t,
+ paddr_t offset,
+ paddr_t end)
+{
+ LOG_PREFIX(BtreeBackrefManager::get_mappings);
+ TRACET("{}~{}", t, offset, end);
+ auto c = get_context(t);
+ return with_btree_state<BackrefBtree, backref_pin_list_t>(
+ cache,
+ c,
+ [c, offset, end](auto &btree, auto &ret) {
+ return BackrefBtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, offset),
+ [&ret, offset, end, c](auto &pos) {
+ LOG_PREFIX(BtreeBackrefManager::get_mappings);
+ if (pos.is_end() || pos.get_key() >= end) {
+ TRACET("{}~{} done with {} results",
+ c.trans, offset, end, ret.size());
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ TRACET("{}~{} got {}, {}, repeat ...",
+ c.trans, offset, end, pos.get_key(), pos.get_val());
+ ceph_assert((pos.get_key().add_offset(pos.get_val().len)) > offset);
+ ret.emplace_back(pos.get_pin(c));
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ });
+ });
+}
+
+BtreeBackrefManager::new_mapping_ret
+BtreeBackrefManager::new_mapping(
+ Transaction &t,
+ paddr_t key,
+ extent_len_t len,
+ laddr_t addr,
+ extent_types_t type)
+{
+ ceph_assert(
+ is_aligned(
+ key.get_addr_type() == paddr_types_t::SEGMENT ?
+ key.as_seg_paddr().get_segment_off() :
+ key.as_blk_paddr().get_device_off(),
+ cache.get_block_size()));
+ struct state_t {
+ paddr_t last_end;
+
+ std::optional<BackrefBtree::iterator> insert_iter;
+ std::optional<BackrefBtree::iterator> ret;
+
+ state_t(paddr_t hint) : last_end(hint) {}
+ };
+
+ LOG_PREFIX(BtreeBackrefManager::new_mapping);
+ DEBUGT("{}~{}, paddr={}", t, addr, len, key);
+ backref_map_val_t val{len, addr, type};
+ auto c = get_context(t);
+ //++stats.num_alloc_extents;
+ //auto lookup_attempts = stats.num_alloc_extents_iter_nexts;
+ return crimson::os::seastore::with_btree_state<BackrefBtree, state_t>(
+ cache,
+ c,
+ key,
+ [val, c, key, len, addr, /*lookup_attempts,*/ &t]
+ (auto &btree, auto &state) {
+ return BackrefBtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, key),
+ [&state, len, addr, &t, key/*, lookup_attempts*/](auto &pos) {
+ LOG_PREFIX(BtreeBackrefManager::new_mapping);
+ //++stats.num_alloc_extents_iter_nexts;
+ if (pos.is_end()) {
+ DEBUGT("{}~{}, paddr={}, state: end, insert at {}",
+ t, addr, len, key,
+ //stats.num_alloc_extents_iter_nexts - lookup_attempts,
+ state.last_end);
+ state.insert_iter = pos;
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ } else if (pos.get_key() >= (state.last_end.add_offset(len))) {
+ DEBUGT("{}~{}, paddr={}, state: {}~{}, "
+ "insert at {} -- {}",
+ t, addr, len, key,
+ pos.get_key(), pos.get_val().len,
+ //stats.num_alloc_extents_iter_nexts - lookup_attempts,
+ state.last_end,
+ pos.get_val());
+ state.insert_iter = pos;
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ } else {
+ ERRORT("{}~{}, paddr={}, state: {}~{}, repeat ... -- {}",
+ t, addr, len, key,
+ pos.get_key(), pos.get_val().len,
+ pos.get_val());
+ ceph_abort("not possible for the backref tree");
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ }
+ }).si_then([c, addr, len, key, &btree, &state, val] {
+ return btree.insert(
+ c,
+ *state.insert_iter,
+ state.last_end,
+ val,
+ nullptr
+ ).si_then([&state, c, addr, len, key](auto &&p) {
+ LOG_PREFIX(BtreeBackrefManager::new_mapping);
+ auto [iter, inserted] = std::move(p);
+ TRACET("{}~{}, paddr={}, inserted at {}, leaf {}",
+ c.trans, addr, len, key, state.last_end, *iter.get_leaf_node());
+ ceph_assert(inserted);
+ state.ret = iter;
+ });
+ });
+ }).si_then([c](auto &&state) {
+ return new_mapping_iertr::make_ready_future<BackrefMappingRef>(
+ state.ret->get_pin(c));
+ });
+}
+
+BtreeBackrefManager::merge_cached_backrefs_ret
+BtreeBackrefManager::merge_cached_backrefs(
+ Transaction &t,
+ const journal_seq_t &limit,
+ const uint64_t max)
+{
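+  // Replay cached backref entries into the backref tree in journal-seq
+  // order, stopping once `limit` is exceeded or the transaction holds
+  // roughly `max` bytes of fresh backref nodes; returns the last journal
+  // seq that was merged (JOURNAL_SEQ_NULL if nothing was merged).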
+ LOG_PREFIX(BtreeBackrefManager::merge_cached_backrefs);
+ DEBUGT("insert up to {}", t, limit);
+ return seastar::do_with(
+ limit,
+ JOURNAL_SEQ_NULL,
+ [this, &t, max](auto &limit, auto &inserted_to) {
+ auto &backref_entryrefs_by_seq = cache.get_backref_entryrefs_by_seq();
+ return seastar::do_with(
+ backref_entryrefs_by_seq.begin(),
+ JOURNAL_SEQ_NULL,
+ [this, &t, &limit, &backref_entryrefs_by_seq, max](auto &iter, auto &inserted_to) {
+ return trans_intr::repeat(
+ [&iter, this, &t, &limit, &backref_entryrefs_by_seq, max, &inserted_to]()
+ -> merge_cached_backrefs_iertr::future<seastar::stop_iteration> {
+ if (iter == backref_entryrefs_by_seq.end()) {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ auto &seq = iter->first;
+ auto &backref_entry_refs = iter->second;
+ LOG_PREFIX(BtreeBackrefManager::merge_cached_backrefs);
+ DEBUGT("seq {}, limit {}, num_fresh_backref {}"
+ , t, seq, limit, t.get_num_fresh_backref());
+ if (seq <= limit && t.get_num_fresh_backref() * BACKREF_NODE_SIZE < max) {
+ inserted_to = seq;
+ return trans_intr::do_for_each(
+ backref_entry_refs,
+ [this, &t](auto &backref_entry_ref) {
+ LOG_PREFIX(BtreeBackrefManager::merge_cached_backrefs);
+ auto &backref_entry = *backref_entry_ref;
+ if (backref_entry.laddr != L_ADDR_NULL) {
+ DEBUGT("new mapping: {}~{} -> {}",
+ t,
+ backref_entry.paddr,
+ backref_entry.len,
+ backref_entry.laddr);
+ return new_mapping(
+ t,
+ backref_entry.paddr,
+ backref_entry.len,
+ backref_entry.laddr,
+ backref_entry.type).si_then([](auto &&pin) {
+ return seastar::now();
+ });
+ } else {
+ DEBUGT("remove mapping: {}", t, backref_entry.paddr);
+ return remove_mapping(
+ t,
+ backref_entry.paddr
+ ).si_then([](auto&&) {
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all("no enoent possible")
+ );
+ }
+ }).si_then([&iter] {
+ iter++;
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ }
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }).si_then([&inserted_to] {
+ return seastar::make_ready_future<journal_seq_t>(
+ std::move(inserted_to));
+ });
+ });
+ return merge_cached_backrefs_iertr::make_ready_future<journal_seq_t>(
+ std::move(inserted_to));
+ });
+}
+
+BtreeBackrefManager::scan_mapped_space_ret
+BtreeBackrefManager::scan_mapped_space(
+ Transaction &t,
+ BtreeBackrefManager::scan_mapped_space_func_t &&f)
+{
+ LOG_PREFIX(BtreeBackrefManager::scan_mapped_space);
+ DEBUGT("scan backref tree", t);
+ auto c = get_context(t);
+ return seastar::do_with(
+ std::move(f),
+ [this, c, FNAME](auto &scan_visitor)
+ {
+ auto block_size = cache.get_block_size();
+ // traverse leaf-node entries
+ return with_btree<BackrefBtree>(
+ cache, c,
+ [c, &scan_visitor, block_size, FNAME](auto &btree)
+ {
+ return BackrefBtree::iterate_repeat(
+ c,
+ btree.lower_bound(
+ c,
+ P_ADDR_MIN),
+ [c, &scan_visitor, block_size, FNAME](auto &pos) {
+ if (pos.is_end()) {
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ TRACET("tree value {}~{} {}~{} {} used",
+ c.trans,
+ pos.get_key(),
+ pos.get_val().len,
+ pos.get_val().laddr,
+ pos.get_val().len,
+ pos.get_val().type);
+ ceph_assert(pos.get_key().is_absolute());
+ ceph_assert(pos.get_val().len > 0 &&
+ pos.get_val().len % block_size == 0);
+ ceph_assert(!is_backref_node(pos.get_val().type));
+ ceph_assert(pos.get_val().laddr != L_ADDR_NULL);
+ scan_visitor(
+ pos.get_key(),
+ P_ADDR_NULL,
+ pos.get_val().len,
+ pos.get_val().type,
+ pos.get_val().laddr);
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ }
+ );
+ }).si_then([this, &scan_visitor, c, FNAME, block_size] {
+ // traverse alloc-deltas in order
+ auto &backref_entryrefs = cache.get_backref_entryrefs_by_seq();
+ for (auto &[seq, refs] : backref_entryrefs) {
+ boost::ignore_unused(seq);
+ DEBUGT("scan {} backref entries", c.trans, refs.size());
+ for (auto &backref_entry : refs) {
+ if (backref_entry->laddr == L_ADDR_NULL) {
+ TRACET("backref entry {}~{} {} free",
+ c.trans,
+ backref_entry->paddr,
+ backref_entry->len,
+ backref_entry->type);
+ } else {
+ TRACET("backref entry {}~{} {}~{} {} used",
+ c.trans,
+ backref_entry->paddr,
+ backref_entry->len,
+ backref_entry->laddr,
+ backref_entry->len,
+ backref_entry->type);
+ }
+ ceph_assert(backref_entry->paddr.is_absolute());
+ ceph_assert(backref_entry->len > 0 &&
+ backref_entry->len % block_size == 0);
+ ceph_assert(!is_backref_node(backref_entry->type));
+ scan_visitor(
+ backref_entry->paddr,
+ P_ADDR_NULL,
+ backref_entry->len,
+ backref_entry->type,
+ backref_entry->laddr);
+ }
+ }
+ }).si_then([this, &scan_visitor, block_size, c, FNAME] {
+ BackrefBtree::mapped_space_visitor_t f =
+ [&scan_visitor, block_size, FNAME, c](
+ paddr_t paddr, paddr_t key, extent_len_t len,
+ depth_t depth, extent_types_t type, BackrefBtree::iterator&) {
+ TRACET("tree node {}~{} {}, depth={} used",
+ c.trans, paddr, len, type, depth);
+ ceph_assert(paddr.is_absolute());
+ ceph_assert(len > 0 && len % block_size == 0);
+ ceph_assert(depth >= 1);
+ ceph_assert(is_backref_node(type));
+ return scan_visitor(paddr, key, len, type, L_ADDR_NULL);
+ };
+ return seastar::do_with(
+ std::move(f),
+ [this, c](auto &tree_visitor)
+ {
+ // traverse internal-node entries
+ return with_btree<BackrefBtree>(
+ cache, c,
+ [c, &tree_visitor](auto &btree)
+ {
+ return BackrefBtree::iterate_repeat(
+ c,
+ btree.lower_bound(
+ c,
+ P_ADDR_MIN,
+ &tree_visitor),
+ [](auto &pos) {
+ if (pos.is_end()) {
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ return BackrefBtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ },
+ &tree_visitor
+ );
+ });
+ });
+ });
+ });
+}
+
+BtreeBackrefManager::base_iertr::future<> _init_cached_extent(
+ op_context_t<paddr_t> c,
+ const CachedExtentRef &e,
+ BackrefBtree &btree,
+ bool &ret)
+{
+ return btree.init_cached_extent(c, e
+ ).si_then([&ret](bool is_alive) {
+ ret = is_alive;
+ });
+}
+
+BtreeBackrefManager::init_cached_extent_ret BtreeBackrefManager::init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e)
+{
+ LOG_PREFIX(BtreeBackrefManager::init_cached_extent);
+ TRACET("{}", t, *e);
+ return seastar::do_with(bool(), [this, e, &t](bool &ret) {
+ auto c = get_context(t);
+ return with_btree<BackrefBtree>(cache, c, [c, e, &ret](auto &btree)
+ -> base_iertr::future<> {
+ LOG_PREFIX(BtreeBackrefManager::init_cached_extent);
+ DEBUGT("extent {}", c.trans, *e);
+ return _init_cached_extent(c, e, btree, ret);
+ }).si_then([&ret] { return ret; });
+ });
+}
+
+BtreeBackrefManager::rewrite_extent_ret
+BtreeBackrefManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent)
+{
+ auto c = get_context(t);
+ return with_btree<BackrefBtree>(
+ cache,
+ c,
+ [c, extent](auto &btree) mutable {
+ return btree.rewrite_extent(c, extent);
+ });
+}
+
+BtreeBackrefManager::remove_mapping_ret
+BtreeBackrefManager::remove_mapping(
+ Transaction &t,
+ paddr_t addr)
+{
+ auto c = get_context(t);
+ return with_btree_ret<BackrefBtree, remove_mapping_result_t>(
+ cache,
+ c,
+ [c, addr](auto &btree) mutable {
+ return btree.lower_bound(
+ c, addr
+ ).si_then([&btree, c, addr](auto iter)
+ -> remove_mapping_ret {
+ if (iter.is_end() || iter.get_key() != addr) {
+ LOG_PREFIX(BtreeBackrefManager::remove_mapping);
+ WARNT("paddr={} doesn't exist, state: {}, leaf {}",
+ c.trans, addr, iter.get_key(), *iter.get_leaf_node());
+ return remove_mapping_iertr::make_ready_future<
+ remove_mapping_result_t>(remove_mapping_result_t());
+ }
+
+ auto ret = remove_mapping_result_t{
+ iter.get_key(),
+ iter.get_val().len,
+ iter.get_val().laddr};
+ return btree.remove(
+ c,
+ iter
+ ).si_then([ret] {
+ return ret;
+ });
+ });
+ });
+}
+
+Cache::backref_entry_query_mset_t
+BtreeBackrefManager::get_cached_backref_entries_in_range(
+ paddr_t start,
+ paddr_t end)
+{
+ return cache.get_backref_entries_in_range(start, end);
+}
+
+void BtreeBackrefManager::cache_new_backref_extent(
+ paddr_t paddr,
+ paddr_t key,
+ extent_types_t type)
+{
+ return cache.add_backref_extent(paddr, key, type);
+}
+
+BtreeBackrefManager::retrieve_backref_extents_in_range_ret
+BtreeBackrefManager::retrieve_backref_extents_in_range(
+ Transaction &t,
+ paddr_t start,
+ paddr_t end)
+{
+ auto backref_extents = cache.get_backref_extents_in_range(start, end);
+ return seastar::do_with(
+ std::vector<CachedExtentRef>(),
+ std::move(backref_extents),
+ [this, &t](auto &extents, auto &backref_extents) {
+ return trans_intr::parallel_for_each(
+ backref_extents,
+ [this, &extents, &t](auto &ent) {
+	// only the single gc fiber can rewrite backref extents,
+	// so they must still be alive
+ assert(is_backref_node(ent.type));
+ LOG_PREFIX(BtreeBackrefManager::retrieve_backref_extents_in_range);
+ DEBUGT("getting backref extent of type {} at {}, key {}",
+ t,
+ ent.type,
+ ent.paddr,
+ ent.key);
+
+ auto c = get_context(t);
+ return with_btree_ret<BackrefBtree, CachedExtentRef>(
+ cache,
+ c,
+ [c, &ent](auto &btree) {
+ if (ent.type == extent_types_t::BACKREF_INTERNAL) {
+ return btree.get_internal_if_live(
+ c, ent.paddr, ent.key, BACKREF_NODE_SIZE);
+ } else {
+ assert(ent.type == extent_types_t::BACKREF_LEAF);
+ return btree.get_leaf_if_live(
+ c, ent.paddr, ent.key, BACKREF_NODE_SIZE);
+ }
+ }).si_then([&extents](auto ext) {
+ ceph_assert(ext);
+ extents.emplace_back(std::move(ext));
+ });
+ }).si_then([&extents] {
+ return std::move(extents);
+ });
+ });
+}
+
+} // namespace crimson::os::seastore::backref
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h
new file mode 100644
index 000000000..952e78b65
--- /dev/null
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/backref_manager.h"
+#include "crimson/os/seastore/backref/backref_tree_node.h"
+#include "crimson/os/seastore/btree/fixed_kv_btree.h"
+
+namespace crimson::os::seastore::backref {
+
+constexpr size_t BACKREF_BLOCK_SIZE = 4096;
+
+class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> {
+ extent_types_t type;
+public:
+ BtreeBackrefMapping(op_context_t<paddr_t> ctx)
+ : BtreeNodeMapping(ctx) {}
+ BtreeBackrefMapping(
+ op_context_t<paddr_t> ctx,
+ CachedExtentRef parent,
+ uint16_t pos,
+ backref_map_val_t &val,
+ backref_node_meta_t &&meta)
+ : BtreeNodeMapping(
+ ctx,
+ parent,
+ pos,
+ val.laddr,
+ val.len,
+ std::forward<backref_node_meta_t>(meta)),
+ type(val.type)
+ {}
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+protected:
+ std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate(
+ op_context_t<paddr_t> ctx) const final {
+ return std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>>(
+ new BtreeBackrefMapping(ctx));
+ }
+};
+
+using BackrefBtree = FixedKVBtree<
+ paddr_t, backref_map_val_t, BackrefInternalNode,
+ BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>;
+
+class BtreeBackrefManager : public BackrefManager {
+public:
+
+ BtreeBackrefManager(Cache &cache)
+ : cache(cache)
+ {}
+
+ mkfs_ret mkfs(
+ Transaction &t) final;
+
+ get_mapping_ret get_mapping(
+ Transaction &t,
+ paddr_t offset) final;
+
+ get_mappings_ret get_mappings(
+ Transaction &t,
+ paddr_t offset,
+ paddr_t end) final;
+
+ new_mapping_ret new_mapping(
+ Transaction &t,
+ paddr_t key,
+ extent_len_t len,
+ laddr_t val,
+ extent_types_t type) final;
+
+ merge_cached_backrefs_ret merge_cached_backrefs(
+ Transaction &t,
+ const journal_seq_t &limit,
+ const uint64_t max) final;
+
+ remove_mapping_ret remove_mapping(
+ Transaction &t,
+ paddr_t offset) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f) final;
+
+ init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) final;
+
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) final;
+
+ Cache::backref_entry_query_mset_t
+ get_cached_backref_entries_in_range(
+ paddr_t start,
+ paddr_t end) final;
+
+ retrieve_backref_extents_in_range_ret
+ retrieve_backref_extents_in_range(
+ Transaction &t,
+ paddr_t start,
+ paddr_t end) final;
+
+ void cache_new_backref_extent(
+ paddr_t paddr,
+ paddr_t key,
+ extent_types_t type) final;
+
+private:
+ Cache &cache;
+
+ op_context_t<paddr_t> get_context(Transaction &t) {
+ return op_context_t<paddr_t>{cache, t};
+ }
+};
+
+} // namespace crimson::os::seastore::backref
diff --git a/src/crimson/os/seastore/backref_manager.cc b/src/crimson/os/seastore/backref_manager.cc
new file mode 100644
index 000000000..c596ee41d
--- /dev/null
+++ b/src/crimson/os/seastore/backref_manager.cc
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/backref_manager.h"
+#include "crimson/os/seastore/backref/btree_backref_manager.h"
+
+namespace crimson::os::seastore {
+
+BackrefManagerRef create_backref_manager(
+ Cache &cache)
+{
+ return BackrefManagerRef(
+ new backref::BtreeBackrefManager(cache));
+}
+
+} // namespace crimson::os::seastore
+
diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h
new file mode 100644
index 000000000..3feedb997
--- /dev/null
+++ b/src/crimson/os/seastore/backref_manager.h
@@ -0,0 +1,152 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Abstract interface for managing back references that map paddr_t to laddr_t
+ */
+class BackrefManager {
+public:
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using base_iertr = trans_iertr<base_ertr>;
+
+ using mkfs_iertr = base_iertr;
+ using mkfs_ret = mkfs_iertr::future<>;
+ virtual mkfs_ret mkfs(
+ Transaction &t) = 0;
+
+ /**
+ * Fetches mappings for paddr_t in range [offset, offset + len)
+ *
+ * Future will not resolve until all pins have resolved
+ */
+ using get_mappings_iertr = base_iertr;
+ using get_mappings_ret = get_mappings_iertr::future<backref_pin_list_t>;
+ virtual get_mappings_ret get_mappings(
+ Transaction &t,
+ paddr_t offset,
+ paddr_t end) = 0;
+
+ /**
+ * Fetches the mapping for paddr_t
+ *
+ * Future will not resolve until the pin has resolved
+ */
+ using get_mapping_iertr = base_iertr::extend<
+ crimson::ct_error::enoent>;
+ using get_mapping_ret = get_mapping_iertr::future<BackrefMappingRef>;
+ virtual get_mapping_ret get_mapping(
+ Transaction &t,
+ paddr_t offset) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * rewrite extent into passed transaction
+ */
+ using rewrite_extent_iertr = base_iertr;
+ using rewrite_extent_ret = rewrite_extent_iertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) = 0;
+
+ /**
+ * Insert new paddr_t -> laddr_t mapping
+ */
+ using new_mapping_iertr = base_iertr;
+ using new_mapping_ret = new_mapping_iertr::future<BackrefMappingRef>;
+ virtual new_mapping_ret new_mapping(
+ Transaction &t,
+ paddr_t key,
+ extent_len_t len,
+ laddr_t val,
+ extent_types_t type) = 0;
+
+ /**
+ * Check if a CachedExtent is alive, should be called
+ * after replay on each cached extent.
+ *
+ * @return returns whether the extent is alive
+ */
+ using init_cached_extent_iertr = base_iertr;
+ using init_cached_extent_ret = init_cached_extent_iertr::future<bool>;
+ virtual init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) = 0;
+
+ virtual Cache::backref_entry_query_mset_t
+ get_cached_backref_entries_in_range(
+ paddr_t start,
+ paddr_t end) = 0;
+
+ using retrieve_backref_extents_in_range_iertr = base_iertr;
+ using retrieve_backref_extents_in_range_ret =
+ retrieve_backref_extents_in_range_iertr::future<std::vector<CachedExtentRef>>;
+ virtual retrieve_backref_extents_in_range_ret
+ retrieve_backref_extents_in_range(
+ Transaction &t,
+ paddr_t start,
+ paddr_t end) = 0;
+
+ virtual void cache_new_backref_extent(
+ paddr_t paddr,
+ paddr_t key,
+ extent_types_t type) = 0;
+
+ /**
+ * merge in-cache paddr_t -> laddr_t mappings to the on-disk backref tree
+ */
+ using merge_cached_backrefs_iertr = base_iertr;
+ using merge_cached_backrefs_ret = merge_cached_backrefs_iertr::future<journal_seq_t>;
+ virtual merge_cached_backrefs_ret merge_cached_backrefs(
+ Transaction &t,
+ const journal_seq_t &limit,
+ const uint64_t max) = 0;
+
+ struct remove_mapping_result_t {
+ paddr_t offset = P_ADDR_NULL;
+ extent_len_t len = 0;
+ laddr_t laddr = L_ADDR_NULL;
+ };
+
+ /**
+ * delete the mapping for paddr_t offset
+ */
+ using remove_mapping_iertr = base_iertr::extend<
+ crimson::ct_error::enoent>;
+ using remove_mapping_ret = remove_mapping_iertr::future<remove_mapping_result_t>;
+ virtual remove_mapping_ret remove_mapping(
+ Transaction &t,
+ paddr_t offset) = 0;
+
+ /**
+ * scan all extents in both tree and cache,
+ * including backref extents, logical extents and lba extents,
+ * visit them with scan_mapped_space_func_t
+ */
+ using scan_mapped_space_iertr = base_iertr;
+ using scan_mapped_space_ret = scan_mapped_space_iertr::future<>;
+ using scan_mapped_space_func_t = std::function<
+ void(paddr_t, paddr_t, extent_len_t, extent_types_t, laddr_t)>;
+ virtual scan_mapped_space_ret scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f) = 0;
+
+ virtual ~BackrefManager() {}
+};
+
+using BackrefManagerRef =
+ std::unique_ptr<BackrefManager>;
+
+BackrefManagerRef create_backref_manager(
+ Cache &cache);
+
+} // namespace crimson::os::seastore::backref
diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc
new file mode 100644
index 000000000..2f801dcf1
--- /dev/null
+++ b/src/crimson/os/seastore/btree/btree_range_pin.cc
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+#include "crimson/os/seastore/btree/fixed_kv_node.h"
+
+namespace crimson::os::seastore {
+
+template <typename key_t, typename val_t>
+get_child_ret_t<LogicalCachedExtent>
+BtreeNodeMapping<key_t, val_t>::get_logical_extent(
+ Transaction &t)
+{
+ assert(parent);
+ assert(parent->is_valid());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ auto &p = (FixedKVNode<key_t>&)*parent;
+ auto v = p.get_logical_child(ctx, pos);
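+  // If the parent does not currently track a resident child extent at pos,
+  // record the child's position so the caller can resolve it afterwards.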
+ if (!v.has_child()) {
+ this->child_pos = v.get_child_pos();
+ }
+ return v;
+}
+
+template class BtreeNodeMapping<laddr_t, paddr_t>;
+template class BtreeNodeMapping<paddr_t, laddr_t>;
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h
new file mode 100644
index 000000000..68188e9ff
--- /dev/null
+++ b/src/crimson/os/seastore/btree/btree_range_pin.h
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+template <typename node_key_t>
+struct op_context_t {
+ Cache &cache;
+ Transaction &trans;
+};
+
+constexpr uint16_t MAX_FIXEDKVBTREE_DEPTH = 8;
+
+template <typename bound_t>
+struct fixed_kv_node_meta_t {
+ bound_t begin = min_max_t<bound_t>::min;
+ bound_t end = min_max_t<bound_t>::min;
+ depth_t depth = 0;
+
+ bool is_parent_of(const fixed_kv_node_meta_t &other) const {
+ return (depth == other.depth + 1) &&
+ (begin <= other.begin) &&
+ (end > other.begin);
+ }
+
+ bool is_in_range(const bound_t key) const {
+ return begin <= key && end > key;
+ }
+
+ std::pair<fixed_kv_node_meta_t, fixed_kv_node_meta_t> split_into(bound_t pivot) const {
+ return std::make_pair(
+ fixed_kv_node_meta_t{begin, pivot, depth},
+ fixed_kv_node_meta_t{pivot, end, depth});
+ }
+
+ static fixed_kv_node_meta_t merge_from(
+ const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs) {
+ ceph_assert(lhs.depth == rhs.depth);
+ return fixed_kv_node_meta_t{lhs.begin, rhs.end, lhs.depth};
+ }
+
+ static std::pair<fixed_kv_node_meta_t, fixed_kv_node_meta_t>
+ rebalance(const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs, bound_t pivot) {
+ ceph_assert(lhs.depth == rhs.depth);
+ return std::make_pair(
+ fixed_kv_node_meta_t{lhs.begin, pivot, lhs.depth},
+ fixed_kv_node_meta_t{pivot, rhs.end, lhs.depth});
+ }
+
+ bool is_root() const {
+ return begin == min_max_t<bound_t>::min && end == min_max_t<bound_t>::max;
+ }
+};
+
+template <typename bound_t>
+inline std::ostream &operator<<(
+ std::ostream &lhs,
+ const fixed_kv_node_meta_t<bound_t> &rhs)
+{
+ return lhs << "btree_node_meta_t("
+ << "begin=" << rhs.begin
+ << ", end=" << rhs.end
+ << ", depth=" << rhs.depth
+ << ")";
+}
+
+/**
+ * fixed_kv_node_meta_le_t
+ *
+ * On disk layout for fixed_kv_node_meta_t
+ */
+template <typename bound_le_t>
+struct fixed_kv_node_meta_le_t {
+ bound_le_t begin = bound_le_t(0);
+ bound_le_t end = bound_le_t(0);
+ depth_le_t depth = init_depth_le(0);
+
+ fixed_kv_node_meta_le_t() = default;
+ fixed_kv_node_meta_le_t(
+ const fixed_kv_node_meta_le_t<bound_le_t> &) = default;
+ explicit fixed_kv_node_meta_le_t(
+ const fixed_kv_node_meta_t<typename bound_le_t::orig_type> &val)
+ : begin(val.begin),
+ end(val.end),
+ depth(init_depth_le(val.depth)) {}
+
+ operator fixed_kv_node_meta_t<typename bound_le_t::orig_type>() const {
+ return fixed_kv_node_meta_t<typename bound_le_t::orig_type>{
+ begin, end, depth };
+ }
+};
+
+template <typename key_t, typename val_t>
+class BtreeNodeMapping : public PhysicalNodeMapping<key_t, val_t> {
+protected:
+ op_context_t<key_t> ctx;
+ /**
+ * parent
+ *
+ * populated until link_extent is called to ensure cache residence
+ * until add_pin is called.
+ */
+ CachedExtentRef parent;
+
+ pladdr_t value;
+ extent_len_t len = 0;
+ fixed_kv_node_meta_t<key_t> range;
+ uint16_t pos = std::numeric_limits<uint16_t>::max();
+
+ virtual std::unique_ptr<BtreeNodeMapping> _duplicate(op_context_t<key_t>) const = 0;
+ fixed_kv_node_meta_t<key_t> _get_pin_range() const {
+ return range;
+ }
+
+public:
+ using val_type = val_t;
+ BtreeNodeMapping(op_context_t<key_t> ctx) : ctx(ctx) {}
+
+ BtreeNodeMapping(
+ op_context_t<key_t> ctx,
+ CachedExtentRef parent,
+ uint16_t pos,
+ pladdr_t value,
+ extent_len_t len,
+ fixed_kv_node_meta_t<key_t> meta)
+ : ctx(ctx),
+ parent(parent),
+ value(value),
+ len(len),
+ range(meta),
+ pos(pos)
+ {
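+    // For a stable (non-pending) parent, record the child's slot so the
+    // mapping can later locate the child extent through its parent.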
+ if (!parent->is_pending()) {
+ this->child_pos = {parent, pos};
+ }
+ }
+
+ CachedExtentRef get_parent() const final {
+ return parent;
+ }
+
+ CachedExtentRef get_parent() {
+ return parent;
+ }
+
+ void set_parent(CachedExtentRef ext) {
+ parent = ext;
+ }
+
+ uint16_t get_pos() const final {
+ return pos;
+ }
+
+ extent_len_t get_length() const final {
+ ceph_assert(range.end > range.begin);
+ return len;
+ }
+
+ extent_types_t get_type() const override {
+ ceph_abort("should never happen");
+ return extent_types_t::ROOT;
+ }
+
+ val_t get_val() const final {
+ if constexpr (std::is_same_v<val_t, paddr_t>) {
+ return value.get_paddr();
+ } else {
+ static_assert(std::is_same_v<val_t, laddr_t>);
+ return value.get_laddr();
+ }
+ }
+
+ key_t get_key() const override {
+ return range.begin;
+ }
+
+ PhysicalNodeMappingRef<key_t, val_t> duplicate() const final {
+ auto ret = _duplicate(ctx);
+ ret->range = range;
+ ret->value = value;
+ ret->parent = parent;
+ ret->len = len;
+ ret->pos = pos;
+ return ret;
+ }
+
+ bool has_been_invalidated() const final {
+ return parent->has_been_invalidated();
+ }
+
+ get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final;
+};
+
+}
diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h
new file mode 100644
index 000000000..2970d0440
--- /dev/null
+++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h
@@ -0,0 +1,2251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <boost/container/static_vector.hpp>
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+#include "crimson/os/seastore/logging.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+#include "crimson/os/seastore/root_block.h"
+
+#define RESERVATION_PTR reinterpret_cast<ChildableCachedExtent*>(0x1)
+
+namespace crimson::os::seastore::lba_manager::btree {
+struct lba_map_val_t;
+}
+
+namespace crimson::os::seastore {
+
+bool is_valid_child_ptr(ChildableCachedExtent* child);
+
+template <typename T>
+phy_tree_root_t& get_phy_tree_root(root_t& r);
+
+using get_child_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ typename trans_intr::condition,
+ get_child_ertr>;
+using get_phy_tree_root_node_ret =
+ std::pair<bool, get_child_iertr::future<CachedExtentRef>>;
+
+template <typename T, typename key_t>
+const get_phy_tree_root_node_ret get_phy_tree_root_node(
+ const RootBlockRef &root_block,
+ op_context_t<key_t> c);
+
+template <typename ROOT_T>
+void link_phy_tree_root_node(RootBlockRef &root_block, ROOT_T* root_node);
+
+template <typename T>
+void unlink_phy_tree_root_node(RootBlockRef &root_block);
+
+template <typename T>
+Transaction::tree_stats_t& get_tree_stats(Transaction &t);
+
+template <
+ typename node_key_t,
+ typename node_val_t,
+ typename internal_node_t,
+ typename leaf_node_t,
+ typename pin_t,
+ size_t node_size,
+ bool leaf_has_children>
+class FixedKVBtree {
+ static constexpr size_t MAX_DEPTH = 16;
+ using self_type = FixedKVBtree<
+ node_key_t,
+ node_val_t,
+ internal_node_t,
+ leaf_node_t,
+ pin_t,
+ node_size,
+ leaf_has_children>;
+public:
+ using InternalNodeRef = TCachedExtentRef<internal_node_t>;
+ using LeafNodeRef = TCachedExtentRef<leaf_node_t>;
+
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using base_iertr = trans_iertr<base_ertr>;
+
+ class iterator;
+ using iterator_fut = base_iertr::future<iterator>;
+
+ using mapped_space_visitor_t = std::function<
+ void(paddr_t, node_key_t, extent_len_t, depth_t, extent_types_t, iterator&)>;
+
+ class iterator {
+ public:
+ iterator(const iterator &rhs) noexcept :
+ internal(rhs.internal), leaf(rhs.leaf) {}
+ iterator(iterator &&rhs) noexcept :
+ internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {}
+
+ iterator &operator=(const iterator &) = default;
+ iterator &operator=(iterator &&) = default;
+
+ iterator_fut next(
+ op_context_t<node_key_t> c,
+ mapped_space_visitor_t *visitor=nullptr) const
+ {
+ assert_valid();
+ assert(!is_end());
+
+ auto ret = *this;
+ ret.leaf.pos++;
+ if (ret.at_boundary()) {
+ return seastar::do_with(
+ ret,
+ [c, visitor](auto &ret) mutable {
+ return ret.handle_boundary(
+ c, visitor
+ ).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+ } else {
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ ret);
+ }
+
+ }
+
+ iterator_fut prev(op_context_t<node_key_t> c) const
+ {
+ assert_valid();
+ assert(!is_begin());
+
+ auto ret = *this;
+
+ if (ret.leaf.pos > 0) {
+ ret.leaf.pos--;
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ ret);
+ }
+
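+      // Walk up from the leaf to the shallowest ancestor that has a
+      // previous sibling to descend into.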
+ depth_t depth_with_space = 2;
+ for (; depth_with_space <= get_depth(); ++depth_with_space) {
+ if (ret.get_internal(depth_with_space).pos > 0) {
+ break;
+ }
+ }
+
+ assert(depth_with_space <= ret.get_depth()); // must not be begin()
+ return seastar::do_with(
+ std::move(ret),
+ [](const internal_node_t &internal) { return --internal.end(); },
+ [](const leaf_node_t &leaf) { return --leaf.end(); },
+ [c, depth_with_space](auto &ret, auto &li, auto &ll) {
+ for (depth_t depth = 2; depth < depth_with_space; ++depth) {
+ ret.get_internal(depth).reset();
+ }
+ ret.leaf.reset();
+ ret.get_internal(depth_with_space).pos--;
+ // note, cannot result in at_boundary() by construction
+ return lookup_depth_range(
+ c, ret, depth_with_space - 1, 0, li, ll, nullptr
+ ).si_then([&ret] {
+ assert(!ret.at_boundary());
+ return std::move(ret);
+ });
+ });
+ }
+
+ void assert_valid() const {
+ assert(leaf.node);
+ assert(leaf.pos <= leaf.node->get_size());
+
+ for (auto &i: internal) {
+ (void)i;
+ assert(i.node);
+ assert(i.pos < i.node->get_size());
+ }
+ }
+
+ depth_t get_depth() const {
+ return internal.size() + 1;
+ }
+
+ auto &get_internal(depth_t depth) {
+ assert(depth > 1);
+ assert((depth - 2) < internal.size());
+ return internal[depth - 2];
+ }
+
+ const auto &get_internal(depth_t depth) const {
+ assert(depth > 1);
+ assert((depth - 2) < internal.size());
+ return internal[depth - 2];
+ }
+
+ node_key_t get_key() const {
+ assert(!is_end());
+ return leaf.node->iter_idx(leaf.pos).get_key();
+ }
+ node_val_t get_val() const {
+ assert(!is_end());
+ auto ret = leaf.node->iter_idx(leaf.pos).get_val();
+ if constexpr (
+ std::is_same_v<crimson::os::seastore::lba_manager::btree::lba_map_val_t,
+ node_val_t>) {
+ if (ret.pladdr.is_paddr()) {
+ ret.pladdr = ret.pladdr.get_paddr().maybe_relative_to(
+ leaf.node->get_paddr());
+ }
+ }
+ return ret;
+ }
+
+ bool is_end() const {
+ // external methods may only resolve at a boundary if at end
+ return at_boundary();
+ }
+
+ bool is_begin() const {
+ for (auto &i: internal) {
+ if (i.pos != 0)
+ return false;
+ }
+ return leaf.pos == 0;
+ }
+
+ std::unique_ptr<pin_t> get_pin(op_context_t<node_key_t> ctx) const {
+ assert(!is_end());
+ auto val = get_val();
+ auto key = get_key();
+ return std::make_unique<pin_t>(
+ ctx,
+ leaf.node,
+ leaf.pos,
+ val,
+ fixed_kv_node_meta_t<node_key_t>{ key, key + val.len, 0 });
+ }
+
+ typename leaf_node_t::Ref get_leaf_node() {
+ return leaf.node;
+ }
+
+ uint16_t get_leaf_pos() {
+ return leaf.pos;
+ }
+ private:
+ iterator() noexcept {}
+ iterator(depth_t depth) noexcept : internal(depth - 1) {}
+
+ friend class FixedKVBtree;
+ static constexpr uint16_t INVALID = std::numeric_limits<uint16_t>::max();
+ template <typename NodeType>
+ struct node_position_t {
+ typename NodeType::Ref node;
+ uint16_t pos = INVALID;
+
+ node_position_t() = default;
+ node_position_t(
+ typename NodeType::Ref node,
+ uint16_t pos)
+ : node(node), pos(pos) {}
+
+ void reset() {
+ *this = node_position_t{};
+ }
+
+ auto get_iter() {
+ assert(pos != INVALID);
+ assert(pos < node->get_size());
+ return node->iter_idx(pos);
+ }
+ };
+ boost::container::static_vector<
+ node_position_t<internal_node_t>, MAX_DEPTH> internal;
+ node_position_t<leaf_node_t> leaf;
+
+ bool at_boundary() const {
+ assert(leaf.pos <= leaf.node->get_size());
+ return leaf.pos == leaf.node->get_size();
+ }
+
+ using handle_boundary_ertr = base_iertr;
+ using handle_boundary_ret = handle_boundary_ertr::future<>;
+ handle_boundary_ret handle_boundary(
+ op_context_t<node_key_t> c,
+ mapped_space_visitor_t *visitor)
+ {
+ assert(at_boundary());
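+      // Find the shallowest ancestor with an entry to the right; if none
+      // exists, this iterator is at end().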
+ depth_t depth_with_space = 2;
+ for (; depth_with_space <= get_depth(); ++depth_with_space) {
+ if ((get_internal(depth_with_space).pos + 1) <
+ get_internal(depth_with_space).node->get_size()) {
+ break;
+ }
+ }
+
+ if (depth_with_space <= get_depth()) {
+ return seastar::do_with(
+ [](const internal_node_t &internal) { return internal.begin(); },
+ [](const leaf_node_t &leaf) { return leaf.begin(); },
+ [this, c, depth_with_space, visitor](auto &li, auto &ll) {
+ for (depth_t depth = 2; depth < depth_with_space; ++depth) {
+ get_internal(depth).reset();
+ }
+ leaf.reset();
+ get_internal(depth_with_space).pos++;
+ // note, cannot result in at_boundary() by construction
+ return lookup_depth_range(
+ c, *this, depth_with_space - 1, 0, li, ll, visitor
+ );
+ });
+ } else {
+ // end
+ return seastar::now();
+ }
+ }
+
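+    // Returns the depth from which nodes must split to make room for an
+    // insert: 0 if the leaf has space, get_depth() if a new root is needed.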
+ depth_t check_split() const {
+ if (!leaf.node->at_max_capacity()) {
+ return 0;
+ }
+ for (depth_t split_from = 1; split_from < get_depth(); ++split_from) {
+ if (!get_internal(split_from + 1).node->at_max_capacity())
+ return split_from;
+ }
+ return get_depth();
+ }
+
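+    // Returns the depth up to which nodes must merge after a removal:
+    // 0 if the leaf is still at or above minimum capacity.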
+ depth_t check_merge() const {
+ if (!leaf.node->below_min_capacity()) {
+ return 0;
+ }
+ for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) {
+ if (!get_internal(merge_from + 1).node->below_min_capacity())
+ return merge_from;
+ }
+ return get_depth();
+ }
+ };
+
+ FixedKVBtree(RootBlockRef &root_block) : root_block(root_block) {}
+
+ auto& get_root() {
+ return get_phy_tree_root<self_type>(root_block->get_root());
+ }
+
+ auto& get_root() const {
+ return get_phy_tree_root<self_type>(root_block->get_root());
+ }
+
+ template <typename T>
+ void set_root_node(const TCachedExtentRef<T> &root_node) {
+ static_assert(std::is_base_of_v<typename internal_node_t::base_t, T>);
+ link_phy_tree_root_node(root_block, root_node.get());
+ }
+
+ auto get_root_node(op_context_t<node_key_t> c) const {
+ return get_phy_tree_root_node<self_type>(root_block, c);
+ }
+
+ /// mkfs
+ using mkfs_ret = phy_tree_root_t;
+ static mkfs_ret mkfs(RootBlockRef &root_block, op_context_t<node_key_t> c) {
+ assert(root_block->is_mutation_pending());
+ auto root_leaf = c.cache.template alloc_new_extent<leaf_node_t>(
+ c.trans,
+ node_size,
+ placement_hint_t::HOT,
+ INIT_GENERATION);
+ root_leaf->set_size(0);
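+    // The fresh tree is a single empty leaf covering the entire key space.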
+ fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1};
+ root_leaf->set_meta(meta);
+ root_leaf->range = meta;
+ get_tree_stats<self_type>(c.trans).depth = 1u;
+ get_tree_stats<self_type>(c.trans).extents_num_delta++;
+ link_phy_tree_root_node(root_block, root_leaf.get());
+ return phy_tree_root_t{root_leaf->get_paddr(), 1u};
+ }
+
+ /**
+ * lower_bound
+ *
+ * @param c [in] context
+   * @param addr [in] addr
+ * @return least iterator >= key
+ */
+ iterator_fut lower_bound(
+ op_context_t<node_key_t> c,
+ node_key_t addr,
+ mapped_space_visitor_t *visitor=nullptr,
+ depth_t min_depth = 1) const
+ {
+ LOG_PREFIX(FixedKVBtree::lower_bound);
+ return lookup(
+ c,
+ [addr](const internal_node_t &internal) {
+ assert(internal.get_size() > 0);
+ auto iter = internal.upper_bound(addr);
+ assert(iter != internal.begin());
+ --iter;
+ return iter;
+ },
+ [FNAME, c, addr](const leaf_node_t &leaf) {
+ auto ret = leaf.lower_bound(addr);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "leaf addr {}, got ret offset {}, size {}, end {}",
+ c.trans,
+ addr,
+ ret.get_offset(),
+ leaf.get_size(),
+ ret == leaf.end());
+ return ret;
+ },
+ min_depth,
+ visitor
+ ).si_then([FNAME, c, min_depth](auto &&ret) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "ret.leaf.pos {}",
+ c.trans,
+ ret.leaf.pos);
+ if (min_depth == 1) {
+ ret.assert_valid();
+ }
+ return std::move(ret);
+ });
+ }
+
+
+ /**
+ * upper_bound
+ *
+ * @param c [in] context
+   * @param addr [in] addr
+ * @return least iterator > key
+ */
+ iterator_fut upper_bound(
+ op_context_t<node_key_t> c,
+ node_key_t addr
+ ) const {
+ return lower_bound(
+ c, addr
+ ).si_then([c, addr](auto iter) {
+ if (!iter.is_end() && iter.get_key() == addr) {
+ return iter.next(c);
+ } else {
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ iter);
+ }
+ });
+ }
+
+ /**
+ * upper_bound_right
+ *
+ * @param c [in] context
+ * @param addr [in] addr
+ * @return least iterator i s.t. i.get_key() + i.get_val().len > key
+ */
+ iterator_fut upper_bound_right(
+ op_context_t<node_key_t> c,
+ node_key_t addr) const
+ {
+ return lower_bound(
+ c, addr
+ ).si_then([c, addr](auto iter) {
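+      // If the mapping preceding addr extends past it, that mapping is the
+      // answer; otherwise fall back to the lower_bound result.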
+ if (iter.is_begin()) {
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ iter);
+ } else {
+ return iter.prev(
+ c
+ ).si_then([iter, addr](auto prev) {
+ if ((prev.get_key() + prev.get_val().len) > addr) {
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ prev);
+ } else {
+ return iterator_fut(
+ interruptible::ready_future_marker{},
+ iter);
+ }
+ });
+ }
+ });
+ }
+
+ iterator_fut begin(op_context_t<node_key_t> c) const {
+ return lower_bound(c, 0);
+ }
+ iterator_fut end(op_context_t<node_key_t> c) const {
+ return upper_bound(c, min_max_t<node_key_t>::max);
+ }
+
+ template <typename child_node_t, typename node_t, bool lhc = leaf_has_children,
+ typename std::enable_if<lhc, int>::type = 0>
+ void check_node(
+ op_context_t<node_key_t> c,
+ TCachedExtentRef<node_t> node)
+ {
+ assert(leaf_has_children);
+ for (auto i : *node) {
+ CachedExtentRef child_node;
+ Transaction::get_extent_ret ret;
+
+ if constexpr (std::is_base_of_v<typename internal_node_t::base_t, child_node_t>) {
+ ret = c.trans.get_extent(
+ i->get_val().maybe_relative_to(node->get_paddr()),
+ &child_node);
+ } else {
+ assert(i->get_val().pladdr.is_paddr());
+ ret = c.trans.get_extent(
+ i->get_val().pladdr.get_paddr().maybe_relative_to(node->get_paddr()),
+ &child_node);
+ }
+ if (ret == Transaction::get_extent_ret::PRESENT) {
+ if (child_node->is_stable()) {
+ assert(child_node->is_valid());
+ auto cnode = child_node->template cast<child_node_t>();
+ assert(cnode->has_parent_tracker());
+ if (node->is_pending()) {
+ auto &n = node->get_stable_for_key(i->get_key());
+ assert(cnode->get_parent_node().get() == &n);
+ auto pos = n.lower_bound_offset(i->get_key());
+ assert(pos < n.get_node_size());
+ assert(n.children[pos] == cnode.get());
+ } else {
+ assert(cnode->get_parent_node().get() == node.get());
+ assert(node->children[i->get_offset()] == cnode.get());
+ }
+ } else if (child_node->is_pending()) {
+ if (child_node->is_mutation_pending()) {
+ auto &prior = (child_node_t &)*child_node->prior_instance;
+ assert(prior.is_valid());
+ assert(prior.is_parent_valid());
+ if (node->is_mutation_pending()) {
+ auto &n = node->get_stable_for_key(i->get_key());
+ assert(prior.get_parent_node().get() == &n);
+ auto pos = n.lower_bound_offset(i->get_key());
+ assert(pos < n.get_node_size());
+ assert(n.children[pos] == &prior);
+ } else {
+ assert(prior.get_parent_node().get() == node.get());
+ assert(node->children[i->get_offset()] == &prior);
+ }
+ } else {
+ auto cnode = child_node->template cast<child_node_t>();
+ auto pos = node->find(i->get_key()).get_offset();
+ auto child = node->children[pos];
+ assert(child);
+ assert(child == cnode.get());
+ assert(cnode->is_parent_valid());
+ }
+ } else {
+ ceph_assert(!child_node->is_valid());
+ ceph_abort("impossible");
+ }
+ } else if (ret == Transaction::get_extent_ret::ABSENT) {
+ ChildableCachedExtent* child = nullptr;
+ if (node->is_pending()) {
+ auto &n = node->get_stable_for_key(i->get_key());
+ auto pos = n.lower_bound_offset(i->get_key());
+ assert(pos < n.get_node_size());
+ child = n.children[pos];
+ if (is_valid_child_ptr(child)) {
+ auto c = (child_node_t*)child;
+ assert(c->has_parent_tracker());
+ assert(c->get_parent_node().get() == &n);
+ }
+ } else {
+ child = node->children[i->get_offset()];
+ if (is_valid_child_ptr(child)) {
+ auto c = (child_node_t*)child;
+ assert(c->has_parent_tracker());
+ assert(c->get_parent_node().get() == node.get());
+ }
+ }
+
+ if (!is_valid_child_ptr(child)) {
+ if constexpr (
+ std::is_base_of_v<typename internal_node_t::base_t, child_node_t>)
+ {
+ assert(!c.cache.query_cache(i->get_val(), nullptr));
+ } else {
+ if constexpr (leaf_has_children) {
+ assert(i->get_val().pladdr.is_paddr()
+ ? (bool)!c.cache.query_cache(
+ i->get_val().pladdr.get_paddr(), nullptr)
+ : true);
+ }
+ }
+ }
+ } else {
+ ceph_abort("impossible");
+ }
+ }
+ }
+
+ using check_child_trackers_ret = base_iertr::future<>;
+ template <bool lhc = leaf_has_children,
+ typename std::enable_if<lhc, int>::type = 0>
+ check_child_trackers_ret check_child_trackers(
+ op_context_t<node_key_t> c) {
+ mapped_space_visitor_t checker = [c, this](
+ paddr_t,
+ node_key_t,
+ extent_len_t,
+ depth_t depth,
+ extent_types_t,
+ iterator& iter) {
+ if constexpr (!leaf_has_children) {
+ if (depth == 1) {
+ return seastar::now();
+ }
+ }
+ if (depth > 1) {
+ auto &node = iter.get_internal(depth).node;
+ assert(node->is_valid());
+ check_node<typename internal_node_t::base_t>(c, node);
+ } else {
+ assert(depth == 1);
+ auto &node = iter.leaf.node;
+ assert(node->is_valid());
+ check_node<LogicalCachedExtent>(c, node);
+ }
+ return seastar::now();
+ };
+
+ return seastar::do_with(
+ std::move(checker),
+ [this, c](auto &checker) {
+ return iterate_repeat(
+ c,
+ lower_bound(
+ c,
+ min_max_t<node_key_t>::min,
+ &checker),
+ [](auto &pos) {
+ if (pos.is_end()) {
+ return base_iertr::make_ready_future<
+ seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ return base_iertr::make_ready_future<
+ seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ },
+ &checker);
+ });
+ }
+
+ using iterate_repeat_ret_inner = base_iertr::future<
+ seastar::stop_iteration>;
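+  // Applies f to successive positions starting from iter_fut, advancing
+  // with next() until f returns stop_iteration::yes.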
+ template <typename F>
+ static base_iertr::future<> iterate_repeat(
+ op_context_t<node_key_t> c,
+ iterator_fut &&iter_fut,
+ F &&f,
+ mapped_space_visitor_t *visitor=nullptr) {
+ return std::move(
+ iter_fut
+ ).si_then([c, visitor, f=std::forward<F>(f)](auto iter) {
+ return seastar::do_with(
+ iter,
+ std::move(f),
+ [c, visitor](auto &pos, auto &f) {
+ return trans_intr::repeat(
+ [c, visitor, &f, &pos] {
+ return f(
+ pos
+ ).si_then([c, visitor, &pos](auto done) {
+ if (done == seastar::stop_iteration::yes) {
+ return iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ } else {
+ ceph_assert(!pos.is_end());
+ return pos.next(
+ c, visitor
+ ).si_then([&pos](auto next) {
+ pos = next;
+ return iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ });
+ }
+ });
+ });
+ });
+ });
+ }
+
+ /**
+ * insert
+ *
+   * Inserts val at laddr with iter as a hint. If an element already exists
+   * at laddr, returns an iterator to that element unchanged along with false.
+   *
+   * Invalidates all outstanding iterators for this tree on this transaction.
+   *
+   * @param c [in] op context
+   * @param iter [in] hint; insertion is constant if immediately prior to iter
+   * @param laddr [in] addr at which to insert
+   * @param val [in] val to insert
+   * @return pair<iter, bool> where iter points to the element at laddr, bool
+   * true iff an element at laddr did not already exist.
+ */
+ using insert_iertr = base_iertr;
+ using insert_ret = insert_iertr::future<std::pair<iterator, bool>>;
+ insert_ret insert(
+ op_context_t<node_key_t> c,
+ iterator iter,
+ node_key_t laddr,
+ node_val_t val,
+ LogicalCachedExtent* nextent
+ ) {
+ LOG_PREFIX(FixedKVBtree::insert);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "inserting laddr {} at iter {}",
+ c.trans,
+ laddr,
+ iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key());
+ return seastar::do_with(
+ iter,
+ [this, c, laddr, val, nextent](auto &ret) {
+ return find_insertion(
+ c, laddr, ret
+ ).si_then([this, c, laddr, val, &ret, nextent] {
+ if (!ret.at_boundary() && ret.get_key() == laddr) {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ std::make_pair(ret, false));
+ } else {
+ ++(get_tree_stats<self_type>(c.trans).num_inserts);
+ return handle_split(
+ c, ret
+ ).si_then([c, laddr, val, &ret, nextent] {
+ if (!ret.leaf.node->is_mutable()) {
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans, ret.leaf.node
+ );
+ ret.leaf.node = mut->cast<leaf_node_t>();
+ }
+ auto iter = typename leaf_node_t::const_iterator(
+ ret.leaf.node.get(), ret.leaf.pos);
+ assert(iter == ret.leaf.node->lower_bound(laddr));
+ assert(iter == ret.leaf.node->end() || iter->get_key() > laddr);
+ assert(laddr >= ret.leaf.node->get_meta().begin &&
+ laddr < ret.leaf.node->get_meta().end);
+ ret.leaf.node->insert(iter, laddr, val, nextent);
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ std::make_pair(ret, true));
+ });
+ }
+ });
+ });
+ }
+
+ insert_ret insert(
+ op_context_t<node_key_t> c,
+ node_key_t laddr,
+ node_val_t val,
+ LogicalCachedExtent* nextent) {
+ return lower_bound(
+ c, laddr
+ ).si_then([this, c, laddr, val, nextent](auto iter) {
+ return this->insert(c, iter, laddr, val, nextent);
+ });
+ }
+
+ /**
+ * update
+ *
+ * Invalidates all outstanding iterators for this tree on this transaction.
+ *
+ * @param c [in] op context
+ * @param iter [in] iterator to element to update, must not be end
+ * @param val [in] val with which to update
+ * @return iterator to newly updated element
+ */
+ using update_iertr = base_iertr;
+ using update_ret = update_iertr::future<iterator>;
+ update_ret update(
+ op_context_t<node_key_t> c,
+ iterator iter,
+ node_val_t val,
+ LogicalCachedExtent* nextent)
+ {
+ LOG_PREFIX(FixedKVBtree::update);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "update element at {}",
+ c.trans,
+ iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key());
+ if (!iter.leaf.node->is_mutable()) {
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans, iter.leaf.node
+ );
+ iter.leaf.node = mut->cast<leaf_node_t>();
+ }
+ ++(get_tree_stats<self_type>(c.trans).num_updates);
+ iter.leaf.node->update(
+ iter.leaf.node->iter_idx(iter.leaf.pos),
+ val,
+ nextent);
+ return update_ret(
+ interruptible::ready_future_marker{},
+ iter);
+ }
+
+
+ /**
+ * remove
+ *
+ * Invalidates all outstanding iterators for this tree on this transaction.
+ *
+ * @param c [in] op context
+ * @param iter [in] iterator to element to remove, must not be end
+ */
+ using remove_iertr = base_iertr;
+ using remove_ret = remove_iertr::future<>;
+ remove_ret remove(
+ op_context_t<node_key_t> c,
+ iterator iter)
+ {
+ LOG_PREFIX(FixedKVBtree::remove);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "remove element at {}",
+ c.trans,
+ iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key());
+ assert(!iter.is_end());
+ ++(get_tree_stats<self_type>(c.trans).num_erases);
+ return seastar::do_with(
+ iter,
+ [this, c](auto &ret) {
+ if (!ret.leaf.node->is_mutable()) {
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans, ret.leaf.node
+ );
+ ret.leaf.node = mut->cast<leaf_node_t>();
+ }
+ ret.leaf.node->remove(
+ ret.leaf.node->iter_idx(ret.leaf.pos));
+
+ return handle_merge(
+ c, ret
+ );
+ });
+ }
+
+ /**
+ * init_cached_extent
+ *
+ * Checks whether e is live (reachable from fixed kv tree) and drops or initializes
+ * accordingly.
+ *
+   * Returns whether e is live.
+ */
+ using init_cached_extent_iertr = base_iertr;
+ using init_cached_extent_ret = init_cached_extent_iertr::future<bool>;
+ init_cached_extent_ret init_cached_extent(
+ op_context_t<node_key_t> c,
+ CachedExtentRef e)
+ {
+ assert(!e->is_logical());
+ LOG_PREFIX(FixedKVTree::init_cached_extent);
+ SUBTRACET(seastore_fixedkv_tree, "extent {}", c.trans, *e);
+ if (e->get_type() == internal_node_t::TYPE) {
+ auto eint = e->cast<internal_node_t>();
+ return lower_bound(
+ c, eint->get_node_meta().begin
+ ).si_then([e, c, eint](auto iter) {
+ // Note, this check is valid even if iter.is_end()
+ LOG_PREFIX(FixedKVTree::init_cached_extent);
+ depth_t cand_depth = eint->get_node_meta().depth;
+ if (cand_depth <= iter.get_depth() &&
+ &*iter.get_internal(cand_depth).node == &*eint) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent {} is live",
+ c.trans,
+ *eint);
+ return true;
+ } else {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent {} is not live",
+ c.trans,
+ *eint);
+ return false;
+ }
+ });
+ } else if (e->get_type() == leaf_node_t::TYPE) {
+ auto eleaf = e->cast<leaf_node_t>();
+ return lower_bound(
+ c, eleaf->get_node_meta().begin
+ ).si_then([c, e, eleaf](auto iter) {
+ // Note, this check is valid even if iter.is_end()
+ LOG_PREFIX(FixedKVTree::init_cached_extent);
+ if (iter.leaf.node == &*eleaf) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent {} is live",
+ c.trans,
+ *eleaf);
+ return true;
+ } else {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent {} is not live",
+ c.trans,
+ *eleaf);
+ return false;
+ }
+ });
+ } else {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "found other extent {} type {}",
+ c.trans,
+ *e,
+ e->get_type());
+ return init_cached_extent_ret(
+ interruptible::ready_future_marker{},
+ true);
+ }
+ }
+
+ /// get_leaf_if_live: get leaf node at laddr/addr if still live
+ using get_leaf_if_live_iertr = base_iertr;
+ using get_leaf_if_live_ret = get_leaf_if_live_iertr::future<CachedExtentRef>;
+ get_leaf_if_live_ret get_leaf_if_live(
+ op_context_t<node_key_t> c,
+ paddr_t addr,
+ node_key_t laddr,
+ extent_len_t len)
+ {
+ LOG_PREFIX(FixedKVBtree::get_leaf_if_live);
+ return lower_bound(
+ c, laddr
+ ).si_then([FNAME, c, addr, laddr, len](auto iter) {
+ if (iter.leaf.node->get_paddr() == addr) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent laddr {} addr {}~{} found: {}",
+ c.trans,
+ laddr,
+ addr,
+ len,
+ *iter.leaf.node);
+ return CachedExtentRef(iter.leaf.node);
+ } else {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent laddr {} addr {}~{} is not live, does not match node {}",
+ c.trans,
+ laddr,
+ addr,
+ len,
+ *iter.leaf.node);
+ return CachedExtentRef();
+ }
+ });
+ }
+
+
+ /// get_internal_if_live: get internal node at laddr/addr if still live
+ using get_internal_if_live_iertr = base_iertr;
+ using get_internal_if_live_ret = get_internal_if_live_iertr::future<CachedExtentRef>;
+ get_internal_if_live_ret get_internal_if_live(
+ op_context_t<node_key_t> c,
+ paddr_t addr,
+ node_key_t laddr,
+ extent_len_t len)
+ {
+ LOG_PREFIX(FixedKVBtree::get_internal_if_live);
+ return lower_bound(
+ c, laddr
+ ).si_then([FNAME, c, addr, laddr, len](auto iter) {
+ for (depth_t d = 2; d <= iter.get_depth(); ++d) {
+ CachedExtent &node = *iter.get_internal(d).node;
+ auto internal_node = node.cast<internal_node_t>();
+ if (internal_node->get_paddr() == addr) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent laddr {} addr {}~{} found: {}",
+ c.trans,
+ laddr,
+ addr,
+ len,
+ *internal_node);
+ assert(internal_node->get_node_meta().begin == laddr);
+ return CachedExtentRef(internal_node);
+ }
+ }
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "extent laddr {} addr {}~{} is not live, no matching internal node",
+ c.trans,
+ laddr,
+ addr,
+ len);
+ return CachedExtentRef();
+ });
+ }
+
+
+ /**
+ * rewrite_extent
+ *
+ * Rewrites a fresh copy of extent into transaction and updates internal
+ * references.
+ */
+ using rewrite_extent_iertr = base_iertr;
+ using rewrite_extent_ret = rewrite_extent_iertr::future<>;
+ rewrite_extent_ret rewrite_extent(
+ op_context_t<node_key_t> c,
+ CachedExtentRef e) {
+ LOG_PREFIX(FixedKVBtree::rewrite_extent);
+ assert(is_lba_backref_node(e->get_type()));
+
+ auto do_rewrite = [&](auto &fixed_kv_extent) {
+ auto n_fixed_kv_extent = c.cache.template alloc_new_extent<
+ std::remove_reference_t<decltype(fixed_kv_extent)>
+ >(
+ c.trans,
+ fixed_kv_extent.get_length(),
+ fixed_kv_extent.get_user_hint(),
+ // get target rewrite generation
+ fixed_kv_extent.get_rewrite_generation());
+ fixed_kv_extent.get_bptr().copy_out(
+ 0,
+ fixed_kv_extent.get_length(),
+ n_fixed_kv_extent->get_bptr().c_str());
+ n_fixed_kv_extent->set_modify_time(fixed_kv_extent.get_modify_time());
+ n_fixed_kv_extent->range = n_fixed_kv_extent->get_node_meta();
+
+ if (fixed_kv_extent.get_type() == internal_node_t::TYPE ||
+ leaf_node_t::do_has_children) {
+ if (!fixed_kv_extent.is_pending()) {
+ n_fixed_kv_extent->copy_sources.emplace(&fixed_kv_extent);
+ n_fixed_kv_extent->prior_instance = &fixed_kv_extent;
+ } else {
+ ceph_assert(fixed_kv_extent.is_mutation_pending());
+ n_fixed_kv_extent->copy_sources.emplace(
+ (typename internal_node_t::base_t*
+ )fixed_kv_extent.get_prior_instance().get());
+ n_fixed_kv_extent->children = std::move(fixed_kv_extent.children);
+ n_fixed_kv_extent->prior_instance = fixed_kv_extent.get_prior_instance();
+ n_fixed_kv_extent->adjust_ptracker_for_children();
+ }
+ }
+
+ /* This is a bit underhanded. Any relative addrs here must necessarily
+ * be record relative as we are rewriting a dirty extent. Thus, we
+ * are using resolve_relative_addrs with a (likely negative) block
+ * relative offset to correct them to block-relative offsets adjusted
+ * for our new transaction location.
+ *
+       * Upon commit, these now block-relative addresses will be interpreted
+ * against the real final address.
+ */
+ if (!n_fixed_kv_extent->get_paddr().is_absolute()) {
+ // backend_type_t::SEGMENTED
+ assert(n_fixed_kv_extent->get_paddr().is_record_relative());
+ n_fixed_kv_extent->resolve_relative_addrs(
+ make_record_relative_paddr(0).block_relative_to(
+ n_fixed_kv_extent->get_paddr()));
+ } // else: backend_type_t::RANDOM_BLOCK
+
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "rewriting {} into {}",
+ c.trans,
+ fixed_kv_extent,
+ *n_fixed_kv_extent);
+
+ return update_internal_mapping(
+ c,
+ n_fixed_kv_extent->get_node_meta().depth,
+ n_fixed_kv_extent->get_node_meta().begin,
+ e->get_paddr(),
+ n_fixed_kv_extent->get_paddr(),
+ n_fixed_kv_extent
+ ).si_then([c, e] {
+ c.cache.retire_extent(c.trans, e);
+ });
+ };
+
+ CachedExtentRef n_fixed_kv_extent;
+ if (e->get_type() == internal_node_t::TYPE) {
+ auto lint = e->cast<internal_node_t>();
+ return do_rewrite(*lint);
+ } else {
+ assert(e->get_type() == leaf_node_t::TYPE);
+ auto lleaf = e->cast<leaf_node_t>();
+ return do_rewrite(*lleaf);
+ }
+ }
+
+ using update_internal_mapping_iertr = base_iertr;
+ using update_internal_mapping_ret = update_internal_mapping_iertr::future<>;
+ update_internal_mapping_ret update_internal_mapping(
+ op_context_t<node_key_t> c,
+ depth_t depth,
+ node_key_t laddr,
+ paddr_t old_addr,
+ paddr_t new_addr,
+ typename internal_node_t::base_ref nextent)
+ {
+ LOG_PREFIX(FixedKVBtree::update_internal_mapping);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "updating laddr {} at depth {} from {} to {}, nextent {}",
+ c.trans,
+ laddr,
+ depth,
+ old_addr,
+ new_addr,
+ *nextent);
+
+ return lower_bound(
+ c, laddr, nullptr, depth + 1
+ ).si_then([=, this](auto iter) {
+ assert(iter.get_depth() >= depth);
+ if (depth == iter.get_depth()) {
+ SUBTRACET(seastore_fixedkv_tree, "update at root", c.trans);
+
+ if (laddr != min_max_t<node_key_t>::min) {
+ SUBERRORT(
+ seastore_fixedkv_tree,
+ "updating root laddr {} at depth {} from {} to {},"
+ "laddr is not 0",
+ c.trans,
+ laddr,
+ depth,
+ old_addr,
+ new_addr,
+ get_root().get_location());
+ ceph_assert(0 == "impossible");
+ }
+
+ if (get_root().get_location() != old_addr) {
+ SUBERRORT(
+ seastore_fixedkv_tree,
+ "updating root laddr {} at depth {} from {} to {},"
+ "root addr {} does not match",
+ c.trans,
+ laddr,
+ depth,
+ old_addr,
+ new_addr,
+ get_root().get_location());
+ ceph_assert(0 == "impossible");
+ }
+
+ root_block = c.cache.duplicate_for_write(
+ c.trans, root_block)->template cast<RootBlock>();
+ get_root().set_location(new_addr);
+ set_root_node(nextent);
+ } else {
+ auto &parent = iter.get_internal(depth + 1);
+ assert(parent.node);
+ assert(parent.pos < parent.node->get_size());
+ auto piter = parent.node->iter_idx(parent.pos);
+
+ if (piter->get_key() != laddr) {
+ SUBERRORT(
+ seastore_fixedkv_tree,
+ "updating laddr {} at depth {} from {} to {},"
+ "node {} pos {} val pivot addr {} does not match",
+ c.trans,
+ laddr,
+ depth,
+ old_addr,
+ new_addr,
+ *(parent.node),
+ parent.pos,
+ piter->get_key());
+ ceph_assert(0 == "impossible");
+ }
+
+
+ if (piter->get_val() != old_addr) {
+ SUBERRORT(
+ seastore_fixedkv_tree,
+ "updating laddr {} at depth {} from {} to {},"
+ "node {} pos {} val addr {} does not match",
+ c.trans,
+ laddr,
+ depth,
+ old_addr,
+ new_addr,
+ *(parent.node),
+ parent.pos,
+ piter->get_val());
+ ceph_assert(0 == "impossible");
+ }
+
+ CachedExtentRef mut = c.cache.duplicate_for_write(
+ c.trans,
+ parent.node
+ );
+ typename internal_node_t::Ref mparent = mut->cast<internal_node_t>();
+ mparent->update(piter, new_addr, nextent.get());
+
+        /* Note, iter is now invalid as we didn't update either the parent
+ * node reference to the new mutable instance nor did we update the
+ * child pointer to the new node. Not a problem as we'll now just
+ * destruct it.
+ */
+ }
+ return seastar::now();
+ });
+ }
+
+
+private:
+ RootBlockRef root_block;
+
+ template <typename T>
+ using node_position_t = typename iterator::template node_position_t<T>;
+
+ using get_internal_node_iertr = base_iertr;
+ using get_internal_node_ret = get_internal_node_iertr::future<InternalNodeRef>;
+ static get_internal_node_ret get_internal_node(
+ op_context_t<node_key_t> c,
+ depth_t depth,
+ paddr_t offset,
+ node_key_t begin,
+ node_key_t end,
+ typename std::optional<node_position_t<internal_node_t>> parent_pos)
+ {
+ LOG_PREFIX(FixedKVBtree::get_internal_node);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "reading internal at offset {}, depth {}, begin {}, end {}",
+ c.trans,
+ offset,
+ depth,
+ begin,
+ end);
+ assert(depth > 1);
+ auto init_internal = [c, depth, begin, end,
+ parent_pos=std::move(parent_pos)]
+ (internal_node_t &node) {
+ assert(!node.is_pending());
+ assert(!node.is_linked());
+ node.range = fixed_kv_node_meta_t<node_key_t>{begin, end, depth};
+ if (parent_pos) {
+ auto &parent = parent_pos->node;
+ parent->link_child(&node, parent_pos->pos);
+ } else {
+ assert(node.range.is_root());
+ auto root_block = c.cache.get_root_fast(c.trans);
+ if (root_block->is_mutation_pending()) {
+ auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance();
+ link_phy_tree_root_node(stable_root, &node);
+ } else {
+ assert(!root_block->is_pending());
+ link_phy_tree_root_node(root_block, &node);
+ }
+ }
+ };
+ return c.cache.template get_absent_extent<internal_node_t>(
+ c.trans,
+ offset,
+ node_size,
+ init_internal
+ ).si_then([FNAME, c, offset, init_internal, depth, begin, end](
+ typename internal_node_t::Ref ret) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "read internal at offset {} {}",
+ c.trans,
+ offset,
+ *ret);
+ // This can only happen during init_cached_extent
+      // or when a backref extent is being rewritten by gc space reclaiming
+ if (!ret->is_pending() && !ret->is_linked()) {
+ assert(ret->is_dirty()
+ || (is_backref_node(ret->get_type())
+ && ret->is_clean()));
+ init_internal(*ret);
+ }
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ ceph_assert(depth == meta.depth);
+ ceph_assert(begin == meta.begin);
+ ceph_assert(end == meta.end);
+ return get_internal_node_ret(
+ interruptible::ready_future_marker{},
+ ret);
+ });
+ }
+
+
+ using get_leaf_node_iertr = base_iertr;
+ using get_leaf_node_ret = get_leaf_node_iertr::future<LeafNodeRef>;
+ static get_leaf_node_ret get_leaf_node(
+ op_context_t<node_key_t> c,
+ paddr_t offset,
+ node_key_t begin,
+ node_key_t end,
+ typename std::optional<node_position_t<leaf_node_t>> parent_pos)
+ {
+ LOG_PREFIX(FixedKVBtree::get_leaf_node);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "reading leaf at offset {}, begin {}, end {}",
+ c.trans,
+ offset,
+ begin,
+ end);
+ auto init_leaf = [c, begin, end,
+ parent_pos=std::move(parent_pos)]
+ (leaf_node_t &node) {
+ assert(!node.is_pending());
+ assert(!node.is_linked());
+ node.range = fixed_kv_node_meta_t<node_key_t>{begin, end, 1};
+ if (parent_pos) {
+ auto &parent = parent_pos->node;
+ parent->link_child(&node, parent_pos->pos);
+ } else {
+ assert(node.range.is_root());
+ auto root_block = c.cache.get_root_fast(c.trans);
+ if (root_block->is_mutation_pending()) {
+ auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance();
+ link_phy_tree_root_node(stable_root, &node);
+ } else {
+ assert(!root_block->is_pending());
+ link_phy_tree_root_node(root_block, &node);
+ }
+ }
+ };
+ return c.cache.template get_absent_extent<leaf_node_t>(
+ c.trans,
+ offset,
+ node_size,
+ init_leaf
+ ).si_then([FNAME, c, offset, init_leaf, begin, end]
+ (typename leaf_node_t::Ref ret) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "read leaf at offset {} {}",
+ c.trans,
+ offset,
+ *ret);
+ // This can only happen during init_cached_extent
+      // or when a backref extent is being rewritten by gc space reclaiming
+ if (!ret->is_pending() && !ret->is_linked()) {
+ assert(ret->is_dirty()
+ || (is_backref_node(ret->get_type())
+ && ret->is_clean()));
+ init_leaf(*ret);
+ }
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ ceph_assert(1 == meta.depth);
+ ceph_assert(begin == meta.begin);
+ ceph_assert(end == meta.end);
+ return get_leaf_node_ret(
+ interruptible::ready_future_marker{},
+ ret);
+ });
+ }
+
+ using lookup_root_iertr = base_iertr;
+ using lookup_root_ret = lookup_root_iertr::future<>;
+ lookup_root_ret lookup_root(
+ op_context_t<node_key_t> c,
+ iterator &iter,
+ mapped_space_visitor_t *visitor) const {
+ LOG_PREFIX(FixedKVBtree::lookup_root);
+ SUBTRACET(seastore_fixedkv_tree,
+ "looking up root on {}",
+ c.trans,
+ *root_block);
+ auto [found, fut] = get_root_node(c);
+
+ auto on_found_internal =
+ [this, visitor, &iter](InternalNodeRef &root_node) {
+ iter.get_internal(get_root().get_depth()).node = root_node;
+ if (visitor) (*visitor)(
+ root_node->get_paddr(),
+ root_node->get_node_meta().begin,
+ root_node->get_length(),
+ get_root().get_depth(),
+ internal_node_t::TYPE,
+ iter);
+ return lookup_root_iertr::now();
+ };
+ auto on_found_leaf =
+ [visitor, &iter, this](LeafNodeRef root_node) {
+ iter.leaf.node = root_node;
+ if (visitor) (*visitor)(
+ root_node->get_paddr(),
+ root_node->get_node_meta().begin,
+ root_node->get_length(),
+ get_root().get_depth(),
+ leaf_node_t::TYPE,
+ iter);
+ return lookup_root_iertr::now();
+ };
+
+ if (found) {
+ return fut.si_then(
+ [this, c, on_found_internal=std::move(on_found_internal),
+ on_found_leaf=std::move(on_found_leaf)](auto root) {
+ LOG_PREFIX(FixedKVBtree::lookup_root);
+ ceph_assert(root);
+ SUBTRACET(seastore_fixedkv_tree,
+ "got root node on {}, res: {}",
+ c.trans,
+ *root_block,
+ *root);
+
+ if (get_root().get_depth() > 1) {
+ auto root_node = root->template cast<internal_node_t>();
+ return on_found_internal(root_node);
+ } else {
+ auto root_node = root->template cast<leaf_node_t>();
+ return on_found_leaf(root_node);
+ }
+ });
+ } else {
+ if (get_root().get_depth() > 1) {
+ return get_internal_node(
+ c,
+ get_root().get_depth(),
+ get_root().get_location(),
+ min_max_t<node_key_t>::min,
+ min_max_t<node_key_t>::max,
+ std::nullopt
+ ).si_then([on_found=std::move(on_found_internal)](InternalNodeRef root_node) {
+ return on_found(root_node);
+ });
+ } else {
+ return get_leaf_node(
+ c,
+ get_root().get_location(),
+ min_max_t<node_key_t>::min,
+ min_max_t<node_key_t>::max,
+ std::nullopt
+ ).si_then([on_found=std::move(on_found_leaf)](LeafNodeRef root_node) {
+ return on_found(root_node);
+ });
+ }
+ }
+ }
+
+ using lookup_internal_level_iertr = base_iertr;
+ using lookup_internal_level_ret = lookup_internal_level_iertr::future<>;
+ template <typename F>
+ static lookup_internal_level_ret lookup_internal_level(
+ op_context_t<node_key_t> c,
+ depth_t depth,
+ iterator &iter,
+ F &f,
+ mapped_space_visitor_t *visitor
+ ) {
+ assert(depth > 1);
+ auto &parent_entry = iter.get_internal(depth + 1);
+ auto parent = parent_entry.node;
+ auto node_iter = parent->iter_idx(parent_entry.pos);
+
+ auto on_found = [depth, visitor, &iter, &f](InternalNodeRef node) {
+ auto &entry = iter.get_internal(depth);
+ entry.node = node;
+ auto node_iter = f(*node);
+ assert(node_iter != node->end());
+ entry.pos = node_iter->get_offset();
+ if (visitor)
+ (*visitor)(
+ node->get_paddr(),
+ node->get_node_meta().begin,
+ node->get_length(),
+ depth,
+ node->get_type(),
+ iter);
+ return seastar::now();
+ };
+
+ auto v = parent->template get_child<internal_node_t>(c, node_iter);
+ if (v.has_child()) {
+ return v.get_child_fut().safe_then(
+ [on_found=std::move(on_found), node_iter, c,
+ parent_entry](auto child) mutable {
+ LOG_PREFIX(FixedKVBtree::lookup_internal_level);
+ SUBTRACET(seastore_fixedkv_tree,
+ "got child on {}, pos: {}, res: {}",
+ c.trans,
+ *parent_entry.node,
+ parent_entry.pos,
+ *child);
+ auto &cnode = (typename internal_node_t::base_t &)*child;
+ assert(cnode.get_node_meta().begin == node_iter.get_key());
+ assert(cnode.get_node_meta().end > node_iter.get_key());
+ return on_found(child->template cast<internal_node_t>());
+ });
+ }
+
+ auto child_pos = v.get_child_pos();
+ auto next_iter = node_iter + 1;
+ auto begin = node_iter->get_key();
+ auto end = next_iter == parent->end()
+ ? parent->get_node_meta().end
+ : next_iter->get_key();
+ return get_internal_node(
+ c,
+ depth,
+ node_iter->get_val().maybe_relative_to(parent->get_paddr()),
+ begin,
+ end,
+ std::make_optional<node_position_t<internal_node_t>>(
+ child_pos.template get_parent<internal_node_t>(),
+ child_pos.get_pos())
+ ).si_then([on_found=std::move(on_found)](InternalNodeRef node) {
+ return on_found(node);
+ });
+ }
+
+ using lookup_leaf_iertr = base_iertr;
+ using lookup_leaf_ret = lookup_leaf_iertr::future<>;
+ template <typename F>
+ static lookup_internal_level_ret lookup_leaf(
+ op_context_t<node_key_t> c,
+ iterator &iter,
+ F &f,
+ mapped_space_visitor_t *visitor
+ ) {
+ auto &parent_entry = iter.get_internal(2);
+ auto parent = parent_entry.node;
+ assert(parent);
+ auto node_iter = parent->iter_idx(parent_entry.pos);
+
+ auto on_found = [visitor, &iter, &f](LeafNodeRef node) {
+ iter.leaf.node = node;
+ auto node_iter = f(*node);
+ iter.leaf.pos = node_iter->get_offset();
+ if (visitor)
+ (*visitor)(
+ node->get_paddr(),
+ node->get_node_meta().begin,
+ node->get_length(),
+ 1,
+ node->get_type(),
+ iter);
+ return seastar::now();
+ };
+
+ auto v = parent->template get_child<leaf_node_t>(c, node_iter);
+ if (v.has_child()) {
+ return v.get_child_fut().safe_then(
+ [on_found=std::move(on_found), node_iter, c,
+ parent_entry](auto child) mutable {
+ LOG_PREFIX(FixedKVBtree::lookup_leaf);
+ SUBTRACET(seastore_fixedkv_tree,
+ "got child on {}, pos: {}, res: {}",
+ c.trans,
+ *parent_entry.node,
+ parent_entry.pos,
+ *child);
+ auto &cnode = (typename internal_node_t::base_t &)*child;
+ assert(cnode.get_node_meta().begin == node_iter.get_key());
+ assert(cnode.get_node_meta().end > node_iter.get_key());
+ return on_found(child->template cast<leaf_node_t>());
+ });
+ }
+
+ auto child_pos = v.get_child_pos();
+ auto next_iter = node_iter + 1;
+ auto begin = node_iter->get_key();
+ auto end = next_iter == parent->end()
+ ? parent->get_node_meta().end
+ : next_iter->get_key();
+
+ return get_leaf_node(
+ c,
+ node_iter->get_val().maybe_relative_to(parent->get_paddr()),
+ begin,
+ end,
+ std::make_optional<node_position_t<leaf_node_t>>(
+ child_pos.template get_parent<leaf_node_t>(),
+ child_pos.get_pos())
+ ).si_then([on_found=std::move(on_found)](LeafNodeRef node) {
+ return on_found(node);
+ });
+ }
+
+ /**
+ * lookup_depth_range
+ *
+   * Performs node lookups on depths [from, to) using li and ll to select
+   * the target at each level. Note, this may leave the iterator
+   * at_boundary(); call handle_boundary() prior to returning out
+   * of FixedKVBtree.
+ */
+ using lookup_depth_range_iertr = base_iertr;
+ using lookup_depth_range_ret = lookup_depth_range_iertr::future<>;
+ template <typename LI, typename LL>
+ static lookup_depth_range_ret lookup_depth_range(
+ op_context_t<node_key_t> c, ///< [in] context
+ iterator &iter, ///< [in,out] iterator to populate
+ depth_t from, ///< [in] from inclusive
+ depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop)
+ LI &li, ///< [in] internal->iterator
+ LL &ll, ///< [in] leaf->iterator
+ mapped_space_visitor_t *visitor ///< [in] mapped space visitor
+ ) {
+ LOG_PREFIX(FixedKVBtree::lookup_depth_range);
+ SUBTRACET(seastore_fixedkv_tree, "{} -> {}", c.trans, from, to);
+ return seastar::do_with(
+ from,
+ [c, to, visitor, &iter, &li, &ll](auto &d) {
+ return trans_intr::repeat(
+ [c, to, visitor, &iter, &li, &ll, &d] {
+ if (d > to) {
+ return [&] {
+ if (d > 1) {
+ return lookup_internal_level(
+ c,
+ d,
+ iter,
+ li,
+ visitor);
+ } else {
+ assert(d == 1);
+ return lookup_leaf(
+ c,
+ iter,
+ ll,
+ visitor);
+ }
+ }().si_then([&d] {
+ --d;
+ return lookup_depth_range_iertr::make_ready_future<
+ seastar::stop_iteration
+ >(seastar::stop_iteration::no);
+ });
+ } else {
+ return lookup_depth_range_iertr::make_ready_future<
+ seastar::stop_iteration
+ >(seastar::stop_iteration::yes);
+ }
+ });
+ });
+ }
+
+ using lookup_iertr = base_iertr;
+ using lookup_ret = lookup_iertr::future<iterator>;
+ template <typename LI, typename LL>
+ lookup_ret lookup(
+ op_context_t<node_key_t> c,
+ LI &&lookup_internal,
+ LL &&lookup_leaf,
+ depth_t min_depth,
+ mapped_space_visitor_t *visitor
+ ) const {
+ LOG_PREFIX(FixedKVBtree::lookup);
+ assert(min_depth > 0);
+ return seastar::do_with(
+ iterator{get_root().get_depth()},
+ std::forward<LI>(lookup_internal),
+ std::forward<LL>(lookup_leaf),
+ [FNAME, this, visitor, c, min_depth](auto &iter, auto &li, auto &ll) {
+ return lookup_root(
+ c, iter, visitor
+ ).si_then([FNAME, this, visitor, c, &iter, &li, &ll, min_depth] {
+ if (iter.get_depth() > 1) {
+ auto &root_entry = *(iter.internal.rbegin());
+ root_entry.pos = li(*(root_entry.node)).get_offset();
+ } else {
+ auto &root_entry = iter.leaf;
+ auto riter = ll(*(root_entry.node));
+ root_entry.pos = riter->get_offset();
+ }
+ SUBTRACET(seastore_fixedkv_tree, "got root, depth {}",
+ c.trans, get_root().get_depth());
+ return lookup_depth_range(
+ c,
+ iter,
+ get_root().get_depth() - 1,
+ min_depth - 1,
+ li,
+ ll,
+ visitor
+ ).si_then([c, visitor, &iter, min_depth] {
+ // It's only when the lookup is triggered by
+ // update_internal_mapping() that min_depth is
+ // NOT 1
+ if (min_depth == 1 && iter.at_boundary()) {
+ return iter.handle_boundary(c, visitor);
+ } else {
+ return lookup_iertr::now();
+ }
+ });
+ }).si_then([&iter] {
+ return std::move(iter);
+ });
+ });
+ }
+
+ /**
+ * find_insertion
+ *
+ * Prepare iter for insertion. iter should begin pointing at
+ * the valid insertion point (lower_bound(laddr)).
+ *
+ * Upon completion, iter will point at the
+ * position at which laddr should be inserted. iter may, upon completion,
+ * point at the end of a leaf other than the end leaf if that's the correct
+ * insertion point.
+ */
+ using find_insertion_iertr = base_iertr;
+ using find_insertion_ret = find_insertion_iertr::future<>;
+ static find_insertion_ret find_insertion(
+ op_context_t<node_key_t> c,
+ node_key_t laddr,
+ iterator &iter)
+ {
+ assert(iter.is_end() || iter.get_key() >= laddr);
+ if (!iter.is_end() && iter.get_key() == laddr) {
+ return seastar::now();
+ } else if (iter.leaf.node->get_node_meta().begin <= laddr) {
+#ifndef NDEBUG
+ auto p = iter;
+ if (p.leaf.pos > 0) {
+ --p.leaf.pos;
+ assert(p.get_key() < laddr);
+ }
+#endif
+ return seastar::now();
+ } else {
+ assert(iter.leaf.pos == 0);
+ return iter.prev(
+ c
+ ).si_then([laddr, &iter](auto p) {
+ boost::ignore_unused(laddr); // avoid clang warning;
+ assert(p.leaf.node->get_node_meta().begin <= laddr);
+ assert(p.get_key() < laddr);
+ // Note, this is specifically allowed to violate the iterator
+ // invariant that pos is a valid index for the node in the event
+ // that the insertion point is at the end of a node.
+ p.leaf.pos++;
+ assert(p.at_boundary());
+ iter = p;
+ return seastar::now();
+ });
+ }
+ }
+
+ /**
+ * handle_split
+ *
+ * Split nodes in iter as needed for insertion. First, scan iter from leaf
+ * to find first non-full level. Then, split from there towards leaf.
+ *
+ * Upon completion, iter will point at the newly split insertion point. As
+ * with find_insertion, iter's leaf pointer may be end without iter being
+ * end.
+ */
+ using handle_split_iertr = base_iertr;
+ using handle_split_ret = handle_split_iertr::future<>;
+ handle_split_ret handle_split(
+ op_context_t<node_key_t> c,
+ iterator &iter)
+ {
+ LOG_PREFIX(FixedKVBtree::handle_split);
+
+ depth_t split_from = iter.check_split();
+
+ SUBTRACET(seastore_fixedkv_tree, "split_from {}, depth {}", c.trans, split_from, iter.get_depth());
+
+ if (split_from == iter.get_depth()) {
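+      // The root itself is full: grow the tree by allocating a new root one
+      // level deeper whose single entry points at the old root.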
+ auto nroot = c.cache.template alloc_new_extent<internal_node_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ fixed_kv_node_meta_t<node_key_t> meta{
+ min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1};
+ nroot->set_meta(meta);
+ nroot->range = meta;
+ nroot->journal_insert(
+ nroot->begin(),
+ min_max_t<node_key_t>::min,
+ get_root().get_location(),
+ nullptr);
+ iter.internal.push_back({nroot, 0});
+
+ get_tree_stats<self_type>(c.trans).depth = iter.get_depth();
+ get_tree_stats<self_type>(c.trans).extents_num_delta++;
+
+ root_block = c.cache.duplicate_for_write(
+ c.trans, root_block)->template cast<RootBlock>();
+ get_root().set_location(nroot->get_paddr());
+ get_root().set_depth(iter.get_depth());
+ ceph_assert(get_root().get_depth() <= MAX_FIXEDKVBTREE_DEPTH);
+ set_root_node(nroot);
+ }
+
+ /* pos may be either node_position_t<leaf_node_t> or
+ * node_position_t<internal_node_t> */
+ auto split_level = [&](auto &parent_pos, auto &pos) {
+ LOG_PREFIX(FixedKVBtree::handle_split);
+ auto [left, right, pivot] = pos.node->make_split_children(c);
+
+ auto parent_node = parent_pos.node;
+ auto parent_iter = parent_pos.get_iter();
+
+ parent_node->update(
+ parent_iter,
+ left->get_paddr(),
+ left.get());
+ parent_node->insert(
+ parent_iter + 1,
+ pivot,
+ right->get_paddr(),
+ right.get());
+
+ SUBTRACET(
+ seastore_fixedkv_tree,
+        "split {} into left: {}, right: {}",
+ c.trans,
+ *pos.node,
+ *left,
+ *right);
+ c.cache.retire_extent(c.trans, pos.node);
+
+ get_tree_stats<self_type>(c.trans).extents_num_delta++;
+ return std::make_pair(left, right);
+ };
+
+ for (; split_from > 0; --split_from) {
+ auto &parent_pos = iter.get_internal(split_from + 1);
+ if (!parent_pos.node->is_mutable()) {
+ parent_pos.node = c.cache.duplicate_for_write(
+ c.trans, parent_pos.node
+ )->template cast<internal_node_t>();
+ }
+
+ if (split_from > 1) {
+ auto &pos = iter.get_internal(split_from);
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "splitting internal {} at depth {}, parent: {} at pos: {}",
+ c.trans,
+ *pos.node,
+ split_from,
+ *parent_pos.node,
+ parent_pos.pos);
+ auto [left, right] = split_level(parent_pos, pos);
+
+ if (pos.pos < left->get_size()) {
+ pos.node = left;
+ } else {
+ pos.node = right;
+ pos.pos -= left->get_size();
+
+ parent_pos.pos += 1;
+ }
+ } else {
+ auto &pos = iter.leaf;
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "splitting leaf {}, parent: {} at pos: {}",
+ c.trans,
+ *pos.node,
+ *parent_pos.node,
+ parent_pos.pos);
+ auto [left, right] = split_level(parent_pos, pos);
+
+ /* right->get_node_meta().begin == pivot == right->begin()->get_key()
+ * Thus, if pos.pos == left->get_size(), we want iter to point to
+ * left with pos.pos at the end rather than right with pos.pos = 0
+ * since the insertion would be to the left of the first element
+ * of right and thus necessarily less than right->get_node_meta().begin.
+ */
+ if (pos.pos <= left->get_size()) {
+ pos.node = left;
+ } else {
+ pos.node = right;
+ pos.pos -= left->get_size();
+
+ parent_pos.pos += 1;
+ }
+ }
+ }
+
+ return seastar::now();
+ }
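+
+  /* Worked sketch of the leaf case above (sizes are illustrative): a leaf
+   * with 4 entries splits into left (entries 0..1) and right (entries 2..3).
+   * An insertion point at pos.pos == 2 stays on left as its one-past-the-end
+   * position (pos.pos <= left->get_size()), since the key being inserted is
+   * known to be less than the pivot (right's begin); an insertion point at
+   * pos.pos == 3 moves to right with pos.pos == 1 and parent_pos.pos
+   * advanced by one.
+   */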
+
+
+ using handle_merge_iertr = base_iertr;
+ using handle_merge_ret = handle_merge_iertr::future<>;
+ handle_merge_ret handle_merge(
+ op_context_t<node_key_t> c,
+ iterator &iter)
+ {
+ LOG_PREFIX(FixedKVBtree::handle_merge);
+ if (iter.get_depth() == 1 ||
+ !iter.leaf.node->below_min_capacity()) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "no need to merge leaf, leaf size {}, depth {}",
+ c.trans,
+ iter.leaf.node->get_size(),
+ iter.get_depth());
+ return seastar::now();
+ }
+
+ return seastar::do_with(
+ depth_t{1},
+ [FNAME, this, c, &iter](auto &to_merge) {
+ return trans_intr::repeat(
+ [FNAME, this, c, &iter, &to_merge] {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "merging depth {}",
+ c.trans,
+ to_merge);
+ auto &parent_pos = iter.get_internal(to_merge + 1);
+ auto merge_fut = handle_merge_iertr::now();
+ if (to_merge > 1) {
+ auto &pos = iter.get_internal(to_merge);
+ merge_fut = merge_level(c, to_merge, parent_pos, pos);
+ } else {
+ auto &pos = iter.leaf;
+ merge_fut = merge_level(c, to_merge, parent_pos, pos);
+ }
+
+ return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] {
+ ++to_merge;
+ auto &pos = iter.get_internal(to_merge);
+ if (to_merge == iter.get_depth()) {
+ if (pos.node->get_size() == 1) {
+ SUBTRACET(seastore_fixedkv_tree, "collapsing root", c.trans);
+ c.cache.retire_extent(c.trans, pos.node);
+ assert(pos.pos == 0);
+ auto node_iter = pos.get_iter();
+ iter.internal.pop_back();
+ get_tree_stats<self_type>(c.trans).depth = iter.get_depth();
+ get_tree_stats<self_type>(c.trans).extents_num_delta--;
+
+ root_block = c.cache.duplicate_for_write(
+ c.trans, root_block
+ )->template cast<RootBlock>();
+ get_root().set_location(
+ node_iter->get_val().maybe_relative_to(pos.node->get_paddr()));
+ get_root().set_depth(iter.get_depth());
+ if (iter.get_depth() > 1) {
+ auto root_node = iter.get_internal(iter.get_depth()).node;
+ set_root_node(root_node);
+ } else {
+ set_root_node(iter.leaf.node);
+ }
+ } else {
+ SUBTRACET(seastore_fixedkv_tree, "no need to collapse root", c.trans);
+ }
+ return seastar::stop_iteration::yes;
+ } else if (pos.node->below_min_capacity()) {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "continuing, next node {} depth {} at min",
+ c.trans,
+ *pos.node,
+ to_merge);
+ return seastar::stop_iteration::no;
+ } else {
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "complete, next node {} depth {} not min",
+ c.trans,
+ *pos.node,
+ to_merge);
+ return seastar::stop_iteration::yes;
+ }
+ });
+ });
+ });
+ }
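+
+  /* Sketch of the loop above: merging starts at the leaf (to_merge == 1) and
+   * walks upward while the level just merged leaves its parent below minimum
+   * capacity.  If the walk reaches the root and the root is left with a
+   * single child, that child becomes the new root and the tree depth
+   * decreases by one (the "collapsing root" branch).
+   */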
+
+ template <typename NodeType,
+ std::enable_if_t<std::is_same_v<NodeType, leaf_node_t>, int> = 0>
+ base_iertr::future<typename NodeType::Ref> get_node(
+ op_context_t<node_key_t> c,
+ depth_t depth,
+ paddr_t addr,
+ node_key_t begin,
+ node_key_t end,
+ typename std::optional<node_position_t<leaf_node_t>> parent_pos) {
+ assert(depth == 1);
+ return get_leaf_node(c, addr, begin, end, std::move(parent_pos));
+ }
+
+ template <typename NodeType,
+ std::enable_if_t<std::is_same_v<NodeType, internal_node_t>, int> = 0>
+ base_iertr::future<typename NodeType::Ref> get_node(
+ op_context_t<node_key_t> c,
+ depth_t depth,
+ paddr_t addr,
+ node_key_t begin,
+ node_key_t end,
+ typename std::optional<node_position_t<internal_node_t>> parent_pos) {
+ return get_internal_node(c, depth, addr, begin, end, std::move(parent_pos));
+ }
+
+ template <typename NodeType>
+ handle_merge_ret merge_level(
+ op_context_t<node_key_t> c,
+ depth_t depth,
+ node_position_t<internal_node_t> &parent_pos,
+ node_position_t<NodeType> &pos)
+ {
+ LOG_PREFIX(FixedKVBtree::merge_level);
+ if (!parent_pos.node->is_mutable()) {
+ parent_pos.node = c.cache.duplicate_for_write(
+ c.trans, parent_pos.node
+ )->template cast<internal_node_t>();
+ }
+
+ auto iter = parent_pos.get_iter();
+ assert(iter.get_offset() < parent_pos.node->get_size());
+ bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size());
+ auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1);
+ auto next_iter = donor_iter + 1;
+ auto begin = donor_iter->get_key();
+ auto end = next_iter == parent_pos.node->end()
+ ? parent_pos.node->get_node_meta().end
+ : next_iter->get_key();
+
+ SUBTRACET(seastore_fixedkv_tree, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node);
+ auto do_merge = [c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
+ typename NodeType::Ref donor) {
+ LOG_PREFIX(FixedKVBtree::merge_level);
+ auto [l, r] = donor_is_left ?
+ std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor);
+
+ auto [liter, riter] = donor_is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+
+ if (donor->at_min_capacity()) {
+ auto replacement = l->make_full_merge(c, r);
+
+ parent_pos.node->update(
+ liter,
+ replacement->get_paddr(),
+ replacement.get());
+ parent_pos.node->remove(riter);
+
+ pos.node = replacement;
+ if (donor_is_left) {
+ pos.pos += l->get_size();
+ parent_pos.pos--;
+ }
+
+ SUBTRACET(seastore_fixedkv_tree, "l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement);
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+ get_tree_stats<self_type>(c.trans).extents_num_delta--;
+ } else {
+ LOG_PREFIX(FixedKVBtree::merge_level);
+ auto [replacement_l, replacement_r, pivot] =
+ l->make_balanced(
+ c,
+ r,
+ !donor_is_left);
+
+ parent_pos.node->update(
+ liter,
+ replacement_l->get_paddr(),
+ replacement_l.get());
+ parent_pos.node->replace(
+ riter,
+ pivot,
+ replacement_r->get_paddr(),
+ replacement_r.get());
+
+ if (donor_is_left) {
+ assert(parent_pos.pos > 0);
+ parent_pos.pos--;
+ }
+
+ auto orig_position = donor_is_left ?
+ l->get_size() + pos.pos :
+ pos.pos;
+ if (orig_position < replacement_l->get_size()) {
+ pos.node = replacement_l;
+ pos.pos = orig_position;
+ } else {
+ parent_pos.pos++;
+ pos.node = replacement_r;
+ pos.pos = orig_position - replacement_l->get_size();
+ }
+
+ SUBTRACET(
+ seastore_fixedkv_tree,
+ "l: {}, r: {}, replacement_l: {}, replacement_r: {}",
+ c.trans, *l, *r, *replacement_l, *replacement_r);
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+ }
+
+ return seastar::now();
+ };
+
+ auto v = parent_pos.node->template get_child<NodeType>(c, donor_iter);
+ if (v.has_child()) {
+ return v.get_child_fut().safe_then(
+ [do_merge=std::move(do_merge), &pos,
+ donor_iter, donor_is_left, c, parent_pos](auto child) mutable {
+ LOG_PREFIX(FixedKVBtree::merge_level);
+ SUBTRACET(seastore_fixedkv_tree,
+ "got child on {}, pos: {}, res: {}",
+ c.trans,
+ *parent_pos.node,
+ donor_iter.get_offset(),
+ *child);
+ auto &node = (typename internal_node_t::base_t&)*child;
+ assert(donor_is_left ?
+ node.get_node_meta().end == pos.node->get_node_meta().begin :
+ node.get_node_meta().begin == pos.node->get_node_meta().end);
+ assert(node.get_node_meta().begin == donor_iter.get_key());
+ assert(node.get_node_meta().end > donor_iter.get_key());
+ return do_merge(child->template cast<NodeType>());
+ });
+ }
+
+ auto child_pos = v.get_child_pos();
+ return get_node<NodeType>(
+ c,
+ depth,
+ donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()),
+ begin,
+ end,
+ std::make_optional<node_position_t<NodeType>>(
+ child_pos.template get_parent<NodeType>(),
+ child_pos.get_pos())
+ ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) {
+ return do_merge(donor);
+ });
+ }
+};
+
+template <typename T>
+struct is_fixed_kv_tree : std::false_type {};
+
+template <
+ typename node_key_t,
+ typename node_val_t,
+ typename internal_node_t,
+ typename leaf_node_t,
+ typename pin_t,
+ size_t node_size,
+ bool leaf_has_children>
+struct is_fixed_kv_tree<
+ FixedKVBtree<
+ node_key_t,
+ node_val_t,
+ internal_node_t,
+ leaf_node_t,
+ pin_t,
+ node_size,
+ leaf_has_children>> : std::true_type {};
+
+template <
+ typename tree_type_t,
+ typename node_key_t,
+ typename F,
+ std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0>
+auto with_btree(
+ Cache &cache,
+ op_context_t<node_key_t> c,
+ F &&f) {
+ return cache.get_root(
+ c.trans
+ ).si_then([f=std::forward<F>(f)](RootBlockRef croot) mutable {
+ return seastar::do_with(
+ tree_type_t(croot),
+ [f=std::move(f)](auto &btree) mutable {
+ return f(btree);
+ });
+ });
+}
+
+template <
+ typename tree_type_t,
+ typename State,
+ typename node_key_t,
+ typename F,
+ std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0>
+auto with_btree_state(
+ Cache &cache,
+ op_context_t<node_key_t> c,
+ State &&init,
+ F &&f) {
+ return seastar::do_with(
+ std::forward<State>(init),
+ [&cache, c, f=std::forward<F>(f)](auto &state) mutable {
+ return with_btree<tree_type_t>(
+ cache,
+ c,
+ [&state, f=std::move(f)](auto &btree) mutable {
+ return f(btree, state);
+ }).si_then([&state] {
+ return seastar::make_ready_future<State>(std::move(state));
+ });
+ });
+}
+
+template <
+ typename tree_type_t,
+ typename State,
+ typename node_key_t,
+ typename F,
+ std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0>
+auto with_btree_state(
+ Cache &cache,
+ op_context_t<node_key_t> c,
+ F &&f) {
+ return crimson::os::seastore::with_btree_state<tree_type_t, State>(
+ cache, c, State{}, std::forward<F>(f));
+}
+
+template <
+ typename tree_type_t,
+ typename Ret,
+ typename node_key_t,
+ typename F>
+auto with_btree_ret(
+ Cache &cache,
+ op_context_t<node_key_t> c,
+ F &&f) {
+ return with_btree_state<tree_type_t, Ret>(
+ cache,
+ c,
+ [f=std::forward<F>(f)](auto &btree, auto &ret) mutable {
+ return f(
+ btree
+ ).si_then([&ret](auto &&_ret) {
+ ret = std::move(_ret);
+ });
+ });
+}
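+
+/* Usage sketch (minimal, with assumed names): given a FixedKVBtree
+ * instantiation ExampleBtree whose iterator exposes get_val(), and an
+ * op_context_t<node_key_t> c of the matching key type, with_btree_ret
+ * extracts a single value out of the callback via the State overloads above:
+ *
+ *   return with_btree_ret<ExampleBtree, node_val_t>(
+ *     cache, c,
+ *     [c, key](auto &btree) {
+ *       return btree.lower_bound(c, key
+ *       ).si_then([](auto iter) { return iter.get_val(); });
+ *     });
+ *
+ * (End-of-tree handling is omitted here for brevity.)
+ */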
+
+}
+
diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.cc b/src/crimson/os/seastore/btree/fixed_kv_node.cc
new file mode 100644
index 000000000..00aceab92
--- /dev/null
+++ b/src/crimson/os/seastore/btree/fixed_kv_node.cc
@@ -0,0 +1,12 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/btree/fixed_kv_node.h"
+
+namespace crimson::os::seastore {
+
+bool is_valid_child_ptr(ChildableCachedExtent* child) {
+ return child != nullptr && child != RESERVATION_PTR;
+}
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h
new file mode 100644
index 000000000..956a1824e
--- /dev/null
+++ b/src/crimson/os/seastore/btree/fixed_kv_node.h
@@ -0,0 +1,1220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+#include "crimson/os/seastore/btree/fixed_kv_btree.h"
+#include "crimson/os/seastore/root_block.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * FixedKVNode
+ *
+ * Base class enabling recursive lookup between internal and leaf nodes.
+ */
+template <typename node_key_t>
+struct FixedKVNode : ChildableCachedExtent {
+ using FixedKVNodeRef = TCachedExtentRef<FixedKVNode>;
+ fixed_kv_node_meta_t<node_key_t> range;
+
+ struct copy_source_cmp_t {
+ using is_transparent = node_key_t;
+ bool operator()(const FixedKVNodeRef &l, const FixedKVNodeRef &r) const {
+ assert(l->range.end <= r->range.begin
+ || r->range.end <= l->range.begin
+ || (l->range.begin == r->range.begin
+ && l->range.end == r->range.end));
+ return l->range.begin < r->range.begin;
+ }
+ bool operator()(const node_key_t &l, const FixedKVNodeRef &r) const {
+ return l < r->range.begin;
+ }
+ bool operator()(const FixedKVNodeRef &l, const node_key_t &r) const {
+ return l->range.begin < r;
+ }
+ };
+
+ /*
+ *
+ * Nodes of fixed-kv-btree connect to their child nodes by pointers following
+ * invariants below:
+ *
+   * 1. if nodes are stable:
+   *    a. parent points at the node's stable parent
+   *    b. prior_instance is empty
+   *    c. child pointers point at stable children. Child resolution is done
+   *       directly via this array.
+   *    d. copy_sources is empty
+   * 2. if nodes are mutation_pending:
+   *    a. parent is empty and needs to be fixed upon commit
+   *    b. prior_instance points to its stable version
+   *    c. child pointers are null except for initial_pending() children of
+   *       this transaction. Child resolution is done by first checking this
+   *       array, and then recursively resolving via the parent. We copy child
+   *       pointers from parent on commit.
+   *    d. copy_sources is empty
+   * 3. if nodes are initial_pending:
+   *    a. parent points at its pending parent on this transaction (must exist)
+   *    b. prior_instance is empty or, if it's the result of rewrite, points to
+   *       its stable predecessor
+   *    c. child pointers are null except for initial_pending() children of
+   *       this transaction (live due to 3a above). Child resolution is done
+   *       by first checking this array, and then recursively resolving via
+   *       the correct copy_sources entry. We copy child pointers from copy_sources
+   *       on commit.
+   *    d. copy_sources contains the set of stable nodes at the same tree-level (only
+   *       its "prior_instance" if the node is the result of a rewrite), with which
+   *       the lba range of this node overlaps.
+ */
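+  /*
+   * Example of the rules above (sketch): take a stable node N with a
+   * mutation_pending copy N' on transaction t.  A child extent created on t
+   * is linked directly into N'->children (2c); any other lookup finds a null
+   * slot in N', falls back to the stable version via get_stable_for_key(),
+   * and resolves through N->children (see get_child() below).  On commit,
+   * N' copies the still-null pointers from N.
+   */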
+ std::vector<ChildableCachedExtent*> children;
+ std::set<FixedKVNodeRef, copy_source_cmp_t> copy_sources;
+ uint16_t capacity = 0;
+ parent_tracker_t* my_tracker = nullptr;
+ RootBlockRef root_block;
+
+ bool is_linked() {
+ assert(!has_parent_tracker() || !(bool)root_block);
+ return (bool)has_parent_tracker() || (bool)root_block;
+ }
+
+ FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr)
+ : ChildableCachedExtent(std::move(ptr)),
+ children(capacity, nullptr),
+ capacity(capacity) {}
+ FixedKVNode(const FixedKVNode &rhs)
+ : ChildableCachedExtent(rhs),
+ range(rhs.range),
+ children(rhs.capacity, nullptr),
+ capacity(rhs.capacity) {}
+
+ virtual fixed_kv_node_meta_t<node_key_t> get_node_meta() const = 0;
+ virtual uint16_t get_node_size() const = 0;
+
+ virtual ~FixedKVNode() = default;
+ virtual node_key_t get_key_from_idx(uint16_t idx) const = 0;
+
+ template<typename iter_t>
+ void update_child_ptr(iter_t iter, ChildableCachedExtent* child) {
+ children[iter.get_offset()] = child;
+ set_child_ptracker(child);
+ }
+
+ virtual bool is_leaf_and_has_children() const = 0;
+
+ template<typename iter_t>
+ void insert_child_ptr(iter_t iter, ChildableCachedExtent* child) {
+ auto raw_children = children.data();
+ auto offset = iter.get_offset();
+ std::memmove(
+ &raw_children[offset + 1],
+ &raw_children[offset],
+ (get_node_size() - offset) * sizeof(ChildableCachedExtent*));
+ if (child) {
+ children[offset] = child;
+ set_child_ptracker(child);
+ } else {
+ // this can only happen when reserving lba spaces
+ ceph_assert(is_leaf_and_has_children());
+ // this is to avoid mistakenly copying pointers from
+ // copy sources when committing this lba node, because
+ // we rely on pointers' "nullness" to avoid copying
+ // pointers for updated values
+ children[offset] = RESERVATION_PTR;
+ }
+ }
+
+ template<typename iter_t>
+ void remove_child_ptr(iter_t iter) {
+ LOG_PREFIX(FixedKVNode::remove_child_ptr);
+ auto raw_children = children.data();
+ auto offset = iter.get_offset();
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, total size {}, extent {}",
+ this->pending_for_transaction,
+ offset,
+ get_node_size(),
+ (void*)raw_children[offset]);
+ // parent tracker of the child being removed will be
+ // reset when the child is invalidated, so no need to
+ // reset it here
+ std::memmove(
+ &raw_children[offset],
+ &raw_children[offset + 1],
+ (get_node_size() - offset - 1) * sizeof(ChildableCachedExtent*));
+ }
+
+ FixedKVNode& get_stable_for_key(node_key_t key) {
+ ceph_assert(is_pending());
+ if (is_mutation_pending()) {
+ return (FixedKVNode&)*get_prior_instance();
+ } else {
+ ceph_assert(!copy_sources.empty());
+ auto it = copy_sources.upper_bound(key);
+ it--;
+ auto &copy_source = *it;
+ ceph_assert(copy_source->get_node_meta().is_in_range(key));
+ return *copy_source;
+ }
+ }
+
+ static void push_copy_sources(
+ FixedKVNode &dest,
+ FixedKVNode &src)
+ {
+ ceph_assert(dest.is_initial_pending());
+ if (!src.is_pending()) {
+ dest.copy_sources.emplace(&src);
+ } else if (src.is_mutation_pending()) {
+ dest.copy_sources.emplace(
+ src.get_prior_instance()->template cast<FixedKVNode>());
+ } else {
+ ceph_assert(src.is_initial_pending());
+ dest.copy_sources.insert(
+ src.copy_sources.begin(),
+ src.copy_sources.end());
+ }
+ }
+
+ virtual uint16_t get_node_split_pivot() = 0;
+
+ static void move_child_ptrs(
+ FixedKVNode &dest,
+ FixedKVNode &src,
+ size_t dest_start,
+ size_t src_start,
+ size_t src_end)
+ {
+ std::memmove(
+ dest.children.data() + dest_start,
+ src.children.data() + src_start,
+ (src_end - src_start) * sizeof(ChildableCachedExtent*));
+
+ ceph_assert(src_start < src_end);
+ ceph_assert(src.children.size() >= src_end);
+ for (auto it = src.children.begin() + src_start;
+ it != src.children.begin() + src_end;
+ it++)
+ {
+ auto child = *it;
+ if (is_valid_child_ptr(child)) {
+ dest.set_child_ptracker(child);
+ }
+ }
+ }
+
+ void link_child(ChildableCachedExtent* child, uint16_t pos) {
+ assert(pos < get_node_size());
+ assert(child);
+ ceph_assert(!is_pending());
+ ceph_assert(child->is_valid() && !child->is_pending());
+ assert(!children[pos]);
+ children[pos] = child;
+ set_child_ptracker(child);
+ }
+
+ virtual get_child_ret_t<LogicalCachedExtent>
+ get_logical_child(op_context_t<node_key_t> c, uint16_t pos) = 0;
+
+ template <typename T, typename iter_t>
+ get_child_ret_t<T> get_child(op_context_t<node_key_t> c, iter_t iter) {
+ auto pos = iter.get_offset();
+ assert(children.capacity());
+ auto child = children[pos];
+ if (is_valid_child_ptr(child)) {
+ ceph_assert(child->get_type() == T::TYPE);
+ return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child);
+ } else if (is_pending()) {
+ auto key = iter.get_key();
+ auto &sparent = get_stable_for_key(key);
+ auto spos = sparent.child_pos_for_key(key);
+ auto child = sparent.children[spos];
+ if (is_valid_child_ptr(child)) {
+ ceph_assert(child->get_type() == T::TYPE);
+ return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child);
+ } else {
+ return child_pos_t(&sparent, spos);
+ }
+ } else {
+ return child_pos_t(this, pos);
+ }
+ }
+
+ void split_child_ptrs(
+ FixedKVNode &left,
+ FixedKVNode &right)
+ {
+ assert(!left.my_tracker);
+ assert(!right.my_tracker);
+ push_copy_sources(left, *this);
+ push_copy_sources(right, *this);
+ if (is_pending()) {
+ uint16_t pivot = get_node_split_pivot();
+ move_child_ptrs(left, *this, 0, 0, pivot);
+ move_child_ptrs(right, *this, 0, pivot, get_node_size());
+ my_tracker = nullptr;
+ }
+ }
+
+ void merge_child_ptrs(
+ FixedKVNode &left,
+ FixedKVNode &right)
+ {
+ ceph_assert(!my_tracker);
+ push_copy_sources(*this, left);
+ push_copy_sources(*this, right);
+
+ if (left.is_pending()) {
+ move_child_ptrs(*this, left, 0, 0, left.get_node_size());
+ left.my_tracker = nullptr;
+ }
+
+ if (right.is_pending()) {
+ move_child_ptrs(*this, right, left.get_node_size(), 0, right.get_node_size());
+ right.my_tracker = nullptr;
+ }
+ }
+
+ static void balance_child_ptrs(
+ FixedKVNode &left,
+ FixedKVNode &right,
+ bool prefer_left,
+ FixedKVNode &replacement_left,
+ FixedKVNode &replacement_right)
+ {
+ size_t l_size = left.get_node_size();
+ size_t r_size = right.get_node_size();
+ size_t total = l_size + r_size;
+ size_t pivot_idx = (l_size + r_size) / 2;
+ if (total % 2 && prefer_left) {
+ pivot_idx++;
+ }
+
+ assert(!replacement_left.my_tracker);
+ assert(!replacement_right.my_tracker);
+ if (pivot_idx < l_size) {
+ // deal with left
+ push_copy_sources(replacement_left, left);
+ push_copy_sources(replacement_right, left);
+ if (left.is_pending()) {
+ move_child_ptrs(replacement_left, left, 0, 0, pivot_idx);
+ move_child_ptrs(replacement_right, left, 0, pivot_idx, l_size);
+ left.my_tracker = nullptr;
+ }
+
+ // deal with right
+ push_copy_sources(replacement_right, right);
+ if (right.is_pending()) {
+ move_child_ptrs(replacement_right, right, l_size - pivot_idx, 0, r_size);
+        right.my_tracker = nullptr;
+ }
+ } else {
+ // deal with left
+ push_copy_sources(replacement_left, left);
+ if (left.is_pending()) {
+ move_child_ptrs(replacement_left, left, 0, 0, l_size);
+ left.my_tracker = nullptr;
+ }
+
+ // deal with right
+ push_copy_sources(replacement_left, right);
+ push_copy_sources(replacement_right, right);
+ if (right.is_pending()) {
+ move_child_ptrs(replacement_left, right, l_size, 0, pivot_idx - l_size);
+ move_child_ptrs(replacement_right, right, 0, pivot_idx - l_size, r_size);
+        right.my_tracker = nullptr;
+ }
+ }
+ }
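+
+  /* Numeric sketch of the pivot selection above: with l_size == 5 and
+   * r_size == 2, total == 7, so pivot_idx == 3 (or 4 with prefer_left).
+   * pivot_idx < l_size, so the first branch runs: replacement_left keeps
+   * left's children [0, pivot_idx) and replacement_right receives left's
+   * tail [pivot_idx, 5) followed by all of right's children at offset
+   * 5 - pivot_idx (pointer moves apply only to pending source nodes).
+   */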
+
+ void set_parent_tracker_from_prior_instance() {
+ assert(is_mutation_pending());
+ auto &prior = (FixedKVNode&)(*get_prior_instance());
+ if (range.is_root()) {
+ ceph_assert(prior.root_block);
+ ceph_assert(pending_for_transaction);
+ root_block = prior.root_block;
+ link_phy_tree_root_node(root_block, this);
+ return;
+ }
+ ceph_assert(!root_block);
+ take_prior_parent_tracker();
+ assert(is_parent_valid());
+ auto parent = get_parent_node<FixedKVNode>();
+ //TODO: can this search be avoided?
+ auto off = parent->lower_bound_offset(get_node_meta().begin);
+ assert(parent->get_key_from_idx(off) == get_node_meta().begin);
+ parent->children[off] = this;
+ }
+
+ bool is_children_empty() const {
+ for (auto it = children.begin();
+ it != children.begin() + get_node_size();
+ it++) {
+ if (is_valid_child_ptr(*it)
+ && (*it)->is_valid()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void set_children_from_prior_instance() {
+ assert(get_prior_instance());
+ auto &prior = (FixedKVNode&)(*get_prior_instance());
+ assert(prior.my_tracker || prior.is_children_empty());
+
+ if (prior.my_tracker) {
+ prior.my_tracker->reset_parent(this);
+ my_tracker = prior.my_tracker;
+      // All my initial pending children are pointing to the original
+      // tracker, which has been dropped by the line above, so we need
+      // to adjust them to point to the new tracker
+ adjust_ptracker_for_children();
+ }
+ assert(my_tracker || is_children_empty());
+ }
+
+ void adjust_ptracker_for_children() {
+ auto begin = children.begin();
+ auto end = begin + get_node_size();
+ ceph_assert(end <= children.end());
+ for (auto it = begin; it != end; it++) {
+ auto child = *it;
+ if (is_valid_child_ptr(child)) {
+ set_child_ptracker(child);
+ }
+ }
+ }
+
+ void on_delta_write(paddr_t record_block_offset) final {
+ // All in-memory relative addrs are necessarily record-relative
+ assert(get_prior_instance());
+ assert(pending_for_transaction);
+ resolve_relative_addrs(record_block_offset);
+ }
+
+ virtual uint16_t lower_bound_offset(node_key_t) const = 0;
+ virtual uint16_t upper_bound_offset(node_key_t) const = 0;
+ virtual uint16_t child_pos_for_key(node_key_t) const = 0;
+
+ virtual bool validate_stable_children() = 0;
+
+ template<typename iter_t>
+ uint16_t copy_children_from_stable_source(
+ FixedKVNode &source,
+ iter_t foreign_start_it,
+ iter_t foreign_end_it,
+ iter_t local_start_it) {
+ auto foreign_it = foreign_start_it, local_it = local_start_it;
+ while (foreign_it != foreign_end_it
+ && local_it.get_offset() < get_node_size())
+ {
+ auto &child = children[local_it.get_offset()];
+ if (foreign_it.get_key() == local_it.get_key()) {
+ // the foreign key is preserved
+ if (!child) {
+ child = source.children[foreign_it.get_offset()];
+ }
+ foreign_it++;
+ local_it++;
+ } else if (foreign_it.get_key() < local_it.get_key()) {
+        // the foreign key has been removed; if it hadn't, there would
+        // have been a local key before the one pointed to by the current
+        // "local_it" that equals this foreign key and would already have
+        // pushed foreign_it forward.
+ foreign_it++;
+ } else {
+ // the local key must be a newly inserted one.
+ local_it++;
+ }
+ }
+ return local_it.get_offset();
+ }
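+
+  /* Worked example (keys are illustrative): with foreign keys {10, 20, 30}
+   * in source and local keys {10, 25, 30} in this node, the pointers for 10
+   * and 30 are copied across (when the local slots are still null), 20 is
+   * skipped because it was removed locally, and 25 is skipped because it is
+   * a newly inserted local key whose pointer was already set on insertion.
+   */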
+
+ template<typename Func>
+ void copy_children_from_stable_sources(Func &&get_iter) {
+ if (!copy_sources.empty()) {
+ auto it = --copy_sources.upper_bound(get_node_meta().begin);
+ auto &cs = *it;
+ uint16_t start_pos = cs->lower_bound_offset(
+ get_node_meta().begin);
+ if (start_pos == cs->get_node_size()) {
+ it++;
+ start_pos = 0;
+ }
+ uint16_t local_next_pos = 0;
+ for (; it != copy_sources.end(); it++) {
+ auto& copy_source = *it;
+ auto end_pos = copy_source->get_node_size();
+ if (copy_source->get_node_meta().is_in_range(get_node_meta().end)) {
+ end_pos = copy_source->upper_bound_offset(get_node_meta().end);
+ }
+ auto local_start_iter = get_iter(*this, local_next_pos);
+ auto foreign_start_iter = get_iter(*copy_source, start_pos);
+ auto foreign_end_iter = get_iter(*copy_source, end_pos);
+ local_next_pos = copy_children_from_stable_source(
+ *copy_source, foreign_start_iter, foreign_end_iter, local_start_iter);
+ if (end_pos != copy_source->get_node_size()) {
+ break;
+ }
+ start_pos = 0;
+ }
+ }
+ }
+
+ void on_invalidated(Transaction &t) final {
+ reset_parent_tracker();
+ }
+
+ bool is_rewrite() {
+ return is_initial_pending() && get_prior_instance();
+ }
+
+ void on_initial_write() final {
+ // All in-memory relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ if (range.is_root()) {
+ reset_parent_tracker();
+ }
+ assert(has_parent_tracker() ? (is_parent_valid()) : true);
+ }
+
+ void set_child_ptracker(ChildableCachedExtent *child) {
+ if (!this->my_tracker) {
+ this->my_tracker = new parent_tracker_t(this);
+ }
+ child->reset_parent_tracker(this->my_tracker);
+ }
+
+ void on_clean_read() final {
+ // From initial write of block, relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
+
+ virtual void resolve_relative_addrs(paddr_t base) = 0;
+};
+
+/**
+ * FixedKVInternalNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * FixedKVBTree.
+ */
+template <
+ size_t CAPACITY,
+ typename NODE_KEY,
+ typename NODE_KEY_LE,
+ size_t node_size,
+ typename node_type_t>
+struct FixedKVInternalNode
+ : FixedKVNode<NODE_KEY>,
+ common::FixedKVNodeLayout<
+ CAPACITY,
+ fixed_kv_node_meta_t<NODE_KEY>,
+ fixed_kv_node_meta_le_t<NODE_KEY_LE>,
+ NODE_KEY, NODE_KEY_LE,
+ paddr_t, paddr_le_t> {
+ using Ref = TCachedExtentRef<node_type_t>;
+ using base_t = FixedKVNode<NODE_KEY>;
+ using base_ref = typename FixedKVNode<NODE_KEY>::FixedKVNodeRef;
+ using node_layout_t =
+ common::FixedKVNodeLayout<
+ CAPACITY,
+ fixed_kv_node_meta_t<NODE_KEY>,
+ fixed_kv_node_meta_le_t<NODE_KEY_LE>,
+ NODE_KEY,
+ NODE_KEY_LE,
+ paddr_t,
+ paddr_le_t>;
+ using internal_const_iterator_t = typename node_layout_t::const_iterator;
+ using internal_iterator_t = typename node_layout_t::iterator;
+ using this_type_t = FixedKVInternalNode<
+ CAPACITY,
+ NODE_KEY,
+ NODE_KEY_LE,
+ node_size,
+ node_type_t>;
+
+ FixedKVInternalNode(ceph::bufferptr &&ptr)
+ : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)),
+ node_layout_t(this->get_bptr().c_str()) {}
+ FixedKVInternalNode(const FixedKVInternalNode &rhs)
+ : FixedKVNode<NODE_KEY>(rhs),
+ node_layout_t(this->get_bptr().c_str()) {}
+
+ bool is_leaf_and_has_children() const final {
+ return false;
+ }
+
+ uint16_t get_node_split_pivot() final {
+ return this->get_split_pivot().get_offset();
+ }
+
+ void prepare_commit() final {
+ if (this->is_initial_pending()) {
+ if (this->is_rewrite()) {
+ this->set_children_from_prior_instance();
+ }
+ this->copy_children_from_stable_sources(
+ [this](base_t &node, uint16_t pos) {
+ ceph_assert(node.get_type() == this->get_type());
+ auto &n = static_cast<this_type_t&>(node);
+ return n.iter_idx(pos);
+ }
+ );
+ if (this->is_rewrite()) {
+ this->reset_prior_instance();
+ } else {
+ this->adjust_ptracker_for_children();
+ }
+ assert(this->validate_stable_children());
+ this->copy_sources.clear();
+ }
+ }
+
+ get_child_ret_t<LogicalCachedExtent>
+ get_logical_child(op_context_t<NODE_KEY>, uint16_t pos) final {
+ ceph_abort("impossible");
+ return get_child_ret_t<LogicalCachedExtent>(child_pos_t(nullptr, 0));
+ }
+
+ bool validate_stable_children() final {
+ LOG_PREFIX(FixedKVInternalNode::validate_stable_children);
+ if (this->children.empty()) {
+ return false;
+ }
+
+ for (auto i : *this) {
+ auto child = (FixedKVNode<NODE_KEY>*)this->children[i.get_offset()];
+ if (child && child->range.begin != i.get_key()) {
+ SUBERROR(seastore_fixedkv_tree,
+ "stable child not valid: child {}, child meta{}, key {}",
+ *child,
+ child->get_node_meta(),
+ i.get_key());
+ ceph_abort();
+ return false;
+ }
+ }
+ return true;
+ }
+
+ virtual ~FixedKVInternalNode() {
+ if (this->is_valid() && !this->is_pending()) {
+ if (this->range.is_root()) {
+ ceph_assert(this->root_block);
+ unlink_phy_tree_root_node<NODE_KEY>(this->root_block);
+ } else {
+ ceph_assert(this->is_parent_valid());
+ auto parent = this->template get_parent_node<FixedKVNode<NODE_KEY>>();
+ auto off = parent->lower_bound_offset(this->get_meta().begin);
+ assert(parent->get_key_from_idx(off) == this->get_meta().begin);
+ assert(parent->children[off] == this);
+ parent->children[off] = nullptr;
+ }
+ }
+ }
+
+ uint16_t lower_bound_offset(NODE_KEY key) const final {
+ return this->lower_bound(key).get_offset();
+ }
+
+ uint16_t upper_bound_offset(NODE_KEY key) const final {
+ return this->upper_bound(key).get_offset();
+ }
+
+ uint16_t child_pos_for_key(NODE_KEY key) const final {
+ auto it = this->upper_bound(key);
+ assert(it != this->begin());
+ --it;
+ return it.get_offset();
+ }
+
+ NODE_KEY get_key_from_idx(uint16_t idx) const final {
+ return this->iter_idx(idx).get_key();
+ }
+
+ fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const {
+ return this->get_meta();
+ }
+
+ uint16_t get_node_size() const final {
+ return this->get_size();
+ }
+
+ typename node_layout_t::delta_buffer_t delta_buffer;
+ typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() {
+ return this->is_mutation_pending()
+ ? &delta_buffer : nullptr;
+ }
+
+ CachedExtentRef duplicate_for_write(Transaction&) override {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new node_type_t(*this));
+ };
+
+ void on_replace_prior(Transaction&) final {
+ ceph_assert(!this->is_rewrite());
+ this->set_children_from_prior_instance();
+ auto &prior = (this_type_t&)(*this->get_prior_instance());
+ auto copied = this->copy_children_from_stable_source(
+ prior,
+ prior.begin(),
+ prior.end(),
+ this->begin());
+ ceph_assert(copied <= get_node_size());
+ assert(this->validate_stable_children());
+ this->set_parent_tracker_from_prior_instance();
+ }
+
+ void update(
+ internal_const_iterator_t iter,
+ paddr_t addr,
+ FixedKVNode<NODE_KEY>* nextent) {
+ LOG_PREFIX(FixedKVInternalNode::update);
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ *nextent);
+ this->update_child_ptr(iter, nextent);
+ return this->journal_update(
+ iter,
+ this->maybe_generate_relative(addr),
+ maybe_get_delta_buffer());
+ }
+
+ void insert(
+ internal_const_iterator_t iter,
+ NODE_KEY pivot,
+ paddr_t addr,
+ FixedKVNode<NODE_KEY>* nextent) {
+ LOG_PREFIX(FixedKVInternalNode::insert);
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ pivot,
+ *nextent);
+ this->insert_child_ptr(iter, nextent);
+ return this->journal_insert(
+ iter,
+ pivot,
+ this->maybe_generate_relative(addr),
+ maybe_get_delta_buffer());
+ }
+
+ void remove(internal_const_iterator_t iter) {
+ LOG_PREFIX(FixedKVInternalNode::remove);
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ iter.get_key());
+ this->remove_child_ptr(iter);
+ return this->journal_remove(
+ iter,
+ maybe_get_delta_buffer());
+ }
+
+ void replace(
+ internal_const_iterator_t iter,
+ NODE_KEY pivot,
+ paddr_t addr,
+ FixedKVNode<NODE_KEY>* nextent) {
+ LOG_PREFIX(FixedKVInternalNode::replace);
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, old key {}, key {}, {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ iter.get_key(),
+ pivot,
+ *nextent);
+ this->update_child_ptr(iter, nextent);
+ return this->journal_replace(
+ iter,
+ pivot,
+ this->maybe_generate_relative(addr),
+ maybe_get_delta_buffer());
+ }
+
+ std::tuple<Ref, Ref, NODE_KEY>
+ make_split_children(op_context_t<NODE_KEY> c) {
+ auto left = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ auto right = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ this->split_child_ptrs(*left, *right);
+ auto pivot = this->split_into(*left, *right);
+ left->range = left->get_meta();
+ right->range = right->get_meta();
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ Ref make_full_merge(
+ op_context_t<NODE_KEY> c,
+ Ref &right) {
+ auto replacement = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ replacement->merge_child_ptrs(*this, *right);
+ replacement->merge_from(*this, *right->template cast<node_type_t>());
+ replacement->range = replacement->get_meta();
+ return replacement;
+ }
+
+ std::tuple<Ref, Ref, NODE_KEY>
+ make_balanced(
+ op_context_t<NODE_KEY> c,
+ Ref &_right,
+ bool prefer_left) {
+ ceph_assert(_right->get_type() == this->get_type());
+ auto &right = *_right->template cast<node_type_t>();
+ auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+
+ auto pivot = this->balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+ this->balance_child_ptrs(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->range = replacement_left->get_meta();
+ replacement_right->range = replacement_right->get_meta();
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ /**
+ * Internal relative addresses on read or in memory prior to commit
+ * are either record or block relative depending on whether this
+ * physical node is is_initial_pending() or just is_mutable().
+ *
+ * User passes appropriate base depending on lifecycle and
+ * resolve_relative_addrs fixes up relative internal references
+ * based on base.
+ */
+ void resolve_relative_addrs(paddr_t base)
+ {
+ LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs);
+ for (auto i: *this) {
+ if (i->get_val().is_relative()) {
+ auto updated = base.add_relative(i->get_val());
+ SUBTRACE(seastore_fixedkv_tree, "{} -> {}", i->get_val(), updated);
+ i->set_val(updated);
+ }
+ }
+ }
+
+ void node_resolve_vals(
+ internal_iterator_t from,
+ internal_iterator_t to) const {
+ if (this->is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_block_relative());
+ i->set_val(this->get_paddr().add_relative(i->get_val()));
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(
+ internal_iterator_t from,
+ internal_iterator_t to) const {
+ if (this->is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_record_relative());
+ i->set_val(i->get_val().block_relative_to(this->get_paddr()));
+ }
+ }
+ }
+ }
+
+ std::ostream &_print_detail(std::ostream &out) const
+ {
+ out << ", size=" << this->get_size()
+ << ", meta=" << this->get_meta()
+ << ", my_tracker=" << (void*)this->my_tracker;
+ if (this->my_tracker) {
+ out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get();
+ }
+ return out << ", root_block=" << (void*)this->root_block.get();
+ }
+
+ ceph::bufferlist get_delta() {
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ typename node_layout_t::delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ this->set_last_committed_crc(this->get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ constexpr static size_t get_min_capacity() {
+ return (node_layout_t::get_capacity() - 1) / 2;
+ }
+
+ bool at_max_capacity() const {
+ assert(this->get_size() <= node_layout_t::get_capacity());
+ return this->get_size() == node_layout_t::get_capacity();
+ }
+
+ bool at_min_capacity() const {
+ assert(this->get_size() >= (get_min_capacity() - 1));
+ return this->get_size() <= get_min_capacity();
+ }
+
+ bool below_min_capacity() const {
+ assert(this->get_size() >= (get_min_capacity() - 1));
+ return this->get_size() < get_min_capacity();
+ }
+};
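+
+/* Instantiation sketch (names and numbers here are illustrative, not part of
+ * this header): a concrete internal node is typically derived from this
+ * template via CRTP, supplying the entry capacity, key layouts, extent size
+ * and the derived type itself, roughly:
+ *
+ *   struct ExampleInternalNode final : FixedKVInternalNode<
+ *       254,                  // CAPACITY chosen to fit the node size
+ *       laddr_t, laddr_le_t,  // NODE_KEY / NODE_KEY_LE
+ *       4096,                 // node_size in bytes
+ *       ExampleInternalNode> {
+ *     static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL;
+ *     ...
+ *   };
+ *
+ * FixedKVLeafNode below is parameterized analogously, adding the mapped
+ * VAL/VAL_LE layout and the has_children flag.
+ */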
+
+template <
+ size_t CAPACITY,
+ typename NODE_KEY,
+ typename NODE_KEY_LE,
+ typename VAL,
+ typename VAL_LE,
+ size_t node_size,
+ typename node_type_t,
+ bool has_children>
+struct FixedKVLeafNode
+ : FixedKVNode<NODE_KEY>,
+ common::FixedKVNodeLayout<
+ CAPACITY,
+ fixed_kv_node_meta_t<NODE_KEY>,
+ fixed_kv_node_meta_le_t<NODE_KEY_LE>,
+ NODE_KEY, NODE_KEY_LE,
+ VAL, VAL_LE> {
+ using Ref = TCachedExtentRef<node_type_t>;
+ using node_layout_t =
+ common::FixedKVNodeLayout<
+ CAPACITY,
+ fixed_kv_node_meta_t<NODE_KEY>,
+ fixed_kv_node_meta_le_t<NODE_KEY_LE>,
+ NODE_KEY,
+ NODE_KEY_LE,
+ VAL,
+ VAL_LE>;
+ using internal_const_iterator_t = typename node_layout_t::const_iterator;
+ using this_type_t = FixedKVLeafNode<
+ CAPACITY,
+ NODE_KEY,
+ NODE_KEY_LE,
+ VAL,
+ VAL_LE,
+ node_size,
+ node_type_t,
+ has_children>;
+ using base_t = FixedKVNode<NODE_KEY>;
+ FixedKVLeafNode(ceph::bufferptr &&ptr)
+ : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)),
+ node_layout_t(this->get_bptr().c_str()) {}
+ FixedKVLeafNode(const FixedKVLeafNode &rhs)
+ : FixedKVNode<NODE_KEY>(rhs),
+ node_layout_t(this->get_bptr().c_str()) {}
+
+ static constexpr bool do_has_children = has_children;
+
+ bool is_leaf_and_has_children() const final {
+ return has_children;
+ }
+
+ uint16_t get_node_split_pivot() final {
+ return this->get_split_pivot().get_offset();
+ }
+
+ get_child_ret_t<LogicalCachedExtent>
+ get_logical_child(op_context_t<NODE_KEY> c, uint16_t pos) final {
+ auto child = this->children[pos];
+ if (is_valid_child_ptr(child)) {
+ ceph_assert(child->is_logical());
+ return c.cache.template get_extent_viewable_by_trans<
+ LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child);
+ } else if (this->is_pending()) {
+ auto key = this->iter_idx(pos).get_key();
+ auto &sparent = this->get_stable_for_key(key);
+ auto spos = sparent.child_pos_for_key(key);
+ auto child = sparent.children[spos];
+ if (is_valid_child_ptr(child)) {
+ ceph_assert(child->is_logical());
+ return c.cache.template get_extent_viewable_by_trans<
+ LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child);
+ } else {
+ return child_pos_t(&sparent, spos);
+ }
+ } else {
+ return child_pos_t(this, pos);
+ }
+ }
+
+ bool validate_stable_children() override {
+ return true;
+ }
+
+ virtual ~FixedKVLeafNode() {
+ if (this->is_valid() && !this->is_pending()) {
+ if (this->range.is_root()) {
+ ceph_assert(this->root_block);
+ unlink_phy_tree_root_node<NODE_KEY>(this->root_block);
+ } else {
+ ceph_assert(this->is_parent_valid());
+ auto parent = this->template get_parent_node<FixedKVNode<NODE_KEY>>();
+ auto off = parent->lower_bound_offset(this->get_meta().begin);
+ assert(parent->get_key_from_idx(off) == this->get_meta().begin);
+ assert(parent->children[off] == this);
+ parent->children[off] = nullptr;
+ }
+ }
+ }
+
+ void prepare_commit() final {
+ if constexpr (has_children) {
+ if (this->is_initial_pending()) {
+ if (this->is_rewrite()) {
+ this->set_children_from_prior_instance();
+ }
+ this->copy_children_from_stable_sources(
+ [this](base_t &node, uint16_t pos) {
+ ceph_assert(node.get_type() == this->get_type());
+ auto &n = static_cast<this_type_t&>(node);
+ return n.iter_idx(pos);
+ }
+ );
+ if (this->is_rewrite()) {
+ this->reset_prior_instance();
+ } else {
+ this->adjust_ptracker_for_children();
+ }
+ assert(this->validate_stable_children());
+ this->copy_sources.clear();
+ }
+ }
+    assert(this->is_initial_pending()
+           ? this->copy_sources.empty()
+           : true);
+ }
+
+ void on_replace_prior(Transaction&) final {
+ ceph_assert(!this->is_rewrite());
+ if constexpr (has_children) {
+ this->set_children_from_prior_instance();
+ auto &prior = (this_type_t&)(*this->get_prior_instance());
+ auto copied = this->copy_children_from_stable_source(
+ prior,
+ prior.begin(),
+ prior.end(),
+ this->begin());
+ ceph_assert(copied <= get_node_size());
+ assert(this->validate_stable_children());
+ this->set_parent_tracker_from_prior_instance();
+ } else {
+ this->set_parent_tracker_from_prior_instance();
+ }
+ }
+
+ uint16_t lower_bound_offset(NODE_KEY key) const final {
+ return this->lower_bound(key).get_offset();
+ }
+
+ uint16_t upper_bound_offset(NODE_KEY key) const final {
+ return this->upper_bound(key).get_offset();
+ }
+
+ uint16_t child_pos_for_key(NODE_KEY key) const final {
+ return lower_bound_offset(key);
+ }
+
+ NODE_KEY get_key_from_idx(uint16_t idx) const final {
+ return this->iter_idx(idx).get_key();
+ }
+
+ fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const {
+ return this->get_meta();
+ }
+
+ uint16_t get_node_size() const final {
+ return this->get_size();
+ }
+
+ typename node_layout_t::delta_buffer_t delta_buffer;
+ virtual typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() {
+ return this->is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ CachedExtentRef duplicate_for_write(Transaction&) override {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new node_type_t(*this));
+ };
+
+ virtual void update(
+ internal_const_iterator_t iter,
+ VAL val,
+ LogicalCachedExtent* nextent) = 0;
+ virtual internal_const_iterator_t insert(
+ internal_const_iterator_t iter,
+ NODE_KEY addr,
+ VAL val,
+ LogicalCachedExtent* nextent) = 0;
+ virtual void remove(internal_const_iterator_t iter) = 0;
+
+ std::tuple<Ref, Ref, NODE_KEY>
+ make_split_children(op_context_t<NODE_KEY> c) {
+ auto left = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ auto right = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ if constexpr (has_children) {
+ this->split_child_ptrs(*left, *right);
+ }
+ auto pivot = this->split_into(*left, *right);
+ left->range = left->get_meta();
+ right->range = right->get_meta();
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ Ref make_full_merge(
+ op_context_t<NODE_KEY> c,
+ Ref &right) {
+ auto replacement = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ if constexpr (has_children) {
+ replacement->merge_child_ptrs(*this, *right);
+ }
+ replacement->merge_from(*this, *right->template cast<node_type_t>());
+ replacement->range = replacement->get_meta();
+ return replacement;
+ }
+
+ std::tuple<Ref, Ref, NODE_KEY>
+ make_balanced(
+ op_context_t<NODE_KEY> c,
+ Ref &_right,
+ bool prefer_left) {
+ ceph_assert(_right->get_type() == this->get_type());
+ auto &right = *_right->template cast<node_type_t>();
+ auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+ auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
+
+ auto pivot = this->balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+ if constexpr (has_children) {
+ this->balance_child_ptrs(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+ }
+
+ replacement_left->range = replacement_left->get_meta();
+ replacement_right->range = replacement_right->get_meta();
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ ceph::bufferlist get_delta() {
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ typename node_layout_t::delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ this->set_last_committed_crc(this->get_crc32c());
+ this->resolve_relative_addrs(base);
+ }
+
+ std::ostream &_print_detail(std::ostream &out) const
+ {
+ return out << ", size=" << this->get_size()
+ << ", meta=" << this->get_meta();
+ }
+
+ constexpr static size_t get_min_capacity() {
+ return (node_layout_t::get_capacity() - 1) / 2;
+ }
+
+ bool at_max_capacity() const {
+ assert(this->get_size() <= node_layout_t::get_capacity());
+ return this->get_size() == node_layout_t::get_capacity();
+ }
+
+ bool at_min_capacity() const {
+ assert(this->get_size() >= (get_min_capacity() - 1));
+ return this->get_size() <= get_min_capacity();
+ }
+
+ bool below_min_capacity() const {
+ assert(this->get_size() >= (get_min_capacity() - 1));
+ return this->get_size() < get_min_capacity();
+ }
+};
+
+} // namespace crimson::os::seastore
+
+#if FMT_VERSION >= 90000
+template <>
+struct fmt::formatter<
+ crimson::os::seastore::FixedKVNode<
+ crimson::os::seastore::laddr_t>> : fmt::ostream_formatter {};
+template <>
+struct fmt::formatter<
+ crimson::os::seastore::FixedKVNode<
+ crimson::os::seastore::paddr_t>> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
new file mode 100644
index 000000000..4d1dc9296
--- /dev/null
+++ b/src/crimson/os/seastore/cache.cc
@@ -0,0 +1,2040 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cache.h"
+
+#include <sstream>
+#include <string_view>
+
+#include <seastar/core/metrics.hh>
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/os/seastore/async_cleaner.h"
+
+// included for get_extent_by_type
+#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+#include "crimson/os/seastore/object_data_handler.h"
+#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
+#include "crimson/os/seastore/backref/backref_tree_node.h"
+#include "test/crimson/seastore/test_block.h"
+
+using std::string_view;
+
+SET_SUBSYS(seastore_cache);
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) {
+ return out << "backref_entry_t{"
+ << ent.paddr << "~" << ent.len << ", "
+ << "laddr: " << ent.laddr << ", "
+ << "type: " << ent.type << ", "
+ << "seq: " << ent.seq << ", "
+ << "}";
+}
+
+Cache::Cache(
+ ExtentPlacementManager &epm)
+ : epm(epm),
+ lru(crimson::common::get_conf<Option::size_t>(
+ "seastore_cache_lru_size"))
+{
+ LOG_PREFIX(Cache::Cache);
+ INFO("created, lru_size={}", lru.get_capacity());
+ register_metrics();
+ segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr);
+}
+
+Cache::~Cache()
+{
+ LOG_PREFIX(Cache::~Cache);
+ for (auto &i: extents) {
+ ERROR("extent is still alive -- {}", i);
+ }
+ ceph_assert(extents.empty());
+}
+
+Cache::retire_extent_ret Cache::retire_extent_addr(
+ Transaction &t, paddr_t addr, extent_len_t length)
+{
+ LOG_PREFIX(Cache::retire_extent_addr);
+ TRACET("retire {}~{}", t, addr, length);
+
+ assert(addr.is_real() && !addr.is_block_relative());
+
+ CachedExtentRef ext;
+ auto result = t.get_extent(addr, &ext);
+ if (result == Transaction::get_extent_ret::PRESENT) {
+ DEBUGT("retire {}~{} on t -- {}", t, addr, length, *ext);
+ t.add_to_retired_set(CachedExtentRef(&*ext));
+ return retire_extent_iertr::now();
+ } else if (result == Transaction::get_extent_ret::RETIRED) {
+ ERRORT("retire {}~{} failed, already retired -- {}", t, addr, length, *ext);
+ ceph_abort();
+ }
+
+ // any relative addr must have been on the transaction
+ assert(!addr.is_relative());
+
+ // absent from transaction
+ // retiring is not included by the cache hit metrics
+ ext = query_cache(addr, nullptr);
+ if (ext) {
+ DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext);
+ } else {
+ // add a new placeholder to Cache
+ ext = CachedExtent::make_cached_extent_ref<
+ RetiredExtentPlaceholder>(length);
+ ext->init(CachedExtent::extent_state_t::CLEAN,
+ addr,
+ PLACEMENT_HINT_NULL,
+ NULL_GENERATION,
+ TRANS_ID_NULL);
+ DEBUGT("retire {}~{} as placeholder, add extent -- {}",
+ t, addr, length, *ext);
+ const auto t_src = t.get_src();
+ add_extent(ext, &t_src);
+ }
+ t.add_to_read_set(ext);
+ t.add_to_retired_set(ext);
+ return retire_extent_iertr::now();
+}
+
+void Cache::dump_contents()
+{
+ LOG_PREFIX(Cache::dump_contents);
+ DEBUG("enter");
+ for (auto &&i: extents) {
+ DEBUG("live {}", i);
+ }
+ DEBUG("exit");
+}
+
+void Cache::register_metrics()
+{
+ LOG_PREFIX(Cache::register_metrics);
+ DEBUG("");
+
+ stats = {};
+
+ namespace sm = seastar::metrics;
+ using src_t = Transaction::src_t;
+
+ std::map<src_t, sm::label_instance> labels_by_src {
+ {src_t::MUTATE, sm::label_instance("src", "MUTATE")},
+ {src_t::READ, sm::label_instance("src", "READ")},
+ {src_t::TRIM_DIRTY, sm::label_instance("src", "TRIM_DIRTY")},
+ {src_t::TRIM_ALLOC, sm::label_instance("src", "TRIM_ALLOC")},
+ {src_t::CLEANER_MAIN, sm::label_instance("src", "CLEANER_MAIN")},
+ {src_t::CLEANER_COLD, sm::label_instance("src", "CLEANER_COLD")},
+ };
+ assert(labels_by_src.size() == (std::size_t)src_t::MAX);
+
+ std::map<extent_types_t, sm::label_instance> labels_by_ext {
+ {extent_types_t::ROOT, sm::label_instance("ext", "ROOT")},
+ {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")},
+ {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")},
+ {extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")},
+ {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")},
+ {extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")},
+ {extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
+ {extent_types_t::COLL_BLOCK, sm::label_instance("ext", "COLL_BLOCK")},
+ {extent_types_t::OBJECT_DATA_BLOCK, sm::label_instance("ext", "OBJECT_DATA_BLOCK")},
+ {extent_types_t::RETIRED_PLACEHOLDER, sm::label_instance("ext", "RETIRED_PLACEHOLDER")},
+ {extent_types_t::ALLOC_INFO, sm::label_instance("ext", "ALLOC_INFO")},
+ {extent_types_t::JOURNAL_TAIL, sm::label_instance("ext", "JOURNAL_TAIL")},
+ {extent_types_t::TEST_BLOCK, sm::label_instance("ext", "TEST_BLOCK")},
+ {extent_types_t::TEST_BLOCK_PHYSICAL, sm::label_instance("ext", "TEST_BLOCK_PHYSICAL")},
+ {extent_types_t::BACKREF_INTERNAL, sm::label_instance("ext", "BACKREF_INTERNAL")},
+ {extent_types_t::BACKREF_LEAF, sm::label_instance("ext", "BACKREF_LEAF")}
+ };
+ assert(labels_by_ext.size() == (std::size_t)extent_types_t::NONE);
+
+ /*
+ * trans_created
+ */
+ for (auto& [src, src_label] : labels_by_src) {
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_created",
+ get_by_src(stats.trans_created_by_src, src),
+          sm::description("total number of transactions created"),
+ {src_label}
+ ),
+ }
+ );
+ }
+
+ /*
+ * cache_query: cache_access and cache_hit
+ */
+ for (auto& [src, src_label] : labels_by_src) {
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "cache_access",
+ get_by_src(stats.cache_query_by_src, src).access,
+ sm::description("total number of cache accesses"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "cache_hit",
+ get_by_src(stats.cache_query_by_src, src).hit,
+ sm::description("total number of cache hits"),
+ {src_label}
+ ),
+ }
+ );
+ }
+
+ {
+ /*
+ * efforts discarded/committed
+ */
+ auto effort_label = sm::label("effort");
+
+ // invalidated efforts
+ using namespace std::literals::string_view_literals;
+ const string_view invalidated_effort_names[] = {
+ "READ"sv,
+ "MUTATE"sv,
+ "RETIRE"sv,
+ "FRESH"sv,
+ "FRESH_OOL_WRITTEN"sv,
+ };
+ for (auto& [src, src_label] : labels_by_src) {
+ auto& efforts = get_by_src(stats.invalidated_efforts_by_src, src);
+ for (auto& [ext, ext_label] : labels_by_ext) {
+ auto& counter = get_by_ext(efforts.num_trans_invalidated, ext);
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_invalidated_by_extent",
+ counter,
+ sm::description("total number of transactions invalidated by extents"),
+ {src_label, ext_label}
+ ),
+ }
+ );
+ }
+
+ if (src == src_t::READ) {
+ // read transaction won't have non-read efforts
+ auto read_effort_label = effort_label("READ");
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "invalidated_extents",
+ efforts.read.num,
+ sm::description("extents of invalidated transactions"),
+ {src_label, read_effort_label}
+ ),
+ sm::make_counter(
+ "invalidated_extent_bytes",
+ efforts.read.bytes,
+ sm::description("extent bytes of invalidated transactions"),
+ {src_label, read_effort_label}
+ ),
+ }
+ );
+ continue;
+ }
+
+ // non READ invalidated efforts
+ for (auto& effort_name : invalidated_effort_names) {
+ auto& effort = [&effort_name, &efforts]() -> io_stat_t& {
+ if (effort_name == "READ") {
+ return efforts.read;
+ } else if (effort_name == "MUTATE") {
+ return efforts.mutate;
+ } else if (effort_name == "RETIRE") {
+ return efforts.retire;
+ } else if (effort_name == "FRESH") {
+ return efforts.fresh;
+ } else {
+ assert(effort_name == "FRESH_OOL_WRITTEN");
+ return efforts.fresh_ool_written;
+ }
+ }();
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "invalidated_extents",
+ effort.num,
+ sm::description("extents of invalidated transactions"),
+ {src_label, effort_label(effort_name)}
+ ),
+ sm::make_counter(
+ "invalidated_extent_bytes",
+ effort.bytes,
+ sm::description("extent bytes of invalidated transactions"),
+ {src_label, effort_label(effort_name)}
+ ),
+ }
+ );
+ } // effort_name
+
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_invalidated",
+ efforts.total_trans_invalidated,
+ sm::description("total number of transactions invalidated"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "invalidated_delta_bytes",
+ efforts.mutate_delta_bytes,
+ sm::description("delta bytes of invalidated transactions"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "invalidated_ool_records",
+ efforts.num_ool_records,
+ sm::description("number of ool-records from invalidated transactions"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "invalidated_ool_record_bytes",
+ efforts.ool_record_bytes,
+ sm::description("bytes of ool-record from invalidated transactions"),
+ {src_label}
+ ),
+ }
+ );
+ } // src
+
+ // committed efforts
+ const string_view committed_effort_names[] = {
+ "READ"sv,
+ "MUTATE"sv,
+ "RETIRE"sv,
+ "FRESH_INVALID"sv,
+ "FRESH_INLINE"sv,
+ "FRESH_OOL"sv,
+ };
+ for (auto& [src, src_label] : labels_by_src) {
+ if (src == src_t::READ) {
+ // READ transaction won't commit
+ continue;
+ }
+ auto& efforts = get_by_src(stats.committed_efforts_by_src, src);
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_committed",
+ efforts.num_trans,
+          sm::description("total number of transactions committed"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "committed_ool_records",
+ efforts.num_ool_records,
+ sm::description("number of ool-records from committed transactions"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "committed_ool_record_metadata_bytes",
+ efforts.ool_record_metadata_bytes,
+ sm::description("bytes of ool-record metadata from committed transactions"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "committed_ool_record_data_bytes",
+ efforts.ool_record_data_bytes,
+ sm::description("bytes of ool-record data from committed transactions"),
+ {src_label}
+ ),
+ sm::make_counter(
+ "committed_inline_record_metadata_bytes",
+ efforts.inline_record_metadata_bytes,
+          sm::description("bytes of inline-record metadata from committed transactions "
+                          "(excludes delta buffer)"),
+ {src_label}
+ ),
+ }
+ );
+ for (auto& effort_name : committed_effort_names) {
+ auto& effort_by_ext = [&efforts, &effort_name]()
+ -> counter_by_extent_t<io_stat_t>& {
+ if (effort_name == "READ") {
+ return efforts.read_by_ext;
+ } else if (effort_name == "MUTATE") {
+ return efforts.mutate_by_ext;
+ } else if (effort_name == "RETIRE") {
+ return efforts.retire_by_ext;
+ } else if (effort_name == "FRESH_INVALID") {
+ return efforts.fresh_invalid_by_ext;
+ } else if (effort_name == "FRESH_INLINE") {
+ return efforts.fresh_inline_by_ext;
+ } else {
+ assert(effort_name == "FRESH_OOL");
+ return efforts.fresh_ool_by_ext;
+ }
+ }();
+ for (auto& [ext, ext_label] : labels_by_ext) {
+ auto& effort = get_by_ext(effort_by_ext, ext);
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "committed_extents",
+ effort.num,
+ sm::description("extents of committed transactions"),
+ {src_label, effort_label(effort_name), ext_label}
+ ),
+ sm::make_counter(
+ "committed_extent_bytes",
+ effort.bytes,
+ sm::description("extent bytes of committed transactions"),
+ {src_label, effort_label(effort_name), ext_label}
+ ),
+ }
+ );
+ } // ext
+ } // effort_name
+
+ auto& delta_by_ext = efforts.delta_bytes_by_ext;
+ for (auto& [ext, ext_label] : labels_by_ext) {
+ auto& value = get_by_ext(delta_by_ext, ext);
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "committed_delta_bytes",
+ value,
+ sm::description("delta bytes of committed transactions"),
+ {src_label, ext_label}
+ ),
+ }
+ );
+ } // ext
+ } // src
+
+ // successful read efforts
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_read_successful",
+ stats.success_read_efforts.num_trans,
+ sm::description("total number of successful read transactions")
+ ),
+ sm::make_counter(
+ "successful_read_extents",
+ stats.success_read_efforts.read.num,
+ sm::description("extents of successful read transactions")
+ ),
+ sm::make_counter(
+ "successful_read_extent_bytes",
+ stats.success_read_efforts.read.bytes,
+ sm::description("extent bytes of successful read transactions")
+ ),
+ }
+ );
+ }
+
+ /**
+ * Cached extents (including placeholders)
+ *
+ * Dirty extents
+ */
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "cached_extents",
+ [this] {
+ return extents.size();
+ },
+ sm::description("total number of cached extents")
+ ),
+ sm::make_counter(
+ "cached_extent_bytes",
+ [this] {
+ return extents.get_bytes();
+ },
+ sm::description("total bytes of cached extents")
+ ),
+ sm::make_counter(
+ "dirty_extents",
+ [this] {
+ return dirty.size();
+ },
+ sm::description("total number of dirty extents")
+ ),
+ sm::make_counter(
+ "dirty_extent_bytes",
+ stats.dirty_bytes,
+ sm::description("total bytes of dirty extents")
+ ),
+ sm::make_counter(
+ "cache_lru_size_bytes",
+ [this] {
+ return lru.get_current_contents_bytes();
+ },
+ sm::description("total bytes pinned by the lru")
+ ),
+ sm::make_counter(
+ "cache_lru_size_extents",
+ [this] {
+ return lru.get_current_contents_extents();
+ },
+ sm::description("total extents pinned by the lru")
+ ),
+ }
+ );
+
+ /**
+ * tree stats
+ */
+ auto tree_label = sm::label("tree");
+ auto onode_label = tree_label("ONODE");
+ auto omap_label = tree_label("OMAP");
+ auto lba_label = tree_label("LBA");
+ auto backref_label = tree_label("BACKREF");
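+  // register_tree_metrics registers, for one tree, its depth and extent
+  // count plus per-source committed/invalidated insert/erase/update
+  // counters; READ and (for the onode/omap trees) background sources are
+  // skipped below since they never touch those trees.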
+ auto register_tree_metrics = [&labels_by_src, &onode_label, &omap_label, this](
+ const sm::label_instance& tree_label,
+ uint64_t& tree_depth,
+ int64_t& tree_extents_num,
+ counter_by_src_t<tree_efforts_t>& committed_tree_efforts,
+ counter_by_src_t<tree_efforts_t>& invalidated_tree_efforts) {
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "tree_depth",
+ tree_depth,
+          sm::description("the depth of the tree"),
+ {tree_label}
+ ),
+ sm::make_counter(
+ "tree_extents_num",
+ tree_extents_num,
+          sm::description("number of extents of the tree"),
+ {tree_label}
+ )
+ }
+ );
+ for (auto& [src, src_label] : labels_by_src) {
+ if (src == src_t::READ) {
+ // READ transaction won't contain any tree inserts and erases
+ continue;
+ }
+ if (is_background_transaction(src) &&
+ (tree_label == onode_label ||
+ tree_label == omap_label)) {
+ // CLEANER transaction won't contain any onode/omap tree operations
+ continue;
+ }
+ auto& committed_efforts = get_by_src(committed_tree_efforts, src);
+ auto& invalidated_efforts = get_by_src(invalidated_tree_efforts, src);
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "tree_inserts_committed",
+ committed_efforts.num_inserts,
+ sm::description("total number of committed insert operations"),
+ {tree_label, src_label}
+ ),
+ sm::make_counter(
+ "tree_erases_committed",
+ committed_efforts.num_erases,
+ sm::description("total number of committed erase operations"),
+ {tree_label, src_label}
+ ),
+ sm::make_counter(
+ "tree_updates_committed",
+ committed_efforts.num_updates,
+ sm::description("total number of committed update operations"),
+ {tree_label, src_label}
+ ),
+ sm::make_counter(
+ "tree_inserts_invalidated",
+ invalidated_efforts.num_inserts,
+ sm::description("total number of invalidated insert operations"),
+ {tree_label, src_label}
+ ),
+ sm::make_counter(
+ "tree_erases_invalidated",
+ invalidated_efforts.num_erases,
+ sm::description("total number of invalidated erase operations"),
+ {tree_label, src_label}
+ ),
+ sm::make_counter(
+ "tree_updates_invalidated",
+ invalidated_efforts.num_updates,
+ sm::description("total number of invalidated update operations"),
+ {tree_label, src_label}
+ ),
+ }
+ );
+ }
+ };
+ register_tree_metrics(
+ onode_label,
+ stats.onode_tree_depth,
+ stats.onode_tree_extents_num,
+ stats.committed_onode_tree_efforts,
+ stats.invalidated_onode_tree_efforts);
+ register_tree_metrics(
+ omap_label,
+ stats.omap_tree_depth,
+ stats.omap_tree_extents_num,
+ stats.committed_omap_tree_efforts,
+ stats.invalidated_omap_tree_efforts);
+ register_tree_metrics(
+ lba_label,
+ stats.lba_tree_depth,
+ stats.lba_tree_extents_num,
+ stats.committed_lba_tree_efforts,
+ stats.invalidated_lba_tree_efforts);
+ register_tree_metrics(
+ backref_label,
+ stats.backref_tree_depth,
+ stats.backref_tree_extents_num,
+ stats.committed_backref_tree_efforts,
+ stats.invalidated_backref_tree_efforts);
+
+ /**
+ * conflict combinations
+ */
+ auto srcs_label = sm::label("srcs");
+ auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX);
+ std::size_t srcs_index = 0;
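+  // Each src pair (src1, src2) with src1 >= src2 owns one slot in
+  // stats.trans_conflicts_by_srcs; srcs_index walks the pairs in that
+  // order, and the impossible combinations below still consume a slot,
+  // they just don't get a metric registered.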
+ for (uint8_t src2_int = 0; src2_int < num_srcs; ++src2_int) {
+ auto src2 = static_cast<Transaction::src_t>(src2_int);
+ for (uint8_t src1_int = src2_int; src1_int < num_srcs; ++src1_int) {
+ ++srcs_index;
+ auto src1 = static_cast<Transaction::src_t>(src1_int);
+ // impossible combinations
+ // should be consistent with checks in account_conflict()
+ if ((src1 == Transaction::src_t::READ &&
+ src2 == Transaction::src_t::READ) ||
+ (src1 == Transaction::src_t::TRIM_DIRTY &&
+ src2 == Transaction::src_t::TRIM_DIRTY) ||
+ (src1 == Transaction::src_t::CLEANER_MAIN &&
+ src2 == Transaction::src_t::CLEANER_MAIN) ||
+ (src1 == Transaction::src_t::CLEANER_COLD &&
+ src2 == Transaction::src_t::CLEANER_COLD) ||
+ (src1 == Transaction::src_t::TRIM_ALLOC &&
+ src2 == Transaction::src_t::TRIM_ALLOC)) {
+ continue;
+ }
+ std::ostringstream oss;
+ oss << src1 << "," << src2;
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_srcs_invalidated",
+ stats.trans_conflicts_by_srcs[srcs_index - 1],
+            sm::description("total number of conflicted transactions by src pair"),
+ {srcs_label(oss.str())}
+ ),
+ }
+ );
+ }
+ }
+ assert(srcs_index == NUM_SRC_COMB);
+ srcs_index = 0;
+ for (uint8_t src_int = 0; src_int < num_srcs; ++src_int) {
+ ++srcs_index;
+ auto src = static_cast<Transaction::src_t>(src_int);
+ std::ostringstream oss;
+ oss << "UNKNOWN," << src;
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "trans_srcs_invalidated",
+ stats.trans_conflicts_by_unknown[srcs_index - 1],
+            sm::description("total number of conflicted transactions by src pair"),
+ {srcs_label(oss.str())}
+ ),
+ }
+ );
+ }
+
+ /**
+ * rewrite version
+ */
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "version_count_dirty",
+ stats.committed_dirty_version.num,
+ sm::description("total number of rewrite-dirty extents")
+ ),
+ sm::make_counter(
+ "version_sum_dirty",
+ stats.committed_dirty_version.version,
+ sm::description("sum of the version from rewrite-dirty extents")
+ ),
+ sm::make_counter(
+ "version_count_reclaim",
+ stats.committed_reclaim_version.num,
+ sm::description("total number of rewrite-reclaim extents")
+ ),
+ sm::make_counter(
+ "version_sum_reclaim",
+ stats.committed_reclaim_version.version,
+ sm::description("sum of the version from rewrite-reclaim extents")
+ ),
+ }
+ );
+}
+
+void Cache::add_extent(
+ CachedExtentRef ref,
+ const Transaction::src_t* p_src=nullptr)
+{
+ assert(ref->is_valid());
+ assert(ref->user_hint == PLACEMENT_HINT_NULL);
+ assert(ref->rewrite_generation == NULL_GENERATION);
+ extents.insert(*ref);
+ if (ref->is_dirty()) {
+ add_to_dirty(ref);
+ } else {
+ touch_extent(*ref, p_src);
+ }
+}
+
+void Cache::mark_dirty(CachedExtentRef ref)
+{
+ if (ref->is_dirty()) {
+ assert(ref->primary_ref_list_hook.is_linked());
+ return;
+ }
+
+ lru.remove_from_lru(*ref);
+ ref->state = CachedExtent::extent_state_t::DIRTY;
+ add_to_dirty(ref);
+}
+
+void Cache::add_to_dirty(CachedExtentRef ref)
+{
+ assert(ref->is_dirty());
+ assert(!ref->primary_ref_list_hook.is_linked());
+ ceph_assert(ref->get_modify_time() != NULL_TIME);
+ intrusive_ptr_add_ref(&*ref);
+ dirty.push_back(*ref);
+ stats.dirty_bytes += ref->get_length();
+}
+
+void Cache::remove_from_dirty(CachedExtentRef ref)
+{
+ if (ref->is_dirty()) {
+ ceph_assert(ref->primary_ref_list_hook.is_linked());
+ stats.dirty_bytes -= ref->get_length();
+ dirty.erase(dirty.s_iterator_to(*ref));
+ intrusive_ptr_release(&*ref);
+ } else {
+ ceph_assert(!ref->primary_ref_list_hook.is_linked());
+ }
+}
+
+void Cache::remove_extent(CachedExtentRef ref)
+{
+ assert(ref->is_valid());
+ if (ref->is_dirty()) {
+ remove_from_dirty(ref);
+ } else if (!ref->is_placeholder()) {
+ lru.remove_from_lru(*ref);
+ }
+ extents.erase(*ref);
+}
+
+void Cache::commit_retire_extent(
+ Transaction& t,
+ CachedExtentRef ref)
+{
+ remove_extent(ref);
+
+ ref->dirty_from_or_retired_at = JOURNAL_SEQ_NULL;
+ invalidate_extent(t, *ref);
+}
+
+void Cache::commit_replace_extent(
+ Transaction& t,
+ CachedExtentRef next,
+ CachedExtentRef prev)
+{
+ assert(next->is_dirty());
+ assert(next->get_paddr() == prev->get_paddr());
+ assert(next->version == prev->version + 1);
+ extents.replace(*next, *prev);
+
+ if (prev->get_type() == extent_types_t::ROOT) {
+ assert(prev->is_stable_clean()
+ || prev->primary_ref_list_hook.is_linked());
+ if (prev->is_dirty()) {
+ stats.dirty_bytes -= prev->get_length();
+ dirty.erase(dirty.s_iterator_to(*prev));
+ intrusive_ptr_release(&*prev);
+ }
+ add_to_dirty(next);
+ } else if (prev->is_dirty()) {
+ assert(prev->get_dirty_from() == next->get_dirty_from());
+ assert(prev->primary_ref_list_hook.is_linked());
+ auto prev_it = dirty.iterator_to(*prev);
+ dirty.insert(prev_it, *next);
+ dirty.erase(prev_it);
+ intrusive_ptr_release(&*prev);
+ intrusive_ptr_add_ref(&*next);
+ } else {
+ lru.remove_from_lru(*prev);
+ add_to_dirty(next);
+ }
+
+ next->on_replace_prior(t);
+ invalidate_extent(t, *prev);
+}
+
+void Cache::invalidate_extent(
+ Transaction& t,
+ CachedExtent& extent)
+{
+ if (!extent.may_conflict()) {
+ assert(extent.transactions.empty());
+ extent.set_invalid(t);
+ return;
+ }
+
+ LOG_PREFIX(Cache::invalidate_extent);
+ bool do_conflict_log = true;
+ for (auto &&i: extent.transactions) {
+ if (!i.t->conflicted) {
+ if (do_conflict_log) {
+ SUBDEBUGT(seastore_t, "conflict begin -- {}", t, extent);
+ do_conflict_log = false;
+ }
+ assert(!i.t->is_weak());
+ account_conflict(t.get_src(), i.t->get_src());
+ mark_transaction_conflicted(*i.t, extent);
+ }
+ }
+ extent.set_invalid(t);
+}
+
+void Cache::mark_transaction_conflicted(
+ Transaction& t, CachedExtent& conflicting_extent)
+{
+ LOG_PREFIX(Cache::mark_transaction_conflicted);
+ SUBTRACET(seastore_t, "", t);
+ assert(!t.conflicted);
+ t.conflicted = true;
+
+ auto& efforts = get_by_src(stats.invalidated_efforts_by_src,
+ t.get_src());
+ ++efforts.total_trans_invalidated;
+
+ auto& counter = get_by_ext(efforts.num_trans_invalidated,
+ conflicting_extent.get_type());
+ ++counter;
+
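+  // Fold this conflicted transaction's efforts into the per-source
+  // invalidated counters: read efforts always, write-side efforts only
+  // for non-READ transactions below.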
+ io_stat_t read_stat;
+ for (auto &i: t.read_set) {
+ read_stat.increment(i.ref->get_length());
+ }
+ efforts.read.increment_stat(read_stat);
+
+ if (t.get_src() != Transaction::src_t::READ) {
+ io_stat_t retire_stat;
+ for (auto &i: t.retired_set) {
+ retire_stat.increment(i->get_length());
+ }
+ efforts.retire.increment_stat(retire_stat);
+
+ auto& fresh_stat = t.get_fresh_block_stats();
+ efforts.fresh.increment_stat(fresh_stat);
+
+ io_stat_t delta_stat;
+ for (auto &i: t.mutated_block_list) {
+ if (!i->is_valid()) {
+ continue;
+ }
+ efforts.mutate.increment(i->get_length());
+ delta_stat.increment(i->get_delta().length());
+ }
+ efforts.mutate_delta_bytes += delta_stat.bytes;
+
+ for (auto &i: t.pre_alloc_list) {
+ epm.mark_space_free(i->get_paddr(), i->get_length());
+ }
+
+ auto& ool_stats = t.get_ool_write_stats();
+ efforts.fresh_ool_written.increment_stat(ool_stats.extents);
+ efforts.num_ool_records += ool_stats.num_records;
+ auto ool_record_bytes = (ool_stats.md_bytes + ool_stats.get_data_bytes());
+ efforts.ool_record_bytes += ool_record_bytes;
+
+ if (is_background_transaction(t.get_src())) {
+ // CLEANER transaction won't contain any onode/omap tree operations
+ assert(t.onode_tree_stats.is_clear());
+ assert(t.omap_tree_stats.is_clear());
+ } else {
+ get_by_src(stats.invalidated_onode_tree_efforts, t.get_src()
+ ).increment(t.onode_tree_stats);
+ get_by_src(stats.invalidated_omap_tree_efforts, t.get_src()
+ ).increment(t.omap_tree_stats);
+ }
+
+ get_by_src(stats.invalidated_lba_tree_efforts, t.get_src()
+ ).increment(t.lba_tree_stats);
+ get_by_src(stats.invalidated_backref_tree_efforts, t.get_src()
+ ).increment(t.backref_tree_stats);
+
+ SUBDEBUGT(seastore_t,
+ "discard {} read, {} fresh, {} delta, {} retire, {}({}B) ool-records",
+ t,
+ read_stat,
+ fresh_stat,
+ delta_stat,
+ retire_stat,
+ ool_stats.num_records,
+ ool_record_bytes);
+ } else {
+ // read transaction won't have non-read efforts
+ assert(t.retired_set.empty());
+ assert(t.get_fresh_block_stats().is_clear());
+ assert(t.mutated_block_list.empty());
+ assert(t.get_ool_write_stats().is_clear());
+ assert(t.onode_tree_stats.is_clear());
+ assert(t.omap_tree_stats.is_clear());
+ assert(t.lba_tree_stats.is_clear());
+ assert(t.backref_tree_stats.is_clear());
+ SUBDEBUGT(seastore_t, "discard {} read", t, read_stat);
+ }
+}
+
+void Cache::on_transaction_destruct(Transaction& t)
+{
+ LOG_PREFIX(Cache::on_transaction_destruct);
+ SUBTRACET(seastore_t, "", t);
+ if (t.get_src() == Transaction::src_t::READ &&
+ t.conflicted == false) {
+ io_stat_t read_stat;
+ for (auto &i: t.read_set) {
+ read_stat.increment(i.ref->get_length());
+ }
+ SUBDEBUGT(seastore_t, "done {} read", t, read_stat);
+
+ if (!t.is_weak()) {
+ // exclude weak transaction as it is impossible to conflict
+ ++stats.success_read_efforts.num_trans;
+ stats.success_read_efforts.read.increment_stat(read_stat);
+ }
+
+ // read transaction won't have non-read efforts
+ assert(t.retired_set.empty());
+ assert(t.get_fresh_block_stats().is_clear());
+ assert(t.mutated_block_list.empty());
+ assert(t.onode_tree_stats.is_clear());
+ assert(t.omap_tree_stats.is_clear());
+ assert(t.lba_tree_stats.is_clear());
+ assert(t.backref_tree_stats.is_clear());
+ }
+}
+
+CachedExtentRef Cache::alloc_new_extent_by_type(
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
+ extent_len_t length, ///< [in] length
+ placement_hint_t hint, ///< [in] user hint
+ rewrite_gen_t gen ///< [in] rewrite generation
+)
+{
+ LOG_PREFIX(Cache::alloc_new_extent_by_type);
+ SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ t, type, length, hint, rewrite_gen_printer_t{gen});
+ switch (type) {
+ case extent_types_t::ROOT:
+ ceph_assert(0 == "ROOT is never directly alloc'd");
+ return CachedExtentRef();
+ case extent_types_t::LADDR_INTERNAL:
+ return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint, gen);
+ case extent_types_t::LADDR_LEAF:
+ return alloc_new_extent<lba_manager::btree::LBALeafNode>(
+ t, length, hint, gen);
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint, gen);
+ case extent_types_t::OMAP_INNER:
+ return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint, gen);
+ case extent_types_t::OMAP_LEAF:
+ return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint, gen);
+ case extent_types_t::COLL_BLOCK:
+ return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint, gen);
+ case extent_types_t::OBJECT_DATA_BLOCK:
+ return alloc_new_extent<ObjectDataBlock>(t, length, hint, gen);
+ case extent_types_t::RETIRED_PLACEHOLDER:
+ ceph_assert(0 == "impossible");
+ return CachedExtentRef();
+ case extent_types_t::TEST_BLOCK:
+ return alloc_new_extent<TestBlock>(t, length, hint, gen);
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return alloc_new_extent<TestBlockPhysical>(t, length, hint, gen);
+ case extent_types_t::NONE: {
+ ceph_assert(0 == "NONE is an invalid extent type");
+ return CachedExtentRef();
+ }
+ default:
+ ceph_assert(0 == "impossible");
+ return CachedExtentRef();
+ }
+}
+
+CachedExtentRef Cache::duplicate_for_write(
+ Transaction &t,
+ CachedExtentRef i) {
+ LOG_PREFIX(Cache::duplicate_for_write);
+ assert(i->is_fully_loaded());
+
+ if (i->is_mutable())
+ return i;
+
+ if (i->is_exist_clean()) {
+ i->version++;
+ i->state = CachedExtent::extent_state_t::EXIST_MUTATION_PENDING;
+ i->last_committed_crc = i->get_crc32c();
+    // deep-copy the buffer of the exist-clean extent because it shares
+    // its buffer with the original clean extent.
+ auto bp = i->get_bptr();
+ auto nbp = ceph::bufferptr(bp.c_str(), bp.length());
+ i->set_bptr(std::move(nbp));
+
+ t.add_mutated_extent(i);
+ DEBUGT("duplicate existing extent {}", t, *i);
+ return i;
+ }
+
+ auto ret = i->duplicate_for_write(t);
+ ret->pending_for_transaction = t.get_trans_id();
+ ret->prior_instance = i;
+ // duplicate_for_write won't occur after ool write finished
+ assert(!i->prior_poffset);
+ auto [iter, inserted] = i->mutation_pendings.insert(*ret);
+ ceph_assert(inserted);
+ t.add_mutated_extent(ret);
+ if (ret->get_type() == extent_types_t::ROOT) {
+ t.root = ret->cast<RootBlock>();
+ } else {
+ ret->last_committed_crc = i->last_committed_crc;
+ }
+
+ ret->version++;
+ ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
+ DEBUGT("{} -> {}", t, *i, *ret);
+ return ret;
+}
+
+record_t Cache::prepare_record(
+ Transaction &t,
+ const journal_seq_t &journal_head,
+ const journal_seq_t &journal_dirty_tail)
+{
+ LOG_PREFIX(Cache::prepare_record);
+ SUBTRACET(seastore_t, "enter", t);
+
+ auto trans_src = t.get_src();
+ assert(!t.is_weak());
+ assert(trans_src != Transaction::src_t::READ);
+
+ auto& efforts = get_by_src(stats.committed_efforts_by_src,
+ trans_src);
+
+ // Should be valid due to interruptible future
+ io_stat_t read_stat;
+ for (auto &i: t.read_set) {
+ if (!i.ref->is_valid()) {
+ SUBERRORT(seastore_t,
+ "read_set got invalid extent, aborting -- {}", t, *i.ref);
+ ceph_abort("no invalid extent allowed in transactions' read_set");
+ }
+ get_by_ext(efforts.read_by_ext,
+ i.ref->get_type()).increment(i.ref->get_length());
+ read_stat.increment(i.ref->get_length());
+ }
+ t.read_set.clear();
+ t.write_set.clear();
+
+ record_t record(trans_src);
+ auto commit_time = seastar::lowres_system_clock::now();
+
+ // Add new copy of mutated blocks, set_io_wait to block until written
+ record.deltas.reserve(t.mutated_block_list.size());
+ io_stat_t delta_stat;
+ for (auto &i: t.mutated_block_list) {
+ if (!i->is_valid()) {
+ DEBUGT("invalid mutated extent -- {}", t, *i);
+ continue;
+ }
+ assert(i->is_exist_mutation_pending() ||
+ i->prior_instance);
+ get_by_ext(efforts.mutate_by_ext,
+ i->get_type()).increment(i->get_length());
+
+ auto delta_bl = i->get_delta();
+ auto delta_length = delta_bl.length();
+ i->set_modify_time(commit_time);
+ DEBUGT("mutated extent with {}B delta -- {}",
+ t, delta_length, *i);
+ if (!i->is_exist_mutation_pending()) {
+ DEBUGT("commit replace extent ... -- {}, prior={}",
+ t, *i, *i->prior_instance);
+      // Extents with EXIST_MUTATION_PENDING don't have a prior_instance,
+      // so skip them here; such existing extents are added into the
+      // Cache during complete_commit to stay in sync with gc
+      // transactions.
+ commit_replace_extent(t, i, i->prior_instance);
+ }
+
+ i->prepare_write();
+ i->set_io_wait();
+ i->prepare_commit();
+
+ assert(i->get_version() > 0);
+ auto final_crc = i->get_crc32c();
+ if (i->get_type() == extent_types_t::ROOT) {
+ SUBTRACET(seastore_t, "writing out root delta {}B -- {}",
+ t, delta_length, *i);
+ assert(t.root == i);
+ root = t.root;
+ record.push_back(
+ delta_info_t{
+ extent_types_t::ROOT,
+ P_ADDR_NULL,
+ L_ADDR_NULL,
+ 0,
+ 0,
+ 0,
+ t.root->get_version() - 1,
+ MAX_SEG_SEQ,
+ segment_type_t::NULL_SEG,
+ std::move(delta_bl)
+ });
+ } else {
+ auto sseq = NULL_SEG_SEQ;
+ auto stype = segment_type_t::NULL_SEG;
+
+ // FIXME: This is specific to the segmented implementation
+ if (i->get_paddr().get_addr_type() == paddr_types_t::SEGMENT) {
+ auto sid = i->get_paddr().as_seg_paddr().get_segment_id();
+ auto sinfo = get_segment_info(sid);
+ if (sinfo) {
+ sseq = sinfo->seq;
+ stype = sinfo->type;
+ }
+ }
+
+ record.push_back(
+ delta_info_t{
+ i->get_type(),
+ i->get_paddr(),
+ (i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : L_ADDR_NULL),
+ i->last_committed_crc,
+ final_crc,
+ i->get_length(),
+ i->get_version() - 1,
+ sseq,
+ stype,
+ std::move(delta_bl)
+ });
+ i->last_committed_crc = final_crc;
+ }
+ assert(delta_length);
+ get_by_ext(efforts.delta_bytes_by_ext,
+ i->get_type()) += delta_length;
+ delta_stat.increment(delta_length);
+ }
+
+ // Transaction is now a go, set up in-memory cache state
+ // invalidate now invalid blocks
+ io_stat_t retire_stat;
+ std::vector<alloc_delta_t> alloc_deltas;
+ alloc_delta_t rel_delta;
+ rel_delta.op = alloc_delta_t::op_types_t::CLEAR;
+ for (auto &i: t.retired_set) {
+ get_by_ext(efforts.retire_by_ext,
+ i->get_type()).increment(i->get_length());
+ retire_stat.increment(i->get_length());
+ DEBUGT("retired and remove extent -- {}", t, *i);
+ commit_retire_extent(t, i);
+ if (is_backref_mapped_extent_node(i)
+ || is_retired_placeholder(i->get_type())) {
+ rel_delta.alloc_blk_ranges.emplace_back(
+ i->get_paddr(),
+ L_ADDR_NULL,
+ i->get_length(),
+ i->get_type());
+ }
+ }
+ alloc_deltas.emplace_back(std::move(rel_delta));
+
+ record.extents.reserve(t.inline_block_list.size());
+ io_stat_t fresh_stat;
+ io_stat_t fresh_invalid_stat;
+ alloc_delta_t alloc_delta;
+ alloc_delta.op = alloc_delta_t::op_types_t::SET;
+ for (auto &i: t.inline_block_list) {
+ if (!i->is_valid()) {
+ DEBUGT("invalid fresh inline extent -- {}", t, *i);
+ fresh_invalid_stat.increment(i->get_length());
+ get_by_ext(efforts.fresh_invalid_by_ext,
+ i->get_type()).increment(i->get_length());
+ } else {
+ TRACET("fresh inline extent -- {}", t, *i);
+ }
+ fresh_stat.increment(i->get_length());
+ get_by_ext(efforts.fresh_inline_by_ext,
+ i->get_type()).increment(i->get_length());
+ assert(i->is_inline() || i->get_paddr().is_fake());
+
+ bufferlist bl;
+ i->prepare_write();
+ i->prepare_commit();
+ bl.append(i->get_bptr());
+ if (i->get_type() == extent_types_t::ROOT) {
+ ceph_assert(0 == "ROOT never gets written as a fresh block");
+ }
+
+ assert(bl.length() == i->get_length());
+ auto modify_time = i->get_modify_time();
+ if (modify_time == NULL_TIME) {
+ modify_time = commit_time;
+ }
+ record.push_back(extent_t{
+ i->get_type(),
+ i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : (is_lba_node(i->get_type())
+ ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
+ : L_ADDR_NULL),
+ std::move(bl)
+ },
+ modify_time);
+ if (i->is_valid()
+ && is_backref_mapped_extent_node(i)) {
+ alloc_delta.alloc_blk_ranges.emplace_back(
+ i->get_paddr(),
+ i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : (is_lba_node(i->get_type())
+ ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
+ : L_ADDR_NULL),
+ i->get_length(),
+ i->get_type());
+ }
+ }
+
+ for (auto &i: t.written_ool_block_list) {
+ TRACET("fresh ool extent -- {}", t, *i);
+ ceph_assert(i->is_valid());
+ assert(!i->is_inline());
+ get_by_ext(efforts.fresh_ool_by_ext,
+ i->get_type()).increment(i->get_length());
+ i->prepare_commit();
+ if (is_backref_mapped_extent_node(i)) {
+ alloc_delta.alloc_blk_ranges.emplace_back(
+ i->get_paddr(),
+ i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin,
+ i->get_length(),
+ i->get_type());
+ }
+ }
+
+ for (auto &i: t.existing_block_list) {
+ if (i->is_valid()) {
+ alloc_delta.alloc_blk_ranges.emplace_back(
+ i->get_paddr(),
+ i->cast<LogicalCachedExtent>()->get_laddr(),
+ i->get_length(),
+ i->get_type());
+ }
+ }
+ alloc_deltas.emplace_back(std::move(alloc_delta));
+
+ for (auto b : alloc_deltas) {
+ bufferlist bl;
+ encode(b, bl);
+ delta_info_t delta;
+ delta.type = extent_types_t::ALLOC_INFO;
+ delta.bl = bl;
+ record.push_back(std::move(delta));
+ }
+
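+  // Background (trim/cleaner) transactions also persist the current
+  // dirty/alloc journal tails as a JOURNAL_TAIL delta so that replay
+  // knows where to start from.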
+ if (is_background_transaction(trans_src)) {
+ assert(journal_head != JOURNAL_SEQ_NULL);
+ assert(journal_dirty_tail != JOURNAL_SEQ_NULL);
+ journal_seq_t dirty_tail;
+ auto maybe_dirty_tail = get_oldest_dirty_from();
+ if (!maybe_dirty_tail.has_value()) {
+ dirty_tail = journal_head;
+ SUBINFOT(seastore_t, "dirty_tail all trimmed, set to head {}, src={}",
+ t, dirty_tail, trans_src);
+ } else if (*maybe_dirty_tail == JOURNAL_SEQ_NULL) {
+ dirty_tail = journal_dirty_tail;
+ SUBINFOT(seastore_t, "dirty_tail is pending, set to {}, src={}",
+ t, dirty_tail, trans_src);
+ } else {
+ dirty_tail = *maybe_dirty_tail;
+ }
+ ceph_assert(dirty_tail != JOURNAL_SEQ_NULL);
+ journal_seq_t alloc_tail;
+ auto maybe_alloc_tail = get_oldest_backref_dirty_from();
+ if (!maybe_alloc_tail.has_value()) {
+      // FIXME: the replay point of the allocations needs to be accurate.
+      // Setting alloc_tail to the journal head cannot skip replaying the
+      // last, unnecessary record.
+ alloc_tail = journal_head;
+ SUBINFOT(seastore_t, "alloc_tail all trimmed, set to head {}, src={}",
+ t, alloc_tail, trans_src);
+ } else if (*maybe_alloc_tail == JOURNAL_SEQ_NULL) {
+ ceph_abort("impossible");
+ } else {
+ alloc_tail = *maybe_alloc_tail;
+ }
+ ceph_assert(alloc_tail != JOURNAL_SEQ_NULL);
+ auto tails = journal_tail_delta_t{alloc_tail, dirty_tail};
+ SUBDEBUGT(seastore_t, "update tails as delta {}", t, tails);
+ bufferlist bl;
+ encode(tails, bl);
+ delta_info_t delta;
+ delta.type = extent_types_t::JOURNAL_TAIL;
+ delta.bl = bl;
+ record.push_back(std::move(delta));
+ }
+
+ ceph_assert(t.get_fresh_block_stats().num ==
+ t.inline_block_list.size() +
+ t.written_ool_block_list.size() +
+ t.num_delayed_invalid_extents +
+ t.num_allocated_invalid_extents);
+
+ auto& ool_stats = t.get_ool_write_stats();
+ ceph_assert(ool_stats.extents.num == t.written_ool_block_list.size());
+
+ if (record.is_empty()) {
+ SUBINFOT(seastore_t,
+ "record to submit is empty, src={}", t, trans_src);
+ assert(t.onode_tree_stats.is_clear());
+ assert(t.omap_tree_stats.is_clear());
+ assert(t.lba_tree_stats.is_clear());
+ assert(t.backref_tree_stats.is_clear());
+ assert(ool_stats.is_clear());
+ }
+
+ if (record.modify_time == NULL_TIME) {
+ record.modify_time = commit_time;
+ }
+
+ SUBDEBUGT(seastore_t,
+ "commit H{} dirty_from={}, alloc_from={}, "
+ "{} read, {} fresh with {} invalid, "
+ "{} delta, {} retire, {}(md={}B, data={}B) ool-records, "
+ "{}B md, {}B data, modify_time={}",
+ t, (void*)&t.get_handle(),
+ get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
+ get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL),
+ read_stat,
+ fresh_stat,
+ fresh_invalid_stat,
+ delta_stat,
+ retire_stat,
+ ool_stats.num_records,
+ ool_stats.md_bytes,
+ ool_stats.get_data_bytes(),
+ record.size.get_raw_mdlength(),
+ record.size.dlength,
+ sea_time_point_printer_t{record.modify_time});
+ if (is_background_transaction(trans_src)) {
+ // background transaction won't contain any onode tree operations
+ assert(t.onode_tree_stats.is_clear());
+ assert(t.omap_tree_stats.is_clear());
+ } else {
+ if (t.onode_tree_stats.depth) {
+ stats.onode_tree_depth = t.onode_tree_stats.depth;
+ }
+ if (t.omap_tree_stats.depth) {
+ stats.omap_tree_depth = t.omap_tree_stats.depth;
+ }
+ stats.onode_tree_extents_num += t.onode_tree_stats.extents_num_delta;
+ ceph_assert(stats.onode_tree_extents_num >= 0);
+ get_by_src(stats.committed_onode_tree_efforts, trans_src
+ ).increment(t.onode_tree_stats);
+ stats.omap_tree_extents_num += t.omap_tree_stats.extents_num_delta;
+ ceph_assert(stats.omap_tree_extents_num >= 0);
+ get_by_src(stats.committed_omap_tree_efforts, trans_src
+ ).increment(t.omap_tree_stats);
+ }
+
+ if (t.lba_tree_stats.depth) {
+ stats.lba_tree_depth = t.lba_tree_stats.depth;
+ }
+ stats.lba_tree_extents_num += t.lba_tree_stats.extents_num_delta;
+ ceph_assert(stats.lba_tree_extents_num >= 0);
+ get_by_src(stats.committed_lba_tree_efforts, trans_src
+ ).increment(t.lba_tree_stats);
+ if (t.backref_tree_stats.depth) {
+ stats.backref_tree_depth = t.backref_tree_stats.depth;
+ }
+ stats.backref_tree_extents_num += t.backref_tree_stats.extents_num_delta;
+ ceph_assert(stats.backref_tree_extents_num >= 0);
+ get_by_src(stats.committed_backref_tree_efforts, trans_src
+ ).increment(t.backref_tree_stats);
+
+ ++(efforts.num_trans);
+ efforts.num_ool_records += ool_stats.num_records;
+ efforts.ool_record_metadata_bytes += ool_stats.md_bytes;
+ efforts.ool_record_data_bytes += ool_stats.get_data_bytes();
+ efforts.inline_record_metadata_bytes +=
+ (record.size.get_raw_mdlength() - record.get_delta_size());
+
+ auto &rewrite_version_stats = t.get_rewrite_version_stats();
+ if (trans_src == Transaction::src_t::TRIM_DIRTY) {
+ stats.committed_dirty_version.increment_stat(rewrite_version_stats);
+ } else if (trans_src == Transaction::src_t::CLEANER_MAIN ||
+ trans_src == Transaction::src_t::CLEANER_COLD) {
+ stats.committed_reclaim_version.increment_stat(rewrite_version_stats);
+ } else {
+ assert(rewrite_version_stats.is_clear());
+ }
+
+ return record;
+}
+
+void Cache::backref_batch_update(
+ std::vector<backref_entry_ref> &&list,
+ const journal_seq_t &seq)
+{
+ LOG_PREFIX(Cache::backref_batch_update);
+ DEBUG("inserting {} entries at {}", list.size(), seq);
+ ceph_assert(seq != JOURNAL_SEQ_NULL);
+
+ for (auto &ent : list) {
+ backref_entry_mset.insert(*ent);
+ }
+
+ auto iter = backref_entryrefs_by_seq.find(seq);
+ if (iter == backref_entryrefs_by_seq.end()) {
+ backref_entryrefs_by_seq.emplace(seq, std::move(list));
+ } else {
+ iter->second.insert(
+ iter->second.end(),
+ std::make_move_iterator(list.begin()),
+ std::make_move_iterator(list.end()));
+ }
+}
+
+void Cache::complete_commit(
+ Transaction &t,
+ paddr_t final_block_start,
+ journal_seq_t start_seq)
+{
+ LOG_PREFIX(Cache::complete_commit);
+ SUBTRACET(seastore_t, "final_block_start={}, start_seq={}",
+ t, final_block_start, start_seq);
+
+ std::vector<backref_entry_ref> backref_list;
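+  // Fresh extents: rebase inline extents onto the final record start,
+  // mark them CLEAN, index them in the cache and record backref entries
+  // for the backref update below.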
+ t.for_each_fresh_block([&](const CachedExtentRef &i) {
+ if (!i->is_valid()) {
+ return;
+ }
+
+ bool is_inline = false;
+ if (i->is_inline()) {
+ is_inline = true;
+ i->set_paddr(final_block_start.add_relative(i->get_paddr()));
+ }
+ i->last_committed_crc = i->get_crc32c();
+ i->pending_for_transaction = TRANS_ID_NULL;
+ i->on_initial_write();
+
+ i->state = CachedExtent::extent_state_t::CLEAN;
+ DEBUGT("add extent as fresh, inline={} -- {}",
+ t, is_inline, *i);
+ const auto t_src = t.get_src();
+ i->invalidate_hints();
+ add_extent(i, &t_src);
+ epm.commit_space_used(i->get_paddr(), i->get_length());
+ if (is_backref_mapped_extent_node(i)) {
+ DEBUGT("backref_list new {} len {}",
+ t,
+ i->get_paddr(),
+ i->get_length());
+ backref_list.emplace_back(
+ std::make_unique<backref_entry_t>(
+ i->get_paddr(),
+ i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : (is_lba_node(i->get_type())
+ ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
+ : L_ADDR_NULL),
+ i->get_length(),
+ i->get_type(),
+ start_seq));
+ } else if (is_backref_node(i->get_type())) {
+ add_backref_extent(
+ i->get_paddr(),
+ i->cast<backref::BackrefNode>()->get_node_meta().begin,
+ i->get_type());
+ } else {
+ ERRORT("{}", t, *i);
+ ceph_abort("not possible");
+ }
+ });
+
+ // Add new copy of mutated blocks, set_io_wait to block until written
+ for (auto &i: t.mutated_block_list) {
+ if (!i->is_valid()) {
+ continue;
+ }
+ assert(i->is_exist_mutation_pending() ||
+ i->prior_instance);
+ i->on_delta_write(final_block_start);
+ i->pending_for_transaction = TRANS_ID_NULL;
+ i->prior_instance = CachedExtentRef();
+ i->state = CachedExtent::extent_state_t::DIRTY;
+ assert(i->version > 0);
+ if (i->version == 1 || i->get_type() == extent_types_t::ROOT) {
+ i->dirty_from_or_retired_at = start_seq;
+ DEBUGT("commit extent done, become dirty -- {}", t, *i);
+ } else {
+ DEBUGT("commit extent done -- {}", t, *i);
+ }
+ }
+
+ for (auto &i: t.retired_set) {
+ epm.mark_space_free(i->get_paddr(), i->get_length());
+ }
+ for (auto &i: t.existing_block_list) {
+ if (i->is_valid()) {
+ epm.mark_space_used(i->get_paddr(), i->get_length());
+ }
+ }
+
+ for (auto &i: t.mutated_block_list) {
+ if (!i->is_valid()) {
+ continue;
+ }
+ i->complete_io();
+ }
+
+ last_commit = start_seq;
+ for (auto &i: t.retired_set) {
+ i->dirty_from_or_retired_at = start_seq;
+ if (is_backref_mapped_extent_node(i)
+ || is_retired_placeholder(i->get_type())) {
+ DEBUGT("backref_list free {} len {}",
+ t,
+ i->get_paddr(),
+ i->get_length());
+ backref_list.emplace_back(
+ std::make_unique<backref_entry_t>(
+ i->get_paddr(),
+ L_ADDR_NULL,
+ i->get_length(),
+ i->get_type(),
+ start_seq));
+ } else if (is_backref_node(i->get_type())) {
+ remove_backref_extent(i->get_paddr());
+ } else {
+ ERRORT("{}", t, *i);
+ ceph_abort("not possible");
+ }
+ }
+
+ auto existing_stats = t.get_existing_block_stats();
+ DEBUGT("total existing blocks num: {}, exist clean num: {}, "
+ "exist mutation pending num: {}",
+ t,
+ existing_stats.valid_num,
+ existing_stats.clean_num,
+ existing_stats.mutated_num);
+ for (auto &i: t.existing_block_list) {
+ if (i->is_valid()) {
+ if (i->is_exist_clean()) {
+ i->state = CachedExtent::extent_state_t::CLEAN;
+ } else {
+ assert(i->state == CachedExtent::extent_state_t::DIRTY);
+ }
+ DEBUGT("backref_list new existing {} len {}",
+ t,
+ i->get_paddr(),
+ i->get_length());
+ backref_list.emplace_back(
+ std::make_unique<backref_entry_t>(
+ i->get_paddr(),
+ i->cast<LogicalCachedExtent>()->get_laddr(),
+ i->get_length(),
+ i->get_type(),
+ start_seq));
+ const auto t_src = t.get_src();
+ add_extent(i, &t_src);
+ }
+ }
+ if (!backref_list.empty()) {
+ backref_batch_update(std::move(backref_list), start_seq);
+ }
+
+ for (auto &i: t.pre_alloc_list) {
+ if (!i->is_valid()) {
+ epm.mark_space_free(i->get_paddr(), i->get_length());
+ }
+ }
+}
+
+void Cache::init()
+{
+ LOG_PREFIX(Cache::init);
+ if (root) {
+    // initial creation will do mkfs followed by mount, each of which calls init
+ DEBUG("remove extent -- prv_root={}", *root);
+ remove_extent(root);
+ root = nullptr;
+ }
+ root = new RootBlock();
+ root->init(CachedExtent::extent_state_t::CLEAN,
+ P_ADDR_ROOT,
+ PLACEMENT_HINT_NULL,
+ NULL_GENERATION,
+ TRANS_ID_NULL);
+ INFO("init root -- {}", *root);
+ extents.insert(*root);
+}
+
+Cache::mkfs_iertr::future<> Cache::mkfs(Transaction &t)
+{
+ LOG_PREFIX(Cache::mkfs);
+ INFOT("create root", t);
+ return get_root(t).si_then([this, &t](auto croot) {
+ duplicate_for_write(t, croot);
+ return mkfs_iertr::now();
+ }).handle_error_interruptible(
+ mkfs_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in Cache::mkfs"
+ }
+ );
+}
+
+Cache::close_ertr::future<> Cache::close()
+{
+ LOG_PREFIX(Cache::close);
+ INFO("close with {}({}B) dirty, dirty_from={}, alloc_from={}, "
+ "{}({}B) lru, totally {}({}B) indexed extents",
+ dirty.size(),
+ stats.dirty_bytes,
+ get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
+ get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL),
+ lru.get_current_contents_extents(),
+ lru.get_current_contents_bytes(),
+ extents.size(),
+ extents.get_bytes());
+ root.reset();
+ for (auto i = dirty.begin(); i != dirty.end(); ) {
+ auto ptr = &*i;
+ stats.dirty_bytes -= ptr->get_length();
+ dirty.erase(i++);
+ intrusive_ptr_release(ptr);
+ }
+ backref_extents.clear();
+ backref_entryrefs_by_seq.clear();
+ assert(stats.dirty_bytes == 0);
+ lru.clear();
+ return close_ertr::now();
+}
+
+Cache::replay_delta_ret
+Cache::replay_delta(
+ journal_seq_t journal_seq,
+ paddr_t record_base,
+ const delta_info_t &delta,
+ const journal_seq_t &dirty_tail,
+ const journal_seq_t &alloc_tail,
+ sea_time_point modify_time)
+{
+ LOG_PREFIX(Cache::replay_delta);
+ assert(dirty_tail != JOURNAL_SEQ_NULL);
+ assert(alloc_tail != JOURNAL_SEQ_NULL);
+ ceph_assert(modify_time != NULL_TIME);
+
+ // FIXME: This is specific to the segmented implementation
+  /* The journal may validly contain deltas for extents in segments
+   * that have since been released. We can detect those cases by
+   * checking whether the segment in question still carries the
+   * sequence number and type recorded in the delta. We can safely
+   * skip such deltas because the extent must already have been
+   * rewritten.
+   */
+ if (delta.paddr != P_ADDR_NULL &&
+ delta.paddr.get_addr_type() == paddr_types_t::SEGMENT) {
+ auto& seg_addr = delta.paddr.as_seg_paddr();
+ auto seg_info = get_segment_info(seg_addr.get_segment_id());
+ if (seg_info) {
+ auto delta_paddr_segment_seq = seg_info->seq;
+ auto delta_paddr_segment_type = seg_info->type;
+ if (delta_paddr_segment_seq != delta.ext_seq ||
+ delta_paddr_segment_type != delta.seg_type) {
+ DEBUG("delta is obsolete, delta_paddr_segment_seq={},"
+ " delta_paddr_segment_type={} -- {}",
+ segment_seq_printer_t{delta_paddr_segment_seq},
+ delta_paddr_segment_type,
+ delta);
+ return replay_delta_ertr::make_ready_future<bool>(false);
+ }
+ }
+ }
+
+ if (delta.type == extent_types_t::JOURNAL_TAIL) {
+ // this delta should have been dealt with during segment cleaner mounting
+ return replay_delta_ertr::make_ready_future<bool>(false);
+ }
+
+ // replay alloc
+ if (delta.type == extent_types_t::ALLOC_INFO) {
+ if (journal_seq < alloc_tail) {
+ DEBUG("journal_seq {} < alloc_tail {}, don't replay {}",
+ journal_seq, alloc_tail, delta);
+ return replay_delta_ertr::make_ready_future<bool>(false);
+ }
+
+ alloc_delta_t alloc_delta;
+ decode(alloc_delta, delta.bl);
+ std::vector<backref_entry_ref> backref_list;
+ for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) {
+ if (alloc_blk.paddr.is_relative()) {
+ assert(alloc_blk.paddr.is_record_relative());
+ alloc_blk.paddr = record_base.add_relative(alloc_blk.paddr);
+ }
+ DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}",
+ alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq);
+ backref_list.emplace_back(
+ std::make_unique<backref_entry_t>(
+ alloc_blk.paddr,
+ alloc_blk.laddr,
+ alloc_blk.len,
+ alloc_blk.type,
+ journal_seq));
+ }
+ if (!backref_list.empty()) {
+ backref_batch_update(std::move(backref_list), journal_seq);
+ }
+ return replay_delta_ertr::make_ready_future<bool>(true);
+ }
+
+ // replay dirty
+ if (journal_seq < dirty_tail) {
+ DEBUG("journal_seq {} < dirty_tail {}, don't replay {}",
+ journal_seq, dirty_tail, delta);
+ return replay_delta_ertr::make_ready_future<bool>(false);
+ }
+
+ if (delta.type == extent_types_t::ROOT) {
+ TRACE("replay root delta at {} {}, remove extent ... -- {}, prv_root={}",
+ journal_seq, record_base, delta, *root);
+ remove_extent(root);
+ root->apply_delta_and_adjust_crc(record_base, delta.bl);
+ root->dirty_from_or_retired_at = journal_seq;
+ root->state = CachedExtent::extent_state_t::DIRTY;
+ DEBUG("replayed root delta at {} {}, add extent -- {}, root={}",
+ journal_seq, record_base, delta, *root);
+ root->set_modify_time(modify_time);
+ add_extent(root);
+ return replay_delta_ertr::make_ready_future<bool>(true);
+ } else {
+ auto _get_extent_if_cached = [this](paddr_t addr)
+ -> get_extent_ertr::future<CachedExtentRef> {
+      // replay is not included in the cache hit metrics
+ auto ret = query_cache(addr, nullptr);
+ if (ret) {
+        // no retired-placeholder should exist yet because no transaction
+        // has been created.
+ assert(ret->get_type() != extent_types_t::RETIRED_PLACEHOLDER);
+ return ret->wait_io().then([ret] {
+ return ret;
+ });
+ } else {
+ return seastar::make_ready_future<CachedExtentRef>();
+ }
+ };
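+    // The first delta of an extent (pversion == 0) is applied to the
+    // extent read back by type; later deltas are applied only if the
+    // extent is still cached, otherwise they are obsolete and skipped.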
+ auto extent_fut = (delta.pversion == 0 ?
+      // replay is not included in the cache hit metrics
+ _get_extent_by_type(
+ delta.type,
+ delta.paddr,
+ delta.laddr,
+ delta.length,
+ nullptr,
+ [](CachedExtent &) {},
+ [](CachedExtent &) {}) :
+ _get_extent_if_cached(
+ delta.paddr)
+ ).handle_error(
+ replay_delta_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in Cache::replay_delta"
+ }
+ );
+ return extent_fut.safe_then([=, this, &delta](auto extent) {
+ if (!extent) {
+ DEBUG("replay extent is not present, so delta is obsolete at {} {} -- {}",
+ journal_seq, record_base, delta);
+ assert(delta.pversion > 0);
+ return replay_delta_ertr::make_ready_future<bool>(true);
+ }
+
+ DEBUG("replay extent delta at {} {} ... -- {}, prv_extent={}",
+ journal_seq, record_base, delta, *extent);
+
+ assert(extent->last_committed_crc == delta.prev_crc);
+ assert(extent->version == delta.pversion);
+ extent->apply_delta_and_adjust_crc(record_base, delta.bl);
+ extent->set_modify_time(modify_time);
+ assert(extent->last_committed_crc == delta.final_crc);
+
+ extent->version++;
+ if (extent->version == 1) {
+ extent->dirty_from_or_retired_at = journal_seq;
+ DEBUG("replayed extent delta at {} {}, become dirty -- {}, extent={}" ,
+ journal_seq, record_base, delta, *extent);
+ } else {
+ DEBUG("replayed extent delta at {} {} -- {}, extent={}" ,
+ journal_seq, record_base, delta, *extent);
+ }
+ mark_dirty(extent);
+ return replay_delta_ertr::make_ready_future<bool>(true);
+ });
+ }
+}
+
+Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents(
+ Transaction &t,
+ journal_seq_t seq,
+ size_t max_bytes)
+{
+ LOG_PREFIX(Cache::get_next_dirty_extents);
+ if (dirty.empty()) {
+ DEBUGT("max_bytes={}B, seq={}, dirty is empty",
+ t, max_bytes, seq);
+ } else {
+ DEBUGT("max_bytes={}B, seq={}, dirty_from={}",
+ t, max_bytes, seq, dirty.begin()->get_dirty_from());
+ }
+ std::vector<CachedExtentRef> cand;
+ size_t bytes_so_far = 0;
+ for (auto i = dirty.begin();
+ i != dirty.end() && bytes_so_far < max_bytes;
+ ++i) {
+ auto dirty_from = i->get_dirty_from();
+    // dirty extents must be fully loaded
+ assert(i->is_fully_loaded());
+ if (unlikely(dirty_from == JOURNAL_SEQ_NULL)) {
+ ERRORT("got dirty extent with JOURNAL_SEQ_NULL -- {}", t, *i);
+ ceph_abort();
+ }
+ if (dirty_from < seq) {
+ TRACET("next extent -- {}", t, *i);
+ if (!cand.empty() && cand.back()->get_dirty_from() > dirty_from) {
+ ERRORT("dirty extents are not ordered by dirty_from -- last={}, next={}",
+ t, *cand.back(), *i);
+ ceph_abort();
+ }
+ bytes_so_far += i->get_length();
+ cand.push_back(&*i);
+ } else {
+ break;
+ }
+ }
+ return seastar::do_with(
+ std::move(cand),
+ decltype(cand)(),
+ [FNAME, this, &t](auto &cand, auto &ret) {
+ return trans_intr::do_for_each(
+ cand,
+ [FNAME, this, &t, &ret](auto &ext) {
+ TRACET("waiting on extent -- {}", t, *ext);
+ return trans_intr::make_interruptible(
+ ext->wait_io()
+ ).then_interruptible([FNAME, this, ext, &t, &ret] {
+ if (!ext->is_valid()) {
+ ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src()));
+ mark_transaction_conflicted(t, *ext);
+ return;
+ }
+
+ CachedExtentRef on_transaction;
+ auto result = t.get_extent(ext->get_paddr(), &on_transaction);
+ if (result == Transaction::get_extent_ret::ABSENT) {
+ DEBUGT("extent is absent on t -- {}", t, *ext);
+ t.add_to_read_set(ext);
+ if (ext->get_type() == extent_types_t::ROOT) {
+ if (t.root) {
+ assert(&*t.root == &*ext);
+ ceph_assert(0 == "t.root would have to already be in the read set");
+ } else {
+ assert(&*ext == &*root);
+ t.root = root;
+ }
+ }
+ ret.push_back(ext);
+ } else if (result == Transaction::get_extent_ret::PRESENT) {
+ DEBUGT("extent is present on t -- {}, on t {}", t, *ext, *on_transaction);
+ ret.push_back(on_transaction);
+ } else {
+ assert(result == Transaction::get_extent_ret::RETIRED);
+ DEBUGT("extent is retired on t -- {}", t, *ext);
+ }
+ });
+ }).then_interruptible([&ret] {
+ return std::move(ret);
+ });
+ });
+}
+
+Cache::get_root_ret Cache::get_root(Transaction &t)
+{
+ LOG_PREFIX(Cache::get_root);
+ if (t.root) {
+ TRACET("root already on t -- {}", t, *t.root);
+ return t.root->wait_io().then([&t] {
+ return get_root_iertr::make_ready_future<RootBlockRef>(
+ t.root);
+ });
+ } else {
+ DEBUGT("root not on t -- {}", t, *root);
+ t.root = root;
+ t.add_to_read_set(root);
+ return root->wait_io().then([root=root] {
+ return get_root_iertr::make_ready_future<RootBlockRef>(
+ root);
+ });
+ }
+}
+
+Cache::get_extent_ertr::future<CachedExtentRef> Cache::_get_extent_by_type(
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ extent_len_t length,
+ const Transaction::src_t* p_src,
+ extent_init_func_t &&extent_init_func,
+ extent_init_func_t &&on_cache)
+{
+ return [=, this, extent_init_func=std::move(extent_init_func)]() mutable {
+ src_ext_t* p_metric_key = nullptr;
+ src_ext_t metric_key;
+ if (p_src) {
+ metric_key = std::make_pair(*p_src, type);
+ p_metric_key = &metric_key;
+ }
+
+ switch (type) {
+ case extent_types_t::ROOT:
+ ceph_assert(0 == "ROOT is never directly read");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ case extent_types_t::BACKREF_INTERNAL:
+ return get_extent<backref::BackrefInternalNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::BACKREF_LEAF:
+ return get_extent<backref::BackrefLeafNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::LADDR_INTERNAL:
+ return get_extent<lba_manager::btree::LBAInternalNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::LADDR_LEAF:
+ return get_extent<lba_manager::btree::LBALeafNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::OMAP_INNER:
+ return get_extent<omap_manager::OMapInnerNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::OMAP_LEAF:
+ return get_extent<omap_manager::OMapLeafNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::COLL_BLOCK:
+ return get_extent<collection_manager::CollectionNode>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ return get_extent<onode::SeastoreNodeExtent>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::OBJECT_DATA_BLOCK:
+ return get_extent<ObjectDataBlock>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::RETIRED_PLACEHOLDER:
+ ceph_assert(0 == "impossible");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ case extent_types_t::TEST_BLOCK:
+ return get_extent<TestBlock>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return get_extent<TestBlockPhysical>(
+ offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::NONE: {
+ ceph_assert(0 == "NONE is an invalid extent type");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ }
+ default:
+ ceph_assert(0 == "impossible");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ }
+ }().safe_then([laddr](CachedExtentRef e) {
+ assert(e->is_logical() == (laddr != L_ADDR_NULL));
+ if (e->is_logical()) {
+ e->cast<LogicalCachedExtent>()->set_laddr(laddr);
+ }
+ return get_extent_ertr::make_ready_future<CachedExtentRef>(e);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
new file mode 100644
index 000000000..c79473f98
--- /dev/null
+++ b/src/crimson/os/seastore/cache.h
@@ -0,0 +1,1688 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include "seastar/core/shared_future.hh"
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/extent_placement_manager.h"
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/root_block.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore::backref {
+class BtreeBackrefManager;
+}
+
+namespace crimson::os::seastore {
+
+template <
+ typename node_key_t,
+ typename node_val_t,
+ typename internal_node_t,
+ typename leaf_node_t,
+ typename pin_t,
+ size_t node_size,
+ bool leaf_has_children>
+class FixedKVBtree;
+class BackrefManager;
+class SegmentProvider;
+
+struct backref_entry_t {
+ backref_entry_t(
+ const paddr_t paddr,
+ const laddr_t laddr,
+ const extent_len_t len,
+ const extent_types_t type,
+ const journal_seq_t seq)
+ : paddr(paddr),
+ laddr(laddr),
+ len(len),
+ type(type),
+ seq(seq)
+ {}
+ backref_entry_t(alloc_blk_t alloc_blk)
+ : paddr(alloc_blk.paddr),
+ laddr(alloc_blk.laddr),
+ len(alloc_blk.len),
+ type(alloc_blk.type)
+ {}
+ paddr_t paddr = P_ADDR_NULL;
+ laddr_t laddr = L_ADDR_NULL;
+ extent_len_t len = 0;
+ extent_types_t type =
+ extent_types_t::ROOT;
+ journal_seq_t seq;
+ friend bool operator< (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr < r.paddr;
+ }
+ friend bool operator> (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr > r.paddr;
+ }
+ friend bool operator== (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr == r.paddr;
+ }
+
+ using set_hook_t =
+ boost::intrusive::set_member_hook<
+ boost::intrusive::link_mode<
+ boost::intrusive::auto_unlink>>;
+ set_hook_t backref_set_hook;
+ using backref_set_member_options = boost::intrusive::member_hook<
+ backref_entry_t,
+ set_hook_t,
+ &backref_entry_t::backref_set_hook>;
+ using multiset_t = boost::intrusive::multiset<
+ backref_entry_t,
+ backref_set_member_options,
+ boost::intrusive::constant_time_size<false>>;
+
+ struct cmp_t {
+ using is_transparent = paddr_t;
+ bool operator()(
+ const backref_entry_t &l,
+ const backref_entry_t &r) const {
+ return l.paddr < r.paddr;
+ }
+ bool operator()(const paddr_t l, const backref_entry_t &r) const {
+ return l < r.paddr;
+ }
+ bool operator()(const backref_entry_t &l, const paddr_t r) const {
+ return l.paddr < r;
+ }
+ };
+};
+
+std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent);
+
+using backref_entry_ref = std::unique_ptr<backref_entry_t>;
+using backref_entry_mset_t = backref_entry_t::multiset_t;
+using backref_entry_refs_t = std::vector<backref_entry_ref>;
+using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>;
+using backref_entry_query_set_t = std::set<
+ backref_entry_t, backref_entry_t::cmp_t>;
+
+/**
+ * Cache
+ *
+ * This component is responsible for buffer management, including
+ * transaction lifecycle.
+ *
+ * Seastore transactions are expressed as an atomic combination of
+ * 1) newly written blocks
+ * 2) logical mutations to existing physical blocks
+ *
+ * See record_t
+ *
+ * As such, any transaction has 3 components:
+ * 1) read_set: references to extents read during the transaction
+ * See Transaction::read_set
+ * 2) write_set: references to extents to be written as:
+ * a) new physical blocks, see Transaction::fresh_block_list
+ * b) mutations to existing physical blocks,
+ * see Transaction::mutated_block_list
+ * 3) retired_set: extent refs to be retired either due to 2b or
+ * due to releasing the extent generally.
+ *
+ * In the case of 2b, the CachedExtent will have been copied into
+ * a fresh CachedExtentRef such that the source extent ref is present
+ * in the read set and the newly allocated extent is present in the
+ * write_set.
+ *
+ * A transaction has 3 phases:
+ * 1) construction: user calls Cache::create_transaction() and populates
+ * the returned transaction by calling Cache methods
+ * 2) submission: user calls Cache::try_start_transaction(). If
+ *    successful, the user may construct a record and submit the
+ * transaction to the journal.
+ * 3) completion: once the transaction is durable, the user must call
+ * Cache::complete_commit() with the block offset to complete
+ * the transaction.
+ *
+ * Internally, in phase 1, the fields in Transaction are filled in.
+ * - reads may block if the referenced extent is being written
+ * - once a read obtains a particular CachedExtentRef for a paddr_t,
+ * it'll always get the same one until overwritten
+ * - once a paddr_t is overwritten or written, subsequent reads of
+ * that addr will get the new ref
+ *
+ * In phase 2, if all extents in the read set are valid (not expired),
+ * we can commit (otherwise, we fail and the user must retry).
+ * - Expire all extents in the retired_set (they must all be valid)
+ * - Remove all extents in the retired_set from Cache::extents
+ * - Mark all extents in the write_set wait_io(), add promises to
+ * transaction
+ * - Merge Transaction::write_set into Cache::extents
+ *
+ * After phase 2, the user will submit the record to the journal.
+ * Once complete, we perform phase 3:
+ * - For each CachedExtent in block_list, call
+ * CachedExtent::complete_initial_write(paddr_t) with the block's
+ * final offset (inferred from the extent's position in the block_list
+ * and extent lengths).
+ * - For each block in mutation_list, call
+ * CachedExtent::delta_written(paddr_t) with the address of the start
+ * of the record
+ * - Complete all promises with the final record start paddr_t
+ *
+ *
+ * Cache logs
+ *
+ * levels:
+ * - INFO: major initiation, closing operations
+ * - DEBUG: major extent related operations, INFO details
+ * - TRACE: DEBUG details
+ * - seastore_t logs
+ */
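+/*
+ * A rough usage sketch of the three-phase lifecycle described above
+ * (illustrative only: error handling, interruption and the actual
+ * journal submission are elided, and the journal_* inputs are
+ * assumptions of this sketch rather than part of the Cache API):
+ *
+ *   // src: any non-READ Transaction::src_t
+ *   auto t = cache.create_transaction(src, "example", false);
+ *   // phase 1: populate t via Cache methods, e.g.
+ *   //   cache.get_root(*t), cache.duplicate_for_write(*t, extent)
+ *   // phase 2: build the record and submit it to the journal
+ *   auto record = cache.prepare_record(*t, journal_head, journal_dirty_tail);
+ *   // phase 3: once durable, complete the commit
+ *   cache.complete_commit(*t, final_block_start, start_seq);
+ */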
+class Cache {
+public:
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using base_iertr = trans_iertr<base_ertr>;
+
+ Cache(ExtentPlacementManager &epm);
+ ~Cache();
+
+ /// Creates empty transaction by source
+ TransactionRef create_transaction(
+ Transaction::src_t src,
+ const char* name,
+ bool is_weak) {
+ LOG_PREFIX(Cache::create_transaction);
+
+ ++(get_by_src(stats.trans_created_by_src, src));
+
+ auto ret = std::make_unique<Transaction>(
+ get_dummy_ordering_handle(),
+ is_weak,
+ src,
+ last_commit,
+ [this](Transaction& t) {
+ return on_transaction_destruct(t);
+ },
+ ++next_id
+ );
+ SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}",
+ *ret, name, src, is_weak);
+ assert(!is_weak || src == Transaction::src_t::READ);
+ return ret;
+ }
+
+ /// Resets the transaction while preserving its ordering handle
+ void reset_transaction_preserve_handle(Transaction &t) {
+ LOG_PREFIX(Cache::reset_transaction_preserve_handle);
+ if (t.did_reset()) {
+ SUBTRACET(seastore_t, "reset", t);
+ ++(get_by_src(stats.trans_created_by_src, t.get_src()));
+ }
+ t.reset_preserve_handle(last_commit);
+ }
+
+ /// Declare ref retired in t
+ void retire_extent(Transaction &t, CachedExtentRef ref) {
+ LOG_PREFIX(Cache::retire_extent);
+ SUBDEBUGT(seastore_cache, "retire extent -- {}", t, *ref);
+ t.add_to_retired_set(ref);
+ }
+
+ /// Declare paddr retired in t
+ using retire_extent_iertr = base_iertr;
+ using retire_extent_ret = base_iertr::future<>;
+ retire_extent_ret retire_extent_addr(
+ Transaction &t, paddr_t addr, extent_len_t length);
+
+ /**
+ * get_root
+ *
+ * returns ref to current root or t.root if modified in t
+ */
+ using get_root_iertr = base_iertr;
+ using get_root_ret = get_root_iertr::future<RootBlockRef>;
+ get_root_ret get_root(Transaction &t);
+
+ /**
+ * get_root_fast
+ *
+ * returns t.root and assumes it is already present/read in t
+ */
+ RootBlockRef get_root_fast(Transaction &t) {
+ LOG_PREFIX(Cache::get_root_fast);
+ SUBTRACET(seastore_cache, "root already on t -- {}", t, *t.root);
+ assert(t.root);
+ return t.root;
+ }
+
+ /**
+ * get_extent
+ *
+ * returns ref to extent at offset~length of type T either from
+ * - extent_set if already in cache
+ * - disk
+ */
+ using src_ext_t = std::pair<Transaction::src_t, extent_types_t>;
+ using get_extent_ertr = base_ertr;
+ template <typename T>
+ using get_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
+ template <typename T, typename Func, typename OnCache>
+ get_extent_ret<T> get_extent(
+ paddr_t offset, ///< [in] starting addr
+ extent_len_t length, ///< [in] length
+ const src_ext_t* p_src_ext, ///< [in] cache query metric key
+ Func &&extent_init_func, ///< [in] init func for extent
+ OnCache &&on_cache
+ ) {
+ LOG_PREFIX(Cache::get_extent);
+ auto cached = query_cache(offset, p_src_ext);
+ if (!cached) {
+ auto ret = CachedExtent::make_cached_extent_ref<T>(
+ alloc_cache_buf(length));
+ ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
+ offset,
+ PLACEMENT_HINT_NULL,
+ NULL_GENERATION,
+ TRANS_ID_NULL);
+ SUBDEBUG(seastore_cache,
+ "{} {}~{} is absent, add extent and reading ... -- {}",
+ T::TYPE, offset, length, *ret);
+ const auto p_src = p_src_ext ? &p_src_ext->first : nullptr;
+ add_extent(ret, p_src);
+ on_cache(*ret);
+ extent_init_func(*ret);
+ return read_extent<T>(
+ std::move(ret));
+ }
+
+ // extent PRESENT in cache
+ if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
+ auto ret = CachedExtent::make_cached_extent_ref<T>(
+ alloc_cache_buf(length));
+ ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
+ offset,
+ PLACEMENT_HINT_NULL,
+ NULL_GENERATION,
+ TRANS_ID_NULL);
+ SUBDEBUG(seastore_cache,
+ "{} {}~{} is absent(placeholder), reading ... -- {}",
+ T::TYPE, offset, length, *ret);
+ extents.replace(*ret, *cached);
+ on_cache(*ret);
+
+ // replace placeholder in transactions
+ while (!cached->transactions.empty()) {
+ auto t = cached->transactions.begin()->t;
+ t->replace_placeholder(*cached, *ret);
+ }
+
+ cached->state = CachedExtent::extent_state_t::INVALID;
+ extent_init_func(*ret);
+ return read_extent<T>(
+ std::move(ret));
+ } else if (!cached->is_fully_loaded()) {
+ auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
+ on_cache(*ret);
+ SUBDEBUG(seastore_cache,
+ "{} {}~{} is present without been fully loaded, reading ... -- {}",
+ T::TYPE, offset, length, *ret);
+ auto bp = alloc_cache_buf(length);
+ ret->set_bptr(std::move(bp));
+ return read_extent<T>(
+ std::move(ret));
+ } else {
+ SUBTRACE(seastore_cache,
+ "{} {}~{} is present in cache -- {}",
+ T::TYPE, offset, length, *cached);
+ auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
+ on_cache(*ret);
+ return ret->wait_io(
+ ).then([ret=std::move(ret)]() mutable
+ -> get_extent_ret<T> {
+ // ret may be invalid, caller must check
+ return get_extent_ret<T>(
+ get_extent_ertr::ready_future_marker{},
+ std::move(ret));
+ });
+ }
+ }
+ template <typename T>
+ get_extent_ret<T> get_extent(
+ paddr_t offset, ///< [in] starting addr
+ extent_len_t length, ///< [in] length
+ const src_ext_t* p_metric_key ///< [in] cache query metric key
+ ) {
+ return get_extent<T>(
+ offset, length, p_metric_key,
+ [](T &){}, [](T &) {});
+ }
+
+
+ /**
+ * get_extent_if_cached
+ *
+ * Returns extent at offset if in cache
+ */
+ using get_extent_if_cached_iertr = base_iertr;
+ using get_extent_if_cached_ret =
+ get_extent_if_cached_iertr::future<CachedExtentRef>;
+ get_extent_if_cached_ret get_extent_if_cached(
+ Transaction &t,
+ paddr_t offset,
+ extent_types_t type) {
+ CachedExtentRef ret;
+ LOG_PREFIX(Cache::get_extent_if_cached);
+ auto result = t.get_extent(offset, &ret);
+ if (result == Transaction::get_extent_ret::RETIRED) {
+ SUBDEBUGT(seastore_cache, "{} {} is retired on t -- {}",
+ t, type, offset, *ret);
+ return get_extent_if_cached_iertr::make_ready_future<
+ CachedExtentRef>(ret);
+ } else if (result == Transaction::get_extent_ret::PRESENT) {
+ if (ret->is_fully_loaded()) {
+ SUBTRACET(seastore_cache, "{} {} is present on t -- {}",
+ t, type, offset, *ret);
+ return ret->wait_io().then([ret] {
+ return get_extent_if_cached_iertr::make_ready_future<
+ CachedExtentRef>(ret);
+ });
+ } else {
+ SUBDEBUGT(seastore_cache, "{} {} is present on t -- {}"
+ " without being fully loaded", t, type, offset, *ret);
+ return get_extent_if_cached_iertr::make_ready_future<
+ CachedExtentRef>();
+ }
+ }
+
+ // get_extent_ret::ABSENT from transaction
+ auto metric_key = std::make_pair(t.get_src(), type);
+ ret = query_cache(offset, &metric_key);
+ if (!ret) {
+ SUBDEBUGT(seastore_cache, "{} {} is absent", t, type, offset);
+ return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
+ } else if (ret->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
+ // retired_placeholder is not really cached yet
+ SUBDEBUGT(seastore_cache, "{} {} is absent(placeholder)",
+ t, type, offset);
+ return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
+ } else if (!ret->is_fully_loaded()) {
+ SUBDEBUGT(seastore_cache, "{} {} is present without "
+ "being fully loaded", t, type, offset);
+ return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
+ }
+
+ // present in cache(fully loaded) and is not a retired_placeholder
+ SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}",
+ t, type, offset, *ret);
+ t.add_to_read_set(ret);
+ touch_extent(*ret);
+ return ret->wait_io().then([ret] {
+ return get_extent_if_cached_iertr::make_ready_future<
+ CachedExtentRef>(ret);
+ });
+ }
+
+ /**
+ * get_extent
+ *
+ * returns ref to extent at offset~length of type T either from
+ * - t if modified by t
+ * - extent_set if already in cache
+ * - disk
+ *
+ * t *must not* have retired offset
+ */
+ using get_extent_iertr = base_iertr;
+ template <typename T, typename Func>
+ get_extent_iertr::future<TCachedExtentRef<T>> get_extent(
+ Transaction &t,
+ paddr_t offset,
+ extent_len_t length,
+ Func &&extent_init_func) {
+ CachedExtentRef ret;
+ LOG_PREFIX(Cache::get_extent);
+ auto result = t.get_extent(offset, &ret);
+ if (result == Transaction::get_extent_ret::RETIRED) {
+ SUBERRORT(seastore_cache, "{} {}~{} is retired on t -- {}",
+ t, T::TYPE, offset, length, *ret);
+ ceph_abort("impossible");
+ } else if (result == Transaction::get_extent_ret::PRESENT) {
+ if (ret->is_fully_loaded()) {
+ SUBTRACET(seastore_cache, "{} {}~{} is present on t -- {}",
+ t, T::TYPE, offset, length, *ret);
+ return ret->wait_io().then([ret] {
+ return seastar::make_ready_future<TCachedExtentRef<T>>(
+ ret->cast<T>());
+ });
+ } else {
+ assert(!ret->is_mutable());
+ touch_extent(*ret);
+ SUBDEBUGT(seastore_cache, "{} {}~{} is present on t without been \
+ fully loaded, reading ... {}", t, T::TYPE, offset, length, *ret);
+ auto bp = alloc_cache_buf(ret->get_length());
+ ret->set_bptr(std::move(bp));
+ return read_extent<T>(
+ ret->cast<T>());
+ }
+ } else {
+ SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
+ t, T::TYPE, offset, length);
+ auto f = [&t, this](CachedExtent &ext) {
+ t.add_to_read_set(CachedExtentRef(&ext));
+ touch_extent(ext);
+ };
+ auto metric_key = std::make_pair(t.get_src(), T::TYPE);
+ return trans_intr::make_interruptible(
+ get_extent<T>(
+ offset, length, &metric_key,
+ std::forward<Func>(extent_init_func), std::move(f))
+ );
+ }
+ }
+
+ /*
+ * get_absent_extent
+ *
+ * Mostly the same as Cache::get_extent(), with the only difference
+ * that get_absent_extent won't search the transaction's context for
+ * the specific CachedExtent
+ */
+ template <typename T, typename Func>
+ get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
+ Transaction &t,
+ paddr_t offset,
+ extent_len_t length,
+ Func &&extent_init_func) {
+ CachedExtentRef ret;
+ LOG_PREFIX(Cache::get_absent_extent);
+
+#ifndef NDEBUG
+ auto r = t.get_extent(offset, &ret);
+ if (r != Transaction::get_extent_ret::ABSENT) {
+ SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret);
+ ceph_abort();
+ }
+#endif
+
+ SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
+ t, T::TYPE, offset, length);
+ auto f = [&t, this](CachedExtent &ext) {
+ t.add_to_read_set(CachedExtentRef(&ext));
+ touch_extent(ext);
+ };
+ auto metric_key = std::make_pair(t.get_src(), T::TYPE);
+ return trans_intr::make_interruptible(
+ get_extent<T>(
+ offset, length, &metric_key,
+ std::forward<Func>(extent_init_func), std::move(f))
+ );
+ }
+
+ template <typename T>
+ get_extent_iertr::future<TCachedExtentRef<T>> get_extent(
+ Transaction &t,
+ paddr_t offset,
+ extent_len_t length) {
+ return get_extent<T>(t, offset, length, [](T &){});
+ }
+
+ /*
+ * get_absent_extent
+ *
+ * Mostly the same as Cache::get_extent(), with the only difference
+ * that get_absent_extent won't search the transaction's context for
+ * the specific CachedExtent
+ */
+ template <typename T>
+ get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
+ Transaction &t,
+ paddr_t offset,
+ extent_len_t length) {
+ return get_absent_extent<T>(t, offset, length, [](T &){});
+ }
+
+ get_extent_ertr::future<CachedExtentRef> get_extent_viewable_by_trans(
+ Transaction &t,
+ CachedExtentRef extent)
+ {
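+ // Use the version of this extent pending in t if one exists; otherwise
+ // use the shared extent and record it in t's read set. Only extents
+ // that are not mutation-pending are bumped in the LRU.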
+ auto p_extent = extent->get_transactional_view(t);
+ if (!p_extent->is_pending_in_trans(t.get_trans_id())) {
+ t.add_to_read_set(p_extent);
+ if (!p_extent->is_mutation_pending()) {
+ touch_extent(*p_extent);
+ }
+ }
+ // user should not see RETIRED_PLACEHOLDER extents
+ ceph_assert(p_extent->get_type() != extent_types_t::RETIRED_PLACEHOLDER);
+ if (!p_extent->is_fully_loaded()) {
+ assert(!p_extent->is_mutable());
+ touch_extent(*p_extent);
+ LOG_PREFIX(Cache::get_extent_viewable_by_trans);
+ SUBDEBUG(seastore_cache,
+ "{} {}~{} is present without been fully loaded, reading ... -- {}",
+ p_extent->get_type(), p_extent->get_paddr(),p_extent->get_length(),
+ *p_extent);
+ auto bp = alloc_cache_buf(p_extent->get_length());
+ p_extent->set_bptr(std::move(bp));
+ return read_extent<CachedExtent>(CachedExtentRef(p_extent));
+ }
+ return p_extent->wait_io(
+ ).then([p_extent] {
+ return get_extent_ertr::make_ready_future<CachedExtentRef>(
+ CachedExtentRef(p_extent));
+ });
+ }
+
+ template <typename T>
+ get_extent_ertr::future<TCachedExtentRef<T>> get_extent_viewable_by_trans(
+ Transaction &t,
+ TCachedExtentRef<T> extent)
+ {
+ return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get())
+ ).safe_then([](auto p_extent) {
+ return p_extent->template cast<T>();
+ });
+ }
+
+ extent_len_t get_block_size() const {
+ return epm.get_block_size();
+ }
+
+private:
+ // This is a workaround for std::move_only_function not being available;
+ // not really worth generalizing at this time.
+ class extent_init_func_t {
+ struct callable_i {
+ virtual void operator()(CachedExtent &extent) = 0;
+ virtual ~callable_i() = default;
+ };
+ template <typename Func>
+ struct callable_wrapper final : callable_i {
+ Func func;
+ callable_wrapper(Func &&func) : func(std::forward<Func>(func)) {}
+ void operator()(CachedExtent &extent) final {
+ return func(extent);
+ }
+ ~callable_wrapper() final = default;
+ };
+ public:
+ std::unique_ptr<callable_i> wrapped;
+ template <typename Func>
+ extent_init_func_t(Func &&func) : wrapped(
+ std::make_unique<callable_wrapper<Func>>(std::forward<Func>(func)))
+ {}
+ void operator()(CachedExtent &extent) {
+ return (*wrapped)(extent);
+ }
+ };
+ get_extent_ertr::future<CachedExtentRef> _get_extent_by_type(
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ extent_len_t length,
+ const Transaction::src_t* p_src,
+ extent_init_func_t &&extent_init_func,
+ extent_init_func_t &&on_cache
+ );
+
+ using get_extent_by_type_iertr = get_extent_iertr;
+ using get_extent_by_type_ret = get_extent_by_type_iertr::future<
+ CachedExtentRef>;
+ get_extent_by_type_ret _get_extent_by_type(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ extent_len_t length,
+ extent_init_func_t &&extent_init_func
+ ) {
+ LOG_PREFIX(Cache::get_extent_by_type);
+ CachedExtentRef ret;
+ auto status = t.get_extent(offset, &ret);
+ if (status == Transaction::get_extent_ret::RETIRED) {
+ SUBERRORT(seastore_cache, "{} {}~{} {} is retired on t -- {}",
+ t, type, offset, length, laddr, *ret);
+ ceph_abort("impossible");
+ } else if (status == Transaction::get_extent_ret::PRESENT) {
+ if (ret->is_fully_loaded()) {
+ SUBTRACET(seastore_cache, "{} {}~{} {} is present on t -- {}",
+ t, type, offset, length, laddr, *ret);
+ return ret->wait_io().then([ret] {
+ return seastar::make_ready_future<CachedExtentRef>(ret);
+ });
+ } else {
+ assert(!ret->is_mutable());
+ touch_extent(*ret);
+ SUBDEBUGT(seastore_cache, "{} {}~{} {} is present on t without been \
+ fully loaded, reading ...", t, type, offset, length, laddr);
+ auto bp = alloc_cache_buf(ret->get_length());
+ ret->set_bptr(std::move(bp));
+ return read_extent<CachedExtent>(
+ std::move(ret));
+ }
+ } else {
+ SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
+ t, type, offset, length, laddr);
+ auto f = [&t, this](CachedExtent &ext) {
+ t.add_to_read_set(CachedExtentRef(&ext));
+ touch_extent(ext);
+ };
+ auto src = t.get_src();
+ return trans_intr::make_interruptible(
+ _get_extent_by_type(
+ type, offset, laddr, length, &src,
+ std::move(extent_init_func), std::move(f))
+ );
+ }
+ }
+
+ get_extent_by_type_ret _get_absent_extent_by_type(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ extent_len_t length,
+ extent_init_func_t &&extent_init_func
+ ) {
+ LOG_PREFIX(Cache::_get_absent_extent_by_type);
+
+#ifndef NDEBUG
+ CachedExtentRef ret;
+ auto r = t.get_extent(offset, &ret);
+ if (r != Transaction::get_extent_ret::ABSENT) {
+ SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret);
+ ceph_abort();
+ }
+#endif
+
+ SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
+ t, type, offset, length, laddr);
+ auto f = [&t, this](CachedExtent &ext) {
+ t.add_to_read_set(CachedExtentRef(&ext));
+ touch_extent(ext);
+ };
+ auto src = t.get_src();
+ return trans_intr::make_interruptible(
+ _get_extent_by_type(
+ type, offset, laddr, length, &src,
+ std::move(extent_init_func), std::move(f))
+ );
+ }
+
+ backref_entryrefs_by_seq_t backref_entryrefs_by_seq;
+ backref_entry_mset_t backref_entry_mset;
+
+ using backref_entry_query_mset_t = std::multiset<
+ backref_entry_t, backref_entry_t::cmp_t>;
+ backref_entry_query_mset_t get_backref_entries_in_range(
+ paddr_t start,
+ paddr_t end) {
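+ // Both bounds use lower_bound, so the result covers the half-open
+ // range [start, end): entries at exactly `end` are excluded.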
+ auto start_iter = backref_entry_mset.lower_bound(
+ start,
+ backref_entry_t::cmp_t());
+ auto end_iter = backref_entry_mset.lower_bound(
+ end,
+ backref_entry_t::cmp_t());
+ backref_entry_query_mset_t res;
+ for (auto it = start_iter;
+ it != end_iter;
+ it++) {
+ res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq);
+ }
+ return res;
+ }
+
+ const backref_entry_mset_t& get_backref_entry_mset() {
+ return backref_entry_mset;
+ }
+
+ backref_entryrefs_by_seq_t& get_backref_entryrefs_by_seq() {
+ return backref_entryrefs_by_seq;
+ }
+
+ const segment_info_t* get_segment_info(segment_id_t sid) {
+ auto provider = segment_providers_by_device_id[sid.device_id()];
+ if (provider) {
+ return &provider->get_seg_info(sid);
+ } else {
+ return nullptr;
+ }
+ }
+
+public:
+ /**
+ * get_extent_by_type
+ *
+ * Based on type, instantiate the correct concrete type
+ * and read in the extent at location offset~length.
+ */
+ template <typename Func>
+ get_extent_by_type_ret get_extent_by_type(
+ Transaction &t, ///< [in] transaction
+ extent_types_t type, ///< [in] type tag
+ paddr_t offset, ///< [in] starting addr
+ laddr_t laddr, ///< [in] logical address if logical
+ extent_len_t length, ///< [in] length
+ Func &&extent_init_func ///< [in] extent init func
+ ) {
+ return _get_extent_by_type(
+ t,
+ type,
+ offset,
+ laddr,
+ length,
+ extent_init_func_t(std::forward<Func>(extent_init_func)));
+ }
+
+ /*
+ * get_absent_extent_by_type
+ *
+ * Mostly the same as Cache::get_extent_by_type(), with the only difference
+ * that get_absent_extent_by_type won't search the transaction's context for
+ * the specific CachedExtent
+ */
+ template <typename Func>
+ get_extent_by_type_ret get_absent_extent_by_type(
+ Transaction &t, ///< [in] transaction
+ extent_types_t type, ///< [in] type tag
+ paddr_t offset, ///< [in] starting addr
+ laddr_t laddr, ///< [in] logical address if logical
+ extent_len_t length, ///< [in] length
+ Func &&extent_init_func ///< [in] extent init func
+ ) {
+ return _get_absent_extent_by_type(
+ t,
+ type,
+ offset,
+ laddr,
+ length,
+ extent_init_func_t(std::forward<Func>(extent_init_func)));
+ }
+
+ get_extent_by_type_ret get_extent_by_type(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ extent_len_t length
+ ) {
+ return get_extent_by_type(
+ t, type, offset, laddr, length, [](CachedExtent &) {});
+ }
+
+
+ /*
+ * get_absent_extent_by_type
+ *
+ * Mostly the same as Cache::get_extent_by_type(), with the only difference
+ * that get_absent_extent_by_type won't search the transaction's context for
+ * the specific CachedExtent
+ */
+ get_extent_by_type_ret get_absent_extent_by_type(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ extent_len_t length
+ ) {
+ return get_absent_extent_by_type(
+ t, type, offset, laddr, length, [](CachedExtent &) {});
+ }
+
+ void trim_backref_bufs(const journal_seq_t &trim_to) {
+ LOG_PREFIX(Cache::trim_backref_bufs);
+ SUBDEBUG(seastore_cache, "trimming to {}", trim_to);
+ if (!backref_entryrefs_by_seq.empty()) {
+ SUBDEBUG(seastore_cache, "backref_entryrefs_by_seq {} ~ {}, size={}",
+ backref_entryrefs_by_seq.rbegin()->first,
+ backref_entryrefs_by_seq.begin()->first,
+ backref_entryrefs_by_seq.size());
+ assert(backref_entryrefs_by_seq.rbegin()->first >= trim_to);
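+ // upper_bound keeps entries strictly newer than trim_to; everything
+ // at or before trim_to is erased.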
+ auto iter = backref_entryrefs_by_seq.upper_bound(trim_to);
+ backref_entryrefs_by_seq.erase(backref_entryrefs_by_seq.begin(), iter);
+ }
+ if (backref_entryrefs_by_seq.empty()) {
+ SUBDEBUG(seastore_cache, "backref_entryrefs_by_seq all trimmed");
+ }
+ }
+
+ /**
+ * alloc_new_extent
+ *
+ * Allocates a fresh extent. If delayed is true, addr will be alloc'd later.
+ * Note that epaddr can only be fed by the btree lba unittest for now
+ */
+ template <typename T>
+ TCachedExtentRef<T> alloc_new_extent(
+ Transaction &t, ///< [in, out] current transaction
+ extent_len_t length, ///< [in] length
+ placement_hint_t hint, ///< [in] user hint
+#ifdef UNIT_TESTS_BUILT
+ rewrite_gen_t gen, ///< [in] rewrite generation
+ std::optional<paddr_t> epaddr = std::nullopt ///< [in] paddr fed by callers
+#else
+ rewrite_gen_t gen
+#endif
+ ) {
+ LOG_PREFIX(Cache::alloc_new_extent);
+ SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ t, T::TYPE, length, hint, rewrite_gen_printer_t{gen});
+#ifdef UNIT_TESTS_BUILT
+ auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen, epaddr);
+#else
+ auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen);
+#endif
+ auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
+ ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
+ result.paddr,
+ hint,
+ result.gen,
+ t.get_trans_id());
+ t.add_fresh_extent(ret);
+ SUBDEBUGT(seastore_cache,
+ "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
+ t, T::TYPE, length, result.paddr,
+ hint, rewrite_gen_printer_t{result.gen}, *ret);
+ return ret;
+ }
+
+ /**
+ * alloc_remapped_extent
+ *
+ * Allocates an EXIST_CLEAN extent. Uses the buffer to fill the new extent
+ * if one is provided.
+ */
+ template <typename T>
+ TCachedExtentRef<T> alloc_remapped_extent(
+ Transaction &t,
+ laddr_t remap_laddr,
+ paddr_t remap_paddr,
+ extent_len_t remap_length,
+ laddr_t original_laddr,
+ std::optional<ceph::bufferptr> &&original_bptr) {
+ LOG_PREFIX(Cache::alloc_remapped_extent);
+ assert(remap_laddr >= original_laddr);
+ TCachedExtentRef<T> ext;
+ if (original_bptr.has_value()) {
+ // shallow copy the buffer from original extent
+ auto nbp = ceph::bufferptr(
+ *original_bptr,
+ remap_laddr - original_laddr,
+ remap_length);
+ // ExtentPlacementManager::alloc_new_extent will make a new
+ // (relative/temp) paddr, so make extent directly
+ ext = CachedExtent::make_cached_extent_ref<T>(std::move(nbp));
+ } else {
+ ext = CachedExtent::make_placeholder_cached_extent_ref<T>(remap_length);
+ }
+
+ ext->init(CachedExtent::extent_state_t::EXIST_CLEAN,
+ remap_paddr,
+ PLACEMENT_HINT_NULL,
+ NULL_GENERATION,
+ t.get_trans_id());
+
+ t.add_fresh_extent(ext);
+ SUBTRACET(seastore_cache, "allocated {} {}B, hint={}, has ptr? {} -- {}",
+ t, T::TYPE, remap_length, remap_laddr, original_bptr.has_value(), *ext);
+ return ext;
+ }
+
+ /**
+ * alloc_new_extent_by_type
+ *
+ * Allocates a fresh extent. addr will be relative until commit.
+ */
+ CachedExtentRef alloc_new_extent_by_type(
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
+ extent_len_t length, ///< [in] length
+ placement_hint_t hint, ///< [in] user hint
+ rewrite_gen_t gen ///< [in] rewrite generation
+ );
+
+ /**
+ * Allocates mutable buffer from extent_set on offset~len
+ *
+ * TODO: Note, currently all implementations literally copy the
+ * buffer. This needn't be true; CachedExtent implementations could
+ * choose to refer to the same buffer unmodified until commit and just
+ * buffer the mutations in an ancillary data structure.
+ *
+ * @param current transaction
+ * @param extent to duplicate
+ * @return mutable extent
+ */
+ CachedExtentRef duplicate_for_write(
+ Transaction &t, ///< [in, out] current transaction
+ CachedExtentRef i ///< [in] ref to existing extent
+ );
+
+ /**
+ * set_segment_providers
+ *
+ * Set to provide segment information to help identify outdated deltas.
+ *
+ * FIXME: This is specific to the segmented implementation
+ */
+ void set_segment_providers(std::vector<SegmentProvider*> &&providers) {
+ segment_providers_by_device_id = std::move(providers);
+ }
+
+ /**
+ * prepare_record
+ *
+ * Construct the record for Journal from transaction.
+ */
+ record_t prepare_record(
+ Transaction &t, ///< [in, out] current transaction
+ const journal_seq_t &journal_head,
+ const journal_seq_t &journal_dirty_tail
+ );
+
+ /**
+ * complete_commit
+ *
+ * Must be called upon completion of write. Releases blocks on mutating
+ * extents, fills in addresses, and calls relevant callbacks on fresh
+ * and mutated extents.
+ */
+ void complete_commit(
+ Transaction &t, ///< [in, out] current transaction
+ paddr_t final_block_start, ///< [in] offset of initial block
+ journal_seq_t seq ///< [in] journal commit seq
+ );
+
+ /**
+ * init
+ */
+ void init();
+
+ /**
+ * mkfs
+ *
+ * Alloc initial root node and add to t. The intention is for other
+ * components to use t to adjust the resulting root ref prior to commit.
+ */
+ using mkfs_iertr = base_iertr;
+ mkfs_iertr::future<> mkfs(Transaction &t);
+
+ /**
+ * close
+ *
+ * TODO: should flush dirty blocks
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ close_ertr::future<> close();
+
+ /**
+ * replay_delta
+ *
+ * Intended for use in Journal::delta. For each delta, should decode delta,
+ * read relevant block from disk or cache (using correct type), and call
+ * CachedExtent::apply_delta marking the extent dirty.
+ *
+ * Returns whether the delta is applied.
+ */
+ using replay_delta_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using replay_delta_ret = replay_delta_ertr::future<bool>;
+ replay_delta_ret replay_delta(
+ journal_seq_t seq,
+ paddr_t record_block_base,
+ const delta_info_t &delta,
+ const journal_seq_t &dirty_tail,
+ const journal_seq_t &alloc_tail,
+ sea_time_point modify_time);
+
+ /**
+ * init_cached_extents
+ *
+ * Calls passed lambda for each dirty cached block. Intended for use
+ * after replay to allow lba_manager (or w/e) to read in any ancestor
+ * blocks.
+ */
+ using init_cached_extents_iertr = base_iertr;
+ using init_cached_extents_ret = init_cached_extents_iertr::future<>;
+ template <typename F>
+ init_cached_extents_ret init_cached_extents(
+ Transaction &t,
+ F &&f)
+ {
+ LOG_PREFIX(Cache::init_cached_extents);
+ SUBINFOT(seastore_cache,
+ "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
+ t,
+ extents.size(),
+ extents.get_bytes(),
+ dirty.size(),
+ get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
+ get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL));
+
+ // journal replay should have finished at this point;
+ // Cache::root should have been inserted into the dirty list
+ assert(root->is_dirty());
+ std::vector<CachedExtentRef> _dirty;
+ for (auto &e : extents) {
+ _dirty.push_back(CachedExtentRef(&e));
+ }
+ return seastar::do_with(
+ std::forward<F>(f),
+ std::move(_dirty),
+ [this, FNAME, &t](auto &f, auto &refs) mutable
+ {
+ return trans_intr::do_for_each(
+ refs,
+ [this, FNAME, &t, &f](auto &e)
+ {
+ SUBTRACET(seastore_cache, "inspecting extent ... -- {}", t, *e);
+ return f(t, e
+ ).si_then([this, FNAME, &t, e](bool is_alive) {
+ if (!is_alive) {
+ SUBDEBUGT(seastore_cache, "extent is not alive, remove extent -- {}", t, *e);
+ remove_extent(e);
+ e->set_invalid(t);
+ } else {
+ SUBDEBUGT(seastore_cache, "extent is alive -- {}", t, *e);
+ }
+ });
+ });
+ }).handle_error_interruptible(
+ init_cached_extents_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in Cache::init_cached_extents"
+ }
+ ).si_then([this, FNAME, &t] {
+ SUBINFOT(seastore_cache,
+ "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
+ t,
+ extents.size(),
+ extents.get_bytes(),
+ dirty.size(),
+ get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
+ get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL));
+ });
+ }
+
+ /**
+ * update_extent_from_transaction
+ *
+ * Updates passed extent based on t. If extent has been retired,
+ * a null result will be returned.
+ */
+ CachedExtentRef update_extent_from_transaction(
+ Transaction &t,
+ CachedExtentRef extent) {
+ if (extent->get_type() == extent_types_t::ROOT) {
+ if (t.root) {
+ return t.root;
+ } else {
+ t.add_to_read_set(extent);
+ t.root = extent->cast<RootBlock>();
+ return extent;
+ }
+ } else {
+ auto result = t.get_extent(extent->get_paddr(), &extent);
+ if (result == Transaction::get_extent_ret::RETIRED) {
+ return CachedExtentRef();
+ } else {
+ if (result == Transaction::get_extent_ret::ABSENT) {
+ t.add_to_read_set(extent);
+ }
+ return extent;
+ }
+ }
+ }
+
+ /**
+ * print
+ *
+ * Dump summary of contents (TODO)
+ */
+ std::ostream &print(
+ std::ostream &out) const {
+ return out;
+ }
+
+ /**
+ * get_next_dirty_extents
+ *
+ * Returns extents with get_dirty_from() < seq and adds to read set of
+ * t.
+ */
+ using get_next_dirty_extents_iertr = base_iertr;
+ using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future<
+ std::vector<CachedExtentRef>>;
+ get_next_dirty_extents_ret get_next_dirty_extents(
+ Transaction &t,
+ journal_seq_t seq,
+ size_t max_bytes);
+
+ /// returns std::nullopt if no pending alloc-infos
+ std::optional<journal_seq_t> get_oldest_backref_dirty_from() const {
+ LOG_PREFIX(Cache::get_oldest_backref_dirty_from);
+ if (backref_entryrefs_by_seq.empty()) {
+ SUBDEBUG(seastore_cache, "backref_oldest: null");
+ return std::nullopt;
+ }
+ auto oldest = backref_entryrefs_by_seq.begin()->first;
+ SUBDEBUG(seastore_cache, "backref_oldest: {}", oldest);
+ ceph_assert(oldest != JOURNAL_SEQ_NULL);
+ return oldest;
+ }
+
+ /// returns std::nullopt if no dirty extents
+ /// returns JOURNAL_SEQ_NULL if the oldest dirty extent is still pending
+ std::optional<journal_seq_t> get_oldest_dirty_from() const {
+ LOG_PREFIX(Cache::get_oldest_dirty_from);
+ if (dirty.empty()) {
+ SUBDEBUG(seastore_cache, "dirty_oldest: null");
+ return std::nullopt;
+ } else {
+ auto oldest = dirty.begin()->get_dirty_from();
+ if (oldest == JOURNAL_SEQ_NULL) {
+ SUBDEBUG(seastore_cache, "dirty_oldest: pending");
+ } else {
+ SUBDEBUG(seastore_cache, "dirty_oldest: {}", oldest);
+ }
+ return oldest;
+ }
+ }
+
+ /// Dump live extents
+ void dump_contents();
+
+ /**
+ * backref_extent_entry_t
+ *
+ * All the backref extent entries have to be indexed by paddr in memory,
+ * so they can be retrieved by range during cleaning.
+ *
+ * See BtreeBackrefManager::retrieve_backref_extents_in_range()
+ */
+ struct backref_extent_entry_t {
+ backref_extent_entry_t(
+ paddr_t paddr,
+ paddr_t key,
+ extent_types_t type)
+ : paddr(paddr), key(key), type(type) {}
+ paddr_t paddr = P_ADDR_NULL;
+ paddr_t key = P_ADDR_NULL;
+ extent_types_t type = extent_types_t::ROOT;
+ struct cmp_t {
+ using is_transparent = paddr_t;
+ bool operator()(
+ const backref_extent_entry_t &l,
+ const backref_extent_entry_t &r) const {
+ return l.paddr < r.paddr;
+ }
+ bool operator()(
+ const paddr_t &l,
+ const backref_extent_entry_t &r) const {
+ return l < r.paddr;
+ }
+ bool operator()(
+ const backref_extent_entry_t &l,
+ const paddr_t &r) const {
+ return l.paddr < r;
+ }
+ };
+ };
+
+ void update_tree_extents_num(extent_types_t type, int64_t delta) {
+ switch (type) {
+ case extent_types_t::LADDR_INTERNAL:
+ [[fallthrough]];
+ case extent_types_t::DINK_LADDR_LEAF:
+ [[fallthrough]];
+ case extent_types_t::LADDR_LEAF:
+ stats.lba_tree_extents_num += delta;
+ ceph_assert(stats.lba_tree_extents_num >= 0);
+ return;
+ case extent_types_t::OMAP_INNER:
+ [[fallthrough]];
+ case extent_types_t::OMAP_LEAF:
+ stats.omap_tree_extents_num += delta;
+ ceph_assert(stats.omap_tree_extents_num >= 0);
+ return;
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ stats.onode_tree_extents_num += delta;
+ ceph_assert(stats.onode_tree_extents_num >= 0);
+ return;
+ case extent_types_t::BACKREF_INTERNAL:
+ [[fallthrough]];
+ case extent_types_t::BACKREF_LEAF:
+ stats.backref_tree_extents_num += delta;
+ ceph_assert(stats.backref_tree_extents_num >= 0);
+ return;
+ default:
+ return;
+ }
+ }
+
+ uint64_t get_omap_tree_depth() {
+ return stats.omap_tree_depth;
+ }
+
+ /// Update lru for access to ref
+ void touch_extent(
+ CachedExtent &ext,
+ const Transaction::src_t* p_src=nullptr)
+ {
+ if (p_src && is_background_transaction(*p_src))
+ return;
+ if (ext.is_stable_clean() && !ext.is_placeholder()) {
+ lru.move_to_top(ext);
+ }
+ }
+
+private:
+ ExtentPlacementManager& epm;
+ RootBlockRef root; ///< ref to current root
+ ExtentIndex extents; ///< set of live extents
+
+ journal_seq_t last_commit = JOURNAL_SEQ_MIN;
+
+ // FIXME: This is specific to the segmented implementation
+ std::vector<SegmentProvider*> segment_providers_by_device_id;
+
+ transaction_id_t next_id = 0;
+
+ /**
+ * dirty
+ *
+ * holds refs to dirty extents. Ordered by CachedExtent::get_dirty_from().
+ */
+ CachedExtent::list dirty;
+
+ using backref_extent_entry_query_set_t =
+ std::set<
+ backref_extent_entry_t,
+ backref_extent_entry_t::cmp_t>;
+ backref_extent_entry_query_set_t backref_extents;
+
+ void add_backref_extent(
+ paddr_t paddr,
+ paddr_t key,
+ extent_types_t type) {
+ assert(!paddr.is_relative());
+ auto [iter, inserted] = backref_extents.emplace(paddr, key, type);
+ boost::ignore_unused(inserted);
+ assert(inserted);
+ }
+
+ void remove_backref_extent(paddr_t paddr) {
+ auto iter = backref_extents.find(paddr);
+ if (iter != backref_extents.end())
+ backref_extents.erase(iter);
+ }
+
+ backref_extent_entry_query_set_t get_backref_extents_in_range(
+ paddr_t start,
+ paddr_t end) {
+ auto start_iter = backref_extents.lower_bound(start);
+ auto end_iter = backref_extents.upper_bound(end);
+ backref_extent_entry_query_set_t res;
+ res.insert(start_iter, end_iter);
+ return res;
+ }
+
+ friend class crimson::os::seastore::backref::BtreeBackrefManager;
+ friend class crimson::os::seastore::BackrefManager;
+ /**
+ * lru
+ *
+ * holds references to recently used extents
+ */
+ class LRU {
+ // max size (bytes)
+ const size_t capacity = 0;
+
+ // current size (bytes)
+ size_t contents = 0;
+
+ CachedExtent::list lru;
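+ // add_to_lru()/move_to_top() push to the back and trim_to_capacity()
+ // evicts from the front, so the front is the least recently used extent.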
+
+ void trim_to_capacity() {
+ while (contents > capacity) {
+ assert(lru.size() > 0);
+ remove_from_lru(lru.front());
+ }
+ }
+
+ void add_to_lru(CachedExtent &extent) {
+ assert(extent.is_stable_clean() && !extent.is_placeholder());
+
+ if (!extent.primary_ref_list_hook.is_linked()) {
+ contents += extent.get_length();
+ intrusive_ptr_add_ref(&extent);
+ lru.push_back(extent);
+ }
+ trim_to_capacity();
+ }
+
+ public:
+ LRU(size_t capacity) : capacity(capacity) {}
+
+ size_t get_capacity() const {
+ return capacity;
+ }
+
+ size_t get_current_contents_bytes() const {
+ return contents;
+ }
+
+ size_t get_current_contents_extents() const {
+ return lru.size();
+ }
+
+ void remove_from_lru(CachedExtent &extent) {
+ assert(extent.is_stable_clean() && !extent.is_placeholder());
+
+ if (extent.primary_ref_list_hook.is_linked()) {
+ lru.erase(lru.s_iterator_to(extent));
+ assert(contents >= extent.get_length());
+ contents -= extent.get_length();
+ intrusive_ptr_release(&extent);
+ }
+ }
+
+ void move_to_top(CachedExtent &extent) {
+ assert(extent.is_stable_clean() && !extent.is_placeholder());
+
+ if (extent.primary_ref_list_hook.is_linked()) {
+ lru.erase(lru.s_iterator_to(extent));
+ intrusive_ptr_release(&extent);
+ assert(contents >= extent.get_length());
+ contents -= extent.get_length();
+ }
+ add_to_lru(extent);
+ }
+
+ void clear() {
+ LOG_PREFIX(Cache::LRU::clear);
+ for (auto iter = lru.begin(); iter != lru.end();) {
+ SUBDEBUG(seastore_cache, "clearing {}", *iter);
+ remove_from_lru(*(iter++));
+ }
+ }
+
+ ~LRU() {
+ clear();
+ }
+ } lru;
+
+ struct query_counters_t {
+ uint64_t access = 0;
+ uint64_t hit = 0;
+ };
+
+ template <typename CounterT>
+ using counter_by_extent_t = std::array<CounterT, EXTENT_TYPES_MAX>;
+
+ struct invalid_trans_efforts_t {
+ io_stat_t read;
+ io_stat_t mutate;
+ uint64_t mutate_delta_bytes = 0;
+ io_stat_t retire;
+ io_stat_t fresh;
+ io_stat_t fresh_ool_written;
+ counter_by_extent_t<uint64_t> num_trans_invalidated;
+ uint64_t total_trans_invalidated = 0;
+ uint64_t num_ool_records = 0;
+ uint64_t ool_record_bytes = 0;
+ };
+
+ struct commit_trans_efforts_t {
+ counter_by_extent_t<io_stat_t> read_by_ext;
+ counter_by_extent_t<io_stat_t> mutate_by_ext;
+ counter_by_extent_t<uint64_t> delta_bytes_by_ext;
+ counter_by_extent_t<io_stat_t> retire_by_ext;
+ counter_by_extent_t<io_stat_t> fresh_invalid_by_ext; // inline but is already invalid (retired)
+ counter_by_extent_t<io_stat_t> fresh_inline_by_ext;
+ counter_by_extent_t<io_stat_t> fresh_ool_by_ext;
+ uint64_t num_trans = 0; // the number of inline records
+ uint64_t num_ool_records = 0;
+ uint64_t ool_record_metadata_bytes = 0;
+ uint64_t ool_record_data_bytes = 0;
+ uint64_t inline_record_metadata_bytes = 0; // metadata exclude the delta bytes
+ };
+
+ struct success_read_trans_efforts_t {
+ io_stat_t read;
+ uint64_t num_trans = 0;
+ };
+
+ struct tree_efforts_t {
+ uint64_t num_inserts = 0;
+ uint64_t num_erases = 0;
+ uint64_t num_updates = 0;
+
+ void increment(const Transaction::tree_stats_t& incremental) {
+ num_inserts += incremental.num_inserts;
+ num_erases += incremental.num_erases;
+ num_updates += incremental.num_updates;
+ }
+ };
+
+ template <typename CounterT>
+ using counter_by_src_t = std::array<CounterT, TRANSACTION_TYPE_MAX>;
+
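+ // Number of unordered (src1 <= src2) source pairs, i.e. n*(n+1)/2 with
+ // n = TRANSACTION_TYPE_MAX; used to size trans_conflicts_by_srcs below.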
+ static constexpr std::size_t NUM_SRC_COMB =
+ TRANSACTION_TYPE_MAX * (TRANSACTION_TYPE_MAX + 1) / 2;
+
+ struct {
+ counter_by_src_t<uint64_t> trans_created_by_src;
+ counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src;
+ counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src;
+ counter_by_src_t<query_counters_t> cache_query_by_src;
+ success_read_trans_efforts_t success_read_efforts;
+ uint64_t dirty_bytes = 0;
+
+ uint64_t onode_tree_depth = 0;
+ int64_t onode_tree_extents_num = 0;
+ counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts;
+ counter_by_src_t<tree_efforts_t> invalidated_onode_tree_efforts;
+
+ uint64_t omap_tree_depth = 0;
+ int64_t omap_tree_extents_num = 0;
+ counter_by_src_t<tree_efforts_t> committed_omap_tree_efforts;
+ counter_by_src_t<tree_efforts_t> invalidated_omap_tree_efforts;
+
+ uint64_t lba_tree_depth = 0;
+ int64_t lba_tree_extents_num = 0;
+ counter_by_src_t<tree_efforts_t> committed_lba_tree_efforts;
+ counter_by_src_t<tree_efforts_t> invalidated_lba_tree_efforts;
+
+ uint64_t backref_tree_depth = 0;
+ int64_t backref_tree_extents_num = 0;
+ counter_by_src_t<tree_efforts_t> committed_backref_tree_efforts;
+ counter_by_src_t<tree_efforts_t> invalidated_backref_tree_efforts;
+
+ std::array<uint64_t, NUM_SRC_COMB> trans_conflicts_by_srcs;
+ counter_by_src_t<uint64_t> trans_conflicts_by_unknown;
+
+ version_stat_t committed_dirty_version;
+ version_stat_t committed_reclaim_version;
+ } stats;
+
+ template <typename CounterT>
+ CounterT& get_by_src(
+ counter_by_src_t<CounterT>& counters_by_src,
+ Transaction::src_t src) {
+ assert(static_cast<std::size_t>(src) < counters_by_src.size());
+ return counters_by_src[static_cast<std::size_t>(src)];
+ }
+
+ template <typename CounterT>
+ CounterT& get_by_ext(
+ counter_by_extent_t<CounterT>& counters_by_ext,
+ extent_types_t ext) {
+ auto index = static_cast<uint8_t>(ext);
+ assert(index < EXTENT_TYPES_MAX);
+ return counters_by_ext[index];
+ }
+
+ void account_conflict(Transaction::src_t src1, Transaction::src_t src2) {
+ assert(src1 < Transaction::src_t::MAX);
+ assert(src2 < Transaction::src_t::MAX);
+ if (src1 > src2) {
+ std::swap(src1, src2);
+ }
+ // impossible combinations
+ // should be consistent with trans_srcs_invalidated in register_metrics()
+ assert(!(src1 == Transaction::src_t::READ &&
+ src2 == Transaction::src_t::READ));
+ assert(!(src1 == Transaction::src_t::TRIM_DIRTY &&
+ src2 == Transaction::src_t::TRIM_DIRTY));
+ assert(!(src1 == Transaction::src_t::CLEANER_MAIN &&
+ src2 == Transaction::src_t::CLEANER_MAIN));
+ assert(!(src1 == Transaction::src_t::CLEANER_COLD &&
+ src2 == Transaction::src_t::CLEANER_COLD));
+ assert(!(src1 == Transaction::src_t::TRIM_ALLOC &&
+ src2 == Transaction::src_t::TRIM_ALLOC));
+
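+ // Flatten the ordered pair (src1 <= src2) onto the upper triangle of an
+ // n x n matrix (n = Transaction::src_t::MAX): the src1 preceding rows hold
+ // n + (n-1) + ... + (n-src1+1) = n*src1 - src1*(src1-1)/2 pairs, and
+ // (src2 - src1) indexes within the row, which simplifies to the formula
+ // below. For example, with n=5, (src1=1, src2=3) maps to 5*1 + 3 - 1 = 7.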
+ auto src1_value = static_cast<std::size_t>(src1);
+ auto src2_value = static_cast<std::size_t>(src2);
+ auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX);
+ auto conflict_index = num_srcs * src1_value + src2_value -
+ src1_value * (src1_value + 1) / 2;
+ assert(conflict_index < NUM_SRC_COMB);
+ ++stats.trans_conflicts_by_srcs[conflict_index];
+ }
+
+ seastar::metrics::metric_group metrics;
+ void register_metrics();
+
+ /// alloc buffer for cached extent
+ bufferptr alloc_cache_buf(size_t size) {
+ // TODO: memory pooling etc
+ auto bp = ceph::bufferptr(
+ buffer::create_page_aligned(size));
+ bp.zero();
+ return bp;
+ }
+
+ void backref_batch_update(
+ std::vector<backref_entry_ref> &&,
+ const journal_seq_t &);
+
+ /// Add extent to extents handling dirty and refcounting
+ void add_extent(CachedExtentRef ref, const Transaction::src_t* t_src);
+
+ /// Mark existing extent ref dirty -- mainly for replay
+ void mark_dirty(CachedExtentRef ref);
+
+ /// Add dirty extent to dirty list
+ void add_to_dirty(CachedExtentRef ref);
+
+ /// Remove from dirty list
+ void remove_from_dirty(CachedExtentRef ref);
+
+ /// Remove extent from extents handling dirty and refcounting
+ void remove_extent(CachedExtentRef ref);
+
+ /// Retire extent
+ void commit_retire_extent(Transaction& t, CachedExtentRef ref);
+
+ /// Replace prev with next
+ void commit_replace_extent(Transaction& t, CachedExtentRef next, CachedExtentRef prev);
+
+ /// Invalidate extent and mark affected transactions
+ void invalidate_extent(Transaction& t, CachedExtent& extent);
+
+ /// Mark a valid transaction as conflicted
+ void mark_transaction_conflicted(
+ Transaction& t, CachedExtent& conflicting_extent);
+
+ /// Introspect transaction when it is being destructed
+ void on_transaction_destruct(Transaction& t);
+
+ template <typename T>
+ get_extent_ret<T> read_extent(
+ TCachedExtentRef<T>&& extent
+ ) {
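+ // The backing buffer is already allocated; epm.read() fills it in place.
+ // A CLEAN_PENDING extent becomes CLEAN once the read completes, while
+ // EXIST_CLEAN/CLEAN extents only refresh last_committed_crc; an extent
+ // invalidated while the read was in flight is left untouched.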
+ assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING ||
+ extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
+ extent->state == CachedExtent::extent_state_t::CLEAN);
+ extent->set_io_wait();
+ return epm.read(
+ extent->get_paddr(),
+ extent->get_length(),
+ extent->get_bptr()
+ ).safe_then(
+ [extent=std::move(extent)]() mutable {
+ LOG_PREFIX(Cache::read_extent);
+ if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
+ extent->state = CachedExtent::extent_state_t::CLEAN;
+ /* TODO: crc should be checked against LBA manager */
+ extent->last_committed_crc = extent->get_crc32c();
+
+ extent->on_clean_read();
+ } else if (extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
+ extent->state == CachedExtent::extent_state_t::CLEAN) {
+ /* TODO: crc should be checked against LBA manager */
+ extent->last_committed_crc = extent->get_crc32c();
+ } else {
+ ceph_assert(!extent->is_valid());
+ }
+ extent->complete_io();
+ SUBDEBUG(seastore_cache, "read extent done -- {}", *extent);
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(extent));
+ },
+ get_extent_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Cache::get_extent: invalid error"
+ }
+ );
+ }
+
+ // Extents in cache may contain placeholders
+ CachedExtentRef query_cache(
+ paddr_t offset,
+ const src_ext_t* p_metric_key) {
+ query_counters_t* p_counters = nullptr;
+ if (p_metric_key) {
+ p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first);
+ ++p_counters->access;
+ }
+ if (auto iter = extents.find_offset(offset);
+ iter != extents.end()) {
+ if (p_metric_key &&
+ // retired_placeholder is not really cached yet
+ iter->get_type() != extent_types_t::RETIRED_PLACEHOLDER) {
+ ++p_counters->hit;
+ }
+ return CachedExtentRef(&*iter);
+ } else {
+ return CachedExtentRef();
+ }
+ }
+
+ template <
+ typename node_key_t,
+ typename node_val_t,
+ typename internal_node_t,
+ typename leaf_node_t,
+ typename pin_t,
+ size_t node_size,
+ bool leaf_has_children>
+ friend class FixedKVBtree;
+};
+using CacheRef = std::unique_ptr<Cache>;
+
+}
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
new file mode 100644
index 000000000..769b0446a
--- /dev/null
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/transaction.h"
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/btree/fixed_kv_node.h"
+
+namespace {
+ [[maybe_unused]] seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_seastore_tm);
+ }
+}
+
+namespace crimson::os::seastore {
+
+#ifdef DEBUG_CACHED_EXTENT_REF
+
+void intrusive_ptr_add_ref(CachedExtent *ptr)
+{
+ intrusive_ptr_add_ref(
+ static_cast<boost::intrusive_ref_counter<
+ CachedExtent,
+ boost::thread_unsafe_counter>*>(ptr));
+ logger().debug("intrusive_ptr_add_ref: {}", *ptr);
+}
+
+void intrusive_ptr_release(CachedExtent *ptr)
+{
+ logger().debug("intrusive_ptr_release: {}", *ptr);
+ intrusive_ptr_release(
+ static_cast<boost::intrusive_ref_counter<
+ CachedExtent,
+ boost::thread_unsafe_counter>*>(ptr));
+}
+
+#endif
+
+bool is_backref_mapped_extent_node(const CachedExtentRef &extent) {
+ return extent->is_logical()
+ || is_lba_node(extent->get_type())
+ || extent->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL;
+}
+
+std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state)
+{
+ switch (state) {
+ case CachedExtent::extent_state_t::INITIAL_WRITE_PENDING:
+ return out << "INITIAL_WRITE_PENDING";
+ case CachedExtent::extent_state_t::MUTATION_PENDING:
+ return out << "MUTATION_PENDING";
+ case CachedExtent::extent_state_t::CLEAN_PENDING:
+ return out << "CLEAN_PENDING";
+ case CachedExtent::extent_state_t::CLEAN:
+ return out << "CLEAN";
+ case CachedExtent::extent_state_t::DIRTY:
+ return out << "DIRTY";
+ case CachedExtent::extent_state_t::EXIST_CLEAN:
+ return out << "EXIST_CLEAN";
+ case CachedExtent::extent_state_t::EXIST_MUTATION_PENDING:
+ return out << "EXIST_MUTATION_PENDING";
+ case CachedExtent::extent_state_t::INVALID:
+ return out << "INVALID";
+ default:
+ return out << "UNKNOWN";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const CachedExtent &ext)
+{
+ return ext.print(out);
+}
+
+CachedExtent::~CachedExtent()
+{
+ if (parent_index) {
+ assert(is_linked());
+ parent_index->erase(*this);
+ }
+}
+CachedExtent* CachedExtent::get_transactional_view(Transaction &t) {
+ return get_transactional_view(t.get_trans_id());
+}
+
+CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) {
+ auto it = mutation_pendings.find(tid, trans_spec_view_t::cmp_t());
+ if (it != mutation_pendings.end()) {
+ return (CachedExtent*)&(*it);
+ } else {
+ return this;
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) {
+ return out << "parent_tracker=" << (void*)&tracker
+ << ", parent=" << (void*)tracker.get_parent().get();
+}
+
+std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const {
+ if (parent_tracker) {
+ out << *parent_tracker;
+ } else {
+ out << ", parent_tracker=" << (void*)nullptr;
+ }
+ _print_detail(out);
+ return out;
+}
+
+std::ostream &LogicalCachedExtent::_print_detail(std::ostream &out) const
+{
+ out << ", laddr=" << laddr;
+ return print_detail_l(out);
+}
+
+void child_pos_t::link_child(ChildableCachedExtent *c) {
+ get_parent<FixedKVNode<laddr_t>>()->link_child(c, pos);
+}
+
+void CachedExtent::set_invalid(Transaction &t) {
+ state = extent_state_t::INVALID;
+ if (trans_view_hook.is_linked()) {
+ trans_view_hook.unlink();
+ }
+ on_invalidated(t);
+}
+
+LogicalCachedExtent::~LogicalCachedExtent() {
+ if (has_parent_tracker() && is_valid() && !is_pending()) {
+ assert(get_parent_node());
+ auto parent = get_parent_node<FixedKVNode<laddr_t>>();
+ auto off = parent->lower_bound_offset(laddr);
+ assert(parent->get_key_from_idx(off) == laddr);
+ assert(parent->children[off] == this);
+ parent->children[off] = nullptr;
+ }
+}
+
+void LogicalCachedExtent::on_replace_prior(Transaction &t) {
+ assert(is_mutation_pending());
+ take_prior_parent_tracker();
+ assert(get_parent_node());
+ auto parent = get_parent_node<FixedKVNode<laddr_t>>();
+ //TODO: can this search be avoided?
+ auto off = parent->lower_bound_offset(laddr);
+ assert(parent->get_key_from_idx(off) == laddr);
+ parent->children[off] = this;
+}
+
+parent_tracker_t::~parent_tracker_t() {
+ // this is parent's tracker, reset it
+ auto &p = (FixedKVNode<laddr_t>&)*parent;
+ if (p.my_tracker == this) {
+ p.my_tracker = nullptr;
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
+{
+ return out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length()
+ << "->" << rhs.get_val();
+}
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+{
+ bool first = true;
+ out << '[';
+ for (const auto &i: rhs) {
+ out << (first ? "" : ",") << *i;
+ first = false;
+ }
+ return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
new file mode 100644
index 000000000..02f8ae46c
--- /dev/null
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -0,0 +1,1304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "seastar/core/shared_future.hh"
+
+#include "include/buffer.h"
+#include "crimson/common/errorator.h"
+#include "crimson/common/interruptible_future.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+struct btree_lba_manager_test;
+
+namespace crimson::os::seastore {
+
+class Transaction;
+class CachedExtent;
+using CachedExtentRef = boost::intrusive_ptr<CachedExtent>;
+class SegmentedAllocator;
+class TransactionManager;
+class ExtentPlacementManager;
+
+template <
+ typename node_key_t,
+ typename node_val_t,
+ typename internal_node_t,
+ typename leaf_node_t,
+ typename pin_t,
+ size_t node_size,
+ bool leaf_has_children>
+class FixedKVBtree;
+template <typename, typename>
+class BtreeNodeMapping;
+
+// #define DEBUG_CACHED_EXTENT_REF
+#ifdef DEBUG_CACHED_EXTENT_REF
+
+void intrusive_ptr_add_ref(CachedExtent *);
+void intrusive_ptr_release(CachedExtent *);
+
+#endif
+
+template <typename T>
+using TCachedExtentRef = boost::intrusive_ptr<T>;
+
+/**
+ * CachedExtent
+ */
+namespace onode {
+ class DummyNodeExtent;
+ class TestReplayExtent;
+}
+
+template <typename T>
+class read_set_item_t {
+ using set_hook_t = boost::intrusive::set_member_hook<
+ boost::intrusive::link_mode<
+ boost::intrusive::auto_unlink>>;
+ set_hook_t trans_hook;
+ using set_hook_options = boost::intrusive::member_hook<
+ read_set_item_t,
+ set_hook_t,
+ &read_set_item_t::trans_hook>;
+
+public:
+ struct cmp_t {
+ using is_transparent = paddr_t;
+ bool operator()(const read_set_item_t<T> &lhs, const read_set_item_t &rhs) const;
+ bool operator()(const paddr_t &lhs, const read_set_item_t<T> &rhs) const;
+ bool operator()(const read_set_item_t<T> &lhs, const paddr_t &rhs) const;
+ };
+
+ struct trans_cmp_t {
+ bool operator()(
+ const read_set_item_t<Transaction> &lhs,
+ const read_set_item_t<Transaction> &rhs) const {
+ return lhs.t < rhs.t;
+ }
+ bool operator()(
+ const Transaction *lhs,
+ const read_set_item_t<Transaction> &rhs) const {
+ return lhs < rhs.t;
+ }
+ bool operator()(
+ const read_set_item_t<Transaction> &lhs,
+ const Transaction *rhs) const {
+ return lhs.t < rhs;
+ }
+ };
+
+ using trans_set_t = boost::intrusive::set<
+ read_set_item_t,
+ set_hook_options,
+ boost::intrusive::constant_time_size<false>,
+ boost::intrusive::compare<trans_cmp_t>>;
+
+ T *t = nullptr;
+ CachedExtentRef ref;
+
+ read_set_item_t(T *t, CachedExtentRef ref);
+ read_set_item_t(const read_set_item_t &) = delete;
+ read_set_item_t(read_set_item_t &&) = default;
+ ~read_set_item_t() = default;
+};
+template <typename T>
+using read_set_t = std::set<
+ read_set_item_t<T>,
+ typename read_set_item_t<T>::cmp_t>;
+
+struct trans_spec_view_t {
+ // if the extent is pending, contains the id of the owning transaction;
+ // TRANS_ID_NULL otherwise
+ transaction_id_t pending_for_transaction = TRANS_ID_NULL;
+
+ struct cmp_t {
+ bool operator()(
+ const trans_spec_view_t &lhs,
+ const trans_spec_view_t &rhs) const
+ {
+ return lhs.pending_for_transaction < rhs.pending_for_transaction;
+ }
+ bool operator()(
+ const transaction_id_t &lhs,
+ const trans_spec_view_t &rhs) const
+ {
+ return lhs < rhs.pending_for_transaction;
+ }
+ bool operator()(
+ const trans_spec_view_t &lhs,
+ const transaction_id_t &rhs) const
+ {
+ return lhs.pending_for_transaction < rhs;
+ }
+ };
+
+ using trans_view_hook_t =
+ boost::intrusive::set_member_hook<
+ boost::intrusive::link_mode<
+ boost::intrusive::auto_unlink>>;
+ trans_view_hook_t trans_view_hook;
+
+ using trans_view_member_options =
+ boost::intrusive::member_hook<
+ trans_spec_view_t,
+ trans_view_hook_t,
+ &trans_spec_view_t::trans_view_hook>;
+ using trans_view_set_t = boost::intrusive::set<
+ trans_spec_view_t,
+ trans_view_member_options,
+ boost::intrusive::constant_time_size<false>,
+ boost::intrusive::compare<cmp_t>>;
+};
+
+class ExtentIndex;
+class CachedExtent
+ : public boost::intrusive_ref_counter<
+ CachedExtent, boost::thread_unsafe_counter>,
+ public trans_spec_view_t {
+ enum class extent_state_t : uint8_t {
+ INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list
+ MUTATION_PENDING, // In Transaction::write_set and mutated_block_list
+ CLEAN_PENDING, // CLEAN, but not yet read out
+ CLEAN, // In Cache::extent_index, Transaction::read_set
+ // during write, contents match disk, version == 0
+ DIRTY, // Same as CLEAN, but contents do not match disk,
+ // version > 0
+ EXIST_CLEAN, // Similar to CLEAN, but its metadata has not yet
+ // been persisted to disk.
+ // In Transaction::write_set and existing_block_list.
+ // After the transaction commits, the state becomes CLEAN
+ // and the extent is added to Cache. Modifying such
+ // extents turns their state to EXIST_MUTATION_PENDING.
+ EXIST_MUTATION_PENDING,// Similar to MUTATION_PENDING, but its prior_instance
+ // is empty.
+ // In Transaction::write_set, existing_block_list and
+ // mutated_block_list. State becomes DIRTY and it is
+ // added to Cache after transaction commits.
+ INVALID // Part of no ExtentIndex set
+ } state = extent_state_t::INVALID;
+ friend std::ostream &operator<<(std::ostream &, extent_state_t);
+ // allow a dummy extent to pretend it is in a specific state
+ friend class onode::DummyNodeExtent;
+ friend class onode::TestReplayExtent;
+
+ template <
+ typename node_key_t,
+ typename node_val_t,
+ typename internal_node_t,
+ typename leaf_node_t,
+ typename pin_t,
+ size_t node_size,
+ bool leaf_has_children>
+ friend class FixedKVBtree;
+ uint32_t last_committed_crc = 0;
+
+ // Points at current version while in state MUTATION_PENDING
+ CachedExtentRef prior_instance;
+
+ // time of the last modification
+ sea_time_point modify_time = NULL_TIME;
+
+public:
+ void init(extent_state_t _state,
+ paddr_t paddr,
+ placement_hint_t hint,
+ rewrite_gen_t gen,
+ transaction_id_t trans_id) {
+ assert(gen == NULL_GENERATION || is_rewrite_generation(gen));
+ state = _state;
+ set_paddr(paddr);
+ user_hint = hint;
+ rewrite_generation = gen;
+ pending_for_transaction = trans_id;
+ }
+
+ void set_modify_time(sea_time_point t) {
+ modify_time = t;
+ }
+
+ sea_time_point get_modify_time() const {
+ return modify_time;
+ }
+
+ /**
+ * duplicate_for_write
+ *
+ * Implementation should return a fresh CachedExtentRef
+ * which represents a copy of *this until on_delta_write()
+ * is complete, at which point the user may assume *this
+ * will be in state INVALID. As such, the implementation
+ * may involve a copy of get_bptr(), or an ancillary
+ * structure which defers updating the actual buffer until
+ * on_delta_write().
+ */
+ virtual CachedExtentRef duplicate_for_write(Transaction &t) = 0;
+
+ /**
+ * prepare_write
+ *
+ * Called prior to reading the buffer.
+ * Implementation may use this callback to fully write out
+ * updates to the buffer.
+ */
+ virtual void prepare_write() {}
+
+ /**
+ * prepare_commit
+ *
+ * Called prior to committing the transaction in which this extent
+ * is living.
+ */
+ virtual void prepare_commit() {}
+
+ /**
+ * on_initial_write
+ *
+ * Called after commit of extent. State will be CLEAN.
+ * Implementation may use this call to fix up the buffer
+ * with the newly available absolute get_paddr().
+ */
+ virtual void on_initial_write() {}
+
+ /**
+ * on_clean_read
+ *
+ * Called after read of initially written extent.
+   * State will be CLEAN. Implementation may use this
+   * call to fix up the buffer with the newly available
+ * absolute get_paddr().
+ */
+ virtual void on_clean_read() {}
+
+ /**
+ * on_delta_write
+ *
+ * Called after commit of delta. State will be DIRTY.
+   * Implementation may use this call to fix up any relative
+   * references in the buffer with the passed
+ * record_block_offset record location.
+ */
+ virtual void on_delta_write(paddr_t record_block_offset) {}
+
+ /**
+ * on_replace_prior
+ *
+ * Called after the extent has replaced a previous one. State
+ * of the extent must be MUTATION_PENDING. Implementation
+ * may use this call to synchronize states that must be synchronized
+   * with the states of Cache and can't wait until the transaction
+   * completes.
+ */
+ virtual void on_replace_prior(Transaction &t) {}
+
+ /**
+ * on_invalidated
+ *
+ * Called after the extent is invalidated, either by Cache::invalidate_extent
+ * or Transaction::add_to_retired_set. Implementation may use this
+ * call to adjust states that must be changed immediately once
+ * invalidated.
+ */
+ virtual void on_invalidated(Transaction &t) {}
+ /**
+ * get_type
+ *
+ * Returns concrete type.
+ */
+ virtual extent_types_t get_type() const = 0;
+
+ virtual bool is_logical() const {
+ return false;
+ }
+
+ virtual bool may_conflict() const {
+ return true;
+ }
+
+ friend std::ostream &operator<<(std::ostream &, extent_state_t);
+ virtual std::ostream &print_detail(std::ostream &out) const { return out; }
+ std::ostream &print(std::ostream &out) const {
+ std::string prior_poffset_str = prior_poffset
+ ? fmt::format("{}", *prior_poffset)
+ : "nullopt";
+ out << "CachedExtent(addr=" << this
+ << ", type=" << get_type()
+ << ", version=" << version
+ << ", dirty_from_or_retired_at=" << dirty_from_or_retired_at
+ << ", modify_time=" << sea_time_point_printer_t{modify_time}
+ << ", paddr=" << get_paddr()
+ << ", prior_paddr=" << prior_poffset_str
+ << ", length=" << get_length()
+ << ", state=" << state
+ << ", last_committed_crc=" << last_committed_crc
+ << ", refcount=" << use_count()
+ << ", user_hint=" << user_hint
+ << ", fully_loaded=" << is_fully_loaded()
+ << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation};
+ if (state != extent_state_t::INVALID &&
+ state != extent_state_t::CLEAN_PENDING) {
+ print_detail(out);
+ }
+ return out << ")";
+ }
+
+ /**
+ * get_delta
+ *
+ * Must return a valid delta usable in apply_delta() in submit_transaction
+ * if state == MUTATION_PENDING.
+ */
+ virtual ceph::bufferlist get_delta() = 0;
+
+ /**
+ * apply_delta
+ *
+ * bl is a delta obtained previously from get_delta. The versions will
+ * match. Implementation should mutate buffer based on bl. base matches
+   * the address passed to on_delta_write.
+ *
+ * Implementation *must* use set_last_committed_crc to update the crc to
+ * what the crc of the buffer would have been at submission. For physical
+ * extents that use base to adjust internal record-relative deltas, this
+ * means that the crc should be of the buffer after applying the delta,
+ * but before that adjustment. We do it this way because the crc in the
+ * commit path does not yet know the record base address.
+ *
+ * LogicalCachedExtent overrides this method and provides a simpler
+ * apply_delta override for LogicalCachedExtent implementers.
+ */
+ virtual void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) = 0;
+
+ /**
+ * Called on dirty CachedExtent implementation after replay.
+ * Implementation should perform any reads/in-memory-setup
+ * necessary. (for instance, the lba implementation will use this
+ * to load in lba_manager blocks)
+ */
+ using complete_load_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual complete_load_ertr::future<> complete_load() {
+ return complete_load_ertr::now();
+ }
+
+ /**
+ * cast
+ *
+ * Returns a TCachedExtentRef of the specified type.
+ * TODO: add dynamic check that the requested type is actually correct.
+ */
+ template <typename T>
+ TCachedExtentRef<T> cast() {
+ return TCachedExtentRef<T>(static_cast<T*>(this));
+ }
+ template <typename T>
+ TCachedExtentRef<const T> cast() const {
+ return TCachedExtentRef<const T>(static_cast<const T*>(this));
+ }
+
+ /// Returns true if extent can be mutated in an open transaction
+ bool is_mutable() const {
+ return state == extent_state_t::INITIAL_WRITE_PENDING ||
+ state == extent_state_t::MUTATION_PENDING ||
+ state == extent_state_t::EXIST_MUTATION_PENDING;
+ }
+
+ /// Returns true if extent is part of an open transaction
+ bool is_pending() const {
+ return is_mutable() || state == extent_state_t::EXIST_CLEAN;
+ }
+
+ /// Returns true if extent is stable and shared among transactions
+ bool is_stable() const {
+ return state == extent_state_t::CLEAN_PENDING ||
+ state == extent_state_t::CLEAN ||
+ state == extent_state_t::DIRTY;
+ }
+
+ /// Returns true if extent has a pending delta
+ bool is_mutation_pending() const {
+ return state == extent_state_t::MUTATION_PENDING;
+ }
+
+ /// Returns true if extent is a fresh extent
+ bool is_initial_pending() const {
+ return state == extent_state_t::INITIAL_WRITE_PENDING;
+ }
+
+ /// Returns true if extent is clean (does not have deltas on disk)
+ bool is_clean() const {
+ ceph_assert(is_valid());
+ return state == extent_state_t::INITIAL_WRITE_PENDING ||
+ state == extent_state_t::CLEAN ||
+ state == extent_state_t::CLEAN_PENDING ||
+ state == extent_state_t::EXIST_CLEAN;
+ }
+
+  /// Returns true if extent is stable and clean
+ bool is_stable_clean() const {
+ ceph_assert(is_valid());
+ return state == extent_state_t::CLEAN ||
+ state == extent_state_t::CLEAN_PENDING;
+ }
+
+  /// Returns true if data is persisted while metadata isn't
+ bool is_exist_clean() const {
+ return state == extent_state_t::EXIST_CLEAN;
+ }
+
+  /// Returns true if the extent with EXIST_CLEAN is modified
+ bool is_exist_mutation_pending() const {
+ return state == extent_state_t::EXIST_MUTATION_PENDING;
+ }
+
+ /// Returns true if extent is dirty (has deltas on disk)
+ bool is_dirty() const {
+ ceph_assert(is_valid());
+ return !is_clean();
+ }
+
+  /// Returns true if extent has not been superseded or retired
+ bool is_valid() const {
+ return state != extent_state_t::INVALID;
+ }
+
+ /// Returns true if extent or prior_instance has been invalidated
+ bool has_been_invalidated() const {
+ return !is_valid() || (is_mutation_pending() && !prior_instance->is_valid());
+ }
+
+  /// Returns true if extent is a placeholder
+ bool is_placeholder() const {
+ return get_type() == extent_types_t::RETIRED_PLACEHOLDER;
+ }
+
+ bool is_pending_io() const {
+ return !!io_wait_promise;
+ }
+
+ /// Return journal location of oldest relevant delta, only valid while DIRTY
+ auto get_dirty_from() const {
+ ceph_assert(is_dirty());
+ return dirty_from_or_retired_at;
+ }
+
+ /// Return journal location of oldest relevant delta, only valid while RETIRED
+ auto get_retired_at() const {
+ ceph_assert(!is_valid());
+ return dirty_from_or_retired_at;
+ }
+
+ /// Return true if extent is fully loaded or is about to be fully loaded (call
+ /// wait_io() in this case)
+ bool is_fully_loaded() const {
+ return ptr.has_value();
+ }
+
+ /**
+ * get_paddr
+ *
+ * Returns current address of extent. If is_initial_pending(), address will
+ * be relative, otherwise address will be absolute.
+ */
+ paddr_t get_paddr() const { return poffset; }
+
+  /// Returns length of extent data on disk
+ extent_len_t get_length() const {
+ return length;
+ }
+
+ extent_len_t get_loaded_length() const {
+ if (ptr.has_value()) {
+ return ptr->length();
+ } else {
+ return 0;
+ }
+ }
+
+ /// Returns version, get_version() == 0 iff is_clean()
+ extent_version_t get_version() const {
+ return version;
+ }
+
+ /// Returns crc32c of buffer
+ uint32_t get_crc32c() {
+ return ceph_crc32c(
+ 1,
+ reinterpret_cast<const unsigned char *>(get_bptr().c_str()),
+ get_length());
+ }
+
+ /// Get ref to raw buffer
+ bufferptr &get_bptr() {
+ assert(ptr.has_value());
+ return *ptr;
+ }
+ const bufferptr &get_bptr() const {
+ assert(ptr.has_value());
+ return *ptr;
+ }
+
+ /// Compare by paddr
+ friend bool operator< (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset < b.poffset;
+ }
+ friend bool operator> (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset > b.poffset;
+ }
+ friend bool operator== (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset == b.poffset;
+ }
+
+ virtual ~CachedExtent();
+
+ placement_hint_t get_user_hint() const {
+ return user_hint;
+ }
+
+ rewrite_gen_t get_rewrite_generation() const {
+ return rewrite_generation;
+ }
+
+ void invalidate_hints() {
+ user_hint = PLACEMENT_HINT_NULL;
+ rewrite_generation = NULL_GENERATION;
+ }
+
+ /// assign the target rewrite generation for the followup rewrite
+ void set_target_rewrite_generation(rewrite_gen_t gen) {
+ assert(is_target_rewrite_generation(gen));
+
+ user_hint = placement_hint_t::REWRITE;
+ rewrite_generation = gen;
+ }
+
+ bool is_inline() const {
+ return poffset.is_relative();
+ }
+
+ paddr_t get_prior_paddr_and_reset() {
+ assert(prior_poffset);
+ auto ret = *prior_poffset;
+ prior_poffset.reset();
+ return ret;
+ }
+
+ void set_invalid(Transaction &t);
+
+ // a rewrite extent has an invalid prior_instance,
+ // and a mutation_pending extent has a valid prior_instance
+ CachedExtentRef get_prior_instance() {
+ return prior_instance;
+ }
+
+private:
+ template <typename T>
+ friend class read_set_item_t;
+
+ friend struct paddr_cmp;
+ friend struct ref_paddr_cmp;
+ friend class ExtentIndex;
+
+ /// Pointer to containing index (or null)
+ ExtentIndex *parent_index = nullptr;
+
+ /// hook for intrusive extent_index
+ boost::intrusive::set_member_hook<> extent_index_hook;
+ using index_member_options = boost::intrusive::member_hook<
+ CachedExtent,
+ boost::intrusive::set_member_hook<>,
+ &CachedExtent::extent_index_hook>;
+ using index = boost::intrusive::set<CachedExtent, index_member_options>;
+ friend class ExtentIndex;
+ friend class Transaction;
+
+ bool is_linked() {
+ return extent_index_hook.is_linked();
+ }
+
+ /// set bufferptr
+ void set_bptr(ceph::bufferptr &&nptr) {
+ ptr = nptr;
+ }
+
+  /// Returns true if the extent is part of the open transaction
+ bool is_pending_in_trans(transaction_id_t id) const {
+ return is_pending() && pending_for_transaction == id;
+ }
+
+ /// hook for intrusive ref list (mainly dirty or lru list)
+ boost::intrusive::list_member_hook<> primary_ref_list_hook;
+ using primary_ref_list_member_options = boost::intrusive::member_hook<
+ CachedExtent,
+ boost::intrusive::list_member_hook<>,
+ &CachedExtent::primary_ref_list_hook>;
+ using list = boost::intrusive::list<
+ CachedExtent,
+ primary_ref_list_member_options>;
+
+ /**
+ * dirty_from_or_retired_at
+ *
+ * Encodes ordering token for primary_ref_list -- dirty_from when
+ * dirty or retired_at if retired.
+ */
+ journal_seq_t dirty_from_or_retired_at;
+
+ /// cache data contents, std::nullopt if no data in cache
+ std::optional<ceph::bufferptr> ptr;
+
+ /// disk data length
+ extent_len_t length;
+
+ /// number of deltas since initial write
+ extent_version_t version = 0;
+
+ /// address of original block -- record relative iff is_initial_pending()
+ paddr_t poffset;
+
+ /// relative address before ool write, used to update mapping
+ std::optional<paddr_t> prior_poffset = std::nullopt;
+
+ /// used to wait while in-progress commit completes
+ std::optional<seastar::shared_promise<>> io_wait_promise;
+ void set_io_wait() {
+ ceph_assert(!io_wait_promise);
+ io_wait_promise = seastar::shared_promise<>();
+ }
+ void complete_io() {
+ ceph_assert(io_wait_promise);
+ io_wait_promise->set_value();
+ io_wait_promise = std::nullopt;
+ }
+
+ seastar::future<> wait_io() {
+ if (!io_wait_promise) {
+ return seastar::now();
+ } else {
+ return io_wait_promise->get_shared_future();
+ }
+ }
+
+ CachedExtent* get_transactional_view(Transaction &t);
+ CachedExtent* get_transactional_view(transaction_id_t tid);
+
+ read_set_item_t<Transaction>::trans_set_t transactions;
+
+ placement_hint_t user_hint = PLACEMENT_HINT_NULL;
+
+ // the target rewrite generation for the followup rewrite
+ // or the rewrite generation for the fresh write
+ rewrite_gen_t rewrite_generation = NULL_GENERATION;
+
+protected:
+ trans_view_set_t mutation_pendings;
+
+ CachedExtent(CachedExtent &&other) = delete;
+ CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) {
+ length = ptr->length();
+ assert(length > 0);
+ }
+
+ /// construct new CachedExtent, will deep copy the buffer
+ CachedExtent(const CachedExtent &other)
+ : state(other.state),
+ dirty_from_or_retired_at(other.dirty_from_or_retired_at),
+ length(other.get_length()),
+ version(other.version),
+ poffset(other.poffset) {
+ assert((length % CEPH_PAGE_SIZE) == 0);
+ if (other.is_fully_loaded()) {
+ ptr.emplace(buffer::create_page_aligned(length));
+ other.ptr->copy_out(0, length, ptr->c_str());
+ } else {
+ // the extent must be fully loaded before CoW
+ assert(length == 0); // in case of root
+ }
+ }
+
+ struct share_buffer_t {};
+ /// construct new CachedExtent, will shallow copy the buffer
+ CachedExtent(const CachedExtent &other, share_buffer_t)
+ : state(other.state),
+ dirty_from_or_retired_at(other.dirty_from_or_retired_at),
+ ptr(other.ptr),
+ length(other.get_length()),
+ version(other.version),
+ poffset(other.poffset) {}
+
+ // 0 length is only possible for the RootBlock
+ struct zero_length_t {};
+ CachedExtent(zero_length_t) : ptr(ceph::bufferptr(0)), length(0) {};
+
+ struct retired_placeholder_t{};
+ CachedExtent(retired_placeholder_t, extent_len_t _length)
+ : state(extent_state_t::INVALID),
+ length(_length) {
+ assert(length > 0);
+ }
+
+ /// no buffer extent, for lazy read
+ CachedExtent(extent_len_t _length) : length(_length) {
+ assert(length > 0);
+ }
+
+ friend class Cache;
+ template <typename T, typename... Args>
+ static TCachedExtentRef<T> make_cached_extent_ref(
+ Args&&... args) {
+ return new T(std::forward<Args>(args)...);
+ }
+
+ template <typename T>
+ static TCachedExtentRef<T> make_placeholder_cached_extent_ref(
+ extent_len_t length) {
+ return new T(length);
+ }
+
+ void reset_prior_instance() {
+ prior_instance.reset();
+ }
+
+ /// Sets last_committed_crc
+ void set_last_committed_crc(uint32_t crc) {
+ last_committed_crc = crc;
+ }
+
+ void set_paddr(paddr_t offset, bool need_update_mapping = false) {
+ if (need_update_mapping) {
+ assert(!prior_poffset);
+ prior_poffset = poffset;
+ }
+ poffset = offset;
+ }
+
+ /**
+ * maybe_generate_relative
+ *
+   * There are three kinds of addresses one might want to
+   * store within an extent:
+   * - an addr for a block within the same transaction, relative to the
+   *   physical location of this extent, in the event that we will read
+   *   it in the initial read of the extent
+   * - an addr relative to the physical location of the next record, to a
+   *   block within that record containing a delta for this extent, in
+   *   the event that we'll read it from a delta and overlay it onto a
+   *   dirty representation of the extent.
+   * - an absolute addr to a block already written outside of the current
+   *   transaction.
+ *
+ * This helper checks addr and the current state to create the correct
+ * reference.
+ */
+ paddr_t maybe_generate_relative(paddr_t addr) {
+ if (is_initial_pending() && addr.is_record_relative()) {
+ return addr.block_relative_to(get_paddr());
+ } else {
+ ceph_assert(!addr.is_record_relative() || is_mutation_pending());
+ return addr;
+ }
+ }
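+  // Editorial sketch (not upstream code): an implementation that stores a
+  // sibling's address inside its own buffer would typically filter it
+  // through this helper before encoding, e.g.
+  //
+  //   void set_sibling(paddr_t sibling) {           // hypothetical member
+  //     layout.set_sibling_addr(maybe_generate_relative(sibling));
+  //   }
+  //
+  // "set_sibling" and "layout" are illustrative names; the point is that the
+  // helper converts a record-relative addr into one relative to this extent
+  // while it is still INITIAL_WRITE_PENDING, and passes other addrs through
+  // unchanged.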
+
+ friend class crimson::os::seastore::SegmentedAllocator;
+ friend class crimson::os::seastore::TransactionManager;
+ friend class crimson::os::seastore::ExtentPlacementManager;
+ template <typename, typename>
+ friend class BtreeNodeMapping;
+ friend class ::btree_lba_manager_test;
+};
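+// Editorial note: a concrete physical extent only has to provide the four
+// pure-virtual members of CachedExtent. The sketch below is illustrative
+// only ("ExampleBlock" is not an upstream type; TEST_BLOCK_PHYSICAL is an
+// assumed enumerator) and mirrors the shape used by real extents further
+// down, such as CollectionNode:
+//
+//   struct ExampleBlock final : CachedExtent {
+//     explicit ExampleBlock(ceph::bufferptr &&ptr)
+//       : CachedExtent(std::move(ptr)) {}
+//     ExampleBlock(const ExampleBlock &other) : CachedExtent(other) {}
+//
+//     CachedExtentRef duplicate_for_write(Transaction &) final {
+//       // copy-construct so the mutation happens on a private buffer
+//       return CachedExtentRef(new ExampleBlock(*this));
+//     }
+//     ceph::bufferlist get_delta() final {
+//       return ceph::bufferlist();     // encode pending mutations here
+//     }
+//     void apply_delta_and_adjust_crc(
+//       paddr_t, const ceph::bufferlist &) final {
+//       // replay the delta into the buffer, then:
+//       set_last_committed_crc(get_crc32c());
+//     }
+//     extent_types_t get_type() const final {
+//       return extent_types_t::TEST_BLOCK_PHYSICAL;
+//     }
+//   };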
+
+std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t);
+std::ostream &operator<<(std::ostream &, const CachedExtent&);
+
+bool is_backref_mapped_extent_node(const CachedExtentRef &extent);
+
+/// Compare extents by paddr
+struct paddr_cmp {
+ bool operator()(paddr_t lhs, const CachedExtent &rhs) const {
+ return lhs < rhs.poffset;
+ }
+ bool operator()(const CachedExtent &lhs, paddr_t rhs) const {
+ return lhs.poffset < rhs;
+ }
+};
+
+/// Compare extent refs by paddr
+struct ref_paddr_cmp {
+ using is_transparent = paddr_t;
+ bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const {
+ return lhs->poffset < rhs->poffset;
+ }
+ bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const {
+ return lhs < rhs->poffset;
+ }
+ bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const {
+ return lhs->poffset < rhs;
+ }
+};
+
+template <typename T, typename C>
+class addr_extent_list_base_t
+ : public std::list<std::pair<T, C>> {};
+
+using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>;
+
+template <typename T, typename C, typename Cmp>
+class addr_extent_set_base_t
+ : public std::set<C, Cmp> {};
+
+using pextent_set_t = addr_extent_set_base_t<
+ paddr_t,
+ CachedExtentRef,
+ ref_paddr_cmp
+ >;
+
+template <typename T>
+using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>;
+
+/**
+ * ExtentIndex
+ *
+ * Index of CachedExtents by poffset; does not hold a reference, so the
+ * user must ensure each extent is removed prior to deletion
+ */
+class ExtentIndex {
+ friend class Cache;
+ CachedExtent::index extent_index;
+public:
+ auto get_overlap(paddr_t addr, extent_len_t len) {
+ auto bottom = extent_index.upper_bound(addr, paddr_cmp());
+ if (bottom != extent_index.begin())
+ --bottom;
+ if (bottom != extent_index.end() &&
+ bottom->get_paddr().add_offset(bottom->get_length()) <= addr)
+ ++bottom;
+
+ auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp());
+ return std::make_pair(
+ bottom,
+ top
+ );
+ }
+
+ void clear() {
+ struct cached_extent_disposer {
+ void operator() (CachedExtent* extent) {
+ extent->parent_index = nullptr;
+ }
+ };
+ extent_index.clear_and_dispose(cached_extent_disposer());
+ bytes = 0;
+ }
+
+ void insert(CachedExtent &extent) {
+ // sanity check
+ ceph_assert(!extent.parent_index);
+ auto [a, b] = get_overlap(
+ extent.get_paddr(),
+ extent.get_length());
+ ceph_assert(a == b);
+
+ [[maybe_unused]] auto [iter, inserted] = extent_index.insert(extent);
+ assert(inserted);
+ extent.parent_index = this;
+
+ bytes += extent.get_length();
+ }
+
+ void erase(CachedExtent &extent) {
+ assert(extent.parent_index);
+ assert(extent.is_linked());
+ [[maybe_unused]] auto erased = extent_index.erase(
+ extent_index.s_iterator_to(extent));
+ extent.parent_index = nullptr;
+
+ assert(erased);
+ bytes -= extent.get_length();
+ }
+
+ void replace(CachedExtent &to, CachedExtent &from) {
+ assert(to.get_length() == from.get_length());
+ extent_index.replace_node(extent_index.s_iterator_to(from), to);
+ from.parent_index = nullptr;
+ to.parent_index = this;
+ }
+
+ bool empty() const {
+ return extent_index.empty();
+ }
+
+ auto find_offset(paddr_t offset) {
+ return extent_index.find(offset, paddr_cmp());
+ }
+
+ auto begin() {
+ return extent_index.begin();
+ }
+
+ auto end() {
+ return extent_index.end();
+ }
+
+ auto size() const {
+ return extent_index.size();
+ }
+
+ auto get_bytes() const {
+ return bytes;
+ }
+
+ ~ExtentIndex() {
+ assert(extent_index.empty());
+ assert(bytes == 0);
+ }
+
+private:
+ uint64_t bytes = 0;
+};
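+// Editorial sketch of the intended ExtentIndex usage (illustrative only):
+// the index holds no references, so the owner keeps the extent alive and
+// must unlink it before dropping the last ref:
+//
+//   ExtentIndex index;
+//   CachedExtentRef ext = ...;                  // kept alive elsewhere
+//   index.insert(*ext);                         // asserts no paddr overlap
+//   auto it = index.find_offset(ext->get_paddr());
+//   assert(it != index.end());
+//   index.erase(*ext);                          // before ext is destroyed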
+
+class ChildableCachedExtent;
+class LogicalCachedExtent;
+
+class child_pos_t {
+public:
+ child_pos_t(CachedExtentRef stable_parent, uint16_t pos)
+ : stable_parent(stable_parent), pos(pos) {}
+
+ template <typename parent_t>
+ TCachedExtentRef<parent_t> get_parent() {
+ ceph_assert(stable_parent);
+ return stable_parent->template cast<parent_t>();
+ }
+ uint16_t get_pos() {
+ return pos;
+ }
+ void link_child(ChildableCachedExtent *c);
+private:
+ CachedExtentRef stable_parent;
+ uint16_t pos = std::numeric_limits<uint16_t>::max();
+};
+
+using get_child_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+template <typename T>
+struct get_child_ret_t {
+ std::variant<child_pos_t, get_child_ertr::future<TCachedExtentRef<T>>> ret;
+ get_child_ret_t(child_pos_t pos)
+ : ret(std::move(pos)) {}
+ get_child_ret_t(get_child_ertr::future<TCachedExtentRef<T>> child)
+ : ret(std::move(child)) {}
+
+ bool has_child() const {
+ return ret.index() == 1;
+ }
+
+ child_pos_t &get_child_pos() {
+ ceph_assert(ret.index() == 0);
+ return std::get<0>(ret);
+ }
+
+ get_child_ertr::future<TCachedExtentRef<T>> &get_child_fut() {
+ ceph_assert(ret.index() == 1);
+ return std::get<1>(ret);
+ }
+};
+
+template <typename key_t, typename>
+class PhysicalNodeMapping;
+
+template <typename key_t, typename val_t>
+using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t>>;
+
+template <typename key_t, typename val_t>
+class PhysicalNodeMapping {
+public:
+ virtual extent_len_t get_length() const = 0;
+ virtual extent_types_t get_type() const = 0;
+ virtual val_t get_val() const = 0;
+ virtual key_t get_key() const = 0;
+ virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0;
+ virtual bool has_been_invalidated() const = 0;
+ virtual CachedExtentRef get_parent() const = 0;
+ virtual uint16_t get_pos() const = 0;
+ // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
+ virtual bool is_indirect() const { return false; }
+ virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; }
+ virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; }
+ virtual extent_len_t get_intermediate_length() const { return 0; }
+ // The start offset of the pin, must be 0 if the pin is not indirect
+ virtual extent_len_t get_intermediate_offset() const {
+ return std::numeric_limits<extent_len_t>::max();
+ }
+
+ virtual get_child_ret_t<LogicalCachedExtent>
+ get_logical_extent(Transaction &t) = 0;
+
+ void link_child(ChildableCachedExtent *c) {
+ ceph_assert(child_pos);
+ child_pos->link_child(c);
+ }
+
+ virtual ~PhysicalNodeMapping() {}
+protected:
+ std::optional<child_pos_t> child_pos = std::nullopt;
+};
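+// Editorial sketch (illustrative only) of how a caller consumes the
+// get_child_ret_t returned by get_logical_extent(): when the child is
+// already tracked, the mapping hands back a future for it; otherwise the
+// caller reads it through the parent position and links it back:
+//
+//   auto ret = mapping.get_logical_extent(t);
+//   if (ret.has_child()) {
+//     return std::move(ret.get_child_fut());
+//   }
+//   auto &pos = ret.get_child_pos();
+//   // read the extent via pos.get_parent<parent_t>() / pos.get_pos(),
+//   // then call pos.link_child(child.get()) once it is in cache.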
+
+using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>;
+using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>;
+
+std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs);
+
+using lba_pin_list_t = std::list<LBAMappingRef>;
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
+
+using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>;
+using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>;
+
+using backref_pin_list_t = std::list<BackrefMappingRef>;
+
+/**
+ * RetiredExtentPlaceholder
+ *
+ * Cache::retire_extent_addr(Transaction&, paddr_t, extent_len_t) can retire an
+ * extent not currently in cache. In that case, in order to detect transaction
+ * invalidation, we need to add a placeholder to the cache to create the
+ * mapping back to the transaction. Whenever a transaction tries to read the
+ * placeholder extent out, Cache is responsible for replacing the placeholder
+ * with the real one. In any case, no placeholder extents should escape the
+ * Cache interface boundary.
+ */
+class RetiredExtentPlaceholder : public CachedExtent {
+
+public:
+ RetiredExtentPlaceholder(extent_len_t length)
+ : CachedExtent(CachedExtent::retired_placeholder_t{}, length) {}
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ ceph_assert(0 == "Should never happen for a placeholder");
+ return CachedExtentRef();
+ }
+
+ ceph::bufferlist get_delta() final {
+ ceph_assert(0 == "Should never happen for a placeholder");
+ return ceph::bufferlist();
+ }
+
+ static constexpr extent_types_t TYPE = extent_types_t::RETIRED_PLACEHOLDER;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) final {
+ ceph_assert(0 == "Should never happen for a placeholder");
+ }
+
+ bool is_logical() const final {
+ return false;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final {
+ return out << ", RetiredExtentPlaceholder";
+ }
+
+ void on_delta_write(paddr_t record_block_offset) final {
+ ceph_assert(0 == "Should never happen for a placeholder");
+ }
+};
+
+class parent_tracker_t
+ : public boost::intrusive_ref_counter<
+ parent_tracker_t, boost::thread_unsafe_counter> {
+public:
+ parent_tracker_t(CachedExtentRef parent)
+ : parent(parent) {}
+ parent_tracker_t(CachedExtent* parent)
+ : parent(parent) {}
+ ~parent_tracker_t();
+ template <typename T = CachedExtent>
+ TCachedExtentRef<T> get_parent() const {
+ ceph_assert(parent);
+ if constexpr (std::is_same_v<T, CachedExtent>) {
+ return parent;
+ } else {
+ return parent->template cast<T>();
+ }
+ }
+ void reset_parent(CachedExtentRef p) {
+ parent = p;
+ }
+ bool is_valid() const {
+ return parent && parent->is_valid();
+ }
+private:
+ CachedExtentRef parent;
+};
+
+std::ostream &operator<<(std::ostream &, const parent_tracker_t &);
+
+using parent_tracker_ref = boost::intrusive_ptr<parent_tracker_t>;
+
+class ChildableCachedExtent : public CachedExtent {
+public:
+ template <typename... T>
+ ChildableCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {}
+ bool has_parent_tracker() const {
+ return (bool)parent_tracker;
+ }
+ void reset_parent_tracker(parent_tracker_t *p = nullptr) {
+ parent_tracker.reset(p);
+ }
+ bool is_parent_valid() const {
+ return parent_tracker && parent_tracker->is_valid();
+ }
+ template <typename T = CachedExtent>
+ TCachedExtentRef<T> get_parent_node() const {
+ assert(parent_tracker);
+ return parent_tracker->template get_parent<T>();
+ }
+ void take_prior_parent_tracker() {
+ auto &prior = (ChildableCachedExtent&)(*get_prior_instance());
+ parent_tracker = prior.parent_tracker;
+ }
+ std::ostream &print_detail(std::ostream &out) const final;
+private:
+ parent_tracker_ref parent_tracker;
+ virtual std::ostream &_print_detail(std::ostream &out) const {
+ return out;
+ }
+};
+/**
+ * LogicalCachedExtent
+ *
+ * CachedExtent with associated lba mapping.
+ *
+ * Users of TransactionManager should be using extents derived from
+ * LogicalCachedExtent.
+ */
+class LogicalCachedExtent : public ChildableCachedExtent {
+public:
+ template <typename... T>
+ LogicalCachedExtent(T&&... t)
+ : ChildableCachedExtent(std::forward<T>(t)...)
+ {}
+
+ bool has_laddr() const {
+ return laddr != L_ADDR_NULL;
+ }
+
+ laddr_t get_laddr() const {
+ assert(laddr != L_ADDR_NULL);
+ return laddr;
+ }
+
+ void set_laddr(laddr_t nladdr) {
+ laddr = nladdr;
+ }
+
+ void maybe_set_intermediate_laddr(LBAMapping &mapping) {
+ laddr = mapping.is_indirect()
+ ? mapping.get_intermediate_base()
+ : mapping.get_key();
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) final {
+ apply_delta(bl);
+ set_last_committed_crc(get_crc32c());
+ }
+
+ bool is_logical() const final {
+ return true;
+ }
+
+ std::ostream &_print_detail(std::ostream &out) const final;
+
+ void on_replace_prior(Transaction &t) final;
+
+ virtual ~LogicalCachedExtent();
+protected:
+
+ virtual void apply_delta(const ceph::bufferlist &bl) = 0;
+ virtual std::ostream &print_detail_l(std::ostream &out) const {
+ return out;
+ }
+
+ virtual void logical_on_delta_write() {}
+
+ void on_delta_write(paddr_t record_block_offset) final {
+ assert(is_exist_mutation_pending() ||
+ get_prior_instance());
+ logical_on_delta_write();
+ }
+
+private:
+ // the logical address of the extent, and if shared,
+ // it is the intermediate_base, see BtreeLBAMapping comments.
+ laddr_t laddr = L_ADDR_NULL;
+};
+
+using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>;
+struct ref_laddr_cmp {
+ using is_transparent = laddr_t;
+ bool operator()(const LogicalCachedExtentRef &lhs,
+ const LogicalCachedExtentRef &rhs) const {
+ return lhs->get_laddr() < rhs->get_laddr();
+ }
+ bool operator()(const laddr_t &lhs,
+ const LogicalCachedExtentRef &rhs) const {
+ return lhs < rhs->get_laddr();
+ }
+ bool operator()(const LogicalCachedExtentRef &lhs,
+ const laddr_t &rhs) const {
+ return lhs->get_laddr() < rhs;
+ }
+};
+
+template <typename T>
+read_set_item_t<T>::read_set_item_t(T *t, CachedExtentRef ref)
+ : t(t), ref(ref)
+{}
+
+template <typename T>
+inline bool read_set_item_t<T>::cmp_t::operator()(
+ const read_set_item_t<T> &lhs, const read_set_item_t<T> &rhs) const {
+ return lhs.ref->poffset < rhs.ref->poffset;
+}
+template <typename T>
+inline bool read_set_item_t<T>::cmp_t::operator()(
+ const paddr_t &lhs, const read_set_item_t<T> &rhs) const {
+ return lhs < rhs.ref->poffset;
+}
+template <typename T>
+inline bool read_set_item_t<T>::cmp_t::operator()(
+ const read_set_item_t<T> &lhs, const paddr_t &rhs) const {
+ return lhs.ref->poffset < rhs;
+}
+
+using lextent_set_t = addr_extent_set_base_t<
+ laddr_t,
+ LogicalCachedExtentRef,
+ ref_laddr_cmp
+ >;
+
+template <typename T>
+using lextent_list_t = addr_extent_list_base_t<
+ laddr_t, TCachedExtentRef<T>>;
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/collection_manager.cc b/src/crimson/os/seastore/collection_manager.cc
new file mode 100644
index 000000000..4f5b58d01
--- /dev/null
+++ b/src/crimson/os/seastore/collection_manager.cc
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+//
+#include "crimson/os/seastore/collection_manager.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/collection_manager/flat_collection_manager.h"
+
+namespace crimson::os::seastore::collection_manager {
+
+CollectionManagerRef create_coll_manager(TransactionManager &trans_manager) {
+ return CollectionManagerRef(new FlatCollectionManager(trans_manager));
+}
+
+}
diff --git a/src/crimson/os/seastore/collection_manager.h b/src/crimson/os/seastore/collection_manager.h
new file mode 100644
index 000000000..37913abb4
--- /dev/null
+++ b/src/crimson/os/seastore/collection_manager.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "osd/osd_types.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore {
+
+struct coll_info_t {
+ unsigned split_bits;
+
+ coll_info_t(unsigned bits)
+ : split_bits(bits) {}
+
+ bool operator==(const coll_info_t &rhs) const {
+ return split_bits == rhs.split_bits;
+ }
+};
+
+/// Interface for maintaining set of collections
+class CollectionManager {
+public:
+ using base_iertr = TransactionManager::read_extent_iertr;
+
+ /// Initialize collection manager instance for an empty store
+ using mkfs_iertr = TransactionManager::alloc_extent_iertr;
+ using mkfs_ret = mkfs_iertr::future<coll_root_t>;
+ virtual mkfs_ret mkfs(
+ Transaction &t) = 0;
+
+ /// Create collection
+ using create_iertr = base_iertr;
+ using create_ret = create_iertr::future<>;
+ virtual create_ret create(
+ coll_root_t &root,
+ Transaction &t,
+ coll_t cid,
+ coll_info_t info
+ ) = 0;
+
+ /// List collections with info
+ using list_iertr = base_iertr;
+ using list_ret_bare = std::vector<std::pair<coll_t, coll_info_t>>;
+ using list_ret = list_iertr::future<list_ret_bare>;
+ virtual list_ret list(
+ const coll_root_t &root,
+ Transaction &t) = 0;
+
+ /// Remove cid
+ using remove_iertr = base_iertr;
+ using remove_ret = remove_iertr::future<>;
+ virtual remove_ret remove(
+ const coll_root_t &coll_root,
+ Transaction &t,
+ coll_t cid) = 0;
+
+ /// Update info for cid
+ using update_iertr = base_iertr;
+ using update_ret = base_iertr::future<>;
+ virtual update_ret update(
+ const coll_root_t &coll_root,
+ Transaction &t,
+ coll_t cid,
+ coll_info_t info
+ ) = 0;
+
+ virtual ~CollectionManager() {}
+};
+using CollectionManagerRef = std::unique_ptr<CollectionManager>;
+
+namespace collection_manager {
+/* create a CollectionManager for collections */
+CollectionManagerRef create_coll_manager(
+ TransactionManager &trans_manager);
+
+}
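+
+// Editorial sketch (illustrative only, coroutine-style pseudocode) of the
+// intended call sequence; tm is a TransactionManager, t an open Transaction
+// and cid a coll_t:
+//
+//   CollectionManagerRef mgr = collection_manager::create_coll_manager(tm);
+//   coll_root_t root = co_await mgr->mkfs(t);      // empty store only
+//   co_await mgr->create(root, t, cid, coll_info_t(0));
+//   auto colls = co_await mgr->list(root, t);      // (coll_t, coll_info_t)s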
+
+}
diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.cc b/src/crimson/os/seastore/collection_manager/collection_flat_node.cc
new file mode 100644
index 000000000..ed17e2b12
--- /dev/null
+++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.cc
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_seastore);
+ }
+}
+
+namespace crimson::os::seastore::collection_manager {
+
+void delta_t::replay(coll_map_t &l) const
+{
+ switch (op) {
+ case op_t::INSERT: {
+ l.insert(coll, bits);
+ break;
+ }
+ case op_t::UPDATE: {
+ l.update(coll, bits);
+ break;
+ }
+ case op_t::REMOVE: {
+ l.erase(coll);
+ break;
+ }
+ case op_t::INVALID: {
+ assert(0 == "impossible");
+ break;
+ }
+ __builtin_unreachable();
+ }
+}
+
+
+std::ostream &CollectionNode::print_detail_l(std::ostream &out) const
+{
+ return out;
+}
+
+CollectionNode::list_ret
+CollectionNode::list()
+{
+ logger().debug("CollectionNode:{}, {}", __func__, *this);
+ CollectionManager::list_ret_bare list_result;
+ for (auto &[coll, bits] : decoded) {
+ list_result.emplace_back(coll, bits);
+ }
+ return list_ret(
+ interruptible::ready_future_marker{},
+ std::move(list_result));
+}
+
+CollectionNode::create_ret
+CollectionNode::create(coll_context_t cc, coll_t coll, unsigned bits)
+{
+ logger().debug("CollectionNode:{}", __func__);
+ if (!is_mutable()) {
+ auto mut = cc.tm.get_mutable_extent(cc.t, this)->cast<CollectionNode>();
+ return mut->create(cc, coll, bits);
+ }
+ logger().debug("CollectionNode::create {} {} {}", coll, bits, *this);
+ auto [iter, inserted] = decoded.insert(coll, bits);
+ assert(inserted);
+ if (encoded_sizeof((base_coll_map_t&)decoded) > get_bptr().length()) {
+ decoded.erase(iter);
+ return create_ret(
+ interruptible::ready_future_marker{},
+ create_result_t::OVERFLOW);
+ } else {
+ if (auto buffer = maybe_get_delta_buffer(); buffer) {
+ buffer->insert(coll, bits);
+ }
+ copy_to_node();
+ return create_ret(
+ interruptible::ready_future_marker{},
+ create_result_t::SUCCESS);
+ }
+}
+
+CollectionNode::update_ret
+CollectionNode::update(coll_context_t cc, coll_t coll, unsigned bits)
+{
+ logger().debug("trans.{} CollectionNode:{} {} {}",
+ cc.t.get_trans_id(), __func__, coll, bits);
+ if (!is_mutable()) {
+ auto mut = cc.tm.get_mutable_extent(cc.t, this)->cast<CollectionNode>();
+ return mut->update(cc, coll, bits);
+ }
+ if (auto buffer = maybe_get_delta_buffer(); buffer) {
+ buffer->update(coll, bits);
+ }
+ decoded.update(coll, bits);
+ copy_to_node();
+ return seastar::now();
+}
+
+CollectionNode::remove_ret
+CollectionNode::remove(coll_context_t cc, coll_t coll)
+{
+ logger().debug("trans.{} CollectionNode:{} {}",
+ cc.t.get_trans_id(),__func__, coll);
+ if (!is_mutable()) {
+ auto mut = cc.tm.get_mutable_extent(cc.t, this)->cast<CollectionNode>();
+ return mut->remove(cc, coll);
+ }
+ if (auto buffer = maybe_get_delta_buffer(); buffer) {
+ buffer->remove(coll);
+ }
+ decoded.remove(coll);
+ copy_to_node();
+ return seastar::now();
+}
+
+}
diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.h b/src/crimson/os/seastore/collection_manager/collection_flat_node.h
new file mode 100644
index 000000000..2690fb5fd
--- /dev/null
+++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.h
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/collection_manager.h"
+
+namespace crimson::os::seastore::collection_manager {
+struct coll_context_t {
+ TransactionManager &tm;
+ Transaction &t;
+};
+
+using base_coll_map_t = std::map<denc_coll_t, uint32_t>;
+struct coll_map_t : base_coll_map_t {
+ auto insert(coll_t coll, unsigned bits) {
+ return emplace(
+ std::make_pair(denc_coll_t{coll}, bits)
+ );
+ }
+
+ void update(coll_t coll, unsigned bits) {
+ (*this)[denc_coll_t{coll}] = bits;
+ }
+
+ void remove(coll_t coll) {
+ erase(denc_coll_t{coll});
+ }
+};
+
+struct delta_t {
+ enum class op_t : uint_fast8_t {
+ INSERT,
+ UPDATE,
+ REMOVE,
+ INVALID
+ } op = op_t::INVALID;
+
+ denc_coll_t coll;
+ uint32_t bits = 0;
+
+ DENC(delta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.coll, p);
+ denc(v.bits, p);
+ DENC_FINISH(p);
+ }
+
+ void replay(coll_map_t &l) const;
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::collection_manager::delta_t)
+
+namespace crimson::os::seastore::collection_manager {
+class delta_buffer_t {
+ std::vector<delta_t> buffer;
+public:
+ bool empty() const {
+ return buffer.empty();
+ }
+
+ void insert(coll_t coll, uint32_t bits) {
+ buffer.push_back(delta_t{delta_t::op_t::INSERT, denc_coll_t(coll), bits});
+ }
+ void update(coll_t coll, uint32_t bits) {
+ buffer.push_back(delta_t{delta_t::op_t::UPDATE, denc_coll_t(coll), bits});
+ }
+ void remove(coll_t coll) {
+ buffer.push_back(delta_t{delta_t::op_t::REMOVE, denc_coll_t(coll), 0});
+ }
+ void replay(coll_map_t &l) {
+ for (auto &i: buffer) {
+ i.replay(l);
+ }
+ }
+
+ void clear() { buffer.clear(); }
+
+ DENC(delta_buffer_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.buffer, p);
+ DENC_FINISH(p);
+ }
+};
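+
+// Editorial sketch (illustrative only): each mutation is recorded twice, once
+// applied to the node's decoded map and once appended to the delta buffer;
+// after a restart the encoded buffer replays the same mutations. "cid" and
+// "decoded_map" are hypothetical locals:
+//
+//   delta_buffer_t deltas;
+//   deltas.insert(cid, 3);            // queued, not yet applied anywhere
+//   bufferlist bl;
+//   encode(deltas, bl);               // what get_delta() hands to the journal
+//   delta_buffer_t replayed;
+//   auto p = bl.cbegin();
+//   decode(replayed, p);
+//   replayed.replay(decoded_map);     // coll_map_t catches up on replay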
+}
+WRITE_CLASS_DENC(crimson::os::seastore::collection_manager::delta_buffer_t)
+
+namespace crimson::os::seastore::collection_manager {
+
+struct CollectionNode
+ : LogicalCachedExtent {
+ using CollectionNodeRef = TCachedExtentRef<CollectionNode>;
+
+ explicit CollectionNode(ceph::bufferptr &&ptr)
+ : LogicalCachedExtent(std::move(ptr)) {}
+ explicit CollectionNode(const CollectionNode &other)
+ : LogicalCachedExtent(other),
+ decoded(other.decoded) {}
+
+ static constexpr extent_types_t type = extent_types_t::COLL_BLOCK;
+
+ coll_map_t decoded;
+ delta_buffer_t delta_buffer;
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new CollectionNode(*this));
+ }
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ using list_iertr = CollectionManager::list_iertr;
+ using list_ret = CollectionManager::list_ret;
+ list_ret list();
+
+
+ enum class create_result_t : uint8_t {
+ SUCCESS,
+ OVERFLOW
+ };
+ using create_iertr = CollectionManager::create_iertr;
+ using create_ret = create_iertr::future<create_result_t>;
+ create_ret create(coll_context_t cc, coll_t coll, unsigned bits);
+
+ using remove_iertr = CollectionManager::remove_iertr;
+ using remove_ret = CollectionManager::remove_ret;
+ remove_ret remove(coll_context_t cc, coll_t coll);
+
+ using update_iertr = CollectionManager::update_iertr;
+ using update_ret = CollectionManager::update_ret;
+ update_ret update(coll_context_t cc, coll_t coll, unsigned bits);
+
+ void on_clean_read() final {
+ bufferlist bl;
+ bl.append(get_bptr());
+ auto iter = bl.cbegin();
+ decode((base_coll_map_t&)decoded, iter);
+ }
+
+ void copy_to_node() {
+ bufferlist bl;
+ encode((base_coll_map_t&)decoded, bl);
+ auto iter = bl.begin();
+ auto size = encoded_sizeof((base_coll_map_t&)decoded);
+ assert(size <= get_bptr().length());
+ get_bptr().zero();
+ iter.copy(size, get_bptr().c_str());
+
+ }
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::bufferlist bl;
+ encode(delta_buffer, bl);
+ delta_buffer.clear();
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &bl) final {
+ assert(bl.length());
+ delta_buffer_t buffer;
+ auto bptr = bl.begin();
+ decode(buffer, bptr);
+ buffer.replay(decoded);
+ copy_to_node();
+ }
+
+ static constexpr extent_types_t TYPE = extent_types_t::COLL_BLOCK;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+};
+using CollectionNodeRef = CollectionNode::CollectionNodeRef;
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::collection_manager::CollectionNode> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
new file mode 100644
index 000000000..decb095f6
--- /dev/null
+++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "include/stringify.h"
+#include "crimson/os/seastore/collection_manager/flat_collection_manager.h"
+#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_seastore);
+ }
+}
+
+namespace crimson::os::seastore::collection_manager {
+
+constexpr static extent_len_t MIN_FLAT_BLOCK_SIZE = 4<<10;
+[[maybe_unused]] constexpr static extent_len_t MAX_FLAT_BLOCK_SIZE = 4<<20;
+
+FlatCollectionManager::FlatCollectionManager(
+ TransactionManager &tm)
+ : tm(tm) {}
+
+FlatCollectionManager::mkfs_ret
+FlatCollectionManager::mkfs(Transaction &t)
+{
+
+ logger().debug("FlatCollectionManager: {}", __func__);
+ return tm.alloc_extent<CollectionNode>(
+ t, L_ADDR_MIN, MIN_FLAT_BLOCK_SIZE
+ ).si_then([](auto&& root_extent) {
+ coll_root_t coll_root = coll_root_t(
+ root_extent->get_laddr(),
+ MIN_FLAT_BLOCK_SIZE
+ );
+ return mkfs_iertr::make_ready_future<coll_root_t>(coll_root);
+ });
+}
+
+FlatCollectionManager::get_root_ret
+FlatCollectionManager::get_coll_root(const coll_root_t &coll_root, Transaction &t)
+{
+ logger().debug("FlatCollectionManager: {}", __func__);
+ assert(coll_root.get_location() != L_ADDR_NULL);
+ auto cc = get_coll_context(t);
+ return cc.tm.read_extent<CollectionNode>(
+ cc.t,
+ coll_root.get_location(),
+ coll_root.get_size()
+ ).si_then([](auto&& e) {
+ return get_root_iertr::make_ready_future<CollectionNodeRef>(std::move(e));
+ });
+}
+
+FlatCollectionManager::create_ret
+FlatCollectionManager::create(coll_root_t &coll_root, Transaction &t,
+ coll_t cid, coll_info_t info)
+{
+ logger().debug("FlatCollectionManager: {}", __func__);
+ return get_coll_root(coll_root, t
+ ).si_then([=, this, &coll_root, &t] (auto &&extent) {
+ return extent->create(
+ get_coll_context(t), cid, info.split_bits
+ ).si_then([=, this, &coll_root, &t] (auto ret) {
+ switch (ret) {
+ case CollectionNode::create_result_t::OVERFLOW: {
+ logger().debug("FlatCollectionManager: {} overflow!", __func__);
+ auto new_size = coll_root.get_size() * 2; // double each time
+
+ // TODO return error probably, but such a nonsensically large number of
+ // collections would create a ton of other problems as well
+ assert(new_size < MAX_FLAT_BLOCK_SIZE);
+ return tm.alloc_extent<CollectionNode>(
+ t, L_ADDR_MIN, new_size
+ ).si_then([=, this, &coll_root, &t] (auto &&root_extent) {
+ coll_root.update(root_extent->get_laddr(), root_extent->get_length());
+
+ root_extent->decoded = extent->decoded;
+ return root_extent->create(
+ get_coll_context(t), cid, info.split_bits
+ ).si_then([=, this, &t](auto result) {
+ assert(result == CollectionNode::create_result_t::SUCCESS);
+ return tm.dec_ref(t, extent->get_laddr());
+ }).si_then([] (auto) {
+ return create_iertr::make_ready_future<>();
+ });
+ });
+ }
+ case CollectionNode::create_result_t::SUCCESS: {
+ return create_iertr::make_ready_future<>();
+ }
+ }
+ __builtin_unreachable();
+ });
+ });
+}
+
+FlatCollectionManager::list_ret
+FlatCollectionManager::list(const coll_root_t &coll_root, Transaction &t)
+{
+ logger().debug("FlatCollectionManager: {}", __func__);
+ return get_coll_root(coll_root, t)
+ .si_then([] (auto extent) {
+ return extent->list();
+ });
+}
+
+FlatCollectionManager::update_ret
+FlatCollectionManager::update(const coll_root_t &coll_root, Transaction &t,
+ coll_t cid, coll_info_t info)
+{
+ logger().debug("FlatCollectionManager: {}", __func__);
+ return get_coll_root(coll_root, t)
+ .si_then([this, &t, cid, info] (auto extent) {
+ return extent->update(get_coll_context(t), cid, info.split_bits);
+ });
+}
+
+FlatCollectionManager::remove_ret
+FlatCollectionManager::remove(const coll_root_t &coll_root, Transaction &t,
+ coll_t cid )
+{
+ logger().debug("FlatCollectionManager: {}", __func__);
+ return get_coll_root(coll_root, t).si_then([this, &t, cid] (auto extent) {
+ return extent->remove(get_coll_context(t), cid);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/collection_manager/flat_collection_manager.h b/src/crimson/os/seastore/collection_manager/flat_collection_manager.h
new file mode 100644
index 000000000..1321ec1d8
--- /dev/null
+++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/ceph_assert.h"
+
+#include "crimson/os/seastore/collection_manager.h"
+#include "crimson/os/seastore/collection_manager/collection_flat_node.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore::collection_manager {
+
+class FlatCollectionManager : public CollectionManager {
+ TransactionManager &tm;
+
+ coll_context_t get_coll_context(Transaction &t) {
+ return coll_context_t{tm, t};
+ }
+
+ using get_root_iertr = base_iertr;
+ using get_root_ret = get_root_iertr::future<CollectionNodeRef>;
+ get_root_ret get_coll_root(const coll_root_t &coll_root, Transaction &t);
+
+public:
+ explicit FlatCollectionManager(TransactionManager &tm);
+
+ mkfs_ret mkfs(Transaction &t) final;
+
+ create_ret create(coll_root_t &coll_root, Transaction &t, coll_t cid,
+ coll_info_t info) final;
+
+ list_ret list(const coll_root_t &coll_root, Transaction &t) final;
+
+ remove_ret remove(const coll_root_t &coll_root, Transaction &t, coll_t cid) final;
+
+ update_ret update(const coll_root_t &coll_root, Transaction &t, coll_t cid, coll_info_t info) final;
+};
+using FlatCollectionManagerRef = std::unique_ptr<FlatCollectionManager>;
+}
diff --git a/src/crimson/os/seastore/device.cc b/src/crimson/os/seastore/device.cc
new file mode 100644
index 000000000..c3bda82a7
--- /dev/null
+++ b/src/crimson/os/seastore/device.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "device.h"
+
+#include "segment_manager.h"
+#include "random_block_manager.h"
+#include "random_block_manager/rbm_device.h"
+
+namespace crimson::os::seastore {
+
+std::ostream& operator<<(std::ostream& out, const device_spec_t& ds)
+{
+ return out << "device_spec("
+ << "magic=" << ds.magic
+ << ", dtype=" << ds.dtype
+ << ", " << device_id_printer_t{ds.id}
+ << ")";
+}
+
+std::ostream& operator<<(std::ostream& out, const device_config_t& conf)
+{
+ out << "device_config_t("
+ << "major_dev=" << conf.major_dev
+ << ", spec=" << conf.spec
+ << ", meta=" << conf.meta
+ << ", secondary(";
+ for (const auto& [k, v] : conf.secondary_devices) {
+ out << device_id_printer_t{k}
+ << ": " << v << ", ";
+ }
+ return out << "))";
+}
+
+seastar::future<DeviceRef>
+Device::make_device(const std::string& device, device_type_t dtype)
+{
+ if (get_default_backend_of_device(dtype) == backend_type_t::SEGMENTED) {
+ return SegmentManager::get_segment_manager(device, dtype
+ ).then([](DeviceRef ret) {
+ return ret;
+ });
+ }
+ assert(get_default_backend_of_device(dtype) == backend_type_t::RANDOM_BLOCK);
+ return get_rb_device(device
+ ).then([](DeviceRef ret) {
+ return ret;
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/device.h b/src/crimson/os/seastore/device.h
new file mode 100644
index 000000000..ceb1ede64
--- /dev/null
+++ b/src/crimson/os/seastore/device.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include "include/buffer_fwd.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+using magic_t = uint64_t;
+
+struct device_spec_t {
+ magic_t magic = 0;
+ device_type_t dtype = device_type_t::NONE;
+ device_id_t id = DEVICE_ID_NULL;
+ DENC(device_spec_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.magic, p);
+ denc(v.dtype, p);
+ denc(v.id, p);
+ DENC_FINISH(p);
+ }
+};
+
+std::ostream& operator<<(std::ostream&, const device_spec_t&);
+
+using secondary_device_set_t =
+ std::map<device_id_t, device_spec_t>;
+
+struct device_config_t {
+ bool major_dev = false;
+ device_spec_t spec;
+ seastore_meta_t meta;
+ secondary_device_set_t secondary_devices;
+ DENC(device_config_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.major_dev, p);
+ denc(v.spec, p);
+ denc(v.meta, p);
+ denc(v.secondary_devices, p);
+ DENC_FINISH(p);
+ }
+ static device_config_t create_primary(
+ uuid_d new_osd_fsid,
+ device_id_t id,
+ device_type_t d_type,
+ secondary_device_set_t sds) {
+ return device_config_t{
+ true,
+ device_spec_t{
+ (magic_t)std::rand(),
+ d_type,
+ id},
+ seastore_meta_t{new_osd_fsid},
+ sds};
+ }
+ static device_config_t create_secondary(
+ uuid_d new_osd_fsid,
+ device_id_t id,
+ device_type_t d_type,
+ magic_t magic) {
+ return device_config_t{
+ false,
+ device_spec_t{
+ magic,
+ d_type,
+ id},
+ seastore_meta_t{new_osd_fsid},
+ secondary_device_set_t()};
+ }
+};
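+
+// Editorial sketch (illustrative only): mkfs-time setup builds the primary
+// config with a random magic and ties any secondary device to it through
+// that same magic. device_type_t::SSD and the ids are assumed values:
+//
+//   secondary_device_set_t sds;       // may stay empty for a single device
+//   auto primary = device_config_t::create_primary(
+//     osd_fsid, /* id */ 0, device_type_t::SSD, sds);
+//   auto secondary = device_config_t::create_secondary(
+//     osd_fsid, /* id */ 1, device_type_t::SSD, primary.spec.magic);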
+
+std::ostream& operator<<(std::ostream&, const device_config_t&);
+
+class Device;
+using DeviceRef = std::unique_ptr<Device>;
+
+/**
+ * Device
+ *
+ * Represents a general device regardless of the underlying medium.
+ */
+class Device {
+// interfaces used by device
+public:
+ virtual ~Device() {}
+
+ virtual seastar::future<> start() {
+ return seastar::now();
+ }
+
+ virtual seastar::future<> stop() {
+ return seastar::now();
+ }
+  // called on the shard to get this shard's device
+ virtual Device& get_sharded_device() {
+ return *this;
+ }
+
+ using access_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::permission_denied,
+ crimson::ct_error::enoent>;
+
+ using mkfs_ertr = access_ertr;
+ using mkfs_ret = mkfs_ertr::future<>;
+ virtual mkfs_ret mkfs(device_config_t) = 0;
+
+ using mount_ertr = access_ertr;
+ using mount_ret = access_ertr::future<>;
+ virtual mount_ret mount() = 0;
+
+ static seastar::future<DeviceRef> make_device(
+ const std::string &device,
+ device_type_t dtype);
+
+// interfaces used by each device shard
+public:
+ virtual device_id_t get_device_id() const = 0;
+
+ virtual magic_t get_magic() const = 0;
+
+ virtual device_type_t get_device_type() const = 0;
+
+ virtual backend_type_t get_backend_type() const = 0;
+
+ virtual const seastore_meta_t &get_meta() const = 0;
+
+ virtual extent_len_t get_block_size() const = 0;
+
+ virtual std::size_t get_available_size() const = 0;
+
+ virtual secondary_device_set_t& get_secondary_devices() = 0;
+
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual close_ertr::future<> close() = 0;
+
+ using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) = 0;
+
+ read_ertr::future<ceph::bufferptr> read(
+ paddr_t addr,
+ size_t len
+ ) {
+ auto ptrref = std::make_unique<ceph::bufferptr>(
+ buffer::create_page_aligned(len));
+ return read(addr, len, *ptrref
+ ).safe_then([ptrref=std::move(ptrref)]() mutable {
+ return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref));
+ });
+ }
+};
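+
+// Editorial sketch (illustrative only, coroutine-style pseudocode) of how a
+// store brings a device up; path, dtype, config and addr come from elsewhere:
+//
+//   DeviceRef dev = co_await Device::make_device(path, dtype);
+//   co_await dev->start();
+//   co_await dev->mount();            // or dev->mkfs(config) on a new store
+//   ceph::bufferptr bp =
+//     co_await dev->read(addr, dev->get_block_size());
+//   co_await dev->close();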
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::device_spec_t)
+WRITE_CLASS_DENC(crimson::os::seastore::device_config_t)
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::device_config_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::device_spec_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc
new file mode 100644
index 000000000..b7aabefc6
--- /dev/null
+++ b/src/crimson/os/seastore/extent_placement_manager.cc
@@ -0,0 +1,808 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/os/seastore/extent_placement_manager.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_epm);
+
+namespace crimson::os::seastore {
+
+SegmentedOolWriter::SegmentedOolWriter(
+ data_category_t category,
+ rewrite_gen_t gen,
+ SegmentProvider& sp,
+ SegmentSeqAllocator &ssa)
+ : segment_allocator(nullptr, category, gen, sp, ssa),
+ record_submitter(crimson::common::get_conf<uint64_t>(
+ "seastore_journal_iodepth_limit"),
+ crimson::common::get_conf<uint64_t>(
+ "seastore_journal_batch_capacity"),
+ crimson::common::get_conf<Option::size_t>(
+ "seastore_journal_batch_flush_size"),
+ crimson::common::get_conf<double>(
+ "seastore_journal_batch_preferred_fullness"),
+ segment_allocator)
+{
+}
+
+SegmentedOolWriter::alloc_write_ertr::future<>
+SegmentedOolWriter::write_record(
+ Transaction& t,
+ record_t&& record,
+ std::list<LogicalCachedExtentRef>&& extents,
+ bool with_atomic_roll_segment)
+{
+ LOG_PREFIX(SegmentedOolWriter::write_record);
+ assert(extents.size());
+ assert(extents.size() == record.extents.size());
+ assert(!record.deltas.size());
+
+ // account transactional ool writes before write()
+ auto& stats = t.get_ool_write_stats();
+ stats.extents.num += extents.size();
+ stats.extents.bytes += record.size.dlength;
+ stats.md_bytes += record.size.get_raw_mdlength();
+ stats.num_records += 1;
+
+ return record_submitter.submit(
+ std::move(record),
+ with_atomic_roll_segment
+ ).safe_then([this, FNAME, &t, extents=std::move(extents)
+ ](record_locator_t ret) mutable {
+ DEBUGT("{} finish with {} and {} extents",
+ t, segment_allocator.get_name(),
+ ret, extents.size());
+ paddr_t extent_addr = ret.record_block_base;
+ for (auto& extent : extents) {
+ TRACET("{} ool extent written at {} -- {}",
+ t, segment_allocator.get_name(),
+ extent_addr, *extent);
+ t.update_delayed_ool_extent_addr(extent, extent_addr);
+ extent_addr = extent_addr.as_seg_paddr().add_offset(
+ extent->get_length());
+ }
+ });
+}
+
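+// Editorial note: do_write() below drains the extent list into records. Each
+// extent is appended to the in-flight record until the submitter reports
+// SUBMIT_FULL (flush the batch, then recurse on the remainder) or ROLL (flush
+// whatever is pending, roll to a fresh segment, then retry), so a single call
+// may emit several records before the tail of the list is written.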
+SegmentedOolWriter::alloc_write_iertr::future<>
+SegmentedOolWriter::do_write(
+ Transaction& t,
+ std::list<LogicalCachedExtentRef>& extents)
+{
+ LOG_PREFIX(SegmentedOolWriter::do_write);
+ assert(!extents.empty());
+ if (!record_submitter.is_available()) {
+ DEBUGT("{} extents={} wait ...",
+ t, segment_allocator.get_name(),
+ extents.size());
+ return trans_intr::make_interruptible(
+ record_submitter.wait_available()
+ ).si_then([this, &t, &extents] {
+ return do_write(t, extents);
+ });
+ }
+ record_t record(TRANSACTION_TYPE_NULL);
+ std::list<LogicalCachedExtentRef> pending_extents;
+ auto commit_time = seastar::lowres_system_clock::now();
+
+ for (auto it = extents.begin(); it != extents.end();) {
+ auto& extent = *it;
+ record_size_t wouldbe_rsize = record.size;
+ wouldbe_rsize.account_extent(extent->get_bptr().length());
+ using action_t = journal::RecordSubmitter::action_t;
+ action_t action = record_submitter.check_action(wouldbe_rsize);
+ if (action == action_t::ROLL) {
+ auto num_extents = pending_extents.size();
+ DEBUGT("{} extents={} submit {} extents and roll, unavailable ...",
+ t, segment_allocator.get_name(),
+ extents.size(), num_extents);
+ auto fut_write = alloc_write_ertr::now();
+ if (num_extents > 0) {
+ assert(record_submitter.check_action(record.size) !=
+ action_t::ROLL);
+ fut_write = write_record(
+ t, std::move(record), std::move(pending_extents),
+ true/* with_atomic_roll_segment */);
+ }
+ return trans_intr::make_interruptible(
+ record_submitter.roll_segment(
+ ).safe_then([fut_write=std::move(fut_write)]() mutable {
+ return std::move(fut_write);
+ })
+ ).si_then([this, &t, &extents] {
+ return do_write(t, extents);
+ });
+ }
+
+ TRACET("{} extents={} add extent to record -- {}",
+ t, segment_allocator.get_name(),
+ extents.size(), *extent);
+ ceph::bufferlist bl;
+ extent->prepare_write();
+ bl.append(extent->get_bptr());
+ assert(bl.length() == extent->get_length());
+ auto modify_time = extent->get_modify_time();
+ if (modify_time == NULL_TIME) {
+ modify_time = commit_time;
+ }
+ record.push_back(
+ extent_t{
+ extent->get_type(),
+ extent->get_laddr(),
+ std::move(bl)},
+ modify_time);
+ pending_extents.push_back(extent);
+ it = extents.erase(it);
+
+ assert(record_submitter.check_action(record.size) == action);
+ if (action == action_t::SUBMIT_FULL) {
+ DEBUGT("{} extents={} submit {} extents ...",
+ t, segment_allocator.get_name(),
+ extents.size(), pending_extents.size());
+ return trans_intr::make_interruptible(
+ write_record(t, std::move(record), std::move(pending_extents))
+ ).si_then([this, &t, &extents] {
+ if (!extents.empty()) {
+ return do_write(t, extents);
+ } else {
+ return alloc_write_iertr::now();
+ }
+ });
+ }
+ // SUBMIT_NOT_FULL: evaluate the next extent
+ }
+
+ auto num_extents = pending_extents.size();
+ DEBUGT("{} submit the rest {} extents ...",
+ t, segment_allocator.get_name(),
+ num_extents);
+ assert(num_extents > 0);
+ return trans_intr::make_interruptible(
+ write_record(t, std::move(record), std::move(pending_extents)));
+}
+
+SegmentedOolWriter::alloc_write_iertr::future<>
+SegmentedOolWriter::alloc_write_ool_extents(
+ Transaction& t,
+ std::list<LogicalCachedExtentRef>& extents)
+{
+ if (extents.empty()) {
+ return alloc_write_iertr::now();
+ }
+ return seastar::with_gate(write_guard, [this, &t, &extents] {
+ return do_write(t, extents);
+ });
+}
+
+void ExtentPlacementManager::init(
+ JournalTrimmerImplRef &&trimmer,
+ AsyncCleanerRef &&cleaner,
+ AsyncCleanerRef &&cold_cleaner)
+{
+ writer_refs.clear();
+ auto cold_segment_cleaner = dynamic_cast<SegmentCleaner*>(cold_cleaner.get());
+ dynamic_max_rewrite_generation = MIN_COLD_GENERATION - 1;
+ if (cold_segment_cleaner) {
+ dynamic_max_rewrite_generation = MAX_REWRITE_GENERATION;
+ }
+
+ if (trimmer->get_journal_type() == journal_type_t::SEGMENTED) {
+ auto segment_cleaner = dynamic_cast<SegmentCleaner*>(cleaner.get());
+ ceph_assert(segment_cleaner != nullptr);
+ auto num_writers = generation_to_writer(dynamic_max_rewrite_generation + 1);
+
+ data_writers_by_gen.resize(num_writers, {});
+ for (rewrite_gen_t gen = OOL_GENERATION; gen < MIN_COLD_GENERATION; ++gen) {
+ writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
+ data_category_t::DATA, gen, *segment_cleaner,
+ *ool_segment_seq_allocator));
+ data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
+ }
+
+ md_writers_by_gen.resize(num_writers, {});
+ for (rewrite_gen_t gen = OOL_GENERATION; gen < MIN_COLD_GENERATION; ++gen) {
+ writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
+ data_category_t::METADATA, gen, *segment_cleaner,
+ *ool_segment_seq_allocator));
+ md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
+ }
+
+ for (auto *device : segment_cleaner->get_segment_manager_group()
+ ->get_segment_managers()) {
+ add_device(device);
+ }
+ } else {
+ assert(trimmer->get_journal_type() == journal_type_t::RANDOM_BLOCK);
+ auto rb_cleaner = dynamic_cast<RBMCleaner*>(cleaner.get());
+ ceph_assert(rb_cleaner != nullptr);
+ auto num_writers = generation_to_writer(dynamic_max_rewrite_generation + 1);
+ data_writers_by_gen.resize(num_writers, {});
+ md_writers_by_gen.resize(num_writers, {});
+ writer_refs.emplace_back(std::make_unique<RandomBlockOolWriter>(
+ rb_cleaner));
+    // TODO: implement eviction in RBMCleaner and introduce further writers
+ data_writers_by_gen[generation_to_writer(OOL_GENERATION)] = writer_refs.back().get();
+ md_writers_by_gen[generation_to_writer(OOL_GENERATION)] = writer_refs.back().get();
+ for (auto *rb : rb_cleaner->get_rb_group()->get_rb_managers()) {
+ add_device(rb->get_device());
+ }
+ }
+
+ if (cold_segment_cleaner) {
+ for (rewrite_gen_t gen = MIN_COLD_GENERATION; gen < REWRITE_GENERATIONS; ++gen) {
+ writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
+ data_category_t::DATA, gen, *cold_segment_cleaner,
+ *ool_segment_seq_allocator));
+ data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
+ }
+ for (rewrite_gen_t gen = MIN_COLD_GENERATION; gen < REWRITE_GENERATIONS; ++gen) {
+ writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
+ data_category_t::METADATA, gen, *cold_segment_cleaner,
+ *ool_segment_seq_allocator));
+ md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
+ }
+ for (auto *device : cold_segment_cleaner->get_segment_manager_group()
+ ->get_segment_managers()) {
+ add_device(device);
+ }
+ }
+
+ background_process.init(std::move(trimmer),
+ std::move(cleaner),
+ std::move(cold_cleaner));
+ if (cold_segment_cleaner) {
+ ceph_assert(get_main_backend_type() == backend_type_t::SEGMENTED);
+ ceph_assert(background_process.has_cold_tier());
+ } else {
+ ceph_assert(!background_process.has_cold_tier());
+ }
+}
+
+void ExtentPlacementManager::set_primary_device(Device *device)
+{
+ ceph_assert(primary_device == nullptr);
+ primary_device = device;
+ ceph_assert(devices_by_id[device->get_device_id()] == device);
+}
+
+ExtentPlacementManager::open_ertr::future<>
+ExtentPlacementManager::open_for_write()
+{
+ LOG_PREFIX(ExtentPlacementManager::open_for_write);
+ INFO("started with {} devices", num_devices);
+ ceph_assert(primary_device != nullptr);
+ return crimson::do_for_each(data_writers_by_gen, [](auto &writer) {
+ if (writer) {
+ return writer->open();
+ }
+ return open_ertr::now();
+ }).safe_then([this] {
+ return crimson::do_for_each(md_writers_by_gen, [](auto &writer) {
+ if (writer) {
+ return writer->open();
+ }
+ return open_ertr::now();
+ });
+ });
+}
+
+ExtentPlacementManager::dispatch_result_t
+ExtentPlacementManager::dispatch_delayed_extents(Transaction &t)
+{
+ dispatch_result_t res;
+ res.delayed_extents = t.get_delayed_alloc_list();
+
+ // init projected usage
+ for (auto &extent : t.get_inline_block_list()) {
+ if (extent->is_valid()) {
+ res.usage.inline_usage += extent->get_length();
+ res.usage.cleaner_usage.main_usage += extent->get_length();
+ }
+ }
+
+ for (auto &extent : res.delayed_extents) {
+ if (dispatch_delayed_extent(extent)) {
+ res.usage.inline_usage += extent->get_length();
+ res.usage.cleaner_usage.main_usage += extent->get_length();
+ t.mark_delayed_extent_inline(extent);
+ } else {
+ if (extent->get_rewrite_generation() < MIN_COLD_GENERATION) {
+ res.usage.cleaner_usage.main_usage += extent->get_length();
+ } else {
+ assert(background_process.has_cold_tier());
+ res.usage.cleaner_usage.cold_ool_usage += extent->get_length();
+ }
+ t.mark_delayed_extent_ool(extent);
+ auto writer_ptr = get_writer(
+ extent->get_user_hint(),
+ get_extent_category(extent->get_type()),
+ extent->get_rewrite_generation());
+ res.alloc_map[writer_ptr].emplace_back(extent);
+ }
+ }
+ return res;
+}
+
+ExtentPlacementManager::alloc_paddr_iertr::future<>
+ExtentPlacementManager::write_delayed_ool_extents(
+ Transaction& t,
+ extents_by_writer_t& alloc_map) {
+ return trans_intr::do_for_each(alloc_map, [&t](auto& p) {
+ auto writer = p.first;
+ auto& extents = p.second;
+ return writer->alloc_write_ool_extents(t, extents);
+ });
+}
+
+ExtentPlacementManager::alloc_paddr_iertr::future<>
+ExtentPlacementManager::write_preallocated_ool_extents(
+ Transaction &t,
+ std::list<LogicalCachedExtentRef> extents)
+{
+ LOG_PREFIX(ExtentPlacementManager::write_preallocated_ool_extents);
+ DEBUGT("start with {} allocated extents",
+ t, extents.size());
+ assert(writer_refs.size());
+ return seastar::do_with(
+ std::map<ExtentOolWriter*, std::list<LogicalCachedExtentRef>>(),
+ [this, &t, extents=std::move(extents)](auto& alloc_map) {
+ for (auto& extent : extents) {
+ auto writer_ptr = get_writer(
+ extent->get_user_hint(),
+ get_extent_category(extent->get_type()),
+ extent->get_rewrite_generation());
+ alloc_map[writer_ptr].emplace_back(extent);
+ }
+ return trans_intr::do_for_each(alloc_map, [&t](auto& p) {
+ auto writer = p.first;
+ auto& extents = p.second;
+ return writer->alloc_write_ool_extents(t, extents);
+ });
+ });
+}
+
+ExtentPlacementManager::close_ertr::future<>
+ExtentPlacementManager::close()
+{
+ LOG_PREFIX(ExtentPlacementManager::close);
+ INFO("started");
+ return crimson::do_for_each(data_writers_by_gen, [](auto &writer) {
+ if (writer) {
+ return writer->close();
+ }
+ return close_ertr::now();
+ }).safe_then([this] {
+ return crimson::do_for_each(md_writers_by_gen, [](auto &writer) {
+ if (writer) {
+ return writer->close();
+ }
+ return close_ertr::now();
+ });
+ });
+}
+
+void ExtentPlacementManager::BackgroundProcess::log_state(const char *caller) const
+{
+ LOG_PREFIX(BackgroundProcess::log_state);
+ DEBUG("caller {}, {}, {}",
+ caller,
+ JournalTrimmerImpl::stat_printer_t{*trimmer, true},
+ AsyncCleaner::stat_printer_t{*main_cleaner, true});
+ if (has_cold_tier()) {
+ DEBUG("caller {}, cold_cleaner: {}",
+ caller,
+ AsyncCleaner::stat_printer_t{*cold_cleaner, true});
+ }
+}
+
+void ExtentPlacementManager::BackgroundProcess::start_background()
+{
+ LOG_PREFIX(BackgroundProcess::start_background);
+ INFO("{}, {}",
+ JournalTrimmerImpl::stat_printer_t{*trimmer, true},
+ AsyncCleaner::stat_printer_t{*main_cleaner, true});
+ if (has_cold_tier()) {
+ INFO("cold_cleaner: {}",
+ AsyncCleaner::stat_printer_t{*cold_cleaner, true});
+ }
+ ceph_assert(trimmer->check_is_ready());
+ ceph_assert(state == state_t::SCAN_SPACE);
+ assert(!is_running());
+ process_join = seastar::now();
+ state = state_t::RUNNING;
+ assert(is_running());
+ process_join = run();
+}
+
+seastar::future<>
+ExtentPlacementManager::BackgroundProcess::stop_background()
+{
+ return seastar::futurize_invoke([this] {
+ if (!is_running()) {
+ if (state != state_t::HALT) {
+ state = state_t::STOP;
+ }
+ return seastar::now();
+ }
+ auto ret = std::move(*process_join);
+ process_join.reset();
+ state = state_t::HALT;
+ assert(!is_running());
+ do_wake_background();
+ return ret;
+ }).then([this] {
+ LOG_PREFIX(BackgroundProcess::stop_background);
+ INFO("done, {}, {}",
+ JournalTrimmerImpl::stat_printer_t{*trimmer, true},
+ AsyncCleaner::stat_printer_t{*main_cleaner, true});
+ if (has_cold_tier()) {
+ INFO("done, cold_cleaner: {}",
+ AsyncCleaner::stat_printer_t{*cold_cleaner, true});
+ }
+ // run_until_halt() can be called at HALT
+ });
+}
+
+seastar::future<>
+ExtentPlacementManager::BackgroundProcess::run_until_halt()
+{
+ ceph_assert(state == state_t::HALT);
+ assert(!is_running());
+ if (is_running_until_halt) {
+ return seastar::now();
+ }
+ is_running_until_halt = true;
+ return seastar::do_until(
+ [this] {
+ log_state("run_until_halt");
+ assert(is_running_until_halt);
+ if (background_should_run()) {
+ return false;
+ } else {
+ is_running_until_halt = false;
+ return true;
+ }
+ },
+ [this] {
+ return do_background_cycle();
+ }
+ );
+}
+
+seastar::future<>
+ExtentPlacementManager::BackgroundProcess::reserve_projected_usage(
+ io_usage_t usage)
+{
+ if (!is_ready()) {
+ return seastar::now();
+ }
+ ceph_assert(!blocking_io);
+ // The pipeline configuration prevents another IO from entering
+ // prepare until the prior one exits and clears this.
+ ++stats.io_count;
+
+ auto res = try_reserve_io(usage);
+ if (res.is_successful()) {
+ return seastar::now();
+ } else {
+ abort_io_usage(usage, res);
+ if (!res.reserve_inline_success) {
+ ++stats.io_blocked_count_trim;
+ }
+ if (!res.cleaner_result.is_successful()) {
+ ++stats.io_blocked_count_clean;
+ }
+ ++stats.io_blocking_num;
+ ++stats.io_blocked_count;
+ stats.io_blocked_sum += stats.io_blocking_num;
+
+ return seastar::repeat([this, usage] {
+ blocking_io = seastar::promise<>();
+ return blocking_io->get_future(
+ ).then([this, usage] {
+ ceph_assert(!blocking_io);
+ auto res = try_reserve_io(usage);
+ if (res.is_successful()) {
+ assert(stats.io_blocking_num == 1);
+ --stats.io_blocking_num;
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ } else {
+ abort_io_usage(usage, res);
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ }
+ });
+ });
+ }
+}
+
+seastar::future<>
+ExtentPlacementManager::BackgroundProcess::run()
+{
+ assert(is_running());
+ return seastar::repeat([this] {
+ if (!is_running()) {
+ log_state("run(exit)");
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ return seastar::futurize_invoke([this] {
+ if (background_should_run()) {
+ log_state("run(background)");
+ return do_background_cycle();
+ } else {
+ log_state("run(block)");
+ ceph_assert(!blocking_background);
+ blocking_background = seastar::promise<>();
+ return blocking_background->get_future();
+ }
+ }).then([] {
+ return seastar::stop_iteration::no;
+ });
+ });
+}
+
+/**
+ * Reservation Process
+ *
+ * Most transactions need to reserve their space usage before performing
+ * ool writes and committing. If the space reservation is unsuccessful,
+ * the current transaction is blocked and waits for background
+ * transactions to finish.
+ *
+ * The following are the reservation requirements for each transaction type:
+ * 1. MUTATE transaction:
+ *    (1) inline usage on the trimmer,
+ *    (2) inline and OOL usage on the main cleaner,
+ *    (3) cold OOL usage on the cold cleaner (if it exists).
+ * 2. TRIM_DIRTY/TRIM_ALLOC transaction:
+ *    (1) the usage of all extents on the main cleaner,
+ *    (2) usage on the cold cleaner (if it exists).
+ * 3. CLEANER_MAIN:
+ *    (1) the size of cleaned extents on the cold cleaner (if it exists).
+ * 4. CLEANER_COLD transactions do not require space reservation.
+ *
+ * The reserve implementation should satisfy the following conditions:
+ * 1. The reservation should be atomic. If a reservation is composed of several
+ *    sub-reservations, such as a MUTATE transaction that needs to reserve space
+ *    on both the trimmer and the cleaner at the same time, it succeeds only if
+ *    all of its sub-reservations succeed. If one or more of them fail, the
+ *    entire reservation fails and any successful sub-reservations must be
+ *    reverted.
+ * 2. The reserve/block relationship should form a DAG to avoid deadlock. For
+ *    example, a TRIM_ALLOC transaction might be blocked by the cleaner because
+ *    it failed to reserve space on the cleaner. In that case, the cleaner must
+ *    not reserve space on the trimmer, since the trimmer is already blocked.
+ *
+ * Finally the reserve relationship can be represented as follows:
+ *
+ * +-------------------------+----------------+
+ * | | |
+ * | v v
+ * MUTATE ---> TRIM_* ---> CLEANER_MAIN ---> CLEANER_COLD
+ * | ^
+ * | |
+ * +--------------------------------+
+ */
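+// A minimal sketch of the reserve-or-revert pattern described above, as it
+// would look inside a BackgroundProcess member; `usage` (a cleaner_usage_t)
+// is assumed to be provided by the caller:
+//
+//   auto res = try_reserve_cleaner(usage);
+//   if (!res.is_successful()) {
+//     // revert whichever sub-reservation succeeded before blocking/retrying
+//     abort_cleaner_usage(usage, res);
+//   }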
+bool ExtentPlacementManager::BackgroundProcess::try_reserve_cold(std::size_t usage)
+{
+ if (has_cold_tier()) {
+ return cold_cleaner->try_reserve_projected_usage(usage);
+ } else {
+ assert(usage == 0);
+ return true;
+ }
+}
+void ExtentPlacementManager::BackgroundProcess::abort_cold_usage(
+ std::size_t usage, bool success)
+{
+ if (has_cold_tier() && success) {
+ cold_cleaner->release_projected_usage(usage);
+ }
+}
+
+reserve_cleaner_result_t
+ExtentPlacementManager::BackgroundProcess::try_reserve_cleaner(
+ const cleaner_usage_t &usage)
+{
+ return {
+ main_cleaner->try_reserve_projected_usage(usage.main_usage),
+ try_reserve_cold(usage.cold_ool_usage)
+ };
+}
+
+void ExtentPlacementManager::BackgroundProcess::abort_cleaner_usage(
+ const cleaner_usage_t &usage,
+ const reserve_cleaner_result_t &result)
+{
+ if (result.reserve_main_success) {
+ main_cleaner->release_projected_usage(usage.main_usage);
+ }
+ abort_cold_usage(usage.cold_ool_usage, result.reserve_cold_success);
+}
+
+reserve_io_result_t
+ExtentPlacementManager::BackgroundProcess::try_reserve_io(
+ const io_usage_t &usage)
+{
+ return {
+ trimmer->try_reserve_inline_usage(usage.inline_usage),
+ try_reserve_cleaner(usage.cleaner_usage)
+ };
+}
+
+void ExtentPlacementManager::BackgroundProcess::abort_io_usage(
+ const io_usage_t &usage,
+ const reserve_io_result_t &result)
+{
+ if (result.reserve_inline_success) {
+ trimmer->release_inline_usage(usage.inline_usage);
+ }
+ abort_cleaner_usage(usage.cleaner_usage, result.cleaner_result);
+}
+
+seastar::future<>
+ExtentPlacementManager::BackgroundProcess::do_background_cycle()
+{
+ assert(is_ready());
+ bool should_trim = trimmer->should_trim();
+ bool proceed_trim = false;
+ auto trim_size = trimmer->get_trim_size_per_cycle();
+ cleaner_usage_t trim_usage{
+ trim_size,
+    // We take a cautious policy here: the trimmer also reserves the maximum
+    // value on the cold cleaner even if no extents will be rewritten to the
+    // cold tier. The cleaner takes the same policy. The reason is that the
+    // exact reservation is unknown until the trimmer transaction has been
+    // constructed, after which the reservation might fail and the trimmer
+    // could then be invalidated by the cleaner. Reserving the maximum size
+    // up front avoids this trouble.
+ has_cold_tier() ? trim_size : 0
+ };
+
+ reserve_cleaner_result_t trim_reserve_res;
+ if (should_trim) {
+ trim_reserve_res = try_reserve_cleaner(trim_usage);
+ if (trim_reserve_res.is_successful()) {
+ proceed_trim = true;
+ } else {
+ abort_cleaner_usage(trim_usage, trim_reserve_res);
+ }
+ }
+
+ if (proceed_trim) {
+ return trimmer->trim(
+ ).finally([this, trim_usage] {
+ abort_cleaner_usage(trim_usage, {true, true});
+ });
+ } else {
+ bool should_clean_main =
+ main_cleaner_should_run() ||
+ // make sure cleaner will start
+ // when the trimmer should run but
+ // failed to reserve space.
+ (should_trim && !proceed_trim &&
+ !trim_reserve_res.reserve_main_success);
+ bool proceed_clean_main = false;
+
+ auto main_cold_usage = main_cleaner->get_reclaim_size_per_cycle();
+ if (should_clean_main) {
+ if (has_cold_tier()) {
+ proceed_clean_main = try_reserve_cold(main_cold_usage);
+ } else {
+ proceed_clean_main = true;
+ }
+ }
+
+ bool proceed_clean_cold = false;
+ if (has_cold_tier() &&
+ (cold_cleaner->should_clean_space() ||
+ (should_trim && !proceed_trim &&
+ !trim_reserve_res.reserve_cold_success) ||
+ (should_clean_main && !proceed_clean_main))) {
+ proceed_clean_cold = true;
+ }
+
+ if (!proceed_clean_main && !proceed_clean_cold) {
+ ceph_abort("no background process will start");
+ }
+ return seastar::when_all(
+ [this, proceed_clean_main, main_cold_usage] {
+ if (!proceed_clean_main) {
+ return seastar::now();
+ }
+ return main_cleaner->clean_space(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "do_background_cycle encountered invalid error in main clean_space"
+ }
+ ).finally([this, main_cold_usage] {
+ abort_cold_usage(main_cold_usage, true);
+ });
+ },
+ [this, proceed_clean_cold] {
+ if (!proceed_clean_cold) {
+ return seastar::now();
+ }
+ return cold_cleaner->clean_space(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "do_background_cycle encountered invalid error in cold clean_space"
+ }
+ );
+ }
+ ).discard_result();
+ }
+}
+
+void ExtentPlacementManager::BackgroundProcess::register_metrics()
+{
+ namespace sm = seastar::metrics;
+ metrics.add_group("background_process", {
+ sm::make_counter("io_count", stats.io_count,
+ sm::description("the sum of IOs")),
+ sm::make_counter("io_blocked_count", stats.io_blocked_count,
+ sm::description("IOs that are blocked by gc")),
+ sm::make_counter("io_blocked_count_trim", stats.io_blocked_count_trim,
+ sm::description("IOs that are blocked by trimming")),
+ sm::make_counter("io_blocked_count_clean", stats.io_blocked_count_clean,
+ sm::description("IOs that are blocked by cleaning")),
+ sm::make_counter("io_blocked_sum", stats.io_blocked_sum,
+ sm::description("the sum of blocking IOs"))
+ });
+}
+
+RandomBlockOolWriter::alloc_write_iertr::future<>
+RandomBlockOolWriter::alloc_write_ool_extents(
+ Transaction& t,
+ std::list<LogicalCachedExtentRef>& extents)
+{
+ if (extents.empty()) {
+ return alloc_write_iertr::now();
+ }
+ return seastar::with_gate(write_guard, [this, &t, &extents] {
+ return do_write(t, extents);
+ });
+}
+
+RandomBlockOolWriter::alloc_write_iertr::future<>
+RandomBlockOolWriter::do_write(
+ Transaction& t,
+ std::list<LogicalCachedExtentRef>& extents)
+{
+ LOG_PREFIX(RandomBlockOolWriter::do_write);
+ assert(!extents.empty());
+ DEBUGT("start with {} allocated extents",
+ t, extents.size());
+ return trans_intr::do_for_each(extents,
+ [this, &t, FNAME](auto& ex) {
+ auto paddr = ex->get_paddr();
+ assert(paddr.is_absolute());
+ RandomBlockManager * rbm = rb_cleaner->get_rbm(paddr);
+ assert(rbm);
+ TRACE("extent {}, allocated addr {}", fmt::ptr(ex.get()), paddr);
+ auto& stats = t.get_ool_write_stats();
+ stats.extents.num += 1;
+ stats.extents.bytes += ex->get_length();
+ stats.num_records += 1;
+
+ ex->prepare_write();
+ return rbm->write(paddr,
+ ex->get_bptr()
+ ).handle_error(
+ alloc_write_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error when writing record"}
+ ).safe_then([&t, &ex, paddr, FNAME]() {
+ TRACET("ool extent written at {} -- {}",
+ t, paddr, *ex);
+ t.mark_allocated_extent_ool(ex);
+ return alloc_write_iertr::now();
+ });
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h
new file mode 100644
index 000000000..b94c03ec3
--- /dev/null
+++ b/src/crimson/os/seastore/extent_placement_manager.h
@@ -0,0 +1,915 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include "seastar/core/gate.hh"
+
+#include "crimson/os/seastore/async_cleaner.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/journal/segment_allocator.h"
+#include "crimson/os/seastore/journal/record_submitter.h"
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/random_block_manager/block_rb_manager.h"
+#include "crimson/os/seastore/randomblock_manager_group.h"
+
+class transaction_manager_test_t;
+
+namespace crimson::os::seastore {
+
+/**
+ * ExtentOolWriter
+ *
+ * Write the extents as out-of-line and allocate the physical addresses.
+ * Different writers write extents to different locations.
+ */
+class ExtentOolWriter {
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+public:
+ virtual ~ExtentOolWriter() {}
+
+ using open_ertr = base_ertr;
+ virtual open_ertr::future<> open() = 0;
+
+ virtual paddr_t alloc_paddr(extent_len_t length) = 0;
+
+ using alloc_write_ertr = base_ertr;
+ using alloc_write_iertr = trans_iertr<alloc_write_ertr>;
+ virtual alloc_write_iertr::future<> alloc_write_ool_extents(
+ Transaction &t,
+ std::list<LogicalCachedExtentRef> &extents) = 0;
+
+ using close_ertr = base_ertr;
+ virtual close_ertr::future<> close() = 0;
+};
+using ExtentOolWriterRef = std::unique_ptr<ExtentOolWriter>;
+
+/**
+ * SegmentedOolWriter
+ *
+ * Different writers write extents to different out-of-line segments provided
+ * by the SegmentProvider.
+ */
+class SegmentedOolWriter : public ExtentOolWriter {
+public:
+ SegmentedOolWriter(data_category_t category,
+ rewrite_gen_t gen,
+ SegmentProvider &sp,
+ SegmentSeqAllocator &ssa);
+
+ open_ertr::future<> open() final {
+ return record_submitter.open(false).discard_result();
+ }
+
+ alloc_write_iertr::future<> alloc_write_ool_extents(
+ Transaction &t,
+ std::list<LogicalCachedExtentRef> &extents) final;
+
+ close_ertr::future<> close() final {
+ return write_guard.close().then([this] {
+ return record_submitter.close();
+ }).safe_then([this] {
+ write_guard = seastar::gate();
+ });
+ }
+
+ paddr_t alloc_paddr(extent_len_t length) final {
+ return make_delayed_temp_paddr(0);
+ }
+
+private:
+ alloc_write_iertr::future<> do_write(
+ Transaction& t,
+ std::list<LogicalCachedExtentRef> &extent);
+
+ alloc_write_ertr::future<> write_record(
+ Transaction& t,
+ record_t&& record,
+ std::list<LogicalCachedExtentRef> &&extents,
+ bool with_atomic_roll_segment=false);
+
+ journal::SegmentAllocator segment_allocator;
+ journal::RecordSubmitter record_submitter;
+ seastar::gate write_guard;
+};
+
+
+class RandomBlockOolWriter : public ExtentOolWriter {
+public:
+ RandomBlockOolWriter(RBMCleaner* rb_cleaner) :
+ rb_cleaner(rb_cleaner) {}
+
+ using open_ertr = ExtentOolWriter::open_ertr;
+ open_ertr::future<> open() final {
+ return open_ertr::now();
+ }
+
+ alloc_write_iertr::future<> alloc_write_ool_extents(
+ Transaction &t,
+ std::list<LogicalCachedExtentRef> &extents) final;
+
+ close_ertr::future<> close() final {
+ return write_guard.close().then([this] {
+ write_guard = seastar::gate();
+ return close_ertr::now();
+ });
+ }
+
+ paddr_t alloc_paddr(extent_len_t length) final {
+ assert(rb_cleaner);
+ return rb_cleaner->alloc_paddr(length);
+ }
+
+private:
+ alloc_write_iertr::future<> do_write(
+ Transaction& t,
+ std::list<LogicalCachedExtentRef> &extent);
+
+ RBMCleaner* rb_cleaner;
+ seastar::gate write_guard;
+};
+
+struct cleaner_usage_t {
+  // The size of all extents written to the main devices, including inline
+  // extents and out-of-line extents.
+ std::size_t main_usage = 0;
+  // The size of extents written to the cold devices
+ std::size_t cold_ool_usage = 0;
+};
+
+struct reserve_cleaner_result_t {
+ bool reserve_main_success = true;
+ bool reserve_cold_success = true;
+
+ bool is_successful() const {
+ return reserve_main_success &&
+ reserve_cold_success;
+ }
+};
+
+/**
+ * io_usage_t
+ *
+ * io_usage_t describes the space usage consumed by client IO.
+ */
+struct io_usage_t {
+ // The total size of all inlined extents, not including deltas and other metadata
+ // produced by Cache::prepare_record.
+ std::size_t inline_usage = 0;
+ cleaner_usage_t cleaner_usage;
+ friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) {
+ return out << "io_usage_t("
+ << "inline_usage=" << usage.inline_usage
+ << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage
+ << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage
+ << ")";
+ }
+};
+
+struct reserve_io_result_t {
+ bool reserve_inline_success = true;
+ reserve_cleaner_result_t cleaner_result;
+
+ bool is_successful() const {
+ return reserve_inline_success &&
+ cleaner_result.is_successful();
+ }
+};
+
+class ExtentPlacementManager {
+public:
+ ExtentPlacementManager()
+ : ool_segment_seq_allocator(
+ std::make_unique<SegmentSeqAllocator>(segment_type_t::OOL))
+ {
+ devices_by_id.resize(DEVICE_ID_MAX, nullptr);
+ }
+
+ void init(JournalTrimmerImplRef &&, AsyncCleanerRef &&, AsyncCleanerRef &&);
+
+ SegmentSeqAllocator &get_ool_segment_seq_allocator() const {
+ return *ool_segment_seq_allocator;
+ }
+
+ void set_primary_device(Device *device);
+
+ void set_extent_callback(ExtentCallbackInterface *cb) {
+ background_process.set_extent_callback(cb);
+ }
+
+ journal_type_t get_journal_type() const {
+ return background_process.get_journal_type();
+ }
+
+ extent_len_t get_block_size() const {
+ assert(primary_device != nullptr);
+ // assume all the devices have the same block size
+ return primary_device->get_block_size();
+ }
+
+ Device& get_primary_device() {
+ assert(primary_device != nullptr);
+ return *primary_device;
+ }
+
+ store_statfs_t get_stat() const {
+ return background_process.get_stat();
+ }
+
+ using mount_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using mount_ret = mount_ertr::future<>;
+ mount_ret mount() {
+ return background_process.mount();
+ }
+
+ using open_ertr = ExtentOolWriter::open_ertr;
+ open_ertr::future<> open_for_write();
+
+ void start_scan_space() {
+ return background_process.start_scan_space();
+ }
+
+ void start_background() {
+ return background_process.start_background();
+ }
+
+ struct alloc_result_t {
+ paddr_t paddr;
+ bufferptr bp;
+ rewrite_gen_t gen;
+ };
+ alloc_result_t alloc_new_extent(
+ Transaction& t,
+ extent_types_t type,
+ extent_len_t length,
+ placement_hint_t hint,
+#ifdef UNIT_TESTS_BUILT
+ rewrite_gen_t gen,
+ std::optional<paddr_t> external_paddr = std::nullopt
+#else
+ rewrite_gen_t gen
+#endif
+ ) {
+ assert(hint < placement_hint_t::NUM_HINTS);
+ assert(is_target_rewrite_generation(gen));
+ assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE);
+
+ data_category_t category = get_extent_category(type);
+ gen = adjust_generation(category, type, hint, gen);
+
+ // XXX: bp might be extended to point to different memory (e.g. PMem)
+ // according to the allocator.
+ auto bp = ceph::bufferptr(
+ buffer::create_page_aligned(length));
+ bp.zero();
+ paddr_t addr;
+#ifdef UNIT_TESTS_BUILT
+ if (unlikely(external_paddr.has_value())) {
+ assert(external_paddr->is_fake());
+ addr = *external_paddr;
+ } else if (gen == INLINE_GENERATION) {
+#else
+ if (gen == INLINE_GENERATION) {
+#endif
+ addr = make_record_relative_paddr(0);
+ } else if (category == data_category_t::DATA) {
+ assert(data_writers_by_gen[generation_to_writer(gen)]);
+ addr = data_writers_by_gen[
+ generation_to_writer(gen)]->alloc_paddr(length);
+ } else {
+ assert(category == data_category_t::METADATA);
+ assert(md_writers_by_gen[generation_to_writer(gen)]);
+ addr = md_writers_by_gen[
+ generation_to_writer(gen)]->alloc_paddr(length);
+ }
+ return {addr, std::move(bp), gen};
+ }
+
+ /**
+ * dispatch_result_t
+ *
+   * ool extents are placed in alloc_map and passed to
+   * EPM::write_delayed_ool_extents;
+   * delayed_extents is used to update the lba mapping;
+   * usage is used to reserve projected space.
+ */
+ using extents_by_writer_t =
+ std::map<ExtentOolWriter*, std::list<LogicalCachedExtentRef>>;
+ struct dispatch_result_t {
+ extents_by_writer_t alloc_map;
+ std::list<LogicalCachedExtentRef> delayed_extents;
+ io_usage_t usage;
+ };
+
+ /**
+ * dispatch_delayed_extents
+ *
+ * Performs delayed allocation
+ */
+ dispatch_result_t dispatch_delayed_extents(Transaction& t);
+
+ /**
+ * write_delayed_ool_extents
+ *
+ * Do writes for out-of-line extents.
+ */
+ using alloc_paddr_iertr = ExtentOolWriter::alloc_write_iertr;
+ alloc_paddr_iertr::future<> write_delayed_ool_extents(
+ Transaction& t,
+ extents_by_writer_t& alloc_map);
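+
+  // A minimal caller-side sketch of the expected sequence (error handling and
+  // object lifetimes omitted; `epm` and `t` are assumed to be available):
+  //
+  //   auto dispatch = epm.dispatch_delayed_extents(t);
+  //   return epm.reserve_projected_usage(dispatch.usage
+  //   ).then([&] {
+  //     return epm.write_delayed_ool_extents(t, dispatch.alloc_map);
+  //   });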
+
+ /**
+ * write_preallocated_ool_extents
+ *
+ * Performs ool writes for extents with pre-allocated addresses.
+ * See Transaction::pre_alloc_list
+ */
+ alloc_paddr_iertr::future<> write_preallocated_ool_extents(
+ Transaction &t,
+ std::list<LogicalCachedExtentRef> extents);
+
+ seastar::future<> stop_background() {
+ return background_process.stop_background();
+ }
+
+ using close_ertr = ExtentOolWriter::close_ertr;
+ close_ertr::future<> close();
+
+ using read_ertr = Device::read_ertr;
+ read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out
+ ) {
+ assert(devices_by_id[addr.get_device_id()] != nullptr);
+ return devices_by_id[addr.get_device_id()]->read(addr, len, out);
+ }
+
+ void mark_space_used(paddr_t addr, extent_len_t len) {
+ background_process.mark_space_used(addr, len);
+ }
+
+ void mark_space_free(paddr_t addr, extent_len_t len) {
+ background_process.mark_space_free(addr, len);
+ }
+
+ void commit_space_used(paddr_t addr, extent_len_t len) {
+ return background_process.commit_space_used(addr, len);
+ }
+
+ seastar::future<> reserve_projected_usage(io_usage_t usage) {
+ return background_process.reserve_projected_usage(usage);
+ }
+
+ void release_projected_usage(const io_usage_t &usage) {
+ background_process.release_projected_usage(usage);
+ }
+
+ backend_type_t get_main_backend_type() const {
+ if (!background_process.is_no_background()) {
+ return background_process.get_main_backend_type();
+ }
+ // for test
+ assert(primary_device);
+ return primary_device->get_backend_type();
+ }
+
+ // Testing interfaces
+
+ void test_init_no_background(Device *test_device) {
+ assert(test_device->get_backend_type() == backend_type_t::SEGMENTED);
+ add_device(test_device);
+ set_primary_device(test_device);
+ }
+
+ bool check_usage() {
+ return background_process.check_usage();
+ }
+
+ seastar::future<> run_background_work_until_halt() {
+ return background_process.run_until_halt();
+ }
+
+private:
+ rewrite_gen_t adjust_generation(
+ data_category_t category,
+ extent_types_t type,
+ placement_hint_t hint,
+ rewrite_gen_t gen) {
+ if (type == extent_types_t::ROOT) {
+ gen = INLINE_GENERATION;
+ } else if (get_main_backend_type() == backend_type_t::SEGMENTED &&
+ is_lba_backref_node(type)) {
+ gen = INLINE_GENERATION;
+ } else if (hint == placement_hint_t::COLD) {
+ assert(gen == INIT_GENERATION);
+ if (background_process.has_cold_tier()) {
+ gen = MIN_COLD_GENERATION;
+ } else {
+ gen = MIN_REWRITE_GENERATION;
+ }
+ } else if (gen == INIT_GENERATION) {
+ if (category == data_category_t::METADATA) {
+ if (get_main_backend_type() == backend_type_t::SEGMENTED) {
+          // with SEGMENTED, metadata extents are not placed out-of-line by
+          // default, to reduce padding overhead.
+ // TODO: improve padding so we can default to the ool path.
+ gen = INLINE_GENERATION;
+ } else {
+ // with RBM, all extents must be OOL
+ assert(get_main_backend_type() ==
+ backend_type_t::RANDOM_BLOCK);
+ gen = OOL_GENERATION;
+ }
+ } else {
+ assert(category == data_category_t::DATA);
+ gen = OOL_GENERATION;
+ }
+ } else if (background_process.has_cold_tier()) {
+ gen = background_process.adjust_generation(gen);
+ }
+
+ if (gen > dynamic_max_rewrite_generation) {
+ gen = dynamic_max_rewrite_generation;
+ }
+
+ return gen;
+ }
+
+ void add_device(Device *device) {
+ auto device_id = device->get_device_id();
+ ceph_assert(devices_by_id[device_id] == nullptr);
+ devices_by_id[device_id] = device;
+ ++num_devices;
+ }
+
+ /**
+ * dispatch_delayed_extent
+ *
+   * Decide whether the extent should be written inline or out-of-line;
+   * returns true for inline, false for ool.
+ */
+ bool dispatch_delayed_extent(LogicalCachedExtentRef& extent) {
+ // TODO: all delayed extents are ool currently
+ boost::ignore_unused(extent);
+ return false;
+ }
+
+ ExtentOolWriter* get_writer(placement_hint_t hint,
+ data_category_t category,
+ rewrite_gen_t gen) {
+ assert(hint < placement_hint_t::NUM_HINTS);
+ assert(is_rewrite_generation(gen));
+ assert(gen != INLINE_GENERATION);
+ assert(gen <= dynamic_max_rewrite_generation);
+ if (category == data_category_t::DATA) {
+ return data_writers_by_gen[generation_to_writer(gen)];
+ } else {
+ assert(category == data_category_t::METADATA);
+ return md_writers_by_gen[generation_to_writer(gen)];
+ }
+ }
+
+ /**
+ * BackgroundProcess
+ *
+ * Background process to schedule background transactions.
+ *
+ * TODO: device tiering
+ */
+ class BackgroundProcess : public BackgroundListener {
+ public:
+ BackgroundProcess() = default;
+
+ void init(JournalTrimmerImplRef &&_trimmer,
+ AsyncCleanerRef &&_cleaner,
+ AsyncCleanerRef &&_cold_cleaner) {
+ trimmer = std::move(_trimmer);
+ trimmer->set_background_callback(this);
+ main_cleaner = std::move(_cleaner);
+ main_cleaner->set_background_callback(this);
+ if (_cold_cleaner) {
+ cold_cleaner = std::move(_cold_cleaner);
+ cold_cleaner->set_background_callback(this);
+
+ cleaners_by_device_id.resize(DEVICE_ID_MAX, nullptr);
+ for (auto id : main_cleaner->get_device_ids()) {
+ cleaners_by_device_id[id] = main_cleaner.get();
+ }
+ for (auto id : cold_cleaner->get_device_ids()) {
+ cleaners_by_device_id[id] = cold_cleaner.get();
+ }
+
+ eviction_state.init(
+ crimson::common::get_conf<double>(
+ "seastore_multiple_tiers_stop_evict_ratio"),
+ crimson::common::get_conf<double>(
+ "seastore_multiple_tiers_default_evict_ratio"),
+ crimson::common::get_conf<double>(
+ "seastore_multiple_tiers_fast_evict_ratio"));
+ }
+ }
+
+ journal_type_t get_journal_type() const {
+ return trimmer->get_journal_type();
+ }
+
+ bool has_cold_tier() const {
+ return cold_cleaner.get() != nullptr;
+ }
+
+ void set_extent_callback(ExtentCallbackInterface *cb) {
+ trimmer->set_extent_callback(cb);
+ main_cleaner->set_extent_callback(cb);
+ if (has_cold_tier()) {
+ cold_cleaner->set_extent_callback(cb);
+ }
+ }
+
+ store_statfs_t get_stat() const {
+ auto stat = main_cleaner->get_stat();
+ if (has_cold_tier()) {
+ stat.add(cold_cleaner->get_stat());
+ }
+ return stat;
+ }
+
+ using mount_ret = ExtentPlacementManager::mount_ret;
+ mount_ret mount() {
+ ceph_assert(state == state_t::STOP);
+ state = state_t::MOUNT;
+ trimmer->reset();
+ stats = {};
+ register_metrics();
+ return main_cleaner->mount(
+ ).safe_then([this] {
+ return has_cold_tier() ? cold_cleaner->mount() : mount_ertr::now();
+ });
+ }
+
+ void start_scan_space() {
+ ceph_assert(state == state_t::MOUNT);
+ state = state_t::SCAN_SPACE;
+ ceph_assert(main_cleaner->check_usage_is_empty());
+ ceph_assert(!has_cold_tier() ||
+ cold_cleaner->check_usage_is_empty());
+ }
+
+ void start_background();
+
+ void mark_space_used(paddr_t addr, extent_len_t len) {
+ if (state < state_t::SCAN_SPACE) {
+ return;
+ }
+
+ if (!has_cold_tier()) {
+ assert(main_cleaner);
+ main_cleaner->mark_space_used(addr, len);
+ } else {
+ auto id = addr.get_device_id();
+ assert(id < cleaners_by_device_id.size());
+ auto cleaner = cleaners_by_device_id[id];
+ assert(cleaner);
+ cleaner->mark_space_used(addr, len);
+ }
+ }
+
+ void mark_space_free(paddr_t addr, extent_len_t len) {
+ if (state < state_t::SCAN_SPACE) {
+ return;
+ }
+
+ if (!has_cold_tier()) {
+ assert(main_cleaner);
+ main_cleaner->mark_space_free(addr, len);
+ } else {
+ auto id = addr.get_device_id();
+ assert(id < cleaners_by_device_id.size());
+ auto cleaner = cleaners_by_device_id[id];
+ assert(cleaner);
+ cleaner->mark_space_free(addr, len);
+ }
+ }
+
+ void commit_space_used(paddr_t addr, extent_len_t len) {
+ if (state < state_t::SCAN_SPACE) {
+ return;
+ }
+
+ if (!has_cold_tier()) {
+ assert(main_cleaner);
+ main_cleaner->commit_space_used(addr, len);
+ } else {
+ auto id = addr.get_device_id();
+ assert(id < cleaners_by_device_id.size());
+ auto cleaner = cleaners_by_device_id[id];
+ assert(cleaner);
+ cleaner->commit_space_used(addr, len);
+ }
+ }
+
+ rewrite_gen_t adjust_generation(rewrite_gen_t gen) {
+ if (has_cold_tier()) {
+ return eviction_state.adjust_generation_with_eviction(gen);
+ } else {
+ return gen;
+ }
+ }
+
+ seastar::future<> reserve_projected_usage(io_usage_t usage);
+
+ void release_projected_usage(const io_usage_t &usage) {
+ if (is_ready()) {
+ trimmer->release_inline_usage(usage.inline_usage);
+ main_cleaner->release_projected_usage(usage.cleaner_usage.main_usage);
+ if (has_cold_tier()) {
+ cold_cleaner->release_projected_usage(usage.cleaner_usage.cold_ool_usage);
+ }
+ }
+ }
+
+ seastar::future<> stop_background();
+ backend_type_t get_main_backend_type() const {
+ return get_journal_type();
+ }
+
+ // Testing interfaces
+
+ bool check_usage() {
+ return main_cleaner->check_usage() &&
+ (!has_cold_tier() || cold_cleaner->check_usage());
+ }
+
+ seastar::future<> run_until_halt();
+
+ bool is_no_background() const {
+ return !trimmer || !main_cleaner;
+ }
+
+ protected:
+ state_t get_state() const final {
+ return state;
+ }
+
+ void maybe_wake_background() final {
+ if (!is_running()) {
+ return;
+ }
+ if (background_should_run()) {
+ do_wake_background();
+ }
+ }
+
+ void maybe_wake_blocked_io() final {
+ if (!is_ready()) {
+ return;
+ }
+ if (!should_block_io() && blocking_io) {
+ blocking_io->set_value();
+ blocking_io = std::nullopt;
+ }
+ }
+
+ private:
+ // reserve helpers
+ bool try_reserve_cold(std::size_t usage);
+ void abort_cold_usage(std::size_t usage, bool success);
+
+ reserve_cleaner_result_t try_reserve_cleaner(const cleaner_usage_t &usage);
+ void abort_cleaner_usage(const cleaner_usage_t &usage,
+ const reserve_cleaner_result_t &result);
+
+ reserve_io_result_t try_reserve_io(const io_usage_t &usage);
+ void abort_io_usage(const io_usage_t &usage,
+ const reserve_io_result_t &result);
+
+ bool is_running() const {
+ if (state == state_t::RUNNING) {
+ assert(process_join);
+ return true;
+ } else {
+ assert(!process_join);
+ return false;
+ }
+ }
+
+ void log_state(const char *caller) const;
+
+ seastar::future<> run();
+
+ void do_wake_background() {
+ if (blocking_background) {
+ blocking_background->set_value();
+ blocking_background = std::nullopt;
+ }
+ }
+
+ // background_should_run() should be atomic with do_background_cycle()
+ // to make sure the condition is consistent.
+ bool background_should_run() {
+ assert(is_ready());
+ maybe_update_eviction_mode();
+ return main_cleaner_should_run()
+ || cold_cleaner_should_run()
+ || trimmer->should_trim();
+ }
+
+ bool main_cleaner_should_run() const {
+ assert(is_ready());
+ return main_cleaner->should_clean_space() ||
+ (has_cold_tier() &&
+ main_cleaner->can_clean_space() &&
+ eviction_state.is_fast_mode());
+ }
+
+ bool cold_cleaner_should_run() const {
+ assert(is_ready());
+ return has_cold_tier() &&
+ cold_cleaner->should_clean_space();
+ }
+
+ bool should_block_io() const {
+ assert(is_ready());
+ return trimmer->should_block_io_on_trim() ||
+ main_cleaner->should_block_io_on_clean() ||
+ (has_cold_tier() &&
+ cold_cleaner->should_block_io_on_clean());
+ }
+
+ void maybe_update_eviction_mode() {
+ if (has_cold_tier()) {
+ auto main_alive_ratio = main_cleaner->get_stat().get_used_raw_ratio();
+ eviction_state.maybe_update_eviction_mode(main_alive_ratio);
+ }
+ }
+
+ struct eviction_state_t {
+ enum class eviction_mode_t {
+        STOP,     // generations greater than or equal to MIN_COLD_GENERATION
+                  // are set to MIN_COLD_GENERATION - 1, which means
+                  // no extents will be evicted.
+        DEFAULT,  // the generation is incremented with each rewrite. Extents
+                  // are evicted when the generation reaches MIN_COLD_GENERATION.
+        FAST,     // map all generations in
+                  // [MIN_REWRITE_GENERATION, MIN_COLD_GENERATION) to
+                  // MIN_COLD_GENERATION.
+ };
+
+ eviction_mode_t eviction_mode;
+ double stop_evict_ratio;
+ double default_evict_ratio;
+ double fast_evict_ratio;
+
+ void init(double stop_ratio,
+ double default_ratio,
+ double fast_ratio) {
+ ceph_assert(0 <= stop_ratio);
+ ceph_assert(stop_ratio < default_ratio);
+ ceph_assert(default_ratio < fast_ratio);
+ ceph_assert(fast_ratio <= 1);
+ eviction_mode = eviction_mode_t::STOP;
+ stop_evict_ratio = stop_ratio;
+ default_evict_ratio = default_ratio;
+ fast_evict_ratio = fast_ratio;
+ }
+
+ bool is_stop_mode() const {
+ return eviction_mode == eviction_mode_t::STOP;
+ }
+
+ bool is_default_mode() const {
+ return eviction_mode == eviction_mode_t::DEFAULT;
+ }
+
+ bool is_fast_mode() const {
+ return eviction_mode == eviction_mode_t::FAST;
+ }
+
+ rewrite_gen_t adjust_generation_with_eviction(rewrite_gen_t gen) {
+ rewrite_gen_t ret = gen;
+ switch(eviction_mode) {
+ case eviction_mode_t::STOP:
+ if (gen == MIN_COLD_GENERATION) {
+ ret = MIN_COLD_GENERATION - 1;
+ }
+ break;
+ case eviction_mode_t::DEFAULT:
+ break;
+ case eviction_mode_t::FAST:
+ if (gen >= MIN_REWRITE_GENERATION && gen < MIN_COLD_GENERATION) {
+ ret = MIN_COLD_GENERATION;
+ }
+ break;
+ default:
+ ceph_abort("impossible");
+ }
+ return ret;
+ }
+
+ // We change the state of eviction_mode according to the alive ratio
+ // of the main cleaner.
+ //
+ // Use A, B, C, D to represent the state of alive ratio:
+ // A: alive ratio <= stop_evict_ratio
+ // B: alive ratio <= default_evict_ratio
+ // C: alive ratio <= fast_evict_ratio
+ // D: alive ratio > fast_evict_ratio
+ //
+ // and use X, Y, Z to shorten the state of eviction_mode_t:
+ // X: STOP
+ // Y: DEFAULT
+ // Z: FAST
+ //
+ // Then we can use a form like (A && X) to describe the current state
+ // of the main cleaner, which indicates the alive ratio is less than or
+ // equal to stop_evict_ratio and current eviction mode is STOP.
+ //
+    // all valid state transitions are as follows:
+ // (A && X) => (B && X) => (C && Y) => (D && Z) =>
+ // (C && Z) => (B && Y) => (A && X)
+ // `--> (C && Y) => ...
+ //
+    // when the system restarts, the initial state is (_ && X) and the
+    // transitions are:
+ // (_ && X) -> (A && X) => normal transition
+ // -> (B && X) => normal transition
+ // -> (C && X) => (C && Y) => normal transition
+ // -> (D && X) => (D && Z) => normal transition
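+    //
+    // A minimal worked example, assuming hypothetical ratios
+    // stop_evict_ratio=0.5, default_evict_ratio=0.6, fast_evict_ratio=0.7,
+    // successive calls to maybe_update_eviction_mode would select:
+    //   alive ratio 0.40 -> STOP
+    //   alive ratio 0.65 -> DEFAULT
+    //   alive ratio 0.80 -> FAST
+    //   alive ratio 0.55 -> DEFAULT
+    //   alive ratio 0.45 -> STOP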
+ void maybe_update_eviction_mode(double main_alive_ratio) {
+ if (main_alive_ratio <= stop_evict_ratio) {
+ eviction_mode = eviction_mode_t::STOP;
+ } else if (main_alive_ratio <= default_evict_ratio) {
+ if (eviction_mode > eviction_mode_t::DEFAULT) {
+ eviction_mode = eviction_mode_t::DEFAULT;
+ }
+ } else if (main_alive_ratio <= fast_evict_ratio) {
+ if (eviction_mode < eviction_mode_t::DEFAULT) {
+ eviction_mode = eviction_mode_t::DEFAULT;
+ }
+ } else {
+ assert(main_alive_ratio > fast_evict_ratio);
+ eviction_mode = eviction_mode_t::FAST;
+ }
+ }
+ };
+
+ seastar::future<> do_background_cycle();
+
+ void register_metrics();
+
+ struct {
+ uint64_t io_blocking_num = 0;
+ uint64_t io_count = 0;
+ uint64_t io_blocked_count = 0;
+ uint64_t io_blocked_count_trim = 0;
+ uint64_t io_blocked_count_clean = 0;
+ uint64_t io_blocked_sum = 0;
+ } stats;
+ seastar::metrics::metric_group metrics;
+
+ JournalTrimmerImplRef trimmer;
+ AsyncCleanerRef main_cleaner;
+
+ /*
+ * cold tier (optional, see has_cold_tier())
+ */
+ AsyncCleanerRef cold_cleaner;
+ std::vector<AsyncCleaner*> cleaners_by_device_id;
+
+ std::optional<seastar::future<>> process_join;
+ std::optional<seastar::promise<>> blocking_background;
+ std::optional<seastar::promise<>> blocking_io;
+ bool is_running_until_halt = false;
+ state_t state = state_t::STOP;
+ eviction_state_t eviction_state;
+
+ friend class ::transaction_manager_test_t;
+ };
+
+ std::vector<ExtentOolWriterRef> writer_refs;
+ std::vector<ExtentOolWriter*> data_writers_by_gen;
+ // gen 0 METADATA writer is the journal writer
+ std::vector<ExtentOolWriter*> md_writers_by_gen;
+
+ std::vector<Device*> devices_by_id;
+ Device* primary_device = nullptr;
+ std::size_t num_devices = 0;
+
+ rewrite_gen_t dynamic_max_rewrite_generation = REWRITE_GENERATIONS;
+ BackgroundProcess background_process;
+ // TODO: drop once paddr->journal_seq_t is introduced
+ SegmentSeqAllocatorRef ool_segment_seq_allocator;
+
+ friend class ::transaction_manager_test_t;
+};
+
+using ExtentPlacementManagerRef = std::unique_ptr<ExtentPlacementManager>;
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::io_usage_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc
new file mode 100644
index 000000000..b0dc1b8c8
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager.cc
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <experimental/iterator>
+#include <iostream>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
+namespace crimson::os::seastore::extentmap_manager {
+
+ExtentMapManagerRef create_extentmap_manager(
+ TransactionManager &trans_manager) {
+ return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager));
+}
+
+}
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs)
+{
+ return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length
+ << "->" << rhs.laddr << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs)
+{
+ out << '[';
+ std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", "));
+ return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/journal.cc b/src/crimson/os/seastore/journal.cc
new file mode 100644
index 000000000..d4714cf3f
--- /dev/null
+++ b/src/crimson/os/seastore/journal.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal.h"
+#include "journal/segmented_journal.h"
+#include "journal/circular_bounded_journal.h"
+
+namespace crimson::os::seastore::journal {
+
+JournalRef make_segmented(
+ SegmentProvider &provider,
+ JournalTrimmer &trimmer)
+{
+ return std::make_unique<SegmentedJournal>(provider, trimmer);
+}
+
+JournalRef make_circularbounded(
+ JournalTrimmer &trimmer,
+ crimson::os::seastore::random_block_device::RBMDevice* device,
+ std::string path)
+{
+ return std::make_unique<CircularBoundedJournal>(trimmer, device, path);
+}
+
+}
diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h
new file mode 100644
index 000000000..18c0797a8
--- /dev/null
+++ b/src/crimson/os/seastore/journal.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include "crimson/os/seastore/ordering_handle.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_seq_allocator.h"
+
+namespace crimson::os::seastore {
+
+namespace random_block_device {
+class RBMDevice;
+}
+
+class SegmentManagerGroup;
+class SegmentProvider;
+class JournalTrimmer;
+
+class Journal {
+public:
+ virtual JournalTrimmer &get_trimmer() = 0;
+ /**
+ * initializes journal for mkfs writes -- must run prior to calls
+ * to submit_record.
+ */
+ using open_for_mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using open_for_mkfs_ret = open_for_mkfs_ertr::future<journal_seq_t>;
+ virtual open_for_mkfs_ret open_for_mkfs() = 0;
+
+ /**
+ * initializes journal for new writes -- must run prior to calls
+ * to submit_record. Should be called after replay if not a new
+ * Journal.
+ */
+ using open_for_mount_ertr = open_for_mkfs_ertr;
+ using open_for_mount_ret = open_for_mkfs_ret;
+ virtual open_for_mount_ret open_for_mount() = 0;
+
+ /// close journal
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual close_ertr::future<> close() = 0;
+
+ /**
+ * submit_record
+ *
+ * write record with the ordering handle
+ */
+ using submit_record_ertr = crimson::errorator<
+ crimson::ct_error::erange,
+ crimson::ct_error::input_output_error
+ >;
+ using submit_record_ret = submit_record_ertr::future<
+ record_locator_t
+ >;
+ virtual submit_record_ret submit_record(
+ record_t &&record,
+ OrderingHandle &handle
+ ) = 0;
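+
+  // A minimal usage sketch (assuming a populated record_t `rec` and an
+  // OrderingHandle `handle` already obtained from the transaction; error
+  // handling omitted):
+  //
+  //   journal->submit_record(std::move(rec), handle
+  //   ).safe_then([](record_locator_t loc) {
+  //     // loc.record_block_base is where the record's blocks were written
+  //   });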
+
+ /**
+ * flush
+ *
+ * Wait for all outstanding IOs on handle to commit.
+ * Note, flush() machinery must go through the same pipeline
+ * stages and locks as submit_record.
+ */
+ virtual seastar::future<> flush(OrderingHandle &handle) = 0;
+
+ /// sets write pipeline reference
+ virtual void set_write_pipeline(WritePipeline *_write_pipeline) = 0;
+
+ /**
+ * Read deltas and pass to delta_handler
+ *
+   * record_block_start (argument to delta_handler) is the start of the
+   * first block in the record
+ */
+ using replay_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ using replay_ret = replay_ertr::future<>;
+ using delta_handler_t = std::function<
+ replay_ertr::future<bool>(
+ const record_locator_t&,
+ const delta_info_t&,
+ const journal_seq_t&, // dirty_tail
+ const journal_seq_t&, // alloc_tail
+ sea_time_point modify_time)>;
+ virtual replay_ret replay(
+ delta_handler_t &&delta_handler) = 0;
+
+ virtual seastar::future<> finish_commit(
+ transaction_type_t type) = 0;
+
+ virtual ~Journal() {}
+
+ virtual journal_type_t get_type() = 0;
+};
+using JournalRef = std::unique_ptr<Journal>;
+
+namespace journal {
+
+JournalRef make_segmented(
+ SegmentProvider &provider,
+ JournalTrimmer &trimmer);
+
+JournalRef make_circularbounded(
+ JournalTrimmer &trimmer,
+ crimson::os::seastore::random_block_device::RBMDevice* device,
+ std::string path);
+
+}
+
+}
diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc
new file mode 100644
index 000000000..ec41bfab1
--- /dev/null
+++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc
@@ -0,0 +1,387 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/common/errorator-loop.h"
+#include "include/intarith.h"
+#include "crimson/os/seastore/async_cleaner.h"
+#include "crimson/os/seastore/journal/circular_bounded_journal.h"
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/journal/circular_journal_space.h"
+
+SET_SUBSYS(seastore_journal);
+
+namespace crimson::os::seastore::journal {
+
+CircularBoundedJournal::CircularBoundedJournal(
+ JournalTrimmer &trimmer,
+ RBMDevice* device,
+ const std::string &path)
+ : trimmer(trimmer), path(path),
+ cjs(device),
+ record_submitter(crimson::common::get_conf<uint64_t>(
+ "seastore_journal_iodepth_limit"),
+ crimson::common::get_conf<uint64_t>(
+ "seastore_journal_batch_capacity"),
+ crimson::common::get_conf<Option::size_t>(
+ "seastore_journal_batch_flush_size"),
+ crimson::common::get_conf<double>(
+ "seastore_journal_batch_preferred_fullness"),
+ cjs)
+ {}
+
+CircularBoundedJournal::open_for_mkfs_ret
+CircularBoundedJournal::open_for_mkfs()
+{
+ return record_submitter.open(true
+ ).safe_then([this](auto ret) {
+ return open_for_mkfs_ret(
+ open_for_mkfs_ertr::ready_future_marker{},
+ get_written_to());
+ });
+}
+
+CircularBoundedJournal::open_for_mount_ret
+CircularBoundedJournal::open_for_mount()
+{
+ return record_submitter.open(false
+ ).safe_then([this](auto ret) {
+ return open_for_mount_ret(
+ open_for_mount_ertr::ready_future_marker{},
+ get_written_to());
+ });
+}
+
+CircularBoundedJournal::close_ertr::future<> CircularBoundedJournal::close()
+{
+ return record_submitter.close();
+}
+
+CircularBoundedJournal::submit_record_ret
+CircularBoundedJournal::submit_record(
+ record_t &&record,
+ OrderingHandle &handle)
+{
+ LOG_PREFIX(CircularBoundedJournal::submit_record);
+ DEBUG("H{} {} start ...", (void*)&handle, record);
+ assert(write_pipeline);
+ return do_submit_record(std::move(record), handle);
+}
+
+CircularBoundedJournal::submit_record_ret
+CircularBoundedJournal::do_submit_record(
+ record_t &&record,
+ OrderingHandle &handle)
+{
+ LOG_PREFIX(CircularBoundedJournal::do_submit_record);
+ if (!record_submitter.is_available()) {
+ DEBUG("H{} wait ...", (void*)&handle);
+ return record_submitter.wait_available(
+ ).safe_then([this, record=std::move(record), &handle]() mutable {
+ return do_submit_record(std::move(record), handle);
+ });
+ }
+ auto action = record_submitter.check_action(record.size);
+ if (action == RecordSubmitter::action_t::ROLL) {
+ return record_submitter.roll_segment(
+ ).safe_then([this, record=std::move(record), &handle]() mutable {
+ return do_submit_record(std::move(record), handle);
+ });
+ }
+
+ DEBUG("H{} submit {} ...",
+ (void*)&handle,
+ action == RecordSubmitter::action_t::SUBMIT_FULL ?
+ "FULL" : "NOT_FULL");
+ auto submit_fut = record_submitter.submit(std::move(record));
+ return handle.enter(write_pipeline->device_submission
+ ).then([submit_fut=std::move(submit_fut)]() mutable {
+ return std::move(submit_fut);
+ }).safe_then([FNAME, this, &handle](record_locator_t result) {
+ return handle.enter(write_pipeline->finalize
+ ).then([FNAME, this, result, &handle] {
+ DEBUG("H{} finish with {}", (void*)&handle, result);
+ auto new_committed_to = result.write_result.get_end_seq();
+ record_submitter.update_committed_to(new_committed_to);
+ return result;
+ });
+ });
+}
+
+Journal::replay_ret CircularBoundedJournal::replay_segment(
+ cbj_delta_handler_t &handler, scan_valid_records_cursor& cursor)
+{
+ LOG_PREFIX(Journal::replay_segment);
+ return seastar::do_with(
+ RecordScanner::found_record_handler_t(
+ [this, &handler, FNAME](
+ record_locator_t locator,
+ const record_group_header_t& r_header,
+ const bufferlist& mdbuf)
+ -> RecordScanner::scan_valid_records_ertr::future<>
+ {
+ auto maybe_record_deltas_list = try_decode_deltas(
+ r_header, mdbuf, locator.record_block_base);
+ if (!maybe_record_deltas_list) {
+        // This should be impossible; we already checked the crc on the mdbuf
+ ERROR("unable to decode deltas for record {} at {}",
+ r_header, locator.record_block_base);
+ return crimson::ct_error::input_output_error::make();
+ }
+ assert(locator.write_result.start_seq != JOURNAL_SEQ_NULL);
+ auto cursor_addr = convert_paddr_to_abs_addr(locator.write_result.start_seq.offset);
+ DEBUG("{} at {}", r_header, cursor_addr);
+ journal_seq_t start_seq = locator.write_result.start_seq;
+ auto write_result = write_result_t{
+ start_seq,
+ r_header.mdlength + r_header.dlength
+ };
+ auto expected_seq = locator.write_result.start_seq.segment_seq;
+ cursor_addr += (r_header.mdlength + r_header.dlength);
+ if (cursor_addr >= get_journal_end()) {
+ cursor_addr = get_records_start();
+ ++expected_seq;
+ paddr_t addr = convert_abs_addr_to_paddr(
+ cursor_addr,
+ get_device_id());
+ write_result.start_seq.offset = addr;
+ write_result.start_seq.segment_seq = expected_seq;
+ }
+ paddr_t addr = convert_abs_addr_to_paddr(
+ cursor_addr,
+ get_device_id());
+ set_written_to(
+ journal_seq_t{expected_seq, addr});
+ return seastar::do_with(
+ std::move(*maybe_record_deltas_list),
+ [write_result,
+ &handler,
+ FNAME](auto& record_deltas_list) {
+ return crimson::do_for_each(
+ record_deltas_list,
+ [write_result,
+ &handler, FNAME](record_deltas_t& record_deltas) {
+ auto locator = record_locator_t{
+ record_deltas.record_block_base,
+ write_result
+ };
+ DEBUG("processing {} deltas at block_base {}",
+ record_deltas.deltas.size(),
+ locator);
+ return crimson::do_for_each(
+ record_deltas.deltas,
+ [locator,
+ &handler](auto& p) {
+ auto& modify_time = p.first;
+ auto& delta = p.second;
+ return handler(
+ locator,
+ delta,
+ modify_time).discard_result();
+ });
+ });
+ });
+ }),
+ [=, this, &cursor](auto &dhandler) {
+ return scan_valid_records(
+ cursor,
+ cjs.get_cbj_header().magic,
+ std::numeric_limits<size_t>::max(),
+ dhandler).safe_then([](auto){}
+ ).handle_error(
+ replay_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "shouldn't meet with any other error other replay_ertr"
+ }
+ );
+ }
+ );
+}
+
+
+Journal::replay_ret CircularBoundedJournal::scan_valid_record_delta(
+ cbj_delta_handler_t &&handler, journal_seq_t tail)
+{
+ LOG_PREFIX(Journal::scan_valid_record_delta);
+ INFO("starting at {} ", tail);
+ return seastar::do_with(
+ scan_valid_records_cursor(tail),
+ std::move(handler),
+ bool(false),
+ [this] (auto &cursor, auto &handler, auto &rolled) {
+ return crimson::repeat([this, &handler, &cursor, &rolled]()
+ -> replay_ertr::future<seastar::stop_iteration>
+ {
+ return replay_segment(handler, cursor
+ ).safe_then([this, &cursor, &rolled] {
+ if (!rolled) {
+ cursor.last_valid_header_found = false;
+ }
+ if (!cursor.is_complete()) {
+ try_read_rolled_header(cursor);
+ rolled = true;
+ return replay_ertr::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::no);
+ }
+ return replay_ertr::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::yes);
+ });
+ });
+ });
+}
+
+RecordScanner::read_ret CircularBoundedJournal::read(paddr_t start, size_t len)
+{
+ LOG_PREFIX(CircularBoundedJournal::read);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(start);
+ DEBUG("reading data from addr {} read length {}", addr, len);
+ auto bptr = bufferptr(ceph::buffer::create_page_aligned(len));
+ return cjs.read(addr, bptr
+ ).safe_then([bptr=std::move(bptr)]() {
+ return read_ret(
+ RecordScanner::read_ertr::ready_future_marker{},
+ std::move(bptr)
+ );
+ });
+}
+
+bool CircularBoundedJournal::is_record_segment_seq_invalid(
+ scan_valid_records_cursor &cursor,
+ record_group_header_t &r_header)
+{
+ LOG_PREFIX(CircularBoundedJournal::is_record_segment_seq_invalid);
+ auto print_invalid = [FNAME](auto &r_header) {
+ DEBUG("invalid header: {}", r_header);
+ return true;
+ };
+ if (cursor.seq.offset == convert_abs_addr_to_paddr(
+ get_records_start(), get_device_id())) {
+ if ((r_header.committed_to.segment_seq == NULL_SEG_SEQ &&
+ cursor.seq.segment_seq != 0) ||
+ r_header.committed_to.segment_seq != cursor.seq.segment_seq - 1) {
+ return print_invalid(r_header);
+ }
+ } else if (r_header.committed_to.segment_seq != cursor.seq.segment_seq) {
+ /*
+     * Assuming that seastore issues several records using submit_record()
+     * as shown in the following example.
+ *
+ * Example )
+ * a. submit_record(a);
+ * b. submit_record(b);
+ * c. submit_record(c);
+ * d. roll to begin
+ * e. submit_record(d);
+ * f. submit_record(e);
+ * g. submit_record(f);
+ *
+     * In this example, we need to consider two cases.
+     * case 1)
+     *  records a - e were issued in a batched manner
+     * case 2)
+     *  When submit_record(e) starts at step f, submit_record(b) has completed its finalize phase,
+     *  so the header of e's committed_to points to the end of b.
+ *
+ * To handle these cases correctly, the following condition is added.
+ */
+ if ((r_header.committed_to.offset >= cursor.last_committed.offset &&
+ r_header.committed_to.segment_seq == cursor.last_committed.segment_seq) &&
+ r_header.committed_to.segment_seq == cursor.seq.segment_seq - 1) {
+ return false;
+ }
+ return print_invalid(r_header);
+ }
+ return false;
+}
+
+Journal::replay_ret CircularBoundedJournal::replay(
+ delta_handler_t &&delta_handler)
+{
+ /*
+   * read the records from the last applied record up to written_to, and replay them
+ */
+ LOG_PREFIX(CircularBoundedJournal::replay);
+ return cjs.read_header(
+ ).handle_error(
+ open_for_mount_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error read_header"
+ }).safe_then([this, FNAME, delta_handler=std::move(delta_handler)](auto p)
+ mutable {
+ auto &[head, bl] = *p;
+ cjs.set_cbj_header(head);
+ DEBUG("header : {}", cjs.get_cbj_header());
+ cjs.set_initialized(true);
+ return seastar::do_with(
+ std::move(delta_handler),
+ std::map<paddr_t, journal_seq_t>(),
+ [this](auto &d_handler, auto &map) {
+ auto build_paddr_seq_map = [&map](
+ const auto &offsets,
+ const auto &e,
+ sea_time_point modify_time)
+ {
+ if (e.type == extent_types_t::ALLOC_INFO) {
+ alloc_delta_t alloc_delta;
+ decode(alloc_delta, e.bl);
+ if (alloc_delta.op == alloc_delta_t::op_types_t::CLEAR) {
+ for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) {
+ map[alloc_blk.paddr] = offsets.write_result.start_seq;
+ }
+ }
+ }
+ return replay_ertr::make_ready_future<bool>(true);
+ };
+ auto tail = get_dirty_tail() <= get_alloc_tail() ?
+ get_dirty_tail() : get_alloc_tail();
+ set_written_to(tail);
+ // The first pass to build the paddr->journal_seq_t map
+ // from extent allocations
+ return scan_valid_record_delta(std::move(build_paddr_seq_map), tail
+ ).safe_then([this, &map, &d_handler, tail]() {
+ auto call_d_handler_if_valid = [this, &map, &d_handler](
+ const auto &offsets,
+ const auto &e,
+ sea_time_point modify_time)
+ {
+ if (map.find(e.paddr) == map.end() ||
+ map[e.paddr] <= offsets.write_result.start_seq) {
+ return d_handler(
+ offsets,
+ e,
+ get_dirty_tail(),
+ get_alloc_tail(),
+ modify_time
+ );
+ }
+ return replay_ertr::make_ready_future<bool>(true);
+ };
+ // The second pass to replay deltas
+ return scan_valid_record_delta(std::move(call_d_handler_if_valid), tail);
+ });
+ }).safe_then([this]() {
+    // make sure that committed_to is JOURNAL_SEQ_NULL if the journal is in its initial state
+ if (get_written_to() !=
+ journal_seq_t{0,
+ convert_abs_addr_to_paddr(get_records_start(),
+ get_device_id())}) {
+ record_submitter.update_committed_to(get_written_to());
+ }
+ trimmer.update_journal_tails(
+ get_dirty_tail(),
+ get_alloc_tail());
+ });
+ });
+}
+
+seastar::future<> CircularBoundedJournal::finish_commit(transaction_type_t type) {
+ if (is_trim_transaction(type)) {
+ return update_journal_tail(
+ trimmer.get_dirty_tail(),
+ trimmer.get_alloc_tail());
+ }
+ return seastar::now();
+}
+
+}
diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.h b/src/crimson/os/seastore/journal/circular_bounded_journal.h
new file mode 100644
index 000000000..debe535ae
--- /dev/null
+++ b/src/crimson/os/seastore/journal/circular_bounded_journal.h
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+
+#include <boost/intrusive_ptr.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+#include "include/denc.h"
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/os/seastore/journal.h"
+#include "include/uuid.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/random_block_manager/rbm_device.h"
+#include <list>
+#include "crimson/os/seastore/journal/record_submitter.h"
+#include "crimson/os/seastore/journal/circular_journal_space.h"
+#include "crimson/os/seastore/record_scanner.h"
+
+namespace crimson::os::seastore::journal {
+
+using RBMDevice = random_block_device::RBMDevice;
+
+/**
+ * CircularBoundedJournal
+ *
+ * CircularBoundedJournal (CBJournal) is a journal that works like a circular
+ * queue. With CBJournal, Seastore appends records that are small (most likely
+ * metadata), at which point the head (written_to) is moved forward. Then,
+ * eventually, Seastore applies the records in CBJournal to RBM (TODO).
+ *
+ * - Commit time
+ * After submit_record is done, written_to (an in-memory value) is increased
+ * ---written_to represents where the next record will be appended. Note that
+ * applied_to is not changed here.
+ *
+ * - Replay time
+ * At replay time, CBJournal replays the records in CBJournal by reading them
+ * from dirty_tail. CBJournal then examines whether each record is valid,
+ * one by one, and written_to is recovered once a valid record is found.
+ * Note that applied_to is stored permanently when the apply work---applying
+ * the records in CBJournal to RBM---is done by CBJournal (TODO).
+ *
+ * TODO: apply records from CircularBoundedJournal to RandomBlockManager
+ *
+ */
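+
+/*
+ * Typical lifecycle, as an illustrative sketch derived from the interfaces
+ * below (the trimmer, device, path, record and ordering handle are assumed
+ * to be provided by the caller; each call returns a future that must be
+ * awaited before the next step):
+ *
+ *   CircularBoundedJournal cbj(trimmer, device, path);
+ *   cbj.open_for_mkfs();                        // mkfs: write the initial header
+ *   cbj.submit_record(std::move(rec), handle);  // append; resolves to a record_locator_t
+ *   cbj.close();
+ *
+ *   // on a later mount:
+ *   cbj.open_for_mount();
+ *   cbj.replay(std::move(delta_handler));       // recover written_to and replay deltas
+ */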
+
+constexpr uint64_t DEFAULT_BLOCK_SIZE = 4096;
+
+class CircularBoundedJournal : public Journal, RecordScanner {
+public:
+ CircularBoundedJournal(
+ JournalTrimmer &trimmer, RBMDevice* device, const std::string &path);
+ ~CircularBoundedJournal() {}
+
+ JournalTrimmer &get_trimmer() final {
+ return trimmer;
+ }
+
+ open_for_mkfs_ret open_for_mkfs() final;
+
+ open_for_mount_ret open_for_mount() final;
+
+ close_ertr::future<> close() final;
+
+ journal_type_t get_type() final {
+ return journal_type_t::RANDOM_BLOCK;
+ }
+
+ submit_record_ret submit_record(
+ record_t &&record,
+ OrderingHandle &handle
+ ) final;
+
+ seastar::future<> flush(
+ OrderingHandle &handle
+ ) final {
+ // TODO
+ return seastar::now();
+ }
+
+ replay_ret replay(delta_handler_t &&delta_handler) final;
+
+ rbm_abs_addr get_rbm_addr(journal_seq_t seq) const {
+ return convert_paddr_to_abs_addr(seq.offset);
+ }
+
+ /**
+ *
+ * CircularBoundedJournal write
+ *
+ * NVMe will support large block writes (< 512KB) with the atomic write unit command.
+ * With this command, we expect that most of the incoming data can be stored
+ * with a single write call, which has lower overhead than the existing
+ * approach that uses a combination of system calls such as write() and sync().
+ *
+ */
+
+ seastar::future<> update_journal_tail(
+ journal_seq_t dirty,
+ journal_seq_t alloc) {
+ return cjs.update_journal_tail(dirty, alloc);
+ }
+ journal_seq_t get_dirty_tail() const {
+ return cjs.get_dirty_tail();
+ }
+ journal_seq_t get_alloc_tail() const {
+ return cjs.get_alloc_tail();
+ }
+
+ void set_write_pipeline(WritePipeline *_write_pipeline) final {
+ write_pipeline = _write_pipeline;
+ }
+
+ device_id_t get_device_id() const {
+ return cjs.get_device_id();
+ }
+ extent_len_t get_block_size() const {
+ return cjs.get_block_size();
+ }
+
+ rbm_abs_addr get_journal_end() const {
+ return cjs.get_journal_end();
+ }
+
+ void set_written_to(journal_seq_t seq) {
+ cjs.set_written_to(seq);
+ }
+
+ journal_seq_t get_written_to() {
+ return cjs.get_written_to();
+ }
+
+ rbm_abs_addr get_records_start() const {
+ return cjs.get_records_start();
+ }
+
+ seastar::future<> finish_commit(transaction_type_t type) final;
+
+ using cbj_delta_handler_t = std::function<
+ replay_ertr::future<bool>(
+ const record_locator_t&,
+ const delta_info_t&,
+ sea_time_point modify_time)>;
+
+ Journal::replay_ret scan_valid_record_delta(
+ cbj_delta_handler_t &&delta_handler,
+ journal_seq_t tail);
+
+ submit_record_ret do_submit_record(record_t &&record, OrderingHandle &handle);
+
+ void try_read_rolled_header(scan_valid_records_cursor &cursor) {
+ paddr_t addr = convert_abs_addr_to_paddr(
+ get_records_start(),
+ get_device_id());
+ cursor.seq.offset = addr;
+ cursor.seq.segment_seq += 1;
+ }
+
+ void initialize_cursor(scan_valid_records_cursor& cursor) final {
+ cursor.block_size = get_block_size();
+ };
+
+ Journal::replay_ret replay_segment(
+ cbj_delta_handler_t &handler, scan_valid_records_cursor& cursor);
+
+ read_ret read(paddr_t start, size_t len) final;
+
+ bool is_record_segment_seq_invalid(scan_valid_records_cursor &cursor,
+ record_group_header_t &h) final;
+
+ int64_t get_segment_end_offset(paddr_t addr) final {
+ return get_journal_end();
+ }
+
+ // Test interfaces
+
+ CircularJournalSpace& get_cjs() {
+ return cjs;
+ }
+
+ read_validate_record_metadata_ret test_read_validate_record_metadata(
+ scan_valid_records_cursor &cursor,
+ segment_nonce_t nonce)
+ {
+ return read_validate_record_metadata(cursor, nonce);
+ }
+
+ void test_initialize_cursor(scan_valid_records_cursor &cursor)
+ {
+ initialize_cursor(cursor);
+ }
+
+private:
+ JournalTrimmer &trimmer;
+ std::string path;
+ WritePipeline *write_pipeline = nullptr;
+ /**
+ * initialized
+ *
+ * true after open_device_read_header, set to false in close().
+ * Indicates that device is open and in-memory header is valid.
+ */
+ bool initialized = false;
+
+ // start address where the newest record will be written
+ // should be in range [get_records_start(), get_journal_end())
+  // written_to.segment_seq is the circulation seq used to track
+  // the sequence of written records
+ CircularJournalSpace cjs;
+ RecordSubmitter record_submitter;
+};
+
+}
+
diff --git a/src/crimson/os/seastore/journal/circular_journal_space.cc b/src/crimson/os/seastore/journal/circular_journal_space.cc
new file mode 100644
index 000000000..123bb9135
--- /dev/null
+++ b/src/crimson/os/seastore/journal/circular_journal_space.cc
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "circular_journal_space.h"
+
+#include <fmt/format.h>
+#include <fmt/os.h>
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/async_cleaner.h"
+#include "crimson/os/seastore/journal/circular_bounded_journal.h"
+
+SET_SUBSYS(seastore_journal);
+
+namespace crimson::os::seastore::journal {
+
+std::ostream &operator<<(std::ostream &out,
+ const CircularJournalSpace::cbj_header_t &header)
+{
+ return out << "cbj_header_t("
+ << "dirty_tail=" << header.dirty_tail
+ << ", alloc_tail=" << header.alloc_tail
+ << ", magic=" << header.magic
+ << ")";
+}
+
+CircularJournalSpace::CircularJournalSpace(RBMDevice * device) : device(device) {}
+
+bool CircularJournalSpace::needs_roll(std::size_t length) const {
+ if (length + get_rbm_addr(get_written_to()) > get_journal_end()) {
+ return true;
+ }
+ return false;
+}
+
+extent_len_t CircularJournalSpace::get_block_size() const {
+ return device->get_block_size();
+}
+
+CircularJournalSpace::roll_ertr::future<> CircularJournalSpace::roll() {
+ paddr_t paddr = convert_abs_addr_to_paddr(
+ get_records_start(),
+ get_device_id());
+ auto seq = get_written_to();
+ seq.segment_seq++;
+ assert(seq.segment_seq < MAX_SEG_SEQ);
+ set_written_to(
+ journal_seq_t{seq.segment_seq, paddr});
+ return roll_ertr::now();
+}
+
+CircularJournalSpace::write_ret
+CircularJournalSpace::write(ceph::bufferlist&& to_write) {
+ LOG_PREFIX(CircularJournalSpace::write);
+ assert(get_written_to().segment_seq != NULL_SEG_SEQ);
+ auto encoded_size = to_write.length();
+ if (encoded_size > get_records_available_size()) {
+ ceph_abort("should be impossible with EPM reservation");
+ }
+ assert(encoded_size + get_rbm_addr(get_written_to())
+ < get_journal_end());
+
+ journal_seq_t j_seq = get_written_to();
+ auto target = get_rbm_addr(get_written_to());
+ auto new_written_to = target + encoded_size;
+ assert(new_written_to < get_journal_end());
+ paddr_t paddr = convert_abs_addr_to_paddr(
+ new_written_to,
+ get_device_id());
+ set_written_to(
+ journal_seq_t{get_written_to().segment_seq, paddr});
+ DEBUG("{}, target {}", to_write.length(), target);
+
+ auto write_result = write_result_t{
+ j_seq,
+ encoded_size
+ };
+ return device_write_bl(target, to_write
+ ).safe_then([this, target,
+ length=encoded_size,
+ write_result,
+ FNAME] {
+ DEBUG("commit target {} used_size {} written length {}",
+ target, get_records_used_size(), length);
+ return write_result;
+ }).handle_error(
+ base_ertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error" }
+ );
+}
+
+segment_nonce_t calc_new_nonce(
+ uint32_t crc,
+ unsigned char const *data,
+ unsigned length)
+{
+ crc &= std::numeric_limits<uint32_t>::max() >> 1;
+ return ceph_crc32c(crc, data, length);
+}
+
+CircularJournalSpace::open_ret CircularJournalSpace::open(bool is_mkfs) {
+ std::ostringstream oss;
+ oss << device_id_printer_t{get_device_id()};
+ print_name = oss.str();
+
+ if (is_mkfs) {
+ LOG_PREFIX(CircularJournalSpace::open);
+ assert(device);
+ ceph::bufferlist bl;
+ CircularJournalSpace::cbj_header_t head;
+ assert(device->get_journal_size());
+ head.dirty_tail =
+ journal_seq_t{0,
+ convert_abs_addr_to_paddr(
+ get_records_start(),
+ device->get_device_id())};
+ head.alloc_tail = head.dirty_tail;
+ auto meta = device->get_meta();
+ head.magic = calc_new_nonce(
+ std::rand() % std::numeric_limits<uint32_t>::max(),
+ reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()),
+ sizeof(meta.seastore_id.uuid));
+ encode(head, bl);
+ header = head;
+ set_written_to(head.dirty_tail);
+ initialized = true;
+ DEBUG(
+ "initialize header block in CircularJournalSpace length {}, head: {}",
+ bl.length(), header);
+ return write_header(
+ ).safe_then([this]() {
+ return open_ret(
+ open_ertr::ready_future_marker{},
+ get_written_to());
+ }).handle_error(
+ open_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error write_header"
+ }
+ );
+ }
+ ceph_assert(initialized);
+ if (written_to.segment_seq == NULL_SEG_SEQ) {
+ written_to.segment_seq = 0;
+ }
+ return open_ret(
+ open_ertr::ready_future_marker{},
+ get_written_to());
+}
+
+ceph::bufferlist CircularJournalSpace::encode_header()
+{
+ bufferlist bl;
+ encode(header, bl);
+ auto header_crc_filler = bl.append_hole(sizeof(checksum_t));
+ auto bliter = bl.cbegin();
+ auto header_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<cbj_header_t>(),
+ -1);
+ ceph_le32 header_crc_le;
+ header_crc_le = header_crc;
+ header_crc_filler.copy_in(
+ sizeof(checksum_t),
+ reinterpret_cast<const char *>(&header_crc_le));
+ return bl;
+}
+
+CircularJournalSpace::write_ertr::future<> CircularJournalSpace::device_write_bl(
+ rbm_abs_addr offset, bufferlist &bl)
+{
+ LOG_PREFIX(CircularJournalSpace::device_write_bl);
+ auto length = bl.length();
+ if (offset + length > get_journal_end()) {
+ return crimson::ct_error::erange::make();
+ }
+ DEBUG(
+ "overwrite in CircularJournalSpace, offset {}, length {}",
+ offset,
+ length);
+ return device->writev(offset, bl
+ ).handle_error(
+ write_ertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error device->write" }
+ );
+}
+
+CircularJournalSpace::read_header_ret
+CircularJournalSpace::read_header()
+{
+ LOG_PREFIX(CircularJournalSpace::read_header);
+ assert(device);
+ auto bptr = bufferptr(ceph::buffer::create_page_aligned(
+ device->get_block_size()));
+ DEBUG("reading {}", device->get_shard_journal_start());
+ return device->read(device->get_shard_journal_start(), bptr
+ ).safe_then([bptr, FNAME]() mutable
+ -> read_header_ret {
+ bufferlist bl;
+ bl.append(bptr);
+ auto bp = bl.cbegin();
+ cbj_header_t cbj_header;
+ try {
+ decode(cbj_header, bp);
+ } catch (ceph::buffer::error &e) {
+ ERROR("unable to read header block");
+ return crimson::ct_error::enoent::make();
+ }
+ auto bliter = bl.cbegin();
+ auto test_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<cbj_header_t>(),
+ -1);
+ ceph_le32 recorded_crc_le;
+ decode(recorded_crc_le, bliter);
+ uint32_t recorded_crc = recorded_crc_le;
+ if (test_crc != recorded_crc) {
+ ERROR("error, header crc mismatch.");
+ return read_header_ret(
+ read_header_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ return read_header_ret(
+ read_header_ertr::ready_future_marker{},
+ std::make_pair(cbj_header, bl)
+ );
+ });
+}
+
+CircularJournalSpace::write_ertr::future<>
+CircularJournalSpace::write_header()
+{
+ LOG_PREFIX(CircularJournalSpace::write_header);
+ ceph::bufferlist bl = encode_header();
+ ceph_assert(bl.length() <= get_block_size());
+ DEBUG(
+ "sync header of CircularJournalSpace, length {}",
+ bl.length());
+ assert(device);
+ auto iter = bl.begin();
+ assert(bl.length() < get_block_size());
+ bufferptr bp = bufferptr(ceph::buffer::create_page_aligned(get_block_size()));
+ iter.copy(bl.length(), bp.c_str());
+ return device->write(device->get_shard_journal_start(), std::move(bp)
+ ).handle_error(
+ write_ertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error device->write" }
+ );
+}
+
+}
diff --git a/src/crimson/os/seastore/journal/circular_journal_space.h b/src/crimson/os/seastore/journal/circular_journal_space.h
new file mode 100644
index 000000000..c88b65ad5
--- /dev/null
+++ b/src/crimson/os/seastore/journal/circular_journal_space.h
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <optional>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/journal.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/random_block_manager/rbm_device.h"
+#include "crimson/os/seastore/journal/record_submitter.h"
+#include "crimson/os/seastore/async_cleaner.h"
+
+namespace crimson::os::seastore {
+ class SegmentProvider;
+ class JournalTrimmer;
+}
+
+namespace crimson::os::seastore::journal {
+
+class CircularBoundedJournal;
+class CircularJournalSpace : public JournalAllocator {
+
+ public:
+ const std::string& get_name() const final {
+ return print_name;
+ }
+
+ extent_len_t get_block_size() const final;
+
+ bool can_write() const final {
+ return (device != nullptr);
+ }
+
+ segment_nonce_t get_nonce() const final {
+ return header.magic;
+ }
+
+ bool needs_roll(std::size_t length) const final;
+
+ roll_ertr::future<> roll() final;
+
+ write_ret write(ceph::bufferlist&& to_write) final;
+
+ void update_modify_time(record_t& record) final {}
+
+ close_ertr::future<> close() final {
+ return write_header(
+ ).safe_then([this]() -> close_ertr::future<> {
+ initialized = false;
+ return close_ertr::now();
+ }).handle_error(
+ Journal::open_for_mount_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error write_header"
+ }
+ );
+ }
+
+ open_ret open(bool is_mkfs) final;
+
+ public:
+ CircularJournalSpace(RBMDevice * device);
+
+ struct cbj_header_t;
+ using write_ertr = Journal::submit_record_ertr;
+ /*
+ * device_write_bl
+ *
+   * @param offset device address to write to
+   * @param bl bufferlist to write
+ *
+ */
+ write_ertr::future<> device_write_bl(rbm_abs_addr offset, ceph::bufferlist &bl);
+
+ using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ using read_header_ertr = read_ertr;
+ using read_header_ret = read_header_ertr::future<
+ std::optional<std::pair<cbj_header_t, bufferlist>>
+ >;
+ /*
+ * read_header
+ *
+   * read the header block from the start of the journal space
+   *
+ */
+ read_header_ret read_header();
+
+ ceph::bufferlist encode_header();
+
+ write_ertr::future<> write_header();
+
+
+ /**
+ * CircularBoundedJournal structure
+ *
+ * +-------------------------------------------------------+
+ * | header | record | record | record | record | ... |
+ * +-------------------------------------------------------+
+ * ^-----------block aligned-----------------^
+ * <----fixed---->
+ */
+
+ struct cbj_header_t {
+ // start offset of CircularBoundedJournal in the device
+ journal_seq_t dirty_tail;
+ journal_seq_t alloc_tail;
+ segment_nonce_t magic;
+
+ DENC(cbj_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.dirty_tail, p);
+ denc(v.alloc_tail, p);
+ denc(v.magic, p);
+ DENC_FINISH(p);
+ }
+ };
+
+ /**
+ *
+ * Write position for CircularBoundedJournal
+ *
+ * | written to rbm | written length to CircularBoundedJournal | new write |
+ * ----------------->------------------------------------------------>
+ * ^ ^
+ * applied_to written_to
+ *
+ */
+
+ journal_seq_t get_written_to() const {
+ return written_to;
+ }
+ rbm_abs_addr get_rbm_addr(journal_seq_t seq) const {
+ return convert_paddr_to_abs_addr(seq.offset);
+ }
+ void set_written_to(journal_seq_t seq) {
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(seq.offset);
+ assert(addr >= get_records_start());
+ assert(addr < get_journal_end());
+ written_to = seq;
+ }
+ device_id_t get_device_id() const {
+ return device->get_device_id();
+ }
+
+ journal_seq_t get_dirty_tail() const {
+ return header.dirty_tail;
+ }
+ journal_seq_t get_alloc_tail() const {
+ return header.alloc_tail;
+ }
+
+ /*
+ Size-related interfaces
+ +---------------------------------------------------------+
+ | header | record | record | record | record | ... |
+ +---------------------------------------------------------+
+ ^ ^ ^
+ | | |
+ get_journal_start | get_journal_end
+ get_records_start
+ <-- get_records_total_size + block_size -->
+ <--------------- get_journal_size ------------------------>
+ */
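+
+  /*
+   * Worked example (values are illustrative, not defaults): with a 4 KiB
+   * block size and a 16 MiB journal, get_records_start() is
+   * shard_journal_start + 4 KiB, get_journal_end() is
+   * shard_journal_start + 16 MiB, and get_records_total_size() is
+   * 16 MiB - 2 * 4 KiB (one block for the header, one block reserved to
+   * denote the end).
+   */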
+
+ size_t get_records_used_size() const {
+ auto rbm_written_to = get_rbm_addr(get_written_to());
+ auto rbm_tail = get_rbm_addr(get_dirty_tail());
+ return rbm_written_to >= rbm_tail ?
+ rbm_written_to - rbm_tail :
+ rbm_written_to + get_records_total_size() + get_block_size()
+ - rbm_tail;
+ }
+ size_t get_records_total_size() const {
+ assert(device);
+    // one block is for the header and one block is reserved to denote the end
+ return device->get_journal_size() - (2 * get_block_size());
+ }
+ rbm_abs_addr get_records_start() const {
+ assert(device);
+ return device->get_shard_journal_start() + get_block_size();
+ }
+ size_t get_records_available_size() const {
+ return get_records_total_size() - get_records_used_size();
+ }
+ bool is_available_size(uint64_t size) {
+ auto rbm_written_to = get_rbm_addr(get_written_to());
+ auto rbm_tail = get_rbm_addr(get_dirty_tail());
+ if (rbm_written_to > rbm_tail &&
+ (get_journal_end() - rbm_written_to) < size &&
+ size > (get_records_used_size() -
+ (get_journal_end() - rbm_written_to))) {
+ return false;
+ }
+ return get_records_available_size() >= size;
+ }
+ rbm_abs_addr get_journal_end() const {
+ assert(device);
+ return device->get_shard_journal_start() + device->get_journal_size();
+ }
+
+ read_ertr::future<> read(
+ uint64_t offset,
+ bufferptr &bptr) {
+ assert(device);
+ return device->read(offset, bptr);
+ }
+
+ seastar::future<> update_journal_tail(
+ journal_seq_t dirty,
+ journal_seq_t alloc) {
+ header.dirty_tail = dirty;
+ header.alloc_tail = alloc;
+ return write_header(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "encountered invalid error in update_journal_tail"
+ });
+ }
+
+ void set_initialized(bool init) {
+ initialized = init;
+ }
+
+ void set_cbj_header(cbj_header_t& head) {
+ header = head;
+ }
+
+ cbj_header_t get_cbj_header() {
+ return header;
+ }
+
+ private:
+ std::string print_name;
+ cbj_header_t header;
+ RBMDevice* device;
+ journal_seq_t written_to;
+ bool initialized = false;
+};
+
+std::ostream &operator<<(std::ostream &out, const CircularJournalSpace::cbj_header_t &header);
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal::CircularJournalSpace::cbj_header_t)
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::journal::CircularJournalSpace::cbj_header_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/journal/record_submitter.cc b/src/crimson/os/seastore/journal/record_submitter.cc
new file mode 100644
index 000000000..5ca53b436
--- /dev/null
+++ b/src/crimson/os/seastore/journal/record_submitter.cc
@@ -0,0 +1,533 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "record_submitter.h"
+
+#include <fmt/format.h>
+#include <fmt/os.h>
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/async_cleaner.h"
+
+SET_SUBSYS(seastore_journal);
+
+namespace crimson::os::seastore::journal {
+
+RecordBatch::add_pending_ret
+RecordBatch::add_pending(
+ const std::string& name,
+ record_t&& record,
+ extent_len_t block_size)
+{
+ LOG_PREFIX(RecordBatch::add_pending);
+ auto new_size = get_encoded_length_after(record, block_size);
+ auto dlength_offset = pending.size.dlength;
+ TRACE("{} batches={}, write_size={}, dlength_offset={} ...",
+ name,
+ pending.get_size() + 1,
+ new_size.get_encoded_length(),
+ dlength_offset);
+ assert(state != state_t::SUBMITTING);
+ assert(evaluate_submit(record.size, block_size).submit_size == new_size);
+
+ pending.push_back(
+ std::move(record), block_size);
+ assert(pending.size == new_size);
+ if (state == state_t::EMPTY) {
+ assert(!io_promise.has_value());
+ io_promise = seastar::shared_promise<maybe_promise_result_t>();
+ } else {
+ assert(io_promise.has_value());
+ }
+ state = state_t::PENDING;
+
+ return io_promise->get_shared_future(
+ ).then([dlength_offset, FNAME, &name
+ ](auto maybe_promise_result) -> add_pending_ret {
+ if (!maybe_promise_result.has_value()) {
+ ERROR("{} write failed", name);
+ return crimson::ct_error::input_output_error::make();
+ }
+ auto write_result = maybe_promise_result->write_result;
+ auto submit_result = record_locator_t{
+ write_result.start_seq.offset.add_offset(
+ maybe_promise_result->mdlength + dlength_offset),
+ write_result
+ };
+ TRACE("{} write finish with {}", name, submit_result);
+ return add_pending_ret(
+ add_pending_ertr::ready_future_marker{},
+ submit_result);
+ });
+}
+
+std::pair<ceph::bufferlist, record_group_size_t>
+RecordBatch::encode_batch(
+ const journal_seq_t& committed_to,
+ segment_nonce_t segment_nonce)
+{
+ assert(state == state_t::PENDING);
+ assert(pending.get_size() > 0);
+ assert(io_promise.has_value());
+
+ state = state_t::SUBMITTING;
+ submitting_size = pending.get_size();
+ auto gsize = pending.size;
+ submitting_length = gsize.get_encoded_length();
+ submitting_mdlength = gsize.get_mdlength();
+ auto bl = encode_records(pending, committed_to, segment_nonce);
+ // Note: pending is cleared here
+ assert(bl.length() == submitting_length);
+ return std::make_pair(bl, gsize);
+}
+
+void RecordBatch::set_result(
+ maybe_result_t maybe_write_result)
+{
+ maybe_promise_result_t result;
+ if (maybe_write_result.has_value()) {
+ assert(maybe_write_result->length == submitting_length);
+ result = promise_result_t{
+ *maybe_write_result,
+ submitting_mdlength
+ };
+ }
+ assert(state == state_t::SUBMITTING);
+ assert(io_promise.has_value());
+
+ state = state_t::EMPTY;
+ submitting_size = 0;
+ submitting_length = 0;
+ submitting_mdlength = 0;
+ io_promise->set_value(result);
+ io_promise.reset();
+}
+
+std::pair<ceph::bufferlist, record_group_size_t>
+RecordBatch::submit_pending_fast(
+ record_t&& record,
+ extent_len_t block_size,
+ const journal_seq_t& committed_to,
+ segment_nonce_t segment_nonce)
+{
+ auto new_size = get_encoded_length_after(record, block_size);
+ std::ignore = new_size;
+ assert(state == state_t::EMPTY);
+ assert(evaluate_submit(record.size, block_size).submit_size == new_size);
+
+ auto group = record_group_t(std::move(record), block_size);
+ auto size = group.size;
+ assert(size == new_size);
+ auto bl = encode_records(group, committed_to, segment_nonce);
+ assert(bl.length() == size.get_encoded_length());
+ return std::make_pair(std::move(bl), size);
+}
+
+RecordSubmitter::RecordSubmitter(
+ std::size_t io_depth,
+ std::size_t batch_capacity,
+ std::size_t batch_flush_size,
+ double preferred_fullness,
+ JournalAllocator& ja)
+ : io_depth_limit{io_depth},
+ preferred_fullness{preferred_fullness},
+ journal_allocator{ja},
+ batches(new RecordBatch[io_depth + 1])
+{
+ LOG_PREFIX(RecordSubmitter);
+ INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size={}, "
+ "preferred_fullness={}",
+ get_name(), io_depth, batch_capacity,
+ batch_flush_size, preferred_fullness);
+ ceph_assert(io_depth > 0);
+ ceph_assert(batch_capacity > 0);
+ ceph_assert(preferred_fullness >= 0 &&
+ preferred_fullness <= 1);
+ free_batch_ptrs.reserve(io_depth + 1);
+ for (std::size_t i = 0; i <= io_depth; ++i) {
+ batches[i].initialize(i, batch_capacity, batch_flush_size);
+ free_batch_ptrs.push_back(&batches[i]);
+ }
+ pop_free_batch();
+}
+
+bool RecordSubmitter::is_available() const
+{
+ auto ret = !wait_available_promise.has_value() &&
+ !has_io_error;
+#ifndef NDEBUG
+ if (ret) {
+ // unconditional invariants
+ ceph_assert(journal_allocator.can_write());
+ ceph_assert(p_current_batch != nullptr);
+ ceph_assert(!p_current_batch->is_submitting());
+ // the current batch accepts a further write
+ ceph_assert(!p_current_batch->needs_flush());
+ if (!p_current_batch->is_empty()) {
+ auto submit_length =
+ p_current_batch->get_submit_size().get_encoded_length();
+ ceph_assert(!journal_allocator.needs_roll(submit_length));
+ }
+ // I'm not rolling
+ }
+#endif
+ return ret;
+}
+
+RecordSubmitter::wa_ertr::future<>
+RecordSubmitter::wait_available()
+{
+ LOG_PREFIX(RecordSubmitter::wait_available);
+ assert(!is_available());
+ if (has_io_error) {
+ ERROR("{} I/O is failed before wait", get_name());
+ return crimson::ct_error::input_output_error::make();
+ }
+ return wait_available_promise->get_shared_future(
+ ).then([FNAME, this]() -> wa_ertr::future<> {
+ if (has_io_error) {
+ ERROR("{} I/O is failed after wait", get_name());
+ return crimson::ct_error::input_output_error::make();
+ }
+ return wa_ertr::now();
+ });
+}
+
+RecordSubmitter::action_t
+RecordSubmitter::check_action(
+ const record_size_t& rsize) const
+{
+ assert(is_available());
+ auto eval = p_current_batch->evaluate_submit(
+ rsize, journal_allocator.get_block_size());
+ if (journal_allocator.needs_roll(eval.submit_size.get_encoded_length())) {
+ return action_t::ROLL;
+ } else if (eval.is_full) {
+ return action_t::SUBMIT_FULL;
+ } else {
+ return action_t::SUBMIT_NOT_FULL;
+ }
+}
+
+RecordSubmitter::roll_segment_ertr::future<>
+RecordSubmitter::roll_segment()
+{
+ LOG_PREFIX(RecordSubmitter::roll_segment);
+ ceph_assert(p_current_batch->needs_flush() ||
+ is_available());
+ // #1 block concurrent submissions due to rolling
+ wait_available_promise = seastar::shared_promise<>();
+ ceph_assert(!wait_unfull_flush_promise.has_value());
+ return [FNAME, this] {
+ if (p_current_batch->is_pending()) {
+ if (state == state_t::FULL) {
+ DEBUG("{} wait flush ...", get_name());
+ wait_unfull_flush_promise = seastar::promise<>();
+ return wait_unfull_flush_promise->get_future();
+ } else { // IDLE/PENDING
+ DEBUG("{} flush", get_name());
+ flush_current_batch();
+ return seastar::now();
+ }
+ } else {
+ assert(p_current_batch->is_empty());
+ return seastar::now();
+ }
+ }().then_wrapped([FNAME, this](auto fut) {
+ if (fut.failed()) {
+ ERROR("{} rolling is skipped unexpectedly, available", get_name());
+ has_io_error = true;
+ wait_available_promise->set_value();
+ wait_available_promise.reset();
+ return roll_segment_ertr::now();
+ } else {
+ // start rolling in background
+ std::ignore = journal_allocator.roll(
+ ).safe_then([FNAME, this] {
+ // good
+ DEBUG("{} rolling done, available", get_name());
+ assert(!has_io_error);
+ wait_available_promise->set_value();
+ wait_available_promise.reset();
+ }).handle_error(
+ crimson::ct_error::all_same_way([FNAME, this](auto e) {
+ ERROR("{} got error {}, available", get_name(), e);
+ has_io_error = true;
+ wait_available_promise->set_value();
+ wait_available_promise.reset();
+ })
+ ).handle_exception([FNAME, this](auto e) {
+ ERROR("{} got exception {}, available", get_name(), e);
+ has_io_error = true;
+ wait_available_promise->set_value();
+ wait_available_promise.reset();
+ });
+ // wait for background rolling
+ return wait_available();
+ }
+ });
+}
+
+RecordSubmitter::submit_ret
+RecordSubmitter::submit(
+ record_t&& record,
+ bool with_atomic_roll_segment)
+{
+ LOG_PREFIX(RecordSubmitter::submit);
+ ceph_assert(is_available());
+ assert(check_action(record.size) != action_t::ROLL);
+ journal_allocator.update_modify_time(record);
+ auto eval = p_current_batch->evaluate_submit(
+ record.size, journal_allocator.get_block_size());
+ bool needs_flush = (
+ state == state_t::IDLE ||
+ eval.submit_size.get_fullness() > preferred_fullness ||
+ // RecordBatch::needs_flush()
+ eval.is_full ||
+ p_current_batch->get_num_records() + 1 >=
+ p_current_batch->get_batch_capacity());
+ if (p_current_batch->is_empty() &&
+ needs_flush &&
+ state != state_t::FULL) {
+ // fast path with direct write
+ increment_io();
+ auto [to_write, sizes] = p_current_batch->submit_pending_fast(
+ std::move(record),
+ journal_allocator.get_block_size(),
+ get_committed_to(),
+ journal_allocator.get_nonce());
+ DEBUG("{} fast submit {}, committed_to={}, outstanding_io={} ...",
+ get_name(), sizes, get_committed_to(), num_outstanding_io);
+ account_submission(1, sizes);
+ return journal_allocator.write(std::move(to_write)
+ ).safe_then([mdlength = sizes.get_mdlength()](auto write_result) {
+ return record_locator_t{
+ write_result.start_seq.offset.add_offset(mdlength),
+ write_result
+ };
+ }).finally([this] {
+ decrement_io_with_flush();
+ });
+ }
+ // indirect batched write
+ auto write_fut = p_current_batch->add_pending(
+ get_name(),
+ std::move(record),
+ journal_allocator.get_block_size());
+ if (needs_flush) {
+ if (state == state_t::FULL) {
+ // #2 block concurrent submissions due to lack of resource
+ DEBUG("{} added with {} pending, outstanding_io={}, unavailable, wait flush ...",
+ get_name(),
+ p_current_batch->get_num_records(),
+ num_outstanding_io);
+ if (with_atomic_roll_segment) {
+ // wait_available_promise and wait_unfull_flush_promise
+ // need to be delegated to the follow-up atomic roll_segment();
+ assert(p_current_batch->is_pending());
+ } else {
+ wait_available_promise = seastar::shared_promise<>();
+ ceph_assert(!wait_unfull_flush_promise.has_value());
+ wait_unfull_flush_promise = seastar::promise<>();
+ // flush and mark available in background
+ std::ignore = wait_unfull_flush_promise->get_future(
+ ).finally([FNAME, this] {
+ DEBUG("{} flush done, available", get_name());
+ wait_available_promise->set_value();
+ wait_available_promise.reset();
+ });
+ }
+ } else {
+ DEBUG("{} added pending, flush", get_name());
+ flush_current_batch();
+ }
+ } else {
+ // will flush later
+ DEBUG("{} added with {} pending, outstanding_io={}",
+ get_name(),
+ p_current_batch->get_num_records(),
+ num_outstanding_io);
+ assert(!p_current_batch->needs_flush());
+ }
+ return write_fut;
+}
+
+RecordSubmitter::open_ret
+RecordSubmitter::open(bool is_mkfs)
+{
+ return journal_allocator.open(is_mkfs
+ ).safe_then([this](journal_seq_t ret) {
+ LOG_PREFIX(RecordSubmitter::open);
+ DEBUG("{} register metrics", get_name());
+ stats = {};
+ namespace sm = seastar::metrics;
+ std::vector<sm::label_instance> label_instances;
+ label_instances.push_back(sm::label_instance("submitter", get_name()));
+ metrics.add_group(
+ "journal",
+ {
+ sm::make_counter(
+ "record_num",
+ stats.record_batch_stats.num_io,
+ sm::description("total number of records submitted"),
+ label_instances
+ ),
+ sm::make_counter(
+ "record_batch_num",
+ stats.record_batch_stats.num_io_grouped,
+ sm::description("total number of records batched"),
+ label_instances
+ ),
+ sm::make_counter(
+ "io_num",
+ stats.io_depth_stats.num_io,
+ sm::description("total number of io submitted"),
+ label_instances
+ ),
+ sm::make_counter(
+ "io_depth_num",
+ stats.io_depth_stats.num_io_grouped,
+ sm::description("total number of io depth"),
+ label_instances
+ ),
+ sm::make_counter(
+ "record_group_padding_bytes",
+ stats.record_group_padding_bytes,
+ sm::description("bytes of metadata padding when write record groups"),
+ label_instances
+ ),
+ sm::make_counter(
+ "record_group_metadata_bytes",
+ stats.record_group_metadata_bytes,
+ sm::description("bytes of raw metadata when write record groups"),
+ label_instances
+ ),
+ sm::make_counter(
+ "record_group_data_bytes",
+ stats.record_group_data_bytes,
+ sm::description("bytes of data when write record groups"),
+ label_instances
+ ),
+ }
+ );
+ return ret;
+ });
+}
+
+RecordSubmitter::close_ertr::future<>
+RecordSubmitter::close()
+{
+ committed_to = JOURNAL_SEQ_NULL;
+ ceph_assert(state == state_t::IDLE);
+ ceph_assert(num_outstanding_io == 0);
+ ceph_assert(p_current_batch != nullptr);
+ ceph_assert(p_current_batch->is_empty());
+ ceph_assert(!wait_available_promise.has_value());
+ has_io_error = false;
+ ceph_assert(!wait_unfull_flush_promise.has_value());
+ metrics.clear();
+ return journal_allocator.close();
+}
+
+void RecordSubmitter::update_state()
+{
+ if (num_outstanding_io == 0) {
+ state = state_t::IDLE;
+ } else if (num_outstanding_io < io_depth_limit) {
+ state = state_t::PENDING;
+ } else if (num_outstanding_io == io_depth_limit) {
+ state = state_t::FULL;
+ } else {
+ ceph_abort("fatal error: io-depth overflow");
+ }
+}
+
+void RecordSubmitter::decrement_io_with_flush()
+{
+ LOG_PREFIX(RecordSubmitter::decrement_io_with_flush);
+ assert(num_outstanding_io > 0);
+ auto prv_state = state;
+ --num_outstanding_io;
+ update_state();
+
+ if (prv_state == state_t::FULL) {
+ if (wait_unfull_flush_promise.has_value()) {
+ DEBUG("{} flush, resolve wait_unfull_flush_promise", get_name());
+ assert(!p_current_batch->is_empty());
+ assert(wait_available_promise.has_value());
+ flush_current_batch();
+ wait_unfull_flush_promise->set_value();
+ wait_unfull_flush_promise.reset();
+ return;
+ }
+ } else {
+ ceph_assert(!wait_unfull_flush_promise.has_value());
+ }
+
+ auto needs_flush = (
+ !p_current_batch->is_empty() && (
+ state == state_t::IDLE ||
+ p_current_batch->get_submit_size().get_fullness() > preferred_fullness ||
+ p_current_batch->needs_flush()
+ ));
+ if (needs_flush) {
+ DEBUG("{} flush", get_name());
+ flush_current_batch();
+ }
+}
+
+void RecordSubmitter::account_submission(
+ std::size_t num,
+ const record_group_size_t& size)
+{
+ stats.record_group_padding_bytes +=
+ (size.get_mdlength() - size.get_raw_mdlength());
+ stats.record_group_metadata_bytes += size.get_raw_mdlength();
+ stats.record_group_data_bytes += size.dlength;
+ stats.record_batch_stats.increment(num);
+}
+
+void RecordSubmitter::finish_submit_batch(
+ RecordBatch* p_batch,
+ maybe_result_t maybe_result)
+{
+ assert(p_batch->is_submitting());
+ p_batch->set_result(maybe_result);
+ free_batch_ptrs.push_back(p_batch);
+ decrement_io_with_flush();
+}
+
+void RecordSubmitter::flush_current_batch()
+{
+ LOG_PREFIX(RecordSubmitter::flush_current_batch);
+ RecordBatch* p_batch = p_current_batch;
+ assert(p_batch->is_pending());
+ p_current_batch = nullptr;
+ pop_free_batch();
+
+ increment_io();
+ auto num = p_batch->get_num_records();
+ auto [to_write, sizes] = p_batch->encode_batch(
+ get_committed_to(), journal_allocator.get_nonce());
+ DEBUG("{} {} records, {}, committed_to={}, outstanding_io={} ...",
+ get_name(), num, sizes, get_committed_to(), num_outstanding_io);
+ account_submission(num, sizes);
+ std::ignore = journal_allocator.write(std::move(to_write)
+ ).safe_then([this, p_batch, FNAME, num, sizes=sizes](auto write_result) {
+ TRACE("{} {} records, {}, write done with {}",
+ get_name(), num, sizes, write_result);
+ finish_submit_batch(p_batch, write_result);
+ }).handle_error(
+ crimson::ct_error::all_same_way([this, p_batch, FNAME, num, sizes=sizes](auto e) {
+ ERROR("{} {} records, {}, got error {}",
+ get_name(), num, sizes, e);
+ finish_submit_batch(p_batch, std::nullopt);
+ })
+ ).handle_exception([this, p_batch, FNAME, num, sizes=sizes](auto e) {
+ ERROR("{} {} records, {}, got exception {}",
+ get_name(), num, sizes, e);
+ finish_submit_batch(p_batch, std::nullopt);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/journal/record_submitter.h b/src/crimson/os/seastore/journal/record_submitter.h
new file mode 100644
index 000000000..eedd2dd8c
--- /dev/null
+++ b/src/crimson/os/seastore/journal/record_submitter.h
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <optional>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/segment_manager_group.h"
+#include "crimson/os/seastore/segment_seq_allocator.h"
+
+namespace crimson::os::seastore {
+ class SegmentProvider;
+ class JournalTrimmer;
+}
+
+namespace crimson::os::seastore::journal {
+
+class JournalAllocator {
+public:
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual const std::string& get_name() const = 0;
+
+ virtual void update_modify_time(record_t& record) = 0;
+
+ virtual extent_len_t get_block_size() const = 0;
+
+ using close_ertr = base_ertr;
+ virtual close_ertr::future<> close() = 0;
+
+ virtual segment_nonce_t get_nonce() const = 0;
+
+ using write_ertr = base_ertr;
+ using write_ret = write_ertr::future<write_result_t>;
+ virtual write_ret write(ceph::bufferlist&& to_write) = 0;
+
+ virtual bool can_write() const = 0;
+
+ using roll_ertr = base_ertr;
+ virtual roll_ertr::future<> roll() = 0;
+
+ virtual bool needs_roll(std::size_t length) const = 0;
+
+ using open_ertr = base_ertr;
+ using open_ret = open_ertr::future<journal_seq_t>;
+ virtual open_ret open(bool is_mkfs) = 0;
+
+};
+
+/**
+ * RecordBatch
+ *
+ * Maintain a batch of records for submission.
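+ *
+ * State transitions, as sketched from the implementation in
+ * record_submitter.cc (submit_pending_fast() bypasses the batch state
+ * entirely and leaves it EMPTY):
+ *
+ *   EMPTY --add_pending()--> PENDING --encode_batch()--> SUBMITTING
+ *     ^                                                      |
+ *     +---------------------set_result()--------------------+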
+ */
+class RecordBatch {
+ enum class state_t {
+ EMPTY = 0,
+ PENDING,
+ SUBMITTING
+ };
+
+public:
+ RecordBatch() = default;
+ RecordBatch(RecordBatch&&) = delete;
+ RecordBatch(const RecordBatch&) = delete;
+ RecordBatch& operator=(RecordBatch&&) = delete;
+ RecordBatch& operator=(const RecordBatch&) = delete;
+
+ bool is_empty() const {
+ return state == state_t::EMPTY;
+ }
+
+ bool is_pending() const {
+ return state == state_t::PENDING;
+ }
+
+ bool is_submitting() const {
+ return state == state_t::SUBMITTING;
+ }
+
+ std::size_t get_index() const {
+ return index;
+ }
+
+ std::size_t get_num_records() const {
+ return pending.get_size();
+ }
+
+ std::size_t get_batch_capacity() const {
+ return batch_capacity;
+ }
+
+ const record_group_size_t& get_submit_size() const {
+ assert(state != state_t::EMPTY);
+ return pending.size;
+ }
+
+ bool needs_flush() const {
+ assert(state != state_t::SUBMITTING);
+ assert(pending.get_size() <= batch_capacity);
+ if (state == state_t::EMPTY) {
+ return false;
+ } else {
+ assert(state == state_t::PENDING);
+ return (pending.get_size() >= batch_capacity ||
+ pending.size.get_encoded_length() > batch_flush_size);
+ }
+ }
+
+ struct evaluation_t {
+ record_group_size_t submit_size;
+ bool is_full;
+ };
+ evaluation_t evaluate_submit(
+ const record_size_t& rsize,
+ extent_len_t block_size) const {
+ assert(!needs_flush());
+ auto submit_size = pending.size.get_encoded_length_after(
+ rsize, block_size);
+ bool is_full = submit_size.get_encoded_length() > batch_flush_size;
+ return {submit_size, is_full};
+ }
+
+ void initialize(std::size_t i,
+ std::size_t _batch_capacity,
+ std::size_t _batch_flush_size) {
+ ceph_assert(_batch_capacity > 0);
+ index = i;
+ batch_capacity = _batch_capacity;
+ batch_flush_size = _batch_flush_size;
+ pending.reserve(batch_capacity);
+ }
+
+  // Add to the batch; the future will be resolved after the batch is
+ // written.
+ //
+ // Set write_result_t::write_length to 0 if the record is not the first one
+ // in the batch.
+ using add_pending_ertr = JournalAllocator::write_ertr;
+ using add_pending_ret = add_pending_ertr::future<record_locator_t>;
+ add_pending_ret add_pending(
+ const std::string& name,
+ record_t&&,
+ extent_len_t block_size);
+
+ // Encode the batched records for write.
+ std::pair<ceph::bufferlist, record_group_size_t> encode_batch(
+ const journal_seq_t& committed_to,
+ segment_nonce_t segment_nonce);
+
+ // Set the write result and reset for reuse
+ using maybe_result_t = std::optional<write_result_t>;
+ void set_result(maybe_result_t maybe_write_end_seq);
+
+ // The fast path that is equivalent to submit a single record as a batch.
+ //
+ // Essentially, equivalent to the combined logic of:
+ // add_pending(), encode_batch() and set_result() above without
+ // the intervention of the shared io_promise.
+ //
+ // Note the current RecordBatch can be reused afterwards.
+ std::pair<ceph::bufferlist, record_group_size_t> submit_pending_fast(
+ record_t&&,
+ extent_len_t block_size,
+ const journal_seq_t& committed_to,
+ segment_nonce_t segment_nonce);
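+
+  // Illustrative call pattern on the submitter's fast path (see
+  // RecordSubmitter::submit() in record_submitter.cc); 'allocator' stands in
+  // for the JournalAllocator actually used:
+  //
+  //   auto [bl, sizes] = batch.submit_pending_fast(
+  //       std::move(record), allocator.get_block_size(),
+  //       committed_to, allocator.get_nonce());
+  //   allocator.write(std::move(bl));  // resolves to a write_result_t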
+
+private:
+ record_group_size_t get_encoded_length_after(
+ const record_t& record,
+ extent_len_t block_size) const {
+ return pending.size.get_encoded_length_after(
+ record.size, block_size);
+ }
+
+ state_t state = state_t::EMPTY;
+ std::size_t index = 0;
+ std::size_t batch_capacity = 0;
+ std::size_t batch_flush_size = 0;
+
+ record_group_t pending;
+ std::size_t submitting_size = 0;
+ extent_len_t submitting_length = 0;
+ extent_len_t submitting_mdlength = 0;
+
+ struct promise_result_t {
+ write_result_t write_result;
+ extent_len_t mdlength;
+ };
+ using maybe_promise_result_t = std::optional<promise_result_t>;
+ std::optional<seastar::shared_promise<maybe_promise_result_t> > io_promise;
+};
+
+/**
+ * RecordSubmitter
+ *
+ * Submit records concurrently via RecordBatch through the JournalAllocator (e.g. SegmentAllocator).
+ *
+ * Configurations and controls:
+ * - io_depth: the io-depth limit to SegmentAllocator;
+ * - batch_capacity: the number limit of records in a RecordBatch;
+ * - batch_flush_size: the bytes threshold to force flush a RecordBatch to
+ * control the maximum latency;
+ * - preferred_fullness: the fullness threshold to flush a RecordBatch;
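+ *
+ * A hypothetical construction sketch (the numeric values below are
+ * illustrative only, not the configured defaults):
+ *
+ *   RecordSubmitter submitter(
+ *     2,                   // io_depth
+ *     4,                   // batch_capacity
+ *     1 << 20,             // batch_flush_size (1 MiB)
+ *     0.95,                // preferred_fullness
+ *     journal_allocator);  // any JournalAllocator implementation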
+ */
+class RecordSubmitter {
+ enum class state_t {
+ IDLE = 0, // outstanding_io == 0
+ PENDING, // outstanding_io < io_depth_limit
+ FULL // outstanding_io == io_depth_limit
+ // OVERFLOW: outstanding_io > io_depth_limit is impossible
+ };
+
+ struct grouped_io_stats {
+ uint64_t num_io = 0;
+ uint64_t num_io_grouped = 0;
+
+ void increment(uint64_t num_grouped_io) {
+ ++num_io;
+ num_io_grouped += num_grouped_io;
+ }
+ };
+
+ using base_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+public:
+ RecordSubmitter(std::size_t io_depth,
+ std::size_t batch_capacity,
+ std::size_t batch_flush_size,
+ double preferred_fullness,
+ JournalAllocator&);
+
+ const std::string& get_name() const {
+ return journal_allocator.get_name();
+ }
+
+ journal_seq_t get_committed_to() const {
+ return committed_to;
+ }
+
+ // whether is available to submit a record
+ bool is_available() const;
+
+  // wait until available if we cannot submit; is_available() should be checked again
+  // when the future is resolved.
+ using wa_ertr = base_ertr;
+ wa_ertr::future<> wait_available();
+
+ // when available, check for the submit action
+ // according to the pending record size
+ enum class action_t {
+ ROLL,
+ SUBMIT_FULL,
+ SUBMIT_NOT_FULL
+ };
+ action_t check_action(const record_size_t&) const;
+
+ // when available, roll the segment if needed
+ using roll_segment_ertr = base_ertr;
+ roll_segment_ertr::future<> roll_segment();
+
+ // when available, submit the record if possible
+ using submit_ertr = base_ertr;
+ using submit_ret = submit_ertr::future<record_locator_t>;
+ submit_ret submit(record_t&&, bool with_atomic_roll_segment=false);
+
+ void update_committed_to(const journal_seq_t& new_committed_to) {
+ assert(new_committed_to != JOURNAL_SEQ_NULL);
+ assert(committed_to == JOURNAL_SEQ_NULL ||
+ committed_to <= new_committed_to);
+ committed_to = new_committed_to;
+ }
+
+ // open for write, generate the correct print name, and register metrics
+ using open_ertr = base_ertr;
+ using open_ret = open_ertr::future<journal_seq_t>;
+ open_ret open(bool is_mkfs);
+
+ using close_ertr = base_ertr;
+ close_ertr::future<> close();
+
+private:
+ void update_state();
+
+ void increment_io() {
+ ++num_outstanding_io;
+ stats.io_depth_stats.increment(num_outstanding_io);
+ update_state();
+ }
+
+ void decrement_io_with_flush();
+
+ void pop_free_batch() {
+ assert(p_current_batch == nullptr);
+ assert(!free_batch_ptrs.empty());
+ p_current_batch = free_batch_ptrs.front();
+ assert(p_current_batch->is_empty());
+ assert(p_current_batch == &batches[p_current_batch->get_index()]);
+ free_batch_ptrs.pop_front();
+ }
+
+ void account_submission(std::size_t, const record_group_size_t&);
+
+ using maybe_result_t = RecordBatch::maybe_result_t;
+ void finish_submit_batch(RecordBatch*, maybe_result_t);
+
+ void flush_current_batch();
+
+ state_t state = state_t::IDLE;
+ std::size_t num_outstanding_io = 0;
+ std::size_t io_depth_limit;
+ double preferred_fullness;
+
+ JournalAllocator& journal_allocator;
+ // committed_to may be in a previous journal segment
+ journal_seq_t committed_to = JOURNAL_SEQ_NULL;
+
+ std::unique_ptr<RecordBatch[]> batches;
+ // should not be nullptr after constructed
+ RecordBatch* p_current_batch = nullptr;
+ seastar::circular_buffer<RecordBatch*> free_batch_ptrs;
+
+ // blocked for rolling or lack of resource
+ std::optional<seastar::shared_promise<> > wait_available_promise;
+ bool has_io_error = false;
+ // when needs flush but io depth is full,
+ // wait for decrement_io_with_flush()
+ std::optional<seastar::promise<> > wait_unfull_flush_promise;
+
+ struct {
+ grouped_io_stats record_batch_stats;
+ grouped_io_stats io_depth_stats;
+ uint64_t record_group_padding_bytes = 0;
+ uint64_t record_group_metadata_bytes = 0;
+ uint64_t record_group_data_bytes = 0;
+ } stats;
+ seastar::metrics::metric_group metrics;
+};
+
+}
diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc
new file mode 100644
index 000000000..61e1be585
--- /dev/null
+++ b/src/crimson/os/seastore/journal/segment_allocator.cc
@@ -0,0 +1,283 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "segment_allocator.h"
+
+#include <fmt/format.h>
+#include <fmt/os.h>
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/async_cleaner.h"
+
+SET_SUBSYS(seastore_journal);
+
+namespace crimson::os::seastore::journal {
+
+SegmentAllocator::SegmentAllocator(
+ JournalTrimmer *trimmer,
+ data_category_t category,
+ rewrite_gen_t gen,
+ SegmentProvider &sp,
+ SegmentSeqAllocator &ssa)
+ : print_name{fmt::format("{}_G{}", category, gen)},
+ type{trimmer == nullptr ?
+ segment_type_t::OOL :
+ segment_type_t::JOURNAL},
+ category{category},
+ gen{gen},
+ segment_provider{sp},
+ sm_group{*sp.get_segment_manager_group()},
+ segment_seq_allocator(ssa),
+ trimmer{trimmer}
+{
+ reset();
+}
+
+segment_nonce_t calc_new_nonce(
+ segment_type_t type,
+ uint32_t crc,
+ unsigned char const *data,
+ unsigned length)
+{
+ crc &= std::numeric_limits<uint32_t>::max() >> 1;
+ crc |= static_cast<uint32_t>(type) << 31;
+ return ceph_crc32c(crc, data, length);
+}
+
+SegmentAllocator::open_ret
+SegmentAllocator::do_open(bool is_mkfs)
+{
+ LOG_PREFIX(SegmentAllocator::do_open);
+ ceph_assert(!current_segment);
+ segment_seq_t new_segment_seq =
+ segment_seq_allocator.get_and_inc_next_segment_seq();
+ auto meta = sm_group.get_meta();
+ current_segment_nonce = calc_new_nonce(
+ type,
+ new_segment_seq,
+ reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()),
+ sizeof(meta.seastore_id.uuid));
+ auto new_segment_id = segment_provider.allocate_segment(
+ new_segment_seq, type, category, gen);
+ ceph_assert(new_segment_id != NULL_SEG_ID);
+ return sm_group.open(new_segment_id
+ ).handle_error(
+ open_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SegmentAllocator::do_open open"
+ }
+ ).safe_then([this, is_mkfs, FNAME, new_segment_seq](auto sref) {
+ // initialize new segment
+ segment_id_t segment_id = sref->get_segment_id();
+ journal_seq_t dirty_tail;
+ journal_seq_t alloc_tail;
+ if (type == segment_type_t::JOURNAL) {
+ dirty_tail = trimmer->get_dirty_tail();
+ alloc_tail = trimmer->get_alloc_tail();
+ if (is_mkfs) {
+ ceph_assert(dirty_tail == JOURNAL_SEQ_NULL);
+ ceph_assert(alloc_tail == JOURNAL_SEQ_NULL);
+ auto mkfs_seq = journal_seq_t{
+ new_segment_seq,
+ paddr_t::make_seg_paddr(segment_id, 0)
+ };
+ dirty_tail = mkfs_seq;
+ alloc_tail = mkfs_seq;
+ } else {
+ ceph_assert(dirty_tail != JOURNAL_SEQ_NULL);
+ ceph_assert(alloc_tail != JOURNAL_SEQ_NULL);
+ }
+ } else { // OOL
+ ceph_assert(!is_mkfs);
+ dirty_tail = JOURNAL_SEQ_NULL;
+ alloc_tail = JOURNAL_SEQ_NULL;
+ }
+ auto header = segment_header_t{
+ new_segment_seq,
+ segment_id,
+ dirty_tail,
+ alloc_tail,
+ current_segment_nonce,
+ type,
+ category,
+ gen};
+ INFO("{} writing header {}", print_name, header);
+
+ auto header_length = get_block_size();
+ bufferlist bl;
+ encode(header, bl);
+ bufferptr bp(ceph::buffer::create_page_aligned(header_length));
+ bp.zero();
+ auto iter = bl.cbegin();
+ iter.copy(bl.length(), bp.c_str());
+ bl.clear();
+ bl.append(bp);
+
+ ceph_assert(sref->get_write_ptr() == 0);
+ assert((unsigned)header_length == bl.length());
+ written_to = header_length;
+ auto new_journal_seq = journal_seq_t{
+ new_segment_seq,
+ paddr_t::make_seg_paddr(segment_id, written_to)};
+ segment_provider.update_segment_avail_bytes(
+ type, new_journal_seq.offset);
+ return sref->write(0, std::move(bl)
+ ).handle_error(
+ open_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SegmentAllocator::do_open write"
+ }
+ ).safe_then([this,
+ FNAME,
+ new_journal_seq,
+ sref=std::move(sref)]() mutable {
+ ceph_assert(!current_segment);
+ current_segment = std::move(sref);
+ DEBUG("{} rolled new segment id={}",
+ print_name, current_segment->get_segment_id());
+ ceph_assert(new_journal_seq.segment_seq ==
+ segment_provider.get_seg_info(current_segment->get_segment_id()).seq);
+ return new_journal_seq;
+ });
+ });
+}
+
+SegmentAllocator::open_ret
+SegmentAllocator::open(bool is_mkfs)
+{
+ LOG_PREFIX(SegmentAllocator::open);
+ auto& device_ids = sm_group.get_device_ids();
+ ceph_assert(device_ids.size());
+ std::ostringstream oss;
+ for (auto& device_id : device_ids) {
+ oss << device_id_printer_t{device_id} << "_";
+ }
+ oss << fmt::format("{}_G{}", category, gen);
+ print_name = oss.str();
+
+ DEBUG("{}", print_name);
+ return do_open(is_mkfs);
+}
+
+SegmentAllocator::roll_ertr::future<>
+SegmentAllocator::roll()
+{
+ ceph_assert(can_write());
+ return close_segment().safe_then([this] {
+ return do_open(false).discard_result();
+ });
+}
+
+SegmentAllocator::write_ret
+SegmentAllocator::write(ceph::bufferlist&& to_write)
+{
+ LOG_PREFIX(SegmentAllocator::write);
+ assert(can_write());
+ auto write_length = to_write.length();
+ auto write_start_offset = written_to;
+ auto write_start_seq = journal_seq_t{
+ segment_provider.get_seg_info(current_segment->get_segment_id()).seq,
+ paddr_t::make_seg_paddr(
+ current_segment->get_segment_id(), write_start_offset)
+ };
+ TRACE("{} {}~{}", print_name, write_start_seq, write_length);
+ assert(write_length > 0);
+ assert((write_length % get_block_size()) == 0);
+ assert(!needs_roll(write_length));
+
+ auto write_result = write_result_t{
+ write_start_seq,
+ write_length
+ };
+ written_to += write_length;
+ segment_provider.update_segment_avail_bytes(
+ type,
+ paddr_t::make_seg_paddr(
+ current_segment->get_segment_id(), written_to)
+ );
+ return current_segment->write(
+ write_start_offset, std::move(to_write)
+ ).handle_error(
+ write_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SegmentAllocator::write"
+ }
+ ).safe_then([write_result, cs=current_segment] {
+ return write_result;
+ });
+}
+
+SegmentAllocator::close_ertr::future<>
+SegmentAllocator::close()
+{
+ return [this] {
+ LOG_PREFIX(SegmentAllocator::close);
+ if (current_segment) {
+ DEBUG("{} close current segment", print_name);
+ return close_segment();
+ } else {
+ INFO("{} no current segment", print_name);
+ return close_segment_ertr::now();
+ }
+ }().finally([this] {
+ reset();
+ });
+}
+
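+// Seal the current segment: advance the write pointer to the final tail
+// block, persist a segment_tail_t there, close the segment, and notify the
+// segment provider.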
+SegmentAllocator::close_segment_ertr::future<>
+SegmentAllocator::close_segment()
+{
+ LOG_PREFIX(SegmentAllocator::close_segment);
+ assert(can_write());
+  // Note: make sure no one can access the current segment once closing starts
+ auto seg_to_close = std::move(current_segment);
+ auto close_segment_id = seg_to_close->get_segment_id();
+ auto close_seg_info = segment_provider.get_seg_info(close_segment_id);
+ ceph_assert((close_seg_info.modify_time == NULL_TIME &&
+ close_seg_info.num_extents == 0) ||
+ (close_seg_info.modify_time != NULL_TIME &&
+ close_seg_info.num_extents != 0));
+ auto tail = segment_tail_t{
+ close_seg_info.seq,
+ close_segment_id,
+ current_segment_nonce,
+ type,
+ timepoint_to_mod(close_seg_info.modify_time),
+ close_seg_info.num_extents};
+ ceph::bufferlist bl;
+ encode(tail, bl);
+ INFO("{} close segment {}, written_to={}",
+ print_name,
+ tail,
+ written_to);
+
+ bufferptr bp(ceph::buffer::create_page_aligned(get_block_size()));
+ bp.zero();
+ auto iter = bl.cbegin();
+ iter.copy(bl.length(), bp.c_str());
+ bl.clear();
+ bl.append(bp);
+
+ assert(bl.length() == sm_group.get_rounded_tail_length());
+
+ auto p_seg_to_close = seg_to_close.get();
+ return p_seg_to_close->advance_wp(
+ sm_group.get_segment_size() - sm_group.get_rounded_tail_length()
+ ).safe_then([this, FNAME, bl=std::move(bl), p_seg_to_close]() mutable {
+ DEBUG("Writing tail info to segment {}", p_seg_to_close->get_segment_id());
+ return p_seg_to_close->write(
+ sm_group.get_segment_size() - sm_group.get_rounded_tail_length(),
+ std::move(bl));
+ }).safe_then([p_seg_to_close] {
+ return p_seg_to_close->close();
+ }).safe_then([this, seg_to_close=std::move(seg_to_close)] {
+ segment_provider.close_segment(seg_to_close->get_segment_id());
+ }).handle_error(
+ close_segment_ertr::pass_further{},
+ crimson::ct_error::assert_all {
+ "Invalid error in SegmentAllocator::close_segment"
+ });
+
+}
+
+}
diff --git a/src/crimson/os/seastore/journal/segment_allocator.h b/src/crimson/os/seastore/journal/segment_allocator.h
new file mode 100644
index 000000000..292c23070
--- /dev/null
+++ b/src/crimson/os/seastore/journal/segment_allocator.h
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <optional>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/segment_manager_group.h"
+#include "crimson/os/seastore/segment_seq_allocator.h"
+#include "crimson/os/seastore/journal/record_submitter.h"
+#include "crimson/os/seastore/async_cleaner.h"
+
+namespace crimson::os::seastore {
+ class SegmentProvider;
+ class JournalTrimmer;
+}
+
+namespace crimson::os::seastore::journal {
+
+/**
+ * SegmentAllocator
+ *
+ * Maintain an available segment for writes.
+ */
+class SegmentAllocator : public JournalAllocator {
+
+ public:
+ // SegmentAllocator specific methods
+ SegmentAllocator(JournalTrimmer *trimmer,
+ data_category_t category,
+ rewrite_gen_t gen,
+ SegmentProvider &sp,
+ SegmentSeqAllocator &ssa);
+
+ segment_id_t get_segment_id() const {
+ assert(can_write());
+ return current_segment->get_segment_id();
+ }
+
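+  // usable payload bytes per segment, excluding the reserved header and
+  // tail blocks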
+ extent_len_t get_max_write_length() const {
+ return sm_group.get_segment_size() -
+ sm_group.get_rounded_header_length() -
+ sm_group.get_rounded_tail_length();
+ }
+
+ public:
+ // overriding methods
+ const std::string& get_name() const final {
+ return print_name;
+ }
+
+ extent_len_t get_block_size() const final {
+ return sm_group.get_block_size();
+ }
+
+ bool can_write() const final {
+ return !!current_segment;
+ }
+
+ segment_nonce_t get_nonce() const final {
+ assert(can_write());
+ return current_segment_nonce;
+ }
+
+  // returns true iff the current segment has insufficient space for `length` more bytes
+ bool needs_roll(std::size_t length) const final {
+ assert(can_write());
+ assert(current_segment->get_write_capacity() ==
+ sm_group.get_segment_size());
+ auto write_capacity = current_segment->get_write_capacity() -
+ sm_group.get_rounded_tail_length();
+ return length + written_to > std::size_t(write_capacity);
+ }
+
+ // open for write and generate the correct print name
+ open_ret open(bool is_mkfs) final;
+
+ // close the current segment and initialize next one
+ roll_ertr::future<> roll() final;
+
+ // write the buffer, return the write result
+ //
+ // May be called concurrently, but writes may complete in any order.
+ // If rolling/opening, no write is allowed.
+ write_ret write(ceph::bufferlist&& to_write) final;
+
+ using close_ertr = base_ertr;
+ close_ertr::future<> close() final;
+
+ void update_modify_time(record_t& record) final {
+ segment_provider.update_modify_time(
+ get_segment_id(),
+ record.modify_time,
+ record.extents.size());
+ }
+
+ private:
+ open_ret do_open(bool is_mkfs);
+
+ void reset() {
+ current_segment.reset();
+ written_to = 0;
+
+ current_segment_nonce = 0;
+ }
+
+ using close_segment_ertr = base_ertr;
+ close_segment_ertr::future<> close_segment();
+
+ // device id is not available during construction,
+ // so generate the print_name later.
+ std::string print_name;
+ const segment_type_t type; // JOURNAL or OOL
+ const data_category_t category;
+ const rewrite_gen_t gen;
+ SegmentProvider &segment_provider;
+ SegmentManagerGroup &sm_group;
+ SegmentRef current_segment;
+ segment_off_t written_to;
+ SegmentSeqAllocator &segment_seq_allocator;
+ segment_nonce_t current_segment_nonce;
+ JournalTrimmer *trimmer;
+};
+
+}
diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc
new file mode 100644
index 000000000..58df91374
--- /dev/null
+++ b/src/crimson/os/seastore/journal/segmented_journal.cc
@@ -0,0 +1,433 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "include/intarith.h"
+
+#include "segmented_journal.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_journal);
+
+/*
+ * format:
+ * - H<handle-addr> information
+ *
+ * levels:
+ * - INFO: major initiation, closing, rolling and replay operations
+ * - DEBUG: INFO details, major submit operations
+ * - TRACE: DEBUG details
+ */
+
+namespace crimson::os::seastore::journal {
+
+SegmentedJournal::SegmentedJournal(
+ SegmentProvider &segment_provider,
+ JournalTrimmer &trimmer)
+ : segment_seq_allocator(
+ new SegmentSeqAllocator(segment_type_t::JOURNAL)),
+ journal_segment_allocator(&trimmer,
+ data_category_t::METADATA,
+ INLINE_GENERATION,
+ segment_provider,
+ *segment_seq_allocator),
+ record_submitter(crimson::common::get_conf<uint64_t>(
+ "seastore_journal_iodepth_limit"),
+ crimson::common::get_conf<uint64_t>(
+ "seastore_journal_batch_capacity"),
+ crimson::common::get_conf<Option::size_t>(
+ "seastore_journal_batch_flush_size"),
+ crimson::common::get_conf<double>(
+ "seastore_journal_batch_preferred_fullness"),
+ journal_segment_allocator),
+ sm_group(*segment_provider.get_segment_manager_group()),
+ trimmer{trimmer}
+{
+}
+
+SegmentedJournal::open_for_mkfs_ret
+SegmentedJournal::open_for_mkfs()
+{
+ return record_submitter.open(true);
+}
+
+SegmentedJournal::open_for_mount_ret
+SegmentedJournal::open_for_mount()
+{
+ return record_submitter.open(false);
+}
+
+SegmentedJournal::close_ertr::future<> SegmentedJournal::close()
+{
+ LOG_PREFIX(Journal::close);
+ INFO("closing, committed_to={}",
+ record_submitter.get_committed_to());
+ return record_submitter.close();
+}
+
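+// Sort the discovered journal segments by sequence, validate their headers,
+// scan the newest one for tail deltas, and return the suffix of segments
+// starting at the current journal tail for replay.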
+SegmentedJournal::prep_replay_segments_fut
+SegmentedJournal::prep_replay_segments(
+ std::vector<std::pair<segment_id_t, segment_header_t>> segments)
+{
+ LOG_PREFIX(Journal::prep_replay_segments);
+ if (segments.empty()) {
+ ERROR("no journal segments for replay");
+ return crimson::ct_error::input_output_error::make();
+ }
+ std::sort(
+ segments.begin(),
+ segments.end(),
+ [](const auto &lt, const auto &rt) {
+ return lt.second.segment_seq <
+ rt.second.segment_seq;
+ });
+
+ segment_seq_allocator->set_next_segment_seq(
+ segments.rbegin()->second.segment_seq + 1);
+ std::for_each(
+ segments.begin(),
+ segments.end(),
+ [FNAME](auto &seg)
+ {
+ if (seg.first != seg.second.physical_segment_id ||
+ seg.second.get_type() != segment_type_t::JOURNAL) {
+ ERROR("illegal journal segment for replay -- {}", seg.second);
+ ceph_abort();
+ }
+ });
+
+ auto last_segment_id = segments.rbegin()->first;
+ auto last_header = segments.rbegin()->second;
+ return scan_last_segment(last_segment_id, last_header
+ ).safe_then([this, FNAME, segments=std::move(segments)] {
+ INFO("dirty_tail={}, alloc_tail={}",
+ trimmer.get_dirty_tail(),
+ trimmer.get_alloc_tail());
+ auto journal_tail = trimmer.get_journal_tail();
+ auto journal_tail_paddr = journal_tail.offset;
+ ceph_assert(journal_tail != JOURNAL_SEQ_NULL);
+ ceph_assert(journal_tail_paddr != P_ADDR_NULL);
+ auto from = std::find_if(
+ segments.begin(),
+ segments.end(),
+ [&journal_tail_paddr](const auto &seg) -> bool {
+ auto& seg_addr = journal_tail_paddr.as_seg_paddr();
+ return seg.first == seg_addr.get_segment_id();
+ });
+ if (from->second.segment_seq != journal_tail.segment_seq) {
+ ERROR("journal_tail {} does not match {}",
+ journal_tail, from->second);
+ ceph_abort();
+ }
+
+ auto num_segments = segments.end() - from;
+ INFO("{} segments to replay", num_segments);
+ auto ret = replay_segments_t(num_segments);
+ std::transform(
+ from, segments.end(), ret.begin(),
+ [this](const auto &p) {
+ auto ret = journal_seq_t{
+ p.second.segment_seq,
+ paddr_t::make_seg_paddr(
+ p.first,
+ sm_group.get_block_size())
+ };
+ return std::make_pair(ret, p.second);
+ });
+ ret[0].first.offset = journal_tail_paddr;
+ return prep_replay_segments_fut(
+ replay_ertr::ready_future_marker{},
+ std::move(ret));
+ });
+}
+
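+// Scan the last journal segment for JOURNAL_TAIL deltas recorded by
+// background transactions and feed them to the trimmer, so the dirty and
+// alloc tails are up to date before replay begins.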
+SegmentedJournal::scan_last_segment_ertr::future<>
+SegmentedJournal::scan_last_segment(
+ const segment_id_t &segment_id,
+ const segment_header_t &segment_header)
+{
+ LOG_PREFIX(SegmentedJournal::scan_last_segment);
+ assert(segment_id == segment_header.physical_segment_id);
+ trimmer.update_journal_tails(
+ segment_header.dirty_tail, segment_header.alloc_tail);
+ auto seq = journal_seq_t{
+ segment_header.segment_seq,
+ paddr_t::make_seg_paddr(segment_id, 0)
+ };
+ INFO("scanning journal tail deltas -- {}", segment_header);
+ return seastar::do_with(
+ scan_valid_records_cursor(seq),
+ SegmentManagerGroup::found_record_handler_t(
+ [FNAME, this](
+ record_locator_t locator,
+ const record_group_header_t& record_group_header,
+ const bufferlist& mdbuf
+ ) -> SegmentManagerGroup::scan_valid_records_ertr::future<>
+ {
+ DEBUG("decoding {} at {}", record_group_header, locator);
+ bool has_tail_delta = false;
+ auto maybe_headers = try_decode_record_headers(
+ record_group_header, mdbuf);
+ if (!maybe_headers) {
+        // This should be impossible; the crc on the mdbuf was already checked
+ ERROR("unable to decode headers from {} at {}",
+ record_group_header, locator);
+ ceph_abort();
+ }
+ for (auto &record_header : *maybe_headers) {
+ ceph_assert(is_valid_transaction(record_header.type));
+ if (is_background_transaction(record_header.type)) {
+ has_tail_delta = true;
+ }
+ }
+ if (has_tail_delta) {
+ bool found_delta = false;
+ auto maybe_record_deltas_list = try_decode_deltas(
+ record_group_header, mdbuf, locator.record_block_base);
+ if (!maybe_record_deltas_list) {
+ ERROR("unable to decode deltas from {} at {}",
+ record_group_header, locator);
+ ceph_abort();
+ }
+ for (auto &record_deltas : *maybe_record_deltas_list) {
+ for (auto &[ctime, delta] : record_deltas.deltas) {
+ if (delta.type == extent_types_t::JOURNAL_TAIL) {
+ found_delta = true;
+ journal_tail_delta_t tail_delta;
+ decode(tail_delta, delta.bl);
+ auto start_seq = locator.write_result.start_seq;
+ DEBUG("got {}, at {}", tail_delta, start_seq);
+ ceph_assert(tail_delta.dirty_tail != JOURNAL_SEQ_NULL);
+ ceph_assert(tail_delta.alloc_tail != JOURNAL_SEQ_NULL);
+ trimmer.update_journal_tails(
+ tail_delta.dirty_tail, tail_delta.alloc_tail);
+ }
+ }
+ }
+ ceph_assert(found_delta);
+ }
+ return seastar::now();
+ }),
+ [this, nonce=segment_header.segment_nonce](auto &cursor, auto &handler)
+ {
+ return sm_group.scan_valid_records(
+ cursor,
+ nonce,
+ std::numeric_limits<std::size_t>::max(),
+ handler).discard_result();
+ });
+}
+
+SegmentedJournal::replay_ertr::future<>
+SegmentedJournal::replay_segment(
+ journal_seq_t seq,
+ segment_header_t header,
+ delta_handler_t &handler,
+ replay_stats_t &stats)
+{
+ LOG_PREFIX(Journal::replay_segment);
+ INFO("starting at {} -- {}", seq, header);
+ return seastar::do_with(
+ scan_valid_records_cursor(seq),
+ SegmentManagerGroup::found_record_handler_t(
+ [&handler, this, &stats](
+ record_locator_t locator,
+ const record_group_header_t& header,
+ const bufferlist& mdbuf)
+ -> SegmentManagerGroup::scan_valid_records_ertr::future<>
+ {
+ LOG_PREFIX(Journal::replay_segment);
+ ++stats.num_record_groups;
+ auto maybe_record_deltas_list = try_decode_deltas(
+ header, mdbuf, locator.record_block_base);
+ if (!maybe_record_deltas_list) {
+        // This should be impossible; the crc on the mdbuf was already checked
+ ERROR("unable to decode deltas for record {} at {}",
+ header, locator);
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ return seastar::do_with(
+ std::move(*maybe_record_deltas_list),
+ [write_result=locator.write_result,
+ this,
+ FNAME,
+ &handler,
+ &stats](auto& record_deltas_list)
+ {
+ return crimson::do_for_each(
+ record_deltas_list,
+ [write_result,
+ this,
+ FNAME,
+ &handler,
+ &stats](record_deltas_t& record_deltas)
+ {
+ ++stats.num_records;
+ auto locator = record_locator_t{
+ record_deltas.record_block_base,
+ write_result
+ };
+ DEBUG("processing {} deltas at block_base {}",
+ record_deltas.deltas.size(),
+ locator);
+ return crimson::do_for_each(
+ record_deltas.deltas,
+ [locator,
+ this,
+ &handler,
+ &stats](auto &p)
+ {
+ auto& modify_time = p.first;
+ auto& delta = p.second;
+ return handler(
+ locator,
+ delta,
+ trimmer.get_dirty_tail(),
+ trimmer.get_alloc_tail(),
+ modify_time
+ ).safe_then([&stats, delta_type=delta.type](bool is_applied) {
+ if (is_applied) {
+ // see Cache::replay_delta()
+ assert(delta_type != extent_types_t::JOURNAL_TAIL);
+ if (delta_type == extent_types_t::ALLOC_INFO) {
+ ++stats.num_alloc_deltas;
+ } else {
+ ++stats.num_dirty_deltas;
+ }
+ }
+ });
+ });
+ });
+ });
+ }),
+ [=, this](auto &cursor, auto &dhandler) {
+ return sm_group.scan_valid_records(
+ cursor,
+ header.segment_nonce,
+ std::numeric_limits<size_t>::max(),
+ dhandler).safe_then([](auto){}
+ ).handle_error(
+ replay_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+          "shouldn't meet with any error other than replay_ertr"
+ }
+ );
+ }
+ );
+}
+
+SegmentedJournal::replay_ret SegmentedJournal::replay(
+ delta_handler_t &&delta_handler)
+{
+ LOG_PREFIX(Journal::replay);
+ return sm_group.find_journal_segment_headers(
+ ).safe_then([this, FNAME, delta_handler=std::move(delta_handler)]
+ (auto &&segment_headers) mutable -> replay_ret {
+ INFO("got {} segments", segment_headers.size());
+ return seastar::do_with(
+ std::move(delta_handler),
+ replay_segments_t(),
+ replay_stats_t(),
+ [this, segment_headers=std::move(segment_headers), FNAME]
+ (auto &handler, auto &segments, auto &stats) mutable -> replay_ret {
+ return prep_replay_segments(std::move(segment_headers)
+ ).safe_then([this, &handler, &segments, &stats](auto replay_segs) mutable {
+ segments = std::move(replay_segs);
+ return crimson::do_for_each(segments,[this, &handler, &stats](auto i) mutable {
+ return replay_segment(i.first, i.second, handler, stats);
+ });
+ }).safe_then([&stats, FNAME] {
+ INFO("replay done, record_groups={}, records={}, "
+ "alloc_deltas={}, dirty_deltas={}",
+ stats.num_record_groups,
+ stats.num_records,
+ stats.num_alloc_deltas,
+ stats.num_dirty_deltas);
+ });
+ });
+ });
+}
+
+seastar::future<> SegmentedJournal::flush(OrderingHandle &handle)
+{
+ LOG_PREFIX(SegmentedJournal::flush);
+ DEBUG("H{} flush ...", (void*)&handle);
+ assert(write_pipeline);
+ return handle.enter(write_pipeline->device_submission
+ ).then([this, &handle] {
+ return handle.enter(write_pipeline->finalize);
+ }).then([FNAME, &handle] {
+ DEBUG("H{} flush done", (void*)&handle);
+ });
+}
+
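+// Submission state machine: wait while the submitter is unavailable, roll to
+// a fresh segment when the record cannot fit in the current one, otherwise
+// submit and order the completion through the write pipeline before advancing
+// committed_to.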
+SegmentedJournal::submit_record_ret
+SegmentedJournal::do_submit_record(
+ record_t &&record,
+ OrderingHandle &handle)
+{
+ LOG_PREFIX(SegmentedJournal::do_submit_record);
+ if (!record_submitter.is_available()) {
+ DEBUG("H{} wait ...", (void*)&handle);
+ return record_submitter.wait_available(
+ ).safe_then([this, record=std::move(record), &handle]() mutable {
+ return do_submit_record(std::move(record), handle);
+ });
+ }
+ auto action = record_submitter.check_action(record.size);
+ if (action == RecordSubmitter::action_t::ROLL) {
+ DEBUG("H{} roll, unavailable ...", (void*)&handle);
+ return record_submitter.roll_segment(
+ ).safe_then([this, record=std::move(record), &handle]() mutable {
+ return do_submit_record(std::move(record), handle);
+ });
+ } else { // SUBMIT_FULL/NOT_FULL
+ DEBUG("H{} submit {} ...",
+ (void*)&handle,
+ action == RecordSubmitter::action_t::SUBMIT_FULL ?
+ "FULL" : "NOT_FULL");
+ auto submit_fut = record_submitter.submit(std::move(record));
+ return handle.enter(write_pipeline->device_submission
+ ).then([submit_fut=std::move(submit_fut)]() mutable {
+ return std::move(submit_fut);
+ }).safe_then([FNAME, this, &handle](record_locator_t result) {
+ return handle.enter(write_pipeline->finalize
+ ).then([FNAME, this, result, &handle] {
+ DEBUG("H{} finish with {}", (void*)&handle, result);
+ auto new_committed_to = result.write_result.get_end_seq();
+ record_submitter.update_committed_to(new_committed_to);
+ return result;
+ });
+ });
+ }
+}
+
+SegmentedJournal::submit_record_ret
+SegmentedJournal::submit_record(
+ record_t &&record,
+ OrderingHandle &handle)
+{
+ LOG_PREFIX(SegmentedJournal::submit_record);
+ DEBUG("H{} {} start ...", (void*)&handle, record);
+ assert(write_pipeline);
+ auto expected_size = record_group_size_t(
+ record.size,
+ journal_segment_allocator.get_block_size()
+ ).get_encoded_length();
+ auto max_record_length = journal_segment_allocator.get_max_write_length();
+ if (expected_size > max_record_length) {
+ ERROR("H{} {} exceeds max record size {}",
+ (void*)&handle, record, max_record_length);
+ return crimson::ct_error::erange::make();
+ }
+
+ return do_submit_record(std::move(record), handle);
+}
+
+}
diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h
new file mode 100644
index 000000000..3d580817c
--- /dev/null
+++ b/src/crimson/os/seastore/journal/segmented_journal.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+#include "include/denc.h"
+
+#include "crimson/os/seastore/async_cleaner.h"
+#include "crimson/os/seastore/journal.h"
+#include "crimson/os/seastore/segment_manager_group.h"
+#include "crimson/os/seastore/ordering_handle.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/osd/exceptions.h"
+#include "segment_allocator.h"
+#include "crimson/os/seastore/segment_seq_allocator.h"
+#include "record_submitter.h"
+
+namespace crimson::os::seastore::journal {
+/**
+ * Manages stream of atomically written records to a SegmentManager.
+ */
+class SegmentedJournal : public Journal {
+public:
+ SegmentedJournal(
+ SegmentProvider &segment_provider,
+ JournalTrimmer &trimmer);
+ ~SegmentedJournal() {}
+
+ JournalTrimmer &get_trimmer() final {
+ return trimmer;
+ }
+
+ open_for_mkfs_ret open_for_mkfs() final;
+
+ open_for_mount_ret open_for_mount() final;
+
+ close_ertr::future<> close() final;
+
+ submit_record_ret submit_record(
+ record_t &&record,
+ OrderingHandle &handle) final;
+
+ seastar::future<> flush(OrderingHandle &handle) final;
+
+ replay_ret replay(delta_handler_t &&delta_handler) final;
+
+ void set_write_pipeline(WritePipeline *_write_pipeline) final {
+ write_pipeline = _write_pipeline;
+ }
+
+ journal_type_t get_type() final {
+ return journal_type_t::SEGMENTED;
+ }
+ seastar::future<> finish_commit(transaction_type_t type) {
+ return seastar::now();
+ }
+
+private:
+ submit_record_ret do_submit_record(
+ record_t &&record,
+ OrderingHandle &handle
+ );
+
+ SegmentSeqAllocatorRef segment_seq_allocator;
+ SegmentAllocator journal_segment_allocator;
+ RecordSubmitter record_submitter;
+ SegmentManagerGroup &sm_group;
+ JournalTrimmer &trimmer;
+ WritePipeline* write_pipeline = nullptr;
+
+ /// return ordered vector of segments to replay
+ using replay_segments_t = std::vector<
+ std::pair<journal_seq_t, segment_header_t>>;
+ using prep_replay_segments_fut = replay_ertr::future<
+ replay_segments_t>;
+ prep_replay_segments_fut prep_replay_segments(
+ std::vector<std::pair<segment_id_t, segment_header_t>> segments);
+
+ /// scan the last segment for tail deltas
+ using scan_last_segment_ertr = replay_ertr;
+ scan_last_segment_ertr::future<> scan_last_segment(
+ const segment_id_t&, const segment_header_t&);
+
+ struct replay_stats_t {
+ std::size_t num_record_groups = 0;
+ std::size_t num_records = 0;
+ std::size_t num_alloc_deltas = 0;
+ std::size_t num_dirty_deltas = 0;
+ };
+
+ /// replays records starting at start through end of segment
+ replay_ertr::future<>
+ replay_segment(
+ journal_seq_t start, ///< [in] starting addr, seq
+ segment_header_t header, ///< [in] segment header
+ delta_handler_t &delta_handler, ///< [in] processes deltas in order
+ replay_stats_t &stats ///< [out] replay stats
+ );
+};
+
+}
diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc
new file mode 100644
index 000000000..d113bbd1e
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager.cc
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+
+namespace crimson::os::seastore {
+
+LBAManager::update_mappings_ret
+LBAManager::update_mappings(
+ Transaction& t,
+ const std::list<LogicalCachedExtentRef>& extents)
+{
+ return trans_intr::do_for_each(extents,
+ [this, &t](auto &extent) {
+ return update_mapping(
+ t,
+ extent->get_laddr(),
+ extent->get_prior_paddr_and_reset(),
+ extent->get_paddr(),
+ nullptr // all the extents should have already been
+ // added to the fixed_kv_btree
+ );
+ });
+}
+
+LBAManagerRef lba_manager::create_lba_manager(Cache &cache) {
+ return LBAManagerRef(new btree::BtreeLBAManager(cache));
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h
new file mode 100644
index 000000000..d7adf2304
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager.h
@@ -0,0 +1,237 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Abstract interface for managing the logical to physical mapping
+ */
+class LBAManager {
+public:
+ using base_iertr = Cache::base_iertr;
+
+ using mkfs_iertr = base_iertr;
+ using mkfs_ret = mkfs_iertr::future<>;
+ virtual mkfs_ret mkfs(
+ Transaction &t
+ ) = 0;
+
+ /**
+ * Fetches mappings for laddr_t in range [offset, offset + len)
+ *
+ * Future will not resolve until all pins have resolved (set_paddr called)
+ * For indirect lba mappings, get_mappings will always retrieve the original
+ * lba value.
+ */
+ using get_mappings_iertr = base_iertr;
+ using get_mappings_ret = get_mappings_iertr::future<lba_pin_list_t>;
+ virtual get_mappings_ret get_mappings(
+ Transaction &t,
+ laddr_t offset, extent_len_t length) = 0;
+
+ /**
+ * Fetches mappings for a list of laddr_t in range [offset, offset + len)
+ *
+ * Future will not resolve until all pins have resolved (set_paddr called)
+ * For indirect lba mappings, get_mappings will always retrieve the original
+ * lba value.
+ */
+ virtual get_mappings_ret get_mappings(
+ Transaction &t,
+    laddr_list_t &&extent_list) = 0;
+
+ /**
+ * Fetches the mapping for laddr_t
+ *
+ * Future will not resolve until the pin has resolved (set_paddr called)
+ * For indirect lba mappings, get_mapping will always retrieve the original
+ * lba value.
+ */
+ using get_mapping_iertr = base_iertr::extend<
+ crimson::ct_error::enoent>;
+ using get_mapping_ret = get_mapping_iertr::future<LBAMappingRef>;
+ virtual get_mapping_ret get_mapping(
+ Transaction &t,
+ laddr_t offset) = 0;
+
+ /**
+ * Allocates a new mapping referenced by LBARef
+ *
+ * Offset will be relative to the block offset of the record
+   * This mapping will block transaction submission until set_paddr
+ * is called on the LBAMapping.
+ */
+ using alloc_extent_iertr = base_iertr;
+ using alloc_extent_ret = alloc_extent_iertr::future<LBAMappingRef>;
+ virtual alloc_extent_ret alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr,
+ LogicalCachedExtent &nextent) = 0;
+
+ virtual alloc_extent_ret clone_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ laddr_t intermediate_key,
+ paddr_t actual_addr,
+ laddr_t intermediate_base) = 0;
+
+ virtual alloc_extent_ret reserve_region(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len) = 0;
+
+ struct ref_update_result_t {
+ unsigned refcount = 0;
+ pladdr_t addr;
+ extent_len_t length = 0;
+ };
+ using ref_iertr = base_iertr::extend<
+ crimson::ct_error::enoent>;
+ using ref_ret = ref_iertr::future<ref_update_result_t>;
+
+ /**
+ * Decrements ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret decref_extent(
+ Transaction &t,
+ laddr_t addr,
+ bool cascade_remove) = 0;
+
+ /**
+ * Increments ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr) = 0;
+
+ /**
+ * Increments ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr,
+ int delta) = 0;
+
+ /**
+ * Should be called after replay on each cached extent.
+ * Implementation must initialize the LBAMapping on any
+   * LogicalCachedExtents and may also read in any dependent
+ * structures, etc.
+ *
+ * @return returns whether the extent is alive
+ */
+ using init_cached_extent_iertr = base_iertr;
+ using init_cached_extent_ret = init_cached_extent_iertr::future<bool>;
+ virtual init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) = 0;
+
+ using check_child_trackers_ret = base_iertr::future<>;
+ virtual check_child_trackers_ret check_child_trackers(Transaction &t) = 0;
+
+ /**
+ * Calls f for each mapping in [begin, end)
+ */
+ using scan_mappings_iertr = base_iertr;
+ using scan_mappings_ret = scan_mappings_iertr::future<>;
+ using scan_mappings_func_t = std::function<
+ void(laddr_t, paddr_t, extent_len_t)>;
+ virtual scan_mappings_ret scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * rewrite extent into passed transaction
+ */
+ using rewrite_extent_iertr = base_iertr;
+ using rewrite_extent_ret = rewrite_extent_iertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) = 0;
+
+ /**
+ * update_mapping
+ *
+ * update lba mapping for a delayed allocated extent
+ */
+ using update_mapping_iertr = base_iertr;
+ using update_mapping_ret = base_iertr::future<>;
+ virtual update_mapping_ret update_mapping(
+ Transaction& t,
+ laddr_t laddr,
+ paddr_t prev_addr,
+ paddr_t paddr,
+ LogicalCachedExtent *nextent) = 0;
+
+ /**
+ * update_mappings
+ *
+ * update lba mappings for delayed allocated extents
+ */
+ using update_mappings_iertr = update_mapping_iertr;
+ using update_mappings_ret = update_mapping_ret;
+ update_mappings_ret update_mappings(
+ Transaction& t,
+ const std::list<LogicalCachedExtentRef>& extents);
+
+ /**
+ * get_physical_extent_if_live
+ *
+ * Returns extent at addr/laddr if still live (if laddr
+ * still points at addr). Extent must be an internal, physical
+ * extent.
+ *
+ * Returns a null CachedExtentRef if extent is not live.
+ */
+ using get_physical_extent_if_live_iertr = base_iertr;
+ using get_physical_extent_if_live_ret =
+ get_physical_extent_if_live_iertr::future<CachedExtentRef>;
+ virtual get_physical_extent_if_live_ret get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ extent_len_t len) = 0;
+
+ virtual ~LBAManager() {}
+};
+using LBAManagerRef = std::unique_ptr<LBAManager>;
+
+class Cache;
+namespace lba_manager {
+LBAManagerRef create_lba_manager(Cache &cache);
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
new file mode 100644
index 000000000..a607cd612
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -0,0 +1,761 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <seastar/core/metrics.hh>
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_lba);
+/*
+ * levels:
+ * - INFO: mkfs
+ * - DEBUG: modification operations
+ * - TRACE: read operations, DEBUG details
+ */
+
+namespace crimson::os::seastore {
+
+template <typename T>
+Transaction::tree_stats_t& get_tree_stats(Transaction &t)
+{
+ return t.get_lba_tree_stats();
+}
+
+template Transaction::tree_stats_t&
+get_tree_stats<
+ crimson::os::seastore::lba_manager::btree::LBABtree>(
+ Transaction &t);
+
+template <typename T>
+phy_tree_root_t& get_phy_tree_root(root_t &r)
+{
+ return r.lba_root;
+}
+
+template phy_tree_root_t&
+get_phy_tree_root<
+ crimson::os::seastore::lba_manager::btree::LBABtree>(root_t &r);
+
+template <>
+const get_phy_tree_root_node_ret get_phy_tree_root_node<
+ crimson::os::seastore::lba_manager::btree::LBABtree>(
+ const RootBlockRef &root_block, op_context_t<laddr_t> c)
+{
+ auto lba_root = root_block->lba_root_node;
+ if (lba_root) {
+ ceph_assert(lba_root->is_initial_pending()
+ == root_block->is_pending());
+ return {true,
+ trans_intr::make_interruptible(
+ c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
+ } else if (root_block->is_pending()) {
+ auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
+ lba_root = prior.lba_root_node;
+ if (lba_root) {
+ return {true,
+ trans_intr::make_interruptible(
+ c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
+ } else {
+ return {false,
+ trans_intr::make_interruptible(
+ Cache::get_extent_ertr::make_ready_future<
+ CachedExtentRef>())};
+ }
+ } else {
+ return {false,
+ trans_intr::make_interruptible(
+ Cache::get_extent_ertr::make_ready_future<
+ CachedExtentRef>())};
+ }
+}
+
+template <typename ROOT>
+void link_phy_tree_root_node(RootBlockRef &root_block, ROOT* lba_root) {
+ root_block->lba_root_node = lba_root;
+ ceph_assert(lba_root != nullptr);
+ lba_root->root_block = root_block;
+}
+
+template void link_phy_tree_root_node(
+ RootBlockRef &root_block, lba_manager::btree::LBAInternalNode* lba_root);
+template void link_phy_tree_root_node(
+ RootBlockRef &root_block, lba_manager::btree::LBALeafNode* lba_root);
+template void link_phy_tree_root_node(
+ RootBlockRef &root_block, lba_manager::btree::LBANode* lba_root);
+
+template <>
+void unlink_phy_tree_root_node<laddr_t>(RootBlockRef &root_block) {
+ root_block->lba_root_node = nullptr;
+}
+
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+BtreeLBAManager::mkfs_ret
+BtreeLBAManager::mkfs(
+ Transaction &t)
+{
+ LOG_PREFIX(BtreeLBAManager::mkfs);
+ INFOT("start", t);
+ return cache.get_root(t).si_then([this, &t](auto croot) {
+ assert(croot->is_mutation_pending());
+ croot->get_root().lba_root = LBABtree::mkfs(croot, get_context(t));
+ return mkfs_iertr::now();
+ }).handle_error_interruptible(
+ mkfs_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in BtreeLBAManager::mkfs"
+ }
+ );
+}
+
+BtreeLBAManager::get_mappings_ret
+BtreeLBAManager::get_mappings(
+ Transaction &t,
+ laddr_t offset, extent_len_t length)
+{
+ LOG_PREFIX(BtreeLBAManager::get_mappings);
+ TRACET("{}~{}", t, offset, length);
+ auto c = get_context(t);
+ return with_btree_state<LBABtree, lba_pin_list_t>(
+ cache,
+ c,
+ [c, offset, length, FNAME, this](auto &btree, auto &ret) {
+ return seastar::do_with(
+ std::list<BtreeLBAMappingRef>(),
+ [offset, length, c, FNAME, this, &ret, &btree](auto &pin_list) {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, offset),
+ [&pin_list, offset, length, c, FNAME](auto &pos) {
+ if (pos.is_end() || pos.get_key() >= (offset + length)) {
+ TRACET("{}~{} done with {} results",
+ c.trans, offset, length, pin_list.size());
+ return LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ TRACET("{}~{} got {}, {}, repeat ...",
+ c.trans, offset, length, pos.get_key(), pos.get_val());
+ ceph_assert((pos.get_key() + pos.get_val().len) > offset);
+ pin_list.push_back(pos.get_pin(c));
+ return LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ }).si_then([this, &ret, c, &pin_list] {
+ return _get_original_mappings(c, pin_list
+ ).si_then([&ret](auto _ret) {
+ ret = std::move(_ret);
+ });
+ });
+ });
+ });
+}
+
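+// Resolve indirect pins: any pin whose raw value is an laddr is replaced by
+// the physical mapping it references, rewritten via set_key_for_indirect so
+// the caller still observes the indirect key and length.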
+BtreeLBAManager::_get_original_mappings_ret
+BtreeLBAManager::_get_original_mappings(
+ op_context_t<laddr_t> c,
+ std::list<BtreeLBAMappingRef> &pin_list)
+{
+ return seastar::do_with(
+ lba_pin_list_t(),
+ [this, c, &pin_list](auto &ret) {
+ return trans_intr::do_for_each(
+ pin_list,
+ [this, c, &ret](auto &pin) {
+ LOG_PREFIX(BtreeLBAManager::get_mappings);
+ if (pin->get_raw_val().is_paddr()) {
+ ret.emplace_back(std::move(pin));
+ return get_mappings_iertr::now();
+ }
+ TRACET(
+ "getting original mapping for indirect mapping {}~{}",
+ c.trans, pin->get_key(), pin->get_length());
+ return this->get_mappings(
+ c.trans, pin->get_raw_val().get_laddr(), pin->get_length()
+ ).si_then([&pin, &ret, c](auto new_pin_list) {
+ LOG_PREFIX(BtreeLBAManager::get_mappings);
+ assert(new_pin_list.size() == 1);
+ auto &new_pin = new_pin_list.front();
+ auto intermediate_key = pin->get_raw_val().get_laddr();
+ assert(!new_pin->is_indirect());
+ assert(new_pin->get_key() <= intermediate_key);
+ assert(new_pin->get_key() + new_pin->get_length() >=
+ intermediate_key + pin->get_length());
+
+ TRACET("Got mapping {}~{} for indirect mapping {}~{}, "
+ "intermediate_key {}",
+ c.trans,
+ new_pin->get_key(), new_pin->get_length(),
+ pin->get_key(), pin->get_length(),
+ pin->get_raw_val().get_laddr());
+ auto &btree_new_pin = static_cast<BtreeLBAMapping&>(*new_pin);
+ btree_new_pin.set_key_for_indirect(
+ pin->get_key(),
+ pin->get_length(),
+ pin->get_raw_val().get_laddr());
+ ret.emplace_back(std::move(new_pin));
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all("unexpected enoent")
+ );
+ }
+ ).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+}
+
+
+BtreeLBAManager::get_mappings_ret
+BtreeLBAManager::get_mappings(
+ Transaction &t,
+ laddr_list_t &&list)
+{
+ LOG_PREFIX(BtreeLBAManager::get_mappings);
+ TRACET("{}", t, list);
+ auto l = std::make_unique<laddr_list_t>(std::move(list));
+ auto retptr = std::make_unique<lba_pin_list_t>();
+ auto &ret = *retptr;
+ return trans_intr::do_for_each(
+ l->begin(),
+ l->end(),
+ [this, &t, &ret](const auto &p) {
+ return this->get_mappings(t, p.first, p.second).si_then(
+ [&ret](auto res) {
+ ret.splice(ret.end(), res, res.begin(), res.end());
+ return get_mappings_iertr::now();
+ });
+ }).si_then([l=std::move(l), retptr=std::move(retptr)]() mutable {
+ return std::move(*retptr);
+ });
+}
+
+BtreeLBAManager::get_mapping_ret
+BtreeLBAManager::get_mapping(
+ Transaction &t,
+ laddr_t offset)
+{
+ LOG_PREFIX(BtreeLBAManager::get_mapping);
+ TRACET("{}", t, offset);
+ return _get_mapping(t, offset
+ ).si_then([](auto pin) {
+ return get_mapping_iertr::make_ready_future<LBAMappingRef>(std::move(pin));
+ });
+}
+
+BtreeLBAManager::_get_mapping_ret
+BtreeLBAManager::_get_mapping(
+ Transaction &t,
+ laddr_t offset)
+{
+ LOG_PREFIX(BtreeLBAManager::_get_mapping);
+ TRACET("{}", t, offset);
+ auto c = get_context(t);
+ return with_btree_ret<LBABtree, BtreeLBAMappingRef>(
+ cache,
+ c,
+ [FNAME, c, offset, this](auto &btree) {
+ return btree.lower_bound(
+ c, offset
+ ).si_then([FNAME, offset, c](auto iter) -> _get_mapping_ret {
+ if (iter.is_end() || iter.get_key() != offset) {
+ ERRORT("laddr={} doesn't exist", c.trans, offset);
+ return crimson::ct_error::enoent::make();
+ } else {
+ TRACET("{} got {}, {}",
+ c.trans, offset, iter.get_key(), iter.get_val());
+ auto e = iter.get_pin(c);
+ return _get_mapping_ret(
+ interruptible::ready_future_marker{},
+ std::move(e));
+ }
+ }).si_then([this, c](auto pin) -> _get_mapping_ret {
+ if (pin->get_raw_val().is_laddr()) {
+ return seastar::do_with(
+ std::move(pin),
+ [this, c](auto &pin) {
+ return _get_mapping(
+ c.trans, pin->get_raw_val().get_laddr()
+ ).si_then([&pin](auto new_pin) {
+ ceph_assert(pin->get_length() == new_pin->get_length());
+ new_pin->set_key_for_indirect(
+ pin->get_key(),
+ pin->get_length());
+ return new_pin;
+ });
+ });
+ } else {
+ return get_mapping_iertr::make_ready_future<BtreeLBAMappingRef>(std::move(pin));
+ }
+ });
+ });
+}
+
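+// Find a free laddr range by walking forward from the hint: track the end of
+// each existing mapping until a gap of at least `len` (or the end of the
+// tree) is found, then insert the new mapping at that position.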
+BtreeLBAManager::alloc_extent_ret
+BtreeLBAManager::_alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ pladdr_t addr,
+ paddr_t actual_addr,
+ laddr_t intermediate_base,
+ LogicalCachedExtent* nextent)
+{
+ struct state_t {
+ laddr_t last_end;
+
+ std::optional<typename LBABtree::iterator> insert_iter;
+ std::optional<typename LBABtree::iterator> ret;
+
+ state_t(laddr_t hint) : last_end(hint) {}
+ };
+
+ LOG_PREFIX(BtreeLBAManager::_alloc_extent);
+ TRACET("{}~{}, hint={}", t, addr, len, hint);
+ auto c = get_context(t);
+ ++stats.num_alloc_extents;
+ auto lookup_attempts = stats.num_alloc_extents_iter_nexts;
+ return crimson::os::seastore::with_btree_state<LBABtree, state_t>(
+ cache,
+ c,
+ hint,
+ [this, FNAME, c, hint, len, addr, lookup_attempts,
+ &t, nextent](auto &btree, auto &state) {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, hint),
+ [this, &state, len, addr, &t, hint, FNAME, lookup_attempts](auto &pos) {
+ ++stats.num_alloc_extents_iter_nexts;
+ if (pos.is_end()) {
+ DEBUGT("{}~{}, hint={}, state: end, done with {} attempts, insert at {}",
+ t, addr, len, hint,
+ stats.num_alloc_extents_iter_nexts - lookup_attempts,
+ state.last_end);
+ state.insert_iter = pos;
+ return typename LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ } else if (pos.get_key() >= (state.last_end + len)) {
+ DEBUGT("{}~{}, hint={}, state: {}~{}, done with {} attempts, insert at {} -- {}",
+ t, addr, len, hint,
+ pos.get_key(), pos.get_val().len,
+ stats.num_alloc_extents_iter_nexts - lookup_attempts,
+ state.last_end,
+ pos.get_val());
+ state.insert_iter = pos;
+ return typename LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ } else {
+ state.last_end = pos.get_key() + pos.get_val().len;
+ TRACET("{}~{}, hint={}, state: {}~{}, repeat ... -- {}",
+ t, addr, len, hint,
+ pos.get_key(), pos.get_val().len,
+ pos.get_val());
+ return typename LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ }
+ }).si_then([FNAME, c, addr, len, hint, &btree, &state, nextent] {
+ return btree.insert(
+ c,
+ *state.insert_iter,
+ state.last_end,
+ lba_map_val_t{len, pladdr_t(addr), 1, 0},
+ nextent
+ ).si_then([&state, FNAME, c, addr, len, hint, nextent](auto &&p) {
+ auto [iter, inserted] = std::move(p);
+ TRACET("{}~{}, hint={}, inserted at {}",
+ c.trans, addr, len, hint, state.last_end);
+ if (nextent) {
+ ceph_assert(addr.is_paddr());
+ nextent->set_laddr(iter.get_key());
+ }
+ ceph_assert(inserted);
+ state.ret = iter;
+ });
+ });
+ }).si_then([c, actual_addr, addr, intermediate_base](auto &&state) {
+ auto ret_pin = state.ret->get_pin(c);
+ if (actual_addr != P_ADDR_NULL) {
+ ceph_assert(addr.is_laddr());
+ ret_pin->set_paddr(actual_addr);
+ ret_pin->set_intermediate_base(intermediate_base);
+ } else {
+ ceph_assert(addr.is_paddr());
+ }
+ return alloc_extent_iertr::make_ready_future<LBAMappingRef>(
+ std::move(ret_pin));
+ });
+}
+
+static bool is_lba_node(const CachedExtent &e)
+{
+ return is_lba_node(e.get_type());
+}
+
+BtreeLBAManager::base_iertr::template future<>
+_init_cached_extent(
+ op_context_t<laddr_t> c,
+ const CachedExtentRef &e,
+ LBABtree &btree,
+ bool &ret)
+{
+ if (e->is_logical()) {
+ auto logn = e->cast<LogicalCachedExtent>();
+ return btree.lower_bound(
+ c,
+ logn->get_laddr()
+ ).si_then([e, c, logn, &ret](auto iter) {
+ LOG_PREFIX(BtreeLBAManager::init_cached_extent);
+ if (!iter.is_end() &&
+ iter.get_key() == logn->get_laddr() &&
+ iter.get_val().pladdr.is_paddr() &&
+ iter.get_val().pladdr.get_paddr() == logn->get_paddr()) {
+ assert(!iter.get_leaf_node()->is_pending());
+ iter.get_leaf_node()->link_child(logn.get(), iter.get_leaf_pos());
+ logn->set_laddr(iter.get_pin(c)->get_key());
+ ceph_assert(iter.get_val().len == e->get_length());
+ DEBUGT("logical extent {} live", c.trans, *logn);
+ ret = true;
+ } else {
+ DEBUGT("logical extent {} not live", c.trans, *logn);
+ ret = false;
+ }
+ });
+ } else {
+ return btree.init_cached_extent(c, e
+ ).si_then([&ret](bool is_alive) {
+ ret = is_alive;
+ });
+ }
+}
+
+BtreeLBAManager::init_cached_extent_ret
+BtreeLBAManager::init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e)
+{
+ LOG_PREFIX(BtreeLBAManager::init_cached_extent);
+ TRACET("{}", t, *e);
+ return seastar::do_with(bool(), [this, e, &t](bool &ret) {
+ auto c = get_context(t);
+ return with_btree<LBABtree>(
+ cache, c,
+ [c, e, &ret](auto &btree) -> base_iertr::future<> {
+ LOG_PREFIX(BtreeLBAManager::init_cached_extent);
+ DEBUGT("extent {}", c.trans, *e);
+ return _init_cached_extent(c, e, btree, ret);
+ }
+ ).si_then([&ret] { return ret; });
+ });
+}
+
+BtreeLBAManager::check_child_trackers_ret
+BtreeLBAManager::check_child_trackers(
+ Transaction &t) {
+ auto c = get_context(t);
+ return with_btree<LBABtree>(
+ cache, c,
+ [c](auto &btree) {
+ return btree.check_child_trackers(c);
+ });
+}
+
+BtreeLBAManager::scan_mappings_ret
+BtreeLBAManager::scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f)
+{
+ LOG_PREFIX(BtreeLBAManager::scan_mappings);
+ DEBUGT("begin: {}, end: {}", t, begin, end);
+
+ auto c = get_context(t);
+ return with_btree<LBABtree>(
+ cache,
+ c,
+ [c, f=std::move(f), begin, end](auto &btree) mutable {
+ return LBABtree::iterate_repeat(
+ c,
+ btree.upper_bound_right(c, begin),
+ [f=std::move(f), begin, end](auto &pos) {
+ if (pos.is_end() || pos.get_key() >= end) {
+ return typename LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::yes);
+ }
+ ceph_assert((pos.get_key() + pos.get_val().len) > begin);
+ f(pos.get_key(), pos.get_val().pladdr.get_paddr(), pos.get_val().len);
+ return LBABtree::iterate_repeat_ret_inner(
+ interruptible::ready_future_marker{},
+ seastar::stop_iteration::no);
+ });
+ });
+}
+
+BtreeLBAManager::rewrite_extent_ret
+BtreeLBAManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent)
+{
+ LOG_PREFIX(BtreeLBAManager::rewrite_extent);
+ if (extent->has_been_invalidated()) {
+ ERRORT("extent has been invalidated -- {}", t, *extent);
+ ceph_abort();
+ }
+ assert(!extent->is_logical());
+
+ if (is_lba_node(*extent)) {
+ DEBUGT("rewriting lba extent -- {}", t, *extent);
+ auto c = get_context(t);
+ return with_btree<LBABtree>(
+ cache,
+ c,
+ [c, extent](auto &btree) mutable {
+ return btree.rewrite_extent(c, extent);
+ });
+ } else {
+ DEBUGT("skip non lba extent -- {}", t, *extent);
+ return rewrite_extent_iertr::now();
+ }
+}
+
+BtreeLBAManager::update_mapping_ret
+BtreeLBAManager::update_mapping(
+ Transaction& t,
+ laddr_t laddr,
+ paddr_t prev_addr,
+ paddr_t addr,
+ LogicalCachedExtent *nextent)
+{
+ LOG_PREFIX(BtreeLBAManager::update_mapping);
+ TRACET("laddr={}, paddr {} => {}", t, laddr, prev_addr, addr);
+ return _update_mapping(
+ t,
+ laddr,
+ [prev_addr, addr](
+ const lba_map_val_t &in) {
+ assert(!addr.is_null());
+ lba_map_val_t ret = in;
+ ceph_assert(in.pladdr.is_paddr());
+ ceph_assert(in.pladdr.get_paddr() == prev_addr);
+ ret.pladdr = addr;
+ return ret;
+ },
+ nextent
+ ).si_then([&t, laddr, prev_addr, addr, FNAME](auto result) {
+ DEBUGT("laddr={}, paddr {} => {} done -- {}",
+ t, laddr, prev_addr, addr, result);
+ },
+ update_mapping_iertr::pass_further{},
+ /* ENOENT in particular should be impossible */
+ crimson::ct_error::assert_all{
+ "Invalid error in BtreeLBAManager::update_mapping"
+ }
+ );
+}
+
+BtreeLBAManager::get_physical_extent_if_live_ret
+BtreeLBAManager::get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ extent_len_t len)
+{
+ LOG_PREFIX(BtreeLBAManager::get_physical_extent_if_live);
+ DEBUGT("{}, laddr={}, paddr={}, length={}",
+ t, type, laddr, addr, len);
+ ceph_assert(is_lba_node(type));
+ auto c = get_context(t);
+ return with_btree_ret<LBABtree, CachedExtentRef>(
+ cache,
+ c,
+ [c, type, addr, laddr, len](auto &btree) {
+ if (type == extent_types_t::LADDR_INTERNAL) {
+ return btree.get_internal_if_live(c, addr, laddr, len);
+ } else {
+ assert(type == extent_types_t::LADDR_LEAF ||
+ type == extent_types_t::DINK_LADDR_LEAF);
+ return btree.get_leaf_if_live(c, addr, laddr, len);
+ }
+ });
+}
+
+void BtreeLBAManager::register_metrics()
+{
+ LOG_PREFIX(BtreeLBAManager::register_metrics);
+ DEBUG("start");
+ stats = {};
+ namespace sm = seastar::metrics;
+ metrics.add_group(
+ "LBA",
+ {
+ sm::make_counter(
+ "alloc_extents",
+ stats.num_alloc_extents,
+ sm::description("total number of lba alloc_extent operations")
+ ),
+ sm::make_counter(
+ "alloc_extents_iter_nexts",
+ stats.num_alloc_extents_iter_nexts,
+ sm::description("total number of iterator next operations during extent allocation")
+ ),
+ }
+ );
+}
+
+BtreeLBAManager::ref_iertr::future<std::optional<std::pair<paddr_t, extent_len_t>>>
+BtreeLBAManager::_decref_intermediate(
+ Transaction &t,
+ laddr_t addr,
+ extent_len_t len)
+{
+ auto c = get_context(t);
+ return with_btree<LBABtree>(
+ cache,
+ c,
+ [c, addr, len](auto &btree) mutable {
+ return btree.upper_bound_right(
+ c, addr
+ ).si_then([&btree, addr, len, c](auto iter) {
+ return seastar::do_with(
+ std::move(iter),
+ [&btree, addr, len, c](auto &iter) {
+ ceph_assert(!iter.is_end());
+ ceph_assert(iter.get_key() <= addr);
+ auto val = iter.get_val();
+ ceph_assert(iter.get_key() + val.len >= addr + len);
+ ceph_assert(val.pladdr.is_paddr());
+ ceph_assert(val.refcount >= 1);
+ val.refcount -= 1;
+
+ LOG_PREFIX(BtreeLBAManager::_decref_intermediate);
+ TRACET("decreased refcount of intermediate key {} -- {}",
+ c.trans,
+ iter.get_key(),
+ val);
+
+ if (!val.refcount) {
+ return btree.remove(c, iter
+ ).si_then([val] {
+ return std::make_optional<
+ std::pair<paddr_t, extent_len_t>>(
+ val.pladdr.get_paddr(), val.len);
+ });
+ } else {
+ return btree.update(c, iter, val, nullptr
+ ).si_then([](auto) {
+ return seastar::make_ready_future<
+ std::optional<std::pair<paddr_t, extent_len_t>>>(std::nullopt);
+ });
+ }
+ });
+ });
+ });
+}
+
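+// Apply a refcount delta to the mapping at addr; when the count reaches zero
+// the mapping is removed, and if it was indirect and cascade_remove is set,
+// the intermediate physical mapping is decremented as well.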
+BtreeLBAManager::update_refcount_ret
+BtreeLBAManager::update_refcount(
+ Transaction &t,
+ laddr_t addr,
+ int delta,
+ bool cascade_remove)
+{
+ LOG_PREFIX(BtreeLBAManager::update_refcount);
+ TRACET("laddr={}, delta={}", t, addr, delta);
+ return _update_mapping(
+ t,
+ addr,
+ [delta](const lba_map_val_t &in) {
+ lba_map_val_t out = in;
+ ceph_assert((int)out.refcount + delta >= 0);
+ out.refcount += delta;
+ return out;
+ },
+ nullptr
+ ).si_then([&t, addr, delta, FNAME, this, cascade_remove](auto result) {
+ DEBUGT("laddr={}, delta={} done -- {}", t, addr, delta, result);
+ auto fut = ref_iertr::make_ready_future<
+ std::optional<std::pair<paddr_t, extent_len_t>>>();
+ if (!result.refcount && result.pladdr.is_laddr() && cascade_remove) {
+ fut = _decref_intermediate(
+ t,
+ result.pladdr.get_laddr(),
+ result.len
+ );
+ }
+ return fut.si_then([result](auto removed) {
+ if (result.pladdr.is_laddr()
+ && removed) {
+ return ref_update_result_t{
+ result.refcount,
+ removed->first,
+ removed->second};
+ } else {
+ return ref_update_result_t{
+ result.refcount,
+ result.pladdr,
+ result.len
+ };
+ }
+ });
+ });
+}
+
+BtreeLBAManager::_update_mapping_ret
+BtreeLBAManager::_update_mapping(
+ Transaction &t,
+ laddr_t addr,
+ update_func_t &&f,
+ LogicalCachedExtent* nextent)
+{
+ auto c = get_context(t);
+ return with_btree_ret<LBABtree, lba_map_val_t>(
+ cache,
+ c,
+ [f=std::move(f), c, addr, nextent](auto &btree) mutable {
+ return btree.lower_bound(
+ c, addr
+ ).si_then([&btree, f=std::move(f), c, addr, nextent](auto iter)
+ -> _update_mapping_ret {
+ if (iter.is_end() || iter.get_key() != addr) {
+ LOG_PREFIX(BtreeLBAManager::_update_mapping);
+ ERRORT("laddr={} doesn't exist", c.trans, addr);
+ return crimson::ct_error::enoent::make();
+ }
+
+ auto ret = f(iter.get_val());
+ if (ret.refcount == 0) {
+ return btree.remove(
+ c,
+ iter
+ ).si_then([ret] {
+ return ret;
+ });
+ } else {
+ return btree.update(
+ c,
+ iter,
+ ret,
+ nextent
+ ).si_then([ret](auto) {
+ return ret;
+ });
+ }
+ });
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
new file mode 100644
index 000000000..892600ed0
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -0,0 +1,396 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/btree/fixed_kv_btree.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/cache.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
+// To support cloning, there are two kinds of lba mappings:
+// 1. physical lba mapping: the pladdr in the value of which is the paddr of
+// the corresponding extent;
+// 2. indirect lba mapping: the pladdr in the value of which is an laddr pointing
+// to the physical lba mapping that's pointing to the actual paddr of the
+// extent being searched;
+//
+// Accordingly, a BtreeLBAMapping may also work in two modes: indirect or direct
+// 1. BtreeLBAMappings that come from querying an indirect lba mapping in the lba tree
+//    are indirect;
+// 2. BtreeLBAMappings that come from querying a physical lba mapping in the lba tree
+// are direct.
+//
+// For direct BtreeLBAMappings, there are two important fields:
+// 1. key: the laddr of the lba mapping being queried;
+// 2. paddr: the paddr recorded in the value of the lba mapping being queried.
+// For indirect BtreeLBAMappings, there are five important fields:
+// 1. key: the laddr key of the lba entry being queried;
+// 2. intermediate_key: the laddr within the scope of the physical lba mapping
+// that the current indirect lba mapping points to; although an indirect mapping
+// points to the start of the physical lba mapping, it may change to other
+// laddr after remap
+// 3. intermediate_base: the laddr key of the physical lba mapping, intermediate_key
+// and intermediate_base should be the same when doing cloning
+// 4. intermediate_offset: intermediate_key - intermediate_base
+// 5. paddr: the paddr recorded in the physical lba mapping pointed to by the
+// indirect lba mapping being queried;
+//
+// NOTE THAT, for direct BtreeLBAMappings, their intermediate_keys are the same as
+// their keys.
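+//
+// Illustration (hypothetical addresses): if extent E has a physical mapping
+// laddr 0x1000 -> paddr P, a clone of E at laddr 0x5000 is stored as an
+// indirect mapping 0x5000 -> laddr 0x1000. Querying 0x5000 then yields an
+// indirect BtreeLBAMapping with key=0x5000, intermediate_base=0x1000,
+// intermediate_key=0x1000 (until a remap shifts it) and paddr=P.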
+public:
+ BtreeLBAMapping(op_context_t<laddr_t> ctx)
+ : BtreeNodeMapping(ctx) {}
+ BtreeLBAMapping(
+ op_context_t<laddr_t> c,
+ CachedExtentRef parent,
+ uint16_t pos,
+ lba_map_val_t &val,
+ lba_node_meta_t meta)
+ : BtreeNodeMapping(
+ c,
+ parent,
+ pos,
+ val.pladdr.is_paddr() ? val.pladdr.get_paddr() : P_ADDR_NULL,
+ val.len,
+ meta),
+ key(meta.begin),
+ indirect(val.pladdr.is_laddr() ? true : false),
+ intermediate_key(indirect ? val.pladdr.get_laddr() : L_ADDR_NULL),
+ intermediate_length(indirect ? val.len : 0),
+ raw_val(val.pladdr),
+ map_val(val)
+ {}
+
+ lba_map_val_t get_map_val() const {
+ return map_val;
+ }
+
+ bool is_indirect() const final {
+ return indirect;
+ }
+
+ void set_key_for_indirect(
+ laddr_t new_key,
+ extent_len_t length,
+ laddr_t interkey = L_ADDR_NULL)
+ {
+ turn_indirect(interkey);
+ key = new_key;
+ intermediate_length = len;
+ len = length;
+ }
+
+ laddr_t get_key() const final {
+ return key;
+ }
+
+ pladdr_t get_raw_val() const {
+ return raw_val;
+ }
+
+ void set_paddr(paddr_t addr) {
+ value = addr;
+ }
+
+ laddr_t get_intermediate_key() const final {
+ assert(is_indirect());
+ assert(intermediate_key != L_ADDR_NULL);
+ return intermediate_key;
+ }
+
+ laddr_t get_intermediate_base() const final {
+ assert(is_indirect());
+ assert(intermediate_base != L_ADDR_NULL);
+ return intermediate_base;
+ }
+
+ extent_len_t get_intermediate_offset() const final {
+ assert(intermediate_key >= intermediate_base);
+ assert((intermediate_key == L_ADDR_NULL)
+ == (intermediate_base == L_ADDR_NULL));
+ return intermediate_key - intermediate_base;
+ }
+
+ extent_len_t get_intermediate_length() const final {
+ assert(is_indirect());
+ assert(intermediate_length);
+ return intermediate_length;
+ }
+
+ void set_intermediate_base(laddr_t base) {
+ intermediate_base = base;
+ }
+
+protected:
+ std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate(
+ op_context_t<laddr_t> ctx) const final {
+ auto pin = std::unique_ptr<BtreeLBAMapping>(new BtreeLBAMapping(ctx));
+ pin->key = key;
+ pin->intermediate_base = intermediate_base;
+ pin->intermediate_key = intermediate_key;
+ pin->indirect = indirect;
+ pin->raw_val = raw_val;
+ pin->map_val = map_val;
+ return pin;
+ }
+private:
+ void turn_indirect(laddr_t interkey) {
+ assert(value.is_paddr());
+ intermediate_base = key;
+ intermediate_key = (interkey == L_ADDR_NULL ? key : interkey);
+ indirect = true;
+ }
+ laddr_t key = L_ADDR_NULL;
+ bool indirect = false;
+ laddr_t intermediate_key = L_ADDR_NULL;
+ laddr_t intermediate_base = L_ADDR_NULL;
+ extent_len_t intermediate_length = 0;
+ pladdr_t raw_val;
+ lba_map_val_t map_val;
+};
+
+using BtreeLBAMappingRef = std::unique_ptr<BtreeLBAMapping>;
+
+using LBABtree = FixedKVBtree<
+ laddr_t, lba_map_val_t, LBAInternalNode,
+ LBALeafNode, BtreeLBAMapping, LBA_BLOCK_SIZE, true>;
+
+/**
+ * BtreeLBAManager
+ *
+ * Uses a wandering btree to track two things:
+ * 1) lba state including laddr_t -> paddr_t mapping
+ * 2) reverse paddr_t -> laddr_t mapping for gc (TODO)
+ *
+ * Generally, any transaction will involve
+ * 1) deltas against lba tree nodes
+ * 2) new lba tree nodes
+ * - Note, there must necessarily be a delta linking
+ * these new nodes into the tree -- might be a
+ * bootstrap_state_t delta if new root
+ *
+ * get_mappings, alloc_extent_*, etc populate a Transaction
+ * which then gets submitted
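+ *
+ * Typical flow (sketch): a caller allocates a mapping via alloc_extent() or
+ * reserve_region(), later adjusts it with incref_extent()/decref_extent() or
+ * update_mapping(), and the resulting lba tree deltas and fresh nodes ride
+ * along with the Transaction until it is submitted.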
+ */
+class BtreeLBAManager : public LBAManager {
+public:
+ BtreeLBAManager(Cache &cache)
+ : cache(cache)
+ {
+ register_metrics();
+ }
+
+ mkfs_ret mkfs(
+ Transaction &t) final;
+
+ get_mappings_ret get_mappings(
+ Transaction &t,
+ laddr_t offset, extent_len_t length) final;
+
+ get_mappings_ret get_mappings(
+ Transaction &t,
+ laddr_list_t &&list) final;
+
+ get_mapping_ret get_mapping(
+ Transaction &t,
+ laddr_t offset) final;
+
+ alloc_extent_ret reserve_region(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len)
+ {
+ return _alloc_extent(
+ t,
+ hint,
+ len,
+ P_ADDR_ZERO,
+ P_ADDR_NULL,
+ L_ADDR_NULL,
+ nullptr);
+ }
+
+ alloc_extent_ret clone_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ laddr_t intermediate_key,
+ paddr_t actual_addr,
+ laddr_t intermediate_base)
+ {
+ return _alloc_extent(
+ t,
+ hint,
+ len,
+ intermediate_key,
+ actual_addr,
+ intermediate_base,
+ nullptr);
+ }
+
+ alloc_extent_ret alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr,
+ LogicalCachedExtent &ext) final
+ {
+ return _alloc_extent(
+ t,
+ hint,
+ len,
+ addr,
+ P_ADDR_NULL,
+ L_ADDR_NULL,
+ &ext);
+ }
+
+ ref_ret decref_extent(
+ Transaction &t,
+ laddr_t addr,
+ bool cascade_remove) final {
+ return update_refcount(t, addr, -1, cascade_remove);
+ }
+
+ ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr) final {
+ return update_refcount(t, addr, 1, false);
+ }
+
+ ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr,
+ int delta) final {
+ ceph_assert(delta > 0);
+ return update_refcount(t, addr, delta, false);
+ }
+
+ /**
+ * init_cached_extent
+ *
+ * Checks whether e is live (reachable from lba tree) and drops or initializes
+ * accordingly.
+ *
+   * Returns whether e is live.
+ */
+ init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) final;
+
+ check_child_trackers_ret check_child_trackers(Transaction &t) final;
+
+ scan_mappings_ret scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f) final;
+
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) final;
+
+ update_mapping_ret update_mapping(
+ Transaction& t,
+ laddr_t laddr,
+ paddr_t prev_addr,
+ paddr_t paddr,
+ LogicalCachedExtent*) final;
+
+ get_physical_extent_if_live_ret get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ extent_len_t len) final;
+private:
+ Cache &cache;
+
+
+ struct {
+ uint64_t num_alloc_extents = 0;
+ uint64_t num_alloc_extents_iter_nexts = 0;
+ } stats;
+
+ op_context_t<laddr_t> get_context(Transaction &t) {
+ return op_context_t<laddr_t>{cache, t};
+ }
+
+ seastar::metrics::metric_group metrics;
+ void register_metrics();
+
+ /**
+ * update_refcount
+ *
+ * Updates refcount, returns resulting refcount
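+   *
+   * e.g. decref_extent() calls update_refcount(t, addr, -1, cascade_remove);
+   * a refcount that drops to 0 removes the mapping (see _update_mapping()).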
+ */
+ using update_refcount_ret = ref_ret;
+ update_refcount_ret update_refcount(
+ Transaction &t,
+ laddr_t addr,
+ int delta,
+ bool cascade_remove);
+
+ /**
+ * _update_mapping
+ *
+   * Updates the mapping at addr via f; if the updated value's refcount
+   * is 0, the mapping is removed instead of updated
+ */
+ using _update_mapping_iertr = ref_iertr;
+ using _update_mapping_ret = ref_iertr::future<lba_map_val_t>;
+ using update_func_t = std::function<
+ lba_map_val_t(const lba_map_val_t &v)
+ >;
+ _update_mapping_ret _update_mapping(
+ Transaction &t,
+ laddr_t addr,
+ update_func_t &&f,
+ LogicalCachedExtent*);
+
+ alloc_extent_ret _alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ pladdr_t addr,
+ paddr_t actual_addr,
+ laddr_t intermediate_base,
+ LogicalCachedExtent*);
+
+ using _get_mapping_ret = get_mapping_iertr::future<BtreeLBAMappingRef>;
+ _get_mapping_ret _get_mapping(
+ Transaction &t,
+ laddr_t offset);
+
+ using _get_original_mappings_ret = get_mappings_ret;
+ _get_original_mappings_ret _get_original_mappings(
+ op_context_t<laddr_t> c,
+ std::list<BtreeLBAMappingRef> &pin_list);
+
+ ref_iertr::future<std::optional<std::pair<paddr_t, extent_len_t>>>
+ _decref_intermediate(
+ Transaction &t,
+ laddr_t addr,
+ extent_len_t len);
+};
+using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc
new file mode 100644
index 000000000..66dc94394
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_lba);
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v)
+{
+ return out << "lba_map_val_t("
+ << v.pladdr
+ << "~" << v.len
+ << ", refcount=" << v.refcount
+ << ", checksum=" << v.checksum
+ << ")";
+}
+
+std::ostream &LBALeafNode::_print_detail(std::ostream &out) const
+{
+ out << ", size=" << this->get_size()
+ << ", meta=" << this->get_meta()
+ << ", my_tracker=" << (void*)this->my_tracker;
+ if (this->my_tracker) {
+ out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get();
+ }
+ return out << ", root_block=" << (void*)this->root_block.get();
+}
+
+void LBALeafNode::resolve_relative_addrs(paddr_t base)
+{
+ LOG_PREFIX(LBALeafNode::resolve_relative_addrs);
+ for (auto i: *this) {
+ auto val = i->get_val();
+ if (val.pladdr.is_paddr() &&
+ val.pladdr.get_paddr().is_relative()) {
+ val.pladdr = base.add_relative(val.pladdr.get_paddr());
+ TRACE("{} -> {}", i->get_val().pladdr, val.pladdr);
+ i->set_val(val);
+ }
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
new file mode 100644
index 000000000..ffce2c1b5
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -0,0 +1,294 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+#include "crimson/os/seastore/btree/fixed_kv_btree.h"
+#include "crimson/os/seastore/btree/fixed_kv_node.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+using base_iertr = LBAManager::base_iertr;
+using LBANode = FixedKVNode<laddr_t>;
+
+/**
+ * lba_map_val_t
+ *
+ * struct representing a single lba mapping
+ */
+struct lba_map_val_t {
+ extent_len_t len = 0; ///< length of mapping
+ pladdr_t pladdr; ///< physical addr of mapping or
+                         // laddr of a physical lba mapping (see btree_lba_manager.h)
+ uint32_t refcount = 0; ///< refcount
+ uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO)
+
+ lba_map_val_t() = default;
+ lba_map_val_t(
+ extent_len_t len,
+ pladdr_t pladdr,
+ uint32_t refcount,
+ uint32_t checksum)
+ : len(len), pladdr(pladdr), refcount(refcount), checksum(checksum) {}
+ bool operator==(const lba_map_val_t&) const = default;
+};
+
+std::ostream& operator<<(std::ostream& out, const lba_map_val_t&);
+
+constexpr size_t LBA_BLOCK_SIZE = 4096;
+
+using lba_node_meta_t = fixed_kv_node_meta_t<laddr_t>;
+
+using lba_node_meta_le_t = fixed_kv_node_meta_le_t<laddr_le_t>;
+
+/**
+ * LBAInternalNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ * meta : lba_node_meta_le_t[3] (1*24)b
+ *   keys      : laddr_t[254]               (254*8)b
+ *   values    : paddr_t[254]               (254*8)b
+ * = 4096
+
+ * TODO: make the above capacity calculation part of FixedKVNodeLayout
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t INTERNAL_NODE_CAPACITY = 254;
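+// Layout check: 4 (size) + 4 (padding) + 24 (meta) + 254*8 (keys)
+// + 254*8 (values) = 4096 bytes, exactly one LBA_BLOCK_SIZE block.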
+struct LBAInternalNode
+ : FixedKVInternalNode<
+ INTERNAL_NODE_CAPACITY,
+ laddr_t, laddr_le_t,
+ LBA_BLOCK_SIZE,
+ LBAInternalNode> {
+ using Ref = TCachedExtentRef<LBAInternalNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBAInternalNode(T&&... t) :
+ FixedKVInternalNode(std::forward<T>(t)...) {}
+
+ static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL;
+
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+};
+using LBAInternalNodeRef = LBAInternalNode::Ref;
+
+/**
+ * LBALeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ * meta : lba_node_meta_le_t[3] (1*24)b
+ *   keys      : laddr_t[140]               (140*8)b
+ *   values    : lba_map_val_t[140]         (140*21)b
+ * = 4092
+ *
+ * TODO: update FixedKVNodeLayout to handle the above calculation
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t LEAF_NODE_CAPACITY = 140;
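+// Layout check: 4 (size) + 4 (padding) + 24 (meta) + 140*8 (keys)
+// + 140*21 (values) = 4092 bytes, which fits the 4096-byte LBA_BLOCK_SIZE.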
+
+/**
+ * lba_map_val_le_t
+ *
+ * On disk layout for lba_map_val_t.
+ */
+struct lba_map_val_le_t {
+ extent_len_le_t len = init_extent_len_le(0);
+ pladdr_le_t pladdr;
+ ceph_le32 refcount{0};
+ ceph_le32 checksum{0};
+
+ lba_map_val_le_t() = default;
+ lba_map_val_le_t(const lba_map_val_le_t &) = default;
+ explicit lba_map_val_le_t(const lba_map_val_t &val)
+ : len(init_extent_len_le(val.len)),
+ pladdr(pladdr_le_t(val.pladdr)),
+ refcount(val.refcount),
+ checksum(val.checksum) {}
+
+ operator lba_map_val_t() const {
+ return lba_map_val_t{ len, pladdr, refcount, checksum };
+ }
+};
+
+struct LBALeafNode
+ : FixedKVLeafNode<
+ LEAF_NODE_CAPACITY,
+ laddr_t, laddr_le_t,
+ lba_map_val_t, lba_map_val_le_t,
+ LBA_BLOCK_SIZE,
+ LBALeafNode,
+ true> {
+ using Ref = TCachedExtentRef<LBALeafNode>;
+ using parent_type_t = FixedKVLeafNode<
+ LEAF_NODE_CAPACITY,
+ laddr_t, laddr_le_t,
+ lba_map_val_t, lba_map_val_le_t,
+ LBA_BLOCK_SIZE,
+ LBALeafNode,
+ true>;
+ using internal_const_iterator_t =
+ typename parent_type_t::node_layout_t::const_iterator;
+ using internal_iterator_t =
+ typename parent_type_t::node_layout_t::iterator;
+ template <typename... T>
+ LBALeafNode(T&&... t) :
+ parent_type_t(std::forward<T>(t)...) {}
+
+ static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF;
+
+ bool validate_stable_children() final {
+ LOG_PREFIX(LBALeafNode::validate_stable_children);
+ if (this->children.empty()) {
+ return false;
+ }
+
+ for (auto i : *this) {
+ auto child = (LogicalCachedExtent*)this->children[i.get_offset()];
+ if (is_valid_child_ptr(child) && child->get_laddr() != i.get_key()) {
+ SUBERROR(seastore_fixedkv_tree,
+ "stable child not valid: child {}, key {}",
+ *child,
+ i.get_key());
+ ceph_abort();
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void update(
+ internal_const_iterator_t iter,
+ lba_map_val_t val,
+ LogicalCachedExtent* nextent) final {
+ LOG_PREFIX(LBALeafNode::update);
+ if (nextent) {
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ *nextent);
+ // child-ptr may already be correct, see LBAManager::update_mappings()
+ this->update_child_ptr(iter, nextent);
+ }
+ if (val.pladdr.is_paddr()) {
+ val.pladdr = maybe_generate_relative(val.pladdr.get_paddr());
+ }
+ return this->journal_update(
+ iter,
+ val,
+ this->maybe_get_delta_buffer());
+ }
+
+ internal_const_iterator_t insert(
+ internal_const_iterator_t iter,
+ laddr_t addr,
+ lba_map_val_t val,
+ LogicalCachedExtent* nextent) final {
+ LOG_PREFIX(LBALeafNode::insert);
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, extent {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ addr,
+ (void*)nextent);
+ this->insert_child_ptr(iter, nextent);
+ if (val.pladdr.is_paddr()) {
+ val.pladdr = maybe_generate_relative(val.pladdr.get_paddr());
+ }
+ this->journal_insert(
+ iter,
+ addr,
+ val,
+ this->maybe_get_delta_buffer());
+ return iter;
+ }
+
+ void remove(internal_const_iterator_t iter) final {
+ LOG_PREFIX(LBALeafNode::remove);
+ SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}",
+ this->pending_for_transaction,
+ iter.get_offset(),
+ iter.get_key());
+ assert(iter != this->end());
+ this->remove_child_ptr(iter);
+ return this->journal_remove(
+ iter,
+ this->maybe_get_delta_buffer());
+ }
+
+ // See LBAInternalNode, same concept
+ void resolve_relative_addrs(paddr_t base);
+ void node_resolve_vals(
+ internal_iterator_t from,
+ internal_iterator_t to) const final
+ {
+ if (this->is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.pladdr.is_paddr()
+ && val.pladdr.get_paddr().is_relative()) {
+ assert(val.pladdr.get_paddr().is_block_relative());
+ val.pladdr = this->get_paddr().add_relative(val.pladdr.get_paddr());
+ i->set_val(val);
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(
+ internal_iterator_t from,
+ internal_iterator_t to) const final
+ {
+ if (this->is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.pladdr.is_paddr()
+ && val.pladdr.get_paddr().is_relative()) {
+ assert(val.pladdr.get_paddr().is_record_relative());
+ val.pladdr = val.pladdr.get_paddr().block_relative_to(this->get_paddr());
+ i->set_val(val);
+ }
+ }
+ }
+ }
+
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ std::ostream &_print_detail(std::ostream &out) const final;
+};
+using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::lba_node_meta_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::lba_map_val_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::LBAInternalNode> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::LBALeafNode> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/logging.h b/src/crimson/os/seastore/logging.h
new file mode 100644
index 000000000..3f12ee72c
--- /dev/null
+++ b/src/crimson/os/seastore/logging.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <fmt/format.h>
+
+#include "crimson/common/log.h"
+
+#define LOGT(level_, MSG, t, ...) \
+ LOCAL_LOGGER.log(level_, "{} trans.{} {}: " MSG, (void*)&t, \
+ (t).get_trans_id(), FNAME , ##__VA_ARGS__)
+#define SUBLOGT(subname_, level_, MSG, t, ...) \
+ LOGGER(subname_).log(level_, "{} trans.{} {}: " MSG, (void*)&t, \
+ (t).get_trans_id(), FNAME , ##__VA_ARGS__)
+
+#define TRACET(...) LOGT(seastar::log_level::trace, __VA_ARGS__)
+#define SUBTRACET(subname_, ...) SUBLOGT(subname_, seastar::log_level::trace, __VA_ARGS__)
+
+#define DEBUGT(...) LOGT(seastar::log_level::debug, __VA_ARGS__)
+#define SUBDEBUGT(subname_, ...) SUBLOGT(subname_, seastar::log_level::debug, __VA_ARGS__)
+
+#define INFOT(...) LOGT(seastar::log_level::info, __VA_ARGS__)
+#define SUBINFOT(subname_, ...) SUBLOGT(subname_, seastar::log_level::info, __VA_ARGS__)
+
+#define WARNT(...) LOGT(seastar::log_level::warn, __VA_ARGS__)
+#define SUBWARNT(subname_, ...) SUBLOGT(subname_, seastar::log_level::warn, __VA_ARGS__)
+
+#define ERRORT(...) LOGT(seastar::log_level::error, __VA_ARGS__)
+#define SUBERRORT(subname_, ...) SUBLOGT(subname_, seastar::log_level::error, __VA_ARGS__)
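+
+// Usage sketch (assuming the including .cc sets up SET_SUBSYS()/LOCAL_LOGGER
+// and LOG_PREFIX()/FNAME as the seastore sources do):
+//   DEBUGT("read {}~{}", t, offset, len);            // default subsystem logger
+//   SUBDEBUGT(seastore_lba, "remove {}", t, laddr);  // named subsystem logger
+// Both prefix the message with the transaction address, transaction id and FNAME.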
diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc
new file mode 100644
index 000000000..0d852696b
--- /dev/null
+++ b/src/crimson/os/seastore/object_data_handler.cc
@@ -0,0 +1,1638 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <utility>
+#include <functional>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/object_data_handler.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_seastore_odata);
+ }
+}
+
+SET_SUBSYS(seastore_odata);
+
+namespace crimson::os::seastore {
+#define assert_aligned(x) ceph_assert(((x)%ctx.tm.get_block_size()) == 0)
+
+using context_t = ObjectDataHandler::context_t;
+using get_iertr = ObjectDataHandler::write_iertr;
+
+/**
+ * extent_to_write_t
+ *
+ * Encapsulates the smallest units of write operations during an overwrite.
+ * Represents a zero/existing extent or a data extent depending on whether
+ * to_write is populated.
+ * Should be handled by prepare_ops_list.
+ */
+struct extent_to_write_t {
+ enum class type_t {
+ DATA,
+ ZERO,
+ EXISTING,
+ };
+ type_t type;
+
+ /// pin of original extent, not nullptr if type == EXISTING
+ LBAMappingRef pin;
+
+ laddr_t addr;
+ extent_len_t len;
+
+ /// non-nullopt if and only if type == DATA
+ std::optional<bufferlist> to_write;
+
+ extent_to_write_t(const extent_to_write_t &) = delete;
+ extent_to_write_t(extent_to_write_t &&) = default;
+
+ bool is_data() const {
+ return type == type_t::DATA;
+ }
+
+ bool is_zero() const {
+ return type == type_t::ZERO;
+ }
+
+ bool is_existing() const {
+ return type == type_t::EXISTING;
+ }
+
+ laddr_t get_end_addr() const {
+ return addr + len;
+ }
+
+ static extent_to_write_t create_data(
+ laddr_t addr, bufferlist to_write) {
+ return extent_to_write_t(addr, to_write);
+ }
+
+ static extent_to_write_t create_zero(
+ laddr_t addr, extent_len_t len) {
+ return extent_to_write_t(addr, len);
+ }
+
+ static extent_to_write_t create_existing(
+ LBAMappingRef &&pin, laddr_t addr, extent_len_t len) {
+ assert(pin);
+ return extent_to_write_t(std::move(pin), addr, len);
+ }
+
+private:
+ extent_to_write_t(laddr_t addr, bufferlist to_write)
+ : type(type_t::DATA), addr(addr), len(to_write.length()),
+ to_write(to_write) {}
+
+ extent_to_write_t(laddr_t addr, extent_len_t len)
+ : type(type_t::ZERO), addr(addr), len(len) {}
+
+ extent_to_write_t(LBAMappingRef &&pin, laddr_t addr, extent_len_t len)
+ : type(type_t::EXISTING), pin(std::move(pin)), addr(addr), len(len) {}
+};
+using extent_to_write_list_t = std::list<extent_to_write_t>;
+
+// Encapsulates extents to be written out using do_remappings.
+struct extent_to_remap_t {
+ enum class type_t {
+ REMAP,
+ OVERWRITE
+ };
+ type_t type;
+ /// pin of original extent
+ LBAMappingRef pin;
+  /// offset of the remapped extent, or of the overwritten part of an
+  /// overwrite extent; the overwritten part might correspond to multiple
+  /// fresh write extents.
+ extent_len_t new_offset;
+ /// length of remapped extent or overwrite part of overwrite extent
+ extent_len_t new_len;
+
+ extent_to_remap_t(const extent_to_remap_t &) = delete;
+ extent_to_remap_t(extent_to_remap_t &&) = default;
+
+ bool is_remap() const {
+ return type == type_t::REMAP;
+ }
+
+ bool is_overwrite() const {
+ assert((new_offset != 0) && (pin->get_length() != new_offset + new_len));
+ return type == type_t::OVERWRITE;
+ }
+
+ using remap_entry = TransactionManager::remap_entry;
+ remap_entry create_remap_entry() {
+ assert(is_remap());
+ return remap_entry(
+ new_offset,
+ new_len);
+ }
+
+ remap_entry create_left_remap_entry() {
+ assert(is_overwrite());
+ return remap_entry(
+ 0,
+ new_offset);
+ }
+
+ remap_entry create_right_remap_entry() {
+ assert(is_overwrite());
+ return remap_entry(
+ new_offset + new_len,
+ pin->get_length() - new_offset - new_len);
+ }
+
+ static extent_to_remap_t create_remap(
+ LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) {
+ return extent_to_remap_t(type_t::REMAP,
+ std::move(pin), new_offset, new_len);
+ }
+
+ static extent_to_remap_t create_overwrite(
+ LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) {
+ return extent_to_remap_t(type_t::OVERWRITE,
+ std::move(pin), new_offset, new_len);
+ }
+
+private:
+ extent_to_remap_t(type_t type,
+ LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len)
+ : type(type),
+ pin(std::move(pin)), new_offset(new_offset), new_len(new_len) {}
+};
+using extent_to_remap_list_t = std::list<extent_to_remap_t>;
+
+// Encapsulates extents to be written out using do_insertions.
+struct extent_to_insert_t {
+ enum class type_t {
+ DATA,
+ ZERO
+ };
+ type_t type;
+ /// laddr of new extent
+ laddr_t addr;
+ /// length of new extent
+ extent_len_t len;
+ /// non-nullopt if type == DATA
+ std::optional<bufferlist> bl;
+
+ extent_to_insert_t(const extent_to_insert_t &) = default;
+ extent_to_insert_t(extent_to_insert_t &&) = default;
+
+ bool is_data() const {
+ return type == type_t::DATA;
+ }
+
+ bool is_zero() const {
+ return type == type_t::ZERO;
+ }
+
+ static extent_to_insert_t create_data(
+ laddr_t addr, extent_len_t len, std::optional<bufferlist> bl) {
+ return extent_to_insert_t(addr, len, bl);
+ }
+
+ static extent_to_insert_t create_zero(
+ laddr_t addr, extent_len_t len) {
+ return extent_to_insert_t(addr, len);
+ }
+
+private:
+ extent_to_insert_t(laddr_t addr, extent_len_t len,
+ std::optional<bufferlist> bl)
+ :type(type_t::DATA), addr(addr), len(len), bl(bl) {}
+
+ extent_to_insert_t(laddr_t addr, extent_len_t len)
+ :type(type_t::ZERO), addr(addr), len(len) {}
+};
+using extent_to_insert_list_t = std::list<extent_to_insert_t>;
+
+// Encapsulates extents to be retired in do_removals.
+using extent_to_remove_list_t = std::list<LBAMappingRef>;
+
+struct overwrite_ops_t {
+ extent_to_remap_list_t to_remap;
+ extent_to_insert_list_t to_insert;
+ extent_to_remove_list_t to_remove;
+};
+
+// prepare the to_remap, to_remove and to_insert lists
+overwrite_ops_t prepare_ops_list(
+ lba_pin_list_t &pins_to_remove,
+ extent_to_write_list_t &to_write) {
+ assert(pins_to_remove.size() != 0);
+ overwrite_ops_t ops;
+ ops.to_remove.swap(pins_to_remove);
+ if (to_write.empty()) {
+ logger().debug("empty to_write");
+ return ops;
+ }
+ long unsigned int visitted = 0;
+ auto& front = to_write.front();
+ auto& back = to_write.back();
+
+  // prepare an overwrite that happens within a single original extent.
+ if (ops.to_remove.size() == 1 &&
+ front.is_existing() && back.is_existing()) {
+ visitted += 2;
+ assert(to_write.size() > 2);
+ assert(front.addr == front.pin->get_key());
+ assert(back.addr > back.pin->get_key());
+ ops.to_remap.push_back(extent_to_remap_t::create_overwrite(
+ std::move(front.pin),
+ front.len,
+ back.addr - front.addr - front.len));
+ ops.to_remove.pop_front();
+ } else {
+    // prepare to_remap entries; these may come from one or multiple extents
+ if (front.is_existing()) {
+ visitted++;
+ assert(to_write.size() > 1);
+ assert(front.addr == front.pin->get_key());
+ ops.to_remap.push_back(extent_to_remap_t::create_remap(
+ std::move(front.pin),
+ 0,
+ front.len));
+ ops.to_remove.pop_front();
+ }
+ if (back.is_existing()) {
+ visitted++;
+ assert(to_write.size() > 1);
+ assert(back.addr + back.len ==
+ back.pin->get_key() + back.pin->get_length());
+ ops.to_remap.push_back(extent_to_remap_t::create_remap(
+ std::move(back.pin),
+ back.addr - back.pin->get_key(),
+ back.len));
+ ops.to_remove.pop_back();
+ }
+ }
+
+ // prepare to_insert
+ for (auto &region : to_write) {
+ if (region.is_data()) {
+ visitted++;
+ assert(region.to_write.has_value());
+ ops.to_insert.push_back(extent_to_insert_t::create_data(
+ region.addr, region.len, region.to_write));
+ } else if (region.is_zero()) {
+ visitted++;
+ assert(!(region.to_write.has_value()));
+ ops.to_insert.push_back(extent_to_insert_t::create_zero(
+ region.addr, region.len));
+ }
+ }
+
+ logger().debug(
+ "to_remap list size: {}"
+ " to_insert list size: {}"
+ " to_remove list size: {}",
+ ops.to_remap.size(), ops.to_insert.size(), ops.to_remove.size());
+ assert(visitted == to_write.size());
+ return ops;
+}
+
+/**
+ * append_extent_to_write
+ *
+ * Appends the passed extent_to_write_t while maintaining the invariant
+ * that the list never contains consecutive zero elements, combining
+ * adjacent zero extents when necessary.
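+ *
+ * e.g. appending a zero extent 0x2000~0x1000 right after a trailing zero
+ * extent 0x1000~0x1000 (hypothetical addresses) extends the back element to
+ * 0x1000~0x2000 instead of pushing a second entry.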
+ */
+void append_extent_to_write(
+ extent_to_write_list_t &to_write, extent_to_write_t &&to_append)
+{
+ assert(to_write.empty() ||
+ to_write.back().get_end_addr() == to_append.addr);
+ if (to_write.empty() ||
+ to_write.back().is_data() ||
+ to_append.is_data() ||
+ to_write.back().type != to_append.type) {
+ to_write.push_back(std::move(to_append));
+ } else {
+ to_write.back().len += to_append.len;
+ }
+}
+
+/**
+ * splice_extent_to_write
+ *
+ * Splices the passed extent_to_write_list_t into to_write while maintaining
+ * the invariant that the list never contains consecutive zero elements,
+ * combining adjacent zero extents when necessary.
+ */
+void splice_extent_to_write(
+ extent_to_write_list_t &to_write, extent_to_write_list_t &&to_splice)
+{
+ if (!to_splice.empty()) {
+ append_extent_to_write(to_write, std::move(to_splice.front()));
+ to_splice.pop_front();
+ to_write.splice(to_write.end(), std::move(to_splice));
+ }
+}
+
+/// Creates remap extents in to_remap
+ObjectDataHandler::write_ret do_remappings(
+ context_t ctx,
+ extent_to_remap_list_t &to_remap)
+{
+ return trans_intr::do_for_each(
+ to_remap,
+ [ctx](auto &region) {
+ if (region.is_remap()) {
+ return ctx.tm.remap_pin<ObjectDataBlock, 1>(
+ ctx.t,
+ std::move(region.pin),
+ std::array{
+ region.create_remap_entry()
+ }
+ ).si_then([&region](auto pins) {
+ ceph_assert(pins.size() == 1);
+ ceph_assert(region.new_len == pins[0]->get_length());
+ return ObjectDataHandler::write_iertr::now();
+ });
+ } else if (region.is_overwrite()) {
+ return ctx.tm.remap_pin<ObjectDataBlock, 2>(
+ ctx.t,
+ std::move(region.pin),
+ std::array{
+ region.create_left_remap_entry(),
+ region.create_right_remap_entry()
+ }
+ ).si_then([&region](auto pins) {
+ ceph_assert(pins.size() == 2);
+ ceph_assert(region.pin->get_key() == pins[0]->get_key());
+ ceph_assert(region.pin->get_key() + pins[0]->get_length() +
+ region.new_len == pins[1]->get_key());
+ return ObjectDataHandler::write_iertr::now();
+ });
+ } else {
+ ceph_abort("impossible");
+ return ObjectDataHandler::write_iertr::now();
+ }
+ });
+}
+
+ObjectDataHandler::write_ret do_removals(
+ context_t ctx,
+ lba_pin_list_t &to_remove)
+{
+ return trans_intr::do_for_each(
+ to_remove,
+ [ctx](auto &pin) {
+ LOG_PREFIX(object_data_handler.cc::do_removals);
+ DEBUGT("decreasing ref: {}",
+ ctx.t,
+ pin->get_key());
+ return ctx.tm.dec_ref(
+ ctx.t,
+ pin->get_key()
+ ).si_then(
+ [](auto){},
+ ObjectDataHandler::write_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "object_data_handler::do_removals invalid error"
+ }
+ );
+ });
+}
+
+/// Creates zero/data extents in to_insert
+ObjectDataHandler::write_ret do_insertions(
+ context_t ctx,
+ extent_to_insert_list_t &to_insert)
+{
+ return trans_intr::do_for_each(
+ to_insert,
+ [ctx](auto &region) {
+ LOG_PREFIX(object_data_handler.cc::do_insertions);
+ if (region.is_data()) {
+ assert_aligned(region.addr);
+ assert_aligned(region.len);
+ ceph_assert(region.len == region.bl->length());
+ DEBUGT("allocating extent: {}~{}",
+ ctx.t,
+ region.addr,
+ region.len);
+ return ctx.tm.alloc_extent<ObjectDataBlock>(
+ ctx.t,
+ region.addr,
+ region.len
+ ).si_then([&region](auto extent) {
+ if (extent->get_laddr() != region.addr) {
+ logger().debug(
+ "object_data_handler::do_insertions alloc got addr {},"
+ " should have been {}",
+ extent->get_laddr(),
+ region.addr);
+ }
+ ceph_assert(extent->get_laddr() == region.addr);
+ ceph_assert(extent->get_length() == region.len);
+ auto iter = region.bl->cbegin();
+ iter.copy(region.len, extent->get_bptr().c_str());
+ return ObjectDataHandler::write_iertr::now();
+ });
+ } else if (region.is_zero()) {
+ DEBUGT("reserving: {}~{}",
+ ctx.t,
+ region.addr,
+ region.len);
+ return ctx.tm.reserve_region(
+ ctx.t,
+ region.addr,
+ region.len
+ ).si_then([FNAME, ctx, &region](auto pin) {
+ ceph_assert(pin->get_length() == region.len);
+ if (pin->get_key() != region.addr) {
+ ERRORT(
+ "inconsistent laddr: pin: {} region {}",
+ ctx.t,
+ pin->get_key(),
+ region.addr);
+ }
+ ceph_assert(pin->get_key() == region.addr);
+ return ObjectDataHandler::write_iertr::now();
+ });
+ } else {
+ ceph_abort("impossible");
+ return ObjectDataHandler::write_iertr::now();
+ }
+ });
+}
+
+enum class overwrite_operation_t {
+ UNKNOWN,
+ OVERWRITE_ZERO, // fill unaligned data with zero
+ MERGE_EXISTING, // if present, merge data with the clean/pending extent
+ SPLIT_EXISTING, // split the existing extent, and fill unaligned data
+};
+
+std::ostream& operator<<(
+ std::ostream &out,
+ const overwrite_operation_t &operation)
+{
+ switch (operation) {
+ case overwrite_operation_t::UNKNOWN:
+ return out << "UNKNOWN";
+ case overwrite_operation_t::OVERWRITE_ZERO:
+ return out << "OVERWRITE_ZERO";
+ case overwrite_operation_t::MERGE_EXISTING:
+ return out << "MERGE_EXISTING";
+ case overwrite_operation_t::SPLIT_EXISTING:
+ return out << "SPLIT_EXISTING";
+ default:
+ return out << "!IMPOSSIBLE_OPERATION";
+ }
+}
+
+/**
+ * overwrite_plan_t
+ *
+ * |<--------------------------pins_size---------------------------------------------->|
+ * pin_begin(aligned) pin_end(aligned)
+ * |<------aligned_data_size-------------------------->| (aligned-bl)
+ * aligned_data_begin aligned_data_end
+ * |<-data_size->| (bl)
+ * data_begin end
+ * left(l) right(r)
+ * |<l_extent_size>|<l_alignment_size>| |<r_alignment_size>|<r_extent_size>|
+ * |<-----------left_size------------>| |<-----------right_size----------->|
+ *
+ * |<-----(existing left extent/pin)----->| |<-----(existing right extent/pin)----->|
+ * left_paddr right_paddr
+ */
+struct overwrite_plan_t {
+ // addresses
+ laddr_t pin_begin;
+ laddr_t pin_end;
+ paddr_t left_paddr;
+ paddr_t right_paddr;
+ laddr_t data_begin;
+ laddr_t data_end;
+ laddr_t aligned_data_begin;
+ laddr_t aligned_data_end;
+
+ // operations
+ overwrite_operation_t left_operation;
+ overwrite_operation_t right_operation;
+
+ // helper member
+ extent_len_t block_size;
+
+public:
+ extent_len_t get_left_size() const {
+ return data_begin - pin_begin;
+ }
+
+ extent_len_t get_left_extent_size() const {
+ return aligned_data_begin - pin_begin;
+ }
+
+ extent_len_t get_left_alignment_size() const {
+ return data_begin - aligned_data_begin;
+ }
+
+ extent_len_t get_right_size() const {
+ return pin_end - data_end;
+ }
+
+ extent_len_t get_right_extent_size() const {
+ return pin_end - aligned_data_end;
+ }
+
+ extent_len_t get_right_alignment_size() const {
+ return aligned_data_end - data_end;
+ }
+
+ extent_len_t get_aligned_data_size() const {
+ return aligned_data_end - aligned_data_begin;
+ }
+
+ extent_len_t get_pins_size() const {
+ return pin_end - pin_begin;
+ }
+
+ friend std::ostream& operator<<(
+ std::ostream& out,
+ const overwrite_plan_t& overwrite_plan) {
+ return out << "overwrite_plan_t("
+ << "pin_begin=" << overwrite_plan.pin_begin
+ << ", pin_end=" << overwrite_plan.pin_end
+ << ", left_paddr=" << overwrite_plan.left_paddr
+ << ", right_paddr=" << overwrite_plan.right_paddr
+ << ", data_begin=" << overwrite_plan.data_begin
+ << ", data_end=" << overwrite_plan.data_end
+ << ", aligned_data_begin=" << overwrite_plan.aligned_data_begin
+ << ", aligned_data_end=" << overwrite_plan.aligned_data_end
+ << ", left_operation=" << overwrite_plan.left_operation
+ << ", right_operation=" << overwrite_plan.right_operation
+ << ", block_size=" << overwrite_plan.block_size
+ << ")";
+ }
+
+ overwrite_plan_t(laddr_t offset,
+ extent_len_t len,
+ const lba_pin_list_t& pins,
+ extent_len_t block_size,
+ Transaction& t) :
+ pin_begin(pins.front()->get_key()),
+ pin_end(pins.back()->get_key() + pins.back()->get_length()),
+ left_paddr(pins.front()->get_val()),
+ right_paddr(pins.back()->get_val()),
+ data_begin(offset),
+ data_end(offset + len),
+ aligned_data_begin(p2align((uint64_t)data_begin, (uint64_t)block_size)),
+ aligned_data_end(p2roundup((uint64_t)data_end, (uint64_t)block_size)),
+ left_operation(overwrite_operation_t::UNKNOWN),
+ right_operation(overwrite_operation_t::UNKNOWN),
+ block_size(block_size) {
+ validate();
+ evaluate_operations(t);
+ assert(left_operation != overwrite_operation_t::UNKNOWN);
+ assert(right_operation != overwrite_operation_t::UNKNOWN);
+ }
+
+private:
+ // refer to overwrite_plan_t description
+ void validate() const {
+ ceph_assert(pin_begin % block_size == 0);
+ ceph_assert(pin_end % block_size == 0);
+ ceph_assert(aligned_data_begin % block_size == 0);
+ ceph_assert(aligned_data_end % block_size == 0);
+
+ ceph_assert(pin_begin <= aligned_data_begin);
+ ceph_assert(aligned_data_begin <= data_begin);
+ ceph_assert(data_begin <= data_end);
+ ceph_assert(data_end <= aligned_data_end);
+ ceph_assert(aligned_data_end <= pin_end);
+ }
+
+ /*
+   * When modifying a portion of an object data block, follow the
+   * read-full-extent-then-merge-new-data strategy if the resulting write
+   * amplification does not exceed seastore_obj_data_write_amplification;
+   * otherwise, split the original extent into at most three parts:
+   * origin-left, part-to-be-modified and origin-right.
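+   *
+   * For example (hypothetical sizes): merging a 4 KiB change into a 64 KiB
+   * extent would rewrite roughly 16x the modified data, so the untouched
+   * sides are split off instead; a 60 KiB change in the same extent keeps
+   * the ratio close to 1 and is merged with the existing data.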
+ */
+ void evaluate_operations(Transaction& t) {
+ auto actual_write_size = get_pins_size();
+ auto aligned_data_size = get_aligned_data_size();
+ auto left_ext_size = get_left_extent_size();
+ auto right_ext_size = get_right_extent_size();
+
+ auto can_merge = [](Transaction& t, paddr_t paddr) {
+ CachedExtentRef ext;
+ if (paddr.is_relative() || paddr.is_delayed()) {
+ return true;
+ } else if (t.get_extent(paddr, &ext) ==
+ Transaction::get_extent_ret::PRESENT) {
+ // FIXME: there is no need to lookup the cache if the pin can
+ // be associated with the extent state
+ if (ext->is_mutable()) {
+ return true;
+ }
+ }
+ return false;
+ };
+ if (left_paddr.is_zero()) {
+ actual_write_size -= left_ext_size;
+ left_ext_size = 0;
+ left_operation = overwrite_operation_t::OVERWRITE_ZERO;
+ } else if (can_merge(t, left_paddr)) {
+ aligned_data_size += left_ext_size;
+ left_ext_size = 0;
+ left_operation = overwrite_operation_t::MERGE_EXISTING;
+ }
+
+ if (right_paddr.is_zero()) {
+ actual_write_size -= right_ext_size;
+ right_ext_size = 0;
+ right_operation = overwrite_operation_t::OVERWRITE_ZERO;
+ } else if (can_merge(t, right_paddr)) {
+ aligned_data_size += right_ext_size;
+ right_ext_size = 0;
+ right_operation = overwrite_operation_t::MERGE_EXISTING;
+ }
+
+ while (left_operation == overwrite_operation_t::UNKNOWN ||
+ right_operation == overwrite_operation_t::UNKNOWN) {
+ if (((double)actual_write_size / (double)aligned_data_size) <=
+ crimson::common::get_conf<double>("seastore_obj_data_write_amplification")) {
+ break;
+ }
+ if (left_ext_size == 0 && right_ext_size == 0) {
+ break;
+ }
+ if (left_ext_size >= right_ext_size) {
+ // split left
+ assert(left_operation == overwrite_operation_t::UNKNOWN);
+ actual_write_size -= left_ext_size;
+ left_ext_size = 0;
+ left_operation = overwrite_operation_t::SPLIT_EXISTING;
+ } else { // left_ext_size < right_ext_size
+ // split right
+ assert(right_operation == overwrite_operation_t::UNKNOWN);
+ actual_write_size -= right_ext_size;
+ right_ext_size = 0;
+ right_operation = overwrite_operation_t::SPLIT_EXISTING;
+ }
+ }
+
+ if (left_operation == overwrite_operation_t::UNKNOWN) {
+ // no split left, so merge with left
+ left_operation = overwrite_operation_t::MERGE_EXISTING;
+ }
+
+ if (right_operation == overwrite_operation_t::UNKNOWN) {
+ // no split right, so merge with right
+ right_operation = overwrite_operation_t::MERGE_EXISTING;
+ }
+ }
+};
+
+} // namespace crimson::os::seastore
+
+#if FMT_VERSION >= 90000
+template<> struct fmt::formatter<crimson::os::seastore::overwrite_plan_t> : fmt::ostream_formatter {};
+#endif
+
+namespace crimson::os::seastore {
+
+/**
+ * operate_left
+ *
+ * Proceed overwrite_plan.left_operation.
+ */
+using operate_ret_bare = std::pair<
+ std::optional<extent_to_write_t>,
+ std::optional<bufferptr>>;
+using operate_ret = get_iertr::future<operate_ret_bare>;
+operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan)
+{
+ if (overwrite_plan.get_left_size() == 0) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::nullopt,
+ std::nullopt);
+ }
+
+ if (overwrite_plan.left_operation == overwrite_operation_t::OVERWRITE_ZERO) {
+ assert(pin->get_val().is_zero());
+ auto zero_extent_len = overwrite_plan.get_left_extent_size();
+ assert_aligned(zero_extent_len);
+ auto zero_prepend_len = overwrite_plan.get_left_alignment_size();
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ (zero_extent_len == 0
+ ? std::nullopt
+ : std::make_optional(extent_to_write_t::create_zero(
+ overwrite_plan.pin_begin, zero_extent_len))),
+ (zero_prepend_len == 0
+ ? std::nullopt
+ : std::make_optional(bufferptr(
+ ceph::buffer::create(zero_prepend_len, 0))))
+ );
+ } else if (overwrite_plan.left_operation == overwrite_operation_t::MERGE_EXISTING) {
+ auto prepend_len = overwrite_plan.get_left_size();
+ if (prepend_len == 0) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::nullopt,
+ std::nullopt);
+ } else {
+ extent_len_t off = pin->get_intermediate_offset();
+ return ctx.tm.read_pin<ObjectDataBlock>(
+ ctx.t, pin->duplicate()
+ ).si_then([prepend_len, off](auto left_extent) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::nullopt,
+ std::make_optional(bufferptr(
+ left_extent->get_bptr(),
+ off,
+ prepend_len)));
+ });
+ }
+ } else {
+ assert(overwrite_plan.left_operation == overwrite_operation_t::SPLIT_EXISTING);
+
+ auto extent_len = overwrite_plan.get_left_extent_size();
+ assert(extent_len);
+ std::optional<extent_to_write_t> left_to_write_extent =
+ std::make_optional(extent_to_write_t::create_existing(
+ pin->duplicate(),
+ pin->get_key(),
+ extent_len));
+
+ auto prepend_len = overwrite_plan.get_left_alignment_size();
+ if (prepend_len == 0) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::move(left_to_write_extent),
+ std::nullopt);
+ } else {
+ extent_len_t off = pin->get_intermediate_offset();
+ return ctx.tm.read_pin<ObjectDataBlock>(
+ ctx.t, pin->duplicate()
+ ).si_then([prepend_offset=extent_len + off, prepend_len,
+ left_to_write_extent=std::move(left_to_write_extent)]
+ (auto left_extent) mutable {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::move(left_to_write_extent),
+ std::make_optional(bufferptr(
+ left_extent->get_bptr(),
+ prepend_offset,
+ prepend_len)));
+ });
+ }
+ }
+};
+
+/**
+ * operate_right
+ *
+ * Proceed overwrite_plan.right_operation.
+ */
+operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan)
+{
+ if (overwrite_plan.get_right_size() == 0) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::nullopt,
+ std::nullopt);
+ }
+
+ auto right_pin_begin = pin->get_key();
+ assert(overwrite_plan.data_end >= right_pin_begin);
+ if (overwrite_plan.right_operation == overwrite_operation_t::OVERWRITE_ZERO) {
+ assert(pin->get_val().is_zero());
+ auto zero_suffix_len = overwrite_plan.get_right_alignment_size();
+ auto zero_extent_len = overwrite_plan.get_right_extent_size();
+ assert_aligned(zero_extent_len);
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ (zero_extent_len == 0
+ ? std::nullopt
+ : std::make_optional(extent_to_write_t::create_zero(
+ overwrite_plan.aligned_data_end, zero_extent_len))),
+ (zero_suffix_len == 0
+ ? std::nullopt
+ : std::make_optional(bufferptr(
+ ceph::buffer::create(zero_suffix_len, 0))))
+ );
+ } else if (overwrite_plan.right_operation == overwrite_operation_t::MERGE_EXISTING) {
+ auto append_len = overwrite_plan.get_right_size();
+ if (append_len == 0) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::nullopt,
+ std::nullopt);
+ } else {
+ auto append_offset =
+ overwrite_plan.data_end
+ - right_pin_begin
+ + pin->get_intermediate_offset();
+ return ctx.tm.read_pin<ObjectDataBlock>(
+ ctx.t, pin->duplicate()
+ ).si_then([append_offset, append_len](auto right_extent) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::nullopt,
+ std::make_optional(bufferptr(
+ right_extent->get_bptr(),
+ append_offset,
+ append_len)));
+ });
+ }
+ } else {
+ assert(overwrite_plan.right_operation == overwrite_operation_t::SPLIT_EXISTING);
+
+ auto extent_len = overwrite_plan.get_right_extent_size();
+ assert(extent_len);
+ std::optional<extent_to_write_t> right_to_write_extent =
+ std::make_optional(extent_to_write_t::create_existing(
+ pin->duplicate(),
+ overwrite_plan.aligned_data_end,
+ extent_len));
+
+ auto append_len = overwrite_plan.get_right_alignment_size();
+ if (append_len == 0) {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::move(right_to_write_extent),
+ std::nullopt);
+ } else {
+ auto append_offset =
+ overwrite_plan.data_end
+ - right_pin_begin
+ + pin->get_intermediate_offset();
+ return ctx.tm.read_pin<ObjectDataBlock>(
+ ctx.t, pin->duplicate()
+ ).si_then([append_offset, append_len,
+ right_to_write_extent=std::move(right_to_write_extent)]
+ (auto right_extent) mutable {
+ return get_iertr::make_ready_future<operate_ret_bare>(
+ std::move(right_to_write_extent),
+ std::make_optional(bufferptr(
+ right_extent->get_bptr(),
+ append_offset,
+ append_len)));
+ });
+ }
+ }
+};
+
+template <typename F>
+auto with_object_data(
+ ObjectDataHandler::context_t ctx,
+ F &&f)
+{
+ return seastar::do_with(
+ ctx.onode.get_layout().object_data.get(),
+ std::forward<F>(f),
+ [ctx](auto &object_data, auto &f) {
+ return std::invoke(f, object_data
+ ).si_then([ctx, &object_data] {
+ if (object_data.must_update()) {
+ ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data);
+ }
+ return seastar::now();
+ });
+ });
+}
+
+template <typename F>
+auto with_objects_data(
+ ObjectDataHandler::context_t ctx,
+ F &&f)
+{
+ ceph_assert(ctx.d_onode);
+ return seastar::do_with(
+ ctx.onode.get_layout().object_data.get(),
+ ctx.d_onode->get_layout().object_data.get(),
+ std::forward<F>(f),
+ [ctx](auto &object_data, auto &d_object_data, auto &f) {
+ return std::invoke(f, object_data, d_object_data
+ ).si_then([ctx, &object_data, &d_object_data] {
+ if (object_data.must_update()) {
+ ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data);
+ }
+ if (d_object_data.must_update()) {
+ ctx.d_onode->get_mutable_layout(
+ ctx.t).object_data.update(d_object_data);
+ }
+ return seastar::now();
+ });
+ });
+}
+
+ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
+ context_t ctx,
+ object_data_t &object_data,
+ extent_len_t size)
+{
+ LOG_PREFIX(ObjectDataHandler::prepare_data_reservation);
+ ceph_assert(size <= max_object_size);
+ if (!object_data.is_null()) {
+ ceph_assert(object_data.get_reserved_data_len() == max_object_size);
+ DEBUGT("reservation present: {}~{}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len());
+ return write_iertr::now();
+ } else {
+ DEBUGT("reserving: {}~{}",
+ ctx.t,
+ ctx.onode.get_data_hint(),
+ max_object_size);
+ return ctx.tm.reserve_region(
+ ctx.t,
+ ctx.onode.get_data_hint(),
+ max_object_size
+ ).si_then([max_object_size=max_object_size, &object_data](auto pin) {
+ ceph_assert(pin->get_length() == max_object_size);
+ object_data.update_reserved(
+ pin->get_key(),
+ pin->get_length());
+ return write_iertr::now();
+ });
+ }
+}
+
+ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
+ context_t ctx, object_data_t &object_data, extent_len_t size)
+{
+ ceph_assert(!object_data.is_null());
+ ceph_assert(size <= object_data.get_reserved_data_len());
+ return seastar::do_with(
+ lba_pin_list_t(),
+ extent_to_write_list_t(),
+ [ctx, size, &object_data](auto &pins, auto &to_write) {
+ LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
+ DEBUGT("object_data: {}~{}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len());
+ return ctx.tm.get_pins(
+ ctx.t,
+ object_data.get_reserved_data_base() + size,
+ object_data.get_reserved_data_len() - size
+ ).si_then([ctx, size, &pins, &object_data, &to_write](auto _pins) {
+ _pins.swap(pins);
+ ceph_assert(pins.size());
+ if (!size) {
+ // no need to reserve region if we are truncating the object's
+ // size to 0
+ return clear_iertr::now();
+ }
+ auto &pin = *pins.front();
+ ceph_assert(pin.get_key() >= object_data.get_reserved_data_base());
+ ceph_assert(
+ pin.get_key() <= object_data.get_reserved_data_base() + size);
+ auto pin_offset = pin.get_key() -
+ object_data.get_reserved_data_base();
+ if ((pin.get_key() == (object_data.get_reserved_data_base() + size)) ||
+ (pin.get_val().is_zero())) {
+ /* First pin is exactly at the boundary or is a zero pin. Either way,
+ * remove all pins and add a single zero pin to the end. */
+ to_write.push_back(extent_to_write_t::create_zero(
+ pin.get_key(),
+ object_data.get_reserved_data_len() - pin_offset));
+ return clear_iertr::now();
+ } else {
+ /* First pin overlaps the boundary and has data, remap it
+ * if aligned or rewrite it if not aligned to size */
+ auto roundup_size = p2roundup(size, ctx.tm.get_block_size());
+ auto append_len = roundup_size - size;
+ if (append_len == 0) {
+ LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
+          TRACET("First pin overlaps the boundary and has aligned data, "
+ "create existing at addr:{}, len:{}",
+ ctx.t, pin.get_key(), size - pin_offset);
+ to_write.push_back(extent_to_write_t::create_existing(
+ pin.duplicate(),
+ pin.get_key(),
+ size - pin_offset));
+ to_write.push_back(extent_to_write_t::create_zero(
+ object_data.get_reserved_data_base() + roundup_size,
+ object_data.get_reserved_data_len() - roundup_size));
+ return clear_iertr::now();
+ } else {
+ return ctx.tm.read_pin<ObjectDataBlock>(
+ ctx.t,
+ pin.duplicate()
+ ).si_then([ctx, size, pin_offset, append_len, roundup_size,
+ &pin, &object_data, &to_write](auto extent) {
+ bufferlist bl;
+ bl.append(
+ bufferptr(
+ extent->get_bptr(),
+ pin.get_intermediate_offset(),
+ size - pin_offset
+ ));
+ bl.append_zero(append_len);
+ LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
+            TRACET("First pin overlaps the boundary and has unaligned data, "
+ "create data at addr:{}, len:{}",
+ ctx.t, pin.get_key(), bl.length());
+ to_write.push_back(extent_to_write_t::create_data(
+ pin.get_key(),
+ bl));
+ to_write.push_back(extent_to_write_t::create_zero(
+ object_data.get_reserved_data_base() + roundup_size,
+ object_data.get_reserved_data_len() - roundup_size));
+ return clear_iertr::now();
+ });
+ }
+ }
+ }).si_then([ctx, size, &to_write, &object_data, &pins] {
+ return seastar::do_with(
+ prepare_ops_list(pins, to_write),
+ [ctx, size, &object_data](auto &ops) {
+ return do_remappings(ctx, ops.to_remap
+ ).si_then([ctx, &ops] {
+ return do_removals(ctx, ops.to_remove);
+ }).si_then([ctx, &ops] {
+ return do_insertions(ctx, ops.to_insert);
+ }).si_then([size, &object_data] {
+ if (size == 0) {
+ object_data.clear();
+ }
+ return ObjectDataHandler::clear_iertr::now();
+ });
+ });
+ });
+ });
+}
+
+/**
+ * get_to_writes_with_zero_buffer
+ *
+ * Returns extent_to_write_t's reflecting a zero region extending
+ * from offset~len with headptr optionally on the left and tailptr
+ * optionally on the right.
+ */
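+// (Hypothetical example with a 4 KiB block size: zeroing 0x1800~0x1800 with a
+// 2 KiB headptr and no tailptr yields a data extent at 0x1000~0x1000 holding
+// the head data plus zero padding, followed by a reserved zero extent
+// 0x2000~0x1000.)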
+extent_to_write_list_t get_to_writes_with_zero_buffer(
+ const extent_len_t block_size,
+ laddr_t offset, extent_len_t len,
+ std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr)
+{
+ auto zero_left = p2roundup(offset, (laddr_t)block_size);
+ auto zero_right = p2align(offset + len, (laddr_t)block_size);
+ auto left = headptr ? (offset - headptr->length()) : offset;
+ auto right = tailptr ?
+ (offset + len + tailptr->length()) :
+ (offset + len);
+
+ assert(
+ (headptr && ((zero_left - left) ==
+ p2roundup(headptr->length(), block_size))) ^
+ (!headptr && (zero_left == left)));
+ assert(
+ (tailptr && ((right - zero_right) ==
+ p2roundup(tailptr->length(), block_size))) ^
+ (!tailptr && (right == zero_right)));
+
+ assert(right > left);
+ assert((left % block_size) == 0);
+ assert((right % block_size) == 0);
+
+ // zero region too small for a reserved section,
+ // headptr and tailptr in same extent
+ if (zero_right <= zero_left) {
+ bufferlist bl;
+ if (headptr) {
+ bl.append(*headptr);
+ }
+ bl.append_zero(
+ right - left - bl.length() - (tailptr ? tailptr->length() : 0));
+ if (tailptr) {
+ bl.append(*tailptr);
+ }
+ assert(bl.length() % block_size == 0);
+ assert(bl.length() == (right - left));
+ extent_to_write_list_t ret;
+ ret.push_back(extent_to_write_t::create_data(left, bl));
+ return ret;
+ } else {
+ // reserved section between ends, headptr and tailptr in different extents
+ extent_to_write_list_t ret;
+ if (headptr) {
+ bufferlist headbl;
+ headbl.append(*headptr);
+ headbl.append_zero(zero_left - left - headbl.length());
+ assert(headbl.length() % block_size == 0);
+ assert(headbl.length() > 0);
+ ret.push_back(extent_to_write_t::create_data(left, headbl));
+ }
+ // reserved zero region
+ ret.push_back(extent_to_write_t::create_zero(zero_left, zero_right - zero_left));
+ assert(ret.back().len % block_size == 0);
+ assert(ret.back().len > 0);
+ if (tailptr) {
+ bufferlist tailbl;
+ tailbl.append(*tailptr);
+ tailbl.append_zero(right - zero_right - tailbl.length());
+ assert(tailbl.length() % block_size == 0);
+ assert(tailbl.length() > 0);
+ ret.push_back(extent_to_write_t::create_data(zero_right, tailbl));
+ }
+ return ret;
+ }
+}
+
+/**
+ * get_to_writes
+ *
+ * Returns extent_to_write_t's from bl.
+ *
+ * TODO: probably add some kind of upper limit on extent size.
+ */
+extent_to_write_list_t get_to_writes(laddr_t offset, bufferlist &bl)
+{
+ auto ret = extent_to_write_list_t();
+ ret.push_back(extent_to_write_t::create_data(offset, bl));
+ return ret;
+};
+
+ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
+ context_t ctx,
+ laddr_t offset,
+ extent_len_t len,
+ std::optional<bufferlist> &&bl,
+ lba_pin_list_t &&_pins)
+{
+ if (bl.has_value()) {
+ assert(bl->length() == len);
+ }
+ overwrite_plan_t overwrite_plan(offset, len, _pins, ctx.tm.get_block_size(), ctx.t);
+ return seastar::do_with(
+ std::move(_pins),
+ extent_to_write_list_t(),
+ [ctx, len, offset, overwrite_plan, bl=std::move(bl)]
+ (auto &pins, auto &to_write) mutable
+ {
+ LOG_PREFIX(ObjectDataHandler::overwrite);
+ DEBUGT("overwrite: {}~{}",
+ ctx.t,
+ offset,
+ len);
+ ceph_assert(pins.size() >= 1);
+ DEBUGT("overwrite: split overwrite_plan {}", ctx.t, overwrite_plan);
+
+ return operate_left(
+ ctx,
+ pins.front(),
+ overwrite_plan
+ ).si_then([ctx, len, offset, overwrite_plan, bl=std::move(bl),
+ &to_write, &pins](auto p) mutable {
+ auto &[left_extent, headptr] = p;
+ if (left_extent) {
+ ceph_assert(left_extent->addr == overwrite_plan.pin_begin);
+ append_extent_to_write(to_write, std::move(*left_extent));
+ }
+ if (headptr) {
+ assert(headptr->length() > 0);
+ }
+ return operate_right(
+ ctx,
+ pins.back(),
+ overwrite_plan
+ ).si_then([ctx, len, offset,
+ pin_begin=overwrite_plan.pin_begin,
+ pin_end=overwrite_plan.pin_end,
+ bl=std::move(bl), headptr=std::move(headptr),
+ &to_write, &pins](auto p) mutable {
+ auto &[right_extent, tailptr] = p;
+ if (bl.has_value()) {
+ auto write_offset = offset;
+ bufferlist write_bl;
+ if (headptr) {
+ write_bl.append(*headptr);
+ write_offset -= headptr->length();
+ assert_aligned(write_offset);
+ }
+ write_bl.claim_append(*bl);
+ if (tailptr) {
+ write_bl.append(*tailptr);
+ assert_aligned(write_bl.length());
+ }
+ splice_extent_to_write(
+ to_write,
+ get_to_writes(write_offset, write_bl));
+ } else {
+ splice_extent_to_write(
+ to_write,
+ get_to_writes_with_zero_buffer(
+ ctx.tm.get_block_size(),
+ offset,
+ len,
+ std::move(headptr),
+ std::move(tailptr)));
+ }
+ if (right_extent) {
+ ceph_assert(right_extent->get_end_addr() == pin_end);
+ append_extent_to_write(to_write, std::move(*right_extent));
+ }
+ assert(to_write.size());
+ assert(pin_begin == to_write.front().addr);
+ assert(pin_end == to_write.back().get_end_addr());
+
+ return seastar::do_with(
+ prepare_ops_list(pins, to_write),
+ [ctx](auto &ops) {
+ return do_remappings(ctx, ops.to_remap
+ ).si_then([ctx, &ops] {
+ return do_removals(ctx, ops.to_remove);
+ }).si_then([ctx, &ops] {
+ return do_insertions(ctx, ops.to_insert);
+ });
+ });
+ });
+ });
+ });
+}
+
+ObjectDataHandler::zero_ret ObjectDataHandler::zero(
+ context_t ctx,
+ objaddr_t offset,
+ extent_len_t len)
+{
+ return with_object_data(
+ ctx,
+ [this, ctx, offset, len](auto &object_data) {
+ LOG_PREFIX(ObjectDataHandler::zero);
+ DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}",
+ ctx.t,
+ offset,
+ len,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len(),
+ object_data.is_null());
+ return prepare_data_reservation(
+ ctx,
+ object_data,
+ p2roundup(offset + len, ctx.tm.get_block_size())
+ ).si_then([this, ctx, offset, len, &object_data] {
+ auto logical_offset = object_data.get_reserved_data_base() + offset;
+ return ctx.tm.get_pins(
+ ctx.t,
+ logical_offset,
+ len
+ ).si_then([this, ctx, logical_offset, len](auto pins) {
+ return overwrite(
+ ctx, logical_offset, len,
+ std::nullopt, std::move(pins));
+ });
+ });
+ });
+}
+
+ObjectDataHandler::write_ret ObjectDataHandler::write(
+ context_t ctx,
+ objaddr_t offset,
+ const bufferlist &bl)
+{
+ return with_object_data(
+ ctx,
+ [this, ctx, offset, &bl](auto &object_data) {
+ LOG_PREFIX(ObjectDataHandler::write);
+ DEBUGT("writing to {}~{}, object_data: {}~{}, is_null {}",
+ ctx.t,
+ offset,
+ bl.length(),
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len(),
+ object_data.is_null());
+ return prepare_data_reservation(
+ ctx,
+ object_data,
+ p2roundup(offset + bl.length(), ctx.tm.get_block_size())
+ ).si_then([this, ctx, offset, &object_data, &bl] {
+ auto logical_offset = object_data.get_reserved_data_base() + offset;
+ return ctx.tm.get_pins(
+ ctx.t,
+ logical_offset,
+ bl.length()
+      ).si_then([this, ctx, logical_offset, &bl](auto pins) {
+ return overwrite(
+ ctx, logical_offset, bl.length(),
+ bufferlist(bl), std::move(pins));
+ });
+ });
+ });
+}
+
+ObjectDataHandler::read_ret ObjectDataHandler::read(
+ context_t ctx,
+ objaddr_t obj_offset,
+ extent_len_t len)
+{
+ return seastar::do_with(
+ bufferlist(),
+ [ctx, obj_offset, len](auto &ret) {
+ return with_object_data(
+ ctx,
+ [ctx, obj_offset, len, &ret](const auto &object_data) {
+ LOG_PREFIX(ObjectDataHandler::read);
+ DEBUGT("reading {}~{}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len());
+      /* Assumption: callers ensure that the onode size is <= the reserved
+       * size and that len has been adjusted accordingly prior to this call */
+ ceph_assert(!object_data.is_null());
+ ceph_assert((obj_offset + len) <= object_data.get_reserved_data_len());
+ ceph_assert(len > 0);
+ laddr_t loffset =
+ object_data.get_reserved_data_base() + obj_offset;
+ return ctx.tm.get_pins(
+ ctx.t,
+ loffset,
+ len
+ ).si_then([ctx, loffset, len, &ret](auto _pins) {
+ // offset~len falls within reserved region and len > 0
+ ceph_assert(_pins.size() >= 1);
+ ceph_assert((*_pins.begin())->get_key() <= loffset);
+ return seastar::do_with(
+ std::move(_pins),
+ loffset,
+ [ctx, loffset, len, &ret](auto &pins, auto &current) {
+ return trans_intr::do_for_each(
+ pins,
+ [ctx, loffset, len, &current, &ret](auto &pin)
+ -> read_iertr::future<> {
+ ceph_assert(current <= (loffset + len));
+ ceph_assert(
+ (loffset + len) > pin->get_key());
+ laddr_t end = std::min(
+ pin->get_key() + pin->get_length(),
+ loffset + len);
+ if (pin->get_val().is_zero()) {
+ ceph_assert(end > current); // See LBAManager::get_mappings
+ ret.append_zero(end - current);
+ current = end;
+ return seastar::now();
+ } else {
+ LOG_PREFIX(ObjectDataHandler::read);
+ auto key = pin->get_key();
+ bool is_indirect = pin->is_indirect();
+ extent_len_t off = pin->get_intermediate_offset();
+ DEBUGT("reading {}~{}, indirect: {}, "
+ "intermediate offset: {}, current: {}, end: {}",
+ ctx.t,
+ key,
+ pin->get_length(),
+ is_indirect,
+ off,
+ current,
+ end);
+ return ctx.tm.read_pin<ObjectDataBlock>(
+ ctx.t,
+ std::move(pin)
+ ).si_then([&ret, &current, end, key, off,
+ is_indirect](auto extent) {
+ ceph_assert(
+ is_indirect
+ ? (key - off + extent->get_length()) >= end
+ : (extent->get_laddr() + extent->get_length()) >= end);
+ ceph_assert(end > current);
+ ret.append(
+ bufferptr(
+ extent->get_bptr(),
+ off + current - (is_indirect ? key : extent->get_laddr()),
+ end - current));
+ current = end;
+ return seastar::now();
+ }).handle_error_interruptible(
+ read_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "ObjectDataHandler::read hit invalid error"
+ }
+ );
+ }
+ });
+ });
+ });
+ }).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+}
+
+ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap(
+ context_t ctx,
+ objaddr_t obj_offset,
+ extent_len_t len)
+{
+ return seastar::do_with(
+ std::map<uint64_t, uint64_t>(),
+ [ctx, obj_offset, len](auto &ret) {
+ return with_object_data(
+ ctx,
+ [ctx, obj_offset, len, &ret](const auto &object_data) {
+ LOG_PREFIX(ObjectDataHandler::fiemap);
+ DEBUGT(
+ "{}~{}, reservation {}~{}",
+ ctx.t,
+ obj_offset,
+ len,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len());
+      /* Assumption: callers ensure that the onode size is <= the reserved
+       * size and that len has been adjusted accordingly prior to this call */
+ ceph_assert(!object_data.is_null());
+ ceph_assert((obj_offset + len) <= object_data.get_reserved_data_len());
+ ceph_assert(len > 0);
+ laddr_t loffset =
+ object_data.get_reserved_data_base() + obj_offset;
+ return ctx.tm.get_pins(
+ ctx.t,
+ loffset,
+ len
+ ).si_then([loffset, len, &object_data, &ret](auto &&pins) {
+ ceph_assert(pins.size() >= 1);
+ ceph_assert((*pins.begin())->get_key() <= loffset);
+ for (auto &&i: pins) {
+ if (!(i->get_val().is_zero())) {
+ auto ret_left = std::max(i->get_key(), loffset);
+ auto ret_right = std::min(
+ i->get_key() + i->get_length(),
+ loffset + len);
+ assert(ret_right > ret_left);
+ ret.emplace(
+ std::make_pair(
+ ret_left - object_data.get_reserved_data_base(),
+ ret_right - ret_left
+ ));
+ }
+ }
+ });
+ }).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+}
+
+ObjectDataHandler::truncate_ret ObjectDataHandler::truncate(
+ context_t ctx,
+ objaddr_t offset)
+{
+ return with_object_data(
+ ctx,
+ [this, ctx, offset](auto &object_data) {
+ LOG_PREFIX(ObjectDataHandler::truncate);
+ DEBUGT("truncating {}~{} offset: {}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len(),
+ offset);
+ if (offset < object_data.get_reserved_data_len()) {
+ return trim_data_reservation(ctx, object_data, offset);
+ } else if (offset > object_data.get_reserved_data_len()) {
+ return prepare_data_reservation(
+ ctx,
+ object_data,
+ p2roundup(offset, ctx.tm.get_block_size()));
+ } else {
+ return truncate_iertr::now();
+ }
+ });
+}
+
+ObjectDataHandler::clear_ret ObjectDataHandler::clear(
+ context_t ctx)
+{
+ return with_object_data(
+ ctx,
+ [this, ctx](auto &object_data) {
+ LOG_PREFIX(ObjectDataHandler::clear);
+ DEBUGT("clearing: {}~{}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len());
+ if (object_data.is_null()) {
+ return clear_iertr::now();
+ }
+ return trim_data_reservation(ctx, object_data, 0);
+ });
+}
+
+ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents(
+ context_t ctx,
+ object_data_t &object_data,
+ lba_pin_list_t &pins,
+ laddr_t data_base)
+{
+ LOG_PREFIX(ObjectDataHandler::clone_extents);
+ TRACET(" object_data: {}~{}, data_base: {}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len(),
+ data_base);
+ return ctx.tm.dec_ref(
+ ctx.t,
+ object_data.get_reserved_data_base()
+ ).si_then(
+ [&pins, &object_data, ctx, data_base](auto) mutable {
+ return seastar::do_with(
+ (extent_len_t)0,
+ [&object_data, ctx, data_base, &pins](auto &last_pos) {
+ return trans_intr::do_for_each(
+ pins,
+ [&last_pos, &object_data, ctx, data_base](auto &pin) {
+ auto offset = pin->get_key() - data_base;
+ ceph_assert(offset == last_pos);
+ auto fut = TransactionManager::alloc_extent_iertr
+ ::make_ready_future<LBAMappingRef>();
+ auto addr = object_data.get_reserved_data_base() + offset;
+ if (pin->get_val().is_zero()) {
+ fut = ctx.tm.reserve_region(ctx.t, addr, pin->get_length());
+ } else {
+ fut = ctx.tm.clone_pin(ctx.t, addr, *pin);
+ }
+ return fut.si_then(
+ [&pin, &last_pos, offset](auto) {
+ last_pos = offset + pin->get_length();
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all("not possible")
+ );
+ }).si_then([&last_pos, &object_data, ctx] {
+ if (last_pos != object_data.get_reserved_data_len()) {
+ return ctx.tm.reserve_region(
+ ctx.t,
+ object_data.get_reserved_data_base() + last_pos,
+ object_data.get_reserved_data_len() - last_pos
+ ).si_then([](auto) {
+ return seastar::now();
+ });
+ }
+ return TransactionManager::reserve_extent_iertr::now();
+ });
+ });
+ },
+ ObjectDataHandler::write_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "object_data_handler::clone invalid error"
+ }
+ );
+}
+
+ObjectDataHandler::clone_ret ObjectDataHandler::clone(
+ context_t ctx)
+{
+  // The whole clone procedure can be separated into the following steps
+  // (a condensed sketch of the resulting call sequence follows this list):
+  // 1. let the clone onode (d_object_data) take over the head onode's
+  //    object data base;
+  // 2. reserve a new region in the lba tree for the head onode;
+  // 3. clone all extents of the clone onode, see transaction_manager.h
+  //    for the details of clone_pin;
+  // 4. reserve the space between the head onode's size and its reservation
+  //    length.
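+  //
+  // Condensed sketch of the call sequence implemented below (it only
+  // restates the code that follows; old_base/len name the head onode's
+  // original reservation):
+  //
+  //   prepare_data_reservation(ctx, d_object_data, len);
+  //   object_data.clear();
+  //   prepare_data_reservation(ctx, object_data, len);
+  //   ctx.tm.get_pins(ctx.t, old_base, len)
+  //     -> clone_extents(ctx, object_data, pins, old_base)
+  //     -> clone_extents(ctx, d_object_data, pins, old_base)
+  //     -> do_removals(ctx, pins);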
+ return with_objects_data(
+ ctx,
+ [ctx, this](auto &object_data, auto &d_object_data) {
+ ceph_assert(d_object_data.is_null());
+ if (object_data.is_null()) {
+ return clone_iertr::now();
+ }
+ return prepare_data_reservation(
+ ctx,
+ d_object_data,
+ object_data.get_reserved_data_len()
+ ).si_then([&object_data, &d_object_data, ctx, this] {
+ assert(!object_data.is_null());
+ auto base = object_data.get_reserved_data_base();
+ auto len = object_data.get_reserved_data_len();
+ object_data.clear();
+ LOG_PREFIX(ObjectDataHandler::clone);
+ DEBUGT("cloned obj reserve_data_base: {}, len {}",
+ ctx.t,
+ d_object_data.get_reserved_data_base(),
+ d_object_data.get_reserved_data_len());
+ return prepare_data_reservation(
+ ctx,
+ object_data,
+ d_object_data.get_reserved_data_len()
+ ).si_then([&d_object_data, ctx, &object_data, base, len, this] {
+ LOG_PREFIX("ObjectDataHandler::clone");
+ DEBUGT("head obj reserve_data_base: {}, len {}",
+ ctx.t,
+ object_data.get_reserved_data_base(),
+ object_data.get_reserved_data_len());
+ return ctx.tm.get_pins(ctx.t, base, len
+ ).si_then([ctx, &object_data, &d_object_data, base, this](auto pins) {
+ return seastar::do_with(
+ std::move(pins),
+ [ctx, &object_data, &d_object_data, base, this](auto &pins) {
+ return clone_extents(ctx, object_data, pins, base
+ ).si_then([ctx, &d_object_data, base, &pins, this] {
+ return clone_extents(ctx, d_object_data, pins, base);
+ }).si_then([&pins, ctx] {
+ return do_removals(ctx, pins);
+ });
+ });
+ });
+ });
+ });
+ });
+}
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h
new file mode 100644
index 000000000..b5f432d5a
--- /dev/null
+++ b/src/crimson/os/seastore/object_data_handler.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include "include/buffer.h"
+
+#include "test/crimson/seastore/test_block.h" // TODO
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore {
+
+struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
+ using Ref = TCachedExtentRef<ObjectDataBlock>;
+
+ explicit ObjectDataBlock(ceph::bufferptr &&ptr)
+ : LogicalCachedExtent(std::move(ptr)) {}
+ explicit ObjectDataBlock(const ObjectDataBlock &other)
+ : LogicalCachedExtent(other) {}
+ explicit ObjectDataBlock(extent_len_t length)
+ : LogicalCachedExtent(length) {}
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ return CachedExtentRef(new ObjectDataBlock(*this));
+ };
+
+ static constexpr extent_types_t TYPE = extent_types_t::OBJECT_DATA_BLOCK;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ ceph::bufferlist get_delta() final {
+ /* Currently, we always allocate fresh ObjectDataBlock's rather than
+ * mutating existing ones. */
+ ceph_assert(0 == "Should be impossible");
+ }
+
+ void apply_delta(const ceph::bufferlist &bl) final {
+ // See get_delta()
+ ceph_assert(0 == "Should be impossible");
+ }
+};
+using ObjectDataBlockRef = TCachedExtentRef<ObjectDataBlock>;
+
+class ObjectDataHandler {
+public:
+ using base_iertr = TransactionManager::base_iertr;
+
+ ObjectDataHandler(uint32_t mos) : max_object_size(mos) {}
+
+ struct context_t {
+ TransactionManager &tm;
+ Transaction &t;
+ Onode &onode;
+    Onode *d_onode = nullptr; // The destination node in case of clone
+ };
+
+ /// Writes bl to [offset, offset + bl.length())
+ using write_iertr = base_iertr;
+ using write_ret = write_iertr::future<>;
+ write_ret write(
+ context_t ctx,
+ objaddr_t offset,
+ const bufferlist &bl);
+
+ using zero_iertr = base_iertr;
+ using zero_ret = zero_iertr::future<>;
+ zero_ret zero(
+ context_t ctx,
+ objaddr_t offset,
+ extent_len_t len);
+
+ /// Reads data in [offset, offset + len)
+ using read_iertr = base_iertr;
+ using read_ret = read_iertr::future<bufferlist>;
+ read_ret read(
+ context_t ctx,
+ objaddr_t offset,
+ extent_len_t len);
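+
+  // Example usage (illustrative sketch only; obtaining the
+  // TransactionManager, Transaction and Onode references, and running
+  // inside an interruptible transaction context, are assumed):
+  //
+  //   ObjectDataHandler odh(max_object_size);
+  //   ObjectDataHandler::context_t ctx{tm, t, onode};
+  //   ceph::bufferlist bl;
+  //   bl.append("hello");
+  //   return odh.write(ctx, 0, bl
+  //   ).si_then([&odh, ctx] {
+  //     return odh.read(ctx, 0, 5);
+  //   });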
+
+ /// sparse read data, get range interval in [offset, offset + len)
+ using fiemap_iertr = base_iertr;
+ using fiemap_ret = fiemap_iertr::future<std::map<uint64_t, uint64_t>>;
+ fiemap_ret fiemap(
+ context_t ctx,
+ objaddr_t offset,
+ extent_len_t len);
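+
+  // The returned map is keyed by object offset, with the mapped value being
+  // the length of populated data at that offset; e.g. a hypothetical result
+  // {{0, 4096}, {16384, 8192}} describes two populated ranges with a hole
+  // in between.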
+
+ /// Clears data past offset
+ using truncate_iertr = base_iertr;
+ using truncate_ret = truncate_iertr::future<>;
+ truncate_ret truncate(
+ context_t ctx,
+ objaddr_t offset);
+
+ /// Clears data and reservation
+ using clear_iertr = base_iertr;
+ using clear_ret = clear_iertr::future<>;
+ clear_ret clear(context_t ctx);
+
+ /// Clone data of an Onode
+ using clone_iertr = base_iertr;
+ using clone_ret = clone_iertr::future<>;
+ clone_ret clone(context_t ctx);
+
+private:
+  /// Updates the region [offset, offset + len) to bl, or to zeros if bl is empty
+ write_ret overwrite(
+ context_t ctx, ///< [in] ctx
+ laddr_t offset, ///< [in] write offset
+ extent_len_t len, ///< [in] len to write, len == bl->length() if bl
+ std::optional<bufferlist> &&bl, ///< [in] buffer to write, empty for zeros
+ lba_pin_list_t &&pins ///< [in] set of pins overlapping above region
+ );
+
+ /// Ensures object_data reserved region is prepared
+ write_ret prepare_data_reservation(
+ context_t ctx,
+ object_data_t &object_data,
+ extent_len_t size);
+
+ /// Trims data past size
+ clear_ret trim_data_reservation(
+ context_t ctx,
+ object_data_t &object_data,
+ extent_len_t size);
+
+ clone_ret clone_extents(
+ context_t ctx,
+ object_data_t &object_data,
+ lba_pin_list_t &pins,
+ laddr_t data_base);
+
+private:
+ /**
+ * max_object_size
+ *
+ * For now, we allocate a fixed region of laddr space of size max_object_size
+ * for any object. In the future, once we have the ability to remap logical
+ * mappings (necessary for clone), we'll add the ability to grow and shrink
+ * these regions and remove this assumption.
+ */
+ const uint32_t max_object_size = 0;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::ObjectDataBlock> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/omap_manager.cc b/src/crimson/os/seastore/omap_manager.cc
new file mode 100644
index 000000000..7ad37a2e9
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager.cc
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <experimental/iterator>
+#include <iostream>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+OMapManagerRef create_omap_manager(TransactionManager &trans_manager) {
+ return OMapManagerRef(new BtreeOMapManager(trans_manager));
+}
+
+}
+
+namespace std {
+std::ostream &operator<<(std::ostream &out, const std::pair<std::string, std::string> &rhs)
+{
+  return out << "key_value_map (" << rhs.first << "->" << rhs.second << ")";
+}
+}
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const std::list<std::string> &rhs)
+{
+ out << '[';
+ std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", "));
+ return out << ']';
+}
+
+std::ostream &operator<<(std::ostream &out, const std::vector<std::pair<std::string, std::string>> &rhs)
+{
+ out << '[';
+ std::ostream_iterator<std::pair<std::string, std::string>> out_it(out, ", ");
+ std::copy(rhs.begin(), rhs.end(), out_it);
+ return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager.h b/src/crimson/os/seastore/omap_manager.h
new file mode 100644
index 000000000..fc4e03e2b
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#define OMAP_INNER_BLOCK_SIZE 4096
+#define OMAP_LEAF_BLOCK_SIZE 8192
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const std::list<std::string> &rhs);
+std::ostream &operator<<(std::ostream &out, const std::map<std::string, std::string> &rhs);
+
+class OMapManager {
+ /* All OMapManager APIs take their input string parameters by reference;
+  * the caller must guarantee that the referenced strings remain alive
+  * (not freed) until the returned futures resolve, as sketched below.
+  */
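+ /*
+  * For example, a caller holding a temporary key can pin it with
+  * seastar::do_with for the duration of the operation (illustrative sketch;
+  * omap_root, t and manager are assumed to exist in the caller):
+  *
+  *   return seastar::do_with(std::string("mykey"), [&](auto &key) {
+  *     return manager.omap_get_value(omap_root, t, key);
+  *   });
+  */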
+public:
+ using base_iertr = TransactionManager::base_iertr;
+
+ /**
+ * allocate omap tree root node
+ *
+ * @param Transaction &t, current transaction
+ * @retval return the omap_root_t structure.
+ */
+ using initialize_omap_iertr = base_iertr;
+ using initialize_omap_ret = initialize_omap_iertr::future<omap_root_t>;
+ virtual initialize_omap_ret initialize_omap(Transaction &t, laddr_t hint) = 0;
+
+ /**
+ * get value(string) by key(string)
+ *
+ * @param omap_root_t &omap_root, omap btree root information
+ * @param Transaction &t, current transaction
+ * @param string &key, omap string key
+   * @retval the value mapped to key, or std::nullopt if the key is absent
+ */
+ using omap_get_value_iertr = base_iertr;
+ using omap_get_value_ret = omap_get_value_iertr::future<
+ std::optional<bufferlist>>;
+ virtual omap_get_value_ret omap_get_value(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key) = 0;
+
+ /**
+ * set key value mapping in omap
+ *
+ * @param omap_root_t &omap_root, omap btree root information
+ * @param Transaction &t, current transaction
+ * @param string &key, omap string key
+   * @param bufferlist &value, the value to be mapped to the key
+ */
+ using omap_set_key_iertr = base_iertr;
+ using omap_set_key_ret = omap_set_key_iertr::future<>;
+ virtual omap_set_key_ret omap_set_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key,
+ const ceph::bufferlist &value) = 0;
+
+ using omap_set_keys_iertr = base_iertr;
+ using omap_set_keys_ret = omap_set_keys_iertr::future<>;
+ virtual omap_set_keys_ret omap_set_keys(
+ omap_root_t &omap_root,
+ Transaction &t,
+ std::map<std::string, ceph::bufferlist>&& keys) = 0;
+
+ /**
+ * remove key value mapping in omap tree
+ *
+ * @param omap_root_t &omap_root, omap btree root information
+ * @param Transaction &t, current transaction
+ * @param string &key, omap string key
+ */
+ using omap_rm_key_iertr = base_iertr;
+ using omap_rm_key_ret = omap_rm_key_iertr::future<>;
+ virtual omap_rm_key_ret omap_rm_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key) = 0;
+
+ /**
+ * omap_list
+ *
+ * Scans key/value pairs in order.
+ *
+ * @param omap_root: omap btree root information
+ * @param t: current transaction
+   * @param first: range start, nullopt sorts before any string,
+   *               behavior depends on config.first_inclusive,
+   *               must remain alive for the duration of the call
+   * @param last: range end, nullopt sorts after any string,
+   *              behavior depends on config.last_inclusive,
+   *              must remain alive for the duration of the call
+ * @param config: see below for params
+ * @retval listed key->value and bool indicating complete
+ */
+ struct omap_list_config_t {
+ /// max results to return
+ size_t max_result_size = 128;
+
+    /// range start behavior:
+    /// true denotes behavior like lower_bound, upper_bound otherwise
+    bool first_inclusive = false;
+ /// range end behavior
+ bool last_inclusive = false;
+
+ omap_list_config_t(
+ size_t max_result_size,
+ bool first_inclusive,
+ bool last_inclusive)
+ : max_result_size(max_result_size),
+ first_inclusive(first_inclusive),
+ last_inclusive(last_inclusive) {}
+ omap_list_config_t() {}
+ omap_list_config_t(const omap_list_config_t &) = default;
+ omap_list_config_t(omap_list_config_t &&) = default;
+ omap_list_config_t &operator=(const omap_list_config_t &) = default;
+ omap_list_config_t &operator=(omap_list_config_t &&) = default;
+
+ auto with_max(size_t max) {
+ this->max_result_size = max;
+ return *this;
+ }
+
+ auto without_max() {
+ this->max_result_size = std::numeric_limits<size_t>::max();
+ return *this;
+ }
+
+ auto with_inclusive(
+ bool first_inclusive,
+ bool last_inclusive) {
+ this->first_inclusive = first_inclusive;
+ this->last_inclusive = last_inclusive;
+ return *this;
+ }
+
+ auto with_reduced_max(size_t reduced_by) const {
+ assert(reduced_by <= max_result_size);
+ return omap_list_config_t(
+ max_result_size - reduced_by,
+ first_inclusive,
+ last_inclusive);
+ }
+ };
+ using omap_list_iertr = base_iertr;
+ using omap_list_bare_ret = std::tuple<
+ bool,
+ std::map<std::string, bufferlist, std::less<>>>;
+ using omap_list_ret = omap_list_iertr::future<omap_list_bare_ret>;
+ virtual omap_list_ret omap_list(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config = omap_list_config_t()) = 0;
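+
+  /* Example (illustrative sketch): list at most 64 keys strictly greater
+   * than *first and up to and including *last, where first/last are
+   * std::optional<std::string> kept alive by the caller:
+   *
+   *   auto config = omap_list_config_t()
+   *     .with_max(64)
+   *     .with_inclusive(false, true);
+   *   return manager.omap_list(omap_root, t, first, last, config
+   *   ).si_then([](auto &&res) {
+   *     auto &[complete, kvs] = res;
+   *     // kvs: key -> bufferlist; complete: whether the whole range was
+   *     // scanned before hitting max_result_size
+   *   });
+   */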
+
+ /**
+ * remove key value mappings in a key range from omap tree
+ *
+ * @param omap_root_t &omap_root, omap btree root information
+ * @param Transaction &t, current transaction
+ * @param string &first, range start
+ * @param string &last, range end
+ */
+ using omap_rm_key_range_iertr = base_iertr;
+ using omap_rm_key_range_ret = omap_rm_key_range_iertr::future<>;
+ virtual omap_rm_key_range_ret omap_rm_key_range(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &first,
+ const std::string &last,
+ omap_list_config_t config) = 0;
+
+ /**
+ * clear all omap tree key->value mapping
+ *
+ * @param omap_root_t &omap_root, omap btree root information
+ * @param Transaction &t, current transaction
+ */
+ using omap_clear_iertr = base_iertr;
+ using omap_clear_ret = omap_clear_iertr::future<>;
+ virtual omap_clear_ret omap_clear(omap_root_t &omap_root, Transaction &t) = 0;
+
+ virtual ~OMapManager() {}
+};
+using OMapManagerRef = std::unique_ptr<OMapManager>;
+
+namespace omap_manager {
+
+OMapManagerRef create_omap_manager (
+ TransactionManager &trans_manager);
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc
new file mode 100644
index 000000000..1782d7ee6
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+
+SET_SUBSYS(seastore_omap);
+
+namespace crimson::os::seastore::omap_manager {
+
+BtreeOMapManager::BtreeOMapManager(
+ TransactionManager &tm)
+ : tm(tm) {}
+
+BtreeOMapManager::initialize_omap_ret
+BtreeOMapManager::initialize_omap(Transaction &t, laddr_t hint)
+{
+ LOG_PREFIX(BtreeOMapManager::initialize_omap);
+ DEBUGT("hint: {}", t, hint);
+ return tm.alloc_extent<OMapLeafNode>(t, hint, OMAP_LEAF_BLOCK_SIZE)
+ .si_then([hint, &t](auto&& root_extent) {
+ root_extent->set_size(0);
+ omap_node_meta_t meta{1};
+ root_extent->set_meta(meta);
+ omap_root_t omap_root;
+ omap_root.update(root_extent->get_laddr(), 1, hint);
+ t.get_omap_tree_stats().depth = 1u;
+ t.get_omap_tree_stats().extents_num_delta++;
+ return initialize_omap_iertr::make_ready_future<omap_root_t>(omap_root);
+ });
+}
+
+BtreeOMapManager::get_root_ret
+BtreeOMapManager::get_omap_root(omap_context_t oc, const omap_root_t &omap_root)
+{
+ assert(omap_root.get_location() != L_ADDR_NULL);
+ laddr_t laddr = omap_root.get_location();
+ return omap_load_extent(oc, laddr, omap_root.get_depth());
+}
+
+BtreeOMapManager::handle_root_split_ret
+BtreeOMapManager::handle_root_split(
+ omap_context_t oc,
+ omap_root_t &omap_root,
+ const OMapNode::mutation_result_t& mresult)
+{
+ LOG_PREFIX(BtreeOMapManager::handle_root_split);
+ DEBUGT("{}", oc.t, omap_root);
+ return oc.tm.alloc_extent<OMapInnerNode>(oc.t, omap_root.hint,
+ OMAP_INNER_BLOCK_SIZE)
+ .si_then([&omap_root, mresult, oc](auto&& nroot) -> handle_root_split_ret {
+ auto [left, right, pivot] = *(mresult.split_tuple);
+ omap_node_meta_t meta{omap_root.depth + 1};
+ nroot->set_meta(meta);
+ nroot->journal_inner_insert(nroot->iter_begin(), left->get_laddr(),
+ "", nroot->maybe_get_delta_buffer());
+ nroot->journal_inner_insert(nroot->iter_begin() + 1, right->get_laddr(),
+ pivot, nroot->maybe_get_delta_buffer());
+ omap_root.update(nroot->get_laddr(), omap_root.get_depth() + 1, omap_root.hint);
+ oc.t.get_omap_tree_stats().depth = omap_root.depth;
+ ++(oc.t.get_omap_tree_stats().extents_num_delta);
+ return seastar::now();
+ });
+}
+
+BtreeOMapManager::handle_root_merge_ret
+BtreeOMapManager::handle_root_merge(
+ omap_context_t oc,
+ omap_root_t &omap_root,
+ OMapNode::mutation_result_t mresult)
+{
+ LOG_PREFIX(BtreeOMapManager::handle_root_merge);
+ DEBUGT("{}", oc.t, omap_root);
+ auto root = *(mresult.need_merge);
+ auto iter = root->cast<OMapInnerNode>()->iter_begin();
+ omap_root.update(
+ iter->get_val(),
+ omap_root.depth -= 1,
+ omap_root.hint);
+ oc.t.get_omap_tree_stats().depth = omap_root.depth;
+ oc.t.get_omap_tree_stats().extents_num_delta--;
+ return oc.tm.dec_ref(oc.t, root->get_laddr()
+ ).si_then([](auto &&ret) -> handle_root_merge_ret {
+ return seastar::now();
+ }).handle_error_interruptible(
+ handle_root_merge_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in handle_root_merge"
+ }
+ );
+}
+
+BtreeOMapManager::omap_get_value_ret
+BtreeOMapManager::omap_get_value(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_get_value);
+ DEBUGT("key={}", t, key);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &key, &omap_root](auto&& extent) {
+ return extent->get_value(get_omap_context(t, omap_root.hint), key);
+ }).si_then([](auto &&e) {
+ return omap_get_value_ret(
+ interruptible::ready_future_marker{},
+ std::move(e));
+ });
+}
+
+BtreeOMapManager::omap_set_keys_ret
+BtreeOMapManager::omap_set_keys(
+ omap_root_t &omap_root,
+ Transaction &t,
+ std::map<std::string, ceph::bufferlist>&& keys)
+{
+ return seastar::do_with(std::move(keys), [&, this](auto& keys) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [&, this](auto &p) {
+ return omap_set_key(omap_root, t, p.first, p.second);
+ });
+ });
+}
+
+BtreeOMapManager::omap_set_key_ret
+BtreeOMapManager::omap_set_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key,
+ const ceph::bufferlist &value)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_set_key);
+ DEBUGT("{} -> {}", t, key, value);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &key, &value, &omap_root](auto root) {
+ return root->insert(get_omap_context(t, omap_root.hint), key, value);
+ }).si_then([this, &omap_root, &t](auto mresult) -> omap_set_key_ret {
+ if (mresult.status == mutation_status_t::SUCCESS)
+ return seastar::now();
+ else if (mresult.status == mutation_status_t::WAS_SPLIT)
+ return handle_root_split(get_omap_context(t, omap_root.hint), omap_root, mresult);
+ else
+ return seastar::now();
+ });
+}
+
+BtreeOMapManager::omap_rm_key_ret
+BtreeOMapManager::omap_rm_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_rm_key);
+ DEBUGT("{}", t, key);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &key, &omap_root](auto root) {
+ return root->rm_key(get_omap_context(t, omap_root.hint), key);
+ }).si_then([this, &omap_root, &t](auto mresult) -> omap_rm_key_ret {
+ if (mresult.status == mutation_status_t::SUCCESS) {
+ return seastar::now();
+ } else if (mresult.status == mutation_status_t::WAS_SPLIT) {
+ return handle_root_split(get_omap_context(t, omap_root.hint), omap_root, mresult);
+ } else if (mresult.status == mutation_status_t::NEED_MERGE) {
+ auto root = *(mresult.need_merge);
+ if (root->get_node_size() == 1 && omap_root.depth != 1) {
+ return handle_root_merge(get_omap_context(t, omap_root.hint), omap_root, mresult);
+ } else {
+ return seastar::now();
+ }
+ } else {
+ return seastar::now();
+ }
+ });
+
+}
+
+BtreeOMapManager::omap_rm_key_range_ret
+BtreeOMapManager::omap_rm_key_range(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &first,
+ const std::string &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_rm_key_range);
+ DEBUGT("{} ~ {}", t, first, last);
+ assert(first <= last);
+ return seastar::do_with(
+ std::make_optional<std::string>(first),
+ std::make_optional<std::string>(last),
+ [this, &omap_root, &t, config](auto &first, auto &last) {
+ return omap_list(
+ omap_root,
+ t,
+ first,
+ last,
+ config);
+ }).si_then([this, &omap_root, &t](auto results) {
+ LOG_PREFIX(BtreeOMapManager::omap_rm_key_range);
+ auto &[complete, kvs] = results;
+ std::vector<std::string> keys;
+ for (const auto& [k, _] : kvs) {
+ keys.push_back(k);
+ }
+ DEBUGT("total {} keys to remove", t, keys.size());
+ return seastar::do_with(
+ std::move(keys),
+ [this, &omap_root, &t](auto& keys) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [this, &omap_root, &t](auto& key) {
+ return omap_rm_key(omap_root, t, key);
+ });
+ });
+ });
+}
+
+BtreeOMapManager::omap_list_ret
+BtreeOMapManager::omap_list(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_list);
+ if (first && last) {
+ DEBUGT("{}, first: {}, last: {}", t, omap_root, *first, *last);
+ assert(last >= first);
+ } else if (first) {
+ DEBUGT("{}, first: {}", t, omap_root, *first);
+ } else if (last) {
+ DEBUGT("{}, last: {}", t, omap_root, *last);
+ } else {
+ DEBUGT("{}", t, omap_root);
+ }
+
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, config, &t, &first, &last, &omap_root](auto extent) {
+ return extent->list(
+ get_omap_context(t, omap_root.hint),
+ first,
+ last,
+ config);
+ });
+}
+
+BtreeOMapManager::omap_clear_ret
+BtreeOMapManager::omap_clear(
+ omap_root_t &omap_root,
+ Transaction &t)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_clear);
+ DEBUGT("{}", t, omap_root);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &omap_root](auto extent) {
+ return extent->clear(get_omap_context(t, omap_root.hint));
+ }).si_then([this, &omap_root, &t] {
+ return tm.dec_ref(
+ t, omap_root.get_location()
+ ).si_then([&omap_root] (auto ret) {
+ omap_root.update(
+ L_ADDR_NULL,
+ 0, L_ADDR_MIN);
+ return omap_clear_iertr::now();
+ });
+ }).handle_error_interruptible(
+ omap_clear_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in BtreeOMapManager::omap_clear"
+ }
+ );
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h
new file mode 100644
index 000000000..7fcba64c0
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore::omap_manager {
+/**
+ * BtreeOMapManager
+ *
+ * Uses a btree to track:
+ * string key -> bufferlist value mapping for each onode omap
+ */
+
+class BtreeOMapManager : public OMapManager {
+ TransactionManager &tm;
+
+ omap_context_t get_omap_context(
+ Transaction &t, laddr_t addr_min) {
+ return omap_context_t{tm, t, addr_min};
+ }
+
+ /* get_omap_root
+ *
+ * load omap tree root node
+ */
+ using get_root_iertr = base_iertr;
+ using get_root_ret = get_root_iertr::future<OMapNodeRef>;
+ static get_root_ret get_omap_root(
+ omap_context_t c,
+ const omap_root_t &omap_root);
+
+ /* handle_root_split
+ *
+ * root has been split and needs to update omap_root_t
+ */
+ using handle_root_split_iertr = base_iertr;
+ using handle_root_split_ret = handle_root_split_iertr::future<>;
+ handle_root_split_ret handle_root_split(
+ omap_context_t c,
+ omap_root_t &omap_root,
+ const OMapNode::mutation_result_t& mresult);
+
+ /* handle_root_merge
+ *
+   * the root node has only one entry and is not a leaf node, so a layer needs to be removed
+ */
+ using handle_root_merge_iertr = base_iertr;
+ using handle_root_merge_ret = handle_root_merge_iertr::future<>;
+ handle_root_merge_ret handle_root_merge(
+ omap_context_t oc,
+ omap_root_t &omap_root,
+    OMapNode::mutation_result_t mresult);
+
+public:
+ explicit BtreeOMapManager(TransactionManager &tm);
+
+ initialize_omap_ret initialize_omap(Transaction &t, laddr_t hint) final;
+
+ omap_get_value_ret omap_get_value(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key) final;
+
+ omap_set_key_ret omap_set_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key, const ceph::bufferlist &value) final;
+
+ omap_set_keys_ret omap_set_keys(
+ omap_root_t &omap_root,
+ Transaction &t,
+ std::map<std::string, ceph::bufferlist>&& keys) final;
+
+ omap_rm_key_ret omap_rm_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key) final;
+
+ omap_rm_key_range_ret omap_rm_key_range(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &first,
+ const std::string &last,
+ omap_list_config_t config) final;
+
+ omap_list_ret omap_list(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config = omap_list_config_t()) final;
+
+ omap_clear_ret omap_clear(
+ omap_root_t &omap_root,
+ Transaction &t) final;
+
+};
+using BtreeOMapManagerRef = std::unique_ptr<BtreeOMapManager>;
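+
+/* Example (illustrative sketch; tm, t and hint are assumed to be provided
+ * by the surrounding SeaStore code, and the chain runs inside an
+ * interruptible transaction context):
+ *
+ *   auto omap = create_omap_manager(tm);
+ *   return omap->initialize_omap(t, hint
+ *   ).si_then([&omap, &t](auto omap_root) {
+ *     return seastar::do_with(
+ *       std::move(omap_root), std::string("key"), ceph::bufferlist(),
+ *       [&omap, &t](auto &root, auto &key, auto &val) {
+ *         val.append("value");
+ *         return omap->omap_set_key(root, t, key, val
+ *         ).si_then([&omap, &root, &t, &key] {
+ *           return omap->omap_get_value(root, t, key);
+ *         });
+ *       });
+ *   });
+ */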
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
new file mode 100644
index 000000000..795daeddb
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+#include <vector>
+
+//#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+
+namespace crimson::os::seastore::omap_manager{
+
+struct omap_context_t {
+ TransactionManager &tm;
+ Transaction &t;
+ laddr_t hint;
+};
+
+enum class mutation_status_t : uint8_t {
+ SUCCESS = 0,
+ WAS_SPLIT = 1,
+ NEED_MERGE = 2,
+ FAIL = 3
+};
+
+struct OMapNode : LogicalCachedExtent {
+ using base_iertr = OMapManager::base_iertr;
+
+ using OMapNodeRef = TCachedExtentRef<OMapNode>;
+
+ struct mutation_result_t {
+ mutation_status_t status;
+ /// Only populated if WAS_SPLIT, indicates the newly created left and right nodes
+ /// from splitting the target entry during insertion.
+ std::optional<std::tuple<OMapNodeRef, OMapNodeRef, std::string>> split_tuple;
+    /// only populated if NEED_MERGE; indicates which entry needs to be merged in the upper layer
+ std::optional<OMapNodeRef> need_merge;
+
+ mutation_result_t(mutation_status_t s, std::optional<std::tuple<OMapNodeRef,
+ OMapNodeRef, std::string>> tuple, std::optional<OMapNodeRef> n_merge)
+ : status(s),
+ split_tuple(tuple),
+ need_merge(n_merge) {}
+ };
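+
+  // Callers typically branch on status; a sketch mirroring the handling in
+  // BtreeOMapManager (split_tuple is consumed on WAS_SPLIT, need_merge on
+  // NEED_MERGE):
+  //
+  //   if (mresult.status == mutation_status_t::SUCCESS)
+  //     return seastar::now();
+  //   else if (mresult.status == mutation_status_t::WAS_SPLIT)
+  //     return handle_root_split(oc, omap_root, mresult);
+  //   else if (mresult.status == mutation_status_t::NEED_MERGE)
+  //     return handle_root_merge(oc, omap_root, mresult);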
+
+ OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ OMapNode(const OMapNode &other)
+ : LogicalCachedExtent(other) {}
+
+ using get_value_iertr = base_iertr;
+ using get_value_ret = OMapManager::omap_get_value_ret;
+ virtual get_value_ret get_value(
+ omap_context_t oc,
+ const std::string &key) = 0;
+
+ using insert_iertr = base_iertr;
+ using insert_ret = insert_iertr::future<mutation_result_t>;
+ virtual insert_ret insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value) = 0;
+
+ using rm_key_iertr = base_iertr;
+ using rm_key_ret = rm_key_iertr::future<mutation_result_t>;
+ virtual rm_key_ret rm_key(
+ omap_context_t oc,
+ const std::string &key) = 0;
+
+ using omap_list_config_t = OMapManager::omap_list_config_t;
+ using list_iertr = base_iertr;
+ using list_bare_ret = OMapManager::omap_list_bare_ret;
+ using list_ret = OMapManager::omap_list_ret;
+ virtual list_ret list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config) = 0;
+
+ using clear_iertr = base_iertr;
+ using clear_ret = clear_iertr::future<>;
+ virtual clear_ret clear(omap_context_t oc) = 0;
+
+ using full_merge_iertr = base_iertr;
+ using full_merge_ret = full_merge_iertr::future<OMapNodeRef>;
+ virtual full_merge_ret make_full_merge(
+ omap_context_t oc,
+ OMapNodeRef right) = 0;
+
+ using make_balanced_iertr = base_iertr;
+ using make_balanced_ret = make_balanced_iertr::future
+ <std::tuple<OMapNodeRef, OMapNodeRef, std::string>>;
+ virtual make_balanced_ret make_balanced(
+ omap_context_t oc,
+ OMapNodeRef _right) = 0;
+
+ virtual omap_node_meta_t get_node_meta() const = 0;
+ virtual bool extent_will_overflow(
+ size_t ksize,
+ std::optional<size_t> vsize) const = 0;
+ virtual bool can_merge(OMapNodeRef right) const = 0;
+ virtual bool extent_is_below_min() const = 0;
+ virtual uint32_t get_node_size() = 0;
+
+ virtual ~OMapNode() = default;
+};
+
+using OMapNodeRef = OMapNode::OMapNodeRef;
+
+using omap_load_extent_iertr = OMapNode::base_iertr;
+omap_load_extent_iertr::future<OMapNodeRef>
+omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth);
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapNode> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
new file mode 100644
index 000000000..4db58414a
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
@@ -0,0 +1,738 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <string.h>
+#include "include/buffer.h"
+#include "include/byteorder.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+#include "seastar/core/thread.hh"
+
+SET_SUBSYS(seastore_omap);
+
+namespace crimson::os::seastore::omap_manager {
+
+std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs)
+{
+  return out << "omap_inner_key (" << rhs.key_off << " - " << rhs.key_len
+ << " - " << rhs.laddr << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs)
+{
+  return out << "omap_leaf_key_t (" << rhs.key_off << " - " << rhs.key_len
+ << " - " << rhs.val_len << ")";
+}
+
+std::ostream &OMapInnerNode::print_detail_l(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", depth=" << get_meta().depth;
+}
+
+using dec_ref_iertr = OMapInnerNode::base_iertr;
+using dec_ref_ret = dec_ref_iertr::future<>;
+template <typename T>
+dec_ref_ret dec_ref(omap_context_t oc, T&& addr) {
+ return oc.tm.dec_ref(oc.t, std::forward<T>(addr)).handle_error_interruptible(
+ dec_ref_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in OMapInnerNode helper dec_ref"
+ }
+ ).si_then([](auto &&e) {});
+}
+
+/**
+ * make_split_insert
+ *
+ * insert an entry at iter, mapping key to laddr;
+ * always results in a split, encoded in the returned mutation_result_t
+ */
+OMapInnerNode::make_split_insert_ret
+OMapInnerNode::make_split_insert(
+ omap_context_t oc,
+ internal_iterator_t iter,
+ std::string key,
+ laddr_t laddr)
+{
+ LOG_PREFIX(OMapInnerNode::make_split_insert);
+ DEBUGT("this: {}, key: {}", oc.t, *this, key);
+ return make_split_children(oc).si_then([=] (auto tuple) {
+ auto [left, right, pivot] = tuple;
+ if (pivot > key) {
+ auto liter = left->iter_idx(iter.get_index());
+ left->journal_inner_insert(liter, laddr, key,
+ left->maybe_get_delta_buffer());
+ } else { //right
+ auto riter = right->iter_idx(iter.get_index() - left->get_node_size());
+ right->journal_inner_insert(riter, laddr, key,
+ right->maybe_get_delta_buffer());
+ }
+ ++(oc.t.get_omap_tree_stats().extents_num_delta);
+ return make_split_insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::WAS_SPLIT, tuple, std::nullopt));
+ });
+
+}
+
+
+OMapInnerNode::handle_split_ret
+OMapInnerNode::handle_split(
+ omap_context_t oc,
+ internal_iterator_t iter,
+ mutation_result_t mresult)
+{
+ LOG_PREFIX(OMapInnerNode::handle_split);
+ DEBUGT("this: {}", oc.t, *this);
+ if (!is_mutable()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter.get_index());
+ return mut->handle_split(oc, mut_iter, mresult);
+ }
+ auto [left, right, pivot] = *(mresult.split_tuple);
+ //update operation will not cause node overflow, so we can do it first.
+ journal_inner_update(iter, left->get_laddr(), maybe_get_delta_buffer());
+ bool overflow = extent_will_overflow(pivot.size(), std::nullopt);
+ if (!overflow) {
+ journal_inner_insert(iter + 1, right->get_laddr(), pivot,
+ maybe_get_delta_buffer());
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ } else {
+ return make_split_insert(oc, iter + 1, pivot, right->get_laddr())
+ .si_then([this, oc] (auto m_result) {
+ return dec_ref(oc, get_laddr())
+ .si_then([m_result = std::move(m_result)] {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ m_result);
+ });
+ });
+ }
+}
+
+OMapInnerNode::get_value_ret
+OMapInnerNode::get_value(
+ omap_context_t oc,
+ const std::string &key)
+{
+ LOG_PREFIX(OMapInnerNode::get_value);
+ DEBUGT("key = {}, this: {}", oc.t, key, *this);
+ auto child_pt = get_containing_child(key);
+ assert(child_pt != iter_cend());
+ auto laddr = child_pt->get_val();
+ return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+ [oc, &key] (auto extent) {
+ return extent->get_value(oc, key);
+ }).finally([ref = OMapNodeRef(this)] {});
+}
+
+OMapInnerNode::insert_ret
+OMapInnerNode::insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value)
+{
+ LOG_PREFIX(OMapInnerNode::insert);
+ DEBUGT("{}->{}, this: {}", oc.t, key, value, *this);
+ auto child_pt = get_containing_child(key);
+ assert(child_pt != iter_cend());
+ auto laddr = child_pt->get_val();
+ return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+ [oc, &key, &value] (auto extent) {
+ return extent->insert(oc, key, value);
+ }).si_then([this, oc, child_pt] (auto mresult) {
+ if (mresult.status == mutation_status_t::SUCCESS) {
+ return insert_iertr::make_ready_future<mutation_result_t>(mresult);
+ } else if (mresult.status == mutation_status_t::WAS_SPLIT) {
+ return handle_split(oc, child_pt, mresult);
+ } else {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ }
+ });
+}
+
+OMapInnerNode::rm_key_ret
+OMapInnerNode::rm_key(omap_context_t oc, const std::string &key)
+{
+ LOG_PREFIX(OMapInnerNode::rm_key);
+ DEBUGT("key={}, this: {}", oc.t, key, *this);
+ auto child_pt = get_containing_child(key);
+ assert(child_pt != iter_cend());
+ auto laddr = child_pt->get_val();
+ return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+ [this, oc, &key, child_pt] (auto extent) {
+ return extent->rm_key(oc, key)
+ .si_then([this, oc, child_pt, extent = std::move(extent)] (auto mresult) {
+ switch (mresult.status) {
+ case mutation_status_t::SUCCESS:
+ case mutation_status_t::FAIL:
+ return rm_key_iertr::make_ready_future<mutation_result_t>(mresult);
+ case mutation_status_t::NEED_MERGE: {
+        if (get_node_size() > 1)
+ return merge_entry(oc, child_pt, *(mresult.need_merge));
+ else
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ }
+ case mutation_status_t::WAS_SPLIT:
+ return handle_split(oc, child_pt, mresult);
+ default:
+ return rm_key_iertr::make_ready_future<mutation_result_t>(mresult);
+ }
+ });
+ });
+}
+
+OMapInnerNode::list_ret
+OMapInnerNode::list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(OMapInnerNode::list);
+ if (first && last) {
+ DEBUGT("first: {}, last: {}, this: {}", oc.t, *first, *last, *this);
+ assert(*first <= *last);
+ } else if (first) {
+ DEBUGT("first: {}, this: {}", oc.t, *first, *this);
+ } else if (last) {
+ DEBUGT("last: {}, this: {}", oc.t, *last, *this);
+ } else {
+ DEBUGT("this: {}", oc.t, *this);
+ }
+
+ auto first_iter = first ?
+ get_containing_child(*first) :
+ iter_cbegin();
+ auto last_iter = last ?
+ get_containing_child(*last) + 1:
+ iter_cend();
+ assert(first_iter != iter_cend());
+
+ return seastar::do_with(
+ first_iter,
+ last_iter,
+ iter_t(first_iter),
+ list_bare_ret(false, {}),
+ [this, &first, &last, oc, config](
+ auto &fiter,
+ auto &liter,
+ auto &iter,
+ auto &ret)
+ {
+ auto &complete = std::get<0>(ret);
+ auto &result = std::get<1>(ret);
+ return trans_intr::repeat(
+ [&, config, oc, this]() -> list_iertr::future<seastar::stop_iteration>
+ {
+ if (iter == liter) {
+ complete = true;
+ return list_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ assert(result.size() < config.max_result_size);
+ auto laddr = iter->get_val();
+ return omap_load_extent(
+ oc, laddr,
+ get_meta().depth - 1
+ ).si_then([&, config, oc](auto &&extent) {
+ return seastar::do_with(
+ iter == fiter ? first : std::optional<std::string>(std::nullopt),
+ iter == liter - 1 ? last : std::optional<std::string>(std::nullopt),
+ [&result, extent = std::move(extent), config, oc](
+ auto &nfirst,
+ auto &nlast) {
+ return extent->list(
+ oc,
+ nfirst,
+ nlast,
+ config.with_reduced_max(result.size()));
+ }).si_then([&, config](auto &&child_ret) mutable {
+ boost::ignore_unused(config); // avoid clang warning;
+ auto &[child_complete, child_result] = child_ret;
+ if (result.size() && child_result.size()) {
+ assert(child_result.begin()->first > result.rbegin()->first);
+ }
+ if (child_result.size() && first && iter == fiter) {
+ if (config.first_inclusive) {
+ assert(child_result.begin()->first >= *first);
+ } else {
+ assert(child_result.begin()->first > *first);
+ }
+ }
+ if (child_result.size() && last && iter == liter - 1) {
+ auto biter = --(child_result.end());
+ if (config.last_inclusive) {
+ assert(biter->first <= *last);
+ } else {
+ assert(biter->first < *last);
+ }
+ }
+ result.merge(std::move(child_result));
+ if (result.size() == config.max_result_size) {
+ return list_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ ++iter;
+ assert(child_complete);
+ return list_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ }).si_then([&ret, ref = OMapNodeRef(this)] {
+ return list_iertr::make_ready_future<list_bare_ret>(std::move(ret));
+ });
+ });
+}
+
+OMapInnerNode::clear_ret
+OMapInnerNode::clear(omap_context_t oc)
+{
+ LOG_PREFIX(OMapInnerNode::clear);
+ DEBUGT("this: {}", oc.t, *this);
+ return trans_intr::do_for_each(iter_begin(), iter_end(),
+ [oc, this](auto iter) {
+ auto laddr = iter->get_val();
+ auto ndepth = get_meta().depth - 1;
+ if (ndepth > 1) {
+ return omap_load_extent(oc, laddr, ndepth
+ ).si_then([oc](auto &&extent) {
+ return extent->clear(oc);
+ }).si_then([oc, laddr] {
+ return dec_ref(oc, laddr);
+ }).si_then([ref = OMapNodeRef(this)] {
+ return clear_iertr::now();
+ });
+ } else {
+ assert(ndepth == 1);
+ return dec_ref(oc, laddr
+ ).si_then([ref = OMapNodeRef(this)] {
+ return clear_iertr::now();
+ });
+ }
+ });
+}
+
+OMapInnerNode::split_children_ret
+OMapInnerNode::make_split_children(omap_context_t oc)
+{
+ LOG_PREFIX(OMapInnerNode::make_split_children);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
+ OMAP_INNER_BLOCK_SIZE, 2)
+ .si_then([this, oc] (auto &&ext_pair) {
+ LOG_PREFIX(OMapInnerNode::make_split_children);
+ auto left = ext_pair.front();
+ auto right = ext_pair.back();
+ DEBUGT("this: {}, split into: l {} r {}", oc.t, *this, *left, *right);
+ return split_children_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right)));
+ });
+}
+
+OMapInnerNode::full_merge_ret
+OMapInnerNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
+{
+ LOG_PREFIX(OMapInnerNode::make_full_merge);
+ DEBUGT("", oc.t);
+ return oc.tm.alloc_extent<OMapInnerNode>(oc.t, oc.hint,
+ OMAP_INNER_BLOCK_SIZE)
+ .si_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<OMapInnerNode>());
+ return full_merge_ret(
+ interruptible::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+
+OMapInnerNode::make_balanced_ret
+OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
+{
+ LOG_PREFIX(OMapInnerNode::make_balanced);
+ DEBUGT("l: {}, r: {}", oc.t, *this, *_right);
+ ceph_assert(_right->get_type() == TYPE);
+ return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
+ OMAP_INNER_BLOCK_SIZE, 2)
+ .si_then([this, _right] (auto &&replacement_pair){
+ auto replacement_left = replacement_pair.front();
+ auto replacement_right = replacement_pair.back();
+ auto &right = *_right->cast<OMapInnerNode>();
+ return make_balanced_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(replacement_left, replacement_right,
+ balance_into_new_nodes(*this, right,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+OMapInnerNode::merge_entry_ret
+OMapInnerNode::merge_entry(
+ omap_context_t oc,
+ internal_iterator_t iter,
+ OMapNodeRef entry)
+{
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ DEBUGT("{}, parent: {}", oc.t, *entry, *this);
+ if (!is_mutable()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter->get_index());
+ return mut->merge_entry(oc, mut_iter, entry);
+ }
+ auto is_left = (iter + 1) == iter_cend();
+ auto donor_iter = is_left ? iter - 1 : iter + 1;
+ return omap_load_extent(oc, donor_iter->get_val(), get_meta().depth - 1
+ ).si_then([=, this](auto &&donor) mutable {
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ auto [l, r] = is_left ?
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (l->can_merge(r)) {
+ DEBUGT("make_full_merge l {} r {}", oc.t, *l, *r);
+ assert(entry->extent_is_below_min());
+ return l->make_full_merge(oc, r
+ ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this]
+ (auto &&replacement) {
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ DEBUGT("to update parent: {}", oc.t, *this);
+ journal_inner_update(
+ liter,
+ replacement->get_laddr(),
+ maybe_get_delta_buffer());
+ journal_inner_remove(riter, maybe_get_delta_buffer());
+ //retire extent
+ std::vector<laddr_t> dec_laddrs {l->get_laddr(), r->get_laddr()};
+ return dec_ref(oc, dec_laddrs
+ ).si_then([this, oc] {
+ --(oc.t.get_omap_tree_stats().extents_num_delta);
+ if (extent_is_below_min()) {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::NEED_MERGE,
+ std::nullopt, this));
+ } else {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ }
+ });
+ });
+ } else {
+ DEBUGT("balanced l {} r {}", oc.t, *l, *r);
+ return l->make_balanced(oc, r
+ ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) {
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ DEBUGT("to update parent: {}", oc.t, *this);
+ auto [replacement_l, replacement_r, replacement_pivot] = tuple;
+        //update operation will not cause node overflow, so we can do it first
+ journal_inner_update(
+ liter,
+ replacement_l->get_laddr(),
+ maybe_get_delta_buffer());
+ bool overflow = extent_will_overflow(replacement_pivot.size(),
+ std::nullopt);
+ if (!overflow) {
+ journal_inner_remove(riter, maybe_get_delta_buffer());
+ journal_inner_insert(
+ riter,
+ replacement_r->get_laddr(),
+ replacement_pivot,
+ maybe_get_delta_buffer());
+ std::vector<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()};
+ return dec_ref(oc, dec_laddrs
+ ).si_then([] {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ });
+ } else {
+ DEBUGT("balanced and split {} r {}", oc.t, *l, *r);
+        //use remove and insert instead of replace,
+ //remove operation will not cause node split, so we can do it first
+ journal_inner_remove(riter, maybe_get_delta_buffer());
+ return make_split_insert(oc, riter, replacement_pivot,
+ replacement_r->get_laddr()
+ ).si_then([this, oc, l = l, r = r](auto mresult) {
+ std::vector<laddr_t> dec_laddrs{
+ l->get_laddr(),
+ r->get_laddr(),
+ get_laddr()};
+ return dec_ref(oc, dec_laddrs
+ ).si_then([mresult = std::move(mresult)] {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{}, mresult);
+ });
+ });
+ }
+ });
+ }
+ });
+
+}
+
+OMapInnerNode::internal_iterator_t
+OMapInnerNode::get_containing_child(const std::string &key)
+{
+ auto iter = std::find_if(iter_begin(), iter_end(),
+ [&key](auto it) { return it.contains(key); });
+ return iter;
+}
+
+std::ostream &OMapLeafNode::print_detail_l(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", depth=" << get_meta().depth;
+}
+
+OMapLeafNode::get_value_ret
+OMapLeafNode::get_value(omap_context_t oc, const std::string &key)
+{
+ LOG_PREFIX(OMapLeafNode::get_value);
+  DEBUGT("key = {}, this: {}", oc.t, key, *this);
+ auto ite = find_string_key(key);
+ if (ite != iter_end()) {
+ auto value = ite->get_val();
+ return get_value_ret(
+ interruptible::ready_future_marker{},
+ value);
+ } else {
+ return get_value_ret(
+ interruptible::ready_future_marker{},
+ std::nullopt);
+ }
+}
+
+OMapLeafNode::insert_ret
+OMapLeafNode::insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value)
+{
+ LOG_PREFIX(OMapLeafNode::insert);
+ DEBUGT("{} -> {}, this: {}", oc.t, key, value, *this);
+ bool overflow = extent_will_overflow(key.size(), value.length());
+ if (!overflow) {
+ if (!is_mutable()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>();
+ return mut->insert(oc, key, value);
+ }
+ auto replace_pt = find_string_key(key);
+ if (replace_pt != iter_end()) {
+ ++(oc.t.get_omap_tree_stats().num_updates);
+ journal_leaf_update(replace_pt, key, value, maybe_get_delta_buffer());
+ } else {
+ ++(oc.t.get_omap_tree_stats().num_inserts);
+ auto insert_pt = string_lower_bound(key);
+ journal_leaf_insert(insert_pt, key, value, maybe_get_delta_buffer());
+
+ DEBUGT("inserted {}, this: {}", oc.t, insert_pt.get_key(), *this);
+ }
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ } else {
+ return make_split_children(oc).si_then([this, oc, &key, &value] (auto tuple) {
+ auto [left, right, pivot] = tuple;
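+      // Route the key to the left or right split child by comparing it
+      // against the pivot key returned by the split.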
+ auto replace_pt = find_string_key(key);
+ if (replace_pt != iter_end()) {
+ ++(oc.t.get_omap_tree_stats().num_updates);
+ if (key < pivot) { //left
+ auto mut_iter = left->iter_idx(replace_pt->get_index());
+ left->journal_leaf_update(mut_iter, key, value, left->maybe_get_delta_buffer());
+ } else if (key >= pivot) { //right
+ auto mut_iter = right->iter_idx(replace_pt->get_index() - left->get_node_size());
+ right->journal_leaf_update(mut_iter, key, value, right->maybe_get_delta_buffer());
+ }
+ } else {
+ ++(oc.t.get_omap_tree_stats().num_inserts);
+ auto insert_pt = string_lower_bound(key);
+ if (key < pivot) { //left
+ auto mut_iter = left->iter_idx(insert_pt->get_index());
+ left->journal_leaf_insert(mut_iter, key, value, left->maybe_get_delta_buffer());
+ } else {
+ auto mut_iter = right->iter_idx(insert_pt->get_index() - left->get_node_size());
+ right->journal_leaf_insert(mut_iter, key, value, right->maybe_get_delta_buffer());
+ }
+ }
+ ++(oc.t.get_omap_tree_stats().extents_num_delta);
+ return dec_ref(oc, get_laddr())
+ .si_then([tuple = std::move(tuple)] {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::WAS_SPLIT, tuple, std::nullopt));
+ });
+ });
+ }
+}
+
+OMapLeafNode::rm_key_ret
+OMapLeafNode::rm_key(omap_context_t oc, const std::string &key)
+{
+ LOG_PREFIX(OMapLeafNode::rm_key);
+ DEBUGT("{}, this: {}", oc.t, key, *this);
+ auto rm_pt = find_string_key(key);
+ if (!is_mutable() && rm_pt != iter_end()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>();
+ return mut->rm_key(oc, key);
+ }
+
+ if (rm_pt != iter_end()) {
+ ++(oc.t.get_omap_tree_stats().num_erases);
+ journal_leaf_remove(rm_pt, maybe_get_delta_buffer());
+ if (extent_is_below_min()) {
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::NEED_MERGE, std::nullopt,
+ this->cast<OMapNode>()));
+ } else {
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ }
+ } else {
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::FAIL, std::nullopt, std::nullopt));
+ }
+
+}
+
+OMapLeafNode::list_ret
+OMapLeafNode::list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(OMapLeafNode::list);
+ DEBUGT(
+    "first {} last {} max_result_size {} first_inclusive {} "
+    "last_inclusive {}, this: {}",
+ oc.t,
+ first ? first->c_str() : "",
+ last ? last->c_str() : "",
+ config.max_result_size,
+ config.first_inclusive,
+ config.last_inclusive,
+ *this
+ );
+ auto ret = list_bare_ret(false, {});
+ auto &[complete, result] = ret;
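+  // Derive the starting iterator from `first` (honoring first_inclusive) and
+  // the exclusive stopping iterator from `last` (honoring last_inclusive);
+  // absent bounds default to this node's begin/end.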
+ auto iter = first ?
+ (config.first_inclusive ?
+ string_lower_bound(*first) :
+ string_upper_bound(*first)) :
+ iter_begin();
+ auto liter = last ?
+ (config.last_inclusive ?
+ string_upper_bound(*last) :
+ string_lower_bound(*last)) :
+ iter_end();
+
+ for (; iter != liter && result.size() < config.max_result_size; iter++) {
+ result.emplace(std::make_pair(iter->get_key(), iter->get_val()));
+ }
+
+ complete = (iter == liter);
+
+ return list_iertr::make_ready_future<list_bare_ret>(
+ std::move(ret));
+}
+
+OMapLeafNode::clear_ret
+OMapLeafNode::clear(omap_context_t oc)
+{
+ return clear_iertr::now();
+}
+
+OMapLeafNode::split_children_ret
+OMapLeafNode::make_split_children(omap_context_t oc)
+{
+ LOG_PREFIX(OMapLeafNode::make_split_children);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extents<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE, 2)
+ .si_then([this] (auto &&ext_pair) {
+ auto left = ext_pair.front();
+ auto right = ext_pair.back();
+ return split_children_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right)));
+ });
+}
+
+OMapLeafNode::full_merge_ret
+OMapLeafNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
+{
+ ceph_assert(right->get_type() == TYPE);
+ LOG_PREFIX(OMapLeafNode::make_full_merge);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extent<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE)
+ .si_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<OMapLeafNode>());
+ return full_merge_ret(
+ interruptible::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+
+OMapLeafNode::make_balanced_ret
+OMapLeafNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
+{
+ ceph_assert(_right->get_type() == TYPE);
+ LOG_PREFIX(OMapLeafNode::make_balanced);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extents<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE, 2)
+ .si_then([this, _right] (auto &&replacement_pair) {
+ auto replacement_left = replacement_pair.front();
+ auto replacement_right = replacement_pair.back();
+ auto &right = *_right->cast<OMapLeafNode>();
+ return make_balanced_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(
+ replacement_left, replacement_right,
+ balance_into_new_nodes(
+ *this, right,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+
+omap_load_extent_iertr::future<OMapNodeRef>
+omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth)
+{
+ ceph_assert(depth > 0);
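+  // depth > 1 loads an inner node; depth == 1 loads a leaf node.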
+ if (depth > 1) {
+ return oc.tm.read_extent<OMapInnerNode>(oc.t, laddr,
+ OMAP_INNER_BLOCK_SIZE)
+ .handle_error_interruptible(
+ omap_load_extent_iertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
+ ).si_then(
+ [](auto&& e) {
+ return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ });
+ } else {
+ return oc.tm.read_extent<OMapLeafNode>(oc.t, laddr, OMAP_LEAF_BLOCK_SIZE
+ ).handle_error_interruptible(
+ omap_load_extent_iertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
+ ).si_then(
+ [](auto&& e) {
+ return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ });
+ }
+}
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
new file mode 100644
index 000000000..a2b51bbb0
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string.h>
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+/**
+ * OMapInnerNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * omap Tree.
+ *
+ * Layout (4k):
+ * num_entries: meta : keys : values :
+ */
+
+struct OMapInnerNode
+ : OMapNode,
+ StringKVInnerNodeLayout {
+ using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ OMapInnerNode(T&&... t) :
+ OMapNode(std::forward<T>(t)...),
+ StringKVInnerNodeLayout(get_bptr().c_str()) {}
+
+ omap_node_meta_t get_node_meta() const final { return get_meta(); }
+ bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const {
+ return is_overflow(ksize);
+ }
+ bool can_merge(OMapNodeRef right) const {
+ return !is_overflow(*right->cast<OMapInnerNode>());
+ }
+ bool extent_is_below_min() const { return below_min(); }
+ uint32_t get_node_size() { return get_size(); }
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new OMapInnerNode(*this));
+ }
+
+ delta_inner_buffer_t delta_buffer;
+ delta_inner_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ get_value_ret get_value(omap_context_t oc, const std::string &key) final;
+
+ insert_ret insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value) final;
+
+ rm_key_ret rm_key(
+ omap_context_t oc,
+ const std::string &key) final;
+
+ list_ret list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config) final;
+
+ clear_ret clear(omap_context_t oc) final;
+
+ using split_children_iertr = base_iertr;
+ using split_children_ret = split_children_iertr::future
+ <std::tuple<OMapInnerNodeRef, OMapInnerNodeRef, std::string>>;
+ split_children_ret make_split_children(omap_context_t oc);
+
+ full_merge_ret make_full_merge(
+ omap_context_t oc, OMapNodeRef right) final;
+
+ make_balanced_ret make_balanced(
+ omap_context_t oc, OMapNodeRef right) final;
+
+ using make_split_insert_iertr = base_iertr;
+ using make_split_insert_ret = make_split_insert_iertr::future<mutation_result_t>;
+ make_split_insert_ret make_split_insert(
+ omap_context_t oc, internal_iterator_t iter,
+ std::string key, laddr_t laddr);
+
+ using merge_entry_iertr = base_iertr;
+ using merge_entry_ret = merge_entry_iertr::future<mutation_result_t>;
+ merge_entry_ret merge_entry(
+ omap_context_t oc,
+ internal_iterator_t iter, OMapNodeRef entry);
+
+ using handle_split_iertr = base_iertr;
+ using handle_split_ret = handle_split_iertr::future<mutation_result_t>;
+ handle_split_ret handle_split(
+ omap_context_t oc, internal_iterator_t iter,
+ mutation_result_t mresult);
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+
+ static constexpr extent_types_t TYPE = extent_types_t::OMAP_INNER;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ if (!delta_buffer.empty()) {
+ encode(delta_buffer, bl);
+ delta_buffer.clear();
+ }
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &bl) final {
+ assert(bl.length());
+ delta_inner_buffer_t buffer;
+ auto bptr = bl.cbegin();
+ decode(buffer, bptr);
+ buffer.replay(*this);
+ }
+
+ internal_iterator_t get_containing_child(const std::string &key);
+};
+using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef;
+
+/**
+ * OMapLeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * OMap Tree.
+ *
+ * Layout (4k):
+ * num_entries: meta : keys : values :
+ */
+
+struct OMapLeafNode
+ : OMapNode,
+ StringKVLeafNodeLayout {
+
+ using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ OMapLeafNode(T&&... t) :
+ OMapNode(std::forward<T>(t)...),
+ StringKVLeafNodeLayout(get_bptr().c_str()) {}
+
+ omap_node_meta_t get_node_meta() const final { return get_meta(); }
+ bool extent_will_overflow(
+ size_t ksize, std::optional<size_t> vsize) const {
+ return is_overflow(ksize, *vsize);
+ }
+ bool can_merge(OMapNodeRef right) const {
+ return !is_overflow(*right->cast<OMapLeafNode>());
+ }
+ bool extent_is_below_min() const { return below_min(); }
+ uint32_t get_node_size() { return get_size(); }
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new OMapLeafNode(*this));
+ }
+
+ delta_leaf_buffer_t delta_buffer;
+ delta_leaf_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ get_value_ret get_value(
+ omap_context_t oc, const std::string &key) final;
+
+ insert_ret insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value) final;
+
+ rm_key_ret rm_key(
+ omap_context_t oc, const std::string &key) final;
+
+ list_ret list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config) final;
+
+ clear_ret clear(
+ omap_context_t oc) final;
+
+ using split_children_iertr = base_iertr;
+ using split_children_ret = split_children_iertr::future
+ <std::tuple<OMapLeafNodeRef, OMapLeafNodeRef, std::string>>;
+ split_children_ret make_split_children(
+ omap_context_t oc);
+
+ full_merge_ret make_full_merge(
+ omap_context_t oc,
+ OMapNodeRef right) final;
+
+ make_balanced_ret make_balanced(
+ omap_context_t oc,
+ OMapNodeRef _right) final;
+
+ static constexpr extent_types_t TYPE = extent_types_t::OMAP_LEAF;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ if (!delta_buffer.empty()) {
+ encode(delta_buffer, bl);
+ delta_buffer.clear();
+ }
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
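+    // copy and rebuild the bufferlist so the delta is decoded from
+    // contiguous memory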
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_leaf_buffer_t buffer;
+ auto bptr = bl.cbegin();
+ decode(buffer, bptr);
+ buffer.replay(*this);
+ }
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+
+ std::pair<internal_iterator_t, internal_iterator_t>
+ get_leaf_entries(std::string &key);
+
+};
+using OMapLeafNodeRef = OMapLeafNode::OMapLeafNodeRef;
+
+std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs);
+std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs);
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapInnerNode> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapLeafNode> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_types.h b/src/crimson/os/seastore/omap_manager/btree/omap_types.h
new file mode 100644
index 000000000..9e0d10e03
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_types.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+struct omap_node_meta_t {
+ depth_t depth = 0;
+
+ std::pair<omap_node_meta_t, omap_node_meta_t> split_into() const {
+ return std::make_pair(
+ omap_node_meta_t{depth},
+ omap_node_meta_t{depth});
+ }
+
+ static omap_node_meta_t merge_from(
+ const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return omap_node_meta_t{lhs.depth};
+ }
+
+ static std::pair<omap_node_meta_t, omap_node_meta_t>
+ rebalance(const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return std::make_pair(
+ omap_node_meta_t{lhs.depth},
+ omap_node_meta_t{lhs.depth});
+ }
+};
+
+struct omap_node_meta_le_t {
+ depth_le_t depth = init_depth_le(0);
+
+ omap_node_meta_le_t() = default;
+ omap_node_meta_le_t(const omap_node_meta_le_t &) = default;
+ explicit omap_node_meta_le_t(const omap_node_meta_t &val)
+ : depth(init_depth_le(val.depth)) {}
+
+ operator omap_node_meta_t() const {
+ return omap_node_meta_t{ depth };
+ }
+};
+
+struct omap_inner_key_t {
+ uint16_t key_off = 0;
+ uint16_t key_len = 0;
+ laddr_t laddr = 0;
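+  // key_off is the cumulative offset of this key's bytes from the right end
+  // of the node buffer, key_len is the key's length, and laddr points at the
+  // child node (see string_kv_node_layout.h for the layout diagram).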
+
+ omap_inner_key_t() = default;
+ omap_inner_key_t(uint16_t off, uint16_t len, laddr_t addr)
+ : key_off(off), key_len(len), laddr(addr) {}
+
+ inline bool operator==(const omap_inner_key_t b) const {
+ return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr;
+ }
+ inline bool operator!=(const omap_inner_key_t b) const {
+ return key_off != b.key_off || key_len != b.key_len || laddr != b.laddr;
+ }
+ DENC(omap_inner_key_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.key_off, p);
+ denc(v.key_len, p);
+ denc(v.laddr, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct omap_inner_key_le_t {
+ ceph_le16 key_off{0};
+ ceph_le16 key_len{0};
+ laddr_le_t laddr{0};
+
+ omap_inner_key_le_t() = default;
+ omap_inner_key_le_t(const omap_inner_key_le_t &) = default;
+ explicit omap_inner_key_le_t(const omap_inner_key_t &key)
+ : key_off(key.key_off),
+ key_len(key.key_len),
+ laddr(key.laddr) {}
+
+ operator omap_inner_key_t() const {
+ return omap_inner_key_t{uint16_t(key_off), uint16_t(key_len), laddr_t(laddr)};
+ }
+
+ omap_inner_key_le_t& operator=(omap_inner_key_t key) {
+ key_off = key.key_off;
+ key_len = key.key_len;
+ laddr = laddr_le_t(key.laddr);
+ return *this;
+ }
+
+ inline bool operator==(const omap_inner_key_le_t b) const {
+ return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr;
+ }
+};
+
+struct omap_leaf_key_t {
+ uint16_t key_off = 0;
+ uint16_t key_len = 0;
+ uint16_t val_len = 0;
+
+ omap_leaf_key_t() = default;
+ omap_leaf_key_t(uint16_t k_off, uint16_t k_len, uint16_t v_len)
+ : key_off(k_off), key_len(k_len), val_len(v_len) {}
+
+ inline bool operator==(const omap_leaf_key_t b) const {
+ return key_off == b.key_off && key_len == b.key_len &&
+ val_len == b.val_len;
+ }
+ inline bool operator!=(const omap_leaf_key_t b) const {
+ return key_off != b.key_off || key_len != b.key_len ||
+ val_len != b.val_len;
+ }
+
+ DENC(omap_leaf_key_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.key_off, p);
+ denc(v.key_len, p);
+ denc(v.val_len, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct omap_leaf_key_le_t {
+ ceph_le16 key_off{0};
+ ceph_le16 key_len{0};
+ ceph_le16 val_len{0};
+
+ omap_leaf_key_le_t() = default;
+ omap_leaf_key_le_t(const omap_leaf_key_le_t &) = default;
+ explicit omap_leaf_key_le_t(const omap_leaf_key_t &key)
+ : key_off(key.key_off),
+ key_len(key.key_len),
+ val_len(key.val_len) {}
+
+ operator omap_leaf_key_t() const {
+ return omap_leaf_key_t{uint16_t(key_off), uint16_t(key_len),
+ uint16_t(val_len)};
+ }
+
+ omap_leaf_key_le_t& operator=(omap_leaf_key_t key) {
+ key_off = key.key_off;
+ key_len = key.key_len;
+ val_len = key.val_len;
+ return *this;
+ }
+
+ inline bool operator==(const omap_leaf_key_le_t b) const {
+ return key_off == b.key_off && key_len == b.key_len &&
+ val_len == b.val_len;
+ }
+};
+
+}
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::omap_manager::omap_inner_key_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::omap_manager::omap_leaf_key_t)
diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
new file mode 100644
index 000000000..72b13fedf
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
@@ -0,0 +1,1550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "include/byteorder.h"
+#include "include/denc.h"
+#include "include/encoding.h"
+
+#include "crimson/common/layout.h"
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+
+namespace crimson::os::seastore::omap_manager {
+class StringKVInnerNodeLayout;
+class StringKVLeafNodeLayout;
+
+/**
+ * copy_from_foreign
+ *
+ * Copies entries from another node into this node.
+ * [from_src, to_src) is the source range in the other node;
+ * tgt is the entry position in this node to copy to.
+ * tgt and from_src must be from different nodes.
+ * from_src and to_src must be in the same node.
+ */
+template <typename iterator, typename const_iterator>
+static void copy_from_foreign(
+ iterator tgt,
+ const_iterator from_src,
+ const_iterator to_src) {
+ assert(tgt->node != from_src->node);
+ assert(to_src->node == from_src->node);
+ if (from_src == to_src)
+ return;
+
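+  // Move the variable-length key blobs packed at the right end of the buffer
+  // first, then the fixed-size key slots at the left end, and finally patch
+  // each copied slot's offset for its new right-end position.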
+ auto to_copy = from_src->get_right_ptr_end() - to_src->get_right_ptr_end();
+ assert(to_copy > 0);
+ memcpy(
+ tgt->get_right_ptr_end() - to_copy,
+ to_src->get_right_ptr_end(),
+ to_copy);
+ memcpy(
+ tgt->get_node_key_ptr(),
+ from_src->get_node_key_ptr(),
+ to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+
+ auto offset_diff = tgt->get_right_offset_end() - from_src->get_right_offset_end();
+ for (auto i = tgt; i != tgt + (to_src - from_src); ++i) {
+ i->update_offset(offset_diff);
+ }
+}
+
+/**
+ * copy_from_local
+ *
+ * Copies entries from [from_src, to_src) to tgt.
+ * tgt, from_src, and to_src must be from the same node.
+ */
+template <typename iterator>
+static void copy_from_local(
+ unsigned len,
+ iterator tgt,
+ iterator from_src,
+ iterator to_src) {
+ assert(tgt->node == from_src->node);
+ assert(to_src->node == from_src->node);
+
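+  // Shift the packed blobs at the right end by len bytes (direction depends
+  // on whether entries move toward the front or the back), fix up the
+  // offsets of the moved entries, then shift the fixed-size key slots.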
+ auto to_copy = from_src->get_right_ptr_end() - to_src->get_right_ptr_end();
+ assert(to_copy > 0);
+ int adjust_offset = tgt > from_src? -len : len;
+ memmove(to_src->get_right_ptr_end() + adjust_offset,
+ to_src->get_right_ptr_end(),
+ to_copy);
+
+ for ( auto ite = from_src; ite < to_src; ite++) {
+ ite->update_offset(-adjust_offset);
+ }
+ memmove(tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+ to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+}
+
+struct delta_inner_t {
+ enum class op_t : uint_fast8_t {
+ INSERT,
+ UPDATE,
+ REMOVE,
+ } op;
+ std::string key;
+ laddr_t addr;
+
+ DENC(delta_inner_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.key, p);
+ denc(v.addr, p);
+ DENC_FINISH(p);
+ }
+
+ void replay(StringKVInnerNodeLayout &l);
+ bool operator==(const delta_inner_t &rhs) const {
+ return op == rhs.op &&
+ key == rhs.key &&
+ addr == rhs.addr;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_inner_t)
+
+namespace crimson::os::seastore::omap_manager {
+struct delta_leaf_t {
+ enum class op_t : uint_fast8_t {
+ INSERT,
+ UPDATE,
+ REMOVE,
+ } op;
+ std::string key;
+ ceph::bufferlist val;
+
+ DENC(delta_leaf_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.key, p);
+ denc(v.val, p);
+ DENC_FINISH(p);
+ }
+
+ void replay(StringKVLeafNodeLayout &l);
+ bool operator==(const delta_leaf_t &rhs) const {
+ return op == rhs.op &&
+ key == rhs.key &&
+ val == rhs.val;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_leaf_t)
+
+namespace crimson::os::seastore::omap_manager {
+class delta_inner_buffer_t {
+ std::vector<delta_inner_t> buffer;
+public:
+ bool empty() const {
+ return buffer.empty();
+ }
+ void insert(
+ const std::string &key,
+ laddr_t addr) {
+ buffer.push_back(
+ delta_inner_t{
+ delta_inner_t::op_t::INSERT,
+ key,
+ addr
+ });
+ }
+ void update(
+ const std::string &key,
+ laddr_t addr) {
+ buffer.push_back(
+ delta_inner_t{
+ delta_inner_t::op_t::UPDATE,
+ key,
+ addr
+ });
+ }
+ void remove(const std::string &key) {
+ buffer.push_back(
+ delta_inner_t{
+ delta_inner_t::op_t::REMOVE,
+ key,
+ L_ADDR_NULL
+ });
+ }
+
+ void replay(StringKVInnerNodeLayout &node) {
+ for (auto &i: buffer) {
+ i.replay(node);
+ }
+ }
+
+ void clear() {
+ buffer.clear();
+ }
+
+ DENC(delta_inner_buffer_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.buffer, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const delta_inner_buffer_t &rhs) const {
+ return buffer == rhs.buffer;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_inner_buffer_t)
+
+namespace crimson::os::seastore::omap_manager {
+class delta_leaf_buffer_t {
+ std::vector<delta_leaf_t> buffer;
+public:
+ bool empty() const {
+ return buffer.empty();
+ }
+ void insert(
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ buffer.push_back(
+ delta_leaf_t{
+ delta_leaf_t::op_t::INSERT,
+ key,
+ val
+ });
+ }
+ void update(
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ buffer.push_back(
+ delta_leaf_t{
+ delta_leaf_t::op_t::UPDATE,
+ key,
+ val
+ });
+ }
+ void remove(const std::string &key) {
+ buffer.push_back(
+ delta_leaf_t{
+ delta_leaf_t::op_t::REMOVE,
+ key,
+ bufferlist()
+ });
+ }
+
+ void replay(StringKVLeafNodeLayout &node) {
+ for (auto &i: buffer) {
+ i.replay(node);
+ }
+ }
+
+ void clear() {
+ buffer.clear();
+ }
+
+ DENC(delta_leaf_buffer_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.buffer, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const delta_leaf_buffer_t &rhs) const {
+ return buffer == rhs.buffer;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_leaf_buffer_t)
+
+namespace crimson::os::seastore::omap_manager {
+/**
+ * StringKVInnerNodeLayout
+ *
+ * Uses absl::container_internal::Layout for the actual key memory layout.
+ *
+ * The primary interface exposed is centered on the iterator
+ * and related methods.
+ *
+ * Also included are helpers for doing splits and merges as for a btree.
+ *
+ * layout diagram:
+ *
+ * # <----------------------------- node range --------------------------------------------> #
+ * # #<~># free space #
+ * # <------------- left part -----------------------------> # <~# <----- right keys -----> #
+ * # # <------------ left keys --------------> #~> # #
+ * # # keys [2, n) |<~># #<~>| right keys [2, n) #
+ * # # <--- key 0 ----> | <--- key 1 ----> | # # | <- k1 -> | <-- k0 --> #
+ * # # | | # # | | #
+ * # num_ | meta # key | key | val | key | key | val | # # | key | key #
+ * # keys | depth # off | len | laddr| off | len | laddr| # # | buff | buff #
+ * # | # 0 | 0 | 0 | 1 | 1 | 1 |...#...#...| key 1 | key 0 #
+ * # | | | <- off --+----------> #
+ * # | | ^ | <- off --> #
+ * | | | ^
+ * | +----------------------------------+ |
+ * +----------------------------------------------------------------+
+ */
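+// Illustrative example (hypothetical keys): after inserting "a" (len 1) and
+// then "bc" (len 2), the key slots record key_off 1 and 3 respectively;
+// offsets are cumulative distances from the right end of the block, so "a"
+// occupies the last byte of the buffer and "bc" the two bytes before it.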
+class StringKVInnerNodeLayout {
+ char *buf = nullptr;
+
+ using L = absl::container_internal::Layout<ceph_le32, omap_node_meta_le_t, omap_inner_key_le_t>;
+ static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1);
+ friend class delta_inner_t;
+public:
+ template <bool is_const>
+ class iter_t {
+ friend class StringKVInnerNodeLayout;
+
+ template <typename iterator, typename const_iterator>
+ friend void copy_from_foreign(iterator, const_iterator, const_iterator);
+ template <typename iterator>
+ friend void copy_from_local(unsigned, iterator, iterator, iterator);
+
+ using parent_t = typename crimson::common::maybe_const_t<StringKVInnerNodeLayout, is_const>::type;
+
+ mutable parent_t node;
+ uint16_t index;
+
+ iter_t(
+ parent_t parent,
+ uint16_t index) : node(parent), index(index) {}
+
+ public:
+ using iterator_category = std::input_iterator_tag;
+ using value_type = StringKVInnerNodeLayout;
+ using difference_type = std::ptrdiff_t;
+ using pointer = StringKVInnerNodeLayout*;
+ using reference = iter_t&;
+
+ iter_t(const iter_t &) = default;
+ iter_t(iter_t &&) = default;
+ iter_t &operator=(const iter_t &) = default;
+ iter_t &operator=(iter_t &&) = default;
+
+ operator iter_t<!is_const>() const {
+ static_assert(!is_const);
+ return iter_t<!is_const>(node, index);
+ }
+
+ iter_t &operator*() { return *this; }
+ iter_t *operator->() { return this; }
+
+ iter_t operator++(int) {
+ auto ret = *this;
+ ++index;
+ return ret;
+ }
+
+ iter_t &operator++() {
+ ++index;
+ return *this;
+ }
+
+ iter_t operator--(int) {
+ auto ret = *this;
+ assert(index > 0);
+ --index;
+ return ret;
+ }
+
+ iter_t &operator--() {
+ assert(index > 0);
+ --index;
+ return *this;
+ }
+
+ uint16_t operator-(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index - rhs.index;
+ }
+
+ iter_t operator+(uint16_t off) const {
+ return iter_t(node, index + off);
+ }
+ iter_t operator-(uint16_t off) const {
+ return iter_t(node, index - off);
+ }
+
+ uint16_t operator<(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index < rhs.index;
+ }
+
+ uint16_t operator>(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index > rhs.index;
+ }
+
+ friend bool operator==(const iter_t &lhs, const iter_t &rhs) {
+ assert(lhs.node == rhs.node);
+ return lhs.index == rhs.index;
+ }
+
+ private:
+ omap_inner_key_t get_node_key() const {
+ omap_inner_key_le_t kint = node->get_node_key_ptr()[index];
+ return omap_inner_key_t(kint);
+ }
+ auto get_node_key_ptr() const {
+ return reinterpret_cast<
+ typename crimson::common::maybe_const_t<char, is_const>::type>(
+ node->get_node_key_ptr() + index);
+ }
+
+ uint32_t get_node_val_offset() const {
+ return get_node_key().key_off;
+ }
+ auto get_node_val_ptr() const {
+ auto tail = node->buf + OMAP_INNER_BLOCK_SIZE;
+ if (*this == node->iter_end())
+ return tail;
+ else {
+ return tail - get_node_val_offset();
+ }
+ }
+
+ int get_right_offset_end() const {
+ if (index == 0)
+ return 0;
+ else
+ return (*this - 1)->get_node_val_offset();
+ }
+ auto get_right_ptr_end() const {
+ return node->buf + OMAP_INNER_BLOCK_SIZE - get_right_offset_end();
+ }
+
+ void update_offset(int offset) {
+ static_assert(!is_const);
+ auto key = get_node_key();
+ assert(offset + key.key_off >= 0);
+ key.key_off += offset;
+ set_node_key(key);
+ }
+
+ void set_node_key(omap_inner_key_t _lb) {
+ static_assert(!is_const);
+ omap_inner_key_le_t lb;
+ lb = _lb;
+ node->get_node_key_ptr()[index] = lb;
+ }
+
+ void set_node_val(const std::string &str) {
+ static_assert(!is_const);
+ assert(str.size() == get_node_key().key_len);
+ assert(get_node_key().key_off >= str.size());
+ assert(get_node_key().key_off < OMAP_INNER_BLOCK_SIZE);
+ assert(str.size() < OMAP_INNER_BLOCK_SIZE);
+ ::memcpy(get_node_val_ptr(), str.data(), str.size());
+ }
+
+ public:
+ uint16_t get_index() const {
+ return index;
+ }
+
+ std::string get_key() const {
+ return std::string(
+ get_node_val_ptr(),
+ get_node_key().key_len);
+ }
+
+ laddr_t get_val() const {
+ return get_node_key().laddr;
+ }
+
+ bool contains(std::string_view key) const {
+ assert(*this != node->iter_end());
+ auto next = *this + 1;
+ if (next == node->iter_end()) {
+ return get_key() <= key;
+ } else {
+ return (get_key() <= key) && (next->get_key() > key);
+ }
+ }
+ };
+ using const_iterator = iter_t<true>;
+ using iterator = iter_t<false>;
+
+public:
+ void journal_inner_insert(
+ const_iterator _iter,
+ const laddr_t laddr,
+ const std::string &key,
+ delta_inner_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->insert(
+ key,
+ laddr);
+ }
+ inner_insert(iter, key, laddr);
+ }
+
+ void journal_inner_update(
+ const_iterator _iter,
+ const laddr_t laddr,
+ delta_inner_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ auto key = iter->get_key();
+ if (recorder) {
+ recorder->update(key, laddr);
+ }
+ inner_update(iter, laddr);
+ }
+
+ void journal_inner_remove(
+ const_iterator _iter,
+ delta_inner_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ }
+ inner_remove(iter);
+ }
+
+ StringKVInnerNodeLayout(char *buf) :
+ buf(buf) {}
+
+ uint32_t get_size() const {
+ ceph_le32 &size = *layout.template Pointer<0>(buf);
+ return uint32_t(size);
+ }
+
+ /**
+ * set_size
+ *
+ * Set size representation to match size
+ */
+ void set_size(uint32_t size) {
+ ceph_le32 s;
+ s = size;
+ *layout.template Pointer<0>(buf) = s;
+ }
+
+ const_iterator iter_cbegin() const {
+ return const_iterator(
+ this,
+ 0);
+ }
+ const_iterator iter_begin() const {
+ return iter_cbegin();
+ }
+
+ const_iterator iter_cend() const {
+ return const_iterator(
+ this,
+ get_size());
+ }
+ const_iterator iter_end() const {
+ return iter_cend();
+ }
+
+ iterator iter_begin() {
+ return iterator(
+ this,
+ 0);
+ }
+
+ iterator iter_end() {
+ return iterator(
+ this,
+ get_size());
+ }
+
+ const_iterator iter_idx(uint16_t off) const {
+ return const_iterator(
+ this,
+ off);
+ }
+
+ const_iterator string_lower_bound(std::string_view str) const {
+ auto it = std::lower_bound(boost::make_counting_iterator<uint16_t>(0),
+ boost::make_counting_iterator<uint16_t>(get_size()),
+ str,
+ [this](uint16_t i, std::string_view str) {
+ const_iterator iter(this, i);
+ return iter->get_key() < str;
+ });
+ return const_iterator(this, *it);
+ }
+
+ iterator string_lower_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_lower_bound(str).index);
+ }
+
+ const_iterator string_upper_bound(std::string_view str) const {
+ auto it = std::upper_bound(boost::make_counting_iterator<uint16_t>(0),
+ boost::make_counting_iterator<uint16_t>(get_size()),
+ str,
+ [this](std::string_view str, uint16_t i) {
+ const_iterator iter(this, i);
+ return str < iter->get_key();
+ });
+ return const_iterator(this, *it);
+ }
+
+ iterator string_upper_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_upper_bound(str).index);
+ }
+
+ const_iterator find_string_key(std::string_view str) const {
+ auto ret = iter_begin();
+ for (; ret != iter_end(); ++ret) {
+ std::string s = ret->get_key();
+ if (s == str)
+ break;
+ }
+ return ret;
+ }
+
+ iterator find_string_key(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.find_string_key(str).index);
+ }
+
+ const_iterator get_split_pivot() const {
+ uint32_t total_size = omap_inner_key_t(
+ get_node_key_ptr()[get_size()-1]).key_off;
+ uint32_t pivot_size = total_size / 2;
+ uint32_t size = 0;
+ for (auto ite = iter_begin(); ite < iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len;
+ if (size >= pivot_size){
+ return ite;
+ }
+ }
+ return iter_end();
+ }
+
+
+ /**
+ * get_meta/set_meta
+ *
+ * Enables stashing a templated type within the layout.
+ * Cannot be modified after initial write as it is not represented
+ * in delta_t
+ */
+ omap_node_meta_t get_meta() const {
+ omap_node_meta_le_t &metaint = *layout.template Pointer<1>(buf);
+ return omap_node_meta_t(metaint);
+ }
+ void set_meta(const omap_node_meta_t &meta) {
+ *layout.template Pointer<1>(buf) = omap_node_meta_le_t(meta);
+ }
+
+ uint32_t used_space() const {
+ uint32_t count = get_size();
+ if (count) {
+ omap_inner_key_t last_key = omap_inner_key_t(get_node_key_ptr()[count-1]);
+ return last_key.key_off + count * sizeof(omap_inner_key_le_t);
+ } else {
+ return 0;
+ }
+ }
+
+ uint32_t free_space() const {
+ return capacity() - used_space();
+ }
+
+ uint16_t capacity() const {
+ return OMAP_INNER_BLOCK_SIZE
+ - (reinterpret_cast<char*>(layout.template Pointer<2>(buf))
+ - reinterpret_cast<char*>(layout.template Pointer<0>(buf)));
+ }
+
+ bool is_overflow(size_t ksize) const {
+ return free_space() < (sizeof(omap_inner_key_le_t) + ksize);
+ }
+
+ bool is_overflow(const StringKVInnerNodeLayout &rhs) const {
+ return free_space() < rhs.used_space();
+ }
+
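+  // A node is considered under-filled (a merge/balance candidate) once more
+  // than half of its capacity is free.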
+ bool below_min() const {
+ return free_space() > (capacity() / 2);
+ }
+
+ bool operator==(const StringKVInnerNodeLayout &rhs) const {
+ if (get_size() != rhs.get_size()) {
+ return false;
+ }
+
+ auto iter = iter_begin();
+ auto iter2 = rhs.iter_begin();
+ while (iter != iter_end()) {
+ if (iter->get_key() != iter2->get_key() ||
+ iter->get_val() != iter2->get_val()) {
+ return false;
+ }
+ iter++;
+ iter2++;
+ }
+ return true;
+ }
+
+ /**
+ * split_into
+ *
+ * Takes *this and splits its contents into left and right.
+ */
+ std::string split_into(
+ StringKVInnerNodeLayout &left,
+ StringKVInnerNodeLayout &right) const {
+ auto piviter = get_split_pivot();
+ assert(piviter != iter_end());
+
+ copy_from_foreign(left.iter_begin(), iter_begin(), piviter);
+ left.set_size(piviter - iter_begin());
+
+ copy_from_foreign(right.iter_begin(), piviter, iter_end());
+ right.set_size(iter_end() - piviter);
+
+ auto [lmeta, rmeta] = get_meta().split_into();
+ left.set_meta(lmeta);
+ right.set_meta(rmeta);
+
+ return piviter->get_key();
+ }
+
+ /**
+ * merge_from
+ *
+ * Takes two nodes and copies their contents into *this.
+ *
+ * precondition: left.size() + right.size() < CAPACITY
+ */
+ void merge_from(
+ const StringKVInnerNodeLayout &left,
+ const StringKVInnerNodeLayout &right) {
+ copy_from_foreign(
+ iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ set_size(left.get_size());
+
+ copy_from_foreign(
+ iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ set_size(left.get_size() + right.get_size());
+ set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
+ }
+
+ /**
+ * balance_into_new_nodes
+ *
+ * Takes the contents of left and right and copies them into
+ * replacement_left and replacement_right such that
+ * the size of replacement_left is just >= 1/2 of (left + right).
+ */
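+  // Example with hypothetical sizes: if left holds 3000 bytes of keys and
+  // right holds 1000, pivot_size is 2000, so the pivot index falls inside
+  // left and roughly the last 1000 bytes of left's entries move into
+  // replacement_right together with all of right's entries.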
+ static std::string balance_into_new_nodes(
+ const StringKVInnerNodeLayout &left,
+ const StringKVInnerNodeLayout &right,
+ StringKVInnerNodeLayout &replacement_left,
+ StringKVInnerNodeLayout &replacement_right)
+ {
+ uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
+ uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
+ uint32_t total = left_size + right_size;
+ uint32_t pivot_size = total / 2;
+ uint32_t pivot_idx = 0;
+ if (pivot_size < left_size) {
+ uint32_t size = 0;
+ for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len;
+ if (size >= pivot_size){
+ pivot_idx = ite.get_index();
+ break;
+ }
+ }
+ } else {
+ uint32_t more_size = pivot_size - left_size;
+ uint32_t size = 0;
+ for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len;
+ if (size >= more_size){
+ pivot_idx = ite.get_index() + left.get_size();
+ break;
+ }
+ }
+ }
+
+ auto replacement_pivot = pivot_idx >= left.get_size() ?
+ right.iter_idx(pivot_idx - left.get_size())->get_key() :
+ left.iter_idx(pivot_idx)->get_key();
+
+ if (pivot_size < left_size) {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_idx(pivot_idx));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ left.iter_idx(pivot_idx),
+ left.iter_end());
+ replacement_right.set_size(left.get_size() - pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size()- pivot_idx);
+ } else {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ replacement_left.set_size(left.get_size());
+
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ right.iter_begin(),
+ right.iter_idx(pivot_idx - left.get_size()));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_idx(pivot_idx - left.get_size()),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+ }
+
+ auto [lmeta, rmeta] = omap_node_meta_t::rebalance(
+ left.get_meta(), right.get_meta());
+ replacement_left.set_meta(lmeta);
+ replacement_right.set_meta(rmeta);
+ return replacement_pivot;
+ }
+
+private:
+ void inner_insert(
+ iterator iter,
+ const std::string &key,
+ laddr_t val) {
+ if (iter != iter_begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if (iter != iter_end()) {
+ assert(iter->get_key() > key);
+ }
+ assert(!is_overflow(key.size()));
+
+ if (iter != iter_end()) {
+ copy_from_local(key.size(), iter + 1, iter, iter_end());
+ }
+
+ omap_inner_key_t nkey;
+ nkey.key_len = key.size();
+ nkey.laddr = val;
+ if (iter != iter_begin()) {
+ auto pkey = (iter - 1).get_node_key();
+ nkey.key_off = nkey.key_len + pkey.key_off;
+ } else {
+ nkey.key_off = nkey.key_len;
+ }
+
+ iter->set_node_key(nkey);
+ set_size(get_size() + 1);
+ iter->set_node_val(key);
+ }
+
+ void inner_update(
+ iterator iter,
+ laddr_t addr) {
+ assert(iter != iter_end());
+ auto node_key = iter->get_node_key();
+ node_key.laddr = addr;
+ iter->set_node_key(node_key);
+ }
+
+ void inner_remove(iterator iter) {
+ assert(iter != iter_end());
+ if ((iter + 1) != iter_end())
+ copy_from_local(iter->get_node_key().key_len, iter, iter + 1, iter_end());
+ set_size(get_size() - 1);
+ }
+
+ /**
+ * get_key_ptr
+ *
+ * Get pointer to start of key array
+ */
+ omap_inner_key_le_t *get_node_key_ptr() {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+ const omap_inner_key_le_t *get_node_key_ptr() const {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+
+};
+
+/**
+ * StringKVLeafNodeLayout
+ *
+ * layout diagram:
+ *
+ * # <----------------------------- node range -------------------------------------------------> #
+ * # #<~># free space #
+ * # <------------- left part ---------------------------> # <~# <----- right key-value pairs --> #
+ * # # <------------ left keys ------------> #~> # #
+ * # # keys [2, n) |<~># #<~>| right kvs [2, n) #
+ * # # <--- key 0 ---> | <--- key 1 ---> | # # | <-- kv 1 --> | <-- kv 0 --> #
+ * # # | | # # | | #
+ * # num_ | meta # key | key | val | key | key | val | # # | key | val | key | val #
+ * # keys | depth # off | len | len | off | len | len | # # | buff | buff | buff | buff #
+ * # # 0 | 0 | 0 | 1 | 1 | 1 |...#...#...| key 1 | val 1| key 0 | val 0 #
+ * # | | | <--- off ----+-------------> #
+ * # | | ^ | <--- off ---> #
+ * | | | ^
+ * | +-----------------------------------+ |
+ * +-------------------------------------------------------------------+
+ */
+class StringKVLeafNodeLayout {
+ char *buf = nullptr;
+
+ using L = absl::container_internal::Layout<ceph_le32, omap_node_meta_le_t, omap_leaf_key_le_t>;
+ static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1);
+ friend class delta_leaf_t;
+
+public:
+ template <bool is_const>
+ class iter_t {
+ friend class StringKVLeafNodeLayout;
+ using parent_t = typename crimson::common::maybe_const_t<StringKVLeafNodeLayout, is_const>::type;
+
+ template <typename iterator, typename const_iterator>
+ friend void copy_from_foreign(iterator, const_iterator, const_iterator);
+ template <typename iterator>
+ friend void copy_from_local(unsigned, iterator, iterator, iterator);
+
+ parent_t node;
+ uint16_t index;
+
+ iter_t(
+ parent_t parent,
+ uint16_t index) : node(parent), index(index) {}
+
+ public:
+ iter_t(const iter_t &) = default;
+ iter_t(iter_t &&) = default;
+ iter_t &operator=(const iter_t &) = default;
+ iter_t &operator=(iter_t &&) = default;
+
+ operator iter_t<!is_const>() const {
+ static_assert(!is_const);
+ return iter_t<!is_const>(node, index);
+ }
+
+ iter_t &operator*() { return *this; }
+ iter_t *operator->() { return this; }
+
+ iter_t operator++(int) {
+ auto ret = *this;
+ ++index;
+ return ret;
+ }
+
+ iter_t &operator++() {
+ ++index;
+ return *this;
+ }
+
+ uint16_t operator-(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index - rhs.index;
+ }
+
+ iter_t operator+(uint16_t off) const {
+ return iter_t(
+ node,
+ index + off);
+ }
+ iter_t operator-(uint16_t off) const {
+ return iter_t(
+ node,
+ index - off);
+ }
+
+ uint16_t operator<(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index < rhs.index;
+ }
+
+ uint16_t operator>(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index > rhs.index;
+ }
+
+ bool operator==(const iter_t &rhs) const {
+ assert(node == rhs.node);
+ return rhs.index == index;
+ }
+
+ bool operator!=(const iter_t &rhs) const {
+ assert(node == rhs.node);
+ return index != rhs.index;
+ }
+
+ private:
+ omap_leaf_key_t get_node_key() const {
+ omap_leaf_key_le_t kint = node->get_node_key_ptr()[index];
+ return omap_leaf_key_t(kint);
+ }
+ auto get_node_key_ptr() const {
+ return reinterpret_cast<
+ typename crimson::common::maybe_const_t<char, is_const>::type>(
+ node->get_node_key_ptr() + index);
+ }
+
+ uint32_t get_node_val_offset() const {
+ return get_node_key().key_off;
+ }
+ auto get_node_val_ptr() const {
+ auto tail = node->buf + OMAP_LEAF_BLOCK_SIZE;
+ if (*this == node->iter_end())
+ return tail;
+ else {
+ return tail - get_node_val_offset();
+ }
+ }
+
+ int get_right_offset_end() const {
+ if (index == 0)
+ return 0;
+ else
+ return (*this - 1)->get_node_val_offset();
+ }
+ auto get_right_ptr_end() const {
+ return node->buf + OMAP_LEAF_BLOCK_SIZE - get_right_offset_end();
+ }
+
+ void update_offset(int offset) {
+ auto key = get_node_key();
+ assert(offset + key.key_off >= 0);
+ key.key_off += offset;
+ set_node_key(key);
+ }
+
+ void set_node_key(omap_leaf_key_t _lb) const {
+ static_assert(!is_const);
+ omap_leaf_key_le_t lb;
+ lb = _lb;
+ node->get_node_key_ptr()[index] = lb;
+ }
+
+ void set_node_val(const std::string &key, const ceph::bufferlist &val) {
+ static_assert(!is_const);
+ auto node_key = get_node_key();
+ assert(key.size() == node_key.key_len);
+ assert(val.length() == node_key.val_len);
+ ::memcpy(get_node_val_ptr(), key.data(), key.size());
+ auto bliter = val.begin();
+ bliter.copy(node_key.val_len, get_node_val_ptr() + node_key.key_len);
+ }
+
+ public:
+ uint16_t get_index() const {
+ return index;
+ }
+
+ std::string get_key() const {
+ return std::string(
+ get_node_val_ptr(),
+ get_node_key().key_len);
+ }
+
+ std::string get_str_val() const {
+ auto node_key = get_node_key();
+ return std::string(
+ get_node_val_ptr() + node_key.key_len,
+ get_node_key().val_len);
+ }
+
+ ceph::bufferlist get_val() const {
+ auto node_key = get_node_key();
+ ceph::bufferlist bl;
+ ceph::bufferptr bptr(
+ get_node_val_ptr() + node_key.key_len,
+ get_node_key().val_len);
+ bl.append(bptr);
+ return bl;
+ }
+ };
+ using const_iterator = iter_t<true>;
+ using iterator = iter_t<false>;
+
+public:
+ void journal_leaf_insert(
+ const_iterator _iter,
+ const std::string &key,
+ const ceph::bufferlist &val,
+ delta_leaf_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->insert(
+ key,
+ val);
+ }
+ leaf_insert(iter, key, val);
+ }
+
+ void journal_leaf_update(
+ const_iterator _iter,
+ const std::string &key,
+ const ceph::bufferlist &val,
+ delta_leaf_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ recorder->insert(key, val);
+ }
+ leaf_update(iter, key, val);
+ }
+
+ void journal_leaf_remove(
+ const_iterator _iter,
+ delta_leaf_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ }
+ leaf_remove(iter);
+ }
+
+ StringKVLeafNodeLayout(char *buf) :
+ buf(buf) {}
+
+ const_iterator iter_begin() const {
+ return const_iterator(
+ this,
+ 0);
+ }
+
+ const_iterator iter_end() const {
+ return const_iterator(
+ this,
+ get_size());
+ }
+
+ iterator iter_begin() {
+ return iterator(
+ this,
+ 0);
+ }
+
+ iterator iter_end() {
+ return iterator(
+ this,
+ get_size());
+ }
+
+ const_iterator iter_idx(uint16_t off) const {
+ return const_iterator(
+ this,
+ off);
+ }
+
+ const_iterator string_lower_bound(std::string_view str) const {
+ uint16_t start = 0, end = get_size();
+ while (start != end) {
+ unsigned mid = (start + end) / 2;
+ const_iterator iter(this, mid);
+ std::string s = iter->get_key();
+ if (s < str) {
+ start = ++mid;
+ } else if (s > str) {
+ end = mid;
+ } else {
+ return iter;
+ }
+ }
+ return const_iterator(this, start);
+ }
+
+ iterator string_lower_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_lower_bound(str).index);
+ }
+
+ const_iterator string_upper_bound(std::string_view str) const {
+ auto ret = iter_begin();
+ for (; ret != iter_end(); ++ret) {
+ std::string s = ret->get_key();
+ if (s > str)
+ break;
+ }
+ return ret;
+ }
+
+ iterator string_upper_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_upper_bound(str).index);
+ }
+
+ const_iterator find_string_key(std::string_view str) const {
+ auto ret = iter_begin();
+ for (; ret != iter_end(); ++ret) {
+ std::string s = ret->get_key();
+ if (s == str)
+ break;
+ }
+ return ret;
+ }
+ iterator find_string_key(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.find_string_key(str).index);
+ }
+
+ const_iterator get_split_pivot() const {
+ uint32_t total_size = omap_leaf_key_t(get_node_key_ptr()[get_size()-1]).key_off;
+ uint32_t pivot_size = total_size / 2;
+ uint32_t size = 0;
+ for (auto ite = iter_begin(); ite < iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len + node_key.val_len;
+ if (size >= pivot_size){
+ return ite;
+ }
+ }
+ return iter_end();
+ }
+
+ uint32_t get_size() const {
+ ceph_le32 &size = *layout.template Pointer<0>(buf);
+ return uint32_t(size);
+ }
+
+ /**
+ * set_size
+ *
+ * Set size representation to match size
+ */
+ void set_size(uint32_t size) {
+ ceph_le32 s;
+ s = size;
+ *layout.template Pointer<0>(buf) = s;
+ }
+
+ /**
+ * get_meta/set_meta
+ *
+ * Enables stashing a templated type within the layout.
+ * Cannot be modified after initial write as it is not represented
+ * in delta_t
+ */
+ omap_node_meta_t get_meta() const {
+ omap_node_meta_le_t &metaint = *layout.template Pointer<1>(buf);
+ return omap_node_meta_t(metaint);
+ }
+ void set_meta(const omap_node_meta_t &meta) {
+ *layout.template Pointer<1>(buf) = omap_node_meta_le_t(meta);
+ }
+
+ uint32_t used_space() const {
+ uint32_t count = get_size();
+ if (count) {
+ omap_leaf_key_t last_key = omap_leaf_key_t(get_node_key_ptr()[count-1]);
+ return last_key.key_off + count * sizeof(omap_leaf_key_le_t);
+ } else {
+ return 0;
+ }
+ }
+
+ uint32_t free_space() const {
+ return capacity() - used_space();
+ }
+
+ uint32_t capacity() const {
+ return OMAP_LEAF_BLOCK_SIZE
+ - (reinterpret_cast<char*>(layout.template Pointer<2>(buf))
+ - reinterpret_cast<char*>(layout.template Pointer<0>(buf)));
+ }
+
+ bool is_overflow(size_t ksize, size_t vsize) const {
+ return free_space() < (sizeof(omap_leaf_key_le_t) + ksize + vsize);
+ }
+
+ bool is_overflow(const StringKVLeafNodeLayout &rhs) const {
+ return free_space() < rhs.used_space();
+ }
+
+ bool below_min() const {
+ return free_space() > (capacity() / 2);
+ }
+
+ bool operator==(const StringKVLeafNodeLayout &rhs) const {
+ if (get_size() != rhs.get_size()) {
+ return false;
+ }
+
+ auto iter = iter_begin();
+ auto iter2 = rhs.iter_begin();
+ while (iter != iter_end()) {
+ if(iter->get_key() != iter2->get_key() ||
+ iter->get_val() != iter2->get_val()) {
+ return false;
+ }
+ iter++;
+ iter2++;
+ }
+ return true;
+ }
+
+ /**
+ * split_into
+ *
+ * Takes *this and splits its contents into left and right.
+ */
+ std::string split_into(
+ StringKVLeafNodeLayout &left,
+ StringKVLeafNodeLayout &right) const {
+ auto piviter = get_split_pivot();
+ assert (piviter != iter_end());
+
+ copy_from_foreign(left.iter_begin(), iter_begin(), piviter);
+ left.set_size(piviter - iter_begin());
+
+ copy_from_foreign(right.iter_begin(), piviter, iter_end());
+ right.set_size(iter_end() - piviter);
+
+ auto [lmeta, rmeta] = get_meta().split_into();
+ left.set_meta(lmeta);
+ right.set_meta(rmeta);
+
+ return piviter->get_key();
+ }
+
+ /**
+ * merge_from
+ *
+ * Takes two nodes and copies their contents into *this.
+ *
+ * precondition: left.size() + right.size() < CAPACITY
+ */
+ void merge_from(
+ const StringKVLeafNodeLayout &left,
+ const StringKVLeafNodeLayout &right)
+ {
+ copy_from_foreign(
+ iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ set_size(left.get_size());
+ copy_from_foreign(
+ iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ set_size(left.get_size() + right.get_size());
+ set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
+ }
+
+ /**
+ * balance_into_new_nodes
+ *
+ * Takes the contents of left and right and copies them into
+ * replacement_left and replacement_right such that
+ * the size of replacement_left is just >= 1/2 of the total size (left + right).
+ */
+ static std::string balance_into_new_nodes(
+ const StringKVLeafNodeLayout &left,
+ const StringKVLeafNodeLayout &right,
+ StringKVLeafNodeLayout &replacement_left,
+ StringKVLeafNodeLayout &replacement_right)
+ {
+ uint32_t left_size = omap_leaf_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
+ uint32_t right_size = omap_leaf_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
+ uint32_t total = left_size + right_size;
+ uint32_t pivot_size = total / 2;
+ uint32_t pivot_idx = 0;
+ if (pivot_size < left_size) {
+ uint32_t size = 0;
+ for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len + node_key.val_len;
+ if (size >= pivot_size){
+ pivot_idx = ite.get_index();
+ break;
+ }
+ }
+ } else {
+ uint32_t more_size = pivot_size - left_size;
+ uint32_t size = 0;
+ for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len + node_key.val_len;
+ if (size >= more_size){
+ pivot_idx = ite.get_index() + left.get_size();
+ break;
+ }
+ }
+ }
+
+ auto replacement_pivot = pivot_idx >= left.get_size() ?
+ right.iter_idx(pivot_idx - left.get_size())->get_key() :
+ left.iter_idx(pivot_idx)->get_key();
+
+ if (pivot_size < left_size) {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_idx(pivot_idx));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ left.iter_idx(pivot_idx),
+ left.iter_end());
+ replacement_right.set_size(left.get_size() - pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+ } else {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ replacement_left.set_size(left.get_size());
+
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ right.iter_begin(),
+ right.iter_idx(pivot_idx - left.get_size()));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_idx(pivot_idx - left.get_size()),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+ }
+
+ auto [lmeta, rmeta] = omap_node_meta_t::rebalance(
+ left.get_meta(), right.get_meta());
+ replacement_left.set_meta(lmeta);
+ replacement_right.set_meta(rmeta);
+ return replacement_pivot;
+ }
+
+private:
+ void leaf_insert(
+ iterator iter,
+ const std::string &key,
+ const bufferlist &val) {
+ if (iter != iter_begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if (iter != iter_end()) {
+ assert(iter->get_key() > key);
+ }
+ assert(!is_overflow(key.size(), val.length()));
+ omap_leaf_key_t node_key;
+ if (iter == iter_begin()) {
+ node_key.key_off = key.size() + val.length();
+ node_key.key_len = key.size();
+ node_key.val_len = val.length();
+ } else {
+ node_key.key_off = (iter - 1)->get_node_key().key_off +
+ (key.size() + val.length());
+ node_key.key_len = key.size();
+ node_key.val_len = val.length();
+ }
+ if (get_size() != 0 && iter != iter_end())
+ copy_from_local(node_key.key_len + node_key.val_len, iter + 1, iter, iter_end());
+
+ iter->set_node_key(node_key);
+ set_size(get_size() + 1);
+ iter->set_node_val(key, val);
+ }
+
+ void leaf_update(
+ iterator iter,
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ assert(iter != iter_end());
+ leaf_remove(iter);
+ assert(!is_overflow(key.size(), val.length()));
+ leaf_insert(iter, key, val);
+ }
+
+ void leaf_remove(iterator iter) {
+ assert(iter != iter_end());
+ if ((iter + 1) != iter_end()) {
+ omap_leaf_key_t key = iter->get_node_key();
+ copy_from_local(key.key_len + key.val_len, iter, iter + 1, iter_end());
+ }
+ set_size(get_size() - 1);
+ }
+
+ /**
+ * get_key_ptr
+ *
+ * Get pointer to start of key array
+ */
+ omap_leaf_key_le_t *get_node_key_ptr() {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+ const omap_leaf_key_le_t *get_node_key_ptr() const {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+
+};
+
+inline void delta_inner_t::replay(StringKVInnerNodeLayout &l) {
+ switch (op) {
+ case op_t::INSERT: {
+ l.inner_insert(l.string_lower_bound(key), key, addr);
+ break;
+ }
+ case op_t::UPDATE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.inner_update(iter, addr);
+ break;
+ }
+ case op_t::REMOVE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.inner_remove(iter);
+ break;
+ }
+ default:
+ assert(0 == "Impossible");
+ }
+}
+
+inline void delta_leaf_t::replay(StringKVLeafNodeLayout &l) {
+ switch (op) {
+ case op_t::INSERT: {
+ l.leaf_insert(l.string_lower_bound(key), key, val);
+ break;
+ }
+ case op_t::UPDATE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.leaf_update(iter, key, val);
+ break;
+ }
+ case op_t::REMOVE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.leaf_remove(iter);
+ break;
+ }
+ default:
+ assert(0 == "Impossible");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc
new file mode 100644
index 000000000..96b69fb7c
--- /dev/null
+++ b/src/crimson/os/seastore/onode.cc
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode.h"
+#include <iostream>
+
+namespace crimson::os::seastore {
+
+std::ostream& operator<<(std::ostream &out, const Onode &rhs)
+{
+ auto &layout = rhs.get_layout();
+ return out << "Onode("
+ << "size=" << static_cast<uint32_t>(layout.size)
+ << ")";
+}
+
+}
+
diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h
new file mode 100644
index 000000000..069daa3df
--- /dev/null
+++ b/src/crimson/os/seastore/onode.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/byteorder.h"
+#include "seastore_types.h"
+
+namespace crimson::os::seastore {
+
+struct onode_layout_t {
+ // The expected decode size of object_info_t without oid.
+ static constexpr int MAX_OI_LENGTH = 232;
+  // We might want to move the ss field out of onode_layout_t.
+  // The reason is that ss_attr may grow relatively large, since its
+  // clone_overlap can become big if applications create relatively
+  // large objects (to reduce the number of objects per OSD, so that
+  // all object metadata can be cached in memory) and make many
+  // modifications between snapshots.
+  // TODO: implement a flexible-sized onode value to store the inline
+  // ss_attr efficiently.
+ static constexpr int MAX_SS_LENGTH = 1;
+
+ ceph_le32 size{0};
+ ceph_le32 oi_size{0};
+ ceph_le32 ss_size{0};
+ omap_root_le_t omap_root;
+ omap_root_le_t xattr_root;
+
+ object_data_le_t object_data;
+
+ char oi[MAX_OI_LENGTH];
+ char ss[MAX_SS_LENGTH];
+} __attribute__((packed));
+
+class Transaction;
+
+/**
+ * Onode
+ *
+ * Interface manipulated by seastore. OnodeManager implementations should
+ * return objects derived from this interface with layout referencing
+ * internal representation of onode_layout_t.
+ */
+class Onode : public boost::intrusive_ref_counter<
+ Onode,
+ boost::thread_unsafe_counter>
+{
+protected:
+ virtual laddr_t get_hint() const = 0;
+ const uint32_t default_metadata_offset = 0;
+ const uint32_t default_metadata_range = 0;
+public:
+ Onode(uint32_t ddr, uint32_t dmr)
+ : default_metadata_offset(ddr),
+ default_metadata_range(dmr)
+ {}
+
+ virtual bool is_alive() const = 0;
+ virtual const onode_layout_t &get_layout() const = 0;
+ virtual onode_layout_t &get_mutable_layout(Transaction &t) = 0;
+ virtual ~Onode() = default;
+
+ laddr_t get_metadata_hint(uint64_t block_size) const {
+ assert(default_metadata_offset);
+ assert(default_metadata_range);
+ uint64_t range_blocks = default_metadata_range / block_size;
+ return get_hint() + default_metadata_offset +
+ (((uint32_t)std::rand() % range_blocks) * block_size);
+ }
+ laddr_t get_data_hint() const {
+ return get_hint();
+ }
+};
+
+
+std::ostream& operator<<(std::ostream &out, const Onode &rhs);
+using OnodeRef = boost::intrusive_ptr<Onode>;
+}
+
+#if FMT_VERSION >= 90000
+template<> struct fmt::formatter<crimson::os::seastore::Onode> : fmt::ostream_formatter {};
+#endif
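get_metadata_hint() above spreads per-onode metadata across a reserved window by adding a fixed offset plus a random block-aligned slot to the base hint. A standalone sketch of that arithmetic using plain integers instead of laddr_t (illustrative only, not the seastore API):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// pick a block-aligned slot inside [metadata_offset, metadata_offset + metadata_range)
uint64_t metadata_hint(uint64_t base_hint, uint64_t metadata_offset,
                       uint64_t metadata_range, uint64_t block_size) {
  uint64_t range_blocks = metadata_range / block_size;  // number of candidate slots
  assert(range_blocks > 0);
  uint64_t slot = static_cast<uint64_t>(std::rand()) % range_blocks;
  return base_hint + metadata_offset + slot * block_size;
}

int main() {
  // with base 0x1000, offset 0x100000: the hint always lands at or past 0x101000
  return metadata_hint(0x1000, 0x100000, 0x10000, 0x1000) >= 0x101000 ? 0 : 1;
}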
diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h
new file mode 100644
index 000000000..123c9e4f8
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "common/hobject.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+class OnodeManager {
+ using base_iertr = TransactionManager::base_iertr;
+public:
+ using mkfs_iertr = base_iertr;
+ using mkfs_ret = mkfs_iertr::future<>;
+ virtual mkfs_ret mkfs(Transaction &t) = 0;
+
+ using contains_onode_iertr = base_iertr;
+ using contains_onode_ret = contains_onode_iertr::future<bool>;
+ virtual contains_onode_ret contains_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) = 0;
+
+ using get_onode_iertr = base_iertr::extend<
+ crimson::ct_error::enoent>;
+ using get_onode_ret = get_onode_iertr::future<
+ OnodeRef>;
+ virtual get_onode_ret get_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) = 0;
+
+ using get_or_create_onode_iertr = base_iertr::extend<
+ crimson::ct_error::value_too_large>;
+ using get_or_create_onode_ret = get_or_create_onode_iertr::future<
+ OnodeRef>;
+ virtual get_or_create_onode_ret get_or_create_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) = 0;
+
+ using get_or_create_onodes_iertr = base_iertr::extend<
+ crimson::ct_error::value_too_large>;
+ using get_or_create_onodes_ret = get_or_create_onodes_iertr::future<
+ std::vector<OnodeRef>>;
+ virtual get_or_create_onodes_ret get_or_create_onodes(
+ Transaction &trans,
+ const std::vector<ghobject_t> &hoids) = 0;
+
+ using write_dirty_iertr = base_iertr;
+ using write_dirty_ret = write_dirty_iertr::future<>;
+ virtual write_dirty_ret write_dirty(
+ Transaction &trans,
+ const std::vector<OnodeRef> &onodes) = 0;
+
+ using erase_onode_iertr = base_iertr;
+ using erase_onode_ret = erase_onode_iertr::future<>;
+ virtual erase_onode_ret erase_onode(
+ Transaction &trans,
+ OnodeRef &onode) = 0;
+
+ using list_onodes_iertr = base_iertr;
+ using list_onodes_bare_ret = std::tuple<std::vector<ghobject_t>, ghobject_t>;
+ using list_onodes_ret = list_onodes_iertr::future<list_onodes_bare_ret>;
+ virtual list_onodes_ret list_onodes(
+ Transaction &trans,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) = 0;
+
+ virtual ~OnodeManager() {}
+};
+using OnodeManagerRef = std::unique_ptr<OnodeManager>;
+
+}
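Each OnodeManager operation above pairs its future type with its own error set by extending a common base (for example, get_onode adds enoent and get_or_create_onode adds value_too_large). As a rough analogy only -- the crimson errorator is not std::variant -- here is a standalone sketch of the idea that each operation advertises a distinct, closed set of outcomes; all names below are hypothetical stand-ins:

#include <string>
#include <variant>

// hypothetical stand-ins for the error types named above
struct input_output_error {};
struct enoent {};
struct value_too_large {};

// get_onode-style result: a value, or one of {I/O error, ENOENT}
using get_onode_result = std::variant<std::string, input_output_error, enoent>;
// get_or_create-style result: a value, or one of {I/O error, value too large}
using get_or_create_result = std::variant<std::string, input_output_error, value_too_large>;

get_onode_result get_onode(bool found) {
  if (!found) {
    return enoent{};           // lookup misses surface as ENOENT
  }
  return std::string{"onode"};
}

int main() {
  auto r = get_onode(false);
  return std::holds_alternative<enoent>(r) ? 0 : 1;
}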
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc
new file mode 100644
index 000000000..bff27ab65
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/logging.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h"
+
+SET_SUBSYS(seastore_onode);
+
+namespace crimson::os::seastore::onode {
+
+FLTreeOnodeManager::contains_onode_ret FLTreeOnodeManager::contains_onode(
+ Transaction &trans,
+ const ghobject_t &hoid)
+{
+ return tree.contains(trans, hoid);
+}
+
+FLTreeOnodeManager::get_onode_ret FLTreeOnodeManager::get_onode(
+ Transaction &trans,
+ const ghobject_t &hoid)
+{
+ LOG_PREFIX(FLTreeOnodeManager::get_onode);
+ return tree.find(
+ trans, hoid
+ ).si_then([this, &hoid, &trans, FNAME](auto cursor)
+ -> get_onode_ret {
+ if (cursor == tree.end()) {
+ DEBUGT("no entry for {}", trans, hoid);
+ return crimson::ct_error::enoent::make();
+ }
+ auto val = OnodeRef(new FLTreeOnode(
+ default_data_reservation,
+ default_metadata_range,
+ cursor.value()));
+ return get_onode_iertr::make_ready_future<OnodeRef>(
+ val
+ );
+ });
+}
+
+FLTreeOnodeManager::get_or_create_onode_ret
+FLTreeOnodeManager::get_or_create_onode(
+ Transaction &trans,
+ const ghobject_t &hoid)
+{
+ LOG_PREFIX(FLTreeOnodeManager::get_or_create_onode);
+ return tree.insert(
+ trans, hoid,
+ OnodeTree::tree_value_config_t{sizeof(onode_layout_t)}
+ ).si_then([this, &trans, &hoid, FNAME](auto p)
+ -> get_or_create_onode_ret {
+ auto [cursor, created] = std::move(p);
+ auto val = OnodeRef(new FLTreeOnode(
+ default_data_reservation,
+ default_metadata_range,
+ cursor.value()));
+ if (created) {
+ DEBUGT("created onode for entry for {}", trans, hoid);
+ val->get_mutable_layout(trans) = onode_layout_t{};
+ }
+ return get_or_create_onode_iertr::make_ready_future<OnodeRef>(
+ val
+ );
+ });
+}
+
+FLTreeOnodeManager::get_or_create_onodes_ret
+FLTreeOnodeManager::get_or_create_onodes(
+ Transaction &trans,
+ const std::vector<ghobject_t> &hoids)
+{
+ return seastar::do_with(
+ std::vector<OnodeRef>(),
+ [this, &hoids, &trans](auto &ret) {
+ ret.reserve(hoids.size());
+ return trans_intr::do_for_each(
+ hoids,
+ [this, &trans, &ret](auto &hoid) {
+ return get_or_create_onode(trans, hoid
+ ).si_then([&ret](auto &&onoderef) {
+ ret.push_back(std::move(onoderef));
+ });
+ }).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+}
+
+FLTreeOnodeManager::write_dirty_ret FLTreeOnodeManager::write_dirty(
+ Transaction &trans,
+ const std::vector<OnodeRef> &onodes)
+{
+ return trans_intr::do_for_each(
+ onodes,
+ [&trans](auto &onode) -> eagain_ifuture<> {
+ if (!onode) {
+ return eagain_iertr::make_ready_future<>();
+ }
+ auto &flonode = static_cast<FLTreeOnode&>(*onode);
+ if (!flonode.is_alive()) {
+ return eagain_iertr::make_ready_future<>();
+ }
+ switch (flonode.status) {
+ case FLTreeOnode::status_t::MUTATED: {
+ flonode.populate_recorder(trans);
+ return eagain_iertr::make_ready_future<>();
+ }
+ case FLTreeOnode::status_t::STABLE: {
+ return eagain_iertr::make_ready_future<>();
+ }
+ default:
+ __builtin_unreachable();
+ }
+ });
+}
+
+FLTreeOnodeManager::erase_onode_ret FLTreeOnodeManager::erase_onode(
+ Transaction &trans,
+ OnodeRef &onode)
+{
+ auto &flonode = static_cast<FLTreeOnode&>(*onode);
+ assert(flonode.is_alive());
+ if (flonode.status == FLTreeOnode::status_t::MUTATED) {
+ flonode.populate_recorder(trans);
+ }
+ flonode.mark_delete();
+ return tree.erase(trans, flonode);
+}
+
+FLTreeOnodeManager::list_onodes_ret FLTreeOnodeManager::list_onodes(
+ Transaction &trans,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit)
+{
+ return tree.lower_bound(trans, start
+ ).si_then([this, &trans, end, limit] (auto&& cursor) {
+ using crimson::os::seastore::onode::full_key_t;
+ return seastar::do_with(
+ limit,
+ std::move(cursor),
+ list_onodes_bare_ret(),
+ [this, &trans, end] (auto& to_list, auto& current_cursor, auto& ret) {
+ return trans_intr::repeat(
+ [this, &trans, end, &to_list, &current_cursor, &ret] ()
+ -> eagain_ifuture<seastar::stop_iteration> {
+ if (current_cursor.is_end()) {
+ std::get<1>(ret) = ghobject_t::get_max();
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ } else if (current_cursor.get_ghobj() >= end) {
+ std::get<1>(ret) = end;
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ if (to_list == 0) {
+ std::get<1>(ret) = current_cursor.get_ghobj();
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ std::get<0>(ret).emplace_back(current_cursor.get_ghobj());
+ return tree.get_next(trans, current_cursor
+ ).si_then([&to_list, &current_cursor] (auto&& next_cursor) mutable {
+ // we intentionally hold the current_cursor during get_next() to
+ // accelerate tree lookup.
+ --to_list;
+ current_cursor = next_cursor;
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ }).si_then([&ret] () mutable {
+ return seastar::make_ready_future<list_onodes_bare_ret>(
+ std::move(ret));
+ // return ret;
+ });
+ });
+ });
+}
+
+FLTreeOnodeManager::~FLTreeOnodeManager() {}
+
+}
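list_onodes() above performs a bounded scan: it starts at lower_bound(start), stops at the tree end, at the end key, or after `limit` entries, and returns both the listed keys and the key to resume from. A synchronous, standalone sketch of the same control flow over a std::map (illustrative only, not the seastore tree API; "<max>" stands in for ghobject_t::get_max()):

#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

std::pair<std::vector<std::string>, std::string>
list_range(const std::map<std::string, int>& tree,
           const std::string& start, const std::string& end, uint64_t limit) {
  std::vector<std::string> listed;
  auto it = tree.lower_bound(start);
  while (true) {
    if (it == tree.end()) return {std::move(listed), "<max>"};    // scanned everything
    if (it->first >= end) return {std::move(listed), end};        // reached the end key
    if (limit == 0) return {std::move(listed), it->first};        // resume token
    listed.push_back(it->first);
    ++it;
    --limit;
  }
}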
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h
new file mode 100644
index 000000000..09998fbfa
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/value.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree.h"
+
+namespace crimson::os::seastore::onode {
+
+struct FLTreeOnode final : Onode, Value {
+ static constexpr tree_conf_t TREE_CONF = {
+ value_magic_t::ONODE,
+ 256, // max_ns_size
+    // same as the option osd_max_object_namespace_len
+    2048, // max_oid_size
+    // same as the option osd_max_object_name_len
+ 1200, // max_value_payload_size
+ // see crimson::os::seastore::onode_layout_t
+ 8192, // internal_node_size
+ // see the formula in validate_tree_config
+ 16384 // leaf_node_size
+ // see the formula in validate_tree_config
+ };
+
+ enum class status_t {
+ STABLE,
+ MUTATED,
+ DELETED
+ } status = status_t::STABLE;
+
+ FLTreeOnode(FLTreeOnode&&) = default;
+ FLTreeOnode& operator=(FLTreeOnode&&) = delete;
+
+ FLTreeOnode(const FLTreeOnode&) = default;
+ FLTreeOnode& operator=(const FLTreeOnode&) = delete;
+
+ template <typename... T>
+ FLTreeOnode(uint32_t ddr, uint32_t dmr, T&&... args)
+ : Onode(ddr, dmr),
+ Value(std::forward<T>(args)...) {}
+
+ template <typename... T>
+ FLTreeOnode(T&&... args)
+ : Onode(0, 0),
+ Value(std::forward<T>(args)...) {}
+
+ struct Recorder : public ValueDeltaRecorder {
+ Recorder(bufferlist &bl) : ValueDeltaRecorder(bl) {}
+
+ value_magic_t get_header_magic() const final {
+ return TREE_CONF.value_magic;
+ }
+
+ void apply_value_delta(
+ ceph::bufferlist::const_iterator &bliter,
+ NodeExtentMutable &value,
+ laddr_t) final {
+ assert(value.get_length() == sizeof(onode_layout_t));
+ bliter.copy(value.get_length(), value.get_write());
+ }
+
+ void record_delta(NodeExtentMutable &value) {
+ // TODO: probably could use versioning, etc
+ assert(value.get_length() == sizeof(onode_layout_t));
+ ceph::buffer::ptr bptr(value.get_length());
+ memcpy(bptr.c_str(), value.get_read(), value.get_length());
+ get_encoded(value).append(bptr);
+ }
+ };
+
+ bool is_alive() const {
+ return status != status_t::DELETED;
+ }
+ const onode_layout_t &get_layout() const final {
+ assert(status != status_t::DELETED);
+ return *read_payload<onode_layout_t>();
+ }
+
+ onode_layout_t &get_mutable_layout(Transaction &t) final {
+ assert(status != status_t::DELETED);
+ auto p = prepare_mutate_payload<
+ onode_layout_t,
+ Recorder>(t);
+ status = status_t::MUTATED;
+ return *reinterpret_cast<onode_layout_t*>(p.first.get_write());
+  }
+
+ void populate_recorder(Transaction &t) {
+ assert(status == status_t::MUTATED);
+ auto p = prepare_mutate_payload<
+ onode_layout_t,
+ Recorder>(t);
+ if (p.second) {
+ p.second->record_delta(
+ p.first);
+ }
+ status = status_t::STABLE;
+ }
+
+ void mark_delete() {
+ assert(status != status_t::DELETED);
+ status = status_t::DELETED;
+ }
+
+ laddr_t get_hint() const final {
+ return Value::get_hint();
+ }
+ ~FLTreeOnode() final {}
+};
+
+using OnodeTree = Btree<FLTreeOnode>;
+
+using crimson::common::get_conf;
+
+class FLTreeOnodeManager : public crimson::os::seastore::OnodeManager {
+ OnodeTree tree;
+
+ uint32_t default_data_reservation = 0;
+ uint32_t default_metadata_offset = 0;
+ uint32_t default_metadata_range = 0;
+public:
+ FLTreeOnodeManager(TransactionManager &tm) :
+ tree(NodeExtentManager::create_seastore(tm)),
+ default_data_reservation(
+ get_conf<uint64_t>("seastore_default_max_object_size")),
+ default_metadata_offset(default_data_reservation),
+ default_metadata_range(
+ get_conf<uint64_t>("seastore_default_object_metadata_reservation"))
+ {}
+
+ mkfs_ret mkfs(Transaction &t) {
+ return tree.mkfs(t);
+ }
+
+ contains_onode_ret contains_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) final;
+
+ get_onode_ret get_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) final;
+
+ get_or_create_onode_ret get_or_create_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) final;
+
+ get_or_create_onodes_ret get_or_create_onodes(
+ Transaction &trans,
+ const std::vector<ghobject_t> &hoids) final;
+
+ write_dirty_ret write_dirty(
+ Transaction &trans,
+ const std::vector<OnodeRef> &onodes) final;
+
+ erase_onode_ret erase_onode(
+ Transaction &trans,
+ OnodeRef &onode) final;
+
+ list_onodes_ret list_onodes(
+ Transaction &trans,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) final;
+
+ ~FLTreeOnodeManager();
+};
+using FLTreeOnodeManagerRef = std::unique_ptr<FLTreeOnodeManager>;
+
+}
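FLTreeOnode::Recorder above records a delta as a full snapshot of the fixed-size onode_layout_t, and apply_value_delta() replays it by copying the snapshot back over the value. A standalone sketch of that record/replay round trip, using standard containers as stand-ins for ceph::bufferlist and NodeExtentMutable:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

struct layout_t { uint32_t size; uint32_t oi_size; };  // stand-in for onode_layout_t

// record: the delta is simply a full snapshot of the fixed-size layout
void record_delta(std::vector<char>& delta, const layout_t& value) {
  const char* p = reinterpret_cast<const char*>(&value);
  delta.assign(p, p + sizeof(layout_t));
}

// apply: replay copies the snapshot back over the value
void apply_delta(const std::vector<char>& delta, layout_t& value) {
  assert(delta.size() == sizeof(layout_t));
  std::memcpy(&value, delta.data(), sizeof(layout_t));
}

int main() {
  layout_t mutated{4096, 100};
  std::vector<char> delta;
  record_delta(delta, mutated);
  layout_t replayed{};
  apply_delta(delta, replayed);
  return replayed.size == 4096 ? 0 : 1;
}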
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
new file mode 100644
index 000000000..43f8b87ed
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore::onode {
+
+using eagain_iertr = trans_iertr<
+ crimson::errorator<crimson::ct_error::input_output_error> >;
+template <class ValueT=void>
+using eagain_ifuture = eagain_iertr::future<ValueT>;
+
+using crimson::os::seastore::Transaction;
+using crimson::os::seastore::TransactionRef;
+using crimson::os::seastore::laddr_t;
+using crimson::os::seastore::L_ADDR_MIN;
+using crimson::os::seastore::L_ADDR_NULL;
+using crimson::os::seastore::extent_len_t;
+
+class DeltaRecorder;
+class NodeExtent;
+class NodeExtentManager;
+class RootNodeTracker;
+struct ValueBuilder;
+using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>;
+using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>;
+using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>;
+using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>;
+struct context_t {
+ NodeExtentManager& nm;
+ const ValueBuilder& vb;
+ Transaction& t;
+};
+
+class LeafNodeImpl;
+class InternalNodeImpl;
+class NodeImpl;
+using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>;
+using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>;
+using NodeImplURef = std::unique_ptr<NodeImpl>;
+
+using level_t = uint8_t;
+constexpr auto MAX_LEVEL = std::numeric_limits<level_t>::max();
+
+// a type only to index within a node, 32 bits should be enough
+using index_t = uint32_t;
+constexpr auto INDEX_END = std::numeric_limits<index_t>::max();
+constexpr auto INDEX_LAST = INDEX_END - 0x4;
+constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8;
+inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; }
+
+// we support up to 64 KiB tree nodes
+using node_offset_t = uint16_t;
+constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12;
+constexpr auto MAX_NODE_SIZE =
+ (extent_len_t)std::numeric_limits<node_offset_t>::max() + 1;
+inline bool is_valid_node_size(extent_len_t node_size) {
+ return (node_size > 0 &&
+ node_size <= MAX_NODE_SIZE &&
+ node_size % DISK_BLOCK_SIZE == 0);
+}
+
+using string_size_t = uint16_t;
+
+enum class MatchKindBS : int8_t { NE = -1, EQ = 0 };
+
+enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT };
+inline MatchKindCMP toMatchKindCMP(int value) {
+ if (value > 0) {
+ return MatchKindCMP::GT;
+ } else if (value < 0) {
+ return MatchKindCMP::LT;
+ } else {
+ return MatchKindCMP::EQ;
+ }
+}
+template <typename Type>
+MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) {
+ if (l > r) {
+ return MatchKindCMP::GT;
+ } else if (l < r) {
+ return MatchKindCMP::LT;
+ } else {
+ return MatchKindCMP::EQ;
+ }
+}
+
+inline MatchKindCMP toMatchKindCMP(
+ std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l.compare(r));
+}
+
+inline MatchKindCMP reverse(MatchKindCMP cmp) {
+ if (cmp == MatchKindCMP::LT) {
+ return MatchKindCMP::GT;
+ } else if (cmp == MatchKindCMP::GT) {
+ return MatchKindCMP::LT;
+ } else {
+ return cmp;
+ }
+}
+
+struct tree_stats_t {
+ size_t size_persistent_leaf = 0;
+ size_t size_persistent_internal = 0;
+ size_t size_filled_leaf = 0;
+ size_t size_filled_internal = 0;
+ size_t size_logical_leaf = 0;
+ size_t size_logical_internal = 0;
+ size_t size_overhead_leaf = 0;
+ size_t size_overhead_internal = 0;
+ size_t size_value_leaf = 0;
+ size_t size_value_internal = 0;
+ unsigned num_kvs_leaf = 0;
+ unsigned num_kvs_internal = 0;
+ unsigned num_nodes_leaf = 0;
+ unsigned num_nodes_internal = 0;
+ unsigned height = 0;
+
+ size_t size_persistent() const {
+ return size_persistent_leaf + size_persistent_internal; }
+ size_t size_filled() const {
+ return size_filled_leaf + size_filled_internal; }
+ size_t size_logical() const {
+ return size_logical_leaf + size_logical_internal; }
+ size_t size_overhead() const {
+ return size_overhead_leaf + size_overhead_internal; }
+ size_t size_value() const {
+ return size_value_leaf + size_value_internal; }
+ unsigned num_kvs() const {
+ return num_kvs_leaf + num_kvs_internal; }
+ unsigned num_nodes() const {
+ return num_nodes_leaf + num_nodes_internal; }
+
+ double ratio_fullness() const {
+ return (double)size_filled() / size_persistent(); }
+ double ratio_key_compression() const {
+ return (double)(size_filled() - size_value()) / (size_logical() - size_value()); }
+ double ratio_overhead() const {
+ return (double)size_overhead() / size_filled(); }
+ double ratio_keys_leaf() const {
+ return (double)num_kvs_leaf / num_kvs(); }
+ double ratio_nodes_leaf() const {
+ return (double)num_nodes_leaf / num_nodes(); }
+ double ratio_filled_leaf() const {
+ return (double)size_filled_leaf / size_filled(); }
+};
+inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) {
+ os << "Tree stats:"
+ << "\n height = " << stats.height
+ << "\n num values = " << stats.num_kvs_leaf
+ << "\n num nodes = " << stats.num_nodes()
+ << " (leaf=" << stats.num_nodes_leaf
+ << ", internal=" << stats.num_nodes_internal << ")"
+ << "\n size persistent = " << stats.size_persistent() << "B"
+ << "\n size filled = " << stats.size_filled() << "B"
+ << " (value=" << stats.size_value_leaf << "B"
+ << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)"
+ << "\n size logical = " << stats.size_logical() << "B"
+ << "\n size overhead = " << stats.size_overhead() << "B"
+ << "\n ratio fullness = " << stats.ratio_fullness()
+ << "\n ratio keys leaf = " << stats.ratio_keys_leaf()
+ << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf()
+ << "\n ratio filled leaf = " << stats.ratio_filled_leaf()
+ << "\n ratio key compression = " << stats.ratio_key_compression();
+ assert(stats.num_kvs_internal + 1 == stats.num_nodes());
+ return os;
+}
+
+template <typename PtrType>
+void reset_ptr(PtrType& ptr, const char* origin_base,
+ const char* new_base, extent_len_t node_size) {
+ assert((const char*)ptr > origin_base);
+ assert((const char*)ptr - origin_base < (int)node_size);
+ ptr = reinterpret_cast<PtrType>(
+ (const char*)ptr - origin_base + new_base);
+}
+
+}
+
+#if FMT_VERSION >= 90000
+template<>
+struct fmt::formatter<crimson::os::seastore::onode::tree_stats_t> : fmt::ostream_formatter {};
+#endif
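reset_ptr() above rebases a pointer into a node extent after the extent has been copied: the byte offset relative to the old base is preserved against the new base (this is how the cursor cache fixes up its cached pointers when the underlying extent is duplicated). A minimal standalone illustration of that pointer rebasing:

#include <cassert>
#include <cstring>

int main() {
  char old_buf[64] = {};
  char new_buf[64] = {};
  std::memcpy(new_buf, old_buf, sizeof(old_buf));   // the extent was copied elsewhere
  const char* p = old_buf + 16;                     // pointer into the old extent
  // rebase: keep the same offset, switch to the new base
  p = new_buf + (p - old_buf);
  assert(p == new_buf + 16);
  return 0;
}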
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
new file mode 100644
index 000000000..6f08f4d3c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
@@ -0,0 +1,2282 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node.h"
+
+#include <cassert>
+#include <exception>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/utility.h"
+#include "crimson/os/seastore/logging.h"
+
+#include "node_extent_manager.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+SET_SUBSYS(seastore_onode);
+
+namespace fmt {
+template <typename T>
+const void* ptr(const ::boost::intrusive_ptr<T>& p) {
+ return p.get();
+}
+}
+
+namespace crimson::os::seastore::onode {
+/*
+ * tree_cursor_t
+ */
+
+// create from insert
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos)
+ : ref_leaf_node{node}, position{pos}, cache{ref_leaf_node}
+{
+ assert(is_tracked());
+ ref_leaf_node->do_track_cursor<true>(*this);
+ // do not account updates for the inserted values
+ is_mutated = true;
+}
+
+// create from lookup
+tree_cursor_t::tree_cursor_t(
+ Ref<LeafNode> node, const search_position_t& pos,
+ const key_view_t& key_view, const value_header_t* p_value_header)
+ : ref_leaf_node{node}, position{pos}, cache{ref_leaf_node}
+{
+ assert(is_tracked());
+ update_cache_same_node(key_view, p_value_header);
+ ref_leaf_node->do_track_cursor<true>(*this);
+}
+
+// lookup reaches the end, contain leaf node for further insert
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node)
+ : ref_leaf_node{node}, position{search_position_t::end()}, cache{ref_leaf_node}
+{
+ assert(is_end());
+ assert(ref_leaf_node->is_level_tail());
+}
+
+// untrack from the leaf node when the cursor is destructed
+tree_cursor_t::~tree_cursor_t()
+{
+ if (is_tracked()) {
+ ref_leaf_node->do_untrack_cursor(*this);
+ }
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+tree_cursor_t::get_next(context_t c)
+{
+ assert(is_tracked());
+ return ref_leaf_node->get_next_cursor(c, position);
+}
+
+void tree_cursor_t::assert_next_to(
+ const tree_cursor_t& prv, value_magic_t magic) const
+{
+#ifndef NDEBUG
+ assert(!prv.is_end());
+ if (is_end()) {
+ assert(ref_leaf_node == prv.ref_leaf_node);
+ assert(ref_leaf_node->is_level_tail());
+ } else if (is_tracked()) {
+ auto key = get_key_view(magic);
+ auto prv_key = prv.get_key_view(magic);
+ assert(key > prv_key);
+ if (ref_leaf_node == prv.ref_leaf_node) {
+ position.assert_next_to(prv.position);
+ } else {
+ assert(!prv.ref_leaf_node->is_level_tail());
+ assert(position == search_position_t::begin());
+ }
+ } else {
+ assert(is_invalid());
+ ceph_abort("impossible");
+ }
+#endif
+}
+
+template <bool FORCE_MERGE>
+eagain_ifuture<Ref<tree_cursor_t>>
+tree_cursor_t::erase(context_t c, bool get_next)
+{
+ assert(is_tracked());
+ return ref_leaf_node->erase<FORCE_MERGE>(c, position, get_next);
+}
+template eagain_ifuture<Ref<tree_cursor_t>>
+tree_cursor_t::erase<true>(context_t, bool);
+template eagain_ifuture<Ref<tree_cursor_t>>
+tree_cursor_t::erase<false>(context_t, bool);
+
+std::strong_ordering tree_cursor_t::compare_to(
+ const tree_cursor_t& o, value_magic_t magic) const
+{
+ if (!is_tracked() && !o.is_tracked()) {
+ return std::strong_ordering::equal;
+ } else if (!is_tracked()) {
+ return std::strong_ordering::greater;
+ } else if (!o.is_tracked()) {
+ return std::strong_ordering::less;
+ }
+
+ assert(is_tracked() && o.is_tracked());
+ // all tracked cursors are singletons
+ if (this == &o) {
+ return std::strong_ordering::equal;
+ }
+
+ std::strong_ordering ret = std::strong_ordering::equal;
+ if (ref_leaf_node == o.ref_leaf_node) {
+ ret = position <=> o.position;
+ } else {
+ auto key = get_key_view(magic);
+ auto o_key = o.get_key_view(magic);
+ ret = key <=> o_key;
+ }
+ assert(ret != 0);
+ return ret;
+}
+
+eagain_ifuture<>
+tree_cursor_t::extend_value(context_t c, value_size_t extend_size)
+{
+ assert(is_tracked());
+ return ref_leaf_node->extend_value(c, position, extend_size);
+}
+
+eagain_ifuture<>
+tree_cursor_t::trim_value(context_t c, value_size_t trim_size)
+{
+ assert(is_tracked());
+ return ref_leaf_node->trim_value(c, position, trim_size);
+}
+
+template <bool VALIDATE>
+void tree_cursor_t::update_track(
+ Ref<LeafNode> node, const search_position_t& pos)
+{
+ // I must be already untracked
+ assert(is_tracked());
+ assert(!ref_leaf_node->check_is_tracking(*this));
+ // track the new node and new pos
+ assert(!pos.is_end());
+ ref_leaf_node = node;
+ position = pos;
+ // we lazy update the key/value information until user asked
+ cache.invalidate();
+ ref_leaf_node->do_track_cursor<VALIDATE>(*this);
+}
+template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&);
+template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&);
+
+void tree_cursor_t::update_cache_same_node(const key_view_t& key_view,
+ const value_header_t* p_value_header) const
+{
+ assert(is_tracked());
+ cache.update_all(ref_leaf_node->get_version(), key_view, p_value_header);
+ cache.validate_is_latest(position);
+}
+
+void tree_cursor_t::invalidate()
+{
+ assert(is_tracked());
+ ref_leaf_node.reset();
+ assert(is_invalid());
+ // I must be removed from LeafNode
+}
+
+/*
+ * tree_cursor_t::Cache
+ */
+
+tree_cursor_t::Cache::Cache(Ref<LeafNode>& ref_leaf_node)
+ : ref_leaf_node{ref_leaf_node} {}
+
+void tree_cursor_t::Cache::update_all(const node_version_t& current_version,
+ const key_view_t& _key_view,
+ const value_header_t* _p_value_header)
+{
+ assert(_p_value_header);
+
+ needs_update_all = false;
+ version = current_version;
+
+ p_node_base = ref_leaf_node->read();
+ key_view = _key_view;
+ p_value_header = _p_value_header;
+ assert((const char*)p_value_header > p_node_base);
+ assert((const char*)p_value_header - p_node_base <
+ (int)ref_leaf_node->get_node_size());
+
+ value_payload_mut.reset();
+ p_value_recorder = nullptr;
+}
+
+void tree_cursor_t::Cache::maybe_duplicate(const node_version_t& current_version)
+{
+ assert(!needs_update_all);
+ assert(version.layout == current_version.layout);
+ if (version.state == current_version.state) {
+ // cache is already latest.
+ } else if (version.state < current_version.state) {
+ // the extent has been copied but the layout has not been changed.
+ assert(p_node_base != nullptr);
+ assert(key_view.has_value());
+ assert(p_value_header != nullptr);
+
+ auto current_p_node_base = ref_leaf_node->read();
+ assert(current_p_node_base != p_node_base);
+ auto node_size = ref_leaf_node->get_node_size();
+
+ version.state = current_version.state;
+ reset_ptr(p_value_header, p_node_base,
+ current_p_node_base, node_size);
+ key_view->reset_to(p_node_base, current_p_node_base, node_size);
+ value_payload_mut.reset();
+ p_value_recorder = nullptr;
+
+ p_node_base = current_p_node_base;
+ } else {
+ // It is impossible to change state backwards, see node_types.h.
+ ceph_abort("impossible");
+ }
+}
+
+void tree_cursor_t::Cache::make_latest(
+ value_magic_t magic, const search_position_t& pos)
+{
+ auto current_version = ref_leaf_node->get_version();
+ if (needs_update_all || version.layout != current_version.layout) {
+ auto [_key_view, _p_value_header] = ref_leaf_node->get_kv(pos);
+ update_all(current_version, _key_view, _p_value_header);
+ } else {
+ maybe_duplicate(current_version);
+ }
+ assert(p_value_header->magic == magic);
+ validate_is_latest(pos);
+}
+
+void tree_cursor_t::Cache::validate_is_latest(const search_position_t& pos) const
+{
+#ifndef NDEBUG
+ assert(!needs_update_all);
+ assert(version == ref_leaf_node->get_version());
+
+ auto [_key_view, _p_value_header] = ref_leaf_node->get_kv(pos);
+ assert(p_node_base == ref_leaf_node->read());
+  assert(key_view == _key_view);
+ assert(p_value_header == _p_value_header);
+#endif
+}
+
+std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+tree_cursor_t::Cache::prepare_mutate_value_payload(
+ context_t c, const search_position_t& pos)
+{
+ make_latest(c.vb.get_header_magic(), pos);
+ if (!value_payload_mut.has_value()) {
+ assert(!p_value_recorder);
+ auto value_mutable = ref_leaf_node->prepare_mutate_value_payload(c);
+ auto current_version = ref_leaf_node->get_version();
+ maybe_duplicate(current_version);
+ value_payload_mut = p_value_header->get_payload_mutable(value_mutable.first);
+ p_value_recorder = value_mutable.second;
+ validate_is_latest(pos);
+ }
+ return {*value_payload_mut, p_value_recorder};
+}
+
+/*
+ * Node
+ */
+
+Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {}
+
+Node::~Node()
+{
+ if (!is_tracked()) {
+ // possible scenarios:
+ // a. I'm erased;
+ // b. Eagain happened after the node extent is allocated/loaded
+ // and before the node is initialized correctly;
+ } else {
+ assert(!impl->is_extent_retired());
+ if (is_root()) {
+ super->do_untrack_root(*this);
+ } else {
+ _parent_info->ptr->do_untrack_child(*this);
+ }
+ }
+}
+
+level_t Node::level() const
+{
+ return impl->level();
+}
+
+eagain_ifuture<Node::search_result_t> Node::lower_bound(
+ context_t c, const key_hobj_t& key)
+{
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key](auto& history) {
+ return lower_bound_tracked(c, key, history);
+ }
+ );
+}
+
+eagain_ifuture<std::pair<Ref<tree_cursor_t>, bool>> Node::insert(
+ context_t c,
+ const key_hobj_t& key,
+ value_config_t vconf,
+ Ref<Node>&& this_ref)
+{
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key, vconf,
+ this_ref = std::move(this_ref)] (auto& history) mutable {
+ return lower_bound_tracked(c, key, history
+ ).si_then([c, &key, vconf, &history,
+ this_ref = std::move(this_ref)] (auto result) mutable {
+ // the cursor in the result should already hold the root node upwards
+ this_ref.reset();
+ if (result.match() == MatchKindBS::EQ) {
+ return eagain_iertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(result.p_cursor, false));
+ } else {
+ auto leaf_node = result.p_cursor->get_leaf_node();
+ return leaf_node->insert_value(
+ c, key, vconf, result.p_cursor->get_position(), history, result.mstat
+ ).si_then([](auto p_cursor) {
+ return seastar::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(p_cursor, true));
+ });
+ }
+ });
+ }
+ );
+}
+
+eagain_ifuture<std::size_t> Node::erase(
+ context_t c,
+ const key_hobj_t& key,
+ Ref<Node>&& this_ref)
+{
+ return lower_bound(c, key
+ ).si_then([c, this_ref = std::move(this_ref)] (auto result) mutable {
+ // the cursor in the result should already hold the root node upwards
+ this_ref.reset();
+ if (result.match() != MatchKindBS::EQ) {
+ return eagain_iertr::make_ready_future<std::size_t>(0);
+ }
+ auto ref_cursor = result.p_cursor;
+ return ref_cursor->erase(c, false
+ ).si_then([ref_cursor] (auto next_cursor) {
+ assert(ref_cursor->is_invalid());
+ assert(!next_cursor);
+ return std::size_t(1);
+ });
+ });
+}
+
+eagain_ifuture<tree_stats_t> Node::get_tree_stats(context_t c)
+{
+ return seastar::do_with(
+ tree_stats_t(), [this, c](auto& stats) {
+ return do_get_tree_stats(c, stats).si_then([&stats] {
+ return stats;
+ });
+ }
+ );
+}
+
+std::ostream& Node::dump(std::ostream& os) const
+{
+ return impl->dump(os);
+}
+
+std::ostream& Node::dump_brief(std::ostream& os) const
+{
+ return impl->dump_brief(os);
+}
+
+const std::string& Node::get_name() const
+{
+ return impl->get_name();
+}
+
+void Node::test_make_destructable(
+ context_t c, NodeExtentMutable& mut, Super::URef&& _super)
+{
+ impl->test_set_tail(mut);
+ make_root(c, std::move(_super));
+}
+
+eagain_ifuture<> Node::mkfs(context_t c, RootNodeTracker& root_tracker)
+{
+ LOG_PREFIX(OTree::Node::mkfs);
+ return LeafNode::allocate_root(c, root_tracker
+ ).si_then([c, FNAME](auto ret) {
+ c.t.get_onode_tree_stats().extents_num_delta++;
+ INFOT("allocated root {}", c.t, ret->get_name());
+ });
+}
+
+eagain_ifuture<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker)
+{
+ LOG_PREFIX(OTree::Node::load_root);
+ return c.nm.get_super(c.t, root_tracker
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle([FNAME, c] {
+ ERRORT("EIO during get_super()", c.t);
+ ceph_abort("fatal error");
+ })
+ ).si_then([c, &root_tracker, FNAME](auto&& _super) {
+ assert(_super);
+ auto root_addr = _super->get_root_laddr();
+ assert(root_addr != L_ADDR_NULL);
+ TRACET("loading root_addr={:x} ...", c.t, root_addr);
+ return Node::load(c, root_addr, true
+ ).si_then([c, _super = std::move(_super),
+ &root_tracker, FNAME](auto root) mutable {
+ TRACET("loaded {}", c.t, root->get_name());
+ assert(root->impl->field_type() == field_type_t::N0);
+ root->as_root(std::move(_super));
+ std::ignore = c; // as only used in an assert
+ std::ignore = root_tracker;
+ assert(root == root_tracker.get_root(c.t));
+ return seastar::make_ready_future<Ref<Node>>(root);
+ });
+ });
+}
+
+void Node::make_root(context_t c, Super::URef&& _super)
+{
+ _super->write_root_laddr(c, impl->laddr());
+ as_root(std::move(_super));
+ c.t.get_onode_tree_stats().depth = static_cast<uint64_t>(level()) + 1;
+}
+
+void Node::as_root(Super::URef&& _super)
+{
+ assert(!is_tracked());
+ assert(_super->get_root_laddr() == impl->laddr());
+ assert(impl->is_level_tail());
+ super = std::move(_super);
+ super->do_track_root(*this);
+ assert(is_root());
+}
+
+Super::URef Node::deref_super()
+{
+ assert(is_root());
+ assert(super->get_root_laddr() == impl->laddr());
+ assert(impl->is_level_tail());
+ super->do_untrack_root(*this);
+ auto ret = std::move(super);
+ assert(!is_tracked());
+ return ret;
+}
+
+eagain_ifuture<> Node::upgrade_root(context_t c, laddr_t hint)
+{
+ LOG_PREFIX(OTree::Node::upgrade_root);
+ assert(impl->field_type() == field_type_t::N0);
+ auto super_to_move = deref_super();
+ return InternalNode::allocate_root(
+ c, hint, impl->level(), impl->laddr(), std::move(super_to_move)
+ ).si_then([this, c, FNAME](auto new_root) {
+ as_child(search_position_t::end(), new_root);
+ INFOT("upgraded from {} to {}",
+ c.t, get_name(), new_root->get_name());
+ });
+}
+
+template <bool VALIDATE>
+void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node)
+{
+ assert(!is_tracked() || !is_root());
+#ifndef NDEBUG
+ // Although I might have an outdated _parent_info during fixing,
+ // I must be already untracked.
+ if (_parent_info.has_value()) {
+ assert(!_parent_info->ptr->check_is_tracking(*this));
+ }
+#endif
+ _parent_info = parent_info_t{pos, parent_node};
+ parent_info().ptr->do_track_child<VALIDATE>(*this);
+ assert(!is_root());
+}
+template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>);
+template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>);
+
+Ref<InternalNode> Node::deref_parent()
+{
+ assert(!is_root());
+ auto parent_ref = std::move(parent_info().ptr);
+ parent_ref->do_untrack_child(*this);
+ _parent_info.reset();
+ assert(!is_tracked());
+ return parent_ref;
+}
+
+eagain_ifuture<> Node::apply_split_to_parent(
+ context_t c,
+ Ref<Node>&& this_ref,
+ Ref<Node>&& split_right,
+ bool update_right_index)
+{
+ assert(!is_root());
+ assert(this == this_ref.get());
+ // TODO(cross-node string dedup)
+ return parent_info().ptr->apply_child_split(
+ c, std::move(this_ref), std::move(split_right), update_right_index);
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+Node::get_next_cursor_from_parent(context_t c)
+{
+ assert(!impl->is_level_tail());
+ assert(!is_root());
+ return parent_info().ptr->get_next_cursor(c, parent_info().position);
+}
+
+template <bool FORCE_MERGE>
+eagain_ifuture<>
+Node::try_merge_adjacent(
+ context_t c, bool update_parent_index, Ref<Node>&& this_ref)
+{
+ LOG_PREFIX(OTree::Node::try_merge_adjacent);
+ assert(this == this_ref.get());
+ impl->validate_non_empty();
+ assert(!is_root());
+ if constexpr (!FORCE_MERGE) {
+ if (!impl->is_size_underflow() &&
+ !impl->has_single_value()) {
+ // skip merge
+ if (update_parent_index) {
+ return fix_parent_index(c, std::move(this_ref), false);
+ } else {
+ parent_info().ptr->validate_child_tracked(*this);
+ return eagain_iertr::now();
+ }
+ }
+ }
+
+ return parent_info().ptr->get_child_peers(c, parent_info().position
+ ).si_then([c, this_ref = std::move(this_ref), this, FNAME,
+ update_parent_index] (auto lr_nodes) mutable -> eagain_ifuture<> {
+ auto& [lnode, rnode] = lr_nodes;
+ Ref<Node> left_for_merge;
+ Ref<Node> right_for_merge;
+ Ref<Node>* p_this_ref;
+ bool is_left;
+ if (!lnode && !rnode) {
+ // XXX: this is possible before node rebalance is implemented,
+ // when its parent cannot merge with its peers and has only one child
+ // (this node).
+ p_this_ref = &this_ref;
+ } else if (!lnode) {
+ left_for_merge = std::move(this_ref);
+ p_this_ref = &left_for_merge;
+ right_for_merge = std::move(rnode);
+ is_left = true;
+ } else if (!rnode) {
+ left_for_merge = std::move(lnode);
+ right_for_merge = std::move(this_ref);
+ p_this_ref = &right_for_merge;
+ is_left = false;
+ } else { // lnode && rnode
+ if (lnode->impl->free_size() > rnode->impl->free_size()) {
+ left_for_merge = std::move(lnode);
+ right_for_merge = std::move(this_ref);
+ p_this_ref = &right_for_merge;
+ is_left = false;
+ } else { // lnode free size <= rnode free size
+ left_for_merge = std::move(this_ref);
+ p_this_ref = &left_for_merge;
+ right_for_merge = std::move(rnode);
+ is_left = true;
+ }
+ }
+
+ if (left_for_merge) {
+ assert(right_for_merge);
+ auto [merge_stage, merge_size] = left_for_merge->impl->evaluate_merge(
+ *right_for_merge->impl);
+ if (merge_size <= left_for_merge->impl->total_size()) {
+ // proceed merge
+ bool update_index_after_merge;
+ if (is_left) {
+ update_index_after_merge = false;
+ } else {
+ update_index_after_merge = update_parent_index;
+ }
+ DEBUGT("merge {} and {} at merge_stage={}, merge_size={}B, "
+ "update_index={}, is_left={} ...",
+ c.t, left_for_merge->get_name(), right_for_merge->get_name(),
+ merge_stage, merge_size, update_index_after_merge, is_left);
+          // we currently cannot generate a delta that depends on another
+          // extent's content, so use rebuild_extent() as a workaround to
+          // rebuild the node from a fresh extent, thus no delta is needed.
+ auto left_addr = left_for_merge->impl->laddr();
+ return left_for_merge->rebuild_extent(c
+ ).si_then([c, update_index_after_merge,
+ left_addr,
+ merge_stage = merge_stage,
+ merge_size = merge_size,
+ left_for_merge = std::move(left_for_merge),
+ right_for_merge = std::move(right_for_merge)] (auto left_mut) mutable {
+ if (left_for_merge->impl->node_type() == node_type_t::LEAF) {
+ auto& left = *static_cast<LeafNode*>(left_for_merge.get());
+ left.on_layout_change();
+ }
+ search_position_t left_last_pos = left_for_merge->impl->merge(
+ left_mut, *right_for_merge->impl, merge_stage, merge_size);
+ left_for_merge->track_merge(right_for_merge, merge_stage, left_last_pos);
+ --(c.t.get_onode_tree_stats().extents_num_delta);
+ return left_for_merge->parent_info().ptr->apply_children_merge(
+ c, std::move(left_for_merge), left_addr,
+ std::move(right_for_merge), update_index_after_merge);
+ });
+ } else {
+        // size would overflow if merged
+ }
+ }
+
+ // cannot merge
+ if (update_parent_index) {
+ return fix_parent_index(c, std::move(*p_this_ref), false);
+ } else {
+ parent_info().ptr->validate_child_tracked(*this);
+ return eagain_iertr::now();
+ }
+ // XXX: rebalance
+ });
+}
+template eagain_ifuture<> Node::try_merge_adjacent<true>(context_t, bool, Ref<Node>&&);
+template eagain_ifuture<> Node::try_merge_adjacent<false>(context_t, bool, Ref<Node>&&);
+
+eagain_ifuture<> Node::erase_node(context_t c, Ref<Node>&& this_ref)
+{
+ // To erase a node:
+ // 1. I'm supposed to have already untracked any children or cursors
+ // 2. unlink parent/super --ptr-> me
+ // 3. unlink me --ref-> parent/super
+ // 4. retire extent
+ // 5. destruct node
+ assert(this_ref.get() == this);
+ assert(!is_tracking());
+ assert(!is_root());
+ assert(this_ref->use_count() == 1);
+ return parent_info().ptr->erase_child(c, std::move(this_ref));
+}
+
+template <bool FORCE_MERGE>
+eagain_ifuture<> Node::fix_parent_index(
+ context_t c, Ref<Node>&& this_ref, bool check_downgrade)
+{
+ assert(!is_root());
+ assert(this == this_ref.get());
+ return parent_info().ptr->fix_index<FORCE_MERGE>(
+ c, std::move(this_ref), check_downgrade);
+}
+template eagain_ifuture<> Node::fix_parent_index<true>(context_t, Ref<Node>&&, bool);
+template eagain_ifuture<> Node::fix_parent_index<false>(context_t, Ref<Node>&&, bool);
+
+eagain_ifuture<Ref<Node>> Node::load(
+ context_t c, laddr_t addr, bool expect_is_level_tail)
+{
+ LOG_PREFIX(OTree::Node::load);
+ return c.nm.read_extent(c.t, addr
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle(
+ [FNAME, c, addr, expect_is_level_tail] {
+ ERRORT("EIO -- addr={:x}, is_level_tail={}",
+ c.t, addr, expect_is_level_tail);
+ ceph_abort("fatal error");
+ }),
+ crimson::ct_error::invarg::handle(
+ [FNAME, c, addr, expect_is_level_tail] {
+ ERRORT("EINVAL -- addr={:x}, is_level_tail={}",
+ c.t, addr, expect_is_level_tail);
+ ceph_abort("fatal error");
+ }),
+ crimson::ct_error::enoent::handle(
+ [FNAME, c, addr, expect_is_level_tail] {
+ ERRORT("ENOENT -- addr={:x}, is_level_tail={}",
+ c.t, addr, expect_is_level_tail);
+ ceph_abort("fatal error");
+ }),
+ crimson::ct_error::erange::handle(
+ [FNAME, c, addr, expect_is_level_tail] {
+ ERRORT("ERANGE -- addr={:x}, is_level_tail={}",
+ c.t, addr, expect_is_level_tail);
+ ceph_abort("fatal error");
+ })
+ ).si_then([FNAME, c, addr, expect_is_level_tail](auto extent)
+ -> eagain_ifuture<Ref<Node>> {
+ assert(extent);
+ auto header = extent->get_header();
+ auto field_type = header.get_field_type();
+ if (!field_type) {
+ ERRORT("load addr={:x}, is_level_tail={} error, "
+ "got invalid header -- {}",
+ c.t, addr, expect_is_level_tail, fmt::ptr(extent));
+ ceph_abort("fatal error");
+ }
+ if (header.get_is_level_tail() != expect_is_level_tail) {
+ ERRORT("load addr={:x}, is_level_tail={} error, "
+ "is_level_tail mismatch -- {}",
+ c.t, addr, expect_is_level_tail, fmt::ptr(extent));
+ ceph_abort("fatal error");
+ }
+
+ auto node_type = header.get_node_type();
+ if (node_type == node_type_t::LEAF) {
+ if (extent->get_length() != c.vb.get_leaf_node_size()) {
+ ERRORT("load addr={:x}, is_level_tail={} error, "
+ "leaf length mismatch -- {}",
+ c.t, addr, expect_is_level_tail, fmt::ptr(extent));
+ ceph_abort("fatal error");
+ }
+ auto impl = LeafNodeImpl::load(extent, *field_type);
+ auto *derived_ptr = impl.get();
+ return eagain_iertr::make_ready_future<Ref<Node>>(
+ new LeafNode(derived_ptr, std::move(impl)));
+ } else if (node_type == node_type_t::INTERNAL) {
+ if (extent->get_length() != c.vb.get_internal_node_size()) {
+ ERRORT("load addr={:x}, is_level_tail={} error, "
+ "internal length mismatch -- {}",
+ c.t, addr, expect_is_level_tail, fmt::ptr(extent));
+ ceph_abort("fatal error");
+ }
+ auto impl = InternalNodeImpl::load(extent, *field_type);
+ auto *derived_ptr = impl.get();
+ return eagain_iertr::make_ready_future<Ref<Node>>(
+ new InternalNode(derived_ptr, std::move(impl)));
+ } else {
+ ceph_abort("impossible path");
+ }
+ });
+}
+
+eagain_ifuture<NodeExtentMutable> Node::rebuild_extent(context_t c)
+{
+ LOG_PREFIX(OTree::Node::rebuild_extent);
+ DEBUGT("{} ...", c.t, get_name());
+ assert(!is_root());
+ // assume I'm already ref counted by caller
+
+ // note: laddr can be changed after rebuild, but we don't fix the parent
+ // mapping as it is part of the merge process.
+ return impl->rebuild_extent(c);
+}
+
+eagain_ifuture<> Node::retire(context_t c, Ref<Node>&& this_ref)
+{
+ LOG_PREFIX(OTree::Node::retire);
+ DEBUGT("{} ...", c.t, get_name());
+ assert(this_ref.get() == this);
+ assert(!is_tracking());
+ assert(!is_tracked());
+ assert(this_ref->use_count() == 1);
+
+ return impl->retire_extent(c
+ ).si_then([this_ref = std::move(this_ref)]{ /* deallocate node */});
+}
+
+void Node::make_tail(context_t c)
+{
+ LOG_PREFIX(OTree::Node::make_tail);
+ assert(!impl->is_level_tail());
+ assert(!impl->is_keys_empty());
+ DEBUGT("{} ...", c.t, get_name());
+ impl->prepare_mutate(c);
+ auto tail_pos = impl->make_tail();
+ if (impl->node_type() == node_type_t::INTERNAL) {
+ auto& node = *static_cast<InternalNode*>(this);
+ node.track_make_tail(tail_pos);
+ }
+}
+
+/*
+ * InternalNode
+ */
+
+InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+InternalNode::get_next_cursor(context_t c, const search_position_t& pos)
+{
+ impl->validate_non_empty();
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ return get_next_cursor_from_parent(c);
+ }
+
+ search_position_t next_pos = pos;
+ const laddr_packed_t* p_child_addr = nullptr;
+ impl->get_next_slot(next_pos, nullptr, &p_child_addr);
+ if (next_pos.is_end() && !impl->is_level_tail()) {
+ return get_next_cursor_from_parent(c);
+ } else {
+ if (next_pos.is_end()) {
+ p_child_addr = impl->get_tail_value();
+ }
+ assert(p_child_addr);
+ return get_or_track_child(c, next_pos, p_child_addr->value
+ ).si_then([c](auto child) {
+ return child->lookup_smallest(c);
+ });
+ }
+}
+
+eagain_ifuture<> InternalNode::apply_child_split(
+ context_t c, Ref<Node>&& left_child, Ref<Node>&& right_child,
+ bool update_right_index)
+{
+ LOG_PREFIX(OTree::InternalNode::apply_child_split);
+ auto& left_pos = left_child->parent_info().position;
+
+#ifndef NDEBUG
+ assert(left_child->parent_info().ptr.get() == this);
+ assert(!left_child->impl->is_level_tail());
+ if (left_pos.is_end()) {
+ assert(impl->is_level_tail());
+ assert(right_child->impl->is_level_tail());
+ assert(!update_right_index);
+ }
+
+ // right_child has not assigned parent yet
+ assert(!right_child->is_tracked());
+#endif
+
+ impl->prepare_mutate(c);
+
+ DEBUGT("apply {}'s child {} to split to {}, update_index={} ...",
+ c.t, get_name(), left_child->get_name(),
+ right_child->get_name(), update_right_index);
+
+ // update layout from left_pos => left_child_addr to right_child_addr
+ auto left_child_addr = left_child->impl->laddr();
+ auto right_child_addr = right_child->impl->laddr();
+ impl->replace_child_addr(left_pos, right_child_addr, left_child_addr);
+
+ // update track from left_pos => left_child to right_child
+ replace_track(right_child, left_child, update_right_index);
+
+ auto left_key = *left_child->impl->get_pivot_index();
+ Ref<Node> this_ref = this;
+ return insert_or_split(
+ c, left_pos, left_key, left_child,
+ (update_right_index ? right_child : nullptr)
+ ).si_then([this, c,
+ this_ref = std::move(this_ref)] (auto split_right) mutable {
+ if (split_right) {
+ // even if update_right_index could be true,
+ // we haven't fixed the right_child index of this node yet,
+ // so my parent index should be correct now.
+ return apply_split_to_parent(
+ c, std::move(this_ref), std::move(split_right), false);
+ } else {
+ return eagain_iertr::now();
+ }
+ }).si_then([c, update_right_index,
+ right_child = std::move(right_child)] () mutable {
+ if (update_right_index) {
+ // XXX: might not need to call validate_tracked_children() in fix_index()
+ return right_child->fix_parent_index(c, std::move(right_child), false);
+ } else {
+ // there is no need to call try_merge_adjacent() because
+ // the filled size of the inserted node or the split right node
+ // won't be reduced if update_right_index is false.
+ return eagain_iertr::now();
+ }
+ });
+}
+
+eagain_ifuture<> InternalNode::erase_child(context_t c, Ref<Node>&& child_ref)
+{
+ LOG_PREFIX(OTree::InternalNode::erase_child);
+ // this is a special version of recursive merge
+ impl->validate_non_empty();
+ assert(child_ref->use_count() == 1);
+ validate_child_tracked(*child_ref);
+
+ // fix the child's previous node as the new tail,
+ // and trigger prv_child_ref->try_merge_adjacent() at the end
+ bool fix_tail = (child_ref->parent_info().position.is_end() &&
+ !impl->is_keys_empty());
+ return eagain_iertr::now().si_then([c, this, fix_tail] {
+ if (fix_tail) {
+ search_position_t new_tail_pos;
+ const laddr_packed_t* new_tail_p_addr = nullptr;
+ impl->get_largest_slot(&new_tail_pos, nullptr, &new_tail_p_addr);
+ return get_or_track_child(c, new_tail_pos, new_tail_p_addr->value);
+ } else {
+ return eagain_iertr::make_ready_future<Ref<Node>>();
+ }
+ }).si_then([c, this, child_ref = std::move(child_ref), FNAME]
+ (auto&& new_tail_child) mutable {
+ auto child_pos = child_ref->parent_info().position;
+ if (new_tail_child) {
+ DEBUGT("erase {}'s child {} at pos({}), "
+ "and fix new child tail {} at pos({}) ...",
+ c.t, get_name(), child_ref->get_name(), child_pos,
+ new_tail_child->get_name(), new_tail_child->parent_info().position);
+ assert(!new_tail_child->impl->is_level_tail());
+ new_tail_child->make_tail(c);
+ assert(new_tail_child->impl->is_level_tail());
+ if (new_tail_child->impl->node_type() == node_type_t::LEAF) {
+ // no need to proceed merge because the filled size is not changed
+ new_tail_child.reset();
+ }
+ } else {
+ DEBUGT("erase {}'s child {} at pos({}) ...",
+ c.t, get_name(), child_ref->get_name(), child_pos);
+ }
+
+ Ref<Node> this_ref = child_ref->deref_parent();
+ assert(this_ref == this);
+ return child_ref->retire(c, std::move(child_ref)
+ ).si_then([c, this, child_pos, FNAME,
+ this_ref = std::move(this_ref)] () mutable {
+ if (impl->has_single_value()) {
+ // fast path without mutating the extent
+ DEBUGT("{} has one value left, erase ...", c.t, get_name());
+#ifndef NDEBUG
+ if (impl->is_level_tail()) {
+ assert(child_pos.is_end());
+ } else {
+ assert(child_pos == search_position_t::begin());
+ }
+#endif
+
+ if (is_root()) {
+ // Note: if merge/split works as expected, we should never encounter the
+ // situation when the internal root has <=1 children:
+ //
+ // A newly created internal root (see Node::upgrade_root()) will have 2
+ // children after split is finished.
+ //
+ // When merge happens, children will try to merge each other, and if the
+ // root detects there is only one child left, the root will be
+ // down-graded to the only child.
+ //
+ // In order to preserve the invariant, we need to make sure the new
+ // internal root also has at least 2 children.
+ ceph_abort("trying to erase the last item from the internal root node");
+ }
+
+ // track erase
+ assert(tracked_child_nodes.empty());
+
+ // no child should be referencing this node now, this_ref is the last one.
+ assert(this_ref->use_count() == 1);
+ return Node::erase_node(c, std::move(this_ref));
+ }
+
+ impl->prepare_mutate(c);
+ auto [erase_stage, next_or_last_pos] = impl->erase(child_pos);
+ if (child_pos.is_end()) {
+ // next_or_last_pos as last_pos
+ track_make_tail(next_or_last_pos);
+ } else {
+ // next_or_last_pos as next_pos
+ track_erase(child_pos, erase_stage);
+ }
+ validate_tracked_children();
+
+ if (is_root()) {
+ return try_downgrade_root(c, std::move(this_ref));
+ } else {
+ bool update_parent_index;
+ if (impl->is_level_tail()) {
+ update_parent_index = false;
+ } else {
+ // next_or_last_pos as next_pos
+ next_or_last_pos.is_end() ? update_parent_index = true
+ : update_parent_index = false;
+ }
+ return try_merge_adjacent(c, update_parent_index, std::move(this_ref));
+ }
+ }).si_then([c, new_tail_child = std::move(new_tail_child)] () mutable {
+ // finally, check if the new tail child needs to merge
+ if (new_tail_child && !new_tail_child->is_root()) {
+ assert(new_tail_child->impl->is_level_tail());
+ return new_tail_child->try_merge_adjacent(
+ c, false, std::move(new_tail_child));
+ } else {
+ return eagain_iertr::now();
+ }
+ });
+ });
+}
+
+template <bool FORCE_MERGE>
+eagain_ifuture<> InternalNode::fix_index(
+ context_t c, Ref<Node>&& child, bool check_downgrade)
+{
+ LOG_PREFIX(OTree::InternalNode::fix_index);
+ impl->validate_non_empty();
+
+ validate_child_inconsistent(*child);
+ auto& child_pos = child->parent_info().position;
+ Ref<Node> this_ref = child->deref_parent();
+ assert(this_ref == this);
+ validate_tracked_children();
+
+ impl->prepare_mutate(c);
+
+ key_view_t new_key = *child->impl->get_pivot_index();
+ DEBUGT("fix {}'s index of child {} at pos({}), new_key={} ...",
+ c.t, get_name(), child->get_name(), child_pos, new_key);
+
+ // erase the incorrect item
+ auto [erase_stage, next_pos] = impl->erase(child_pos);
+ track_erase(child_pos, erase_stage);
+ validate_tracked_children();
+
+ // find out whether there is a need to fix parent index recursively
+ bool update_parent_index;
+ if (impl->is_level_tail()) {
+ update_parent_index = false;
+ } else {
+ next_pos.is_end() ? update_parent_index = true
+ : update_parent_index = false;
+ }
+
+ return insert_or_split(c, next_pos, new_key, child
+ ).si_then([this, c, update_parent_index, check_downgrade,
+ this_ref = std::move(this_ref)] (auto split_right) mutable {
+ if (split_right) {
+ // after split, the parent index to the split_right will be incorrect
+ // if update_parent_index is true.
+ return apply_split_to_parent(
+ c, std::move(this_ref), std::move(split_right), update_parent_index);
+ } else {
+ // no split path
+ if (is_root()) {
+ if (check_downgrade) {
+ return try_downgrade_root(c, std::move(this_ref));
+ } else {
+ // no need to call try_downgrade_root() because the number of keys
+ // has not changed, and I must have at least 2 keys.
+ assert(!impl->is_keys_empty());
+ return eagain_iertr::now();
+ }
+ } else {
+ // for non-root, maybe need merge adjacent or fix parent,
+ // because the filled node size may be reduced.
+ return try_merge_adjacent<FORCE_MERGE>(
+ c, update_parent_index, std::move(this_ref));
+ }
+ }
+ });
+}
+
+template <bool FORCE_MERGE>
+eagain_ifuture<> InternalNode::apply_children_merge(
+ context_t c, Ref<Node>&& left_child, laddr_t origin_left_addr,
+ Ref<Node>&& right_child, bool update_index)
+{
+ LOG_PREFIX(OTree::InternalNode::apply_children_merge);
+ auto left_pos = left_child->parent_info().position;
+ auto left_addr = left_child->impl->laddr();
+ auto& right_pos = right_child->parent_info().position;
+ auto right_addr = right_child->impl->laddr();
+ DEBUGT("apply {}'s child {} (was {:#x}) at pos({}), "
+ "to merge with {} at pos({}), update_index={} ...",
+ c.t, get_name(), left_child->get_name(), origin_left_addr, left_pos,
+ right_child->get_name(), right_pos, update_index);
+
+#ifndef NDEBUG
+ assert(left_child->parent_info().ptr == this);
+ assert(!left_pos.is_end());
+ const laddr_packed_t* p_value_left;
+ impl->get_slot(left_pos, nullptr, &p_value_left);
+ assert(p_value_left->value == origin_left_addr);
+
+ assert(right_child->use_count() == 1);
+ assert(right_child->parent_info().ptr == this);
+ const laddr_packed_t* p_value_right;
+ if (right_pos.is_end()) {
+ assert(right_child->impl->is_level_tail());
+ assert(left_child->impl->is_level_tail());
+ assert(impl->is_level_tail());
+ assert(!update_index);
+ p_value_right = impl->get_tail_value();
+ } else {
+ assert(!right_child->impl->is_level_tail());
+ assert(!left_child->impl->is_level_tail());
+ impl->get_slot(right_pos, nullptr, &p_value_right);
+ }
+ assert(p_value_right->value == right_addr);
+#endif
+
+ // XXX: we may jump to try_downgrade_root() without mutating this node.
+
+ // update layout from right_pos => right_addr to left_addr
+ impl->prepare_mutate(c);
+ impl->replace_child_addr(right_pos, left_addr, right_addr);
+
+ // update track from right_pos => right_child to left_child
+ left_child->deref_parent();
+ replace_track(left_child, right_child, update_index);
+
+ // erase left_pos from layout
+ auto [erase_stage, next_pos] = impl->erase(left_pos);
+ track_erase<false>(left_pos, erase_stage);
+ assert(next_pos == left_child->parent_info().position);
+
+ // All good to retire the right_child.
+ // I'm already ref-counted by left_child.
+ return right_child->retire(c, std::move(right_child)
+ ).si_then([c, this, update_index,
+ left_child = std::move(left_child)] () mutable {
+ if (update_index) {
+ // I'm all good but:
+ // - my number of keys is reduced by 1
+ // - my size may underflow, but try_merge_adjacent() is already part of fix_index()
+ return left_child->fix_parent_index<FORCE_MERGE>(c, std::move(left_child), true);
+ } else {
+ validate_tracked_children();
+ Ref<Node> this_ref = this;
+ left_child.reset();
+ // I'm all good but:
+ // - my number of keys is reduced by 1
+ // - my size may underflow
+ if (is_root()) {
+ return try_downgrade_root(c, std::move(this_ref));
+ } else {
+ return try_merge_adjacent<FORCE_MERGE>(
+ c, false, std::move(this_ref));
+ }
+ }
+ });
+}
+template eagain_ifuture<> InternalNode::apply_children_merge<true>(
+ context_t, Ref<Node>&&, laddr_t, Ref<Node>&&, bool);
+template eagain_ifuture<> InternalNode::apply_children_merge<false>(
+ context_t, Ref<Node>&&, laddr_t, Ref<Node>&&, bool);
+
+eagain_ifuture<std::pair<Ref<Node>, Ref<Node>>> InternalNode::get_child_peers(
+ context_t c, const search_position_t& pos)
+{
+ // assume I'm already ref counted by caller
+ search_position_t prev_pos;
+ const laddr_packed_t* prev_p_child_addr = nullptr;
+ search_position_t next_pos;
+ const laddr_packed_t* next_p_child_addr = nullptr;
+
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ if (!impl->is_keys_empty()) {
+ // got previous child only
+ impl->get_largest_slot(&prev_pos, nullptr, &prev_p_child_addr);
+ assert(prev_pos < pos);
+ assert(prev_p_child_addr != nullptr);
+ } else {
+ // no keys, so no peer children
+ }
+ } else { // !pos.is_end()
+ if (pos != search_position_t::begin()) {
+ // got previous child
+ prev_pos = pos;
+ impl->get_prev_slot(prev_pos, nullptr, &prev_p_child_addr);
+ assert(prev_pos < pos);
+ assert(prev_p_child_addr != nullptr);
+ } else {
+ // is already the first child, so no previous child
+ }
+
+ next_pos = pos;
+ impl->get_next_slot(next_pos, nullptr, &next_p_child_addr);
+ if (next_pos.is_end()) {
+ if (impl->is_level_tail()) {
+ // the next child is the tail
+ next_p_child_addr = impl->get_tail_value();
+ assert(pos < next_pos);
+ assert(next_p_child_addr != nullptr);
+ } else {
+ // next child doesn't exist
+ assert(next_p_child_addr == nullptr);
+ }
+ } else {
+ // got the next child
+ assert(pos < next_pos);
+ assert(next_p_child_addr != nullptr);
+ }
+ }
+
+ return eagain_iertr::now().si_then([this, c, prev_pos, prev_p_child_addr] {
+ if (prev_p_child_addr != nullptr) {
+ return get_or_track_child(c, prev_pos, prev_p_child_addr->value);
+ } else {
+ return eagain_iertr::make_ready_future<Ref<Node>>();
+ }
+ }).si_then([this, c, next_pos, next_p_child_addr] (Ref<Node> lnode) {
+ if (next_p_child_addr != nullptr) {
+ return get_or_track_child(c, next_pos, next_p_child_addr->value
+ ).si_then([lnode] (Ref<Node> rnode) {
+ return seastar::make_ready_future<std::pair<Ref<Node>, Ref<Node>>>(
+ lnode, rnode);
+ });
+ } else {
+ return eagain_iertr::make_ready_future<std::pair<Ref<Node>, Ref<Node>>>(
+ lnode, nullptr);
+ }
+ });
+}
+
+eagain_ifuture<Ref<InternalNode>> InternalNode::allocate_root(
+ context_t c, laddr_t hint, level_t old_root_level,
+ laddr_t old_root_addr, Super::URef&& super)
+{
+ // support tree height up to 256
+ ceph_assert(old_root_level < MAX_LEVEL);
+ return InternalNode::allocate(c, hint, field_type_t::N0, true, old_root_level + 1
+ ).si_then([c, old_root_addr,
+ super = std::move(super)](auto fresh_node) mutable {
+ auto root = fresh_node.node;
+ assert(root->impl->is_keys_empty());
+ auto p_value = root->impl->get_tail_value();
+ fresh_node.mut.copy_in_absolute(
+ const_cast<laddr_packed_t*>(p_value), old_root_addr);
+ root->make_root_from(c, std::move(super), old_root_addr);
+ ++(c.t.get_onode_tree_stats().extents_num_delta);
+ return root;
+ });
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+InternalNode::lookup_smallest(context_t c)
+{
+ impl->validate_non_empty();
+ auto position = search_position_t::begin();
+ const laddr_packed_t* p_child_addr;
+ impl->get_slot(position, nullptr, &p_child_addr);
+ return get_or_track_child(c, position, p_child_addr->value
+ ).si_then([c](auto child) {
+ return child->lookup_smallest(c);
+ });
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+InternalNode::lookup_largest(context_t c)
+{
+ // NOTE: unlike LeafNode::lookup_largest(), this only works on the level-tail
+ // internal node, which descends through its tail child.
+ impl->validate_non_empty();
+ assert(impl->is_level_tail());
+ auto p_child_addr = impl->get_tail_value();
+ return get_or_track_child(c, search_position_t::end(), p_child_addr->value
+ ).si_then([c](auto child) {
+ return child->lookup_largest(c);
+ });
+}
+
+eagain_ifuture<Node::search_result_t>
+InternalNode::lower_bound_tracked(
+ context_t c, const key_hobj_t& key, MatchHistory& history)
+{
+ auto result = impl->lower_bound(key, history);
+ return get_or_track_child(c, result.position, result.p_value->value
+ ).si_then([c, &key, &history](auto child) {
+ // XXX(multi-type): pass result.mstat to child
+ return child->lower_bound_tracked(c, key, history);
+ });
+}
+
+eagain_ifuture<> InternalNode::do_get_tree_stats(
+ context_t c, tree_stats_t& stats)
+{
+ impl->validate_non_empty();
+ auto nstats = impl->get_stats();
+ stats.size_persistent_internal += nstats.size_persistent;
+ stats.size_filled_internal += nstats.size_filled;
+ stats.size_logical_internal += nstats.size_logical;
+ stats.size_overhead_internal += nstats.size_overhead;
+ stats.size_value_internal += nstats.size_value;
+ stats.num_kvs_internal += nstats.num_kvs;
+ stats.num_nodes_internal += 1;
+
+ Ref<Node> this_ref = this;
+ return seastar::do_with(
+ search_position_t(), (const laddr_packed_t*)(nullptr),
+ [this, this_ref, c, &stats](auto& pos, auto& p_child_addr) {
+ pos = search_position_t::begin();
+ impl->get_slot(pos, nullptr, &p_child_addr);
+ return trans_intr::repeat(
+ [this, this_ref, c, &stats, &pos, &p_child_addr]()
+ -> eagain_ifuture<seastar::stop_iteration> {
+ return get_or_track_child(c, pos, p_child_addr->value
+ ).si_then([c, &stats](auto child) {
+ return child->do_get_tree_stats(c, stats);
+ }).si_then([this, this_ref, &pos, &p_child_addr] {
+ if (pos.is_end()) {
+ return seastar::stop_iteration::yes;
+ } else {
+ impl->get_next_slot(pos, nullptr, &p_child_addr);
+ if (pos.is_end()) {
+ if (impl->is_level_tail()) {
+ p_child_addr = impl->get_tail_value();
+ return seastar::stop_iteration::no;
+ } else {
+ return seastar::stop_iteration::yes;
+ }
+ } else {
+ return seastar::stop_iteration::no;
+ }
+ }
+ });
+ });
+ }
+ );
+}
+
+void InternalNode::track_merge(
+ Ref<Node> _right_node, match_stage_t stage, search_position_t& left_last_pos)
+{
+ assert(level() == _right_node->level());
+ assert(impl->node_type() == _right_node->impl->node_type());
+ auto& right_node = *static_cast<InternalNode*>(_right_node.get());
+ if (right_node.tracked_child_nodes.empty()) {
+ return;
+ }
+
+ match_stage_t curr_stage = STAGE_BOTTOM;
+
+ // prepare the initial left_last_pos for offset
+ while (curr_stage < stage) {
+ left_last_pos.index_by_stage(curr_stage) = 0;
+ ++curr_stage;
+ }
+ ++left_last_pos.index_by_stage(curr_stage);
+
+ // fix the tracked child nodes of right_node, stage by stage.
+ auto& right_tracked_children = right_node.tracked_child_nodes;
+ auto rit = right_tracked_children.begin();
+ while (curr_stage <= STAGE_TOP) {
+ auto right_pos_until = search_position_t::begin();
+ right_pos_until.index_by_stage(curr_stage) = INDEX_UPPER_BOUND;
+ auto rend = right_tracked_children.lower_bound(right_pos_until);
+ while (rit != rend) {
+ auto new_pos = rit->second->parent_info().position;
+ assert(new_pos == rit->first);
+ assert(rit->second->parent_info().ptr == &right_node);
+ new_pos += left_last_pos;
+ auto p_child = rit->second;
+ rit = right_tracked_children.erase(rit);
+ p_child->as_child(new_pos, this);
+ }
+ left_last_pos.index_by_stage(curr_stage) = 0;
+ ++curr_stage;
+ }
+
+ // fix the end tracked child node of right_node, if it exists.
+ if (rit != right_tracked_children.end()) {
+ assert(rit->first == search_position_t::end());
+ assert(rit->second->parent_info().position == search_position_t::end());
+ assert(right_node.impl->is_level_tail());
+ assert(impl->is_level_tail());
+ auto p_child = rit->second;
+ rit = right_tracked_children.erase(rit);
+ p_child->as_child(search_position_t::end(), this);
+ }
+ assert(right_tracked_children.empty());
+
+ validate_tracked_children();
+}
+
+eagain_ifuture<> InternalNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const
+{
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const Node> this_ref = this;
+ return InternalNode::allocate(c_other, L_ADDR_MIN, field_type_t::N0, true, impl->level()
+ ).si_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::assert_all{"Invalid error during test clone"}
+ ).si_then([c_other, cloned_root](auto&& super_other) {
+ assert(super_other);
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ return cloned_root;
+ });
+ }).si_then([this_ref, this, c_other](auto cloned_root) {
+ // clone tracked children
+ // In some unit tests, the children are stubbed out so that they don't
+ // exist in NodeExtentManager and are only tracked in memory.
+ return trans_intr::do_for_each(
+ tracked_child_nodes.begin(),
+ tracked_child_nodes.end(),
+ [this_ref, c_other, cloned_root](auto& kv) {
+ assert(kv.first == kv.second->parent_info().position);
+ return kv.second->test_clone_non_root(c_other, cloned_root);
+ }
+ );
+ });
+}
+
+eagain_ifuture<> InternalNode::try_downgrade_root(
+ context_t c, Ref<Node>&& this_ref)
+{
+ LOG_PREFIX(OTree::InternalNode::try_downgrade_root);
+ assert(this_ref.get() == this);
+ assert(is_root());
+ assert(impl->is_level_tail());
+ if (!impl->is_keys_empty()) {
+ // I have more than 1 value, no need to downgrade
+ return eagain_iertr::now();
+ }
+
+ // proceed downgrade root to the only child
+ laddr_t child_addr = impl->get_tail_value()->value;
+ return get_or_track_child(c, search_position_t::end(), child_addr
+ ).si_then([c, this, FNAME,
+ this_ref = std::move(this_ref)] (auto child) mutable {
+ INFOT("downgrade {} to new root {}",
+ c.t, get_name(), child->get_name());
+ // Invariant, see InternalNode::erase_child()
+ // the new internal root should have at least 2 children.
+ assert(child->impl->is_level_tail());
+ if (child->impl->node_type() == node_type_t::INTERNAL) {
+ ceph_assert(!child->impl->is_keys_empty());
+ }
+
+ assert(tracked_child_nodes.size() == 1);
+ child->deref_parent();
+ auto super_to_move = deref_super();
+ child->make_root_from(c, std::move(super_to_move), impl->laddr());
+ --(c.t.get_onode_tree_stats().extents_num_delta);
+ return retire(c, std::move(this_ref));
+ });
+}
+
+eagain_ifuture<Ref<InternalNode>> InternalNode::insert_or_split(
+ context_t c,
+ const search_position_t& pos,
+ const key_view_t& insert_key,
+ Ref<Node> insert_child,
+ Ref<Node> outdated_child)
+{
+ LOG_PREFIX(OTree::InternalNode::insert_or_split);
+ // XXX: check the insert_child is unlinked from this node
+#ifndef NDEBUG
+ auto _insert_key = *insert_child->impl->get_pivot_index();
+ assert(insert_key == _insert_key);
+#endif
+ auto insert_value = insert_child->impl->laddr();
+ auto insert_pos = pos;
+ DEBUGT("insert {} with insert_key={}, insert_child={}, insert_pos({}), "
+ "outdated_child={} ...",
+ c.t, get_name(), insert_key, insert_child->get_name(),
+ insert_pos, (outdated_child ? "True" : "False"));
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ insert_key, insert_value, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // proceed to insert
+ [[maybe_unused]] auto p_value = impl->insert(
+ insert_key, insert_value, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->value == insert_value);
+
+ if (outdated_child) {
+ track_insert<false>(insert_pos, insert_stage, insert_child);
+ validate_child_inconsistent(*outdated_child);
+#ifndef NDEBUG
+ do_untrack_child(*outdated_child);
+ validate_tracked_children();
+ do_track_child<false>(*outdated_child);
+#endif
+ } else {
+ track_insert(insert_pos, insert_stage, insert_child);
+ validate_tracked_children();
+ }
+
+ return eagain_iertr::make_ready_future<Ref<InternalNode>>(nullptr);
+ }
+
+ // proceed to split with insert
+ // assume I'm already ref-counted by caller
+ laddr_t left_hint, right_hint;
+ {
+ key_view_t left_key;
+ impl->get_slot(search_position_t::begin(), &left_key, nullptr);
+ left_hint = left_key.get_hint();
+ key_view_t right_key;
+ impl->get_largest_slot(nullptr, &right_key, nullptr);
+ right_hint = right_key.get_hint();
+ }
+ return (is_root() ? upgrade_root(c, left_hint) : eagain_iertr::now()
+ ).si_then([this, c, right_hint] {
+ return InternalNode::allocate(
+ c, right_hint, impl->field_type(), impl->is_level_tail(), impl->level());
+ }).si_then([this, insert_key, insert_child, insert_pos,
+ insert_stage=insert_stage, insert_size=insert_size,
+ outdated_child, c, FNAME](auto fresh_right) mutable {
+ // I'm the left_node and need to split into the right_node
+ auto right_node = fresh_right.node;
+ DEBUGT("proceed split {} to fresh {} with insert_child={},"
+ " outdated_child={} ...",
+ c.t, get_name(), right_node->get_name(),
+ insert_child->get_name(),
+ (outdated_child ? outdated_child->get_name() : "N/A"));
+ auto insert_value = insert_child->impl->laddr();
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, insert_key, insert_value,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->value == insert_value);
+ track_split(split_pos, right_node);
+
+ if (outdated_child) {
+ if (is_insert_left) {
+ track_insert<false>(insert_pos, insert_stage, insert_child);
+ } else {
+ right_node->template track_insert<false>(insert_pos, insert_stage, insert_child);
+ }
+#ifndef NDEBUG
+ auto& _parent = outdated_child->parent_info().ptr;
+ _parent->validate_child_inconsistent(*outdated_child);
+ _parent->do_untrack_child(*outdated_child);
+ validate_tracked_children();
+ right_node->validate_tracked_children();
+ _parent->do_track_child<false>(*outdated_child);
+#endif
+ } else {
+ if (is_insert_left) {
+ track_insert(insert_pos, insert_stage, insert_child);
+ } else {
+ right_node->track_insert(insert_pos, insert_stage, insert_child);
+ }
+ validate_tracked_children();
+ right_node->validate_tracked_children();
+ }
+ ++(c.t.get_onode_tree_stats().extents_num_delta);
+ return right_node;
+ });
+}
+
+eagain_ifuture<Ref<Node>> InternalNode::get_or_track_child(
+ context_t c, const search_position_t& position, laddr_t child_addr)
+{
+ LOG_PREFIX(OTree::InternalNode::get_or_track_child);
+ Ref<Node> this_ref = this;
+ return [this, position, child_addr, c, FNAME] {
+ auto found = tracked_child_nodes.find(position);
+ if (found != tracked_child_nodes.end()) {
+ TRACET("loaded child tracked {} at pos({}) addr={:x}",
+ c.t, found->second->get_name(), position, child_addr);
+ return eagain_iertr::make_ready_future<Ref<Node>>(found->second);
+ }
+ // the child is not loaded yet
+ TRACET("loading child at pos({}) addr={:x} ...",
+ c.t, position, child_addr);
+ bool level_tail = position.is_end();
+ return Node::load(c, child_addr, level_tail
+ ).si_then([this, position, c, FNAME] (auto child) {
+ TRACET("loaded child untracked {}",
+ c.t, child->get_name());
+ if (child->level() + 1 != level()) {
+ ERRORT("loaded child {} error from parent {} at pos({}), level mismatch",
+ c.t, child->get_name(), get_name(), position);
+ ceph_abort("fatal error");
+ }
+ child->as_child(position, this);
+ return child;
+ });
+ }().si_then([this_ref, this, position, child_addr] (auto child) {
+ assert(child_addr == child->impl->laddr());
+ assert(position == child->parent_info().position);
+ std::ignore = position;
+ std::ignore = child_addr;
+ validate_child_tracked(*child);
+ return child;
+ });
+}
+
+template <bool VALIDATE>
+void InternalNode::track_insert(
+ const search_position_t& insert_pos, match_stage_t insert_stage,
+ Ref<Node> insert_child, Ref<Node> nxt_child)
+{
+ // update tracks
+ auto pos_upper_bound = insert_pos;
+ pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+ auto first = tracked_child_nodes.lower_bound(insert_pos);
+ auto last = tracked_child_nodes.lower_bound(pos_upper_bound);
+ std::vector<Node*> nodes;
+ std::for_each(first, last, [&nodes](auto& kv) {
+ nodes.push_back(kv.second);
+ });
+ tracked_child_nodes.erase(first, last);
+ for (auto& node : nodes) {
+ auto _pos = node->parent_info().position;
+ assert(!_pos.is_end());
+ ++_pos.index_by_stage(insert_stage);
+ node->as_child<VALIDATE>(_pos, this);
+ }
+ // track insert
+ insert_child->as_child(insert_pos, this);
+
+#ifndef NDEBUG
+ // validate left_child is before right_child
+ if (nxt_child) {
+ auto iter = tracked_child_nodes.find(insert_pos);
+ ++iter;
+ assert(iter->second == nxt_child);
+ }
+#endif
+}
+template void InternalNode::track_insert<true>(const search_position_t&, match_stage_t, Ref<Node>, Ref<Node>);
+template void InternalNode::track_insert<false>(const search_position_t&, match_stage_t, Ref<Node>, Ref<Node>);
+
+void InternalNode::replace_track(
+ Ref<Node> new_child, Ref<Node> old_child, bool is_new_child_outdated)
+{
+ assert(!new_child->is_tracked());
+ auto& pos = old_child->parent_info().position;
+ auto this_ref = old_child->deref_parent();
+ assert(this_ref == this);
+ if (is_new_child_outdated) {
+ // we need to keep track of the outdated child through
+ // insert and split.
+ new_child->as_child<false>(pos, this);
+ } else {
+ new_child->as_child(pos, this);
+ }
+
+#ifndef NDEBUG
+ if (is_new_child_outdated) {
+ validate_child_inconsistent(*new_child);
+ } else {
+ validate_child_tracked(*new_child);
+ }
+#endif
+}
+
+void InternalNode::track_split(
+ const search_position_t& split_pos, Ref<InternalNode> right_node)
+{
+ auto iter = tracked_child_nodes.lower_bound(split_pos);
+ while (iter != tracked_child_nodes.end()) {
+ auto new_pos = iter->first;
+ auto p_node = iter->second;
+ iter = tracked_child_nodes.erase(iter);
+ new_pos -= split_pos;
+ p_node->as_child<false>(new_pos, right_node);
+ }
+}
+
+template <bool VALIDATE>
+void InternalNode::track_erase(
+ const search_position_t& erase_pos, match_stage_t erase_stage)
+{
+ auto first = tracked_child_nodes.lower_bound(erase_pos);
+ assert(first == tracked_child_nodes.end() ||
+ first->first != erase_pos);
+ auto pos_upper_bound = erase_pos;
+ pos_upper_bound.index_by_stage(erase_stage) = INDEX_UPPER_BOUND;
+ auto last = tracked_child_nodes.lower_bound(pos_upper_bound);
+ std::vector<Node*> p_nodes;
+ std::for_each(first, last, [&p_nodes](auto& kv) {
+ p_nodes.push_back(kv.second);
+ });
+ tracked_child_nodes.erase(first, last);
+ for (auto& p_node: p_nodes) {
+ auto new_pos = p_node->parent_info().position;
+ assert(new_pos.index_by_stage(erase_stage) > 0);
+ --new_pos.index_by_stage(erase_stage);
+ p_node->as_child<VALIDATE>(new_pos, this);
+ }
+}
+template void InternalNode::track_erase<true>(const search_position_t&, match_stage_t);
+template void InternalNode::track_erase<false>(const search_position_t&, match_stage_t);
+
+void InternalNode::track_make_tail(const search_position_t& last_pos)
+{
+ // assume I'm ref counted by the caller.
+ assert(impl->is_level_tail());
+ assert(!last_pos.is_end());
+ assert(tracked_child_nodes.find(search_position_t::end()) ==
+ tracked_child_nodes.end());
+ auto last_it = tracked_child_nodes.find(last_pos);
+ if (last_it != tracked_child_nodes.end()) {
+ assert(std::next(last_it) == tracked_child_nodes.end());
+ auto p_last_child = last_it->second;
+ tracked_child_nodes.erase(last_it);
+ p_last_child->as_child(search_position_t::end(), this);
+ } else {
+ assert(tracked_child_nodes.lower_bound(last_pos) ==
+ tracked_child_nodes.end());
+ }
+}
+
+void InternalNode::validate_child(const Node& child) const
+{
+#ifndef NDEBUG
+ assert(impl->level() - 1 == child.impl->level());
+ assert(this == child.parent_info().ptr);
+ auto& child_pos = child.parent_info().position;
+ if (child_pos.is_end()) {
+ assert(impl->is_level_tail());
+ assert(child.impl->is_level_tail());
+ assert(impl->get_tail_value()->value == child.impl->laddr());
+ } else {
+ assert(!child.impl->is_level_tail());
+ key_view_t index_key;
+ const laddr_packed_t* p_child_addr;
+ impl->get_slot(child_pos, &index_key, &p_child_addr);
+ assert(index_key == *child.impl->get_pivot_index());
+ assert(p_child_addr->value == child.impl->laddr());
+ }
+ // XXX(multi-type)
+ assert(impl->field_type() <= child.impl->field_type());
+#endif
+}
+
+void InternalNode::validate_child_inconsistent(const Node& child) const
+{
+#ifndef NDEBUG
+ assert(impl->level() - 1 == child.impl->level());
+ assert(check_is_tracking(child));
+ auto& child_pos = child.parent_info().position;
+ // the tail value has no key to fix
+ assert(!child_pos.is_end());
+ assert(!child.impl->is_level_tail());
+
+ key_view_t current_key;
+ const laddr_packed_t* p_value;
+ impl->get_slot(child_pos, &current_key, &p_value);
+ key_view_t new_key = *child.impl->get_pivot_index();
+ assert(current_key != new_key);
+ assert(p_value->value == child.impl->laddr());
+#endif
+}
+
+eagain_ifuture<InternalNode::fresh_node_t> InternalNode::allocate(
+ context_t c, laddr_t hint, field_type_t field_type, bool is_level_tail, level_t level)
+{
+ return InternalNodeImpl::allocate(c, hint, field_type, is_level_tail, level
+ ).si_then([](auto&& fresh_impl) {
+ auto *derived_ptr = fresh_impl.impl.get();
+ auto node = Ref<InternalNode>(new InternalNode(
+ derived_ptr, std::move(fresh_impl.impl)));
+ return fresh_node_t{node, fresh_impl.mut};
+ });
+}
+
+/*
+ * LeafNode
+ */
+
+LeafNode::LeafNode(LeafNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+bool LeafNode::is_level_tail() const
+{
+ return impl->is_level_tail();
+}
+
+node_version_t LeafNode::get_version() const
+{
+ return {layout_version, impl->get_extent_state()};
+}
+
+const char* LeafNode::read() const
+{
+ return impl->read();
+}
+
+extent_len_t LeafNode::get_node_size() const
+{
+ return impl->get_node_size();
+}
+
+std::tuple<key_view_t, const value_header_t*>
+LeafNode::get_kv(const search_position_t& pos) const
+{
+ key_view_t key_view;
+ const value_header_t* p_value_header;
+ impl->get_slot(pos, &key_view, &p_value_header);
+ return {key_view, p_value_header};
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+LeafNode::get_next_cursor(context_t c, const search_position_t& pos)
+{
+ impl->validate_non_empty();
+ search_position_t next_pos = pos;
+ key_view_t index_key;
+ const value_header_t* p_value_header = nullptr;
+ impl->get_next_slot(next_pos, &index_key, &p_value_header);
+ if (next_pos.is_end()) {
+ if (unlikely(is_level_tail())) {
+ return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>(
+ tree_cursor_t::create_end(this));
+ } else {
+ return get_next_cursor_from_parent(c);
+ }
+ } else {
+ return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(next_pos, index_key, p_value_header));
+ }
+}
+
+template <bool FORCE_MERGE>
+eagain_ifuture<Ref<tree_cursor_t>>
+LeafNode::erase(context_t c, const search_position_t& pos, bool get_next)
+{
+ LOG_PREFIX(OTree::LeafNode::erase);
+ assert(!pos.is_end());
+ assert(!impl->is_keys_empty());
+ Ref<Node> this_ref = this;
+ DEBUGT("erase {}'s pos({}), get_next={} ...",
+ c.t, get_name(), pos, get_next);
+ ++(c.t.get_onode_tree_stats().num_erases);
+
+ // get the next cursor
+ return eagain_iertr::now().si_then([c, &pos, get_next, this] {
+ if (get_next) {
+ return get_next_cursor(c, pos);
+ } else {
+ return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>();
+ }
+ }).si_then([c, &pos, this_ref = std::move(this_ref),
+ this, FNAME] (Ref<tree_cursor_t> next_cursor) mutable {
+ if (next_cursor && next_cursor->is_end()) {
+ // reset the node reference from the end cursor
+ next_cursor.reset();
+ }
+ return eagain_iertr::now().si_then(
+ [c, &pos, this_ref = std::move(this_ref), this, FNAME] () mutable {
+ assert_moveable(this_ref);
+#ifndef NDEBUG
+ assert(!impl->is_keys_empty());
+ if (impl->has_single_value()) {
+ assert(pos == search_position_t::begin());
+ }
+#endif
+ if (!is_root() && impl->has_single_value()) {
+ // we need to keep the root as an empty leaf node
+ // fast path without mutating the extent
+ // track_erase
+ DEBUGT("{} has one value left, erase ...", c.t, get_name());
+ assert(tracked_cursors.size() == 1);
+ auto iter = tracked_cursors.begin();
+ assert(iter->first == pos);
+ iter->second->invalidate();
+ tracked_cursors.clear();
+
+ // no cursor should be referencing this node now; this_ref is the last one.
+ assert(this_ref->use_count() == 1);
+ return Node::erase_node(c, std::move(this_ref));
+ }
+
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto [erase_stage, next_pos] = impl->erase(pos);
+ track_erase(pos, erase_stage);
+ validate_tracked_cursors();
+
+ if (is_root()) {
+ return eagain_iertr::now();
+ } else {
+ bool update_parent_index;
+ if (impl->is_level_tail()) {
+ update_parent_index = false;
+ } else {
+ update_parent_index = next_pos.is_end();
+ }
+ return try_merge_adjacent<FORCE_MERGE>(
+ c, update_parent_index, std::move(this_ref));
+ }
+ }).si_then([next_cursor] {
+ return next_cursor;
+ });
+ });
+}
+template eagain_ifuture<Ref<tree_cursor_t>>
+LeafNode::erase<true>(context_t, const search_position_t&, bool);
+template eagain_ifuture<Ref<tree_cursor_t>>
+LeafNode::erase<false>(context_t, const search_position_t&, bool);
+
+eagain_ifuture<> LeafNode::extend_value(
+ context_t c, const search_position_t& pos, value_size_t extend_size)
+{
+ ceph_abort("not implemented");
+ return eagain_iertr::now();
+}
+
+eagain_ifuture<> LeafNode::trim_value(
+ context_t c, const search_position_t& pos, value_size_t trim_size)
+{
+ ceph_abort("not implemented");
+ return eagain_iertr::now();
+}
+
+std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+LeafNode::prepare_mutate_value_payload(context_t c)
+{
+ return impl->prepare_mutate_value_payload(c);
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+LeafNode::lookup_smallest(context_t)
+{
+ if (unlikely(impl->is_keys_empty())) {
+ assert(is_root());
+ return seastar::make_ready_future<Ref<tree_cursor_t>>(
+ tree_cursor_t::create_end(this));
+ }
+ auto pos = search_position_t::begin();
+ key_view_t index_key;
+ const value_header_t* p_value_header;
+ impl->get_slot(pos, &index_key, &p_value_header);
+ return seastar::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(pos, index_key, p_value_header));
+}
+
+eagain_ifuture<Ref<tree_cursor_t>>
+LeafNode::lookup_largest(context_t)
+{
+ if (unlikely(impl->is_keys_empty())) {
+ assert(is_root());
+ return seastar::make_ready_future<Ref<tree_cursor_t>>(
+ tree_cursor_t::create_end(this));
+ }
+ search_position_t pos;
+ key_view_t index_key;
+ const value_header_t* p_value_header = nullptr;
+ impl->get_largest_slot(&pos, &index_key, &p_value_header);
+ return seastar::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(pos, index_key, p_value_header));
+}
+
+eagain_ifuture<Node::search_result_t>
+LeafNode::lower_bound_tracked(
+ context_t c, const key_hobj_t& key, MatchHistory& history)
+{
+ key_view_t index_key;
+ auto result = impl->lower_bound(key, history, &index_key);
+ Ref<tree_cursor_t> cursor;
+ if (result.position.is_end()) {
+ assert(!result.p_value);
+ cursor = tree_cursor_t::create_end(this);
+ } else {
+ cursor = get_or_track_cursor(result.position, index_key, result.p_value);
+ }
+ search_result_t ret{cursor, result.mstat};
+ ret.validate_input_key(key, c.vb.get_header_magic());
+ return seastar::make_ready_future<search_result_t>(ret);
+}
+
+eagain_ifuture<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats)
+{
+ auto nstats = impl->get_stats();
+ stats.size_persistent_leaf += nstats.size_persistent;
+ stats.size_filled_leaf += nstats.size_filled;
+ stats.size_logical_leaf += nstats.size_logical;
+ stats.size_overhead_leaf += nstats.size_overhead;
+ stats.size_value_leaf += nstats.size_value;
+ stats.num_kvs_leaf += nstats.num_kvs;
+ stats.num_nodes_leaf += 1;
+ return eagain_iertr::now();
+}
+
+void LeafNode::track_merge(
+ Ref<Node> _right_node, match_stage_t stage, search_position_t& left_last_pos)
+{
+ assert(level() == _right_node->level());
+ // assert(impl->node_type() == _right_node->impl->node_type());
+ auto& right_node = *static_cast<LeafNode*>(_right_node.get());
+ if (right_node.tracked_cursors.empty()) {
+ return;
+ }
+
+ match_stage_t curr_stage = STAGE_BOTTOM;
+
+ // prepare the initial left_last_pos for offset
+ while (curr_stage < stage) {
+ left_last_pos.index_by_stage(curr_stage) = 0;
+ ++curr_stage;
+ }
+ ++left_last_pos.index_by_stage(curr_stage);
+
+ // fix the tracked cursors of right_node, stage by stage.
+ auto& right_tracked_cursors = right_node.tracked_cursors;
+ auto rit = right_tracked_cursors.begin();
+ while (curr_stage <= STAGE_TOP) {
+ auto right_pos_until = search_position_t::begin();
+ right_pos_until.index_by_stage(curr_stage) = INDEX_UPPER_BOUND;
+ auto rend = right_tracked_cursors.lower_bound(right_pos_until);
+ while (rit != rend) {
+ auto new_pos = rit->second->get_position();
+ assert(new_pos == rit->first);
+ assert(rit->second->get_leaf_node().get() == &right_node);
+ new_pos += left_last_pos;
+ auto p_cursor = rit->second;
+ rit = right_tracked_cursors.erase(rit);
+ p_cursor->update_track<true>(this, new_pos);
+ }
+ left_last_pos.index_by_stage(curr_stage) = 0;
+ ++curr_stage;
+ }
+ assert(right_tracked_cursors.empty());
+
+ validate_tracked_cursors();
+}
+
+eagain_ifuture<> LeafNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const
+{
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const Node> this_ref = this;
+ return LeafNode::allocate(c_other, L_ADDR_MIN, field_type_t::N0, true
+ ).si_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::assert_all{"Invalid error during test clone"}
+ ).si_then([c_other, cloned_root](auto&& super_other) {
+ assert(super_other);
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ });
+ }).si_then([this_ref]{});
+}
+
+eagain_ifuture<Ref<tree_cursor_t>> LeafNode::insert_value(
+ context_t c, const key_hobj_t& key, value_config_t vconf,
+ const search_position_t& pos, const MatchHistory& history,
+ match_stat_t mstat)
+{
+ LOG_PREFIX(OTree::LeafNode::insert_value);
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ DEBUGT("insert {} with insert_key={}, insert_value={}, insert_pos({}), "
+ "history={}, mstat({}) ...",
+ c.t, get_name(), key, vconf, pos, history, mstat);
+ ++(c.t.get_onode_tree_stats().num_inserts);
+ search_position_t insert_pos = pos;
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ key, vconf, history, mstat, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // proceed to insert
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto p_value_header = impl->insert(key, vconf, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value_header->payload_size == vconf.payload_size);
+ auto ret = track_insert(insert_pos, insert_stage, p_value_header);
+ validate_tracked_cursors();
+ return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>(ret);
+ }
+ // split and insert
+ Ref<Node> this_ref = this;
+ laddr_t left_hint, right_hint;
+ {
+ key_view_t left_key;
+ impl->get_slot(search_position_t::begin(), &left_key, nullptr);
+ left_hint = left_key.get_hint();
+ key_view_t right_key;
+ impl->get_largest_slot(nullptr, &right_key, nullptr);
+ right_hint = right_key.get_hint();
+ }
+ return (is_root() ? upgrade_root(c, left_hint) : eagain_iertr::now()
+ ).si_then([this, c, right_hint] {
+ return LeafNode::allocate(c, right_hint, impl->field_type(), impl->is_level_tail());
+ }).si_then([this_ref = std::move(this_ref), this, c, &key, vconf, FNAME,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ DEBUGT("proceed split {} to fresh {} ...",
+ c.t, get_name(), right_node->get_name());
+ // no need to bump version for right node, as it is fresh
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto [split_pos, is_insert_left, p_value_header] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, key, vconf,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value_header->payload_size == vconf.payload_size);
+ track_split(split_pos, right_node);
+ Ref<tree_cursor_t> ret;
+ if (is_insert_left) {
+ ret = track_insert(insert_pos, insert_stage, p_value_header);
+ } else {
+ ret = right_node->track_insert(insert_pos, insert_stage, p_value_header);
+ }
+ validate_tracked_cursors();
+ right_node->validate_tracked_cursors();
+
+ ++(c.t.get_onode_tree_stats().extents_num_delta);
+ return apply_split_to_parent(
+ c, std::move(this_ref), std::move(right_node), false
+ ).si_then([ret] {
+ return ret;
+ });
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+eagain_ifuture<Ref<LeafNode>> LeafNode::allocate_root(
+ context_t c, RootNodeTracker& root_tracker)
+{
+ LOG_PREFIX(OTree::LeafNode::allocate_root);
+ return LeafNode::allocate(c, L_ADDR_MIN, field_type_t::N0, true
+ ).si_then([c, &root_tracker, FNAME](auto fresh_node) {
+ auto root = fresh_node.node;
+ return c.nm.get_super(c.t, root_tracker
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle([FNAME, c] {
+ ERRORT("EIO during get_super()", c.t);
+ ceph_abort("fatal error");
+ })
+ ).si_then([c, root](auto&& super) {
+ assert(super);
+ root->make_root_new(c, std::move(super));
+ return root;
+ });
+ });
+}
+
+Ref<tree_cursor_t> LeafNode::get_or_track_cursor(
+ const search_position_t& position,
+ const key_view_t& key, const value_header_t* p_value_header)
+{
+ assert(!position.is_end());
+ assert(p_value_header);
+ Ref<tree_cursor_t> p_cursor;
+ auto found = tracked_cursors.find(position);
+ if (found == tracked_cursors.end()) {
+ p_cursor = tree_cursor_t::create_tracked(
+ this, position, key, p_value_header);
+ } else {
+ p_cursor = found->second;
+ assert(p_cursor->get_leaf_node() == this);
+ assert(p_cursor->get_position() == position);
+ p_cursor->update_cache_same_node(key, p_value_header);
+ }
+ return p_cursor;
+}
+
+void LeafNode::validate_cursor(const tree_cursor_t& cursor) const
+{
+#ifndef NDEBUG
+ assert(this == cursor.get_leaf_node().get());
+ assert(cursor.is_tracked());
+ assert(!impl->is_extent_retired());
+
+ // We need to make sure the user has freed all the cursors before submitting
+ // the corresponding transaction. Otherwise the checks below will have
+ // undefined behavior.
+ auto [key, p_value_header] = get_kv(cursor.get_position());
+ auto magic = p_value_header->magic;
+ assert(key == cursor.get_key_view(magic));
+ assert(p_value_header == cursor.read_value_header(magic));
+#endif
+}
+
+Ref<tree_cursor_t> LeafNode::track_insert(
+ const search_position_t& insert_pos, match_stage_t insert_stage,
+ const value_header_t* p_value_header)
+{
+ // update cursor position
+ auto pos_upper_bound = insert_pos;
+ pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+ auto first = tracked_cursors.lower_bound(insert_pos);
+ auto last = tracked_cursors.lower_bound(pos_upper_bound);
+ std::vector<tree_cursor_t*> p_cursors;
+ std::for_each(first, last, [&p_cursors](auto& kv) {
+ p_cursors.push_back(kv.second);
+ });
+ tracked_cursors.erase(first, last);
+ for (auto& p_cursor : p_cursors) {
+ search_position_t new_pos = p_cursor->get_position();
+ ++new_pos.index_by_stage(insert_stage);
+ p_cursor->update_track<true>(this, new_pos);
+ }
+
+ // track insert
+ // TODO: getting key_view_t from stage::proceed_insert() and
+ // stage::append_insert() is not supported yet
+ return tree_cursor_t::create_inserted(
+ this, insert_pos);
+}
+
+void LeafNode::track_split(
+ const search_position_t& split_pos, Ref<LeafNode> right_node)
+{
+ // update cursor ownership and position
+ auto iter = tracked_cursors.lower_bound(split_pos);
+ while (iter != tracked_cursors.end()) {
+ auto new_pos = iter->first;
+ auto p_cursor = iter->second;
+ iter = tracked_cursors.erase(iter);
+ new_pos -= split_pos;
+ p_cursor->update_track<false>(right_node, new_pos);
+ }
+}
+
+void LeafNode::track_erase(
+ const search_position_t& erase_pos, match_stage_t erase_stage)
+{
+ // erase tracking and invalidate the erased cursor
+ auto to_erase = tracked_cursors.find(erase_pos);
+ assert(to_erase != tracked_cursors.end());
+ to_erase->second->invalidate();
+ auto first = tracked_cursors.erase(to_erase);
+
+ // update cursor position
+ assert(first == tracked_cursors.lower_bound(erase_pos));
+ auto pos_upper_bound = erase_pos;
+ pos_upper_bound.index_by_stage(erase_stage) = INDEX_UPPER_BOUND;
+ auto last = tracked_cursors.lower_bound(pos_upper_bound);
+ std::vector<tree_cursor_t*> p_cursors;
+ std::for_each(first, last, [&p_cursors](auto& kv) {
+ p_cursors.push_back(kv.second);
+ });
+ tracked_cursors.erase(first, last);
+ for (auto& p_cursor : p_cursors) {
+ search_position_t new_pos = p_cursor->get_position();
+ assert(new_pos.index_by_stage(erase_stage) > 0);
+ --new_pos.index_by_stage(erase_stage);
+ p_cursor->update_track<true>(this, new_pos);
+ }
+}
+
+eagain_ifuture<LeafNode::fresh_node_t> LeafNode::allocate(
+ context_t c, laddr_t hint, field_type_t field_type, bool is_level_tail)
+{
+ return LeafNodeImpl::allocate(c, hint, field_type, is_level_tail
+ ).si_then([](auto&& fresh_impl) {
+ auto *derived_ptr = fresh_impl.impl.get();
+ auto node = Ref<LeafNode>(new LeafNode(
+ derived_ptr, std::move(fresh_impl.impl)));
+ return fresh_node_t{node, fresh_impl.mut};
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
new file mode 100644
index 000000000..0b764172e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
@@ -0,0 +1,743 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <compare>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "crimson/common/type_helpers.h"
+
+#include "node_extent_mutable.h"
+#include "stages/key_layout.h"
+#include "stages/stage_types.h"
+#include "super.h"
+#include "value.h"
+
+/**
+ * Tree example (2 levels):
+ *
+ * Root node keys: [ 3 7 ]
+ * values: [p1 p2 p3]
+ * / | \
+ * ------- | -------
+ * | | |
+ * V V V
+ * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12]
+ * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9]
+ *
+ * Tree structure properties:
+ * - As illustrated above, the parent key is strictly equal to its left child's
+ * largest key;
+ * - If a tree is indexing multiple seastore transactions, each transaction
+ * will be mapped to a Super which points to a distinct root node. So the
+ * transactions are isolated at tree level. However, tree nodes from
+ * different transactions can reference the same seastore CachedExtent before
+ * modification;
+ * - The resources of the transactional tree are tracked by tree_cursor_ts held
+ *   by users. As long as any cursor is alive, the corresponding tree hierarchy
+ *   is alive and kept tracked. See the reversed resource management sections
+ * below;
+ */
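+
+/*
+ * Editorial walk-through (not part of the original header) using the example
+ * above: looking up key 5 compares it against the root keys [ 3 7 ]; since
+ * 5 > 3 and 5 <= 7, the lookup descends through p2 into the middle leaf
+ * [ 4 5 7 ] and lands on v5. Because a parent key is strictly equal to its
+ * left child's largest key, this routing is unambiguous, and any key larger
+ * than 7 (e.g. 9) falls through to the tail child p3.
+ */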
+
+namespace crimson::os::seastore::onode {
+
+class LeafNode;
+class InternalNode;
+
+using layout_version_t = uint32_t;
+struct node_version_t {
+ layout_version_t layout;
+ nextent_state_t state;
+
+ bool operator==(const node_version_t& rhs) const {
+ return (layout == rhs.layout && state == rhs.state);
+ }
+ bool operator!=(const node_version_t& rhs) const {
+ return !(*this == rhs);
+ }
+};
+
+/**
+ * tree_cursor_t
+ *
+ * A cursor points to a position (LeafNode and search_position_t) of the tree
+ * where it can find the according key and value pair. The position is updated
+ * by LeafNode insert/split/delete/merge internally and is kept valid. It also
+ * caches the key-value information for a specific node layout version.
+ *
+ * Exposes public interfaces for Btree::Cursor.
+ */
+class tree_cursor_t final
+ : public boost::intrusive_ref_counter<
+ tree_cursor_t, boost::thread_unsafe_counter> {
+ public:
+ ~tree_cursor_t();
+ tree_cursor_t(const tree_cursor_t&) = delete;
+ tree_cursor_t(tree_cursor_t&&) = delete;
+ tree_cursor_t& operator=(const tree_cursor_t&) = delete;
+ tree_cursor_t& operator=(tree_cursor_t&&) = delete;
+
+ // public to Btree
+
+ /**
+ * is_end
+ *
+ * Represents one-past-the-last of all the sorted key-value
+ * pairs in the tree. An end cursor won't contain valid key-value
+ * information.
+ */
+ bool is_end() const { return !!ref_leaf_node && position.is_end(); }
+
+ /**
+ * is_tracked
+ *
+ * Represents a key-value pair stored in the tree, which is always tracked
+ * across insert/split/erase/merge operations.
+ */
+ bool is_tracked() const { return !!ref_leaf_node && !position.is_end(); }
+
+ /**
+ * is_invalid
+ *
+ * Represents an invalid cursor which was once valid and tracked by the tree
+   * but is now erased and untracked. The user may still hold an invalid cursor.
+ */
+ bool is_invalid() const { return !ref_leaf_node; }
+
+  /// Returns the key view in the tree if it is not an end cursor.
+ const key_view_t& get_key_view(value_magic_t magic) const {
+ assert(is_tracked());
+ return cache.get_key_view(magic, position);
+ }
+
+  /// Returns the next tree_cursor_t in the tree; can be an end cursor if there is no next.
+ eagain_ifuture<Ref<tree_cursor_t>> get_next(context_t);
+
+ /// Check that this is next to prv
+ void assert_next_to(const tree_cursor_t&, value_magic_t) const;
+
+ /// Erases the key-value pair from tree.
+ template <bool FORCE_MERGE = false>
+ eagain_ifuture<Ref<tree_cursor_t>> erase(context_t, bool get_next);
+
+ std::strong_ordering compare_to(const tree_cursor_t&, value_magic_t) const;
+
+ // public to Value
+
+ /// Get the latest value_header_t pointer for read.
+ const value_header_t* read_value_header(value_magic_t magic) const {
+ assert(is_tracked());
+ return cache.get_p_value_header(magic, position);
+ }
+
+ /// Prepare the node extent to be mutable and recorded.
+ std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ prepare_mutate_value_payload(context_t c) {
+ assert(is_tracked());
+ if (!is_mutated) {
+ is_mutated = true;
+ ++(c.t.get_onode_tree_stats().num_updates);
+ }
+ return cache.prepare_mutate_value_payload(c, position);
+ }
+
+ /// Extends the size of value payload.
+ eagain_ifuture<> extend_value(context_t, value_size_t);
+
+ /// Trim and shrink the value payload.
+ eagain_ifuture<> trim_value(context_t, value_size_t);
+
+ static Ref<tree_cursor_t> get_invalid() {
+ Ref<tree_cursor_t> INVALID = new tree_cursor_t();
+ return INVALID;
+ }
+
+ private:
+ // create from insert
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&);
+ // create from lookup
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&,
+ const key_view_t&, const value_header_t*);
+  // lookup reached the end; contains the leaf node for further inserts
+ tree_cursor_t(Ref<LeafNode>);
+ // create an invalid tree_cursor_t
+ tree_cursor_t() : cache{ref_leaf_node} {}
+
+ const search_position_t& get_position() const { return position; }
+ Ref<LeafNode> get_leaf_node() const { return ref_leaf_node; }
+ template <bool VALIDATE>
+ void update_track(Ref<LeafNode>, const search_position_t&);
+ void update_cache_same_node(const key_view_t&,
+ const value_header_t*) const;
+ void invalidate();
+
+ static Ref<tree_cursor_t> create_inserted(
+ Ref<LeafNode> node, const search_position_t& pos) {
+ return new tree_cursor_t(node, pos);
+ }
+
+ static Ref<tree_cursor_t> create_tracked(
+ Ref<LeafNode> node, const search_position_t& pos,
+ const key_view_t& key, const value_header_t* p_header) {
+ return new tree_cursor_t(node, pos, key, p_header);
+ }
+
+ static Ref<tree_cursor_t> create_end(Ref<LeafNode> node) {
+ return new tree_cursor_t(node);
+ }
+
+ /**
+ * Reversed resource management (tree_cursor_t)
+ *
+ * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be
+   * alive as long as any of its cursors is still referenced by the user.
+ */
+ Ref<LeafNode> ref_leaf_node;
+ search_position_t position;
+
+ // account 1 update even if there are multiple updates to the same value
+ bool is_mutated = false;
+
+ /** Cache
+ *
+ * Cached memory pointers or views which may be outdated due to
+ * extent copy-on-write or asynchronous leaf node updates.
+ */
+ class Cache {
+ public:
+ Cache(Ref<LeafNode>&);
+ void validate_is_latest(const search_position_t&) const;
+ void invalidate() { needs_update_all = true; }
+ void update_all(const node_version_t&, const key_view_t&, const value_header_t*);
+ const key_view_t& get_key_view(
+ value_magic_t magic, const search_position_t& pos) {
+ make_latest(magic, pos);
+ return *key_view;
+ }
+ const value_header_t* get_p_value_header(
+ value_magic_t magic, const search_position_t& pos) {
+ make_latest(magic, pos);
+ return p_value_header;
+ }
+ std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ prepare_mutate_value_payload(context_t, const search_position_t&);
+
+ private:
+ void maybe_duplicate(const node_version_t&);
+ void make_latest(value_magic_t, const search_position_t&);
+
+ // metadata about how cache is valid
+ Ref<LeafNode>& ref_leaf_node;
+ bool needs_update_all = true;
+ node_version_t version;
+
+ // cached key value info
+ const char* p_node_base = nullptr;
+ std::optional<key_view_t> key_view;
+ const value_header_t* p_value_header = nullptr;
+
+ // cached data-structures to update value payload
+ std::optional<NodeExtentMutable> value_payload_mut;
+ ValueDeltaRecorder* p_value_recorder = nullptr;
+ };
+ mutable Cache cache;
+
+ friend class LeafNode;
+ friend class Node; // get_position(), get_leaf_node()
+};
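+
+// Editorial sketch, not part of the original header: is_end(), is_tracked()
+// and is_invalid() above are mutually exclusive and exhaustive, so exactly
+// one of them holds for any cursor. A debug-only check could verify this;
+// the helper name is hypothetical and for illustration only.
+[[maybe_unused]] inline bool cursor_state_is_consistent(const tree_cursor_t& c)
+{
+  return (int(c.is_end()) + int(c.is_tracked()) + int(c.is_invalid())) == 1;
+}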
+
+/**
+ * Node
+ *
+ * An abstracted class for both InternalNode and LeafNode.
+ *
+ * Exposes public interfaces for Btree.
+ */
+class Node
+ : public boost::intrusive_ref_counter<
+ Node, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ struct search_result_t {
+ bool is_end() const { return p_cursor->is_end(); }
+ Ref<tree_cursor_t> p_cursor;
+ match_stat_t mstat;
+
+ MatchKindBS match() const {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE);
+ }
+
+ void validate_input_key(const key_hobj_t& key, value_magic_t magic) const {
+#ifndef NDEBUG
+ if (match() == MatchKindBS::EQ) {
+ assert(key == p_cursor->get_key_view(magic));
+ } else {
+ assert(match() == MatchKindBS::NE);
+ if (p_cursor->is_tracked()) {
+ assert(key < p_cursor->get_key_view(magic));
+ } else if (p_cursor->is_end()) {
+ // good
+ } else {
+ assert(p_cursor->is_invalid());
+ ceph_abort("impossible");
+ }
+ }
+#endif
+ }
+ };
+
+ virtual ~Node();
+ Node(const Node&) = delete;
+ Node(Node&&) = delete;
+ Node& operator=(const Node&) = delete;
+ Node& operator=(Node&&) = delete;
+
+ /**
+ * level
+ *
+   * Denotes the level (or height) of this node in the tree:
+   * 0 means LeafNode, a positive value means InternalNode.
+ */
+ level_t level() const;
+
+ /**
+ * lookup_smallest
+ *
+ * Returns a cursor pointing to the smallest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual eagain_ifuture<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0;
+
+ /**
+ * lookup_largest
+ *
+ * Returns a cursor pointing to the largest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual eagain_ifuture<Ref<tree_cursor_t>> lookup_largest(context_t) = 0;
+
+ /**
+ * lower_bound
+ *
+   * Returns a cursor pointing to the first element in the sub-tree which does
+   * not compare less than the input key. The result also denotes whether the
+   * key pointed to is equal to the input key.
+ *
+ * Returns an end cursor with MatchKindBS::NE if:
+ * - It is an empty root node;
+ * - Or the input key is larger than all the keys in the sub-tree;
+ */
+ eagain_ifuture<search_result_t> lower_bound(context_t c, const key_hobj_t& key);
+
+ /**
+ * insert
+ *
+ * Try to insert a key-value pair into the sub-tree formed by this node.
+ *
+ * Returns a boolean denoting whether the insertion is successful:
+ * - If true, the returned cursor points to the inserted element in tree;
+ * - If false, the returned cursor points to the conflicting element in tree;
+ */
+ eagain_ifuture<std::pair<Ref<tree_cursor_t>, bool>> insert(
+ context_t, const key_hobj_t&, value_config_t, Ref<Node>&&);
+
+ /**
+ * erase
+ *
+ * Removes a key-value pair from the sub-tree formed by this node.
+ *
+ * Returns the number of erased key-value pairs (0 or 1).
+ */
+ eagain_ifuture<std::size_t> erase(context_t, const key_hobj_t&, Ref<Node>&&);
+
+ /// Recursively collects the statistics of the sub-tree formed by this node
+ eagain_ifuture<tree_stats_t> get_tree_stats(context_t);
+
+ /// Returns an ostream containing a dump of all the elements in the node.
+ std::ostream& dump(std::ostream&) const;
+
+  /// Returns an ostream containing a one-line summary of this node.
+ std::ostream& dump_brief(std::ostream&) const;
+
+ /// Print the node name
+ const std::string& get_name() const;
+
+ /// Initializes the tree by allocating an empty root node.
+ static eagain_ifuture<> mkfs(context_t, RootNodeTracker&);
+
+ /// Loads the tree root. The tree must be initialized.
+ static eagain_ifuture<Ref<Node>> load_root(context_t, RootNodeTracker&);
+
+ // Only for unit test purposes.
+ void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&);
+ virtual eagain_ifuture<> test_clone_root(context_t, RootNodeTracker&) const = 0;
+
+ protected:
+ virtual eagain_ifuture<> test_clone_non_root(context_t, Ref<InternalNode>) const {
+ ceph_abort("impossible path");
+ }
+ virtual eagain_ifuture<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) = 0;
+ virtual eagain_ifuture<> do_get_tree_stats(context_t, tree_stats_t&) = 0;
+
+ virtual bool is_tracking() const = 0;
+
+ virtual void track_merge(Ref<Node>, match_stage_t, search_position_t&) = 0;
+
+ protected:
+ Node(NodeImplURef&&);
+
+ bool is_tracked() const {
+ assert(!(super && _parent_info.has_value()));
+ return (super || _parent_info.has_value());
+ }
+
+ bool is_root() const {
+ assert(is_tracked());
+ return !_parent_info.has_value();
+ }
+
+ // as root
+ void make_root(context_t c, Super::URef&& _super);
+ void make_root_new(context_t c, Super::URef&& _super) {
+ assert(_super->get_root_laddr() == L_ADDR_NULL);
+ make_root(c, std::move(_super));
+ }
+ void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) {
+ assert(_super->get_root_laddr() == from_addr);
+ make_root(c, std::move(_super));
+ }
+ void as_root(Super::URef&& _super);
+ eagain_ifuture<> upgrade_root(context_t, laddr_t);
+
+ Super::URef deref_super();
+
+ // as child/non-root
+ template <bool VALIDATE = true>
+ void as_child(const search_position_t&, Ref<InternalNode>);
+
+ struct parent_info_t {
+ search_position_t position;
+ Ref<InternalNode> ptr;
+ };
+ const parent_info_t& parent_info() const { return *_parent_info; }
+
+ Ref<InternalNode> deref_parent();
+
+ eagain_ifuture<> apply_split_to_parent(context_t, Ref<Node>&&, Ref<Node>&&, bool);
+ eagain_ifuture<Ref<tree_cursor_t>> get_next_cursor_from_parent(context_t);
+ template <bool FORCE_MERGE = false>
+ eagain_ifuture<> try_merge_adjacent(context_t, bool, Ref<Node>&&);
+ eagain_ifuture<> erase_node(context_t, Ref<Node>&&);
+ template <bool FORCE_MERGE = false>
+ eagain_ifuture<> fix_parent_index(context_t, Ref<Node>&&, bool);
+ eagain_ifuture<NodeExtentMutable> rebuild_extent(context_t);
+ eagain_ifuture<> retire(context_t, Ref<Node>&&);
+ void make_tail(context_t);
+
+ private:
+ /**
+ * Reversed resource management (Node)
+ *
+ * Root Node holds a reference to its parent Super class, so its parent
+ * will be alive as long as this root node is alive.
+ *
+   * Non-root Node holds a reference to its parent Node, so its parent will
+   * be alive as long as any of its children is alive.
+ */
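+  /*
+   * Editorial note, not part of the original header: combined with the
+   * tree_cursor_t section above, the chain of references is
+   * tree_cursor_t -> LeafNode -> parent InternalNode -> ... -> root -> Super,
+   * so a single live cursor keeps its whole path up to the root (and the
+   * root's Super) alive and tracked.
+   */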
+ // as root
+ Super::URef super;
+ // as child/non-root
+ std::optional<parent_info_t> _parent_info;
+
+ private:
+ static eagain_ifuture<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail);
+
+ NodeImplURef impl;
+ friend class InternalNode;
+};
+inline std::ostream& operator<<(std::ostream& os, const Node& node) {
+ return node.dump_brief(os);
+}
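+
+// Editorial note, not part of the original header: the stream operator above
+// delegates to dump_brief(), so a node can be written to any std::ostream
+// directly, e.g. `os << *node` for a valid Ref<Node> (illustrative usage only).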
+
+/**
+ * InternalNode
+ *
+ * A concrete implementation of Node class that represents an internal tree
+ * node. Its level is always positive and its values are logical block
+ * addresses to its child nodes. An internal node cannot be empty.
+ */
+class InternalNode final : public Node {
+ public:
+ // public to Node
+ InternalNode(InternalNodeImpl*, NodeImplURef&&);
+ ~InternalNode() override { assert(tracked_child_nodes.empty()); }
+ InternalNode(const InternalNode&) = delete;
+ InternalNode(InternalNode&&) = delete;
+ InternalNode& operator=(const InternalNode&) = delete;
+ InternalNode& operator=(InternalNode&&) = delete;
+
+ eagain_ifuture<Ref<tree_cursor_t>> get_next_cursor(context_t, const search_position_t&);
+
+ eagain_ifuture<> apply_child_split(context_t, Ref<Node>&& left, Ref<Node>&& right, bool);
+
+ template <bool VALIDATE>
+ void do_track_child(Node& child) {
+ if constexpr (VALIDATE) {
+ validate_child(child);
+ }
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end());
+ tracked_child_nodes[child_pos] = &child;
+ }
+
+ void do_untrack_child(const Node& child) {
+ assert(check_is_tracking(child));
+ auto& child_pos = child.parent_info().position;
+ [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos);
+ assert(removed);
+ }
+
+ bool check_is_tracking(const Node& child) const {
+ auto& child_pos = child.parent_info().position;
+ auto found = tracked_child_nodes.find(child_pos);
+ if (found != tracked_child_nodes.end() && found->second == &child) {
+ assert(child.parent_info().ptr == this);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ eagain_ifuture<std::pair<Ref<Node>, Ref<Node>>> get_child_peers(
+ context_t, const search_position_t&);
+
+ eagain_ifuture<> erase_child(context_t, Ref<Node>&&);
+
+ template <bool FORCE_MERGE = false>
+ eagain_ifuture<> fix_index(context_t, Ref<Node>&&, bool);
+
+ template <bool FORCE_MERGE = false>
+ eagain_ifuture<> apply_children_merge(
+ context_t, Ref<Node>&& left, laddr_t, Ref<Node>&& right, bool update_index);
+
+ void validate_child_tracked(const Node& child) const {
+ validate_child(child);
+ assert(tracked_child_nodes.find(child.parent_info().position) !=
+ tracked_child_nodes.end());
+ assert(tracked_child_nodes.find(child.parent_info().position)->second == &child);
+ }
+
+ void validate_child_inconsistent(const Node& child) const;
+
+ void validate_tracked_children() const {
+#ifndef NDEBUG
+ for (auto& kv : tracked_child_nodes) {
+ assert(kv.first == kv.second->parent_info().position);
+ validate_child(*kv.second);
+ }
+#endif
+ }
+
+ void track_make_tail(const search_position_t&);
+
+ static eagain_ifuture<Ref<InternalNode>> allocate_root(
+ context_t, laddr_t, level_t, laddr_t, Super::URef&&);
+
+ protected:
+ eagain_ifuture<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ eagain_ifuture<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ eagain_ifuture<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ eagain_ifuture<> do_get_tree_stats(context_t, tree_stats_t&) override;
+ bool is_tracking() const override {
+ return !tracked_child_nodes.empty();
+ }
+ void track_merge(Ref<Node>, match_stage_t, search_position_t&) override;
+
+ eagain_ifuture<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ eagain_ifuture<> try_downgrade_root(context_t, Ref<Node>&&);
+
+ eagain_ifuture<Ref<InternalNode>> insert_or_split(
+ context_t, const search_position_t&, const key_view_t&, Ref<Node>,
+ Ref<Node> outdated_child=nullptr);
+
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ eagain_ifuture<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t);
+ template <bool VALIDATE = true>
+ void track_insert(
+ const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr);
+ void replace_track(Ref<Node> new_child, Ref<Node> old_child, bool);
+ void track_split(const search_position_t&, Ref<InternalNode>);
+ template <bool VALIDATE = true>
+ void track_erase(const search_position_t&, match_stage_t);
+ void validate_child(const Node& child) const;
+
+ struct fresh_node_t {
+ Ref<InternalNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static eagain_ifuture<fresh_node_t> allocate(context_t, laddr_t, field_type_t, bool, level_t);
+
+ private:
+ /**
+ * Reversed resource management (InternalNode)
+ *
+   * InternalNode keeps track of its child nodes which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, Node*> tracked_child_nodes;
+ InternalNodeImpl* impl;
+};
+
+/**
+ * LeafNode
+ *
+ * A concrete implementation of the Node class that represents a leaf tree
+ * node. Its level is always 0. A leaf node can only be empty if it is the root.
+ */
+class LeafNode final : public Node {
+ public:
+ // public to tree_cursor_t
+ ~LeafNode() override { assert(tracked_cursors.empty()); }
+ LeafNode(const LeafNode&) = delete;
+ LeafNode(LeafNode&&) = delete;
+ LeafNode& operator=(const LeafNode&) = delete;
+ LeafNode& operator=(LeafNode&&) = delete;
+
+ bool is_level_tail() const;
+ node_version_t get_version() const;
+ const char* read() const;
+ extent_len_t get_node_size() const;
+ std::tuple<key_view_t, const value_header_t*> get_kv(const search_position_t&) const;
+ eagain_ifuture<Ref<tree_cursor_t>> get_next_cursor(context_t, const search_position_t&);
+
+ /**
+ * erase
+ *
+ * Removes a key-value pair from the position.
+ *
+   * If get_next is true, returns the cursor pointing to the next key-value
+   * pair that followed the erased element, which can be nullptr if it is the end.
+ */
+ template <bool FORCE_MERGE>
+ eagain_ifuture<Ref<tree_cursor_t>> erase(
+ context_t, const search_position_t&, bool get_next);
+
+ template <bool VALIDATE>
+ void do_track_cursor(tree_cursor_t& cursor) {
+ if constexpr (VALIDATE) {
+ validate_cursor(cursor);
+ }
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end());
+ tracked_cursors.emplace(cursor_pos, &cursor);
+ }
+ void do_untrack_cursor(const tree_cursor_t& cursor) {
+ validate_cursor(cursor);
+ auto& cursor_pos = cursor.get_position();
+ assert(check_is_tracking(cursor));
+ [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos);
+ assert(removed);
+ }
+ bool check_is_tracking(const tree_cursor_t& cursor) const {
+ auto& cursor_pos = cursor.get_position();
+ auto found = tracked_cursors.find(cursor_pos);
+ if (found != tracked_cursors.end() && found->second == &cursor) {
+ assert(cursor.ref_leaf_node == this);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ eagain_ifuture<> extend_value(context_t, const search_position_t&, value_size_t);
+ eagain_ifuture<> trim_value(context_t, const search_position_t&, value_size_t);
+
+ std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ prepare_mutate_value_payload(context_t);
+
+ protected:
+ eagain_ifuture<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ eagain_ifuture<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ eagain_ifuture<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ eagain_ifuture<> do_get_tree_stats(context_t, tree_stats_t&) override;
+ bool is_tracking() const override {
+ return !tracked_cursors.empty();
+ }
+ void track_merge(Ref<Node>, match_stage_t, search_position_t&) override;
+
+ eagain_ifuture<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ LeafNode(LeafNodeImpl*, NodeImplURef&&);
+ eagain_ifuture<Ref<tree_cursor_t>> insert_value(
+ context_t, const key_hobj_t&, value_config_t,
+ const search_position_t&, const MatchHistory&,
+ match_stat_t mstat);
+ static eagain_ifuture<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&);
+ friend class Node;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ Ref<tree_cursor_t> get_or_track_cursor(
+ const search_position_t&, const key_view_t&, const value_header_t*);
+ Ref<tree_cursor_t> track_insert(
+ const search_position_t&, match_stage_t, const value_header_t*);
+ void track_split(const search_position_t&, Ref<LeafNode>);
+ void track_erase(const search_position_t&, match_stage_t);
+ void validate_tracked_cursors() const {
+#ifndef NDEBUG
+ for (auto& kv : tracked_cursors) {
+ assert(kv.first == kv.second->get_position());
+ validate_cursor(*kv.second);
+ }
+#endif
+ }
+ void validate_cursor(const tree_cursor_t& cursor) const;
+ // invalidate p_value pointers in tree_cursor_t
+ void on_layout_change() { ++layout_version; }
+
+ struct fresh_node_t {
+ Ref<LeafNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static eagain_ifuture<fresh_node_t> allocate(context_t, laddr_t, field_type_t, bool);
+
+ private:
+ /**
+ * Reversed resource management (LeafNode)
+ *
+ * LeafNode keeps track of the referencing cursors which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, tree_cursor_t*> tracked_cursors;
+ LeafNodeImpl* impl;
+ layout_version_t layout_version = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
new file mode 100644
index 000000000..ea26195de
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/buffer.h"
+#include "node_types.h"
+#include "value.h"
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorder
+ *
+ * An abstract class that encapsulates the different implementations used to
+ * apply deltas to a specific node layout.
+ */
+class DeltaRecorder {
+ public:
+ virtual ~DeltaRecorder() {
+    /* May be non-empty if the transaction is abandoned without
+     * being submitted -- conflicts are a particularly common
+     * example (generally denoted by returning crimson::ct_error::eagain).
+ */
+ }
+
+ bool is_empty() const {
+ return encoded.length() == 0;
+ }
+
+ ceph::bufferlist get_delta() {
+ return std::move(encoded);
+ }
+
+ ValueDeltaRecorder* get_value_recorder() const {
+ assert(value_recorder);
+ return value_recorder.get();
+ }
+
+ virtual node_type_t node_type() const = 0;
+ virtual field_type_t field_type() const = 0;
+ virtual void apply_delta(ceph::bufferlist::const_iterator&,
+ NodeExtentMutable&,
+ const NodeExtent&) = 0;
+
+ protected:
+ DeltaRecorder() = default;
+ DeltaRecorder(const ValueBuilder& vb)
+ : value_recorder{vb.build_value_recorder(encoded)} {}
+
+ ceph::bufferlist encoded;
+ std::unique_ptr<ValueDeltaRecorder> value_recorder;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
new file mode 100644
index 000000000..1a03036d3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
@@ -0,0 +1,619 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/logging.h"
+
+#include "node_extent_manager.h"
+#include "node_delta_recorder.h"
+#include "node_layout_replayable.h"
+#include "value.h"
+
+#ifndef NDEBUG
+#include "node_extent_manager/test_replay.h"
+#endif
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorderT
+ *
+ * Responsible for encoding and decoding deltas, and for applying them to a
+ * specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class DeltaRecorderT final: public DeltaRecorder {
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_input_t = typename layout_t::value_input_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ ~DeltaRecorderT() override = default;
+
+ template <KeyT KT>
+ void encode_insert(
+ const full_key_t<KT>& key,
+ const value_input_t& value,
+ const position_t& insert_pos,
+ const match_stage_t& insert_stage,
+ const node_offset_t& insert_size) {
+ ceph::encode(node_delta_op_t::INSERT, encoded);
+ encode_key(key, encoded);
+ encode_value(value, encoded);
+ insert_pos.encode(encoded);
+ ceph::encode(insert_stage, encoded);
+ ceph::encode(insert_size, encoded);
+ }
+
+ void encode_split(
+ const StagedIterator& split_at,
+ const char* p_node_start) {
+ ceph::encode(node_delta_op_t::SPLIT, encoded);
+ split_at.encode(p_node_start, encoded);
+ }
+
+ template <KeyT KT>
+ void encode_split_insert(
+ const StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_input_t& value,
+ const position_t& insert_pos,
+ const match_stage_t& insert_stage,
+ const node_offset_t& insert_size,
+ const char* p_node_start) {
+ ceph::encode(node_delta_op_t::SPLIT_INSERT, encoded);
+ split_at.encode(p_node_start, encoded);
+ encode_key(key, encoded);
+ encode_value(value, encoded);
+ insert_pos.encode(encoded);
+ ceph::encode(insert_stage, encoded);
+ ceph::encode(insert_size, encoded);
+ }
+
+ void encode_update_child_addr(
+ const laddr_t new_addr,
+ const laddr_packed_t* p_addr,
+ const char* p_node_start,
+ extent_len_t node_size) {
+ ceph::encode(node_delta_op_t::UPDATE_CHILD_ADDR, encoded);
+ ceph::encode(new_addr, encoded);
+ int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start;
+ assert(node_offset > 0 && node_offset < (int)node_size);
+ ceph::encode(static_cast<node_offset_t>(node_offset), encoded);
+ }
+
+ void encode_erase(
+ const position_t& erase_pos) {
+ ceph::encode(node_delta_op_t::ERASE, encoded);
+ erase_pos.encode(encoded);
+ }
+
+ void encode_make_tail() {
+ ceph::encode(node_delta_op_t::MAKE_TAIL, encoded);
+ }
+
+ static DeltaRecorderURef create_for_encode(const ValueBuilder& v_builder) {
+ return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT(v_builder));
+ }
+
+ static DeltaRecorderURef create_for_replay() {
+ return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT());
+ }
+
+ protected:
+ DeltaRecorderT() : DeltaRecorder() {}
+ DeltaRecorderT(const ValueBuilder& vb) : DeltaRecorder(vb) {}
+ node_type_t node_type() const override { return NODE_TYPE; }
+ field_type_t field_type() const override { return FIELD_TYPE; }
+ void apply_delta(ceph::bufferlist::const_iterator& delta,
+ NodeExtentMutable& mut,
+ const NodeExtent& node) override {
+ LOG_PREFIX(OTree::Extent::Replay);
+ assert(is_empty());
+ node_stage_t stage(reinterpret_cast<const FieldType*>(mut.get_read()),
+ mut.get_length());
+ node_delta_op_t op;
+ try {
+ ceph::decode(op, delta);
+ switch (op) {
+ case node_delta_op_t::INSERT: {
+ SUBDEBUG(seastore_onode, "decoding INSERT ...");
+ auto key = key_hobj_t::decode(delta);
+ auto value = decode_value(delta);
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ SUBDEBUG(seastore_onode,
+ "apply {}, {}, insert_pos({}), insert_stage={}, "
+ "insert_size={}B ...",
+ key, value, insert_pos, insert_stage, insert_size);
+ layout_t::template insert<KeyT::HOBJ>(
+ mut, stage, key, value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case node_delta_op_t::SPLIT: {
+ SUBDEBUG(seastore_onode, "decoding SPLIT ...");
+ auto split_at = StagedIterator::decode(
+ mut.get_read(), mut.get_length(), delta);
+ SUBDEBUG(seastore_onode, "apply split_at={} ...", split_at);
+ layout_t::split(mut, stage, split_at);
+ break;
+ }
+ case node_delta_op_t::SPLIT_INSERT: {
+ SUBDEBUG(seastore_onode, "decoding SPLIT_INSERT ...");
+ auto split_at = StagedIterator::decode(
+ mut.get_read(), mut.get_length(), delta);
+ auto key = key_hobj_t::decode(delta);
+ auto value = decode_value(delta);
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ SUBDEBUG(seastore_onode,
+ "apply split_at={}, {}, {}, insert_pos({}), insert_stage={}, "
+ "insert_size={}B ...",
+ split_at, key, value, insert_pos, insert_stage, insert_size);
+ layout_t::template split_insert<KeyT::HOBJ>(
+ mut, stage, split_at, key, value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case node_delta_op_t::UPDATE_CHILD_ADDR: {
+ SUBDEBUG(seastore_onode, "decoding UPDATE_CHILD_ADDR ...");
+ laddr_t new_addr;
+ ceph::decode(new_addr, delta);
+ node_offset_t update_offset;
+ ceph::decode(update_offset, delta);
+ auto p_addr = reinterpret_cast<laddr_packed_t*>(
+ mut.get_write() + update_offset);
+ SUBDEBUG(seastore_onode,
+ "apply {:#x} to offset {:#x} ...",
+ new_addr, update_offset);
+ layout_t::update_child_addr(mut, new_addr, p_addr);
+ break;
+ }
+ case node_delta_op_t::ERASE: {
+ SUBDEBUG(seastore_onode, "decoding ERASE ...");
+ auto erase_pos = position_t::decode(delta);
+ SUBDEBUG(seastore_onode, "apply erase_pos({}) ...", erase_pos);
+ layout_t::erase(mut, stage, erase_pos);
+ break;
+ }
+ case node_delta_op_t::MAKE_TAIL: {
+ SUBDEBUG(seastore_onode, "decoded MAKE_TAIL, apply ...");
+ layout_t::make_tail(mut, stage);
+ break;
+ }
+ case node_delta_op_t::SUBOP_UPDATE_VALUE: {
+ SUBDEBUG(seastore_onode, "decoding SUBOP_UPDATE_VALUE ...");
+ node_offset_t value_header_offset;
+ ceph::decode(value_header_offset, delta);
+ auto p_header = mut.get_read() + value_header_offset;
+ auto p_header_ = reinterpret_cast<const value_header_t*>(p_header);
+ SUBDEBUG(seastore_onode, "update {} at {:#x} ...", *p_header_, value_header_offset);
+ auto payload_mut = p_header_->get_payload_mutable(mut);
+ auto value_addr = node.get_laddr() + payload_mut.get_node_offset();
+ get_value_replayer(p_header_->magic)->apply_value_delta(
+ delta, payload_mut, value_addr);
+ break;
+ }
+ default:
+ SUBERROR(seastore_onode,
+ "got unknown op {} when replay {}",
+ op, node);
+ ceph_abort("fatal error");
+ }
+ } catch (buffer::error& e) {
+ SUBERROR(seastore_onode,
+ "got decode error {} when replay {}",
+ e.what(), node);
+ ceph_abort("fatal error");
+ }
+ }
+
+ private:
+ ValueDeltaRecorder* get_value_replayer(value_magic_t magic) {
+    // The replay procedure is independent of the Btree and happens at a lower
+    // level in seastore. There is no ValueBuilder, so the recorder needs to
+    // build the ValueDeltaRecorder by itself.
+ if (value_replayer) {
+ if (value_replayer->get_header_magic() != magic) {
+ ceph_abort_msgf("OTree::Extent::Replay: value magic mismatch %x != %x",
+ value_replayer->get_header_magic(), magic);
+ }
+ } else {
+ value_replayer = build_value_recorder_by_type(encoded, magic);
+ if (!value_replayer) {
+ ceph_abort_msgf("OTree::Extent::Replay: got unexpected value magic = %x",
+ magic);
+ }
+ }
+ return value_replayer.get();
+ }
+
+ void encode_value(const value_input_t& value, ceph::bufferlist& encoded) const {
+ if constexpr (std::is_same_v<value_input_t, laddr_t>) {
+ // NODE_TYPE == node_type_t::INTERNAL
+ ceph::encode(value, encoded);
+ } else if constexpr (std::is_same_v<value_input_t, value_config_t>) {
+ // NODE_TYPE == node_type_t::LEAF
+ value.encode(encoded);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ value_input_t decode_value(ceph::bufferlist::const_iterator& delta) const {
+ if constexpr (std::is_same_v<value_input_t, laddr_t>) {
+ // NODE_TYPE == node_type_t::INTERNAL
+ laddr_t value;
+ ceph::decode(value, delta);
+ return value;
+ } else if constexpr (std::is_same_v<value_input_t, value_config_t>) {
+ // NODE_TYPE == node_type_t::LEAF
+ return value_config_t::decode(delta);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::unique_ptr<ValueDeltaRecorder> value_replayer;
+};
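+// Usage sketch for illustration only (assumes a ValueBuilder `vb`, a mutable
+// view `mut` and its NodeExtent `node` exist at the call site; the concrete
+// field/node types below are just one valid instantiation):
+//
+//   auto recorder = DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>
+//       ::create_for_encode(vb);
+//   // ... mirror each in-place mutation with encode_insert()/encode_split()/...
+//   ceph::bufferlist delta = recorder->get_delta();
+//   auto replayer = DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>
+//       ::create_for_replay();
+//   auto it = delta.cbegin();
+//   replayer->apply_delta(it, mut, node);  // reproduces the same mutation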
+
+/**
+ * NodeExtentAccessorT
+ *
+ * This component is responsible for referencing and mutating the underlying
+ * NodeExtent, recording mutation parameters when needed, and applying the
+ * recorded modifications for a specific node layout.
+ *
+ * For possible internal states, see node_types.h.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeExtentAccessorT {
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_input_t = typename layout_t::value_input_t;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ NodeExtentAccessorT(NodeExtentRef extent)
+ : extent{extent},
+ node_stage{reinterpret_cast<const FieldType*>(extent->get_read()),
+ extent->get_length()} {
+ assert(is_valid_node_size(extent->get_length()));
+ if (extent->is_initial_pending()) {
+ state = nextent_state_t::FRESH;
+ mut.emplace(extent->get_mutable());
+ assert(extent->get_recorder() == nullptr);
+ recorder = nullptr;
+ } else if (extent->is_mutation_pending()) {
+ state = nextent_state_t::MUTATION_PENDING;
+ mut.emplace(extent->get_mutable());
+ auto p_recorder = extent->get_recorder();
+ assert(p_recorder != nullptr);
+ assert(p_recorder->node_type() == NODE_TYPE);
+ assert(p_recorder->field_type() == FIELD_TYPE);
+ recorder = static_cast<recorder_t*>(p_recorder);
+ } else if (!extent->is_mutable() && extent->is_valid()) {
+ state = nextent_state_t::READ_ONLY;
+ // mut is empty
+ assert(extent->get_recorder() == nullptr ||
+ extent->get_recorder()->is_empty());
+ recorder = nullptr;
+ } else {
+ // extent is invalid or retired
+ ceph_abort("impossible path");
+ }
+#ifndef NDEBUG
+ auto ref_recorder = recorder_t::create_for_replay();
+ test_recorder = static_cast<recorder_t*>(ref_recorder.get());
+ test_extent = TestReplayExtent::create(
+ get_length(), std::move(ref_recorder));
+#endif
+ }
+ ~NodeExtentAccessorT() = default;
+ NodeExtentAccessorT(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT(NodeExtentAccessorT&&) = delete;
+ NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete;
+
+ const node_stage_t& read() const { return node_stage; }
+ laddr_t get_laddr() const { return extent->get_laddr(); }
+ extent_len_t get_length() const {
+ auto len = extent->get_length();
+ assert(is_valid_node_size(len));
+ return len;
+ }
+ nextent_state_t get_state() const {
+ assert(!is_retired());
+ // we cannot rely on the underlying extent state because
+ // FRESH/MUTATION_PENDING can become DIRTY after transaction submission.
+ return state;
+ }
+
+ bool is_retired() const {
+ if (extent) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+  // Must be called before any mutate attempts.
+  // For safety when mixing reads and mutations, call this before reading.
+ void prepare_mutate(context_t c) {
+ assert(!is_retired());
+ if (state == nextent_state_t::READ_ONLY) {
+ assert(!extent->is_mutable());
+ auto ref_recorder = recorder_t::create_for_encode(c.vb);
+ recorder = static_cast<recorder_t*>(ref_recorder.get());
+ extent = extent->mutate(c, std::move(ref_recorder));
+ state = nextent_state_t::MUTATION_PENDING;
+ assert(extent->is_mutation_pending());
+ node_stage = node_stage_t(reinterpret_cast<const FieldType*>(extent->get_read()),
+ get_length());
+ assert(recorder == static_cast<recorder_t*>(extent->get_recorder()));
+ mut.emplace(extent->get_mutable());
+ }
+ assert(extent->is_mutable());
+ }
+
+ template <KeyT KT>
+ const value_t* insert_replayable(
+ const full_key_t<KT>& key,
+ const value_input_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ assert(extent->is_mutable());
+ assert(state != nextent_state_t::READ_ONLY);
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ recorder->template encode_insert<KT>(
+ key, value, insert_pos, insert_stage, insert_size);
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_insert<KT>(
+ key, value, insert_pos, insert_stage, insert_size);
+#endif
+ auto ret = layout_t::template insert<KT>(
+ *mut, read(), key, value,
+ insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ void split_replayable(StagedIterator& split_at) {
+ assert(extent->is_mutable());
+ assert(state != nextent_state_t::READ_ONLY);
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ recorder->encode_split(split_at, read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->encode_split(split_at, read().p_start());
+#endif
+ layout_t::split(*mut, read(), split_at);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ }
+
+ template <KeyT KT>
+ const value_t* split_insert_replayable(
+ StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_input_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ assert(extent->is_mutable());
+ assert(state != nextent_state_t::READ_ONLY);
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ recorder->template encode_split_insert<KT>(
+ split_at, key, value, insert_pos, insert_stage, insert_size,
+ read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_split_insert<KT>(
+ split_at, key, value, insert_pos, insert_stage, insert_size,
+ read().p_start());
+#endif
+ auto ret = layout_t::template split_insert<KT>(
+ *mut, read(), split_at, key, value,
+ insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ void update_child_addr_replayable(
+ const laddr_t new_addr, laddr_packed_t* p_addr) {
+ assert(extent->is_mutable());
+ assert(state != nextent_state_t::READ_ONLY);
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ recorder->encode_update_child_addr(
+ new_addr, p_addr, read().p_start(), get_length());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->encode_update_child_addr(
+ new_addr, p_addr, read().p_start(), get_length());
+#endif
+ layout_t::update_child_addr(*mut, new_addr, p_addr);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ }
+
+ std::tuple<match_stage_t, position_t> erase_replayable(const position_t& pos) {
+ assert(extent->is_mutable());
+ assert(state != nextent_state_t::READ_ONLY);
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ recorder->encode_erase(pos);
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->encode_erase(pos);
+#endif
+ auto ret = layout_t::erase(*mut, read(), pos);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ position_t make_tail_replayable() {
+ assert(extent->is_mutable());
+ assert(state != nextent_state_t::READ_ONLY);
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ recorder->encode_make_tail();
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->encode_make_tail();
+#endif
+ auto ret = layout_t::make_tail(*mut, read());
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ prepare_mutate_value_payload(context_t c) {
+ prepare_mutate(c);
+ ValueDeltaRecorder* p_value_recorder = nullptr;
+ if (state == nextent_state_t::MUTATION_PENDING) {
+ p_value_recorder = recorder->get_value_recorder();
+ }
+ return {*mut, p_value_recorder};
+ }
+
+ void test_copy_to(NodeExtentMutable& to) const {
+ assert(extent->get_length() == to.get_length());
+ std::memcpy(to.get_write(), extent->get_read(), get_length());
+ }
+
+ eagain_ifuture<NodeExtentMutable> rebuild(context_t c, laddr_t hint) {
+ LOG_PREFIX(OTree::Extent::rebuild);
+ assert(!is_retired());
+ if (state == nextent_state_t::FRESH) {
+ assert(extent->is_initial_pending());
+ // already fresh and no need to record
+ return eagain_iertr::make_ready_future<NodeExtentMutable>(*mut);
+ }
+ assert(!extent->is_initial_pending());
+ auto alloc_size = get_length();
+ return c.nm.alloc_extent(c.t, hint, alloc_size
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle(
+ [FNAME, c, alloc_size, l_to_discard = extent->get_laddr()] {
+ SUBERRORT(seastore_onode,
+ "EIO during allocate -- node_size={}, to_discard={:x}",
+ c.t, alloc_size, l_to_discard);
+ ceph_abort("fatal error");
+ })
+ ).si_then([this, c, FNAME] (auto fresh_extent) {
+ SUBDEBUGT(seastore_onode,
+ "update addr from {:#x} to {:#x} ...",
+ c.t, extent->get_laddr(), fresh_extent->get_laddr());
+ assert(fresh_extent);
+ assert(fresh_extent->is_initial_pending());
+ assert(fresh_extent->get_recorder() == nullptr);
+ assert(get_length() == fresh_extent->get_length());
+ auto fresh_mut = fresh_extent->get_mutable();
+ std::memcpy(fresh_mut.get_write(), extent->get_read(), get_length());
+ NodeExtentRef to_discard = extent;
+
+ extent = fresh_extent;
+ node_stage = node_stage_t(reinterpret_cast<const FieldType*>(extent->get_read()),
+ get_length());
+ state = nextent_state_t::FRESH;
+ mut.emplace(fresh_mut);
+ recorder = nullptr;
+
+ return c.nm.retire_extent(c.t, to_discard
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle(
+ [FNAME, c, l_to_discard = to_discard->get_laddr(),
+ l_fresh = fresh_extent->get_laddr()] {
+ SUBERRORT(seastore_onode,
+              "EIO during retire -- to_discard={:x}, fresh={:x}",
+ c.t, l_to_discard, l_fresh);
+ ceph_abort("fatal error");
+ }),
+ crimson::ct_error::enoent::handle(
+ [FNAME, c, l_to_discard = to_discard->get_laddr(),
+ l_fresh = fresh_extent->get_laddr()] {
+ SUBERRORT(seastore_onode,
+              "ENOENT during retire -- to_discard={:x}, fresh={:x}",
+ c.t, l_to_discard, l_fresh);
+ ceph_abort("fatal error");
+ })
+ );
+ }).si_then([this, c] {
+ boost::ignore_unused(c); // avoid clang warning;
+ assert(!c.t.is_conflicted());
+ return *mut;
+ });
+ }
+
+ eagain_ifuture<> retire(context_t c) {
+ LOG_PREFIX(OTree::Extent::retire);
+ assert(!is_retired());
+ auto addr = extent->get_laddr();
+ return c.nm.retire_extent(c.t, std::move(extent)
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle(
+ [FNAME, c, addr] {
+ SUBERRORT(seastore_onode, "EIO -- addr={:x}", c.t, addr);
+ ceph_abort("fatal error");
+ }),
+ crimson::ct_error::enoent::handle(
+ [FNAME, c, addr] {
+ SUBERRORT(seastore_onode, "ENOENT -- addr={:x}", c.t, addr);
+ ceph_abort("fatal error");
+ })
+#ifndef NDEBUG
+ ).si_then([c] {
+ assert(!c.t.is_conflicted());
+ }
+#endif
+ );
+ }
+
+ private:
+ NodeExtentRef extent;
+ node_stage_t node_stage;
+ nextent_state_t state;
+ std::optional<NodeExtentMutable> mut;
+ // owned by extent
+ recorder_t* recorder;
+
+#ifndef NDEBUG
+ // verify record replay using a different memory block
+ TestReplayExtent::Ref test_extent;
+ recorder_t* test_recorder;
+#endif
+};
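+// Usage sketch for illustration only (`extent`, `c`, `key`, `value`, `pos`,
+// `stage` and `size` stand for the NodeExtentRef, context_t and insert
+// parameters computed by the caller; they are not defined here):
+//
+//   NodeExtentAccessorT<node_fields_0_t, node_type_t::LEAF> accessor(extent);
+//   accessor.prepare_mutate(c);   // duplicates the extent if it is READ_ONLY
+//   auto* p_value = accessor.template insert_replayable<KeyT::HOBJ>(
+//       key, value, pos, stage, size);  // mutates in place and records delta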
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
new file mode 100644
index 000000000..8e6f16a74
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_manager.h"
+
+#include "node_extent_manager/dummy.h"
+#include "node_extent_manager/seastore.h"
+
+namespace crimson::os::seastore::onode {
+
+NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync)
+{
+ if (is_sync) {
+ return NodeExtentManagerURef(new DummyNodeExtentManager<true>());
+ } else {
+ return NodeExtentManagerURef(new DummyNodeExtentManager<false>());
+ }
+}
+
+NodeExtentManagerURef NodeExtentManager::create_seastore(
+ TransactionManager &tm, laddr_t min_laddr, double p_eagain)
+{
+ if (p_eagain == 0.0) {
+ return NodeExtentManagerURef(
+ new SeastoreNodeExtentManager<false>(tm, min_laddr, p_eagain));
+ } else {
+ return NodeExtentManagerURef(
+ new SeastoreNodeExtentManager<true>(tm, min_laddr, p_eagain));
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
new file mode 100644
index 000000000..f8772929c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#include "fwd.h"
+#include "node_extent_mutable.h"
+#include "node_types.h"
+#include "stages/node_stage_layout.h"
+#include "super.h"
+
+/**
+ * node_extent_manager.h
+ *
+ * Contains general interfaces for different backends (Dummy and Seastore).
+ */
+
+namespace crimson::os::seastore::onode {
+
+using crimson::os::seastore::LogicalCachedExtent;
+class NodeExtent : public LogicalCachedExtent {
+ public:
+ virtual ~NodeExtent() = default;
+ const node_header_t& get_header() const {
+ return *reinterpret_cast<const node_header_t*>(get_read());
+ }
+ const char* get_read() const {
+ return get_bptr().c_str();
+ }
+ NodeExtentMutable get_mutable() {
+ assert(is_mutable());
+ return do_get_mutable();
+ }
+
+ virtual DeltaRecorder* get_recorder() const = 0;
+ virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0;
+
+ protected:
+ template <typename... T>
+ NodeExtent(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {}
+
+ NodeExtentMutable do_get_mutable() {
+ return NodeExtentMutable(get_bptr().c_str(), get_length());
+ }
+
+ std::ostream& print_detail_l(std::ostream& out) const final {
+ return out << ", fltree_header=" << get_header();
+ }
+
+ /**
+ * Abstracted interfaces to implement:
+ * - CacheExtent::duplicate_for_write() -> CachedExtentRef
+ * - CacheExtent::get_type() -> extent_types_t
+ * - CacheExtent::get_delta() -> ceph::bufferlist
+ * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void
+ */
+};
+
+using crimson::os::seastore::TransactionManager;
+class NodeExtentManager {
+ using base_iertr = TransactionManager::base_iertr;
+ public:
+ virtual ~NodeExtentManager() = default;
+
+ virtual bool is_read_isolated() const = 0;
+
+ using read_iertr = base_iertr::extend<
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual read_iertr::future<NodeExtentRef> read_extent(
+ Transaction&, laddr_t) = 0;
+
+ using alloc_iertr = base_iertr;
+ virtual alloc_iertr::future<NodeExtentRef> alloc_extent(
+ Transaction&, laddr_t hint, extent_len_t) = 0;
+
+ using retire_iertr = base_iertr::extend<
+ crimson::ct_error::enoent>;
+ virtual retire_iertr::future<> retire_extent(
+ Transaction&, NodeExtentRef) = 0;
+
+ using getsuper_iertr = base_iertr;
+ virtual getsuper_iertr::future<Super::URef> get_super(
+ Transaction&, RootNodeTracker&) = 0;
+
+ virtual std::ostream& print(std::ostream& os) const = 0;
+
+ static NodeExtentManagerURef create_dummy(bool is_sync);
+ static NodeExtentManagerURef create_seastore(
+ TransactionManager &tm, laddr_t min_laddr = L_ADDR_MIN, double p_eagain = 0.0);
+};
+inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) {
+ return nm.print(os);
+}
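+// Interface sketch for illustration only (`t` is a Transaction; `hint`,
+// `addr` and `extent` are provided by the caller and not defined here):
+//
+//   auto nm = NodeExtentManager::create_dummy(true /* synchronous */);
+//   nm->alloc_extent(t, hint, 4096);          // allocate a 4KiB node extent
+//   nm->read_extent(t, addr);                 // load a node by logical address
+//   nm->retire_extent(t, std::move(extent));  // release it when the node dies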
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::onode::NodeExtent> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
new file mode 100644
index 000000000..24df8b548
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <chrono>
+#include <seastar/core/sleep.hh>
+
+#include "include/buffer_raw.h"
+#include "crimson/os/seastore/logging.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/**
+ * dummy.h
+ *
+ * Dummy backend implementations for test purposes.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class DummySuper final: public Super {
+ public:
+ DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr)
+ : Super(t, tracker), p_root_laddr{p_root_laddr} {}
+ ~DummySuper() override = default;
+ protected:
+ laddr_t get_root_laddr() const override { return *p_root_laddr; }
+ void write_root_laddr(context_t c, laddr_t addr) override {
+ LOG_PREFIX(OTree::Dummy);
+ SUBDEBUGT(seastore_onode, "update root {:#x} ...", c.t, addr);
+ *p_root_laddr = addr;
+ }
+ private:
+ laddr_t* p_root_laddr;
+};
+
+class DummyNodeExtent final: public NodeExtent {
+ public:
+ DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) {
+ state = extent_state_t::INITIAL_WRITE_PENDING;
+ }
+ DummyNodeExtent(const DummyNodeExtent& other) = delete;
+ ~DummyNodeExtent() override = default;
+
+ void retire() {
+ assert(state == extent_state_t::INITIAL_WRITE_PENDING);
+ state = extent_state_t::INVALID;
+ bufferptr empty_bptr;
+ get_bptr().swap(empty_bptr);
+ }
+
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+ ceph_abort("impossible path"); }
+ DeltaRecorder* get_recorder() const override {
+ return nullptr; }
+ CachedExtentRef duplicate_for_write(Transaction&) override {
+ ceph_abort("impossible path"); }
+ extent_types_t get_type() const override {
+ return extent_types_t::TEST_BLOCK; }
+ ceph::bufferlist get_delta() override {
+ ceph_abort("impossible path"); }
+ void apply_delta(const ceph::bufferlist&) override {
+ ceph_abort("impossible path"); }
+};
+
+template <bool SYNC>
+class DummyNodeExtentManager final: public NodeExtentManager {
+ static constexpr size_t ALIGNMENT = 4096;
+ public:
+ ~DummyNodeExtentManager() override = default;
+ std::size_t size() const { return allocate_map.size(); }
+
+ protected:
+ bool is_read_isolated() const override { return false; }
+
+ read_iertr::future<NodeExtentRef> read_extent(
+ Transaction& t, laddr_t addr) override {
+ SUBTRACET(seastore_onode, "reading at {:#x} ...", t, addr);
+ if constexpr (SYNC) {
+ return read_extent_sync(t, addr);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, addr] {
+ return read_extent_sync(t, addr);
+ });
+ }
+ }
+
+ alloc_iertr::future<NodeExtentRef> alloc_extent(
+ Transaction& t, laddr_t hint, extent_len_t len) override {
+ SUBTRACET(seastore_onode, "allocating {}B with hint {:#x} ...", t, len, hint);
+ if constexpr (SYNC) {
+ return alloc_extent_sync(t, len);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, len] {
+ return alloc_extent_sync(t, len);
+ });
+ }
+ }
+
+ retire_iertr::future<> retire_extent(
+ Transaction& t, NodeExtentRef extent) override {
+ SUBTRACET(seastore_onode,
+ "retiring {}B at {:#x} -- {} ...",
+ t, extent->get_length(), extent->get_laddr(), *extent);
+ if constexpr (SYNC) {
+ return retire_extent_sync(t, extent);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, extent] {
+ return retire_extent_sync(t, extent);
+ });
+ }
+ }
+
+ getsuper_iertr::future<Super::URef> get_super(
+ Transaction& t, RootNodeTracker& tracker) override {
+ SUBTRACET(seastore_onode, "get root ...", t);
+ if constexpr (SYNC) {
+ return get_super_sync(t, tracker);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, &tracker] {
+ return get_super_sync(t, tracker);
+ });
+ }
+ }
+
+ std::ostream& print(std::ostream& os) const override {
+ return os << "DummyNodeExtentManager(sync=" << SYNC << ")";
+ }
+
+ private:
+ read_iertr::future<NodeExtentRef> read_extent_sync(
+ Transaction& t, laddr_t addr) {
+ auto iter = allocate_map.find(addr);
+ assert(iter != allocate_map.end());
+ auto extent = iter->second;
+ SUBTRACET(seastore_onode,
+ "read {}B at {:#x} -- {}",
+ t, extent->get_length(), extent->get_laddr(), *extent);
+ assert(extent->get_laddr() == addr);
+ return read_iertr::make_ready_future<NodeExtentRef>(extent);
+ }
+
+ alloc_iertr::future<NodeExtentRef> alloc_extent_sync(
+ Transaction& t, extent_len_t len) {
+ assert(len % ALIGNMENT == 0);
+ auto r = ceph::buffer::create_aligned(len, ALIGNMENT);
+ auto addr = reinterpret_cast<laddr_t>(r->get_data());
+ auto bp = ceph::bufferptr(std::move(r));
+ auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp)));
+ extent->set_laddr(addr);
+ assert(allocate_map.find(extent->get_laddr()) == allocate_map.end());
+ allocate_map.insert({extent->get_laddr(), extent});
+ SUBDEBUGT(seastore_onode,
+ "allocated {}B at {:#x} -- {}",
+ t, extent->get_length(), extent->get_laddr(), *extent);
+ assert(extent->get_length() == len);
+ return alloc_iertr::make_ready_future<NodeExtentRef>(extent);
+ }
+
+ retire_iertr::future<> retire_extent_sync(
+ Transaction& t, NodeExtentRef _extent) {
+ auto& extent = static_cast<DummyNodeExtent&>(*_extent.get());
+ auto addr = extent.get_laddr();
+ auto len = extent.get_length();
+ extent.retire();
+ auto iter = allocate_map.find(addr);
+ assert(iter != allocate_map.end());
+ allocate_map.erase(iter);
+ SUBDEBUGT(seastore_onode, "retired {}B at {:#x}", t, len, addr);
+ return retire_iertr::now();
+ }
+
+ getsuper_iertr::future<Super::URef> get_super_sync(
+ Transaction& t, RootNodeTracker& tracker) {
+ SUBTRACET(seastore_onode, "got root {:#x}", t, root_laddr);
+ return getsuper_iertr::make_ready_future<Super::URef>(
+ Super::URef(new DummySuper(t, tracker, &root_laddr)));
+ }
+
+ static LOG_PREFIX(OTree::Dummy);
+
+ std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map;
+ laddr_t root_laddr = L_ADDR_NULL;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::onode::DummyNodeExtent> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
new file mode 100644
index 000000000..3b52c5dc0
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h"
+
+namespace {
+LOG_PREFIX(OTree::Seastore);
+}
+
+SET_SUBSYS(seastore_onode);
+
+namespace crimson::os::seastore::onode {
+
+static DeltaRecorderURef create_replay_recorder(
+ node_type_t node_type, field_type_t field_type)
+{
+ if (node_type == node_type_t::LEAF) {
+ if (field_type == field_type_t::N0) {
+ return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create_for_replay();
+ } else if (field_type == field_type_t::N1) {
+ return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create_for_replay();
+ } else if (field_type == field_type_t::N2) {
+ return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create_for_replay();
+ } else if (field_type == field_type_t::N3) {
+ return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create_for_replay();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else if (node_type == node_type_t::INTERNAL) {
+ if (field_type == field_type_t::N0) {
+ return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create_for_replay();
+ } else if (field_type == field_type_t::N1) {
+ return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create_for_replay();
+ } else if (field_type == field_type_t::N2) {
+ return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create_for_replay();
+ } else if (field_type == field_type_t::N3) {
+ return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create_for_replay();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+NodeExtentRef SeastoreNodeExtent::mutate(
+ context_t c, DeltaRecorderURef&& _recorder)
+{
+ DEBUGT("mutate {} ...", c.t, *this);
+ auto p_handle = static_cast<TransactionManagerHandle*>(&c.nm);
+ auto extent = p_handle->tm.get_mutable_extent(c.t, this);
+ auto ret = extent->cast<SeastoreNodeExtent>();
+  // A replayed extent may already have an empty recorder; we discard it for
+  // simplicity.
+ assert(!ret->recorder || ret->recorder->is_empty());
+ ret->recorder = std::move(_recorder);
+ return ret;
+}
+
+void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl)
+{
+ DEBUG("replay {} ...", *this);
+ if (!recorder) {
+ auto header = get_header();
+ auto field_type = header.get_field_type();
+ if (!field_type.has_value()) {
+ ERROR("replay got invalid node -- {}", *this);
+ ceph_abort("fatal error");
+ }
+ auto node_type = header.get_node_type();
+ recorder = create_replay_recorder(node_type, *field_type);
+ } else {
+#ifndef NDEBUG
+ auto header = get_header();
+ assert(recorder->node_type() == header.get_node_type());
+ assert(recorder->field_type() == *header.get_field_type());
+#endif
+ }
+ auto mut = do_get_mutable();
+ auto p = bl.cbegin();
+ while (p != bl.end()) {
+ recorder->apply_delta(p, mut, *this);
+ }
+  DEBUG("replay done!");
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
new file mode 100644
index 000000000..f7cfa8c21
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
@@ -0,0 +1,223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <random>
+
+#include "crimson/os/seastore/logging.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+
+/**
+ * seastore.h
+ *
+ * Seastore backend implementations.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class SeastoreSuper final: public Super {
+ public:
+ SeastoreSuper(Transaction& t, RootNodeTracker& tracker,
+ laddr_t root_addr, TransactionManager& tm)
+ : Super(t, tracker), root_addr{root_addr}, tm{tm} {}
+ ~SeastoreSuper() override = default;
+ protected:
+ laddr_t get_root_laddr() const override {
+ return root_addr;
+ }
+ void write_root_laddr(context_t c, laddr_t addr) override {
+ LOG_PREFIX(OTree::Seastore);
+ SUBDEBUGT(seastore_onode, "update root {:#x} ...", c.t, addr);
+ root_addr = addr;
+ tm.write_onode_root(c.t, addr);
+ }
+ private:
+ laddr_t root_addr;
+ TransactionManager &tm;
+};
+
+class SeastoreNodeExtent final: public NodeExtent {
+ public:
+ SeastoreNodeExtent(ceph::bufferptr &&ptr)
+ : NodeExtent(std::move(ptr)) {}
+ SeastoreNodeExtent(const SeastoreNodeExtent& other)
+ : NodeExtent(other) {}
+ ~SeastoreNodeExtent() override = default;
+
+ constexpr static extent_types_t TYPE = extent_types_t::ONODE_BLOCK_STAGED;
+ extent_types_t get_type() const override {
+ return TYPE;
+ }
+
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override;
+
+ DeltaRecorder* get_recorder() const override {
+ return recorder.get();
+ }
+
+ CachedExtentRef duplicate_for_write(Transaction&) override {
+ return CachedExtentRef(new SeastoreNodeExtent(*this));
+ }
+ ceph::bufferlist get_delta() override {
+ assert(recorder);
+ return recorder->get_delta();
+ }
+ void apply_delta(const ceph::bufferlist&) override;
+
+ private:
+ DeltaRecorderURef recorder;
+};
+
+class TransactionManagerHandle : public NodeExtentManager {
+ public:
+ TransactionManagerHandle(TransactionManager &tm) : tm{tm} {}
+ TransactionManager &tm;
+};
+
+template <bool INJECT_EAGAIN=false>
+class SeastoreNodeExtentManager final: public TransactionManagerHandle {
+ public:
+ SeastoreNodeExtentManager(
+ TransactionManager &tm, laddr_t min, double p_eagain)
+ : TransactionManagerHandle(tm), addr_min{min}, p_eagain{p_eagain} {
+ if constexpr (INJECT_EAGAIN) {
+ assert(p_eagain > 0.0 && p_eagain < 1.0);
+ } else {
+ assert(p_eagain == 0.0);
+ }
+ }
+
+ ~SeastoreNodeExtentManager() override = default;
+
+ void set_generate_eagain(bool enable) {
+ generate_eagain = enable;
+ }
+
+ protected:
+ bool is_read_isolated() const override { return true; }
+
+ read_iertr::future<NodeExtentRef> read_extent(
+ Transaction& t, laddr_t addr) override {
+ SUBTRACET(seastore_onode, "reading at {:#x} ...", t, addr);
+ if constexpr (INJECT_EAGAIN) {
+ if (trigger_eagain()) {
+ SUBDEBUGT(seastore_onode, "reading at {:#x}: trigger eagain", t, addr);
+ t.test_set_conflict();
+ return read_iertr::make_ready_future<NodeExtentRef>();
+ }
+ }
+ return tm.read_extent<SeastoreNodeExtent>(t, addr
+ ).si_then([addr, &t](auto&& e) -> read_iertr::future<NodeExtentRef> {
+ SUBTRACET(seastore_onode,
+ "read {}B at {:#x} -- {}",
+ t, e->get_length(), e->get_laddr(), *e);
+ assert(e->get_laddr() == addr);
+ std::ignore = addr;
+ return read_iertr::make_ready_future<NodeExtentRef>(e);
+ });
+ }
+
+ alloc_iertr::future<NodeExtentRef> alloc_extent(
+ Transaction& t, laddr_t hint, extent_len_t len) override {
+ SUBTRACET(seastore_onode, "allocating {}B with hint {:#x} ...", t, len, hint);
+ if constexpr (INJECT_EAGAIN) {
+ if (trigger_eagain()) {
+ SUBDEBUGT(seastore_onode, "allocating {}B: trigger eagain", t, len);
+ t.test_set_conflict();
+ return alloc_iertr::make_ready_future<NodeExtentRef>();
+ }
+ }
+ return tm.alloc_extent<SeastoreNodeExtent>(t, hint, len
+ ).si_then([len, &t](auto extent) {
+ SUBDEBUGT(seastore_onode,
+ "allocated {}B at {:#x} -- {}",
+ t, extent->get_length(), extent->get_laddr(), *extent);
+ if (!extent->is_initial_pending()) {
+ SUBERRORT(seastore_onode,
+ "allocated {}B but got invalid extent: {}",
+ t, len, *extent);
+ ceph_abort("fatal error");
+ }
+ assert(extent->get_length() == len);
+ std::ignore = len;
+ return NodeExtentRef(extent);
+ });
+ }
+
+ retire_iertr::future<> retire_extent(
+ Transaction& t, NodeExtentRef _extent) override {
+ LogicalCachedExtentRef extent = _extent;
+ auto addr = extent->get_laddr();
+ auto len = extent->get_length();
+ SUBDEBUGT(seastore_onode,
+ "retiring {}B at {:#x} -- {} ...",
+ t, len, addr, *extent);
+ if constexpr (INJECT_EAGAIN) {
+ if (trigger_eagain()) {
+ SUBDEBUGT(seastore_onode,
+ "retiring {}B at {:#x} -- {} : trigger eagain",
+ t, len, addr, *extent);
+ t.test_set_conflict();
+ return retire_iertr::now();
+ }
+ }
+ return tm.dec_ref(t, extent).si_then([addr, len, &t] (unsigned cnt) {
+ assert(cnt == 0);
+ SUBTRACET(seastore_onode, "retired {}B at {:#x} ...", t, len, addr);
+ });
+ }
+
+ getsuper_iertr::future<Super::URef> get_super(
+ Transaction& t, RootNodeTracker& tracker) override {
+ SUBTRACET(seastore_onode, "get root ...", t);
+ if constexpr (INJECT_EAGAIN) {
+ if (trigger_eagain()) {
+ SUBDEBUGT(seastore_onode, "get root: trigger eagain", t);
+ t.test_set_conflict();
+ return getsuper_iertr::make_ready_future<Super::URef>();
+ }
+ }
+ return tm.read_onode_root(t).si_then([this, &t, &tracker](auto root_addr) {
+ SUBTRACET(seastore_onode, "got root {:#x}", t, root_addr);
+ return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm));
+ });
+ }
+
+ std::ostream& print(std::ostream& os) const override {
+ os << "SeastoreNodeExtentManager";
+ if constexpr (INJECT_EAGAIN) {
+ os << "(p_eagain=" << p_eagain << ")";
+ }
+ return os;
+ }
+
+ private:
+ static LOG_PREFIX(OTree::Seastore);
+
+ const laddr_t addr_min;
+
+ // XXX: conditional members by INJECT_EAGAIN
+ bool trigger_eagain() {
+ if (generate_eagain) {
+ double dice = rd();
+ assert(rd.min() == 0);
+ dice /= rd.max();
+ return dice <= p_eagain;
+ } else {
+ return false;
+ }
+ }
+ bool generate_eagain = true;
+ std::random_device rd;
+ double p_eagain;
+};
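+// Note: eagain injection (for tests) is selected by create_seastore() when
+// 0 < p_eagain < 1, which instantiates SeastoreNodeExtentManager<true>; each
+// read/alloc/retire/get_super call then marks the transaction conflicted with
+// probability p_eagain, exercising the eagain/restart path of the callers.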
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::onode::SeastoreNodeExtent> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
new file mode 100644
index 000000000..bce74e381
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/** test_replay.h
+ *
+ * A special version of NodeExtent to help verify delta encoding, decoding and
+ * replay in recorder_t under debug builds.
+ */
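+//
+// The verification flow, as driven by NodeExtentAccessorT in debug builds:
+//   1. prepare_replay(extent): copy the pre-mutation node image into this
+//      extent;
+//   2. the accessor encodes the same delta into the attached recorder while
+//      applying the real mutation to `extent`;
+//   3. replay_and_verify(extent): replay the recorded delta onto the copy and
+//      memcmp the result against the mutated extent.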
+
+namespace crimson::os::seastore::onode {
+
+class TestReplayExtent final: public NodeExtent {
+ public:
+ using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>;
+
+ void prepare_replay(NodeExtentRef from_extent) {
+ assert(get_length() == from_extent->get_length());
+ auto mut = do_get_mutable();
+ std::memcpy(mut.get_write(), from_extent->get_read(), get_length());
+ }
+
+ void replay_and_verify(NodeExtentRef replayed_extent) {
+ assert(get_length() == replayed_extent->get_length());
+ auto mut = do_get_mutable();
+ auto bl = recorder->get_delta();
+ assert(bl.length());
+ auto p = bl.cbegin();
+ recorder->apply_delta(p, mut, *this);
+ assert(p == bl.end());
+ auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length());
+ ceph_assert(cmp == 0 && "replay mismatch!");
+ }
+
+ static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) {
+ auto r = ceph::buffer::create_aligned(length, 4096);
+ auto bp = ceph::bufferptr(std::move(r));
+ return new TestReplayExtent(std::move(bp), std::move(recorder));
+ }
+
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+ ceph_abort("impossible path"); }
+ DeltaRecorder* get_recorder() const override {
+ ceph_abort("impossible path"); }
+ CachedExtentRef duplicate_for_write(Transaction&) override {
+ ceph_abort("impossible path"); }
+ extent_types_t get_type() const override {
+ return extent_types_t::TEST_BLOCK; }
+ ceph::bufferlist get_delta() override {
+ ceph_abort("impossible path"); }
+ void apply_delta(const ceph::bufferlist&) override {
+ ceph_abort("impossible path"); }
+
+ private:
+ TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder)
+ : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) {
+ state = extent_state_t::MUTATION_PENDING;
+ }
+ DeltaRecorderURef recorder;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
new file mode 100644
index 000000000..6f92ca9ed
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstring>
+
+#include "fwd.h"
+
+#pragma once
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * NodeExtentMutable
+ *
+ * A thin wrapper around NodeExtent that ensures only a newly allocated or
+ * duplicated NodeExtent is mutable, and that memory modifications stay
+ * within the extent range.
+ */
+class NodeExtentMutable {
+ public:
+ void copy_in_absolute(void* dst, const void* src, extent_len_t len) {
+ assert(is_safe(dst, len));
+ std::memcpy(dst, src, len);
+ }
+ template <typename T>
+ void copy_in_absolute(void* dst, const T& src) {
+ copy_in_absolute(dst, &src, sizeof(T));
+ }
+
+ const void* copy_in_relative(
+ extent_len_t dst_offset, const void* src, extent_len_t len) {
+ auto dst = get_write() + dst_offset;
+ copy_in_absolute(dst, src, len);
+ return dst;
+ }
+ template <typename T>
+ const T* copy_in_relative(
+ extent_len_t dst_offset, const T& src) {
+ auto dst = copy_in_relative(dst_offset, &src, sizeof(T));
+ return static_cast<const T*>(dst);
+ }
+
+ void shift_absolute(const void* src, extent_len_t len, int offset) {
+ assert(is_safe(src, len));
+ char* to = (char*)src + offset;
+ assert(is_safe(to, len));
+ if (len != 0) {
+ std::memmove(to, src, len);
+ }
+ }
+ void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) {
+ shift_absolute(get_write() + src_offset, len, offset);
+ }
+
+ void set_absolute(void* dst, int value, extent_len_t len) {
+ assert(is_safe(dst, len));
+ std::memset(dst, value, len);
+ }
+ void set_relative(extent_len_t dst_offset, int value, extent_len_t len) {
+ auto dst = get_write() + dst_offset;
+ set_absolute(dst, value, len);
+ }
+
+ template <typename T>
+ void validate_inplace_update(const T& updated) {
+ assert(is_safe(&updated, sizeof(T)));
+ }
+
+ const char* get_read() const { return p_start; }
+ char* get_write() { return p_start; }
+ extent_len_t get_length() const {
+#ifndef NDEBUG
+ if (node_offset == 0) {
+ assert(is_valid_node_size(length));
+ }
+#endif
+ return length;
+ }
+ node_offset_t get_node_offset() const { return node_offset; }
+
+ NodeExtentMutable get_mutable_absolute(const void* dst, node_offset_t len) const {
+ assert(node_offset == 0);
+ assert(is_safe(dst, len));
+ assert((const char*)dst != get_read());
+ auto ret = *this;
+ node_offset_t offset = (const char*)dst - get_read();
+ assert(offset != 0);
+ ret.p_start += offset;
+ ret.length = len;
+ ret.node_offset = offset;
+ return ret;
+ }
+ NodeExtentMutable get_mutable_relative(
+ node_offset_t offset, node_offset_t len) const {
+ return get_mutable_absolute(get_read() + offset, len);
+ }
+
+ private:
+ NodeExtentMutable(char* p_start, extent_len_t length)
+ : p_start{p_start}, length{length} {}
+ bool is_safe(const void* src, extent_len_t len) const {
+ return ((const char*)src >= p_start) &&
+ ((const char*)src + len <= p_start + length);
+ }
+
+ char* p_start;
+ extent_len_t length;
+ node_offset_t node_offset = 0;
+
+ friend class NodeExtent;
+};
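+// Illustrative usage (a sketch, not exercised here): given a mutable view of
+// a fresh or duplicated extent, e.g. `auto mut = extent->get_mutable();`,
+// writes go through the bounds-checked helpers instead of raw pointers:
+//   uint32_t magic = 0x25;
+//   mut.copy_in_relative(0, magic);          // write `magic` at offset 0
+//   mut.set_relative(sizeof(magic), 0, 16);  // zero the following 16 bytes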
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
new file mode 100644
index 000000000..5db0f83dd
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_impl.h"
+#include "node_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+last_split_info_t last_split = {};
+#endif
+
+// XXX: branchless allocation
+eagain_ifuture<InternalNodeImpl::fresh_impl_t>
+InternalNodeImpl::allocate(
+ context_t c, laddr_t hint, field_type_t type, bool is_level_tail, level_t level)
+{
+ if (type == field_type_t::N0) {
+ return InternalNode0::allocate(c, hint, is_level_tail, level);
+ } else if (type == field_type_t::N1) {
+ return InternalNode1::allocate(c, hint, is_level_tail, level);
+ } else if (type == field_type_t::N2) {
+ return InternalNode2::allocate(c, hint, is_level_tail, level);
+ } else if (type == field_type_t::N3) {
+ return InternalNode3::allocate(c, hint, is_level_tail, level);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+eagain_ifuture<LeafNodeImpl::fresh_impl_t>
+LeafNodeImpl::allocate(
+ context_t c, laddr_t hint, field_type_t type, bool is_level_tail)
+{
+ if (type == field_type_t::N0) {
+ return LeafNode0::allocate(c, hint, is_level_tail, 0);
+ } else if (type == field_type_t::N1) {
+ return LeafNode1::allocate(c, hint, is_level_tail, 0);
+ } else if (type == field_type_t::N2) {
+ return LeafNode2::allocate(c, hint, is_level_tail, 0);
+ } else if (type == field_type_t::N3) {
+ return LeafNode3::allocate(c, hint, is_level_tail, 0);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+InternalNodeImplURef InternalNodeImpl::load(
+ NodeExtentRef extent, field_type_t type)
+{
+ if (type == field_type_t::N0) {
+ return InternalNode0::load(extent);
+ } else if (type == field_type_t::N1) {
+ return InternalNode1::load(extent);
+ } else if (type == field_type_t::N2) {
+ return InternalNode2::load(extent);
+ } else if (type == field_type_t::N3) {
+ return InternalNode3::load(extent);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+LeafNodeImplURef LeafNodeImpl::load(
+ NodeExtentRef extent, field_type_t type)
+{
+ if (type == field_type_t::N0) {
+ return LeafNode0::load(extent);
+ } else if (type == field_type_t::N1) {
+ return LeafNode1::load(extent);
+ } else if (type == field_type_t::N2) {
+ return LeafNode2::load(extent);
+ } else if (type == field_type_t::N3) {
+ return LeafNode3::load(extent);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
new file mode 100644
index 000000000..cf452618b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "node_extent_mutable.h"
+#include "node_types.h"
+#include "stages/stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+enum class InsertType { BEGIN, LAST, MID };
+struct split_expectation_t {
+ match_stage_t split_stage;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+};
+struct last_split_info_t {
+ search_position_t split_pos;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+ bool match(const split_expectation_t& e) const {
+ match_stage_t split_stage;
+ if (split_pos.nxt.nxt.index == 0) {
+ if (split_pos.nxt.index == 0) {
+ split_stage = 2;
+ } else {
+ split_stage = 1;
+ }
+ } else {
+ split_stage = 0;
+ }
+ return split_stage == e.split_stage &&
+ insert_stage == e.insert_stage &&
+ is_insert_left == e.is_insert_left &&
+ insert_type == e.insert_type;
+ }
+ bool match_split_pos(const search_position_t& pos) const {
+ return split_pos == pos;
+ }
+};
+extern last_split_info_t last_split;
+#endif
+
+struct key_hobj_t;
+struct key_view_t;
+class NodeExtentMutable;
+
+/**
+ * NodeImpl
+ *
+ * Hides type specific node layout implementations for Node.
+ */
+class NodeImpl {
+ public:
+ virtual ~NodeImpl() = default;
+
+ virtual node_type_t node_type() const = 0;
+ virtual field_type_t field_type() const = 0;
+ virtual laddr_t laddr() const = 0;
+ virtual const char* read() const = 0;
+ virtual extent_len_t get_node_size() const = 0;
+ virtual nextent_state_t get_extent_state() const = 0;
+ virtual void prepare_mutate(context_t) = 0;
+ virtual bool is_level_tail() const = 0;
+
+ /* Invariants for num_keys and num_values:
+ * - for leaf node and non-tail internal node, num_keys == num_values;
+ * - for tail internal node, num_keys + 1 == num_values;
+   * - all nodes must have at least 1 value, except the root leaf node;
+   * - the root internal node must have more than 1 value;
+ */
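+  // For example, a tail internal node with 3 keys holds 4 values (child
+  // addresses), while a non-tail node with 3 keys holds exactly 3 values.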
+ virtual void validate_non_empty() const = 0;
+ virtual bool is_keys_empty() const = 0;
+  // under the assumption that the node is not empty
+ virtual bool has_single_value() const = 0;
+
+ virtual level_t level() const = 0;
+ virtual node_offset_t free_size() const = 0;
+ virtual extent_len_t total_size() const = 0;
+ virtual bool is_extent_retired() const = 0;
+ virtual std::optional<key_view_t> get_pivot_index() const = 0;
+ virtual bool is_size_underflow() const = 0;
+
+ virtual std::tuple<match_stage_t, search_position_t> erase(const search_position_t&) = 0;
+ virtual std::tuple<match_stage_t, std::size_t> evaluate_merge(NodeImpl&) = 0;
+ virtual search_position_t merge(NodeExtentMutable&, NodeImpl&, match_stage_t, extent_len_t) = 0;
+ virtual eagain_ifuture<NodeExtentMutable> rebuild_extent(context_t) = 0;
+ virtual eagain_ifuture<> retire_extent(context_t) = 0;
+ virtual search_position_t make_tail() = 0;
+
+ virtual node_stats_t get_stats() const = 0;
+ virtual std::ostream& dump(std::ostream&) const = 0;
+ virtual std::ostream& dump_brief(std::ostream&) const = 0;
+ virtual const std::string& get_name() const = 0;
+ virtual void validate_layout() const = 0;
+
+ virtual void test_copy_to(NodeExtentMutable&) const = 0;
+ virtual void test_set_tail(NodeExtentMutable&) = 0;
+
+ protected:
+ NodeImpl() = default;
+};
+
+/**
+ * InternalNodeImpl
+ *
+ * Hides type specific node layout implementations for InternalNode.
+ */
+class InternalNodeImpl : public NodeImpl {
+ public:
+ struct internal_marker_t {};
+ virtual ~InternalNodeImpl() = default;
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_slot(const search_position_t&, // IN
+ key_view_t* = nullptr, // OUT
+ const laddr_packed_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_prev_slot(search_position_t&, // IN&OUT
+ key_view_t* = nullptr, // OUT
+ const laddr_packed_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_next_slot(search_position_t&, // IN&OUT
+ key_view_t* = nullptr, // OUT
+ const laddr_packed_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_largest_slot(search_position_t* = nullptr, // OUT
+ key_view_t* = nullptr, // OUT
+ const laddr_packed_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::INTERNAL> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* insert(
+ const key_view_t&, const laddr_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ virtual const laddr_packed_t* get_tail_value() const = 0;
+
+ virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0;
+
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t&, const laddr_t&, search_position_t&) const = 0;
+
+ struct fresh_impl_t {
+ InternalNodeImplURef impl;
+ NodeExtentMutable mut;
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ static eagain_ifuture<fresh_impl_t> allocate(context_t, laddr_t, field_type_t, bool, level_t);
+
+ static InternalNodeImplURef load(NodeExtentRef, field_type_t);
+
+ protected:
+ InternalNodeImpl() = default;
+};
+
+/**
+ * LeafNodeImpl
+ *
+ * Hides type specific node layout implementations for LeafNode.
+ */
+class LeafNodeImpl : public NodeImpl {
+ public:
+ struct leaf_marker_t {};
+ virtual ~LeafNodeImpl() = default;
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_slot(const search_position_t&, // IN
+ key_view_t* = nullptr, // OUT
+ const value_header_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_prev_slot(search_position_t&, // IN&OUT
+ key_view_t* = nullptr, // OUT
+ const value_header_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_next_slot(search_position_t&, // IN&OUT
+ key_view_t* = nullptr, // OUT
+ const value_header_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual void get_largest_slot(search_position_t* = nullptr, // OUT
+ key_view_t* = nullptr, // OUT
+ const value_header_t** = nullptr) const { // OUT
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::LEAF> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, leaf_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const value_header_t* insert(
+ const key_hobj_t&, const value_config_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const value_header_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const value_config_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t&, const value_config_t&,
+ const MatchHistory&, match_stat_t, search_position_t&) const = 0;
+
+ virtual std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ prepare_mutate_value_payload(context_t) = 0;
+
+ struct fresh_impl_t {
+ LeafNodeImplURef impl;
+ NodeExtentMutable mut;
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ static eagain_ifuture<fresh_impl_t> allocate(context_t, laddr_t, field_type_t, bool);
+
+ static LeafNodeImplURef load(NodeExtentRef, field_type_t);
+
+ protected:
+ LeafNodeImpl() = default;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
new file mode 100644
index 000000000..783a0c6cc
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
@@ -0,0 +1,948 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/os/seastore/logging.h"
+
+#include "node_extent_accessor.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+template <node_type_t NODE_TYPE> struct insert_key_type;
+template <> struct insert_key_type<node_type_t::INTERNAL> {
+ static constexpr auto type = KeyT::VIEW; };
+template <> struct insert_key_type<node_type_t::LEAF> {
+ static constexpr auto type = KeyT::HOBJ; };
+
+template <node_type_t NODE_TYPE> struct node_impl_type;
+template <> struct node_impl_type<node_type_t::INTERNAL> {
+ using type = InternalNodeImpl; };
+template <> struct node_impl_type<node_type_t::LEAF> {
+ using type = LeafNodeImpl; };
+
+template <node_type_t NODE_TYPE> struct node_marker_type;
+template <> struct node_marker_type<node_type_t::INTERNAL> {
+ using type = InternalNodeImpl::internal_marker_t; };
+template <> struct node_marker_type<node_type_t::LEAF> {
+ using type = LeafNodeImpl::leaf_marker_t; };
+
+/**
+ * NodeLayoutT
+ *
+ * Contains templated and concrete implementations for both InternalNodeImpl
+ * and LeafNodeImpl under a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
+ public:
+ using URef = std::unique_ptr<NodeLayoutT>;
+ using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>;
+ using parent_t = typename node_impl_type<NODE_TYPE>::type;
+ using marker_t = typename node_marker_type<NODE_TYPE>::type;
+ using node_stage_t = typename extent_t::node_stage_t;
+ using stage_t = node_to_stage_t<node_stage_t>;
+ using position_t = typename extent_t::position_t;
+ using value_input_t = typename extent_t::value_input_t;
+ using value_t = typename extent_t::value_t;
+ static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE;
+ static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type;
+ static constexpr auto STAGE = stage_t::STAGE;
+
+ NodeLayoutT(const NodeLayoutT&) = delete;
+ NodeLayoutT(NodeLayoutT&&) = delete;
+ NodeLayoutT& operator=(const NodeLayoutT&) = delete;
+ NodeLayoutT& operator=(NodeLayoutT&&) = delete;
+ ~NodeLayoutT() override = default;
+
+ static URef load(NodeExtentRef extent) {
+ std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent));
+ return ret;
+ }
+
+ static eagain_ifuture<typename parent_t::fresh_impl_t> allocate(
+ context_t c, laddr_t hint, bool is_level_tail, level_t level) {
+ LOG_PREFIX(OTree::Layout::allocate);
+ extent_len_t extent_size;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ extent_size = c.vb.get_leaf_node_size();
+ } else {
+ extent_size = c.vb.get_internal_node_size();
+ }
+ return c.nm.alloc_extent(c.t, hint, extent_size
+ ).handle_error_interruptible(
+ eagain_iertr::pass_further{},
+ crimson::ct_error::input_output_error::handle(
+ [FNAME, c, extent_size, is_level_tail, level] {
+ SUBERRORT(seastore_onode,
+ "EIO -- extent_size={}, is_level_tail={}, level={}",
+ c.t, extent_size, is_level_tail, level);
+ ceph_abort("fatal error");
+ })
+ ).si_then([is_level_tail, level](auto extent) {
+ assert(extent);
+ assert(extent->is_initial_pending());
+ auto mut = extent->get_mutable();
+ node_stage_t::bootstrap_extent(
+ mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level);
+ return typename parent_t::fresh_impl_t{
+ std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut};
+ });
+ }
+
+ protected:
+ /*
+ * NodeImpl
+ */
+ node_type_t node_type() const override { return NODE_TYPE; }
+ field_type_t field_type() const override { return FIELD_TYPE; }
+ laddr_t laddr() const override { return extent.get_laddr(); }
+ const char* read() const override { return extent.read().p_start(); }
+ extent_len_t get_node_size() const override { return extent.get_length(); }
+ nextent_state_t get_extent_state() const override { return extent.get_state(); }
+ void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); }
+ bool is_level_tail() const override { return extent.read().is_level_tail(); }
+
+ void validate_non_empty() const override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ return;
+ }
+ }
+ assert(!is_keys_empty());
+ }
+
+ bool is_keys_empty() const override { return extent.read().keys() == 0; }
+
+ bool has_single_value() const override {
+ validate_non_empty();
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ return ((is_level_tail() && is_keys_empty()) ||
+ (!is_level_tail() && stage_t::is_keys_one(extent.read())));
+ } else {
+ return stage_t::is_keys_one(extent.read());
+ }
+ }
+
+ level_t level() const override { return extent.read().level(); }
+ node_offset_t free_size() const override { return extent.read().free_size(); }
+ extent_len_t total_size() const override { return extent.read().total_size(); }
+ bool is_extent_retired() const override { return extent.is_retired(); }
+
+ std::optional<key_view_t> get_pivot_index() const override {
+ if (is_level_tail()) {
+ return std::nullopt;
+ }
+ assert(!is_keys_empty());
+ key_view_t pivot_index;
+ stage_t::template get_largest_slot<false, true, false>(
+ extent.read(), nullptr, &pivot_index, nullptr);
+ return {pivot_index};
+ }
+
+ bool is_size_underflow() const override {
+ /**
+ * There might be 2 node-merge strategies:
+ *
+     * The first is to rebalance and merge nodes and prefer tree fullness as
+     * much as possible, in order to save space and improve key density for
+     * lookup, in exchange for the cost of frequent merge, split and
+     * rebalance. These operations cannot benefit from seastore deltas because
+     * they allocate fresh extents which need to be written into the
+     * journal as a whole, making write amplification much larger.
+     *
+     * The second is to delay rebalance and merge. When submitting the
+     * transaction, a simple insert or erase only needs to append a delta with
+     * just enough information about the inserted/erased item. The downside is
+     * that tree fullness is not as good as with the first strategy.
+     *
+     * Currently the decision is the second way, delaying merge until the
+     * node drops to 1/4 full, so that:
+     * - After a split operation (making the node at least 1/2 full):
+     *   - The next merge needs to erase items taking at least 1/4 space;
+     *   - The next split needs to insert items taking at most 1/2 space;
+     * - After a merge operation (making the node at least 1/2 full):
+     *   - The next merge needs to erase items taking at least 1/4 space;
+     *   - The next split needs to insert items taking at most 1/2 space;
+ * - TODO: before node rebalance is implemented, the node size can be below
+ * the underflow limit if it cannot be merged with peers;
+ */
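+    // Worked example (illustrative numbers): with a 4096 B node whose header
+    // (size_before(0)) takes 32 B, full_kv_size = 4064 B, so the node is
+    // considered underflowed once filled_kv_size drops to <= 1016 B.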
+ auto& node_stage = extent.read();
+ size_t empty_size = node_stage.size_before(0);
+ size_t filled_kv_size = filled_size() - empty_size;
+ size_t full_kv_size = node_stage.total_size() - empty_size;
+ return filled_kv_size <= full_kv_size / 4;
+ }
+
+ std::tuple<match_stage_t, search_position_t>
+ erase(const search_position_t& pos) override {
+ LOG_PREFIX(OTree::Layout::erase);
+ SUBDEBUG(seastore_onode, "begin at erase_pos({}) ...", pos);
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBTRACE(seastore_onode, "-- dump\n{}", sos.str());
+ }
+ auto [stage, next_or_last_pos] = extent.erase_replayable(cast_down<STAGE>(pos));
+ SUBDEBUG(seastore_onode, "done at erase_stage={}, n/l_pos({})", stage, next_or_last_pos);
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBTRACE(seastore_onode, "-- dump\n{}", sos.str());
+ }
+#ifndef NDEBUG
+ if (!is_keys_empty()) {
+ validate_layout();
+ }
+#endif
+ return {stage, normalize(std::move(next_or_last_pos))};
+ }
+
+ std::tuple<match_stage_t, std::size_t> evaluate_merge(
+ NodeImpl& _right_node) override {
+ auto& left_node_stage = extent.read();
+ auto& right_node = dynamic_cast<NodeLayoutT&>(_right_node);
+ auto& right_node_stage = right_node.extent.read();
+
+ assert(NODE_TYPE == _right_node.node_type());
+ assert(FIELD_TYPE == _right_node.field_type());
+ assert(!is_level_tail());
+ assert(!is_keys_empty());
+
+ match_stage_t merge_stage;
+ node_offset_t size_comp;
+ if (right_node.is_keys_empty()) {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(right_node.is_level_tail());
+ merge_stage = STAGE;
+ size_comp = right_node_stage.header_size();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else {
+ key_view_t left_pivot_index;
+ stage_t::template get_largest_slot<false, true, false>(
+ left_node_stage, nullptr, &left_pivot_index, nullptr);
+ std::tie(merge_stage, size_comp) = stage_t::evaluate_merge(
+ left_pivot_index, right_node_stage);
+ }
+ auto size_left = filled_size();
+ auto size_right = right_node.filled_size();
+ assert(size_right > size_comp);
+ std::size_t merge_size = size_left + size_right - size_comp;
+ return {merge_stage, merge_size};
+ }
+
+ search_position_t merge(
+ NodeExtentMutable& mut,
+ NodeImpl& _right_node,
+ match_stage_t merge_stage,
+ extent_len_t merge_size) override {
+ LOG_PREFIX(OTree::Layout::merge);
+
+ auto& left_node_stage = extent.read();
+ auto& right_node = dynamic_cast<NodeLayoutT&>(_right_node);
+ auto& right_node_stage = right_node.extent.read();
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) {
+ {
+ std::ostringstream sos;
+ dump(sos);
+ SUBDEBUG(seastore_onode, "-- left node dump\n{}", sos.str());
+ }
+ {
+ std::ostringstream sos;
+ right_node.dump(sos);
+ SUBDEBUG(seastore_onode, "-- right node dump\n{}", sos.str());
+ }
+ }
+
+ assert(NODE_TYPE == _right_node.node_type());
+ assert(FIELD_TYPE == _right_node.field_type());
+ assert(!is_level_tail());
+ assert(!is_keys_empty());
+
+ if (right_node.is_level_tail()) {
+ node_stage_t::update_is_level_tail(mut, left_node_stage, true);
+ build_name();
+ }
+ position_t left_last_pos;
+ stage_t::template get_largest_slot<true, false, false>(
+ left_node_stage, &left_last_pos, nullptr, nullptr);
+
+ if (right_node.is_keys_empty()) {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(right_node.is_level_tail());
+ laddr_t tail_value = right_node_stage.get_end_p_laddr()->value;
+ auto p_write = left_node_stage.get_end_p_laddr();
+ mut.copy_in_absolute((void*)p_write, tail_value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else {
+ typename stage_t::template StagedAppender<KeyT::VIEW> left_appender;
+ left_appender.init_tail(&mut, left_node_stage, merge_stage);
+
+ typename stage_t::StagedIterator right_append_at;
+ right_append_at.set(right_node_stage);
+
+ auto pos_end = position_t::end();
+ stage_t::template append_until<KeyT::VIEW>(
+ right_append_at, left_appender, pos_end, STAGE);
+ assert(right_append_at.is_end());
+ left_appender.wrap();
+ }
+
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBDEBUG(seastore_onode, "-- merged node dump\n{}", sos.str());
+ }
+ assert(merge_size == filled_size());
+ return normalize(std::move(left_last_pos));
+ }
+
+ eagain_ifuture<NodeExtentMutable>
+ rebuild_extent(context_t c) override {
+ assert(!is_keys_empty());
+ key_view_t first_index;
+ stage_t::template get_slot<true, false>(
+ extent.read(), position_t::begin(), &first_index, nullptr);
+ auto hint = first_index.get_hint();
+ return extent.rebuild(c, hint).si_then([this] (auto mut) {
+ // addr may change
+ build_name();
+ return mut;
+ });
+ }
+
+ eagain_ifuture<> retire_extent(context_t c) override {
+ return extent.retire(c);
+ }
+
+ search_position_t make_tail() override {
+ auto&& ret = extent.make_tail_replayable();
+ // is_level_tail is changed
+ build_name();
+ return normalize(std::move(ret));
+ }
+
+ node_stats_t get_stats() const override {
+ node_stats_t stats;
+ auto& node_stage = extent.read();
+ key_view_t index_key;
+ if (!is_keys_empty()) {
+ stage_t::get_stats(node_stage, stats, index_key);
+ }
+ stats.size_persistent = extent.get_length();
+ stats.size_filled = filled_size();
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ stats.size_logical += sizeof(value_t);
+ stats.size_value += sizeof(value_t);
+ stats.num_kvs += 1;
+ }
+ }
+ return stats;
+ }
+
+ std::ostream& dump(std::ostream& os) const override {
+ auto& node_stage = extent.read();
+ auto p_start = node_stage.p_start();
+ dump_brief(os);
+ auto stats = get_stats();
+ os << " num_kvs=" << stats.num_kvs
+ << ", logical=" << stats.size_logical
+ << "B, overhead=" << stats.size_overhead
+ << "B, value=" << stats.size_value << "B";
+ os << ":\n header: " << node_stage_t::header_size() << "B";
+ size_t size = 0u;
+ if (!is_keys_empty()) {
+ stage_t::dump(node_stage, os, " ", size, p_start);
+ } else {
+ size += node_stage_t::header_size();
+ if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) {
+ os << " empty!";
+ }
+ }
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node_stage.is_level_tail()) {
+ size += sizeof(laddr_t);
+ auto value_ptr = node_stage.get_end_p_laddr();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ os << "\n tail value: 0x"
+ << std::hex << value_ptr->value << std::dec
+ << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ }
+ assert(size == filled_size());
+ return os;
+ }
+
+ std::ostream& dump_brief(std::ostream& os) const override {
+ os << name
+ << "(filled=" << filled_size() << "B"
+ << ", free=" << extent.read().free_size() << "B)";
+ return os;
+ }
+
+ const std::string& get_name() const override { return name; }
+
+ void validate_layout() const override {
+#ifndef NDEBUG
+ stage_t::validate(extent.read());
+#endif
+ }
+
+ void test_copy_to(NodeExtentMutable& to) const override {
+ extent.test_copy_to(to);
+ }
+
+ void test_set_tail(NodeExtentMutable& mut) override {
+ node_stage_t::update_is_level_tail(mut, extent.read(), true);
+ build_name();
+ }
+
+ /*
+ * Common
+ */
+ void get_slot(const search_position_t& pos,
+ key_view_t* p_index_key = nullptr,
+ const value_t** pp_value = nullptr) const override {
+ assert(!is_keys_empty());
+ assert(!pos.is_end());
+ if (p_index_key && pp_value) {
+ stage_t::template get_slot<true, true>(
+ extent.read(), cast_down<STAGE>(pos), p_index_key, pp_value);
+ } else if (!p_index_key && pp_value) {
+ stage_t::template get_slot<false, true>(
+ extent.read(), cast_down<STAGE>(pos), nullptr, pp_value);
+ } else if (p_index_key && !pp_value) {
+ stage_t::template get_slot<true, false>(
+ extent.read(), cast_down<STAGE>(pos), p_index_key, nullptr);
+ } else {
+ ceph_abort("impossible path");
+ }
+#ifndef NDEBUG
+ if (pp_value) {
+ assert((const char*)(*pp_value) - extent.read().p_start() <
+ extent.get_length());
+ }
+#endif
+ }
+
+ void get_prev_slot(search_position_t& pos,
+ key_view_t* p_index_key = nullptr,
+ const value_t** pp_value = nullptr) const override {
+ assert(!is_keys_empty());
+ assert(!pos.is_end());
+ auto& _pos = cast_down<STAGE>(pos);
+#ifndef NDEBUG
+ auto nxt_pos = _pos;
+#endif
+ if (!p_index_key && pp_value) {
+ stage_t::template get_prev_slot<false, true>(
+ extent.read(), _pos, nullptr, pp_value);
+ } else {
+ ceph_abort("not implemented");
+ }
+#ifndef NDEBUG
+ auto _nxt_pos = _pos;
+ stage_t::template get_next_slot<false, false>(
+ extent.read(), _nxt_pos, nullptr, nullptr);
+ assert(nxt_pos == _nxt_pos);
+#endif
+ }
+
+ void get_next_slot(search_position_t& pos,
+ key_view_t* p_index_key = nullptr,
+ const value_t** pp_value = nullptr) const override {
+ assert(!is_keys_empty());
+ assert(!pos.is_end());
+ bool find_next;
+ if (p_index_key && pp_value) {
+ find_next = stage_t::template get_next_slot<true, true>(
+ extent.read(), cast_down<STAGE>(pos), p_index_key, pp_value);
+ } else if (!p_index_key && pp_value) {
+ find_next = stage_t::template get_next_slot<false, true>(
+ extent.read(), cast_down<STAGE>(pos), nullptr, pp_value);
+ } else {
+ ceph_abort("not implemented");
+ }
+ if (find_next) {
+ pos = search_position_t::end();
+ }
+ }
+
+ void get_largest_slot(search_position_t* p_pos = nullptr,
+ key_view_t* p_index_key = nullptr,
+ const value_t** pp_value = nullptr) const override {
+ assert(!is_keys_empty());
+ if (p_pos && p_index_key && pp_value) {
+ stage_t::template get_largest_slot<true, true, true>(
+ extent.read(), &cast_down_fill_0<STAGE>(*p_pos), p_index_key, pp_value);
+ } else if (!p_pos && p_index_key && !pp_value) {
+ stage_t::template get_largest_slot<false, true, false>(
+ extent.read(), nullptr, p_index_key, nullptr);
+ } else if (p_pos && !p_index_key && pp_value) {
+ stage_t::template get_largest_slot<true, false, true>(
+ extent.read(), &cast_down_fill_0<STAGE>(*p_pos), nullptr, pp_value);
+ } else if (p_pos && !p_index_key && !pp_value) {
+ stage_t::template get_largest_slot<true, false, false>(
+ extent.read(), &cast_down_fill_0<STAGE>(*p_pos), nullptr, nullptr);
+ } else {
+ ceph_abort("not implemented");
+ }
+ }
+
+
+ lookup_result_t<NODE_TYPE> lower_bound(
+ const key_hobj_t& key, MatchHistory& history,
+ key_view_t* index_key=nullptr, marker_t={}) const override {
+ auto& node_stage = extent.read();
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ if (unlikely(is_keys_empty())) {
+ history.set<STAGE_LEFT>(MatchKindCMP::LT);
+ return lookup_result_t<NODE_TYPE>::end();
+ }
+ }
+ assert(!is_keys_empty());
+
+ typename stage_t::result_t result_raw;
+ if (index_key) {
+ result_raw = stage_t::template lower_bound<true>(
+ node_stage, key, history, index_key);
+#ifndef NDEBUG
+ if (!result_raw.is_end()) {
+ key_view_t index;
+ stage_t::template get_slot<true, false>(
+ node_stage, result_raw.position, &index, nullptr);
+ assert(index == *index_key);
+ }
+#endif
+ } else {
+ result_raw = stage_t::lower_bound(node_stage, key, history);
+ }
+#ifndef NDEBUG
+ if (result_raw.is_end()) {
+ assert(result_raw.mstat == MSTAT_END);
+ } else {
+ key_view_t index;
+ stage_t::template get_slot<true, false>(
+ node_stage, result_raw.position, &index, nullptr);
+ assert_mstat(key, index, result_raw.mstat);
+ }
+#endif
+
+ // calculate MSTAT_LT3
+ if constexpr (FIELD_TYPE == field_type_t::N0) {
+ // currently only internal node checks mstat
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (result_raw.mstat == MSTAT_LT2) {
+ auto cmp =
+ key <=> node_stage[result_raw.position.index].shard_pool;
+ assert(cmp != std::strong_ordering::greater);
+ if (cmp != 0) {
+ result_raw.mstat = MSTAT_LT3;
+ }
+ }
+ }
+ }
+
+ auto result = normalize(std::move(result_raw));
+ if (result.is_end()) {
+ assert(node_stage.is_level_tail());
+ assert(result.p_value == nullptr);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ result.p_value = node_stage.get_end_p_laddr();
+ }
+ } else {
+ assert(result.p_value != nullptr);
+ }
+ return result;
+ }
+
+ const value_t* insert(
+ const full_key_t<KEY_TYPE>& key, const value_input_t& value,
+ search_position_t& insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ LOG_PREFIX(OTree::Layout::insert);
+ SUBDEBUG(seastore_onode,
+ "begin at insert_pos({}), insert_stage={}, insert_size={}B ...",
+ insert_pos, insert_stage, insert_size);
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBTRACE(seastore_onode, "-- dump\n{}", sos.str());
+ }
+ auto ret = extent.template insert_replayable<KEY_TYPE>(
+ key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size);
+ SUBDEBUG(seastore_onode,
+ "done at insert_pos({}), insert_stage={}, insert_size={}B",
+ insert_pos, insert_stage, insert_size);
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBTRACE(seastore_onode, "-- dump\n{}", sos.str());
+ }
+ validate_layout();
+#ifndef NDEBUG
+ key_view_t index;
+ get_slot(insert_pos, &index, nullptr);
+ assert(index == key);
+#endif
+ return ret;
+ }
+
+ std::tuple<search_position_t, bool, const value_t*> split_insert(
+ NodeExtentMutable& right_mut, NodeImpl& _right_impl,
+ const full_key_t<KEY_TYPE>& key, const value_input_t& value,
+ search_position_t& _insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ LOG_PREFIX(OTree::Layout::split_insert);
+ assert(_right_impl.node_type() == NODE_TYPE);
+ assert(_right_impl.field_type() == FIELD_TYPE);
+ auto& right_impl = dynamic_cast<NodeLayoutT&>(_right_impl);
+ SUBDEBUG(seastore_onode,
+ "begin at insert_pos({}), insert_stage={}, insert_size={}B ...",
+ _insert_pos, insert_stage, insert_size);
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBDEBUG(seastore_onode, "-- dump\n{}", sos.str());
+ }
+#ifdef UNIT_TESTS_BUILT
+ auto insert_stage_pre = insert_stage;
+#endif
+
+ auto& insert_pos = cast_down<STAGE>(_insert_pos);
+ auto& node_stage = extent.read();
+ typename stage_t::StagedIterator split_at;
+ bool is_insert_left;
+ size_t split_size;
+ size_t target_split_size;
+ {
+ size_t empty_size = node_stage.size_before(0);
+ size_t filled_kv_size = filled_size() - empty_size;
+ /** NODE_BLOCK_SIZE considerations
+ *
+ * Generally,
+ * target_split_size = (filled_size + insert_size) / 2
+ * We can have two locate_split() strategies:
+ * A. the simpler one is to locate the largest split position where
+ * the estimated left_node_size <= target_split_size;
+ * B. the fair one takes a further step to calculate the next slot of
+ * P KiB, and if left_node_size + P/2 < target_split_size, compensate
+ * the split position to include the next slot;
+ *
+ * Say that the node_block_size = N KiB, the largest allowed
+ * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I'
+ * that won't lead to "double split" effect, meaning after a split,
+ * the right node size is still larger than N KiB and need to split
+ * again. I think "double split" makes split much more complicated and
+ * we can no longer identify whether the node is safe under concurrent
+ * operations.
+ *
+ * We need to evaluate the worst case in order to identify 'I'. This means:
+ * - filled_size ~= N KiB
+ * - insert_size == N/I KiB
+ * - target_split_size ~= (I+1)/2I * N KiB
+ * To simplify the below calculations, node_block_size is normalized to 1.
+ *
+ * With strategy A, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the target_split_size:
+ * - left_node_size + 1/I ~= (I+1)/2I
+ * - left_node_size ~= (I-1)/2I
+ * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I
+       * The right_node_size cannot be larger than the node_block_size in the
+ * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest
+ * possible insert_size must be smaller than 1/3 of the node_block_size.
+ *
+ * With strategy B, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the threshold
+ * target_split_size - 1/2I, thus:
+ * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2
+ * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1)
+ * - I > 2
+ * This means the largest possible insert_size must be smaller than 1/2 of
+ * the node_block_size, which is better than strategy A.
+ *
+ * In order to avoid "double split", there is another side-effect we need
+ * to take into consideration: if split happens with snap-gen indexes, the
+       * corresponding ns-oid string needs to be copied to the right node. That is
+ * to say: right_node_size + string_size < node_block_size.
+ *
+ * Say that the largest allowed string size is 1/S of the largest allowed
+       * insert_size N/I KiB. If we go with strategy B, and when split happens
+       * with snap-gen indexes and the split just overflows the target_split_size:
+ * - left_node_size ~= target_split_size - 1/2 * (1/I - 1/IS)
+ * ~= 1/2 + 1/2IS
+ * - right_node_size ~= 1 + 1/I - left_node_size + 1/IS
+ * ~= 1/2 + 1/I + 1/2IS < 1
+ * - I > 2 + 1/S (S > 1)
+ *
+ * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most
+ * X KiB ns-oid string and Y KiB of value to store in this BTree, then:
+ * - largest_insert_size ~= X+Y KiB
+ * - 1/S == X/(X+Y)
+ * - I > (3X+2Y)/(X+Y)
+ * - node_block_size(N) == I * insert_size > 3X+2Y KiB
+ *
+ * In conclusion,
+ * (TODO) the current node block size (4 KiB) is too small to
+       * store an entire 2 KiB ns-oid string. We need to consider a larger
+ * node_block_size.
+ *
+ * We are setting X = Y = 640 B in order not to break the current
+       * implementations with a 4 KiB node.
+ *
+       * (TODO) Implement smarter logic to check when "double split" happens.
+ */
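+      // Numeric sanity check of the bound above (illustrative values): with
+      // X = Y = 640 B, the bound requires I > (3*640 + 2*640)/(640 + 640) = 2.5;
+      // a 4096 B node with insert_size = X + Y = 1280 B gives I = 4096/1280 = 3.2,
+      // which satisfies it.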
+ target_split_size = empty_size + (filled_kv_size + insert_size) / 2;
+ assert(insert_size < (node_stage.total_size() - empty_size) / 2);
+
+ std::optional<bool> _is_insert_left;
+ split_at.set(node_stage);
+ split_size = 0;
+ bool locate_nxt = stage_t::recursively_locate_split_inserted(
+ split_size, 0, target_split_size, insert_pos,
+ insert_stage, insert_size, _is_insert_left, split_at);
+ is_insert_left = *_is_insert_left;
+ SUBDEBUG(seastore_onode,
+ "-- located split_at({}), insert_pos({}), is_insert_left={}, "
+ "split_size={}B(target={}B, current={}B)",
+ split_at, insert_pos, is_insert_left,
+ split_size, target_split_size, filled_size());
+ // split_size can be larger than target_split_size in strategy B
+ // assert(split_size <= target_split_size);
+ if (locate_nxt) {
+ assert(insert_stage == STAGE);
+ assert(split_at.get().is_last());
+ split_at.set_end();
+ assert(insert_pos.index == split_at.index());
+ }
+ }
+
+ auto append_at = split_at;
+ // TODO(cross-node string dedup)
+ typename stage_t::template StagedAppender<KEY_TYPE> right_appender;
+ right_appender.init_empty(&right_mut, right_mut.get_write());
+ const value_t* p_value = nullptr;
+ if (!is_insert_left) {
+ // right node: append [start(append_at), insert_pos)
+ stage_t::template append_until<KEY_TYPE>(
+ append_at, right_appender, insert_pos, insert_stage);
+ SUBDEBUG(seastore_onode,
+ "-- right appended until "
+ "insert_pos({}), insert_stage={}, insert/append the rest ...",
+ insert_pos, insert_stage);
+ // right node: append [insert_pos(key, value)]
+ bool is_front_insert = (insert_pos == position_t::begin());
+ [[maybe_unused]] bool is_end = stage_t::template append_insert<KEY_TYPE>(
+ key, value, append_at, right_appender,
+ is_front_insert, insert_stage, p_value);
+ assert(append_at.is_end() == is_end);
+ } else {
+ SUBDEBUG(seastore_onode, "-- right appending ...");
+ }
+
+ // right node: append (insert_pos, end)
+ auto pos_end = position_t::end();
+ stage_t::template append_until<KEY_TYPE>(
+ append_at, right_appender, pos_end, STAGE);
+ assert(append_at.is_end());
+ right_appender.wrap();
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ right_impl.dump(sos);
+ SUBDEBUG(seastore_onode, "-- right node dump\n{}", sos.str());
+ }
+ right_impl.validate_layout();
+
+ // mutate left node
+ if (is_insert_left) {
+ SUBDEBUG(seastore_onode,
+ "-- left trim/insert at insert_pos({}), insert_stage={} ...",
+ insert_pos, insert_stage);
+ p_value = extent.template split_insert_replayable<KEY_TYPE>(
+ split_at, key, value, insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ key_view_t index;
+ get_slot(_insert_pos, &index, nullptr);
+ assert(index == key);
+#endif
+ } else {
+ SUBDEBUG(seastore_onode, "-- left trim ...");
+#ifndef NDEBUG
+ key_view_t index;
+ right_impl.get_slot(_insert_pos, &index, nullptr);
+ assert(index == key);
+#endif
+ extent.split_replayable(split_at);
+ }
+ if (right_impl.is_level_tail()) {
+ // is_level_tail of left is changed by split/split_insert
+ build_name();
+ }
+ if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ SUBDEBUG(seastore_onode, "-- left node dump\n{}", sos.str());
+ }
+ validate_layout();
+ assert(p_value);
+
+ auto split_pos = normalize(split_at.get_pos());
+ SUBDEBUG(seastore_onode,
+ "done at insert_pos({}), insert_stage={}, insert_size={}B, "
+ "split_at({}), is_insert_left={}, split_size={}B(target={}B)",
+ _insert_pos, insert_stage, insert_size, split_pos,
+ is_insert_left, split_size, target_split_size);
+ assert(split_size == filled_size());
+
+#ifdef UNIT_TESTS_BUILT
+ InsertType insert_type;
+ search_position_t last_pos;
+ if (is_insert_left) {
+ stage_t::template get_largest_slot<true, false, false>(
+ extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ } else {
+ node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write()),
+ right_mut.get_length()};
+ stage_t::template get_largest_slot<true, false, false>(
+ right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ }
+ if (_insert_pos == search_position_t::begin()) {
+ insert_type = InsertType::BEGIN;
+ } else if (_insert_pos == last_pos) {
+ insert_type = InsertType::LAST;
+ } else {
+ insert_type = InsertType::MID;
+ }
+ last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type};
+#endif
+ return {split_pos, is_insert_left, p_value};
+ }
+
+ /*
+ * InternalNodeImpl
+ */
+ const laddr_packed_t* get_tail_value() const override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(is_level_tail());
+ return extent.read().get_end_p_laddr();
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ void replace_child_addr(
+ const search_position_t& pos, laddr_t dst, laddr_t src) override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ LOG_PREFIX(OTree::Layout::replace_child_addr);
+ SUBDEBUG(seastore_onode, "update from {:#x} to {:#x} at pos({}) ...", src, dst, pos);
+ const laddr_packed_t* p_value;
+ if (pos.is_end()) {
+ assert(is_level_tail());
+ p_value = get_tail_value();
+ } else {
+ get_slot(pos, nullptr, &p_value);
+ }
+ assert(p_value->value == src);
+ extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value));
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t& key, const laddr_t& value,
+ search_position_t& insert_pos) const override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ auto& node_stage = extent.read();
+ match_stage_t insert_stage;
+ node_offset_t insert_size;
+ if (unlikely(is_keys_empty())) {
+ assert(insert_pos.is_end());
+ insert_stage = STAGE;
+ insert_size = stage_t::insert_size(key, value);
+ } else {
+ std::tie(insert_stage, insert_size) = stage_t::evaluate_insert(
+ node_stage, key, value, cast_down<STAGE>(insert_pos), false);
+ }
+ return {insert_stage, insert_size};
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ /*
+ * LeafNodeImpl
+ */
+ std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t& key, const value_config_t& value,
+ const MatchHistory& history, match_stat_t mstat,
+ search_position_t& insert_pos) const override {
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ if (unlikely(is_keys_empty())) {
+ assert(insert_pos.is_end());
+ assert(is_level_tail());
+ return {STAGE, stage_t::insert_size(key, value)};
+ } else {
+ return stage_t::evaluate_insert(
+ key, value, history, mstat, cast_down<STAGE>(insert_pos));
+ }
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ prepare_mutate_value_payload(context_t c) {
+ return extent.prepare_mutate_value_payload(c);
+ }
+
+ private:
+ NodeLayoutT(NodeExtentRef extent) : extent{extent} {
+ build_name();
+ }
+
+ extent_len_t filled_size() const {
+ auto& node_stage = extent.read();
+ auto ret = node_stage.size_before(node_stage.keys());
+ assert(ret == node_stage.total_size() - node_stage.free_size());
+ return ret;
+ }
+
+  // rebuild the name whenever addr, type, level, or tail changes
+ void build_name() {
+ // XXX: maybe also include the extent state
+ std::ostringstream sos;
+ sos << "Node" << NODE_TYPE << FIELD_TYPE
+ << "@0x" << std::hex << extent.get_laddr()
+ << "+" << extent.get_length() << std::dec
+ << "Lv" << (unsigned)level()
+ << (is_level_tail() ? "$" : "");
+ name = sos.str();
+ }
+
+ extent_t extent;
+ std::string name = "Node-N/A";
+};
+
+using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>;
+using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>;
+using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>;
+using InternalNode3 = NodeLayoutT<internal_fields_3_t, node_type_t::INTERNAL>;
+using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>;
+using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>;
+using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>;
+using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>;
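+// Each alias above instantiates NodeLayoutT for one of the four field layouts
+// (N0..N3) and one node type; InternalNodeImpl::allocate()/load() and
+// LeafNodeImpl::allocate()/load() dispatch to them by field_type_t.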
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
new file mode 100644
index 000000000..d8a18231e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "node_extent_mutable.h"
+#include "stages/node_stage.h"
+#include "stages/stage.h"
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * NodeLayoutReplayableT
+ *
+ * Contains templated logic to modify the layout of a NodeExtent in ways that
+ * are also replayable. Used by NodeExtentAccessorT at runtime and by
+ * DeltaRecorderT during replay.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+struct NodeLayoutReplayableT {
+ using node_stage_t = node_extent_t<FieldType, NODE_TYPE>;
+ using stage_t = node_to_stage_t<node_stage_t>;
+ using position_t = typename stage_t::position_t;
+ using StagedIterator = typename stage_t::StagedIterator;
+ using value_input_t = value_input_type_t<NODE_TYPE>;
+ using value_t = value_type_t<NODE_TYPE>;
+ static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE;
+
+ template <KeyT KT>
+ static const value_t* insert(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ const full_key_t<KT>& key,
+ const value_input_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ auto p_value = stage_t::template proceed_insert<KT, false>(
+ mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+ return p_value;
+ }
+
+ static void split(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ StagedIterator& split_at) {
+ node_stage_t::update_is_level_tail(mut, node_stage, false);
+ stage_t::trim(mut, split_at);
+ }
+
+ template <KeyT KT>
+ static const value_t* split_insert(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_input_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ node_stage_t::update_is_level_tail(mut, node_stage, false);
+ stage_t::trim(mut, split_at);
+ auto p_value = stage_t::template proceed_insert<KT, true>(
+ mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+ return p_value;
+ }
+
+ static void update_child_addr(
+ NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) {
+ assert(NODE_TYPE == node_type_t::INTERNAL);
+ mut.copy_in_absolute(p_addr, new_addr);
+ }
+
+ static std::tuple<match_stage_t, position_t> erase(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ const position_t& _erase_pos) {
+ if (_erase_pos.is_end()) {
+ // must be internal node
+ assert(node_stage.is_level_tail());
+ // return erase_stage, last_pos
+ return update_last_to_tail(mut, node_stage);
+ }
+
+ assert(node_stage.keys() != 0);
+ position_t erase_pos = _erase_pos;
+ auto erase_stage = stage_t::erase(mut, node_stage, erase_pos);
+ // return erase_stage, next_pos
+ return {erase_stage, erase_pos};
+ }
+
+ static position_t make_tail(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage) {
+ assert(!node_stage.is_level_tail());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ auto [r_stage, r_last_pos] = update_last_to_tail(mut, node_stage);
+ std::ignore = r_stage;
+ return r_last_pos;
+ } else {
+ node_stage_t::update_is_level_tail(mut, node_stage, true);
+ // no need to calculate the last pos
+ return position_t::end();
+ }
+ }
+
+ private:
+ static std::tuple<match_stage_t, position_t> update_last_to_tail(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage) {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(node_stage.keys() != 0);
+ position_t last_pos;
+ laddr_t last_value;
+ {
+ const laddr_packed_t* p_last_value;
+ stage_t::template get_largest_slot<true, false, true>(
+ node_stage, &last_pos, nullptr, &p_last_value);
+ last_value = p_last_value->value;
+ }
+
+ auto erase_pos = last_pos;
+ auto erase_stage = stage_t::erase(mut, node_stage, erase_pos);
+ assert(erase_pos.is_end());
+
+ node_stage_t::update_is_level_tail(mut, node_stage, true);
+ auto p_last_value = const_cast<laddr_packed_t*>(
+ node_stage.get_end_p_laddr());
+ mut.copy_in_absolute(p_last_value, last_value);
+ // return erase_stage, last_pos
+ return {erase_stage, last_pos};
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
new file mode 100644
index 000000000..22c140b59
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <ostream>
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+constexpr uint8_t FIELD_TYPE_MAGIC = 0x25;
+enum class field_type_t : uint8_t {
+ N0 = FIELD_TYPE_MAGIC,
+ N1,
+ N2,
+ N3,
+ _MAX
+};
+inline uint8_t to_unsigned(field_type_t type) {
+ auto value = static_cast<uint8_t>(type);
+ assert(value >= FIELD_TYPE_MAGIC);
+ assert(value < static_cast<uint8_t>(field_type_t::_MAX));
+ return value - FIELD_TYPE_MAGIC;
+}
+inline std::ostream& operator<<(std::ostream &os, field_type_t type) {
+ const char* const names[] = {"0", "1", "2", "3"};
+ auto index = to_unsigned(type);
+ os << names[index];
+ return os;
+}
+
+enum class node_type_t : uint8_t {
+ LEAF = 0,
+ INTERNAL
+};
+inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) {
+ const char* const names[] = {"L", "I"};
+ auto index = static_cast<uint8_t>(type);
+ assert(index <= 1u);
+ os << names[index];
+ return os;
+}
+
+struct laddr_packed_t {
+ laddr_t value;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) {
+ return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")";
+}
+
+using match_stat_t = int8_t;
+constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end()
+constexpr match_stat_t MSTAT_EQ = -1; // key == index
+constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen]
+constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid]
+constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] ||
+ // key == index [pool/shard]; key < index [crush]
+constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard]
+constexpr match_stat_t MSTAT_MIN = MSTAT_END;
+constexpr match_stat_t MSTAT_MAX = MSTAT_LT3;
+
+enum class node_delta_op_t : uint8_t {
+ INSERT,
+ SPLIT,
+ SPLIT_INSERT,
+ UPDATE_CHILD_ADDR,
+ ERASE,
+ MAKE_TAIL,
+ SUBOP_UPDATE_VALUE = 0xff,
+};
+
+/** nextent_state_t
+ *
+ * The possible states of a tree node extent (NodeExtentAccessorT).
+ *
+ * A state transition implies that the following capabilities change:
+ * - whether the extent is mutable;
+ * - whether mutations need to be recorded;
+ * - whether the memory has been copied;
+ *
+ * load()----+
+ * |
+ * alloc() v
+ * | +--> [READ_ONLY] ---------+
+ * | | | |
+ * | | prepare_mutate() |
+ * | | | |
+ * | v v v
+ * | +--> [MUTATION_PENDING]---+
+ * | | |
+ * | | rebuild()
+ * | | |
+ * | v v
+ * +------->+--> [FRESH] <------------+
+ *
+ * Note that NodeExtentAccessorT might still be MUTATION_PENDING/FRESH while
+ * the internal extent has already become DIRTY: this can happen after the
+ * transaction submission has started but before node destruction and
+ * validation have completed.
+ */
+enum class nextent_state_t : uint8_t {
+ READ_ONLY = 0, // requires mutate for recording
+ // CLEAN/DIRTY
+ MUTATION_PENDING, // can mutate, needs recording
+ // MUTATION_PENDING
+ FRESH, // can mutate, no recording
+ // INITIAL_WRITE_PENDING
+};
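+// Illustrative mapping to the diagram above: load() yields READ_ONLY,
+// prepare_mutate() moves the accessor to MUTATION_PENDING (mutable, deltas
+// are recorded), and alloc()/rebuild() yield FRESH (mutable, no recording).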
+
+}
+
+template <> struct fmt::formatter<crimson::os::seastore::onode::node_delta_op_t>
+ : fmt::formatter<std::string_view> {
+ using node_delta_op_t = crimson::os::seastore::onode::node_delta_op_t;
+ // parse is inherited from formatter<string_view>.
+ template <typename FormatContext>
+ auto format(node_delta_op_t op, FormatContext& ctx) {
+ std::string_view name = "unknown";
+ switch (op) {
+ case node_delta_op_t::INSERT:
+ name = "insert";
+ break;
+ case node_delta_op_t::SPLIT:
+ name = "split";
+ break;
+ case node_delta_op_t::SPLIT_INSERT:
+ name = "split_insert";
+ break;
+ case node_delta_op_t::UPDATE_CHILD_ADDR:
+ name = "update_child_addr";
+ break;
+ case node_delta_op_t::ERASE:
+ name = "erase";
+ break;
+ case node_delta_op_t::MAKE_TAIL:
+ name = "make_tail";
+ break;
+ case node_delta_op_t::SUBOP_UPDATE_VALUE:
+ name = "subop_update_value";
+ break;
+ }
+ return formatter<string_view>::format(name, ctx);
+ }
+};
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
new file mode 100644
index 000000000..9252fb99a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
@@ -0,0 +1,202 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "item_iterator_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+#define ITER_T item_iterator_t<NODE_TYPE>
+#define ITER_INST(NT) item_iterator_t<NT>
+
+template <node_type_t NODE_TYPE>
+template <IsFullKey Key>
+memory_range_t ITER_T::insert_prefix(
+ NodeExtentMutable& mut, const ITER_T& iter, const Key& key,
+ bool is_end, node_offset_t size, const char* p_left_bound)
+{
+ // 1. insert range
+ char* p_insert;
+ if (is_end) {
+ assert(!iter.has_next());
+ p_insert = const_cast<char*>(iter.p_start());
+ } else {
+ p_insert = const_cast<char*>(iter.p_end());
+ }
+ char* p_insert_front = p_insert - size;
+
+ // 2. shift memory
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = p_insert;
+ mut.shift_absolute(p_shift_start,
+ p_shift_end - p_shift_start,
+ -(int)size);
+
+ // 3. append header
+ p_insert -= sizeof(node_offset_t);
+ node_offset_t back_offset = (p_insert - p_insert_front);
+ mut.copy_in_absolute(p_insert, back_offset);
+ ns_oid_view_t::append(mut, key, p_insert);
+
+ return {p_insert_front, p_insert};
+}
+#define IP_TEMPLATE(NT, Key) \
+ template memory_range_t ITER_INST(NT)::insert_prefix<Key>( \
+ NodeExtentMutable&, const ITER_INST(NT)&, const Key&, \
+ bool, node_offset_t, const char*)
+IP_TEMPLATE(node_type_t::LEAF, key_view_t);
+IP_TEMPLATE(node_type_t::INTERNAL, key_view_t);
+IP_TEMPLATE(node_type_t::LEAF, key_hobj_t);
+IP_TEMPLATE(node_type_t::INTERNAL, key_hobj_t);
+
+template <node_type_t NODE_TYPE>
+void ITER_T::update_size(
+ NodeExtentMutable& mut, const ITER_T& iter, int change)
+{
+ node_offset_t offset = iter.get_back_offset();
+ int new_size = change + offset;
+ assert(new_size > 0 && new_size < (int)mut.get_length());
+ mut.copy_in_absolute(
+ (void*)iter.get_item_range().p_end, node_offset_t(new_size));
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_until(NodeExtentMutable& mut, const ITER_T& iter)
+{
+ assert(iter.index() != 0);
+ size_t ret = iter.p_end() - iter.p_items_start;
+ assert(ret < mut.get_length());
+ return ret;
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_at(
+ NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed)
+{
+ size_t trim_size = iter.p_start() - iter.p_items_start + trimmed;
+ assert(trim_size < mut.get_length());
+ assert(iter.get_back_offset() > trimmed);
+ node_offset_t new_offset = iter.get_back_offset() - trimmed;
+ mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset);
+ return trim_size;
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::erase(
+ NodeExtentMutable& mut, const ITER_T& iter, const char* p_left_bound)
+{
+ node_offset_t erase_size = iter.p_end() - iter.p_start();
+ const char* p_shift_start = p_left_bound;
+ assert(p_left_bound <= iter.p_start());
+ extent_len_t shift_len = iter.p_start() - p_left_bound;
+ int shift_off = erase_size;
+ mut.shift_absolute(p_shift_start, shift_len, shift_off);
+ return erase_size;
+}
+
+#define ITER_TEMPLATE(NT) template class ITER_INST(NT)
+ITER_TEMPLATE(node_type_t::LEAF);
+ITER_TEMPLATE(node_type_t::INTERNAL);
+
+#define APPEND_T ITER_T::Appender<KT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+APPEND_T::Appender(NodeExtentMutable* p_mut,
+ const item_iterator_t& iter,
+ bool open) : p_mut{p_mut}
+{
+ assert(!iter.has_next());
+ if (open) {
+ p_append = const_cast<char*>(iter.get_key().p_start());
+ p_offset_while_open = const_cast<char*>(iter.item_range.p_end);
+ } else {
+ // XXX: this doesn't need to advance the iter to last
+ p_append = const_cast<char*>(iter.p_items_start);
+ }
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+bool APPEND_T::append(const ITER_T& src, index_t& items)
+{
+ auto p_end = src.p_end();
+ bool append_till_end = false;
+ if (is_valid_index(items)) {
+ for (auto i = 1u; i <= items; ++i) {
+ if (!src.has_next()) {
+ assert(i == items);
+ append_till_end = true;
+ break;
+ }
+ ++src;
+ }
+ } else {
+ if (items == INDEX_END) {
+ append_till_end = true;
+ } else {
+ assert(items == INDEX_LAST);
+ }
+ items = 0;
+ while (src.has_next()) {
+ ++src;
+ ++items;
+ }
+ if (append_till_end) {
+ ++items;
+ }
+ }
+
+ const char* p_start;
+ if (append_till_end) {
+ p_start = src.p_start();
+ } else {
+ p_start = src.p_end();
+ }
+ assert(p_end >= p_start);
+ size_t append_size = p_end - p_start;
+ p_append -= append_size;
+ p_mut->copy_in_absolute(p_append, p_start, append_size);
+ return append_till_end;
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key)
+{
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append(*p_mut, partial_key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key)
+{
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append(*p_mut, key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::wrap_nxt(char* _p_append)
+{
+ assert(_p_append < p_append);
+ p_mut->copy_in_absolute(
+ p_offset_while_open, node_offset_t(p_offset_while_open - _p_append));
+ p_append = _p_append;
+}
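+
+// Flow sketch for appending one item through the Appender above (names are
+// illustrative): open_nxt() reserves the back_offset slot and appends the
+// ns-oid view, the caller then appends the next-stage content downwards, and
+// wrap_nxt() fills in the reserved offset.
+//   auto [mut, p] = appender.open_nxt(partial_key);
+//   // ... append next-stage content via mut, advancing p downwards ...
+//   appender.wrap_nxt(p);  // back_offset = reserved position - p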
+
+#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT>
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
new file mode 100644
index 000000000..9d12474ac
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * item_iterator_t
+ *
+ * The STAGE_STRING implementation for node N0/N1. It implements the staged
+ * contract as an iterative container to resolve crush hash conflicts.
+ *
+ * The layout of the container to index ns, oid strings, storing n items:
+ *
+ * # <--------- container range ---------> #
+ * #<~># items [i+1, n) #
+ * # # items [0, i) #<~>#
+ * # # <------ item i -------------> # #
+ * # # <--- item_range ---> | # #
+ * # # | # #
+ * # # next-stage | ns-oid | back_ # #
+ * # # contaner | strings | offset # #
+ * #...# range | | #...#
+ * ^ ^ | ^
+ * | | | |
+ * | +---------------------------+ |
+ * + p_items_start p_items_end +
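+ *
+ * A worked decoding sketch (this is what next_item_range() below implements):
+ * the trailing node_offset_t of an item is its back_offset, so
+ *   p_item_end   = p_end - sizeof(node_offset_t);
+ *   back_offset  = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value;
+ *   p_item_start = p_item_end - back_offset;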
+ */
+template <node_type_t NODE_TYPE>
+class item_iterator_t {
+ using value_input_t = value_input_type_t<NODE_TYPE>;
+ using value_t = value_type_t<NODE_TYPE>;
+ public:
+ item_iterator_t(const container_range_t& range)
+ : node_size{range.node_size},
+ p_items_start(range.range.p_start),
+ p_items_end(range.range.p_end) {
+ assert(is_valid_node_size(node_size));
+ assert(p_items_start < p_items_end);
+ next_item_range(p_items_end);
+ }
+
+ const char* p_start() const { return item_range.p_start; }
+ const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); }
+ const memory_range_t& get_item_range() const { return item_range; }
+ node_offset_t get_back_offset() const { return back_offset; }
+
+ // container type system
+ using key_get_type = const ns_oid_view_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE;
+ index_t index() const { return _index; }
+ key_get_type get_key() const {
+ if (!key.has_value()) {
+ key = ns_oid_view_t(item_range.p_end);
+ assert(item_range.p_start < (*key).p_start());
+ }
+ return *key;
+ }
+ node_offset_t size() const {
+ size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t);
+ assert(ret < node_size);
+ return ret;
+ };
+ node_offset_t size_to_nxt() const {
+ size_t ret = get_key().size() + sizeof(node_offset_t);
+ assert(ret < node_size);
+ return ret;
+ }
+ node_offset_t size_overhead() const {
+ return sizeof(node_offset_t) + get_key().size_overhead();
+ }
+ container_range_t get_nxt_container() const {
+ return {{item_range.p_start, get_key().p_start()}, node_size};
+ }
+ bool has_next() const {
+ assert(p_items_start <= item_range.p_start);
+ return p_items_start < item_range.p_start;
+ }
+ const item_iterator_t<NODE_TYPE>& operator++() const {
+ assert(has_next());
+ next_item_range(item_range.p_start);
+ key.reset();
+ ++_index;
+ return *this;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ int start_offset = p_items_start - p_node_start;
+ int stage_size = p_items_end - p_items_start;
+ assert(start_offset > 0);
+ assert(stage_size > 0);
+ assert(start_offset + stage_size <= (int)node_size);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(stage_size), encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static item_iterator_t decode(const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t stage_size;
+ ceph::decode(stage_size, delta);
+ assert(start_offset > 0);
+ assert(stage_size > 0);
+ assert((unsigned)start_offset + stage_size <= node_size);
+ index_t index;
+ ceph::decode(index, delta);
+
+ item_iterator_t ret({{p_node_start + start_offset,
+ p_node_start + start_offset + stage_size},
+ node_size});
+ while (index > 0) {
+ ++ret;
+ --index;
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <IsFullKey Key>
+ static node_offset_t estimate_insert(
+ const Key& key, const value_input_t&) {
+ return ns_oid_view_t::estimate_size(key) + sizeof(node_offset_t);
+ }
+
+ template <IsFullKey Key>
+ static memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter,
+ const Key& key, bool is_end,
+ node_offset_t size, const char* p_left_bound);
+
+ static void update_size(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change);
+
+ static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&);
+ static node_offset_t trim_at(
+ NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed);
+
+ static node_offset_t erase(
+ NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, const char*);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ void next_item_range(const char* p_end) const {
+ auto p_item_end = p_end - sizeof(node_offset_t);
+ assert(p_items_start < p_item_end);
+ back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value;
+ assert(back_offset);
+ const char* p_item_start = p_item_end - back_offset;
+ assert(p_items_start <= p_item_start);
+ item_range = {p_item_start, p_item_end};
+ }
+
+ extent_len_t node_size;
+ const char* p_items_start;
+ const char* p_items_end;
+ mutable memory_range_t item_range;
+ mutable node_offset_t back_offset;
+ mutable std::optional<ns_oid_view_t> key;
+ mutable index_t _index = 0u;
+};
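+
+// Iteration sketch (a minimal example; `range` is assumed to be a valid
+// container_range_t handed down from the parent stage):
+//   item_iterator_t<node_type_t::LEAF> iter(range);
+//   while (true) {
+//     const ns_oid_view_t& ns_oid = iter.get_key();
+//     container_range_t nxt = iter.get_nxt_container();
+//     // ... descend into the next stage ...
+//     if (!iter.has_next()) break;
+//     ++iter;
+//   }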
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+class item_iterator_t<NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ Appender(NodeExtentMutable*, const item_iterator_t&, bool open);
+ bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items);
+ char* wrap() { return p_append; }
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* _p_append);
+
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ char* p_offset_while_open;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
new file mode 100644
index 000000000..4bf717dc2
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "key_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void string_key_view_t::append_str(
+ NodeExtentMutable& mut, std::string_view str, char*& p_append)
+{
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ mut.copy_in_absolute(p_append, len);
+ p_append -= len;
+ mut.copy_in_absolute(p_append, str.data(), len);
+}
+
+void string_key_view_t::append_dedup(
+ NodeExtentMutable& mut, const Type& dedup_type, char*& p_append)
+{
+ p_append -= sizeof(string_size_t);
+ if (dedup_type == Type::MIN) {
+ mut.copy_in_absolute(p_append, MARKER_MIN);
+ } else if (dedup_type == Type::MAX) {
+ mut.copy_in_absolute(p_append, MARKER_MAX);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
new file mode 100644
index 000000000..fcd485355
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
@@ -0,0 +1,910 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <limits>
+#include <optional>
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+using shard_t = int8_t;
+using pool_t = int64_t;
+// Note: this is the reversed version of the object hash
+using crush_hash_t = uint32_t;
+using snap_t = uint64_t;
+using gen_t = uint64_t;
+static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id));
+static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool));
+static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_bitwise_key_u32()));
+static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val));
+static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation));
+
+constexpr auto MAX_SHARD = std::numeric_limits<shard_t>::max();
+constexpr auto MAX_POOL = std::numeric_limits<pool_t>::max();
+constexpr auto MAX_CRUSH = std::numeric_limits<crush_hash_t>::max();
+constexpr auto MAX_SNAP = std::numeric_limits<snap_t>::max();
+constexpr auto MAX_GEN = std::numeric_limits<gen_t>::max();
+
+class NodeExtentMutable;
+class key_view_t;
+class key_hobj_t;
+enum class KeyT { VIEW, HOBJ };
+template <KeyT> struct _full_key_type;
+template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; };
+template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; };
+template <KeyT type>
+using full_key_t = typename _full_key_type<type>::type;
+
+static laddr_t get_lba_hint(shard_t shard, pool_t pool, crush_hash_t crush) {
+ // FIXME: It is possible that PGs from different pools share the same prefix
+  // if the mask 0xFF is not long enough, resulting in unexpected transaction
+ // conflicts.
+  return ((uint64_t)(shard & 0xFF)<<56 |
+ (uint64_t)(pool & 0xFF)<<48 |
+ (uint64_t)(crush )<<16);
+}
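+
+// Bit-layout sketch of the hint above (only the low 8 bits of pool survive,
+// see the FIXME): [63:56]=shard, [55:48]=pool, [47:16]=crush, [15:0]=0, e.g.
+//   get_lba_hint(1, 2, 0x12345678) == 0x0102123456780000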
+
+struct node_offset_packed_t {
+ node_offset_t value;
+} __attribute__((packed));
+
+/**
+ * string_key_view_t
+ *
+ * The layout to store a char array as an oid or an ns string, which may be
+ * compressed.
+ *
+ * (TODO) If compressed, the physical block only stores an unsigned int of
+ * string_size_t, with value MARKER_MIN denoting Type::MIN, and value
+ * MARKER_MAX denoting Type::MAX.
+ *
+ * If not compressed (Type::STR), the physical block stores the char array and
+ * a valid string_size_t value.
+ */
+struct string_key_view_t {
+ enum class Type {MIN, STR, MAX};
+ static constexpr auto MARKER_MAX = std::numeric_limits<string_size_t>::max();
+ static constexpr auto MARKER_MIN = std::numeric_limits<string_size_t>::max() - 1;
+ static constexpr auto VALID_UPPER_BOUND = std::numeric_limits<string_size_t>::max() - 2;
+ static bool is_valid_size(size_t size) {
+ return size <= VALID_UPPER_BOUND;
+ }
+
+ string_key_view_t(const char* p_end) {
+ p_length = p_end - sizeof(string_size_t);
+ std::memcpy(&length, p_length, sizeof(string_size_t));
+ if (is_valid_size(length)) {
+ auto _p_key = p_length - length;
+ p_key = static_cast<const char*>(_p_key);
+ } else {
+ assert(length == MARKER_MAX || length == MARKER_MIN);
+ p_key = nullptr;
+ }
+ }
+ Type type() const {
+ if (length == MARKER_MIN) {
+ return Type::MIN;
+ } else if (length == MARKER_MAX) {
+ return Type::MAX;
+ } else {
+ assert(is_valid_size(length));
+ return Type::STR;
+ }
+ }
+ const char* p_start() const {
+ if (p_key) {
+ return p_key;
+ } else {
+ return p_length;
+ }
+ }
+ const char* p_next_end() const {
+ if (p_key) {
+ return p_start();
+ } else {
+ return p_length + sizeof(string_size_t);
+ }
+ }
+ node_offset_t size() const {
+ size_t ret = length + sizeof(string_size_t);
+ assert(ret < MAX_NODE_SIZE);
+ return ret;
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return length;
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return sizeof(string_size_t);
+ }
+
+ std::string_view to_string_view() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return {p_key, length};
+ }
+ bool operator==(const string_key_view_t& x) const {
+ if (type() == x.type() && type() != Type::STR)
+ return true;
+ if (type() != x.type())
+ return false;
+ if (length != x.length)
+ return false;
+ return (memcmp(p_key, x.p_key, length) == 0);
+ }
+ bool operator!=(const string_key_view_t& x) const { return !(*this == x); }
+
+ void reset_to(const char* origin_base,
+ const char* new_base,
+ extent_len_t node_size) {
+ reset_ptr(p_key, origin_base, new_base, node_size);
+ reset_ptr(p_length, origin_base, new_base, node_size);
+#ifndef NDEBUG
+ string_size_t current_length;
+ std::memcpy(&current_length, p_length, sizeof(string_size_t));
+ assert(length == current_length);
+#endif
+ }
+
+ static void append_str(
+ NodeExtentMutable&, std::string_view, char*& p_append);
+
+ static void test_append_str(std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ p_append -= len;
+ std::memcpy(p_append, str.data(), len);
+ }
+
+ static void append_dedup(
+ NodeExtentMutable&, const Type& dedup_type, char*& p_append);
+
+ static void test_append_dedup(const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ string_size_t len;
+ if (dedup_type == Type::MIN) {
+ len = MARKER_MIN;
+ } else if (dedup_type == Type::MAX) {
+ len = MARKER_MAX;
+ } else {
+ ceph_abort("impossible path");
+ }
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ }
+
+ const char* p_key;
+ const char* p_length;
+ // TODO: remove if p_length is aligned
+ string_size_t length;
+};
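+
+// A minimal round-trip sketch for the layout above, using the in-memory
+// test_append_str() helper (no NodeExtentMutable involved); the function name
+// and buffer size are only illustrative.
+inline void example_string_key_view_roundtrip() {
+  char buf[64];
+  char* p_append = buf + sizeof(buf);
+  string_key_view_t::test_append_str("obj", p_append);
+  // the view decodes from the end: [..."obj"][string_size_t{3}] <- p_end
+  string_key_view_t view(buf + sizeof(buf));
+  assert(view.type() == string_key_view_t::Type::STR);
+  assert(view.to_string_view() == "obj");
+}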
+
+/**
+ * string_view_masked_t
+ *
+ * A common class that hides the underlying string implementation, whether it
+ * is a string_key_view_t (maybe compressed), a string/string_view, or a
+ * compressed string, and provides a consistent interface for compare, print,
+ * convert and append operations.
+ */
+class string_view_masked_t {
+ public:
+ using Type = string_key_view_t::Type;
+ explicit string_view_masked_t(const string_key_view_t& index)
+ : type{index.type()} {
+ if (type == Type::STR) {
+ view = index.to_string_view();
+ }
+ }
+ explicit string_view_masked_t(std::string_view str)
+ : type{Type::STR}, view{str} {
+ assert(string_key_view_t::is_valid_size(view.size()));
+ }
+
+ Type get_type() const { return type; }
+ std::string_view to_string_view() const {
+ assert(get_type() == Type::STR);
+ return view;
+ }
+ string_size_t size() const {
+ assert(get_type() == Type::STR);
+ assert(string_key_view_t::is_valid_size(view.size()));
+ return view.size();
+ }
+ bool operator==(const string_view_masked_t& x) const {
+ if (get_type() == x.get_type() && get_type() != Type::STR)
+ return true;
+ if (get_type() != x.get_type())
+ return false;
+ if (size() != x.size())
+ return false;
+ return (memcmp(view.data(), x.view.data(), size()) == 0);
+ }
+ auto operator<=>(std::string_view rhs) const {
+ using Type = string_view_masked_t::Type;
+ assert(string_key_view_t::is_valid_size(rhs.size()));
+ auto lhs_type = get_type();
+ if (lhs_type == Type::MIN) {
+ return std::strong_ordering::less;
+ } else if (lhs_type == Type::MAX) {
+ return std::strong_ordering::greater;
+    } else { // lhs_type == Type::STR
+ assert(string_key_view_t::is_valid_size(size()));
+ return to_string_view() <=> rhs;
+ }
+ }
+ void encode(ceph::bufferlist& bl) const {
+ if (get_type() == Type::MIN) {
+ ceph::encode(string_key_view_t::MARKER_MIN, bl);
+ } else if (get_type() == Type::MAX) {
+ ceph::encode(string_key_view_t::MARKER_MAX, bl);
+ } else {
+ ceph::encode(size(), bl);
+ ceph::encode_nohead(view, bl);
+ }
+ }
+ static auto min() { return string_view_masked_t{Type::MIN}; }
+ static auto max() { return string_view_masked_t{Type::MAX}; }
+ static string_view_masked_t decode(
+ std::string& str_storage, ceph::bufferlist::const_iterator& delta) {
+ string_size_t size;
+ ceph::decode(size, delta);
+ if (size == string_key_view_t::MARKER_MIN) {
+ return min();
+ } else if (size == string_key_view_t::MARKER_MAX) {
+ return max();
+ } else {
+ ceph::decode_nohead(size, str_storage, delta);
+ return string_view_masked_t(str_storage);
+ }
+ }
+
+ private:
+ explicit string_view_masked_t(Type type)
+ : type{type} {}
+
+ Type type;
+ std::string_view view;
+};
+
+inline auto operator<=>(const string_view_masked_t& l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ auto l_type = l.get_type();
+ auto r_type = r.get_type();
+ if (l_type == Type::STR && r_type == Type::STR) {
+ assert(string_key_view_t::is_valid_size(l.size()));
+ assert(string_key_view_t::is_valid_size(r.size()));
+ return l.to_string_view() <=> r.to_string_view();
+ } else if (l_type == r_type) {
+ return std::strong_ordering::equal;
+ } else if (l_type == Type::MIN || r_type == Type::MAX) {
+ return std::strong_ordering::less;
+ } else { // l_type == Type::MAX || r_type == Type::MIN
+ return std::strong_ordering::greater;
+ }
+}
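+
+// Ordering sketch (the comparisons are rewritten from the spaceship operator
+// above): MIN sorts before every string and MAX after it, while two STR
+// values compare by content, e.g.
+//   string_view_masked_t::min() < string_view_masked_t{"a"}
+//   string_view_masked_t{"a"}   < string_view_masked_t{"b"}
+//   string_view_masked_t{"b"}   < string_view_masked_t::max()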
+
+inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) {
+ using Type = string_view_masked_t::Type;
+ auto type = masked.get_type();
+ if (type == Type::MIN) {
+ return os << "MIN";
+ } else if (type == Type::MAX) {
+ return os << "MAX";
+ } else { // type == Type::STR
+ auto view = masked.to_string_view();
+ if (view.length() <= 12) {
+ os << "\"" << view << "\"";
+ } else {
+ os << "\"" << std::string_view(view.data(), 4) << ".."
+ << std::string_view(view.data() + view.length() - 2, 2)
+ << "/" << view.length() << "B\"";
+ }
+ return os;
+ }
+}
+
+struct ns_oid_view_t {
+ using Type = string_key_view_t::Type;
+
+ ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {}
+ Type type() const { return oid.type(); }
+ const char* p_start() const { return oid.p_start(); }
+ node_offset_t size() const {
+ if (type() == Type::STR) {
+ size_t ret = nspace.size() + oid.size();
+ assert(ret < MAX_NODE_SIZE);
+ return ret;
+ } else {
+ return sizeof(string_size_t);
+ }
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ return nspace.size_logical() + oid.size_logical();
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return nspace.size_overhead() + oid.size_overhead();
+ }
+ bool operator==(const ns_oid_view_t& x) const {
+ return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} &&
+ string_view_masked_t{oid} == string_view_masked_t{x.oid});
+ }
+
+ void reset_to(const char* origin_base,
+ const char* new_base,
+ extent_len_t node_size) {
+ nspace.reset_to(origin_base, new_base, node_size);
+ oid.reset_to(origin_base, new_base, node_size);
+ }
+
+ template <typename Key>
+ static node_offset_t estimate_size(const Key& key);
+
+ template <typename Key>
+ static void append(NodeExtentMutable&,
+ const Key& key,
+ char*& p_append);
+
+ static void append(NodeExtentMutable& mut,
+ const ns_oid_view_t& view,
+ char*& p_append) {
+ if (view.type() == Type::STR) {
+ string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append);
+ string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, view.type(), p_append);
+ }
+ }
+
+ template <typename Key>
+ static void test_append(const Key& key, char*& p_append);
+
+ string_key_view_t nspace;
+ string_key_view_t oid;
+};
+inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) {
+ return os << string_view_masked_t{ns_oid.nspace} << ","
+ << string_view_masked_t{ns_oid.oid};
+}
+inline auto operator<=>(const ns_oid_view_t& l, const ns_oid_view_t& r) {
+ auto ret = (string_view_masked_t{l.nspace} <=> string_view_masked_t{r.nspace});
+ if (ret != 0)
+ return ret;
+ return string_view_masked_t{l.oid} <=> string_view_masked_t{r.oid};
+}
+
+inline const ghobject_t _MIN_OID() {
+ assert(ghobject_t().is_min());
+ // don't extern _MIN_OID
+ return ghobject_t();
+}
+
+/*
+ * Unfortunately the ghobject_t representation used as the tree key doesn't
+ * have a max field, so we define our own _MAX_OID and translate it from/to
+ * ghobject_t::get_max() if necessary.
+ */
+inline const ghobject_t _MAX_OID() {
+ auto ret = ghobject_t(shard_id_t(MAX_SHARD), MAX_POOL, MAX_CRUSH,
+ "MAX", "MAX", MAX_SNAP, MAX_GEN);
+ assert(ret.hobj.get_hash() == ret.hobj.get_bitwise_key_u32());
+ return ret;
+}
+
+// a valid key stored in the tree should be in the open range (_MIN_OID, _MAX_OID)
+template <typename Key>
+bool is_valid_key(const Key& key);
+
+/**
+ * key_hobj_t
+ *
+ * A specialized implementation of a full_key_t storing a ghobject_t passed
+ * in by the user.
+ */
+class key_hobj_t {
+ public:
+ explicit key_hobj_t(const ghobject_t& _ghobj) {
+ if (_ghobj.is_max()) {
+ ghobj = _MAX_OID();
+ } else {
+ // including when _ghobj.is_min()
+ ghobj = _ghobj;
+ }
+ // I can be in the range of [_MIN_OID, _MAX_OID]
+ assert(ghobj >= _MIN_OID());
+ assert(ghobj <= _MAX_OID());
+ }
+ /*
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return ghobj.shard_id;
+ }
+ pool_t pool() const {
+ return ghobj.hobj.pool;
+ }
+ crush_hash_t crush() const {
+ // Note: this is the reversed version of the object hash
+ return ghobj.hobj.get_bitwise_key_u32();
+ }
+ laddr_t get_hint() const {
+ return get_lba_hint(shard(), pool(), crush());
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.nspace;
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{nspace()};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.oid.name;
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{oid()};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return _dedup_type;
+ }
+ snap_t snap() const {
+ return ghobj.hobj.snap;
+ }
+ gen_t gen() const {
+ return ghobj.generation;
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_hobj(" << (int)shard() << ","
+ << pool() << ",0x" << std::hex << crush() << std::dec << "; "
+ << string_view_masked_t{nspace()} << ","
+ << string_view_masked_t{oid()} << "; "
+ << snap() << "," << gen() << ")";
+ return os;
+ }
+
+ bool is_valid() const {
+ return is_valid_key(*this);
+ }
+
+ static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) {
+ shard_t shard;
+ ceph::decode(shard, delta);
+ pool_t pool;
+ ceph::decode(pool, delta);
+ // Note: this is the reversed version of the object hash
+ crush_hash_t crush;
+ ceph::decode(crush, delta);
+ std::string nspace;
+ [[maybe_unused]] auto nspace_masked = string_view_masked_t::decode(nspace, delta);
+ // TODO(cross-node string dedup)
+ assert(nspace_masked.get_type() == string_view_masked_t::Type::STR);
+ std::string oid;
+ [[maybe_unused]] auto oid_masked = string_view_masked_t::decode(oid, delta);
+ // TODO(cross-node string dedup)
+ assert(oid_masked.get_type() == string_view_masked_t::Type::STR);
+ snap_t snap;
+ ceph::decode(snap, delta);
+ gen_t gen;
+ ceph::decode(gen, delta);
+ return key_hobj_t(ghobject_t(
+ shard_id_t(shard), pool, crush, nspace, oid, snap, gen));
+ }
+
+ private:
+ ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR;
+ ghobject_t ghobj;
+};
+inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) {
+ return key.dump(os);
+}
+
+struct shard_pool_t;
+struct crush_t;
+struct shard_pool_crush_t;
+struct snap_gen_t;
+
+/**
+ * key_view_t
+ *
+ * A specialized implementation of a full_key_t pointing to the locations
+ * storing the full key in a tree node.
+ */
+class key_view_t {
+ public:
+ /**
+ * common interfaces as a full_key_t
+ */
+ inline shard_t shard() const;
+ inline pool_t pool() const;
+ inline crush_hash_t crush() const;
+ laddr_t get_hint() const {
+ return get_lba_hint(shard(), pool(), crush());
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().nspace.to_string_view();
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().nspace};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().oid.to_string_view();
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().oid};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return ns_oid_view().type();
+ }
+ inline snap_t snap() const;
+ inline gen_t gen() const;
+
+ /**
+ * key_view_t specific interfaces
+ */
+ bool has_shard_pool() const {
+ return p_shard_pool != nullptr;
+ }
+ bool has_crush() const {
+ return p_crush != nullptr;
+ }
+ bool has_ns_oid() const {
+ return p_ns_oid.has_value();
+ }
+ bool has_snap_gen() const {
+ return p_snap_gen != nullptr;
+ }
+
+ const shard_pool_t& shard_pool_packed() const {
+ assert(has_shard_pool());
+ return *p_shard_pool;
+ }
+ const crush_t& crush_packed() const {
+ assert(has_crush());
+ return *p_crush;
+ }
+ const ns_oid_view_t& ns_oid_view() const {
+ assert(has_ns_oid());
+ return *p_ns_oid;
+ }
+ const snap_gen_t& snap_gen_packed() const {
+ assert(has_snap_gen());
+ return *p_snap_gen;
+ }
+
+ size_t size_logical() const {
+ return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) +
+ sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical();
+ }
+
+ ghobject_t to_ghobj() const {
+ assert(is_valid_key(*this));
+ return ghobject_t(
+ shard_id_t(shard()), pool(), crush(),
+ std::string(nspace()), std::string(oid()), snap(), gen());
+ }
+
+ void replace(const crush_t& key) { p_crush = &key; }
+ void set(const crush_t& key) {
+ assert(!has_crush());
+ replace(key);
+ }
+ inline void replace(const shard_pool_crush_t& key);
+ inline void set(const shard_pool_crush_t& key);
+ void replace(const ns_oid_view_t& key) { p_ns_oid = key; }
+ void set(const ns_oid_view_t& key) {
+ assert(!has_ns_oid());
+ replace(key);
+ }
+ void replace(const snap_gen_t& key) { p_snap_gen = &key; }
+ void set(const snap_gen_t& key) {
+ assert(!has_snap_gen());
+ replace(key);
+ }
+
+ void reset_to(const char* origin_base,
+ const char* new_base,
+ extent_len_t node_size) {
+ if (p_shard_pool != nullptr) {
+ reset_ptr(p_shard_pool, origin_base, new_base, node_size);
+ }
+ if (p_crush != nullptr) {
+ reset_ptr(p_crush, origin_base, new_base, node_size);
+ }
+ if (p_ns_oid.has_value()) {
+ p_ns_oid->reset_to(origin_base, new_base, node_size);
+ }
+ if (p_snap_gen != nullptr) {
+ reset_ptr(p_snap_gen, origin_base, new_base, node_size);
+ }
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_view(";
+ if (has_shard_pool()) {
+ os << (int)shard() << "," << pool() << ",";
+ } else {
+ os << "X,X,";
+ }
+ if (has_crush()) {
+ os << "0x" << std::hex << crush() << std::dec << "; ";
+ } else {
+ os << "X; ";
+ }
+ if (has_ns_oid()) {
+ os << ns_oid_view() << "; ";
+ } else {
+ os << "X,X; ";
+ }
+ if (has_snap_gen()) {
+ os << snap() << "," << gen() << ")";
+ } else {
+ os << "X,X)";
+ }
+ return os;
+ }
+
+ private:
+ const shard_pool_t* p_shard_pool = nullptr;
+ const crush_t* p_crush = nullptr;
+ std::optional<ns_oid_view_t> p_ns_oid;
+ const snap_gen_t* p_snap_gen = nullptr;
+};
+
+template<typename T>
+concept IsFullKey = std::same_as<T, key_hobj_t> || std::same_as<T, key_view_t>;
+
+// TODO: consider alignments
+struct shard_pool_t {
+ auto operator<=>(const shard_pool_t&) const = default;
+
+ pool_t pool() const { return _pool; }
+
+ template <IsFullKey Key>
+ static shard_pool_t from_key(const Key& key) {
+ if constexpr (std::same_as<Key, key_view_t>) {
+ return key.shard_pool_packed();
+ } else {
+ return {key.shard(), key.pool()};
+ }
+ }
+
+ shard_t shard;
+ pool_t _pool;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) {
+ return os << (int)sp.shard << "," << sp.pool();
+}
+
+// Note: this is the reversed version of the object hash
+struct crush_t {
+ auto operator<=>(const crush_t&) const = default;
+
+ template <IsFullKey Key>
+ static crush_t from_key(const Key& key) {
+ if constexpr (std::same_as<Key, key_view_t>) {
+ return key.crush_packed();
+ } else {
+ return {key.crush()};
+ }
+ }
+
+ crush_hash_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const crush_t& c) {
+ return os << "0x" << std::hex << c.crush << std::dec;
+}
+
+struct shard_pool_crush_t {
+ auto operator<=>(const shard_pool_crush_t&) const = default;
+
+ template <IsFullKey Key>
+ static shard_pool_crush_t from_key(const Key& key) {
+ return {shard_pool_t::from_key(key), crush_t::from_key(key)};
+ }
+
+ shard_pool_t shard_pool;
+ crush_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) {
+ return os << spc.shard_pool << ",0x" << std::hex << spc.crush << std::dec;
+}
+
+struct snap_gen_t {
+ auto operator<=>(const snap_gen_t&) const = default;
+
+ template <IsFullKey Key>
+ static snap_gen_t from_key(const Key& key) {
+ if constexpr (std::same_as<Key, key_view_t>) {
+ return key.snap_gen_packed();
+ } else {
+ return {key.snap(), key.gen()};
+ }
+ }
+
+ snap_t snap;
+ gen_t gen;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) {
+ return os << sg.snap << "," << sg.gen;
+}
+
+shard_t key_view_t::shard() const {
+ return shard_pool_packed().shard;
+}
+
+pool_t key_view_t::pool() const {
+ return shard_pool_packed().pool();
+}
+
+crush_hash_t key_view_t::crush() const {
+ return crush_packed().crush;
+}
+
+snap_t key_view_t::snap() const {
+ return snap_gen_packed().snap;
+}
+
+gen_t key_view_t::gen() const {
+ return snap_gen_packed().gen;
+}
+
+void key_view_t::replace(const shard_pool_crush_t& key) {
+ p_shard_pool = &key.shard_pool;
+}
+
+void key_view_t::set(const shard_pool_crush_t& key) {
+ set(key.crush);
+ assert(!has_shard_pool());
+ replace(key);
+}
+
+template <IsFullKey Key>
+void encode_key(const Key& key, ceph::bufferlist& bl) {
+ ceph::encode(key.shard(), bl);
+ ceph::encode(key.pool(), bl);
+ ceph::encode(key.crush(), bl);
+ key.nspace_masked().encode(bl);
+ key.oid_masked().encode(bl);
+ ceph::encode(key.snap(), bl);
+ ceph::encode(key.gen(), bl);
+}
+
+template<IsFullKey LHS, IsFullKey RHS>
+std::strong_ordering operator<=>(const LHS& lhs, const RHS& rhs) noexcept {
+ auto ret = lhs.shard() <=> rhs.shard();
+ if (ret != 0)
+ return ret;
+ ret = lhs.pool() <=> rhs.pool();
+ if (ret != 0)
+ return ret;
+ ret = lhs.crush() <=> rhs.crush();
+ if (ret != 0)
+ return ret;
+ ret = lhs.nspace() <=> rhs.nspace();
+ if (ret != 0)
+ return ret;
+ ret = lhs.oid() <=> rhs.oid();
+ if (ret != 0)
+ return ret;
+ ret = lhs.snap() <=> rhs.snap();
+ if (ret != 0)
+ return ret;
+ return lhs.gen() <=> rhs.gen();
+}
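+
+// Comparison sketch: full keys order lexicographically over
+// (shard, pool, crush, nspace, oid, snap, gen), so two keys differing only in
+// the generation are ordered by gen() alone, e.g. (values are arbitrary)
+//   key_hobj_t a{ghobject_t{shard_id_t{0}, 1, 0x1234, "ns", "o", 10, 1}};
+//   key_hobj_t b{ghobject_t{shard_id_t{0}, 1, 0x1234, "ns", "o", 10, 2}};
+//   assert(a < b);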
+
+template <typename Key>
+bool is_valid_key(const Key& key) {
+ static_assert(IsFullKey<Key>);
+ return (key > key_hobj_t(ghobject_t()) &&
+ key < key_hobj_t(ghobject_t::get_max()));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) {
+ return key.dump(os);
+}
+
+template <IsFullKey T>
+auto operator<=>(const T& key, const shard_pool_t& target) {
+ auto ret = key.shard() <=> target.shard;
+ if (ret != 0)
+ return ret;
+ return key.pool() <=> target.pool();
+}
+
+template <IsFullKey T>
+auto operator<=>(const T& key, const crush_t& target) {
+ return key.crush() <=> target.crush;
+}
+
+template <IsFullKey T>
+auto operator<=>(const T& key, const shard_pool_crush_t& target) {
+ auto ret = key <=> target.shard_pool;
+ if (ret != 0)
+ return ret;
+ return key <=> target.crush;
+}
+
+template <IsFullKey T>
+auto operator<=>(const T& key, const ns_oid_view_t& target) {
+ auto ret = key.nspace() <=> string_view_masked_t{target.nspace};
+ if (ret != 0)
+ return ret;
+ return key.oid() <=> string_view_masked_t{target.oid};
+}
+
+template <IsFullKey T>
+auto operator<=>(const T& key, const snap_gen_t& target) {
+ auto ret = key.snap() <=> target.snap;
+ if (ret != 0)
+ return ret;
+ return key.gen() <=> target.gen;
+}
+
+template <IsFullKey LHS, typename RHS>
+bool operator==(LHS lhs, RHS rhs) {
+ return lhs <=> rhs == 0;
+}
+
+template <typename Key>
+node_offset_t ns_oid_view_t::estimate_size(const Key& key) {
+ static_assert(IsFullKey<Key>);
+ if constexpr (std::same_as<Key, key_view_t>) {
+ return key.ns_oid_view().size();
+ } else {
+ if (key.dedup_type() != Type::STR) {
+ // size after deduplication
+ return sizeof(string_size_t);
+ } else {
+ return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size();
+ }
+ }
+}
+
+template <typename Key>
+void ns_oid_view_t::append(
+ NodeExtentMutable& mut, const Key& key, char*& p_append) {
+ static_assert(IsFullKey<Key>);
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::append_str(mut, key.nspace(), p_append);
+ string_key_view_t::append_str(mut, key.oid(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, key.dedup_type(), p_append);
+ }
+}
+
+template <typename Key>
+void ns_oid_view_t::test_append(const Key& key, char*& p_append) {
+ static_assert(IsFullKey<Key>);
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::test_append_str(key.nspace(), p_append);
+ string_key_view_t::test_append_str(key.oid(), p_append);
+ } else {
+ string_key_view_t::test_append_dedup(key.dedup_type(), p_append);
+ }
+}
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::onode::key_hobj_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::onode::key_view_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
new file mode 100644
index 000000000..3ed401c37
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
@@ -0,0 +1,420 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+#include "node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#define NODE_T node_extent_t<FieldType, NODE_TYPE>
+#define NODE_INST(FT, NT) node_extent_t<FT, NT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+const char* NODE_T::p_left_bound() const
+{
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ // N3 internal node doesn't have the right part
+ return nullptr;
+ } else {
+ auto ret = p_start() +
+ fields().get_item_end_offset(keys(), node_size);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ ret -= sizeof(laddr_t);
+ }
+ }
+ return ret;
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::size_to_nxt_at(index_t index) const
+{
+ assert(index < keys());
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ return FieldType::estimate_insert_one();
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ auto p_end = p_start() +
+ p_fields->get_item_end_offset(index, node_size);
+ return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size();
+ } else {
+ ceph_abort("N3 node is not nested");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+container_range_t NODE_T::get_nxt_container(index_t index) const
+{
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("N3 internal node doesn't have the right part");
+ } else {
+ auto item_start_offset = p_fields->get_item_start_offset(
+ index, node_size);
+ auto item_end_offset = p_fields->get_item_end_offset(
+ index, node_size);
+ assert(item_start_offset < item_end_offset);
+ auto item_p_start = p_start() + item_start_offset;
+ auto item_p_end = p_start() + item_end_offset;
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ // range for sub_items_t<NODE_TYPE>
+ item_p_end = ns_oid_view_t(item_p_end).p_start();
+ assert(item_p_start < item_p_end);
+ } else {
+ // range for item_iterator_t<NODE_TYPE>
+ }
+ return {{item_p_start, item_p_end}, node_size};
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level)
+{
+ node_header_t::bootstrap_extent(
+ mut, field_type, node_type, is_level_tail, level);
+ mut.copy_in_relative(
+ sizeof(node_header_t), typename FieldType::num_keys_t(0u));
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_is_level_tail(
+ NodeExtentMutable& mut, const node_extent_t& extent, bool value)
+{
+ assert(mut.get_length() == extent.node_size);
+ assert(mut.get_read() == extent.p_start());
+ node_header_t::update_is_level_tail(mut, extent.p_fields->header, value);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <IsFullKey Key>
+memory_range_t NODE_T::insert_prefix_at(
+ NodeExtentMutable& mut, const node_extent_t& node, const Key& key,
+ index_t index, node_offset_t size, const char* p_left_bound)
+{
+ assert(mut.get_length() == node.node_size);
+ assert(mut.get_read() == node.p_start());
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ assert(index <= node.keys());
+ assert(p_left_bound == node.p_left_bound());
+ assert(size > FieldType::estimate_insert_one());
+ auto size_right = size - FieldType::estimate_insert_one();
+ const char* p_insert = node.p_start() +
+ node.fields().get_item_end_offset(index, mut.get_length());
+ const char* p_insert_front = p_insert - size_right;
+ FieldType::insert_at(mut, key, node.fields(), index, size_right);
+ mut.shift_absolute(p_left_bound,
+ p_insert - p_left_bound,
+ -(int)size_right);
+ return {p_insert_front, p_insert};
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+}
+#define IPA_TEMPLATE(FT, NT, Key) \
+ template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<Key>( \
+ NodeExtentMutable&, const node_extent_t&, const Key&, \
+ index_t, node_offset_t, const char*)
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, key_view_t);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, key_view_t);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, key_view_t);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, key_view_t);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, key_view_t);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, key_view_t);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, key_hobj_t);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, key_hobj_t);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, key_hobj_t);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, key_hobj_t);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, key_hobj_t);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, key_hobj_t);
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_size_at(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change)
+{
+ assert(mut.get_length() == node.node_size);
+ assert(mut.get_read() == node.p_start());
+ assert(index < node.keys());
+ FieldType::update_size_at(mut, node.fields(), index, change);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_until(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index)
+{
+ assert(mut.get_length() == node.node_size);
+ assert(mut.get_read() == node.p_start());
+ assert(!node.is_level_tail());
+ auto keys = node.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_at(
+ NodeExtentMutable& mut, const node_extent_t& node,
+ index_t index, node_offset_t trimmed)
+{
+ assert(mut.get_length() == node.node_size);
+ assert(mut.get_read() == node.p_start());
+ assert(!node.is_level_tail());
+ assert(index < node.keys());
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ extent_len_t node_size = mut.get_length();
+ node_offset_t offset = node.p_fields->get_item_start_offset(
+ index, node_size);
+ size_t new_offset = offset + trimmed;
+ assert(new_offset < node.p_fields->get_item_end_offset(index, node_size));
+ mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)),
+ node_offset_t(new_offset));
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index + 1));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::erase_at(
+ NodeExtentMutable& mut, const node_extent_t& node,
+ index_t index, const char* p_left_bound)
+{
+ assert(mut.get_length() == node.node_size);
+ assert(mut.get_read() == node.p_start());
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ assert(node.keys() > 0);
+ assert(index < node.keys());
+ assert(p_left_bound == node.p_left_bound());
+ return FieldType::erase_at(mut, node.fields(), index, p_left_bound);
+ } else {
+ ceph_abort("not implemented");
+ }
+}
+
+#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT)
+NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF);
+NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF);
+
+#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+APPEND_T::Appender(NodeExtentMutable* p_mut, const node_extent_t& node, bool open)
+ : p_mut{p_mut}, p_start{p_mut->get_write()}
+{
+ assert(p_start == node.p_start());
+ assert(node.keys());
+ assert(node.node_size == p_mut->get_length());
+ extent_len_t node_size = node.node_size;
+ if (open) {
+ // seek as open_nxt()
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ p_append_left = p_start + node.fields().get_key_start_offset(
+ node.keys() - 1, node_size);
+ p_append_left += sizeof(typename FieldType::key_t);
+ p_append_right = p_start +
+ node.fields().get_item_end_offset(node.keys() - 1,
+ node_size);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible path");
+ }
+ num_keys = node.keys() - 1;
+ } else {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ std::ignore = node_size;
+ ceph_abort("not implemented");
+ } else {
+ p_append_left = p_start + node.fields().get_key_start_offset(
+ node.keys(), node_size);
+ p_append_right = p_start +
+ node.fields().get_item_end_offset(node.keys(),
+ node_size);
+ }
+ num_keys = node.keys();
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(const node_extent_t& src, index_t from, index_t items)
+{
+ assert(from <= src.keys());
+ if (p_src == nullptr) {
+ p_src = &src;
+ } else {
+ assert(p_src == &src);
+ }
+ assert(p_src->node_size == p_mut->get_length());
+ extent_len_t node_size = src.node_size;
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ num_keys += items;
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ std::ignore = node_size;
+ ceph_abort("not implemented");
+ } else {
+ // append left part forwards
+ node_offset_t offset_left_start = src.fields().get_key_start_offset(
+ from, node_size);
+ node_offset_t offset_left_end = src.fields().get_key_start_offset(
+ from + items, node_size);
+ node_offset_t left_size = offset_left_end - offset_left_start;
+ if (num_keys == 0) {
+ // no need to adjust offset
+ assert(from == 0);
+ assert(p_start + offset_left_start == p_append_left);
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ } else {
+ node_offset_t step_size = FieldType::estimate_insert_one();
+ extent_len_t offset_base = src.fields().get_item_end_offset(
+ from, node_size);
+ int offset_change = p_append_right - p_start - offset_base;
+ auto p_offset_dst = p_append_left;
+ if constexpr (FIELD_TYPE != field_type_t::N2) {
+ // copy keys
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ // point to offset for update
+ p_offset_dst += sizeof(typename FieldType::key_t);
+ }
+ for (auto i = from; i < from + items; ++i) {
+ int new_offset = src.fields().get_item_start_offset(i, node_size) +
+ offset_change;
+ assert(new_offset > 0);
+ assert(new_offset < (int)node_size);
+ p_mut->copy_in_absolute(p_offset_dst, node_offset_t(new_offset));
+ p_offset_dst += step_size;
+ }
+ assert(p_append_left + left_size + sizeof(typename FieldType::key_t) ==
+ p_offset_dst);
+ }
+ p_append_left += left_size;
+
+ // append right part backwards
+ auto offset_right_start = src.fields().get_item_end_offset(
+ from + items, node_size);
+ auto offset_right_end = src.fields().get_item_end_offset(
+ from, node_size);
+ int right_size = offset_right_end - offset_right_start;
+ assert(right_size > 0);
+ assert(right_size < (int)node_size);
+ p_append_right -= right_size;
+ p_mut->copy_in_absolute(p_append_right,
+ src.p_start() + offset_right_start, node_offset_t(right_size));
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(
+ const full_key_t<KT>& key, const value_input_t& value, const value_t*& p_value)
+{
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("should not happen");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key)
+{
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::append_key(*p_mut, partial_key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::append_key(*p_mut, partial_key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key)
+{
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::append_key(*p_mut, key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::append_key(*p_mut, key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+char* APPEND_T::wrap()
+{
+ assert(p_append_left <= p_append_right);
+ assert(p_src);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (p_src->is_level_tail()) {
+ laddr_t tail_value = p_src->get_end_p_laddr()->value;
+ p_append_right -= sizeof(laddr_t);
+ assert(p_append_left <= p_append_right);
+ p_mut->copy_in_absolute(p_append_right, tail_value);
+ }
+ }
+ p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys);
+ return p_append_left;
+}
+
+#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT>
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
new file mode 100644
index 000000000..5615998f8
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * node_extent_t
+ *
+ * The top indexing stage implementation for node N0/N1/N2/N3. It implements
+ * the staged contract as an indexable container and provides access to the
+ * node header.
+ *
+ * The specific field layouts are defined by FieldType, which is one of
+ * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and
+ * leaf_fields_3_t. See node_stage_layout.h for diagrams.
+ */
+template <typename FieldType, node_type_t _NODE_TYPE>
+class node_extent_t {
+ public:
+ using value_input_t = value_input_type_t<_NODE_TYPE>;
+ using value_t = value_type_t<_NODE_TYPE>;
+ using num_keys_t = typename FieldType::num_keys_t;
+ static constexpr node_type_t NODE_TYPE = _NODE_TYPE;
+ static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE;
+
+ // TODO: remove
+ node_extent_t() = default;
+
+ node_extent_t(const FieldType* p_fields, extent_len_t node_size)
+ : p_fields{p_fields}, node_size{node_size} {
+ assert(is_valid_node_size(node_size));
+ validate(*p_fields);
+ }
+
+ const char* p_start() const { return fields_start(*p_fields); }
+
+ bool is_level_tail() const { return p_fields->is_level_tail(); }
+ level_t level() const { return p_fields->header.level; }
+ node_offset_t free_size() const {
+ return p_fields->template free_size_before<NODE_TYPE>(
+ keys(), node_size);
+ }
+ extent_len_t total_size() const {
+ return p_fields->total_size(node_size);
+ }
+ const char* p_left_bound() const;
+ template <node_type_t T = NODE_TYPE>
+ std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*>
+ get_end_p_laddr() const {
+ assert(is_level_tail());
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ return p_fields->get_p_child_addr(keys(), node_size);
+ } else {
+ auto offset_start = p_fields->get_item_end_offset(
+ keys(), node_size);
+ assert(offset_start <= node_size);
+ offset_start -= sizeof(laddr_packed_t);
+ auto p_addr = p_start() + offset_start;
+ return reinterpret_cast<const laddr_packed_t*>(p_addr);
+ }
+ }
+
+ // container type system
+ using key_get_type = typename FieldType::key_get_type;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ index_t keys() const { return p_fields->num_keys; }
+ key_get_type operator[] (index_t index) const {
+ return p_fields->get_key(index, node_size);
+ }
+ extent_len_t size_before(index_t index) const {
+ auto free_size = p_fields->template free_size_before<NODE_TYPE>(
+ index, node_size);
+ assert(total_size() >= free_size);
+ return total_size() - free_size;
+ }
+ node_offset_t size_to_nxt_at(index_t index) const;
+ node_offset_t size_overhead_at(index_t index) const {
+ return FieldType::ITEM_OVERHEAD; }
+ container_range_t get_nxt_container(index_t index) const;
+
+ template <typename T = FieldType>
+ std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*>
+ get_p_value(index_t index) const {
+ assert(index < keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ return p_fields->get_p_child_addr(index, node_size);
+ } else {
+ auto range = get_nxt_container(index).range;
+ auto ret = reinterpret_cast<const value_header_t*>(range.p_start);
+ assert(range.p_start + ret->allocation_size() == range.p_end);
+ return ret;
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ assert(p_node_start == p_start());
+ // nothing to encode as the container range is the entire extent
+ }
+
+ static node_extent_t decode(const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ // nothing to decode
+ return node_extent_t(
+ reinterpret_cast<const FieldType*>(p_node_start),
+ node_size);
+ }
+
+ static void validate(const FieldType& fields) {
+#ifndef NDEBUG
+ assert(fields.header.get_node_type() == NODE_TYPE);
+ assert(fields.header.get_field_type() == FieldType::FIELD_TYPE);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(fields.header.level > 0u);
+ } else {
+ assert(fields.header.level == 0u);
+ }
+#endif
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool);
+
+ static node_offset_t header_size() { return FieldType::HEADER_SIZE; }
+
+ template <IsFullKey Key>
+ static node_offset_t estimate_insert(
+ const Key& key, const value_input_t& value) {
+ auto size = FieldType::estimate_insert_one();
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ size += ns_oid_view_t::estimate_size(key);
+ } else if constexpr (FIELD_TYPE == field_type_t::N3 &&
+ NODE_TYPE == node_type_t::LEAF) {
+ size += value.allocation_size();
+ }
+ return size;
+ }
+
+ template <IsFullKey Key>
+ static const value_t* insert_at(
+ NodeExtentMutable& mut, const node_extent_t&,
+ const Key& key, const value_input_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+ }
+
+ template <IsFullKey Key>
+ static memory_range_t insert_prefix_at(
+ NodeExtentMutable&, const node_extent_t&,
+ const Key& key,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static void update_size_at(
+ NodeExtentMutable&, const node_extent_t&, index_t index, int change);
+
+ static node_offset_t trim_until(
+ NodeExtentMutable&, const node_extent_t&, index_t index);
+ static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&,
+ index_t index, node_offset_t trimmed);
+
+ static node_offset_t erase_at(NodeExtentMutable&, const node_extent_t&,
+ index_t index, const char* p_left_bound);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ const FieldType& fields() const { return *p_fields; }
+ const FieldType* p_fields;
+ extent_len_t node_size;
+};
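+
+// Illustrative usage sketch only (not part of the upstream change): how the
+// top indexing stage above can be consumed as an indexable container, given a
+// hypothetical pointer p_buf to a valid leaf N0 extent of length node_size:
+//
+//   using leaf_n0_t = node_extent_t<node_fields_0_t, node_type_t::LEAF>;
+//   auto* p_fields = reinterpret_cast<const node_fields_0_t*>(p_buf);
+//   leaf_n0_t extent{p_fields, node_size};  // validates the header (debug builds)
+//   for (index_t i = 0; i < extent.keys(); ++i) {
+//     auto key = extent[i];                     // key at this stage
+//     auto nxt = extent.get_nxt_container(i);   // descend to the next stage
+//   }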
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+class node_extent_t<FieldType, NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_start{p_append} {
+#ifndef NDEBUG
+ auto p_fields = reinterpret_cast<const FieldType*>(p_append);
+ assert(*(p_fields->header.get_field_type()) == FIELD_TYPE);
+ assert(p_fields->header.get_node_type() == NODE_TYPE);
+ assert(p_fields->num_keys == 0);
+#endif
+ p_append_left = p_start + FieldType::HEADER_SIZE;
+ p_append_right = p_start + p_mut->get_length();
+ }
+ Appender(NodeExtentMutable*, const node_extent_t&, bool open = false);
+ void append(const node_extent_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const value_input_t&, const value_t*&);
+ char* wrap();
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* p_append) {
+ if constexpr (FIELD_TYPE != field_type_t::N3) {
+ assert(p_append < p_append_right);
+ assert(p_append_left < p_append);
+ p_append_right = p_append;
+ auto new_offset = p_append - p_start;
+ assert(new_offset > 0);
+ assert(new_offset < p_mut->get_length());
+ FieldType::append_offset(*p_mut, new_offset, p_append_left);
+ ++num_keys;
+ } else {
+ ceph_abort("not implemented");
+ }
+ }
+
+ private:
+ const node_extent_t* p_src = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_start;
+ char* p_append_left;
+ char* p_append_right;
+ num_keys_t num_keys = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
new file mode 100644
index 000000000..a0752e0fc
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void node_header_t::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level)
+{
+ node_header_t header;
+ header.set_field_type(field_type);
+ header.set_node_type(node_type);
+ header.set_is_level_tail(is_level_tail);
+ header.level = level;
+ mut.copy_in_relative(0, header);
+}
+
+void node_header_t::update_is_level_tail(
+ NodeExtentMutable& mut, const node_header_t& header, bool value)
+{
+ auto& _header = const_cast<node_header_t&>(header);
+ _header.set_is_level_tail(value);
+ mut.validate_inplace_update(_header);
+}
+
+#define F013_T _node_fields_013_t<SlotType>
+#define F013_INST(ST) _node_fields_013_t<ST>
+
+template <typename SlotType>
+void F013_T::update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change)
+{
+ assert(index <= node.num_keys);
+ [[maybe_unused]] extent_len_t node_size = mut.get_length();
+#ifndef NDEBUG
+ // check underflow
+ if (change < 0 && index != node.num_keys) {
+ assert(node.get_item_start_offset(index, node_size) <
+ node.get_item_end_offset(index, node_size));
+ }
+#endif
+ for (const auto* p_slot = &node.slots[index];
+ p_slot < &node.slots[node.num_keys];
+ ++p_slot) {
+ node_offset_t offset = p_slot->right_offset;
+ int new_offset = offset - change;
+ assert(new_offset > 0);
+ assert(new_offset < (int)node_size);
+ mut.copy_in_absolute(
+ (void*)&(p_slot->right_offset),
+ node_offset_t(new_offset));
+ }
+#ifndef NDEBUG
+ // check overflow
+ if (change > 0 && index != node.num_keys) {
+ assert(node.num_keys > 0);
+ assert(node.get_key_start_offset(node.num_keys, node_size) <=
+ node.slots[node.num_keys - 1].right_offset);
+ }
+#endif
+}
+
+template <typename SlotType>
+void F013_T::append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append)
+{
+ mut.copy_in_absolute(p_append, key);
+ p_append += sizeof(key_t);
+}
+
+template <typename SlotType>
+void F013_T::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append)
+{
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+template <typename SlotType>
+template <IsFullKey Key>
+void F013_T::insert_at(
+ NodeExtentMutable& mut, const Key& key,
+ const me_t& node, index_t index, node_offset_t size_right)
+{
+ assert(index <= node.num_keys);
+ extent_len_t node_size = mut.get_length();
+ update_size_at(mut, node, index, size_right);
+ auto p_insert = const_cast<char*>(fields_start(node)) +
+ node.get_key_start_offset(index, node_size);
+ auto p_shift_end = fields_start(node) +
+ node.get_key_start_offset(node.num_keys, node_size);
+ mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one());
+ mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1));
+ append_key(mut, key_t::from_key(key), p_insert);
+ int new_offset = node.get_item_end_offset(index, node_size) - size_right;
+ assert(new_offset > 0);
+ assert(new_offset < (int)node_size);
+ append_offset(mut, new_offset, p_insert);
+}
+#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \
+ insert_at<KT>(NodeExtentMutable&, const KT&, \
+ const F013_INST(ST)&, index_t, node_offset_t)
+IA_TEMPLATE(slot_0_t, key_view_t);
+IA_TEMPLATE(slot_1_t, key_view_t);
+IA_TEMPLATE(slot_3_t, key_view_t);
+IA_TEMPLATE(slot_0_t, key_hobj_t);
+IA_TEMPLATE(slot_1_t, key_hobj_t);
+IA_TEMPLATE(slot_3_t, key_hobj_t);
+
+template <typename SlotType>
+node_offset_t F013_T::erase_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, const char* p_left_bound)
+{
+ extent_len_t node_size = mut.get_length();
+ auto offset_item_start = node.get_item_start_offset(index, node_size);
+ auto offset_item_end = node.get_item_end_offset(index, node_size);
+ assert(offset_item_start < offset_item_end);
+ auto erase_size = offset_item_end - offset_item_start;
+ // fix and shift the left part
+ update_size_at(mut, node, index + 1, -erase_size);
+ const char* p_shift_start = fields_start(node) +
+ node.get_key_start_offset(index + 1, node_size);
+ extent_len_t shift_len = sizeof(SlotType) * (node.num_keys - index - 1);
+ int shift_off = -(int)sizeof(SlotType);
+ mut.shift_absolute(p_shift_start, shift_len, shift_off);
+ // shift the right part
+ p_shift_start = p_left_bound;
+ shift_len = fields_start(node) + offset_item_start - p_left_bound;
+ shift_off = erase_size;
+ mut.shift_absolute(p_shift_start, shift_len, shift_off);
+ // fix num_keys
+ mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys - 1));
+ return erase_size;
+}
+
+#define F013_TEMPLATE(ST) template struct F013_INST(ST)
+F013_TEMPLATE(slot_0_t);
+F013_TEMPLATE(slot_1_t);
+F013_TEMPLATE(slot_3_t);
+
+void node_fields_2_t::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append)
+{
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
new file mode 100644
index 000000000..1ed4865a6
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "key_layout.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct node_header_t {
+ static constexpr unsigned FIELD_TYPE_BITS = 6u;
+ static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS);
+ static constexpr unsigned NODE_TYPE_BITS = 1u;
+ static constexpr unsigned B_LEVEL_TAIL_BITS = 1u;
+ using bits_t = uint8_t;
+
+ node_header_t() {}
+ std::optional<field_type_t> get_field_type() const {
+ if (field_type >= FIELD_TYPE_MAGIC &&
+ field_type < static_cast<uint8_t>(field_type_t::_MAX)) {
+ return static_cast<field_type_t>(field_type);
+ } else {
+ return std::nullopt;
+ }
+ }
+ node_type_t get_node_type() const {
+ return static_cast<node_type_t>(node_type);
+ }
+ bool get_is_level_tail() const {
+ return is_level_tail;
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool);
+
+ bits_t field_type : FIELD_TYPE_BITS;
+ bits_t node_type : NODE_TYPE_BITS;
+ bits_t is_level_tail : B_LEVEL_TAIL_BITS;
+ static_assert(sizeof(bits_t) * 8 ==
+ FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS);
+ level_t level;
+
+ private:
+ void set_field_type(field_type_t type) {
+ field_type = static_cast<uint8_t>(type);
+ }
+ void set_node_type(node_type_t type) {
+ node_type = static_cast<uint8_t>(type);
+ }
+ void set_is_level_tail(bool value) {
+ is_level_tail = static_cast<uint8_t>(value);
+ }
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const node_header_t& header) {
+ auto field_type = header.get_field_type();
+ if (field_type.has_value()) {
+ os << "header" << header.get_node_type() << *field_type
+ << "(is_level_tail=" << header.get_is_level_tail()
+ << ", level=" << (unsigned)header.level << ")";
+ } else {
+ os << "header(INVALID)";
+ }
+ return os;
+}
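+
+// Illustrative note (not part of the upstream change): the three bit-fields
+// above share a single packed bits_t byte, so the on-disk header is exactly
+// one bits_t plus one level_t, which could be checked with e.g.
+//
+//   static_assert(sizeof(node_header_t) ==
+//                 sizeof(node_header_t::bits_t) + sizeof(level_t));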
+
+template <typename FixedKeyType, field_type_t _FIELD_TYPE>
+struct _slot_t {
+ using key_t = FixedKeyType;
+ static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE;
+ static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t);
+
+ key_t key;
+ node_offset_t right_offset;
+} __attribute__((packed));
+using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>;
+using slot_1_t = _slot_t<crush_t, field_type_t::N1>;
+using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>;
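+
+// Illustrative note (not part of the upstream change): because _slot_t is
+// packed, the per-slot overhead beyond the fixed-size key is just the
+// right_offset field for every instantiation above, e.g.
+//
+//   static_assert(slot_0_t::OVERHEAD == sizeof(node_offset_t));
+//   static_assert(slot_1_t::OVERHEAD == sizeof(node_offset_t));
+//   static_assert(slot_3_t::OVERHEAD == sizeof(node_offset_t));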
+
+struct node_range_t {
+ extent_len_t start;
+ extent_len_t end;
+};
+
+template <typename FieldType>
+const char* fields_start(const FieldType& node) {
+ return reinterpret_cast<const char*>(&node);
+}
+
+template <node_type_t NODE_TYPE, typename FieldType>
+node_range_t fields_free_range_before(
+ const FieldType& node, index_t index, extent_len_t node_size) {
+ assert(index <= node.num_keys);
+ extent_len_t offset_start = node.get_key_start_offset(index, node_size);
+ extent_len_t offset_end = node.get_item_end_offset(index, node_size);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node.is_level_tail() && index == node.num_keys) {
+ offset_end -= sizeof(laddr_t);
+ }
+ }
+ assert(offset_start <= offset_end);
+ assert(offset_end - offset_start < node_size);
+ return {offset_start, offset_end};
+}
+
+/**
+ * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t)
+ *
+ * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT
+ * layout implementation for leaf node N3.
+ *
+ * The node layout storing n slots:
+ *
+ * # <----------------------------- node range --------------------------------------> #
+ * # #<~># free space #
+ * # <----- left part -----------------------------> # <~# <----- right slots -------> #
+ * # # <---- left slots -------------> #~> # #
+ * # # slots [2, n) |<~># #<~>| right slots [2, n) #
+ * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> #
+ * # # | | # # | | #
+ * # | num_ # | right | | right | # # | next-stage | next-stage #
+ * # header | keys # key | offset | key | offset | # # | container | container #
+ * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +--------------------------------------------+
+ */
+template <typename SlotType>
+struct _node_fields_013_t {
+ // should be enough to index all keys under 64 KiB node
+ using num_keys_t = uint16_t;
+ using key_t = typename SlotType::key_t;
+ using key_get_type = const key_t&;
+ using me_t = _node_fields_013_t<SlotType>;
+ static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ extent_len_t total_size(extent_len_t node_size) const {
+ return node_size;
+ }
+ key_get_type get_key(
+ index_t index, extent_len_t node_size) const {
+ assert(index < num_keys);
+ return slots[index].key;
+ }
+ node_offset_t get_key_start_offset(
+ index_t index, extent_len_t node_size) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(SlotType) * index;
+ assert(offset < node_size);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(
+ index_t index, extent_len_t node_size) const {
+ assert(index < num_keys);
+ auto offset = slots[index].right_offset;
+ assert(offset < node_size);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &slots[index].right_offset;
+ }
+ extent_len_t get_item_end_offset(
+ index_t index, extent_len_t node_size) const {
+ return index == 0 ? node_size
+ : get_item_start_offset(index - 1, node_size);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(
+ index_t index, extent_len_t node_size) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index, node_size);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(SlotType); }
+ template <IsFullKey Key>
+ static void insert_at(
+ NodeExtentMutable&, const Key& key,
+ const me_t& node, index_t index, node_offset_t size_right);
+ static node_offset_t erase_at(NodeExtentMutable&, const me_t&, index_t, const char*);
+ static void update_size_at(
+ NodeExtentMutable&, const me_t& node, index_t index, int change);
+ static void append_key(
+ NodeExtentMutable&, const key_t& key, char*& p_append);
+ template <IsFullKey Key>
+ static void append_key(
+ NodeExtentMutable& mut, const Key& key, char*& p_append) {
+ append_key(mut, key_t::from_key(key), p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ SlotType slots[];
+} __attribute__((packed));
+using node_fields_0_t = _node_fields_013_t<slot_0_t>;
+using node_fields_1_t = _node_fields_013_t<slot_1_t>;
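+
+// Illustrative example (not part of the upstream change) of the right-to-left
+// item arithmetic used by _node_fields_013_t: for a node of node_size bytes,
+//
+//   get_item_end_offset(0)   == node_size                  // rightmost item
+//   get_item_end_offset(i)   == slots[i - 1].right_offset  // for i > 0
+//   get_item_start_offset(i) == slots[i].right_offset
+//
+// so the next-stage container of slot i occupies
+// [slots[i].right_offset, get_item_end_offset(i)), and items grow leftwards
+// towards the slot array as new keys are inserted.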
+
+/**
+ * node_fields_2_t
+ *
+ * The STAGE_STRING layout implementation for node N2.
+ *
+ * The node layout storing n slots:
+ *
+ * # <--------------------------------- node range ----------------------------------------> #
+ * # #<~># free space #
+ * # <------- left part ---------------> # <~# <--------- right slots ---------------------> #
+ * # # <---- offsets ----> #~> #<~>| slots [2, n) #
+ * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> #
+ * # # | # # | | #
+ * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid #
+ * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +-----------------------------------------------+
+ */
+struct node_fields_2_t {
+ // should be enough to index all keys under 64 KiB node
+ using num_keys_t = uint16_t;
+ using key_t = ns_oid_view_t;
+ using key_get_type = key_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N2;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t);
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ extent_len_t total_size(extent_len_t node_size) const {
+ return node_size;
+ }
+ key_get_type get_key(
+ index_t index, extent_len_t node_size) const {
+ assert(index < num_keys);
+ auto item_end_offset = get_item_end_offset(index, node_size);
+ const char* p_start = fields_start(*this);
+ return key_t(p_start + item_end_offset);
+ }
+ node_offset_t get_key_start_offset(
+ index_t index, extent_len_t node_size) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys;
+ assert(offset < node_size);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(
+ index_t index, extent_len_t node_size) const {
+ assert(index < num_keys);
+ auto offset = offsets[index];
+ assert(offset < node_size);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &offsets[index];
+ }
+ extent_len_t get_item_end_offset(
+ index_t index, extent_len_t node_size) const {
+ return index == 0 ? node_size
+ : get_item_start_offset(index - 1, node_size);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(
+ index_t index, extent_len_t node_size) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index, node_size);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); }
+ template <IsFullKey Key>
+ static void insert_at(
+ NodeExtentMutable& mut, const Key& key,
+ const node_fields_2_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+ static void append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ ns_oid_view_t::append(mut, key, p_append);
+ }
+ template <IsFullKey Key>
+ static void append_key(
+ NodeExtentMutable& mut, const Key& key, char*& p_append) {
+ ns_oid_view_t::append(mut, key, p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ node_offset_t offsets[];
+} __attribute__((packed));
+
+/**
+ * internal_fields_3_t
+ *
+ * The STAGE_RIGHT layout implementation for internal node N3.
+ *
+ * The node layout storing 3 children:
+ *
+ * # <---------------- node range ---------------------------> #
+ * # # <-- keys ---> # <---- laddrs -----------> #
+ * # free space: # |<~># |<~>#
+ * # # | # | #
+ * # | num_ # key | key | # laddr | laddr | laddr | #
+ * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...#
+ */
+struct internal_fields_3_t {
+ using key_get_type = const snap_gen_t&;
+ // should be enough to index all keys under 64 KiB node
+ using num_keys_t = uint16_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N3;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_SIZE =
+ sizeof(snap_gen_t) + sizeof(laddr_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = 0u;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ extent_len_t total_size(extent_len_t node_size) const {
+ if (is_level_tail()) {
+ return node_size - sizeof(snap_gen_t);
+ } else {
+ return node_size;
+ }
+ }
+ key_get_type get_key(
+ index_t index, extent_len_t node_size) const {
+ assert(index < num_keys);
+ return keys[index];
+ }
+ template <node_type_t NODE_TYPE>
+ std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t>
+ free_size_before(index_t index, extent_len_t node_size) const {
+ assert(index <= num_keys);
+ assert(num_keys <= get_max_num_keys(node_size));
+ extent_len_t free = total_size(node_size) - HEADER_SIZE -
+ index * ITEM_SIZE;
+ if (is_level_tail() && index == num_keys) {
+ free -= sizeof(laddr_t);
+ }
+ return free;
+ }
+
+ const laddr_packed_t* get_p_child_addr(
+ index_t index, extent_len_t node_size) const {
+#ifndef NDEBUG
+ if (is_level_tail()) {
+ assert(index <= num_keys);
+ } else {
+ assert(index < num_keys);
+ }
+#endif
+ auto p_addrs = reinterpret_cast<const laddr_packed_t*>(
+ &keys[get_num_keys_limit(node_size)]);
+ auto ret = p_addrs + index;
+ assert((const char*)ret < fields_start(*this) + node_size);
+ return ret;
+ }
+
+ static node_offset_t estimate_insert_one() { return ITEM_SIZE; }
+
+ template <IsFullKey Key>
+ static void insert_at(
+ NodeExtentMutable& mut, const Key& key,
+ const internal_fields_3_t& node,
+ index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const internal_fields_3_t& node,
+ index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ snap_gen_t keys[];
+
+ private:
+ num_keys_t get_max_num_keys(extent_len_t node_size) const {
+ auto num_limit = get_num_keys_limit(node_size);
+ return (is_level_tail() ? num_limit - 1 : num_limit);
+ }
+ static num_keys_t get_num_keys_limit(extent_len_t node_size) {
+ return (node_size - HEADER_SIZE) / ITEM_SIZE;
+ }
+} __attribute__((packed));
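+
+// Illustrative note (not part of the upstream change): in internal_fields_3_t
+// the child-address area starts at a fixed position derived from the key
+// capacity rather than from num_keys, so key i and child address i live at
+//
+//   keys[i]                                          // right after the header
+//   reinterpret_cast<const laddr_packed_t*>(
+//       &keys[get_num_keys_limit(node_size)])[i]     // fixed laddr area
+//
+// and, when is_level_tail() is set, the extra tail child is stored at index
+// num_keys.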
+
+using leaf_fields_3_t = _node_fields_013_t<slot_3_t>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
new file mode 100644
index 000000000..7185b15ee
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
@@ -0,0 +1,2488 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <compare>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <type_traits>
+
+#include "common/likely.h"
+
+#include "sub_items_stage.h"
+#include "item_iterator_stage.h"
+
+namespace crimson::os::seastore::onode {
+
+struct search_result_bs_t {
+ index_t index;
+ MatchKindBS match;
+};
+template <typename FGetKey>
+search_result_bs_t binary_search(
+ const key_hobj_t& key,
+ index_t begin, index_t end, FGetKey&& f_get_key) {
+ assert(begin <= end);
+ while (begin < end) {
+ auto total = begin + end;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get_key(mid)) target = f_get_key(mid);
+ auto match = key <=> target;
+ if (match == std::strong_ordering::less) {
+ end = mid;
+ } else if (match == std::strong_ordering::greater) {
+ begin = mid + 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+  return {begin, MatchKindBS::NE};
+}
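+
+// Illustrative usage sketch (not part of the upstream change): the indexable
+// iterator below adapts its container with a lambda, along the lines of
+//
+//   auto ret = binary_search(key, index_t(0), container.keys(),
+//                            [&] (index_t i) { return container[i]; });
+//   // ret.index is where the key belongs; ret.match tells EQ vs NE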
+
+template <typename PivotType, typename FGet>
+search_result_bs_t binary_search_r(
+ index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) {
+ assert(rend <= rbegin);
+ while (rend < rbegin) {
+ auto total = rend + rbegin + 1;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get(mid)) target = f_get(mid);
+ int match = target - key;
+ if (match < 0) {
+ rend = mid;
+ } else if (match > 0) {
+ rbegin = mid - 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+ return {rbegin, MatchKindBS::NE};
+}
+
+inline bool matchable(field_type_t type, match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ /*
+ * compressed prefix by field type:
+ * N0: NONE
+ * N1: pool/shard
+ * N2: pool/shard crush
+ * N3: pool/shard crush ns/oid
+ *
+   * return true if the key matches the node's compressed prefix,
+   * false otherwise
+ */
+#ifndef NDEBUG
+ if (mstat == MSTAT_END) {
+ assert(type == field_type_t::N0);
+ }
+#endif
+ return mstat + to_unsigned(type) < 4;
+}
+
+inline void assert_mstat(
+ const key_hobj_t& key,
+ const key_view_t& index,
+ match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2);
+ // key < index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ break;
+ case MSTAT_LT0:
+ assert(key < index.snap_gen_packed());
+ break;
+ case MSTAT_LT1:
+ assert(key < index.ns_oid_view());
+ break;
+ case MSTAT_LT2:
+ if (index.has_shard_pool()) {
+ assert((key < shard_pool_crush_t{
+ index.shard_pool_packed(), index.crush_packed()}));
+ } else {
+ assert(key < index.crush_packed());
+ }
+ break;
+ default:
+ ceph_abort("impossible path");
+ }
+ // key == index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ assert(key == index.snap_gen_packed());
+ case MSTAT_LT0:
+ if (!index.has_ns_oid())
+ break;
+ assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX ||
+ key == index.ns_oid_view());
+ case MSTAT_LT1:
+ if (!index.has_crush())
+ break;
+ assert(key == index.crush_packed());
+ if (!index.has_shard_pool())
+ break;
+ assert(key == index.shard_pool_packed());
+ default:
+ break;
+ }
+}
+
+#define NXT_STAGE_T staged<next_param_t>
+
+enum class TrimType { BEFORE, AFTER, AT };
+
+/**
+ * staged
+ *
+ * Implements the recursive logic that reads or modifies the node layout
+ * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The individual
+ * stage implementations are flexible, so the implementations for different
+ * stages can be assembled independently, as long as they follow the
+ * definitions of the container interfaces.
+ *
+ * Multi-stage is designed to index different portions of onode keys
+ * stage-by-stage. There are at most 3 stages for a node:
+ * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node;
+ * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes;
+ * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes;
+ *
+ * The intention is to consolidate the high-level indexing implementations at
+ * the level of stage, so we don't need to write them repeatedly for every
+ * stage and for every node type.
+ */
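+// Illustrative sketch (not part of the upstream change): for an N0 node the
+// recursion is expected to chain three staged<> instantiations through
+// next_param_t; the parameter type names below are hypothetical placeholders:
+//
+//   staged<left_params>        // STAGE_LEFT:   indexes shard-pool-crush
+//     staged<string_params>    //  STAGE_STRING: indexes ns-oid
+//       staged<right_params>   //   STAGE_RIGHT: indexes snap-gen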
+template <typename Params>
+struct staged {
+ static_assert(Params::STAGE >= STAGE_BOTTOM);
+ static_assert(Params::STAGE <= STAGE_TOP);
+ using container_t = typename Params::container_t;
+ using key_get_type = typename container_t::key_get_type;
+ using next_param_t = typename Params::next_param_t;
+ using position_t = staged_position_t<Params::STAGE>;
+ using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>;
+ using value_input_t = value_input_type_t<Params::NODE_TYPE>;
+ using value_t = value_type_t<Params::NODE_TYPE>;
+ static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE;
+ static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM);
+ static constexpr auto NODE_TYPE = Params::NODE_TYPE;
+ static constexpr auto STAGE = Params::STAGE;
+
+ template <bool is_exclusive>
+ static void _left_or_right(index_t& split_index, index_t insert_index,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_insert_left.has_value());
+ assert(is_valid_index(split_index));
+ if constexpr (is_exclusive) {
+ if (split_index <= insert_index) {
+ // ...[s_index-1] |!| (i_index) [s_index]...
+ // offset i_position to right
+ is_insert_left = false;
+ } else {
+ // ...[s_index-1] (i_index)) |?[s_index]| ...
+ // ...(i_index)...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ --split_index;
+ }
+ } else {
+ if (split_index < insert_index) {
+ // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]...
+ is_insert_left = false;
+ } else if (split_index > insert_index) {
+ // ...[(i_index)s_index-1] |?[s_index]| ...
+ // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ } else {
+ // ...[s_index-1] |?[(i_index)s_index]| ...
+ // i_to_left = std::nullopt;
+ }
+ }
+ }
+
+ template <ContainerType CTYPE, typename Enable = void> class _iterator_t;
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> {
+ /*
+ * indexable container type system:
+ * CONTAINER_TYPE = ContainerType::INDEXABLE
+ * keys() const -> index_t
+ * operator[](index_t) const -> key_get_type
+ * size_before(index_t) const -> extent_len_t
+ * size_overhead_at(index_t) const -> node_offset_t
+ * (IS_BOTTOM) get_p_value(index_t) const -> const value_t*
+ * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t
+ * (!IS_BOTTOM) get_nxt_container(index_t) const
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, node_size, delta) -> container_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * (IS_BOTTOM) insert_at(mut, src, key, value,
+ * index, size, p_left_bound) -> const value_t*
+ * (!IS_BOTTOM) insert_prefix_at(mut, src, key,
+ * index, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size_at(mut, src, index, size)
+ * trim_until(mut, container, index) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size
+ * erase_at(mut, container, index, p_left_bound) -> erase_size
+ *
+ * Appender::append(const container_t& src, from, items)
+ */
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {
+ assert(container.keys());
+ }
+
+ index_t index() const {
+ return _index;
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container[_index];
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt_at(_index);
+ }
+ template <typename T = typename NXT_STAGE_T::container_t>
+ std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container(_index);
+ }
+ template <typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const {
+ assert(!is_end());
+ return container.get_p_value(_index);
+ }
+ bool is_last() const {
+ return _index + 1 == container.keys();
+ }
+ bool is_end() const { return _index == container.keys(); }
+ node_offset_t size() const {
+ assert(!is_end());
+ assert(header_size() == container.size_before(0));
+ assert(container.size_before(_index + 1) > container.size_before(_index));
+ return container.size_before(_index + 1) -
+ container.size_before(_index);
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead_at(_index);
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++_index;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(index < container.keys());
+ seek_till_end(index);
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ assert(index <= container.keys());
+ _index = index;
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ _index = container.keys() - 1;
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ ++_index;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const key_hobj_t& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ index_t end_index = container.keys();
+ if (exclude_last) {
+ assert(end_index);
+ --end_index;
+ assert(key < container[end_index]);
+ }
+ auto ret = binary_search(key, _index, end_index,
+ [this] (index_t index) { return container[index]; });
+ _index = ret.index;
+ return ret.match;
+ }
+
+ template <IsFullKey Key, typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> insert(
+ NodeExtentMutable& mut,
+ const Key& key,
+ const value_input_t& value,
+ node_offset_t insert_size,
+ const char* p_left_bound) {
+ return container_t::insert_at(
+ mut, container, key, value, _index, insert_size, p_left_bound);
+ }
+
+ template <IsFullKey Key, typename T = memory_range_t>
+ std::enable_if_t<!IS_BOTTOM, T> insert_prefix(
+ NodeExtentMutable& mut, const Key& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::insert_prefix_at(
+ mut, container, key, _index, size, p_left_bound);
+ }
+
+ template <typename T = void>
+ std::enable_if_t<!IS_BOTTOM, T>
+ update_size(NodeExtentMutable& mut, int insert_size) {
+ assert(!is_end());
+ container_t::update_size_at(mut, container, _index, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ // replace insert_index placeholder
+ if constexpr (!is_exclusive) {
+ if (insert_index == INDEX_LAST) {
+ insert_index = container.keys() - 1;
+ }
+ } else {
+ if (insert_index == INDEX_END) {
+ insert_index = container.keys();
+ }
+ }
+ assert(insert_index <= container.keys());
+
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1,
+ insert_index, insert_size] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ current_size = start_size_1;
+ if (index > insert_index) {
+ current_size += insert_size;
+ if constexpr (is_exclusive) {
+ --index;
+ }
+ }
+ // already includes header size
+ current_size += container.size_before(index);
+ }
+ return current_size;
+ };
+ index_t s_end;
+ if constexpr (is_exclusive) {
+ s_end = container.keys();
+ } else {
+ s_end = container.keys() - 1;
+ }
+ _index = binary_search_r(0, s_end, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(_index, insert_index, is_insert_left);
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ // already includes header size
+ current_size = start_size_1 + container.size_before(index);
+ }
+ return current_size;
+ };
+ _index = binary_search_r(
+ 0, container.keys() - 1, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+    // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ auto num_keys = container.keys();
+ index_t items;
+ if (to_index == INDEX_END) {
+ items = num_keys - _index;
+ appender.append(container, _index, items);
+ _index = num_keys;
+ to_index = _index;
+ } else if (to_index == INDEX_LAST) {
+ assert(!is_end());
+ items = num_keys - 1 - _index;
+ appender.append(container, _index, items);
+ _index = num_keys - 1;
+ to_index = _index;
+ } else {
+ assert(_index <= to_index);
+ assert(to_index <= num_keys);
+ items = to_index - _index;
+ appender.append(container, _index, items);
+ _index = to_index;
+ }
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ return container_t::trim_until(mut, container, _index);
+ }
+
+ template <typename T = node_offset_t>
+ std::enable_if_t<!IS_BOTTOM, T>
+ trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ return container_t::trim_at(mut, container, _index, trimmed);
+ }
+
+ node_offset_t erase(NodeExtentMutable& mut, const char* p_left_bound) {
+ assert(!is_end());
+ return container_t::erase_at(mut, container, _index, p_left_bound);
+ }
+
+ template <KeyT KT>
+ typename container_t::template Appender<KT>
+ get_appender(NodeExtentMutable* p_mut) {
+ assert(_index + 1 == container.keys());
+ return typename container_t::template Appender<KT>(p_mut, container);
+ }
+
+ template <KeyT KT>
+ typename container_t::template Appender<KT>
+ get_appender_opened(NodeExtentMutable* p_mut) {
+ if constexpr (!IS_BOTTOM) {
+ assert(_index + 1 == container.keys());
+ return typename container_t::template Appender<KT>(p_mut, container, true);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(
+ p_node_start, node_size, delta);
+ auto ret = me_t(container);
+ index_t index;
+ ceph::decode(index, delta);
+ ret.seek_till_end(index);
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <IsFullKey Key>
+ static node_offset_t estimate_insert(
+ const Key& key, const value_input_t& value) {
+ return container_t::estimate_insert(key, value);
+ }
+
+ private:
+ container_t container;
+ index_t _index = 0;
+ };
+
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> {
+ /*
+ * iterative container type system (!IS_BOTTOM):
+ * CONTAINER_TYPE = ContainerType::ITERATIVE
+ * index() const -> index_t
+ * get_key() const -> key_get_type
+ * size() const -> node_offset_t
+ * size_to_nxt() const -> node_offset_t
+ * size_overhead() const -> node_offset_t
+ * get_nxt_container() const
+ * has_next() const -> bool
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, node_length, delta) -> container_t
+ * operator++()
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t
+ * update_size(mut, src, size)
+ * trim_until(mut, container) -> trim_size
+ * trim_at(mut, container, trimmed) -> trim_size
+ * erase(mut, container, p_left_bound) -> erase_size
+ */
+ // currently the iterative iterator is only implemented with STAGE_STRING
+ // for in-node space efficiency
+ static_assert(STAGE == STAGE_STRING);
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {}
+
+ index_t index() const {
+ if (is_end()) {
+ return container.index() + 1;
+ } else {
+ return container.index();
+ }
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container.get_key();
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt();
+ }
+ const typename NXT_STAGE_T::container_t get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container();
+ }
+ bool is_last() const {
+ assert(!is_end());
+ return !container.has_next();
+ }
+ bool is_end() const {
+#ifndef NDEBUG
+ if (_is_end) {
+ assert(!container.has_next());
+ }
+#endif
+ return _is_end;
+ }
+ node_offset_t size() const {
+ assert(!is_end());
+ return container.size();
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead();
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++container;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ assert(container.has_next());
+ ++container;
+ --index;
+ }
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ if (!container.has_next()) {
+ assert(index == 1);
+ set_end();
+ break;
+ }
+ ++container;
+ --index;
+ }
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ while (container.has_next()) {
+ ++container;
+ }
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ _is_end = true;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const key_hobj_t& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ do {
+ if (exclude_last && is_last()) {
+ assert(key < get_key());
+ return MatchKindBS::NE;
+ }
+ auto match = key <=> get_key();
+ if (match == std::strong_ordering::less) {
+ return MatchKindBS::NE;
+ } else if (match == std::strong_ordering::equal) {
+ return MatchKindBS::EQ;
+ } else {
+ if (container.has_next()) {
+ ++container;
+ } else {
+ // end
+ break;
+ }
+ }
+ } while (true);
+ assert(!exclude_last);
+ set_end();
+ return MatchKindBS::NE;
+ }
+
+ template <IsFullKey Key>
+ memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const Key& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::insert_prefix(
+ mut, container, key, is_end(), size, p_left_bound);
+ }
+
+ void update_size(NodeExtentMutable& mut, int insert_size) {
+ assert(!is_end());
+ container_t::update_size(mut, container, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ // insert_index can still be INDEX_LAST or INDEX_END
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ index_t split_index = 0;
+ extra_size += header_size();
+ do {
+ if constexpr (!is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ if (insert_index == INDEX_LAST) {
+ insert_index = index();
+ }
+ assert(insert_index <= index());
+ break;
+ }
+ }
+
+ size_t nxt_size = current_size;
+ if (split_index == 0) {
+ nxt_size += extra_size;
+ }
+ if (split_index == insert_index) {
+ nxt_size += insert_size;
+ if constexpr (is_exclusive) {
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++split_index;
+ }
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+
+ if constexpr (is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ set_end();
+ split_index = index();
+ if (insert_index == INDEX_END) {
+ insert_index = index();
+ }
+ assert(insert_index == index());
+ break;
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } while (true);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left);
+ assert(split_index == index());
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ do {
+ if (is_last()) {
+ break;
+ }
+
+ size_t nxt_size = current_size;
+ if (index() == 0) {
+ nxt_size += extra_size;
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++(*this);
+ } while (true);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+    // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ if (is_end()) {
+ assert(!container.has_next());
+ if (to_index == INDEX_END) {
+ to_index = index();
+ }
+ assert(to_index == index());
+ return;
+ }
+ index_t items;
+ if (to_index == INDEX_END || to_index == INDEX_LAST) {
+ items = to_index;
+ } else {
+ assert(is_valid_index(to_index));
+ assert(index() <= to_index);
+ items = to_index - index();
+ }
+ if (appender.append(container, items)) {
+ set_end();
+ }
+ to_index = index();
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ if (is_end()) {
+ return 0;
+ }
+ return container_t::trim_until(mut, container);
+ }
+
+ node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ assert(!is_end());
+ return container_t::trim_at(mut, container, trimmed);
+ }
+
+ node_offset_t erase(NodeExtentMutable& mut, const char* p_left_bound) {
+ assert(!is_end());
+ return container_t::erase(mut, container, p_left_bound);
+ }
+
+ template <KeyT KT>
+ typename container_t::template Appender<KT>
+ get_appender(NodeExtentMutable* p_mut) {
+ return typename container_t::template Appender<KT>(p_mut, container, false);
+ }
+
+ template <KeyT KT>
+ typename container_t::template Appender<KT>
+ get_appender_opened(NodeExtentMutable* p_mut) {
+ if constexpr (!IS_BOTTOM) {
+ return typename container_t::template Appender<KT>(p_mut, container, true);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ uint8_t is_end = _is_end;
+ ceph::encode(is_end, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(
+ p_node_start, node_size, delta);
+ auto ret = me_t(container);
+ uint8_t is_end;
+ ceph::decode(is_end, delta);
+ if (is_end) {
+ ret.set_end();
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <IsFullKey Key>
+ static node_offset_t estimate_insert(const Key& key,
+ const value_input_t& value) {
+ return container_t::estimate_insert(key, value);
+ }
+
+ private:
+ container_t container;
+ bool _is_end = false;
+ };
+
+ /*
+ * iterator_t encapsulates both indexable and iterative implementations
+ * from a *non-empty* container.
+ * cstr(const container_t&)
+ * access:
+ * index() -> index_t
+ * get_key() -> key_get_type (const reference or value type)
+ * is_last() -> bool
+ * is_end() -> bool
+ * size() -> node_offset_t
+ * size_overhead() -> node_offset_t
+ * (IS_BOTTOM) get_p_value() -> const value_t*
+ * (!IS_BOTTOM) get_nxt_container() -> container_range_t
+ * (!IS_BOTTOM) size_to_nxt() -> node_offset_t
+ * seek:
+ * operator++() -> iterator_t&
+ * seek_at(index)
+ * seek_till_end(index)
+ * seek_last()
+ * set_end()
+ * seek(key, exclude_last) -> MatchKindBS
+ * insert:
+ * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value
+ * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size(mut, size)
+ * split:
+ * seek_split_inserted<bool is_exclusive>(
+ * start_size, extra_size, target_size, insert_index, insert_size,
+ * std::optional<bool>& is_insert_left)
+ * -> insert to left/right/unknown (!exclusive)
+ * -> insert to left/right (exclusive, can be end)
+ * -> split_size
+ * seek_split(start_size, extra_size, target_size) -> split_size
+ * copy_out_until(appender, to_index) (can be end)
+ * trim_until(mut) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size
+ * erase:
+ * erase(mut, p_left_bound) -> erase_size
+ * merge:
+ * get_appender(p_mut) -> Appender
+ * (!IS_BOTTOM)get_appender_opened(p_mut) -> Appender
+ * denc:
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, node_size, delta) -> iterator_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ */
+ using iterator_t = _iterator_t<CONTAINER_TYPE>;
+ /* TODO: detailed comments
+ * - trim_until(mut) -> trim_size
+   *   * keep entries 0 to index - 1, remove the rest, and return the size trimmed.
+ * * if this is the end iterator, do nothing and return 0.
+ * * if this is the start iterator, normally needs to go to the higher
+ * stage to trim the entire container.
+ * - trim_at(mut, trimmed) -> trim_size
+ * * trim happens inside the current iterator, causing the size reduced by
+ * <trimmed>, return the total size trimmed.
+ */
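+
+  // Illustrative sketch (not part of the upstream change), mirroring
+  // lower_bound() below: seek within this stage, then descend or read the
+  // value depending on whether this is the bottom stage:
+  //
+  //   auto iter = iterator_t(container);
+  //   auto bs_match = iter.seek(key, exclude_last);
+  //   if (!iter.is_end()) {
+  //     if constexpr (IS_BOTTOM) {
+  //       auto p_value = iter.get_p_value();    // leaf of the recursion
+  //     } else {
+  //       auto nxt = iter.get_nxt_container();  // NXT_STAGE_T continues
+  //     }
+  //   }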
+
+ /*
+ * Lookup internals (hide?)
+ */
+
+ static bool is_keys_one(
+ const container_t& container) { // IN
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if (iter.index() == 0) {
+ if constexpr (IS_BOTTOM) {
+ // ok, there is only 1 key
+ return true;
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::is_keys_one(nxt_container);
+ }
+ } else {
+ // more than 1 keys
+ return false;
+ }
+ }
+
+ template <bool GET_KEY>
+ static result_t smallest_result(
+ const iterator_t& iter, key_view_t* p_index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto pos_smallest = NXT_STAGE_T::position_t::begin();
+ const value_t* p_value;
+ NXT_STAGE_T::template get_slot<GET_KEY, true>(
+ nxt_container, pos_smallest, p_index_key, &p_value);
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ } else {
+ assert(!p_index_key);
+ }
+ return result_t{{iter.index(), pos_smallest}, p_value, STAGE};
+ }
+
+ template <bool GET_KEY>
+ static result_t nxt_lower_bound(
+ const key_hobj_t& key, iterator_t& iter,
+ MatchHistory& history, key_view_t* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ if (nxt_result.is_end()) {
+ if (iter.is_last()) {
+ return result_t::end();
+ } else {
+ return smallest_result<GET_KEY>(++iter, index_key);
+ }
+ } else {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ }
+
+ template <bool GET_POS, bool GET_KEY, bool GET_VAL>
+ static void get_largest_slot(
+ const container_t& container, // IN
+ position_t* p_position, // OUT
+ key_view_t* p_index_key, // OUT
+ const value_t** pp_value) { // OUT
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ } else {
+ assert(!p_index_key);
+ }
+ if constexpr (GET_POS) {
+ assert(p_position);
+ p_position->index = iter.index();
+ } else {
+ assert(!p_position);
+ }
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ } else {
+ assert(!pp_value);
+ }
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ if constexpr (GET_POS) {
+ NXT_STAGE_T::template get_largest_slot<true, GET_KEY, GET_VAL>(
+ nxt_container, &p_position->nxt, p_index_key, pp_value);
+ } else {
+ NXT_STAGE_T::template get_largest_slot<false, GET_KEY, GET_VAL>(
+ nxt_container, nullptr, p_index_key, pp_value);
+ }
+ }
+ }
+
+ template <bool GET_KEY, bool GET_VAL>
+ static void get_slot(
+ const container_t& container, // IN
+ const position_t& pos, // IN
+ key_view_t* p_index_key, // OUT
+ const value_t** pp_value) { // OUT
+ auto iter = iterator_t(container);
+ iter.seek_at(pos.index);
+
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ } else {
+ assert(!p_index_key);
+ }
+
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::template get_slot<GET_KEY, GET_VAL>(
+ nxt_container, pos.nxt, p_index_key, pp_value);
+ } else {
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ } else {
+ assert(!pp_value);
+ }
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static result_t lower_bound(
+ const container_t& container,
+ const key_hobj_t& key,
+ MatchHistory& history,
+ key_view_t* index_key = nullptr) {
+ bool exclude_last = false;
+ if (history.get<STAGE>().has_value()) {
+ if (*history.get<STAGE>() == MatchKindCMP::EQ) {
+ // lookup is short-circuited
+ if constexpr (!IS_BOTTOM) {
+ assert(history.get<STAGE - 1>().has_value());
+ if (history.is_GT<STAGE - 1>()) {
+ auto iter = iterator_t(container);
+ bool test_key_equal;
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN);
+ auto cmp = key <=> iter.get_key();
+ assert(cmp != std::strong_ordering::greater);
+ test_key_equal = (cmp == 0);
+ } else {
+ auto cmp = key <=> iter.get_key();
+ // From history, key[stage] == parent[stage][index - 1]
+ // which should be the smallest possible value for all
+ // index[stage][*]
+ assert(cmp != std::strong_ordering::greater);
+ test_key_equal = (cmp == 0);
+ }
+ if (test_key_equal) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ // key[stage] < index[stage][left-most]
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+ // IS_BOTTOM || !history.is_GT<STAGE - 1>()
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX);
+ assert(key == iter.get_key());
+ } else {
+ assert(key == iter.get_key());
+ }
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (IS_BOTTOM) {
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr, MSTAT_EQ};
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ // !history.is_GT<STAGE - 1>() means
+ // key[stage+1 ...] <= index[stage+1 ...][*]
+ assert(!nxt_result.is_end());
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ } else if (*history.get<STAGE>() == MatchKindCMP::LT) {
+ exclude_last = true;
+ }
+ }
+ auto iter = iterator_t(container);
+ auto bs_match = iter.seek(key, exclude_last);
+ if (iter.is_end()) {
+ assert(!exclude_last);
+ assert(bs_match == MatchKindBS::NE);
+ history.set<STAGE>(MatchKindCMP::GT);
+ return result_t::end();
+ }
+ history.set<STAGE>(bs_match == MatchKindBS::EQ ?
+ MatchKindCMP::EQ : MatchKindCMP::LT);
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr,
+ (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_LT0)};
+ } else {
+ if (bs_match == MatchKindBS::EQ) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+
+ template <IsFullKey Key>
+ static node_offset_t insert_size(const Key& key,
+ const value_input_t& value) {
+ if constexpr (IS_BOTTOM) {
+ return iterator_t::estimate_insert(key, value);
+ } else {
+ return iterator_t::estimate_insert(key, value) +
+ NXT_STAGE_T::iterator_t::header_size() +
+ NXT_STAGE_T::insert_size(key, value);
+ }
+ }
+
+ template <IsFullKey Key>
+ static node_offset_t insert_size_at(match_stage_t stage,
+ const Key& key,
+ const value_input_t& value) {
+ if (stage == STAGE) {
+ return insert_size(key, value);
+ } else {
+ assert(stage < STAGE);
+ return NXT_STAGE_T::template insert_size_at(stage, key, value);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert(
+ const container_t& container, const key_view_t& key,
+ const value_input_t& value, position_t& position, bool evaluate_last) {
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+ if (evaluate_last || index == INDEX_END) {
+ iter.seek_last();
+ index = iter.index();
+ // evaluate the previous index
+ } else {
+ assert(is_valid_index(index));
+ // evaluate the current index
+ iter.seek_at(index);
+ auto match = key <=> iter.get_key();
+ if (match == 0) {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("insert conflict at current index!");
+ } else {
+ // insert into the current index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, false);
+ }
+ } else {
+ assert(match == std::strong_ordering::less);
+ if (index == 0) {
+ // already the first index, so insert at the current index
+ return {STAGE, insert_size(key, value)};
+ }
+ --index;
+ iter = iterator_t(container);
+ iter.seek_at(index);
+ // proceed to evaluate the previous index
+ }
+ }
+
+ // XXX(multi-type): when key is from a different type of node
+ auto match = key <=> iter.get_key();
+ if (match == std::strong_ordering::greater) {
+ // key doesn't match both indexes, so insert at the current index
+ ++index;
+ return {STAGE, insert_size(key, value)};
+ } else {
+ assert(match == std::strong_ordering::equal);
+ if constexpr (IS_BOTTOM) {
+ // ceph_abort?
+ ceph_abort("insert conflict at the previous index!");
+ } else {
+ // insert into the previous index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, true);
+ }
+ }
+ }
+
+ template <typename T = bool>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T>
+ compensate_insert_position_at(match_stage_t stage, position_t& position) {
+ auto& index = position.index;
+ if (stage == STAGE) {
+ assert(index == 0);
+ // insert at the end of the current stage
+ index = INDEX_END;
+ return true;
+ } else {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("impossible path");
+ } else {
+ assert(stage < STAGE);
+ bool compensate = NXT_STAGE_T::
+ compensate_insert_position_at(stage, position.nxt);
+ if (compensate) {
+ assert(is_valid_index(index));
+ if (index == 0) {
+ // insert into the *last* index of the current stage
+ index = INDEX_LAST;
+ return true;
+ } else {
+ --index;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+
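+  // rewrite an end position for an append-like insert: the index at
+  // insert_stage becomes INDEX_END and the indexes at the stages above it
+  // become INDEX_LAST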
+ static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) {
+ assert(insert_stage <= STAGE);
+ if (insert_stage == STAGE) {
+ insert_pos.index = INDEX_END;
+ } else if constexpr (!IS_BOTTOM) {
+ insert_pos.index = INDEX_LAST;
+ NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage);
+ }
+ }
+
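+  // (LEAF) determine the insert stage from the match history (the highest
+  // stage that did not compare equal), adjust the position when the history
+  // is GT, and calculate the insert size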
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert(
+ const key_hobj_t& key, const value_config_t& value,
+ const MatchHistory& history, match_stat_t mstat, position_t& position) {
+ match_stage_t insert_stage = STAGE_TOP;
+ while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) {
+ assert(insert_stage != STAGE_BOTTOM && "insert conflict!");
+ --insert_stage;
+ }
+
+ if (history.is_GT()) {
+ if (position.is_end()) {
+ // no need to compensate insert position
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ } else if (position == position_t::begin()) {
+ // I must be short-circuited by staged::smallest_result()
+ // in staged::lower_bound(), so we need to rely on mstat instead
+ assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3);
+ if (mstat == MSTAT_LT0) {
+ insert_stage = STAGE_RIGHT;
+ } else if (mstat == MSTAT_LT1) {
+ insert_stage = STAGE_STRING;
+ } else {
+ insert_stage = STAGE_LEFT;
+ }
+ // XXX(multi-type): need to upgrade node type before inserting an
+ // incompatible index at front.
+ assert(insert_stage <= STAGE && "incompatible insert");
+ } else {
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position);
+ assert(!ret);
+ }
+ }
+
+ if (position.is_end()) {
+ patch_insert_end(position, insert_stage);
+ }
+
+ node_offset_t insert_size = insert_size_at(insert_stage, key, value);
+
+ return {insert_stage, insert_size};
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_new(
+ NodeExtentMutable& mut, const memory_range_t& range,
+ const full_key_t<KT>& key, const value_input_t& value) {
+ char* p_insert = const_cast<char*>(range.p_end);
+ const value_t* p_value = nullptr;
+ StagedAppender<KT> appender;
+ appender.init_empty(&mut, p_insert);
+ appender.append(key, value, p_value);
+ [[maybe_unused]] const char* p_insert_front = appender.wrap();
+ assert(p_insert_front == range.p_start);
+ return p_value;
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert_recursively(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_input_t& value,
+ position_t& position, match_stage_t& stage,
+ node_offset_t& _insert_size, const char* p_left_bound) {
+ // proceed insert from right to left
+ assert(stage <= STAGE);
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+
+ bool do_insert = false;
+ if (stage == STAGE) {
+ if (index == INDEX_END) {
+ iter.seek_last();
+ iter.set_end();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ do_insert = true;
+ } else { // stage < STAGE
+ if (index == INDEX_LAST) {
+ iter.seek_last();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ if constexpr (SPLIT) {
+ if (iter.is_end()) {
+ // insert at the higher stage due to split
+ do_insert = true;
+ _insert_size = insert_size(key, value);
+ stage = STAGE;
+ }
+ } else {
+ assert(!iter.is_end());
+ }
+ }
+
+ if (do_insert) {
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ assert(_insert_size == insert_size(key, value));
+ if constexpr (IS_BOTTOM) {
+ return iter.insert(
+ mut, key, value, _insert_size, p_left_bound);
+ } else {
+ auto range = iter.insert_prefix(
+ mut, key, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>(
+ mut, nxt_container, key, value,
+ position.nxt, stage, _insert_size, p_left_bound);
+ iter.update_size(mut, _insert_size);
+ return p_value;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_input_t& value,
+ position_t& position, match_stage_t& stage, node_offset_t& _insert_size) {
+ auto p_left_bound = container.p_left_bound();
+ if (unlikely(!container.keys())) {
+ if (position.is_end()) {
+ position = position_t::begin();
+ assert(stage == STAGE);
+ assert(_insert_size == insert_size(key, value));
+ } else if (position == position_t::begin()) {
+      // when inserting into a trimmed and empty left node
+ stage = STAGE;
+ _insert_size = insert_size(key, value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ if constexpr (IS_BOTTOM) {
+ return container_t::insert_at(
+ mut, container, key, value, 0, _insert_size, p_left_bound);
+ } else {
+ auto range = container_t::template insert_prefix_at(
+ mut, container, key, 0, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ return proceed_insert_recursively<KT, SPLIT>(
+ mut, container, key, value,
+ position, stage, _insert_size, p_left_bound);
+ }
+ }
+
+ static std::ostream& dump(const container_t& container,
+ std::ostream& os,
+ const std::string& prefix,
+ size_t& size,
+ const char* p_start) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ std::string prefix_blank(prefix.size(), ' ');
+ const std::string* p_prefix = &prefix;
+ size += iterator_t::header_size();
+ do {
+ std::ostringstream sos;
+ sos << *p_prefix << iter.get_key() << ": ";
+ std::string i_prefix = sos.str();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ size += iter.size_to_nxt();
+ NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start);
+ } else {
+ auto value_ptr = iter.get_p_value();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ size += iter.size();
+ os << "\n" << i_prefix;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ os << *value_ptr;
+ } else {
+ os << "0x" << std::hex << value_ptr->value << std::dec;
+ }
+ os << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ p_prefix = &prefix_blank;
+ }
+ } while (true);
+ return os;
+ }
+
+ static void validate(const container_t& container) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ auto key = iter.get_key();
+ do {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::validate(nxt_container);
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ assert(key < iter.get_key());
+ key = iter.get_key();
+ }
+ } while (true);
+ }
+
+ static void get_stats(const container_t& container, node_stats_t& stats,
+ key_view_t& index_key) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ stats.size_overhead += iterator_t::header_size();
+ do {
+ index_key.replace(iter.get_key());
+ stats.size_overhead += iter.size_overhead();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::get_stats(nxt_container, stats, index_key);
+ } else {
+ ++stats.num_kvs;
+ size_t kv_logical_size = index_key.size_logical();
+ size_t value_size;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ value_size = iter.get_p_value()->allocation_size();
+ } else {
+ value_size = sizeof(value_t);
+ }
+ stats.size_value += value_size;
+ kv_logical_size += value_size;
+ stats.size_logical += kv_logical_size;
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ }
+ } while (true);
+ }
+
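+  // advance pos to the next slot and load the requested key/value;
+  // returns true if pos is already at the last slot of this container,
+  // so the caller needs to advance at the upper stage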
+ template <bool GET_KEY, bool GET_VAL>
+ static bool get_next_slot(
+ const container_t& container, // IN
+ position_t& pos, // IN&OUT
+ key_view_t* p_index_key, // OUT
+ const value_t** pp_value) { // OUT
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ iter.seek_at(pos.index);
+ bool find_next;
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ find_next = NXT_STAGE_T::template get_next_slot<GET_KEY, GET_VAL>(
+ nxt_container, pos.nxt, p_index_key, pp_value);
+ } else {
+ find_next = true;
+ }
+
+ if (find_next) {
+ if (iter.is_last()) {
+ return true;
+ } else {
+ pos.index = iter.index() + 1;
+ if constexpr (!IS_BOTTOM) {
+ pos.nxt = NXT_STAGE_T::position_t::begin();
+ }
+ get_slot<GET_KEY, GET_VAL>(
+ container, pos, p_index_key, pp_value);
+ return false;
+ }
+ } else { // !find_next && !IS_BOTTOM
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ } else {
+ assert(!p_index_key);
+ }
+ return false;
+ }
+ }
+
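+  // step pos back to the previous slot and load the requested key/value;
+  // if the lower-stage position is at begin(), move to the previous index
+  // at this stage and descend to its largest slot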
+ template <bool GET_KEY, bool GET_VAL>
+ static void get_prev_slot(
+ const container_t& container, // IN
+ position_t& pos, // IN&OUT
+ key_view_t* p_index_key, // OUT
+ const value_t** pp_value) { // OUT
+ assert(pos != position_t::begin());
+ assert(!pos.is_end());
+ auto& index = pos.index;
+ auto iter = iterator_t(container);
+ if constexpr (!IS_BOTTOM) {
+ auto& nxt_pos = pos.nxt;
+ if (nxt_pos == NXT_STAGE_T::position_t::begin()) {
+ assert(index);
+ --index;
+ iter.seek_at(index);
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::template get_largest_slot<true, GET_KEY, GET_VAL>(
+ nxt_container, &nxt_pos, p_index_key, pp_value);
+ } else {
+ iter.seek_at(index);
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::template get_prev_slot<GET_KEY, GET_VAL>(
+ nxt_container, nxt_pos, p_index_key, pp_value);
+ }
+ } else {
+ assert(index);
+ --index;
+ iter.seek_at(index);
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ } else {
+ assert(!pp_value);
+ }
+ }
+ if constexpr (GET_KEY) {
+ p_index_key->set(iter.get_key());
+ } else {
+ assert(!p_index_key);
+ }
+ }
+
+ struct _BaseEmpty {};
+ class _BaseWithNxtIterator {
+ protected:
+ typename NXT_STAGE_T::StagedIterator _nxt;
+ };
+ class StagedIterator
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> {
+ public:
+ StagedIterator() = default;
+ bool valid() const { return iter.has_value(); }
+ index_t index() const {
+ return iter->index();
+ }
+ bool is_end() const { return iter->is_end(); }
+ bool in_progress() const {
+ assert(valid());
+ assert(!is_end());
+ if constexpr (!IS_BOTTOM) {
+ if (this->_nxt.valid()) {
+ if (this->_nxt.index() == 0) {
+ return this->_nxt.in_progress();
+ } else {
+ return true;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ key_get_type get_key() const { return iter->get_key(); }
+
+ iterator_t& get() { return *iter; }
+ void set(const container_t& container) {
+ assert(!valid());
+ iter = iterator_t(container);
+ }
+ void set_end() { iter->set_end(); }
+ typename NXT_STAGE_T::StagedIterator& nxt() {
+ if constexpr (!IS_BOTTOM) {
+ if (!this->_nxt.valid()) {
+ auto nxt_container = iter->get_nxt_container();
+ this->_nxt.set(nxt_container);
+ }
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::StagedIterator& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ StagedIterator& operator++() {
+ if (iter->is_last()) {
+ iter->set_end();
+ } else {
+ ++(*iter);
+ }
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ return *this;
+ }
+ void reset() {
+ if (valid()) {
+ iter.reset();
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ }
+ }
+
+ template<typename OutputIt>
+ auto do_format_to(OutputIt out, bool is_top) const {
+ if (valid()) {
+ if (iter->is_end()) {
+ return fmt::format_to(out, "END");
+ } else {
+ out = fmt::format_to(out, "{}", index());
+ }
+ } else {
+ if (is_top) {
+ return fmt::format_to(out, "invalid StagedIterator!");
+ } else {
+ out = fmt::format_to(out, "0!");
+ }
+ }
+ if constexpr (!IS_BOTTOM) {
+ out = fmt::format_to(out, ", ");
+ return this->_nxt.do_format_to(out, false);
+ } else {
+ return out;
+ }
+ }
+
+ position_t get_pos() const {
+ if (valid()) {
+ if constexpr (IS_BOTTOM) {
+ return position_t{index()};
+ } else {
+ return position_t{index(), this->_nxt.get_pos()};
+ }
+ } else {
+ return position_t::begin();
+ }
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ uint8_t present = static_cast<bool>(iter);
+ ceph::encode(present, encoded);
+ if (iter.has_value()) {
+ iter->encode(p_node_start, encoded);
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.encode(p_node_start, encoded);
+ }
+ }
+ }
+ static StagedIterator decode(const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ StagedIterator ret;
+ uint8_t present;
+ ceph::decode(present, delta);
+ if (present) {
+ ret.iter = iterator_t::decode(
+ p_node_start, node_size, delta);
+ if constexpr (!IS_BOTTOM) {
+ ret._nxt = NXT_STAGE_T::StagedIterator::decode(
+ p_node_start, node_size, delta);
+ }
+ }
+ return ret;
+ }
+ private:
+ std::optional<iterator_t> iter;
+ };
+
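+  // locate the split point at and below this stage by accumulating item
+  // sizes until target_size is crossed (fair split: the boundary slot goes
+  // to whichever side fits better); returns true if the split point moves
+  // past the last slot, so the caller should advance at the upper stage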
+ static bool recursively_locate_split(
+ size_t& current_size, size_t extra_size,
+ size_t target_size, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ iterator_t& split_iter = split_at.get();
+ current_size = split_iter.seek_split(current_size, extra_size, target_size);
+ assert(current_size <= target_size);
+ assert(!split_iter.is_end());
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper_bound, fair split strategy
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
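+  // same as recursively_locate_split(), but also account for the pending
+  // insert at insert_pos/insert_stage with insert_size; sets is_insert_left
+  // to whether the insert lands in the left node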
+ static bool recursively_locate_split_inserted(
+ size_t& current_size, size_t extra_size, size_t target_size,
+ position_t& insert_pos, match_stage_t insert_stage, size_t insert_size,
+ std::optional<bool>& is_insert_left, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ assert(!is_insert_left.has_value());
+ iterator_t& split_iter = split_at.get();
+ auto& insert_index = insert_pos.index;
+ if (insert_stage == STAGE) {
+ current_size = split_iter.template seek_split_inserted<true>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(is_insert_left.has_value());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ if (insert_index == 0) {
+ if (*is_insert_left == false) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ } else {
+ extra_size += iterator_t::header_size();
+ }
+ } else {
+ extra_size = 0;
+ }
+ if (*is_insert_left == false && split_iter.index() == insert_index) {
+ // split_iter can be end
+ // found the lower-bound of target_size
+ // ...[s_index-1] |!| (i_index) [s_index]...
+
+ // located upper-bound, fair split strategy
+ // look at the next slot (the insert item)
+ size_t nxt_size = insert_size + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ *is_insert_left = true;
+ current_size += nxt_size;
+ if (split_iter.is_end()) {
+ // ...[s_index-1] (i_index) |!|
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ // exclude next
+ return false;
+ }
+ } else {
+ // Already considered insert effect in the current stage.
+ // Look into the next stage to identify the target_size lower-bound w/o
+ // insert effect.
+ assert(!split_iter.is_end());
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper-bound, fair split strategy
+ // look at the next slot
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ auto end_index = split_iter.index() + 1;
+ if (insert_index == INDEX_END) {
+ insert_index = end_index;
+ }
+ assert(insert_index <= end_index);
+ if (insert_index == end_index) {
+ assert(*is_insert_left == false);
+ split_iter.set_end();
+ // ...[s_index-1] |!| (i_index)
+ return false;
+ } else {
+ assert(*is_insert_left == true);
+ return true;
+ }
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ assert(insert_stage < STAGE);
+ current_size = split_iter.template seek_split_inserted<false>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(!split_iter.is_end());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if (!is_insert_left.has_value()) {
+ // Considered insert effect in the current stage, and insert happens
+ // in the lower stage.
+ // Look into the next stage to identify the target_size lower-bound w/
+ // insert effect.
+ assert(split_iter.index() == insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted(
+ current_size, extra_size + split_iter.size_to_nxt(), target_size,
+ insert_pos.nxt, insert_stage, insert_size,
+ is_insert_left, split_at.nxt());
+ assert(is_insert_left.has_value());
+#ifndef NDEBUG
+ if (locate_nxt) {
+ assert(*is_insert_left == true);
+ }
+#endif
+ } else {
+ // is_insert_left.has_value() == true
+ // Insert will *not* happen in the lower stage.
+ // Need to look into the next stage to identify the target_size
+ // lower-bound w/ insert effect
+ assert(split_iter.index() != insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+#ifndef NDEBUG
+ if (split_iter.index() < insert_index) {
+ assert(*is_insert_left == false);
+ } else {
+ assert(*is_insert_left == true);
+ }
+#endif
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ ceph_abort("impossible path");
+      return false;
+ }
+ }
+ }
+
+ /*
+ * container appender type system
+ * container_t::Appender(NodeExtentMutable& mut, char* p_append)
+ * append(const container_t& src, index_t from, index_t items)
+ * wrap() -> char*
+ * IF !IS_BOTTOM:
+ * open_nxt(const key_get_type&)
+ * open_nxt(const full_key_t&)
+ * -> std::tuple<NodeExtentMutable&, char*>
+ * wrap_nxt(char* p_append)
+ * ELSE
+ * append(const full_key_t& key, const value_input_t& value)
+ */
+ template <KeyT KT>
+ struct _BaseWithNxtAppender {
+ typename NXT_STAGE_T::template StagedAppender<KT> _nxt;
+ };
+ template <KeyT KT>
+ class StagedAppender
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> {
+ public:
+ StagedAppender() = default;
+ ~StagedAppender() {
+ assert(!require_wrap_nxt);
+ assert(!valid());
+ }
+ bool valid() const { return appender.has_value(); }
+ index_t index() const {
+ assert(valid());
+ return _index;
+ }
+ bool in_progress() const { return require_wrap_nxt; }
+ // TODO: pass by reference
+ void init_empty(NodeExtentMutable* p_mut, char* p_start) {
+ assert(!valid());
+ appender = typename container_t::template Appender<KT>(p_mut, p_start);
+ _index = 0;
+ }
+ void init_tail(NodeExtentMutable* p_mut,
+ const container_t& container,
+ match_stage_t stage) {
+ assert(!valid());
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if (stage == STAGE) {
+ appender = iter.template get_appender<KT>(p_mut);
+ _index = iter.index() + 1;
+ if constexpr (!IS_BOTTOM) {
+ assert(!this->_nxt.valid());
+ }
+ } else {
+ assert(stage < STAGE);
+ if constexpr (!IS_BOTTOM) {
+ appender = iter.template get_appender_opened<KT>(p_mut);
+ _index = iter.index();
+ require_wrap_nxt = true;
+ auto nxt_container = iter.get_nxt_container();
+ this->_nxt.init_tail(p_mut, nxt_container, stage);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+ // possible to make src_iter end if to_index == INDEX_END
+ void append_until(StagedIterator& src_iter, index_t& to_index) {
+ assert(!require_wrap_nxt);
+ auto s_index = src_iter.index();
+ src_iter.get().template copy_out_until<KT>(*appender, to_index);
+ assert(src_iter.index() == to_index);
+ assert(to_index >= s_index);
+ auto increment = (to_index - s_index);
+ if (increment) {
+ _index += increment;
+ if constexpr (!IS_BOTTOM) {
+ src_iter.get_nxt().reset();
+ }
+ }
+ }
+ void append(const full_key_t<KT>& key,
+ const value_input_t& value, const value_t*& p_value) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ auto& nxt = open_nxt(key);
+ nxt.append(key, value, p_value);
+ wrap_nxt();
+ } else {
+ appender->append(key, value, p_value);
+ ++_index;
+ }
+ }
+ char* wrap() {
+ assert(valid());
+ assert(_index > 0);
+ if constexpr (!IS_BOTTOM) {
+ if (require_wrap_nxt) {
+ wrap_nxt();
+ }
+ }
+ auto ret = appender->wrap();
+ appender.reset();
+ return ret;
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+    open_nxt(key_get_type partial_key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+      auto [p_mut, p_append] = appender->open_nxt(partial_key);
+ this->_nxt.init_empty(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(const full_key_t<KT>& key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(key);
+ this->_nxt.init_empty(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ void wrap_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ require_wrap_nxt = false;
+ auto p_append = this->_nxt.wrap();
+ appender->wrap_nxt(p_append);
+ ++_index;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ private:
+ std::optional<typename container_t::template Appender<KT>> appender;
+ index_t _index;
+ bool require_wrap_nxt = false;
+ };
+
+ template <KeyT KT>
+ static void _append_range(
+ StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) {
+ if (src_iter.is_end()) {
+ // append done
+ assert(to_index == INDEX_END);
+ to_index = src_iter.index();
+ } else if constexpr (!IS_BOTTOM) {
+ if (appender.in_progress()) {
+ // appender has appended something at the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.get_nxt(), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else if (src_iter.in_progress()) {
+ // src_iter is not at the beginning of the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.get_nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else {
+ // we can safely append the current item as-a-whole
+ }
+ }
+ appender.append_until(src_iter, to_index);
+ }
+
+ template <KeyT KT>
+ static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ assert(position.index == src_iter.index());
+ // reaches the last item
+ if (stage == STAGE) {
+ // done, end recursion
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ } else {
+ assert(stage < STAGE);
+ // proceed append in the next stage
+ NXT_STAGE_T::template append_until<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()),
+ position.nxt, stage);
+ }
+ }
+
+ template <KeyT KT>
+ static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ index_t from_index = src_iter.index();
+ index_t& to_index = position.index;
+ assert(from_index <= to_index);
+ if constexpr (IS_BOTTOM) {
+ assert(stage == STAGE);
+ appender.append_until(src_iter, to_index);
+ } else {
+ assert(stage <= STAGE);
+ if (src_iter.index() == to_index) {
+ _append_into<KT>(src_iter, appender, position, stage);
+ } else {
+ if (to_index == INDEX_END) {
+ assert(stage == STAGE);
+ } else if (to_index == INDEX_LAST) {
+ assert(stage < STAGE);
+ }
+ _append_range<KT>(src_iter, appender, to_index);
+ _append_into<KT>(src_iter, appender, position, stage);
+ }
+ }
+ to_index -= from_index;
+ }
+
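+  // append the new key/value at the given stage while copying to the
+  // appender; returns true if src_iter is (or has become) exhausted at
+  // this stage after the insert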
+ template <KeyT KT>
+ static bool append_insert(
+ const full_key_t<KT>& key, const value_input_t& value,
+ StagedIterator& src_iter, StagedAppender<KT>& appender,
+ bool is_front_insert, match_stage_t& stage, const value_t*& p_value) {
+ assert(src_iter.valid());
+ if (stage == STAGE) {
+ appender.append(key, value, p_value);
+ if (src_iter.is_end()) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ assert(stage < STAGE);
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>(
+ key, value, src_iter.get_nxt(), appender.get_nxt(),
+ is_front_insert, stage, p_value);
+ if (nxt_is_end) {
+ appender.wrap_nxt();
+ ++src_iter;
+ if (is_front_insert) {
+ stage = STAGE;
+ }
+ if (src_iter.is_end()) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ /* TrimType:
+   * BEFORE: remove the entire container; normally the corresponding
+   * higher-stage iterator needs to be trimmed as-a-whole.
+   * AFTER: retain the entire container; normally the trim should start
+   * from the next iterator at the higher stage.
+   * AT: trim happens in the current container, and the corresponding
+   * higher-stage iterator needs to be adjusted by the trimmed size.
+ */
+ static std::tuple<TrimType, node_offset_t>
+ recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ if (!trim_at.valid()) {
+ return {TrimType::BEFORE, 0u};
+ }
+ if (trim_at.is_end()) {
+ return {TrimType::AFTER, 0u};
+ }
+
+ auto& iter = trim_at.get();
+ if constexpr (!IS_BOTTOM) {
+ auto [type, trimmed] = NXT_STAGE_T::recursively_trim(
+ mut, trim_at.get_nxt());
+ node_offset_t trim_size;
+ if (type == TrimType::AFTER) {
+ if (iter.is_last()) {
+ return {TrimType::AFTER, 0u};
+ }
+ ++trim_at;
+ trim_size = iter.trim_until(mut);
+ } else if (type == TrimType::BEFORE) {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ }
+ trim_size = iter.trim_until(mut);
+ } else {
+ trim_size = iter.trim_at(mut, trimmed);
+ }
+ return {TrimType::AT, trim_size};
+ } else {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ } else {
+ auto trimmed = iter.trim_until(mut);
+ return {TrimType::AT, trimmed};
+ }
+ }
+ }
+
+ static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ auto [type, trimmed] = recursively_trim(mut, trim_at);
+ if (type == TrimType::BEFORE) {
+ assert(trim_at.valid());
+ auto& iter = trim_at.get();
+ iter.trim_until(mut);
+ }
+ }
+
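+  // erase the item at pos recursively; returns std::nullopt when the erase
+  // would empty this container, so the caller needs to erase at the upper
+  // stage; otherwise returns the erased stage, the erased size, and whether
+  // the next position is already resolved in pos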
+ static std::optional<std::tuple<match_stage_t, node_offset_t, bool>>
+ proceed_erase_recursively(
+ NodeExtentMutable& mut,
+ const container_t& container, // IN
+ const char* p_left_bound, // IN
+ position_t& pos) { // IN&OUT
+ auto iter = iterator_t(container);
+ auto& index = pos.index;
+ assert(is_valid_index(index));
+ iter.seek_at(index);
+ bool is_last = iter.is_last();
+
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ auto ret = NXT_STAGE_T::proceed_erase_recursively(
+ mut, nxt_container, p_left_bound, pos.nxt);
+ if (ret.has_value()) {
+ // erased at lower level
+ auto [r_stage, r_erase_size, r_done] = *ret;
+ assert(r_erase_size != 0);
+ iter.update_size(mut, -r_erase_size);
+ if (r_done) {
+ // done, the next_pos is calculated
+ return ret;
+ } else {
+ if (is_last) {
+ // need to find the next pos at upper stage
+ return ret;
+ } else {
+ // done, calculate the next pos
+ ++index;
+ pos.nxt = NXT_STAGE_T::position_t::begin();
+ return {{r_stage, r_erase_size, true}};
+ }
+ }
+ }
+ // not erased at lower level
+ }
+
+ // not erased yet
+ if (index == 0 && is_last) {
+ // need to erase from the upper stage
+ return std::nullopt;
+ } else {
+ auto erase_size = iter.erase(mut, p_left_bound);
+ assert(erase_size != 0);
+ if (is_last) {
+ // need to find the next pos at upper stage
+ return {{STAGE, erase_size, false}};
+ } else {
+ // done, calculate the next pos (should be correct already)
+ if constexpr (!IS_BOTTOM) {
+ assert(pos.nxt == NXT_STAGE_T::position_t::begin());
+ }
+ return {{STAGE, erase_size, true}};
+ }
+ }
+ }
+
+ static match_stage_t erase(
+ NodeExtentMutable& mut,
+ const container_t& node_stage, // IN
+ position_t& erase_pos) { // IN&OUT
+ auto p_left_bound = node_stage.p_left_bound();
+ auto ret = proceed_erase_recursively(
+ mut, node_stage, p_left_bound, erase_pos);
+ if (ret.has_value()) {
+ auto [r_stage, r_erase_size, r_done] = *ret;
+ std::ignore = r_erase_size;
+ if (r_done) {
+ assert(!erase_pos.is_end());
+ return r_stage;
+ } else {
+ // erased the last kv
+ erase_pos = position_t::end();
+ return r_stage;
+ }
+ } else {
+ assert(node_stage.keys() == 1);
+ node_stage.erase_at(mut, node_stage, 0, p_left_bound);
+ erase_pos = position_t::end();
+ return STAGE;
+ }
+ }
+
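+  // evaluate merging right_container into the node on its left, given the
+  // left side's pivot key; returns the stage at which the two sides diverge
+  // and the accumulated header/prefix size of the right side down to that
+  // stage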
+ static std::tuple<match_stage_t, node_offset_t> evaluate_merge(
+ const key_view_t& left_pivot_index,
+ const container_t& right_container) {
+ auto r_iter = iterator_t(right_container);
+ r_iter.seek_at(0);
+ node_offset_t compensate = r_iter.header_size();
+ auto cmp = left_pivot_index <=> r_iter.get_key();
+ if (cmp == std::strong_ordering::equal) {
+ if constexpr (!IS_BOTTOM) {
+ // the index is equal, compensate and look at the lower stage
+ compensate += r_iter.size_to_nxt();
+ auto r_nxt_container = r_iter.get_nxt_container();
+ auto [ret_stage, ret_compensate] = NXT_STAGE_T::evaluate_merge(
+ left_pivot_index, r_nxt_container);
+ compensate += ret_compensate;
+ return {ret_stage, compensate};
+ } else {
+ ceph_abort("impossible path: left_pivot_key == right_first_key");
+ }
+ } else if (cmp == std::strong_ordering::less) {
+ // ok, do merge here
+ return {STAGE, compensate};
+ } else {
+ ceph_abort("impossible path: left_pivot_key < right_first_key");
+ }
+ }
+};
+
+/**
+ * Configurations for struct staged
+ *
+ * staged_params_* assembles different container_t implementations (defined by
+ * staged::_iterator_t) by STAGE, and constructs the final multi-stage
+ * implementations for different node layouts defined by
+ * node_extent_t<FieldType, NODE_TYPE>.
+ *
+ * The specialized implementations for different layouts are accessible through
+ * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>.
+ *
+ * Specifically, the settings of 8 layouts are:
+ *
+ * The layout (N0, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N1, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N2, LEAF/INTERNAL) has 2 stages:
+ * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N3, LEAF) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF>
+ *
+ * The layout (N3, INTERNAL) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL>
+ */
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_subitems {
+ using container_t = sub_items_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_item_iterator {
+ using container_t = item_iterator_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_01 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_LEFT;
+
+ using next_param_t = staged_params_item_iterator<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_2 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_3 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_node_3<NodeType>;
+};
+
+template <typename NodeType, typename Enable = void> struct _node_to_stage_t;
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 ||
+ NodeType::FIELD_TYPE == field_type_t::N1>> {
+ using type = staged<staged_params_node_01<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> {
+ using type = staged<staged_params_node_2<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> {
+ using type = staged<staged_params_node_3<NodeType>>;
+};
+template <typename NodeType>
+using node_to_stage_t = typename _node_to_stage_t<NodeType>::type;
+
+}
+
+template<typename T>
+concept HasDoFormatTo = requires(T x, std::back_insert_iterator<fmt::memory_buffer> out) {
+ { x.do_format_to(out, true) } -> std::same_as<decltype(out)>;
+};
+template <HasDoFormatTo T> struct fmt::formatter<T> : fmt::formatter<std::string_view> {
+ template <typename FormatContext>
+ auto format(const T& staged_iterator, FormatContext& ctx) {
+ return staged_iterator.do_format_to(ctx.out(), true);
+ }
+};
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
new file mode 100644
index 000000000..3c1b32a41
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
@@ -0,0 +1,442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/value.h"
+
+namespace crimson::os::seastore::onode {
+
+using match_stage_t = int8_t;
+constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush
+constexpr match_stage_t STAGE_STRING = 1; // nspace/oid
+constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen
+constexpr auto STAGE_TOP = STAGE_LEFT;
+constexpr auto STAGE_BOTTOM = STAGE_RIGHT;
+constexpr bool is_valid_stage(match_stage_t stage) {
+ return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage;
+}
+// TODO: replace by
+// using match_history_t = int8_t;
+// left_m, str_m, right_m
+// 3: GT,
+// 2: EQ, GT,
+// 1: EQ, EQ, GT
+// 0: EQ, EQ, EQ
+// -1: EQ, EQ, LT
+// -2: EQ, LT,
+// -3: LT,
+
+struct MatchHistory {
+ template <match_stage_t STAGE>
+ const std::optional<MatchKindCMP>& get() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE == STAGE_RIGHT) {
+ return right_match;
+ } else if (STAGE == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ const std::optional<MatchKindCMP>&
+ get_by_stage(match_stage_t stage) const {
+ assert(is_valid_stage(stage));
+ if (stage == STAGE_RIGHT) {
+ return right_match;
+ } else if (stage == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ template <match_stage_t STAGE = STAGE_TOP>
+ const bool is_GT() const;
+
+ template <match_stage_t STAGE>
+ void set(MatchKindCMP match) {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(*get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ);
+ const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match;
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "history(";
+ dump_each(os, left_match) << ", ";
+ dump_each(os, string_match) << ", ";
+ dump_each(os, right_match) << ")";
+ return os;
+ }
+
+ std::ostream& dump_each(
+ std::ostream& os, const std::optional<MatchKindCMP>& match) const {
+ if (!match.has_value()) {
+ return os << "--";
+ } else if (*match == MatchKindCMP::LT) {
+ return os << "LT";
+ } else if (*match == MatchKindCMP::EQ) {
+ return os << "EQ";
+ } else if (*match == MatchKindCMP::GT) {
+ return os << "GT";
+ } else {
+ ceph_abort("impossble path");
+ }
+ }
+
+ std::optional<MatchKindCMP> left_match;
+ std::optional<MatchKindCMP> string_match;
+ std::optional<MatchKindCMP> right_match;
+};
+inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) {
+ return pos.dump(os);
+}
+
+template <match_stage_t STAGE>
+struct _check_GT_t {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE>() &&
+ (*history->get<STAGE>() == MatchKindCMP::GT ||
+ (*history->get<STAGE>() == MatchKindCMP::EQ &&
+ _check_GT_t<STAGE - 1>::eval(history)));
+ }
+};
+template <>
+struct _check_GT_t<STAGE_RIGHT> {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE_RIGHT>() &&
+ *history->get<STAGE_RIGHT>() == MatchKindCMP::GT;
+ }
+};
+template <match_stage_t STAGE>
+const bool MatchHistory::is_GT() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ return _check_GT_t<STAGE>::eval(this);
+}
+
+template <match_stage_t STAGE>
+struct staged_position_t {
+ static_assert(is_valid_stage(STAGE));
+ using me_t = staged_position_t<STAGE>;
+ using nxt_t = staged_position_t<STAGE - 1>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage <= STAGE);
+ if (STAGE == stage) {
+ return index;
+ } else {
+ return nxt.index_by_stage(stage);
+ }
+ }
+
+ auto operator<=>(const me_t& o) const = default;
+
+ void assert_next_to(const me_t& prv) const {
+#ifndef NDEBUG
+ if (is_end()) {
+ assert(!prv.is_end());
+ } else if (index == prv.index) {
+ assert(!nxt.is_end());
+ nxt.assert_next_to(prv.nxt);
+ } else if (index == prv.index + 1) {
+ assert(!prv.nxt.is_end());
+ assert(nxt == nxt_t::begin());
+ } else {
+ assert(false);
+ }
+#endif
+ }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ if (index == 0) {
+ nxt -= o.nxt;
+ }
+ }
+ return *this;
+ }
+
+ me_t& operator+=(const me_t& o) {
+ assert(is_valid_index(index));
+ assert(is_valid_index(o.index));
+ index += o.index;
+ nxt += o.nxt;
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ nxt.encode(encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ ret.nxt = nxt_t::decode(delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u, nxt_t::begin()}; }
+ static me_t end() {
+ return {INDEX_END, nxt_t::end()};
+ }
+
+ index_t index;
+ nxt_t nxt;
+};
+template <match_stage_t STAGE>
+std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os << ", " << pos.nxt;
+}
+
+template <>
+struct staged_position_t<STAGE_BOTTOM> {
+ using me_t = staged_position_t<STAGE_BOTTOM>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage == STAGE_BOTTOM);
+ return index;
+ }
+
+ auto operator<=>(const me_t&) const = default;
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ }
+ return *this;
+ }
+
+ me_t& operator+=(const me_t& o) {
+ assert(is_valid_index(index));
+ assert(is_valid_index(o.index));
+ index += o.index;
+ return *this;
+ }
+
+ void assert_next_to(const me_t& prv) const {
+#ifndef NDEBUG
+ if (is_end()) {
+ assert(!prv.is_end());
+ } else {
+ assert(index == prv.index + 1);
+ }
+#endif
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u}; }
+ static me_t end() { return {INDEX_END}; }
+
+ index_t index;
+};
+template <>
+inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os;
+}
+
+using search_position_t = staged_position_t<STAGE_TOP>;
+
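+// extract the staged_position_t of the given lower stage from a full
+// search_position_t; only valid when the indexes at the higher stages are 0,
+// or when pos is an end position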
+template <match_stage_t STAGE>
+const staged_position_t<STAGE>& cast_down(const search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ }
+#endif
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ assert(pos.nxt.index == 0u);
+ }
+#endif
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down(search_position_t& pos) {
+ const search_position_t& _pos = pos;
+ return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos));
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+  } else if constexpr (STAGE == STAGE_STRING) {
+ pos.index = 0;
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+ pos.index = 0;
+ pos.nxt.index = 0;
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); }
+
+template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>>
+search_position_t normalize(staged_position_t<STAGE>&& pos) {
+ if (pos.is_end()) {
+ return search_position_t::end();
+ }
+ if constexpr (STAGE == STAGE_STRING) {
+ return {0u, std::move(pos)};
+ } else if (STAGE == STAGE_RIGHT) {
+ return {0u, {0u, std::move(pos)}};
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+struct memory_range_t {
+ const char* p_start;
+ const char* p_end;
+};
+
+struct container_range_t {
+ memory_range_t range;
+ extent_len_t node_size;
+};
+
+enum class ContainerType { ITERATIVE, INDEXABLE };
+
+// the input type to construct the value during insert.
+template <node_type_t> struct value_input_type;
+template<> struct value_input_type<node_type_t::INTERNAL> { using type = laddr_t; };
+template<> struct value_input_type<node_type_t::LEAF> { using type = value_config_t; };
+template <node_type_t NODE_TYPE>
+using value_input_type_t = typename value_input_type<NODE_TYPE>::type;
+
+template <node_type_t> struct value_type;
+template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; };
+template<> struct value_type<node_type_t::LEAF> { using type = value_header_t; };
+template <node_type_t NODE_TYPE>
+using value_type_t = typename value_type<NODE_TYPE>::type;
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE>
+struct staged_result_t {
+ using me_t = staged_result_t<NODE_TYPE, STAGE>;
+ bool is_end() const { return position.is_end(); }
+
+ static me_t end() {
+ return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END};
+ }
+ template <typename T = me_t>
+ static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt(
+ index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) {
+ return {{index, nxt_stage_result.position},
+ nxt_stage_result.p_value,
+ nxt_stage_result.mstat};
+ }
+
+ staged_position_t<STAGE> position;
+ const value_type_t<NODE_TYPE>* p_value;
+ match_stat_t mstat;
+};
+
+template <node_type_t NODE_TYPE>
+using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>;
+
+template <node_type_t NODE_TYPE>
+lookup_result_t<NODE_TYPE>&& normalize(
+ lookup_result_t<NODE_TYPE>&& result) { return std::move(result); }
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE,
+ typename = std::enable_if_t<STAGE != STAGE_TOP>>
+lookup_result_t<NODE_TYPE> normalize(
+ staged_result_t<NODE_TYPE, STAGE>&& result) {
+ // FIXME: assert result.mstat correct
+ return {normalize(std::move(result.position)), result.p_value, result.mstat};
+}
+
+struct node_stats_t {
+ size_t size_persistent = 0;
+ size_t size_filled = 0;
+ // filled by staged::get_stats()
+ size_t size_logical = 0;
+ size_t size_overhead = 0;
+ size_t size_value = 0;
+ unsigned num_kvs = 0;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <crimson::os::seastore::onode::match_stage_t S>
+struct fmt::formatter<crimson::os::seastore::onode::staged_position_t<S>> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::onode::MatchHistory> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
new file mode 100644
index 000000000..28e6f7102
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sub_items_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+template <IsFullKey Key>
+const laddr_packed_t* internal_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const internal_sub_items_t& sub_items,
+ const Key& key, const laddr_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound)
+{
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert(key, value));
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = reinterpret_cast<const char*>(
+ sub_items.p_first_item + 1 - index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ auto p_insert = const_cast<char*>(p_shift_end) - size;
+ auto item = internal_sub_item_t{
+ snap_gen_t::from_key(key), laddr_packed_t{value}};
+ mut.copy_in_absolute(p_insert, item);
+ return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value;
+}
+#define IA_TEMPLATE(Key) \
+ template const laddr_packed_t* internal_sub_items_t::insert_at<Key>( \
+ NodeExtentMutable&, const internal_sub_items_t&, const Key&, \
+ const laddr_t&, index_t, node_offset_t, const char*)
+IA_TEMPLATE(key_view_t);
+IA_TEMPLATE(key_hobj_t);
+
+node_offset_t internal_sub_items_t::trim_until(
+ NodeExtentMutable& mut, internal_sub_items_t& items, index_t index)
+{
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ size_t ret = sizeof(internal_sub_item_t) * (keys - index);
+ assert(ret < mut.get_length());
+ return ret;
+}
+
+node_offset_t internal_sub_items_t::erase_at(
+ NodeExtentMutable& mut, const internal_sub_items_t& sub_items,
+ index_t index, const char* p_left_bound)
+{
+ assert(index < sub_items.keys());
+ node_offset_t erase_size = sizeof(internal_sub_item_t);
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = reinterpret_cast<const char*>(
+ sub_items.p_first_item - index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, erase_size);
+ return erase_size;
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const internal_sub_items_t& src, index_t from, index_t items)
+{
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ node_offset_t size = sizeof(internal_sub_item_t) * items;
+ p_append -= size;
+ p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size);
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const full_key_t<KT>& key, const laddr_t& value,
+ const laddr_packed_t*& p_value)
+{
+ p_append -= sizeof(internal_sub_item_t);
+ auto item = internal_sub_item_t{
+ snap_gen_t::from_key(key), laddr_packed_t{value}};
+ p_mut->copy_in_absolute(p_append, item);
+ p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value;
+}
+
+template <IsFullKey Key>
+const value_header_t* leaf_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const leaf_sub_items_t& sub_items,
+ const Key& key, const value_config_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound)
+{
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert(key, value));
+ // a. [... item(index)] << size
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = sub_items.get_item_end(index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ // b. insert item
+ auto p_insert = const_cast<char*>(p_shift_end - size);
+ auto p_value = reinterpret_cast<value_header_t*>(p_insert);
+ p_value->initiate(mut, value);
+ p_insert += value.allocation_size();
+ mut.copy_in_absolute(p_insert, snap_gen_t::from_key(key));
+ assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end);
+
+ // c. compensate affected offsets
+ auto item_size = value.allocation_size() + sizeof(snap_gen_t);
+ for (auto i = index; i < sub_items.keys(); ++i) {
+ const node_offset_packed_t& offset_i = sub_items.get_offset(i);
+ mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size));
+ }
+
+ // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t)
+ const char* p_offset = (index == 0 ?
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) :
+ (const char*)&sub_items.get_offset(index - 1));
+ p_shift_start = p_shift_end;
+ p_shift_end = p_offset;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t));
+
+ // e. insert offset
+ node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index);
+ mut.copy_in_absolute(
+ const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start);
+
+ // f. update num_sub_keys
+ mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1));
+
+ return p_value;
+}
+template const value_header_t* leaf_sub_items_t::insert_at<key_hobj_t>(
+ NodeExtentMutable&, const leaf_sub_items_t&, const key_hobj_t&,
+ const value_config_t&, index_t, node_offset_t, const char*);
+
+node_offset_t leaf_sub_items_t::trim_until(
+ NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index)
+{
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ index_t trim_items = keys - index;
+ const char* p_items_start = items.p_start();
+ const char* p_shift_start = items.get_item_end(index);
+ const char* p_shift_end = items.get_item_end(0);
+ size_t size_trim_offsets = sizeof(node_offset_t) * trim_items;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start,
+ size_trim_offsets);
+ mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index));
+ size_t ret = size_trim_offsets + (p_shift_start - p_items_start);
+ assert(ret < mut.get_length());
+ return ret;
+}
+
+node_offset_t leaf_sub_items_t::erase_at(
+ NodeExtentMutable& mut, const leaf_sub_items_t& sub_items,
+ index_t index, const char* p_left_bound)
+{
+ assert(sub_items.keys() > 0);
+ assert(index < sub_items.keys());
+ auto p_item_start = sub_items.get_item_start(index);
+ auto p_item_end = sub_items.get_item_end(index);
+ assert(p_item_start < p_item_end);
+ node_offset_t item_erase_size = p_item_end - p_item_start;
+ node_offset_t erase_size = item_erase_size + sizeof(node_offset_t);
+ auto p_offset_end = (const char*)&sub_items.get_offset(index);
+
+ // a. compensate affected offset[n] ... offset[index+1]
+ for (auto i = index + 1; i < sub_items.keys(); ++i) {
+ const node_offset_packed_t& offset_i = sub_items.get_offset(i);
+ mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value - item_erase_size));
+ }
+
+ // b. kv[index-1] ... kv[0] ... offset[index+1] >> sizeof(node_offset_t)
+ mut.shift_absolute(p_item_end, p_offset_end - p_item_end, sizeof(node_offset_t));
+
+ // c. ... kv[n] ... kv[index+1] >> item_erase_size
+ mut.shift_absolute(p_left_bound, p_item_start - p_left_bound, erase_size);
+
+ // d. update num_keys
+ mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() - 1));
+
+ return erase_size;
+}
+
+template class internal_sub_items_t::Appender<KeyT::VIEW>;
+template class internal_sub_items_t::Appender<KeyT::HOBJ>;
+
+// helper type for the visitor
+template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
+// explicit deduction guide
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+template <KeyT KT>
+void leaf_sub_items_t::Appender<KT>::append(
+ const leaf_sub_items_t& src, index_t from, index_t items)
+{
+ if (p_append) {
+ // append from empty
+ assert(cnt <= APPENDER_LIMIT);
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ if (op_src) {
+ assert(*op_src == src);
+ } else {
+ op_src = src;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ appends[cnt] = range_items_t{from, items};
+ ++cnt;
+ } else {
+ // append from existing
+ assert(op_dst.has_value());
+ assert(!p_appended);
+ assert(from == 0);
+ assert(items);
+ assert(items == src.keys());
+
+ num_keys_t num_keys = op_dst->keys();
+ node_offset_t compensate = op_dst->get_offset(num_keys - 1).value;
+ const char* p_items_start = op_dst->p_start();
+ const char* p_items_end = op_dst->p_items_end;
+
+ // update dst num_keys
+ num_keys += items;
+ p_mut->copy_in_absolute((char*)op_dst->p_num_keys, num_keys);
+
+ // shift dst items
+ std::size_t src_offsets_size = sizeof(node_offset_t) * items;
+ p_mut->shift_absolute(p_items_start,
+ p_items_end - p_items_start,
+ -(int)src_offsets_size);
+
+ // fill offsets from src
+ node_offset_t offset;
+ char* p_cur_offset = const_cast<char*>(p_items_end);
+ for (auto i = from; i < from + items; ++i) {
+ offset = src.get_offset(i).value + compensate;
+ p_cur_offset -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur_offset, offset);
+ }
+
+ // fill items from src
+ auto p_src_items_start = src.get_item_end(from + items);
+ std::size_t src_items_size = src.get_item_end(from) - p_src_items_start;
+ p_appended = const_cast<char*>(p_items_start) - src_offsets_size - src_items_size;
+ p_mut->copy_in_absolute(p_appended, p_src_items_start, src_items_size);
+ }
+}
+
+template <KeyT KT>
+char* leaf_sub_items_t::Appender<KT>::wrap()
+{
+ if (op_dst.has_value()) {
+ // append from existing
+ assert(p_appended);
+ return p_appended;
+ }
+ // append from empty
+ assert(p_append);
+ auto p_cur = p_append;
+ num_keys_t num_keys = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) { num_keys += arg.items; },
+ [&] (const kv_item_t& arg) { ++num_keys; }
+ }, a);
+ }
+ assert(num_keys);
+ p_cur -= sizeof(num_keys_t);
+ p_mut->copy_in_absolute(p_cur, num_keys);
+
+ node_offset_t last_offset = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ int compensate = (last_offset - op_src->get_offset_to_end(arg.from));
+ node_offset_t offset;
+ for (auto i = arg.from; i < arg.from + arg.items; ++i) {
+ offset = op_src->get_offset(i).value + compensate;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, offset);
+ }
+ last_offset = offset;
+ },
+ [&] (const kv_item_t& arg) {
+ last_offset += sizeof(snap_gen_t) + arg.value_config.allocation_size();
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, last_offset);
+ }
+ }, a);
+ }
+
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ auto _p_start = op_src->get_item_end(arg.from + arg.items);
+ size_t _len = op_src->get_item_end(arg.from) - _p_start;
+ p_cur -= _len;
+ p_mut->copy_in_absolute(p_cur, _p_start, _len);
+ },
+ [&] (const kv_item_t& arg) {
+ assert(pp_value);
+ p_cur -= sizeof(snap_gen_t);
+ p_mut->copy_in_absolute(p_cur, snap_gen_t::from_key(*arg.p_key));
+ p_cur -= arg.value_config.allocation_size();
+ auto p_value = reinterpret_cast<value_header_t*>(p_cur);
+ p_value->initiate(*p_mut, arg.value_config);
+ *pp_value = p_value;
+ }
+ }, a);
+ }
+ return p_cur;
+}
+
+template class leaf_sub_items_t::Appender<KeyT::VIEW>;
+template class leaf_sub_items_t::Appender<KeyT::HOBJ>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
new file mode 100644
index 000000000..e3d1fd7c5
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
@@ -0,0 +1,368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct internal_sub_item_t {
+ const snap_gen_t& get_key() const { return key; }
+ const laddr_packed_t* get_p_value() const { return &value; }
+
+ snap_gen_t key;
+ laddr_packed_t value;
+} __attribute__((packed));
+
+/**
+ * internal_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to child node
+ * addresses.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <--------- container range -----------> #
+ * #<~># sub-items [2, n) #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> #
+ * #...# snap-gen | laddr # snap-gen | laddr #
+ * ^
+ * |
+ * p_first_item +
+ */
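+//
+// An illustrative sketch (container_range is assumed to point at a valid
+// sub-items range) of how this indexable container is typically walked,
+// using only the accessors declared below:
+//
+//   internal_sub_items_t sub_items{container_range};
+//   for (index_t i = 0; i < sub_items.keys(); ++i) {
+//     const snap_gen_t& key = sub_items[i];                    // key of sub-item i
+//     const laddr_packed_t* p_addr = sub_items.get_p_value(i); // child address
+//   }
+//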
+class internal_sub_items_t {
+ public:
+ using num_keys_t = index_t;
+
+ internal_sub_items_t(const container_range_t& _range)
+ : node_size{_range.node_size} {
+ assert(is_valid_node_size(node_size));
+ auto& range = _range.range;
+ assert(range.p_start < range.p_end);
+ assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0);
+ num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t);
+ assert(num_items > 0);
+ auto _p_first_item = range.p_end - sizeof(internal_sub_item_t);
+ p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return num_items; }
+ key_get_type operator[](index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_key();
+ }
+ node_offset_t size_before(index_t index) const {
+ size_t ret = index * sizeof(internal_sub_item_t);
+ assert(ret < node_size);
+ return ret;
+ }
+ const laddr_packed_t* get_p_value(index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_p_value();
+ }
+ node_offset_t size_overhead_at(index_t index) const { return 0u; }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_first_item) +
+ sizeof(internal_sub_item_t);
+ auto p_start = p_end - num_items * sizeof(internal_sub_item_t);
+ int start_offset = p_start - p_node_start;
+ int stage_size = p_end - p_start;
+ assert(start_offset > 0);
+ assert(stage_size > 0);
+ assert(start_offset + stage_size < (int)node_size);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(stage_size), encoded);
+ }
+
+ static internal_sub_items_t decode(
+ const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t stage_size;
+ ceph::decode(stage_size, delta);
+ assert(start_offset > 0);
+ assert(stage_size > 0);
+ assert((unsigned)start_offset + stage_size < node_size);
+ return internal_sub_items_t({{p_node_start + start_offset,
+ p_node_start + start_offset + stage_size},
+ node_size});
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <IsFullKey Key>
+ static node_offset_t estimate_insert(
+ const Key&, const laddr_t&) {
+ return sizeof(internal_sub_item_t);
+ }
+
+ template <IsFullKey Key>
+ static const laddr_packed_t* insert_at(
+ NodeExtentMutable&, const internal_sub_items_t&,
+ const Key&, const laddr_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t);
+
+ static node_offset_t erase_at(
+ NodeExtentMutable&, const internal_sub_items_t&, index_t, const char*);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ extent_len_t node_size;
+ index_t num_items;
+ const internal_sub_item_t* p_first_item;
+};
+
+template <KeyT KT>
+class internal_sub_items_t::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ Appender(NodeExtentMutable* p_mut, const internal_sub_items_t& sub_items)
+ : p_mut{p_mut},
+ p_append{(char*)(sub_items.p_first_item + 1 - sub_items.keys())} {
+ assert(sub_items.keys());
+ }
+ void append(const internal_sub_items_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const laddr_t&, const laddr_packed_t*&);
+ char* wrap() { return p_append; }
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+};
+
+/**
+ * leaf_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to value_header_t.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <------------------------ container range -------------------------------> #
+ * # <---------- sub-items ----------------> # <--- offsets ---------# #
+ * #<~># sub-items [2, n) #<~>| offsets [2, n) # #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> # | # #
+ * #...# snap-gen | value # snap-gen | value #...| offset1 | offset0 # num_keys #
+ * ^ ^ ^
+ * | | |
+ * p_items_end + p_offsets + |
+ * p_num_keys +
+ */
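+//
+// An illustrative sketch of how the offsets map to item ranges, using only
+// the accessors declared below (container_range is assumed to point at a
+// valid sub-items range): item i occupies [get_item_start(i), get_item_end(i)),
+// with its value header at the start and its snap-gen key at the end:
+//
+//   leaf_sub_items_t sub_items{container_range};
+//   for (index_t i = 0; i < sub_items.keys(); ++i) {
+//     const snap_gen_t& key = sub_items[i];
+//     const value_header_t* p_value = sub_items.get_p_value(i);
+//     assert(reinterpret_cast<const char*>(p_value) == sub_items.get_item_start(i));
+//   }
+//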
+class leaf_sub_items_t {
+ public:
+ // should be enough to index all keys in a node of up to 64 KiB
+ using num_keys_t = uint16_t;
+
+ // TODO: remove if num_keys_t is aligned
+ struct num_keys_packed_t {
+ num_keys_t value;
+ } __attribute__((packed));
+
+ leaf_sub_items_t(const container_range_t& _range)
+ : node_size{_range.node_size} {
+ assert(is_valid_node_size(node_size));
+ auto& range = _range.range;
+ assert(range.p_start < range.p_end);
+ auto _p_num_keys = range.p_end - sizeof(num_keys_t);
+ assert(range.p_start < _p_num_keys);
+ p_num_keys = reinterpret_cast<const num_keys_packed_t*>(_p_num_keys);
+ assert(keys());
+ auto _p_offsets = _p_num_keys - sizeof(node_offset_t);
+ assert(range.p_start < _p_offsets);
+ p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets);
+ p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1));
+ assert(range.p_start < p_items_end);
+ assert(range.p_start == p_start());
+ }
+
+ bool operator==(const leaf_sub_items_t& x) {
+ return (p_num_keys == x.p_num_keys &&
+ p_offsets == x.p_offsets &&
+ p_items_end == x.p_items_end);
+ }
+
+ const char* p_start() const { return get_item_end(keys()); }
+
+ const node_offset_packed_t& get_offset(index_t index) const {
+ assert(index < keys());
+ return *(p_offsets - index);
+ }
+
+ const node_offset_t get_offset_to_end(index_t index) const {
+ assert(index <= keys());
+ return index == 0 ? 0 : get_offset(index - 1).value;
+ }
+
+ const char* get_item_start(index_t index) const {
+ return p_items_end - get_offset(index).value;
+ }
+
+ const char* get_item_end(index_t index) const {
+ return p_items_end - get_offset_to_end(index);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return p_num_keys->value; }
+ key_get_type operator[](index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_end(index);
+ assert(get_item_start(index) < pointer);
+ pointer -= sizeof(snap_gen_t);
+ assert(get_item_start(index) < pointer);
+ return *reinterpret_cast<const snap_gen_t*>(pointer);
+ }
+ node_offset_t size_before(index_t index) const {
+ assert(index <= keys());
+ size_t ret;
+ if (index == 0) {
+ ret = sizeof(num_keys_t);
+ } else {
+ --index;
+ ret = sizeof(num_keys_t) +
+ (index + 1) * sizeof(node_offset_t) +
+ get_offset(index).value;
+ }
+ assert(ret < node_size);
+ return ret;
+ }
+ node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); }
+ const value_header_t* get_p_value(index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_start(index);
+ auto value = reinterpret_cast<const value_header_t*>(pointer);
+ assert(pointer + value->allocation_size() + sizeof(snap_gen_t) ==
+ get_item_end(index));
+ return value;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_num_keys) +
+ sizeof(num_keys_t);
+ int start_offset = p_start() - p_node_start;
+ int stage_size = p_end - p_start();
+ assert(start_offset > 0);
+ assert(stage_size > 0);
+ assert(start_offset + stage_size < (int)node_size);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(stage_size), encoded);
+ }
+
+ static leaf_sub_items_t decode(
+ const char* p_node_start,
+ extent_len_t node_size,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t stage_size;
+ ceph::decode(stage_size, delta);
+ assert(start_offset > 0);
+ assert(stage_size > 0);
+ assert((unsigned)start_offset + stage_size < node_size);
+ return leaf_sub_items_t({{p_node_start + start_offset,
+ p_node_start + start_offset + stage_size},
+ node_size});
+ }
+
+ static node_offset_t header_size() { return sizeof(num_keys_t); }
+
+ template <IsFullKey Key>
+ static node_offset_t estimate_insert(
+ const Key&, const value_config_t& value) {
+ return value.allocation_size() + sizeof(snap_gen_t) + sizeof(node_offset_t);
+ }
+
+ template <IsFullKey Key>
+ static const value_header_t* insert_at(
+ NodeExtentMutable&, const leaf_sub_items_t&,
+ const Key&, const value_config_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index);
+
+ static node_offset_t erase_at(
+ NodeExtentMutable&, const leaf_sub_items_t&, index_t, const char*);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ extent_len_t node_size;
+ const num_keys_packed_t* p_num_keys;
+ const node_offset_packed_t* p_offsets;
+ const char* p_items_end;
+};
+
+constexpr index_t APPENDER_LIMIT = 3u;
+
+template <KeyT KT>
+class leaf_sub_items_t::Appender {
+ struct range_items_t {
+ index_t from;
+ index_t items;
+ };
+ struct kv_item_t {
+ const full_key_t<KT>* p_key;
+ value_config_t value_config;
+ };
+ using var_t = std::variant<range_items_t, kv_item_t>;
+
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {
+ }
+ Appender(NodeExtentMutable* p_mut, const leaf_sub_items_t& sub_items)
+ : p_mut{p_mut} , op_dst(sub_items) {
+ assert(sub_items.keys());
+ }
+
+ void append(const leaf_sub_items_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>& key,
+ const value_config_t& value, const value_header_t*& p_value) {
+ // append from empty
+ assert(p_append);
+ assert(pp_value == nullptr);
+ assert(cnt <= APPENDER_LIMIT);
+ appends[cnt] = kv_item_t{&key, value};
+ ++cnt;
+ pp_value = &p_value;
+ }
+ char* wrap();
+
+ private:
+ NodeExtentMutable* p_mut;
+ // append from empty
+ std::optional<leaf_sub_items_t> op_src;
+ const value_header_t** pp_value = nullptr;
+ char* p_append = nullptr;
+ var_t appends[APPENDER_LIMIT];
+ index_t cnt = 0;
+ // append from existing
+ std::optional<leaf_sub_items_t> op_dst;
+ char* p_appended = nullptr;
+};
+
+template <node_type_t> struct _sub_items_t;
+template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; };
+template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; };
+template <node_type_t NODE_TYPE>
+using sub_items_t = typename _sub_items_t<NODE_TYPE>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
new file mode 100644
index 000000000..09f20db3a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "super.h"
+#include "node.h"
+
+namespace crimson::os::seastore::onode {
+
+Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const
+{
+ auto iter = tracked_supers.find(&t);
+ if (iter == tracked_supers.end()) {
+ return nullptr;
+ } else {
+ return iter->second->get_p_root();
+ }
+}
+
+Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const
+{
+ if (is_clean()) {
+ return nullptr;
+ } else {
+ return tracked_super->get_p_root();
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
new file mode 100644
index 000000000..5eefee9ff
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Super;
+
+/**
+ * RootNodeTracker
+ *
+ * An abstracted tracker to get the root node by Transaction.
+ */
+class RootNodeTracker {
+ public:
+ virtual ~RootNodeTracker() = default;
+ virtual bool is_clean() const = 0;
+ virtual Ref<Node> get_root(Transaction&) const = 0;
+ static RootNodeTrackerURef create(bool read_isolated);
+ protected:
+ RootNodeTracker() = default;
+ RootNodeTracker(const RootNodeTracker&) = delete;
+ RootNodeTracker(RootNodeTracker&&) = delete;
+ RootNodeTracker& operator=(const RootNodeTracker&) = delete;
+ RootNodeTracker& operator=(RootNodeTracker&&) = delete;
+ virtual void do_track_super(Transaction&, Super&) = 0;
+ virtual void do_untrack_super(Transaction&, Super&) = 0;
+ friend class Super;
+};
+
+/**
+ * Super
+ *
+ * The parent of root node. It contains the relationship between a Transaction
+ * and a root node address.
+ */
+class Super {
+ public:
+ using URef = std::unique_ptr<Super>;
+ Super(const Super&) = delete;
+ Super(Super&&) = delete;
+ Super& operator=(const Super&) = delete;
+ Super& operator=(Super&&) = delete;
+ virtual ~Super() {
+ assert(tracked_root_node == nullptr);
+ tracker.do_untrack_super(t, *this);
+ }
+
+ virtual laddr_t get_root_laddr() const = 0;
+ virtual void write_root_laddr(context_t, laddr_t) = 0;
+
+ void do_track_root(Node& root) {
+ assert(tracked_root_node == nullptr);
+ tracked_root_node = &root;
+ }
+ void do_untrack_root(Node& root) {
+ assert(tracked_root_node == &root);
+ tracked_root_node = nullptr;
+ }
+ Node* get_p_root() const {
+ assert(tracked_root_node != nullptr);
+ return tracked_root_node;
+ }
+
+ protected:
+ Super(Transaction& t, RootNodeTracker& tracker)
+ : t{t}, tracker{tracker} {
+ tracker.do_track_super(t, *this);
+ }
+
+ private:
+ Transaction& t;
+ RootNodeTracker& tracker;
+ Node* tracked_root_node = nullptr;
+};
+
+/**
+ * RootNodeTrackerIsolated
+ *
+ * A concrete RootNodeTracker implementation which provides root node isolation
+ * between Transactions for Seastore backend.
+ */
+class RootNodeTrackerIsolated final : public RootNodeTracker {
+ public:
+ ~RootNodeTrackerIsolated() override { assert(is_clean()); }
+ protected:
+ bool is_clean() const override {
+ return tracked_supers.empty();
+ }
+ void do_track_super(Transaction& t, Super& super) override {
+ assert(tracked_supers.find(&t) == tracked_supers.end());
+ tracked_supers[&t] = &super;
+ }
+ void do_untrack_super(Transaction& t, Super& super) override {
+ [[maybe_unused]] auto removed = tracked_supers.erase(&t);
+ assert(removed);
+ }
+ ::Ref<Node> get_root(Transaction& t) const override;
+ std::map<Transaction*, Super*> tracked_supers;
+};
+
+/**
+ * RootNodeTrackerShared
+ *
+ * A concrete RootNodeTracker implementation which has no isolation between
+ * Transactions for the Dummy backend.
+ */
+class RootNodeTrackerShared final : public RootNodeTracker {
+ public:
+ ~RootNodeTrackerShared() override { assert(is_clean()); }
+ protected:
+ bool is_clean() const override {
+ return tracked_super == nullptr;
+ }
+ void do_track_super(Transaction&, Super& super) override {
+ assert(is_clean());
+ tracked_super = &super;
+ }
+ void do_untrack_super(Transaction&, Super& super) override {
+ assert(tracked_super == &super);
+ tracked_super = nullptr;
+ }
+ ::Ref<Node> get_root(Transaction&) const override;
+ Super* tracked_super = nullptr;
+};
+
+inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) {
+ if (read_isolated) {
+ return RootNodeTrackerURef(new RootNodeTrackerIsolated());
+ } else {
+ return RootNodeTrackerURef(new RootNodeTrackerShared());
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
new file mode 100644
index 000000000..7385e080c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
@@ -0,0 +1,387 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/seastore/logging.h"
+
+#include "fwd.h"
+#include "node.h"
+#include "node_extent_manager.h"
+#include "stages/key_layout.h"
+#include "super.h"
+#include "value.h"
+
+/**
+ * tree.h
+ *
+ * A special-purpose, B-tree-based implementation that:
+ * - Fulfills requirements of OnodeManager to index ordered onode key-values;
+ * - Runs above seastore block and transaction layer;
+ * - Specially optimized for onode key structures and seastore
+ * delta/transaction semantics;
+ *
+ * Note: Cursor/Value are transactional; they cannot be used outside the scope
+ * of the corresponding transaction, or the behavior is undefined.
+ */
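+
+/*
+ * A rough usage sketch (illustrative only): the concrete ValueImpl type, an
+ * open Transaction t and a ghobject_t obj are assumed to be provided by the
+ * caller, and error handling is omitted:
+ *
+ *   Btree<ValueImpl> btree{std::move(nm)};
+ *   return btree.insert(t, obj, {256}
+ *   ).si_then([](auto ret) {
+ *     auto [cursor, inserted] = ret;
+ *     auto value = cursor.value();  // view of the newly inserted payload
+ *   });
+ */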
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class tree_cursor_t;
+
+template <typename ValueImpl>
+class Btree {
+ public:
+ Btree(NodeExtentManagerURef&& _nm)
+ : nm{std::move(_nm)},
+ root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {}
+ ~Btree() { assert(root_tracker->is_clean()); }
+
+ Btree(const Btree&) = delete;
+ Btree(Btree&&) = delete;
+ Btree& operator=(const Btree&) = delete;
+ Btree& operator=(Btree&&) = delete;
+
+ eagain_ifuture<> mkfs(Transaction& t) {
+ return Node::mkfs(get_context(t), *root_tracker);
+ }
+
+ class Cursor {
+ public:
+ Cursor(const Cursor&) = default;
+ Cursor(Cursor&&) noexcept = default;
+ Cursor& operator=(const Cursor&) = default;
+ Cursor& operator=(Cursor&&) = default;
+ ~Cursor() = default;
+
+ bool is_end() const {
+ if (p_cursor->is_tracked()) {
+ return false;
+ } else if (p_cursor->is_invalid()) {
+ return true;
+ } else {
+ // we don't actually store an end cursor because it would hold a reference
+ // to an end leaf node and would not be kept updated.
+ assert(p_cursor->is_end());
+ ceph_abort("impossible");
+ }
+ }
+
+ /// Invalidate the Cursor before submitting transaction.
+ void invalidate() {
+ p_cursor.reset();
+ }
+
+ // XXX: return key_view_t to avoid unnecessary ghobject_t constructions
+ ghobject_t get_ghobj() const {
+ assert(!is_end());
+ auto view = p_cursor->get_key_view(
+ p_tree->value_builder.get_header_magic());
+ assert(view.nspace().size() <=
+ p_tree->value_builder.get_max_ns_size());
+ assert(view.oid().size() <=
+ p_tree->value_builder.get_max_oid_size());
+ return view.to_ghobj();
+ }
+
+ ValueImpl value() {
+ assert(!is_end());
+ return p_tree->value_builder.build_value(
+ *p_tree->nm, p_tree->value_builder, p_cursor);
+ }
+
+ bool operator==(const Cursor& o) const { return operator<=>(o) == 0; }
+
+ eagain_ifuture<Cursor> get_next(Transaction& t) {
+ assert(!is_end());
+ auto this_obj = *this;
+ return p_cursor->get_next(p_tree->get_context(t)
+ ).si_then([this_obj] (Ref<tree_cursor_t> next_cursor) {
+ next_cursor->assert_next_to(
+ *this_obj.p_cursor, this_obj.p_tree->value_builder.get_header_magic());
+ auto ret = Cursor{this_obj.p_tree, next_cursor};
+ assert(this_obj < ret);
+ return ret;
+ });
+ }
+
+ template <bool FORCE_MERGE = false>
+ eagain_ifuture<Cursor> erase(Transaction& t) {
+ assert(!is_end());
+ auto this_obj = *this;
+ return p_cursor->erase<FORCE_MERGE>(p_tree->get_context(t), true
+ ).si_then([this_obj, this] (Ref<tree_cursor_t> next_cursor) {
+ assert(p_cursor->is_invalid());
+ if (next_cursor) {
+ assert(!next_cursor->is_end());
+ return Cursor{p_tree, next_cursor};
+ } else {
+ return Cursor{p_tree};
+ }
+ });
+ }
+
+ private:
+ Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor) : p_tree(p_tree) {
+ if (_p_cursor->is_invalid()) {
+ // we don't create Cursor from an invalid tree_cursor_t.
+ ceph_abort("impossible");
+ } else if (_p_cursor->is_end()) {
+ // we don't actually store an end cursor because it would hold a reference
+ // to an end leaf node and would not be kept updated.
+ } else {
+ assert(_p_cursor->is_tracked());
+ p_cursor = _p_cursor;
+ }
+ }
+ Cursor(Btree* p_tree) : p_tree{p_tree} {}
+
+ std::strong_ordering operator<=>(const Cursor& o) const {
+ assert(p_tree == o.p_tree);
+ return p_cursor->compare_to(
+ *o.p_cursor, p_tree->value_builder.get_header_magic());
+ }
+
+ static Cursor make_end(Btree* p_tree) {
+ return {p_tree};
+ }
+
+ Btree* p_tree;
+ Ref<tree_cursor_t> p_cursor = tree_cursor_t::get_invalid();
+
+ friend class Btree;
+ };
+
+ /*
+ * lookup
+ */
+
+ eagain_ifuture<Cursor> begin(Transaction& t) {
+ return get_root(t).si_then([this, &t](auto root) {
+ return root->lookup_smallest(get_context(t));
+ }).si_then([this](auto cursor) {
+ return Cursor{this, cursor};
+ });
+ }
+
+ eagain_ifuture<Cursor> last(Transaction& t) {
+ return get_root(t).si_then([this, &t](auto root) {
+ return root->lookup_largest(get_context(t));
+ }).si_then([this](auto cursor) {
+ return Cursor(this, cursor);
+ });
+ }
+
+ Cursor end() {
+ return Cursor::make_end(this);
+ }
+
+ eagain_ifuture<bool> contains(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ key_hobj_t{obj},
+ [this, &t](auto& key) -> eagain_ifuture<bool> {
+ return get_root(t).si_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).si_then([](auto result) {
+ return MatchKindBS::EQ == result.match();
+ });
+ }
+ );
+ }
+
+ eagain_ifuture<Cursor> find(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ key_hobj_t{obj},
+ [this, &t](auto& key) -> eagain_ifuture<Cursor> {
+ return get_root(t).si_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).si_then([this](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ return Cursor(this, result.p_cursor);
+ } else {
+ return Cursor::make_end(this);
+ }
+ });
+ }
+ );
+ }
+
+ /**
+ * lower_bound
+ *
+ * Returns a Cursor pointing to the element that is equal to the key, or the
+ * first element larger than the key, or the end Cursor if that element
+ * doesn't exist.
+ */
+ eagain_ifuture<Cursor> lower_bound(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ key_hobj_t{obj},
+ [this, &t](auto& key) -> eagain_ifuture<Cursor> {
+ return get_root(t).si_then([this, &t, &key](auto root) {
+ return root->lower_bound(get_context(t), key);
+ }).si_then([this](auto result) {
+ return Cursor(this, result.p_cursor);
+ });
+ }
+ );
+ }
+
+ eagain_ifuture<Cursor> get_next(Transaction& t, Cursor& cursor) {
+ return cursor.get_next(t);
+ }
+
+ /*
+ * modifiers
+ */
+
+ struct tree_value_config_t {
+ value_size_t payload_size = 256;
+ };
+ using insert_iertr = eagain_iertr::extend<
+ crimson::ct_error::value_too_large>;
+ insert_iertr::future<std::pair<Cursor, bool>>
+ insert(Transaction& t, const ghobject_t& obj, tree_value_config_t _vconf) {
+ LOG_PREFIX(OTree::insert);
+ if (_vconf.payload_size > value_builder.get_max_value_payload_size()) {
+ SUBERRORT(seastore_onode, "value payload size {} too large to insert {}",
+ t, _vconf.payload_size, key_hobj_t{obj});
+ return crimson::ct_error::value_too_large::make();
+ }
+ if (obj.hobj.nspace.size() > value_builder.get_max_ns_size()) {
+ SUBERRORT(seastore_onode, "namespace size {} too large to insert {}",
+ t, obj.hobj.nspace.size(), key_hobj_t{obj});
+ return crimson::ct_error::value_too_large::make();
+ }
+ if (obj.hobj.oid.name.size() > value_builder.get_max_oid_size()) {
+ SUBERRORT(seastore_onode, "oid size {} too large to insert {}",
+ t, obj.hobj.oid.name.size(), key_hobj_t{obj});
+ return crimson::ct_error::value_too_large::make();
+ }
+ value_config_t vconf{value_builder.get_header_magic(), _vconf.payload_size};
+ return seastar::do_with(
+ key_hobj_t{obj},
+ [this, &t, vconf](auto& key) -> eagain_ifuture<std::pair<Cursor, bool>> {
+ ceph_assert(key.is_valid());
+ return get_root(t).si_then([this, &t, &key, vconf](auto root) {
+ return root->insert(get_context(t), key, vconf, std::move(root));
+ }).si_then([this](auto ret) {
+ auto& [cursor, success] = ret;
+ return std::make_pair(Cursor(this, cursor), success);
+ });
+ }
+ );
+ }
+
+ eagain_ifuture<std::size_t> erase(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ key_hobj_t{obj},
+ [this, &t](auto& key) -> eagain_ifuture<std::size_t> {
+ return get_root(t).si_then([this, &t, &key](auto root) {
+ return root->erase(get_context(t), key, std::move(root));
+ });
+ }
+ );
+ }
+
+ eagain_ifuture<Cursor> erase(Transaction& t, Cursor& pos) {
+ return pos.erase(t);
+ }
+
+ eagain_ifuture<> erase(Transaction& t, Value& value) {
+ assert(value.is_tracked());
+ auto ref_cursor = value.p_cursor;
+ return ref_cursor->erase(get_context(t), false
+ ).si_then([ref_cursor] (auto next_cursor) {
+ assert(ref_cursor->is_invalid());
+ assert(!next_cursor);
+ });
+ }
+
+ /*
+ * stats
+ */
+
+ eagain_ifuture<size_t> height(Transaction& t) {
+ return get_root(t).si_then([](auto root) {
+ return size_t(root->level() + 1);
+ });
+ }
+
+ eagain_ifuture<tree_stats_t> get_stats_slow(Transaction& t) {
+ return get_root(t).si_then([this, &t](auto root) {
+ unsigned height = root->level() + 1;
+ return root->get_tree_stats(get_context(t)
+ ).si_then([height](auto stats) {
+ stats.height = height;
+ return seastar::make_ready_future<tree_stats_t>(stats);
+ });
+ });
+ }
+
+ std::ostream& dump(Transaction& t, std::ostream& os) {
+ auto root = root_tracker->get_root(t);
+ if (root) {
+ root->dump(os);
+ } else {
+ os << "empty tree!";
+ }
+ return os;
+ }
+
+ std::ostream& print(std::ostream& os) const {
+ return os << "BTree-" << *nm;
+ }
+
+ /*
+ * test_only
+ */
+
+ bool test_is_clean() const {
+ return root_tracker->is_clean();
+ }
+
+ eagain_ifuture<> test_clone_from(
+ Transaction& t, Transaction& t_from, Btree& from) {
+ // Note: assume the tree to clone is tracked correctly in memory.
+ // In some unit tests, parts of the tree are stubbed out, so they
+ // should not be loaded from NodeExtentManager.
+ return from.get_root(t_from
+ ).si_then([this, &t](auto root_from) {
+ return root_from->test_clone_root(get_context(t), *root_tracker);
+ });
+ }
+
+ private:
+ context_t get_context(Transaction& t) {
+ return {*nm, value_builder, t};
+ }
+
+ eagain_ifuture<Ref<Node>> get_root(Transaction& t) {
+ auto root = root_tracker->get_root(t);
+ if (root) {
+ return seastar::make_ready_future<Ref<Node>>(root);
+ } else {
+ return Node::load_root(get_context(t), *root_tracker);
+ }
+ }
+
+ NodeExtentManagerURef nm;
+ const ValueBuilderImpl<ValueImpl> value_builder;
+ RootNodeTrackerURef root_tracker;
+
+ friend class DummyChildPool;
+};
+
+template <typename ValueImpl>
+inline std::ostream& operator<<(std::ostream& os, const Btree<ValueImpl>& tree) {
+ return tree.print(os);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
new file mode 100644
index 000000000..3ad3564a6
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
@@ -0,0 +1,565 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <random>
+#include <string>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include <seastar/core/thread.hh>
+
+#include "crimson/common/log.h"
+#include "stages/key_layout.h"
+#include "tree.h"
+
+/**
+ * tree_utils.h
+ *
+ * Contains shared logic for unit tests and the perf tool.
+ */
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * templates to work with tree utility classes:
+ *
+ * struct ValueItem {
+ * <public members>
+ *
+ * value_size_t get_payload_size() const;
+ * static ValueItem create(std::size_t expected_size, std::size_t id);
+ * };
+ * std::ostream& operator<<(std::ostream& os, const ValueItem& item);
+ *
+ * class ValueImpl final : public Value {
+ * ...
+ *
+ * using item_t = ValueItem;
+ * void initialize(Transaction& t, const item_t& item);
+ * void validate(const item_t& item);
+ * };
+ *
+ */
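+
+/*
+ * For example, an illustrative test/perf driver could be wired up as follows
+ * (concrete ValueItem/ValueImpl types and an open Transaction t are assumed):
+ *
+ *   auto kvs = KVPool<ValueItem>::create_range(
+ *       {0, 1000},       // key index range
+ *       {64, 128, 256},  // candidate value sizes
+ *       4096);           // block size
+ *   TreeBuilder<true, ValueImpl> builder{kvs, std::move(nm)};
+ *   return builder.bootstrap(t
+ *   ).si_then([&builder, &t] { return builder.insert(t); });
+ */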
+
+template <typename CursorType>
+void initialize_cursor_from_item(
+ Transaction& t,
+ const ghobject_t& key,
+ const typename decltype(std::declval<CursorType>().value())::item_t& item,
+ CursorType& cursor,
+ bool insert_success) {
+ ceph_assert(insert_success);
+ ceph_assert(!cursor.is_end());
+ ceph_assert(cursor.get_ghobj() == key);
+ auto tree_value = cursor.value();
+ tree_value.initialize(t, item);
+}
+
+
+template <typename CursorType>
+void validate_cursor_from_item(
+ const ghobject_t& key,
+ const typename decltype(std::declval<CursorType>().value())::item_t& item,
+ CursorType& cursor) {
+ ceph_assert(!cursor.is_end());
+ ceph_assert(cursor.get_ghobj() == key);
+ auto tree_value = cursor.value();
+ tree_value.validate(item);
+}
+
+template <typename ValueItem>
+class Values {
+ public:
+ Values(size_t n) {
+ for (size_t i = 1; i <= n; ++i) {
+ auto item = create(i * 8);
+ values.push_back(item);
+ }
+ }
+
+ Values(std::vector<size_t> sizes) {
+ for (auto& size : sizes) {
+ auto item = create(size);
+ values.push_back(item);
+ }
+ }
+
+ ~Values() = default;
+
+ ValueItem create(size_t size) {
+ return ValueItem::create(size, id++);
+ }
+
+ ValueItem pick() const {
+ auto index = rd() % values.size();
+ return values[index];
+ }
+
+ private:
+ std::size_t id = 0;
+ mutable std::random_device rd;
+ std::vector<ValueItem> values;
+};
+
+template <typename ValueItem>
+class KVPool {
+ public:
+ struct kv_t {
+ ghobject_t key;
+ ValueItem value;
+ };
+ using kv_vector_t = std::vector<kv_t>;
+ using kvptr_vector_t = std::vector<kv_t*>;
+ using iterator_t = typename kvptr_vector_t::iterator;
+
+ size_t size() const {
+ return kvs.size();
+ }
+
+ iterator_t begin() {
+ return serial_p_kvs.begin();
+ }
+ iterator_t end() {
+ return serial_p_kvs.end();
+ }
+ iterator_t random_begin() {
+ return random_p_kvs.begin();
+ }
+ iterator_t random_end() {
+ return random_p_kvs.end();
+ }
+
+ void shuffle() {
+ std::shuffle(random_p_kvs.begin(), random_p_kvs.end(), std::default_random_engine{});
+ }
+
+ void erase_from_random(iterator_t begin, iterator_t end) {
+ random_p_kvs.erase(begin, end);
+ kv_vector_t new_kvs;
+ for (auto p_kv : random_p_kvs) {
+ new_kvs.emplace_back(*p_kv);
+ }
+ std::sort(new_kvs.begin(), new_kvs.end(), [](auto& l, auto& r) {
+ return l.key < r.key;
+ });
+
+ kvs.swap(new_kvs);
+ serial_p_kvs.resize(kvs.size());
+ random_p_kvs.resize(kvs.size());
+ init();
+ }
+
+ static KVPool create_raw_range(
+ const std::vector<size_t>& ns_sizes,
+ const std::vector<size_t>& oid_sizes,
+ const std::vector<size_t>& value_sizes,
+ const std::pair<index_t, index_t>& range2,
+ const std::pair<index_t, index_t>& range1,
+ const std::pair<index_t, index_t>& range0) {
+ ceph_assert(range2.first < range2.second);
+ ceph_assert(range2.second - 1 <= MAX_SHARD);
+ ceph_assert(range2.second - 1 <= MAX_CRUSH);
+ ceph_assert(range1.first < range1.second);
+ ceph_assert(range1.second - 1 <= 9);
+ ceph_assert(range0.first < range0.second);
+
+ kv_vector_t kvs;
+ std::random_device rd;
+ Values<ValueItem> values{value_sizes};
+ for (index_t i = range2.first; i < range2.second; ++i) {
+ for (index_t j = range1.first; j < range1.second; ++j) {
+ size_t ns_size;
+ size_t oid_size;
+ if (j == 0) {
+ // store ns0, oid0 as empty strings for test purposes
+ ns_size = 0;
+ oid_size = 0;
+ } else {
+ ns_size = ns_sizes[rd() % ns_sizes.size()];
+ oid_size = oid_sizes[rd() % oid_sizes.size()];
+ assert(ns_size && oid_size);
+ }
+ for (index_t k = range0.first; k < range0.second; ++k) {
+ kvs.emplace_back(
+ kv_t{make_raw_oid(i, j, k, ns_size, oid_size), values.pick()}
+ );
+ }
+ }
+ }
+ return KVPool(std::move(kvs));
+ }
+
+ static KVPool create_range(
+ const std::pair<index_t, index_t>& range_i,
+ const std::vector<size_t>& value_sizes,
+ const uint64_t block_size) {
+ kv_vector_t kvs;
+ std::random_device rd;
+ for (index_t i = range_i.first; i < range_i.second; ++i) {
+ auto value_size = value_sizes[rd() % value_sizes.size()];
+ kvs.emplace_back(
+ kv_t{make_oid(i), ValueItem::create(value_size, i, block_size)}
+ );
+ }
+ return KVPool(std::move(kvs));
+ }
+
+ private:
+ KVPool(kv_vector_t&& _kvs)
+ : kvs(std::move(_kvs)), serial_p_kvs(kvs.size()), random_p_kvs(kvs.size()) {
+ init();
+ }
+
+ void init() {
+ std::transform(kvs.begin(), kvs.end(), serial_p_kvs.begin(),
+ [] (kv_t& item) { return &item; });
+ std::transform(kvs.begin(), kvs.end(), random_p_kvs.begin(),
+ [] (kv_t& item) { return &item; });
+ shuffle();
+ }
+
+ static ghobject_t make_raw_oid(
+ index_t index2, index_t index1, index_t index0,
+ size_t ns_size, size_t oid_size) {
+ assert(index1 < 10);
+ std::ostringstream os_ns;
+ std::ostringstream os_oid;
+ if (index1 == 0) {
+ assert(!ns_size);
+ assert(!oid_size);
+ } else {
+ os_ns << "ns" << index1;
+ auto current_size = (size_t)os_ns.tellp();
+ assert(ns_size >= current_size);
+ os_ns << std::string(ns_size - current_size, '_');
+
+ os_oid << "oid" << index1;
+ current_size = (size_t)os_oid.tellp();
+ assert(oid_size >= current_size);
+ os_oid << std::string(oid_size - current_size, '_');
+ }
+
+ return ghobject_t(shard_id_t(index2), index2, index2,
+ os_ns.str(), os_oid.str(), index0, index0);
+ }
+
+ static ghobject_t make_oid(index_t i) {
+ std::stringstream ss;
+ ss << "object_" << i;
+ auto ret = ghobject_t(
+ hobject_t(
+ sobject_t(ss.str(), CEPH_NOSNAP)));
+ ret.set_shard(shard_id_t(0));
+ ret.hobj.nspace = "asdf";
+ return ret;
+ }
+
+ kv_vector_t kvs;
+ kvptr_vector_t serial_p_kvs;
+ kvptr_vector_t random_p_kvs;
+};
+
+template <bool TRACK, typename ValueImpl>
+class TreeBuilder {
+ public:
+ using BtreeImpl = Btree<ValueImpl>;
+ using BtreeCursor = typename BtreeImpl::Cursor;
+ using ValueItem = typename ValueImpl::item_t;
+ using iterator_t = typename KVPool<ValueItem>::iterator_t;
+
+ TreeBuilder(KVPool<ValueItem>& kvs, NodeExtentManagerURef&& nm)
+ : kvs{kvs} {
+ tree.emplace(std::move(nm));
+ }
+
+ eagain_ifuture<> bootstrap(Transaction& t) {
+ std::ostringstream oss;
+#ifndef NDEBUG
+ oss << "debug=on, ";
+#else
+ oss << "debug=off, ";
+#endif
+#ifdef UNIT_TESTS_BUILT
+ oss << "UNIT_TEST_BUILT=on, ";
+#else
+ oss << "UNIT_TEST_BUILT=off, ";
+#endif
+ if constexpr (TRACK) {
+ oss << "track=on, ";
+ } else {
+ oss << "track=off, ";
+ }
+ oss << *tree;
+ logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str());
+ return tree->mkfs(t);
+ }
+
+ eagain_ifuture<BtreeCursor> insert_one(
+ Transaction& t, const iterator_t& iter_rd) {
+ auto p_kv = *iter_rd;
+ logger().debug("[{}] insert {} -> {}",
+ iter_rd - kvs.random_begin(),
+ key_hobj_t{p_kv->key},
+ p_kv->value);
+ return tree->insert(
+ t, p_kv->key, {p_kv->value.get_payload_size()}
+ ).si_then([&t, this, p_kv](auto ret) {
+ boost::ignore_unused(this); // avoid clang warning
+ auto success = ret.second;
+ auto cursor = std::move(ret.first);
+ initialize_cursor_from_item(t, p_kv->key, p_kv->value, cursor, success);
+#ifndef NDEBUG
+ validate_cursor_from_item(p_kv->key, p_kv->value, cursor);
+ return tree->find(t, p_kv->key
+ ).si_then([cursor, p_kv](auto cursor_) mutable {
+ assert(!cursor_.is_end());
+ ceph_assert(cursor_.get_ghobj() == p_kv->key);
+ ceph_assert(cursor_.value() == cursor.value());
+ validate_cursor_from_item(p_kv->key, p_kv->value, cursor_);
+ return cursor;
+ });
+#else
+ return eagain_iertr::make_ready_future<BtreeCursor>(cursor);
+#endif
+ }).handle_error_interruptible(
+ [] (const crimson::ct_error::value_too_large& e) {
+ ceph_abort("impossible path");
+ },
+ crimson::ct_error::pass_further_all{}
+ );
+ }
+
+ eagain_ifuture<> insert(Transaction& t) {
+ auto ref_kv_iter = seastar::make_lw_shared<iterator_t>();
+ *ref_kv_iter = kvs.random_begin();
+ auto cursors = seastar::make_lw_shared<std::vector<BtreeCursor>>();
+ logger().warn("start inserting {} kvs ...", kvs.size());
+ auto start_time = mono_clock::now();
+ return trans_intr::repeat([&t, this, cursors, ref_kv_iter,
+ start_time]()
+ -> eagain_ifuture<seastar::stop_iteration> {
+ if (*ref_kv_iter == kvs.random_end()) {
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().warn("Insert done! {}s", duration.count());
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ } else {
+ return insert_one(t, *ref_kv_iter
+ ).si_then([cursors, ref_kv_iter] (auto cursor) {
+ if constexpr (TRACK) {
+ cursors->emplace_back(cursor);
+ }
+ ++(*ref_kv_iter);
+ return seastar::stop_iteration::no;
+ });
+ }
+ }).si_then([&t, this, cursors, ref_kv_iter] {
+ if (!cursors->empty()) {
+ logger().info("Verifying tracked cursors ...");
+ *ref_kv_iter = kvs.random_begin();
+ return seastar::do_with(
+ cursors->begin(),
+ [&t, this, cursors, ref_kv_iter] (auto& c_iter) {
+ return trans_intr::repeat(
+ [&t, this, &c_iter, cursors, ref_kv_iter] ()
+ -> eagain_ifuture<seastar::stop_iteration> {
+ if (*ref_kv_iter == kvs.random_end()) {
+ logger().info("Verify done!");
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ assert(c_iter != cursors->end());
+ auto p_kv = **ref_kv_iter;
+ // validate values in tree keep intact
+ return tree->find(t, p_kv->key).si_then([&c_iter, ref_kv_iter](auto cursor) {
+ auto p_kv = **ref_kv_iter;
+ validate_cursor_from_item(p_kv->key, p_kv->value, cursor);
+ // validate values in cursors keep intact
+ validate_cursor_from_item(p_kv->key, p_kv->value, *c_iter);
+ ++(*ref_kv_iter);
+ ++c_iter;
+ return seastar::stop_iteration::no;
+ });
+ });
+ });
+ } else {
+ return eagain_iertr::now();
+ }
+ });
+ }
+
+ eagain_ifuture<> erase_one(
+ Transaction& t, const iterator_t& iter_rd) {
+ auto p_kv = *iter_rd;
+ logger().debug("[{}] erase {} -> {}",
+ iter_rd - kvs.random_begin(),
+ key_hobj_t{p_kv->key},
+ p_kv->value);
+ return tree->erase(t, p_kv->key
+ ).si_then([&t, this, p_kv] (auto size) {
+ boost::ignore_unused(t); // avoid clang warning
+ boost::ignore_unused(this);
+ boost::ignore_unused(p_kv);
+ ceph_assert(size == 1);
+#ifndef NDEBUG
+ return tree->contains(t, p_kv->key
+ ).si_then([] (bool ret) {
+ ceph_assert(ret == false);
+ });
+#else
+ return eagain_iertr::now();
+#endif
+ });
+ }
+
+ eagain_ifuture<> erase(Transaction& t, std::size_t erase_size) {
+ assert(erase_size <= kvs.size());
+ kvs.shuffle();
+ auto erase_end = kvs.random_begin() + erase_size;
+ auto ref_kv_iter = seastar::make_lw_shared<iterator_t>();
+ auto cursors = seastar::make_lw_shared<std::map<ghobject_t, BtreeCursor>>();
+ return eagain_iertr::now().si_then([&t, this, cursors, ref_kv_iter] {
+ (void)this; // silence clang warning for !TRACK
+ (void)t; // silence clang warning for !TRACK
+ if constexpr (TRACK) {
+ logger().info("Tracking cursors before erase ...");
+ *ref_kv_iter = kvs.begin();
+ auto start_time = mono_clock::now();
+ return trans_intr::repeat(
+ [&t, this, cursors, ref_kv_iter, start_time] ()
+ -> eagain_ifuture<seastar::stop_iteration> {
+ if (*ref_kv_iter == kvs.end()) {
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().info("Track done! {}s", duration.count());
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ auto p_kv = **ref_kv_iter;
+ return tree->find(t, p_kv->key).si_then([cursors, ref_kv_iter](auto cursor) {
+ auto p_kv = **ref_kv_iter;
+ validate_cursor_from_item(p_kv->key, p_kv->value, cursor);
+ cursors->emplace(p_kv->key, cursor);
+ ++(*ref_kv_iter);
+ return seastar::stop_iteration::no;
+ });
+ });
+ } else {
+ return eagain_iertr::now();
+ }
+ }).si_then([&t, this, ref_kv_iter, erase_end] {
+ *ref_kv_iter = kvs.random_begin();
+ logger().warn("start erasing {}/{} kvs ...",
+ erase_end - kvs.random_begin(), kvs.size());
+ auto start_time = mono_clock::now();
+ return trans_intr::repeat([&t, this, ref_kv_iter,
+ start_time, erase_end] ()
+ -> eagain_ifuture<seastar::stop_iteration> {
+ if (*ref_kv_iter == erase_end) {
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().warn("Erase done! {}s", duration.count());
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ } else {
+ return erase_one(t, *ref_kv_iter
+ ).si_then([ref_kv_iter] {
+ ++(*ref_kv_iter);
+ return seastar::stop_iteration::no;
+ });
+ }
+ });
+ }).si_then([this, cursors, ref_kv_iter, erase_end] {
+ if constexpr (TRACK) {
+ logger().info("Verifying tracked cursors ...");
+ *ref_kv_iter = kvs.random_begin();
+ while (*ref_kv_iter != erase_end) {
+ auto p_kv = **ref_kv_iter;
+ auto c_it = cursors->find(p_kv->key);
+ ceph_assert(c_it != cursors->end());
+ ceph_assert(c_it->second.is_end());
+ cursors->erase(c_it);
+ ++(*ref_kv_iter);
+ }
+ }
+ kvs.erase_from_random(kvs.random_begin(), erase_end);
+ if constexpr (TRACK) {
+ *ref_kv_iter = kvs.begin();
+ for (auto& [k, c] : *cursors) {
+ assert(*ref_kv_iter != kvs.end());
+ auto p_kv = **ref_kv_iter;
+ validate_cursor_from_item(p_kv->key, p_kv->value, c);
+ ++(*ref_kv_iter);
+ }
+ logger().info("Verify done!");
+ }
+ });
+ }
+
+ eagain_ifuture<> get_stats(Transaction& t) {
+ return tree->get_stats_slow(t
+ ).si_then([](auto stats) {
+ logger().warn("{}", stats);
+ });
+ }
+
+ eagain_ifuture<std::size_t> height(Transaction& t) {
+ return tree->height(t);
+ }
+
+ void reload(NodeExtentManagerURef&& nm) {
+ tree.emplace(std::move(nm));
+ }
+
+ eagain_ifuture<> validate_one(
+ Transaction& t, const iterator_t& iter_seq) {
+ assert(iter_seq != kvs.end());
+ auto next_iter = iter_seq + 1;
+ auto p_kv = *iter_seq;
+ return tree->find(t, p_kv->key
+ ).si_then([p_kv, &t] (auto cursor) {
+ validate_cursor_from_item(p_kv->key, p_kv->value, cursor);
+ return cursor.get_next(t);
+ }).si_then([next_iter, this] (auto cursor) {
+ if (next_iter == kvs.end()) {
+ ceph_assert(cursor.is_end());
+ } else {
+ auto p_kv = *next_iter;
+ validate_cursor_from_item(p_kv->key, p_kv->value, cursor);
+ }
+ });
+ }
+
+ eagain_ifuture<> validate(Transaction& t) {
+ logger().info("Verifying inserted ...");
+ return seastar::do_with(
+ kvs.begin(),
+ [this, &t] (auto &iter) {
+ return trans_intr::repeat(
+ [this, &t, &iter]() ->eagain_iertr::future<seastar::stop_iteration> {
+ if (iter == kvs.end()) {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ return validate_one(t, iter).si_then([&iter] {
+ ++iter;
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ });
+ }
+
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_test);
+ }
+
+ KVPool<ValueItem>& kvs;
+ std::optional<BtreeImpl> tree;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/value.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/value.cc
new file mode 100644
index 000000000..694480d4e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/value.cc
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "value.h"
+
+#include "node.h"
+#include "node_delta_recorder.h"
+#include "node_layout.h"
+
+// value implementations
+#include "test/crimson/seastore/onode_tree/test_value.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h"
+
+namespace crimson::os::seastore::onode {
+
+ceph::bufferlist&
+ValueDeltaRecorder::get_encoded(NodeExtentMutable& payload_mut)
+{
+ ceph::encode(node_delta_op_t::SUBOP_UPDATE_VALUE, encoded);
+ node_offset_t offset = payload_mut.get_node_offset();
+ assert(offset > sizeof(value_header_t));
+ offset -= sizeof(value_header_t);
+ ceph::encode(offset, encoded);
+ return encoded;
+}
+
+Value::Value(NodeExtentManager& nm,
+ const ValueBuilder& vb,
+ Ref<tree_cursor_t>& p_cursor)
+ : nm{nm}, vb{vb}, p_cursor{p_cursor} {}
+
+Value::~Value() {}
+
+bool Value::is_tracked() const
+{
+ assert(!p_cursor->is_end());
+ return p_cursor->is_tracked();
+}
+
+void Value::invalidate()
+{
+ p_cursor.reset();
+}
+
+eagain_ifuture<> Value::extend(Transaction& t, value_size_t extend_size)
+{
+ assert(is_tracked());
+ [[maybe_unused]] auto target_size = get_payload_size() + extend_size;
+ return p_cursor->extend_value(get_context(t), extend_size)
+#ifndef NDEBUG
+ .si_then([this, target_size] {
+ assert(target_size == get_payload_size());
+ })
+#endif
+ ;
+}
+
+eagain_ifuture<> Value::trim(Transaction& t, value_size_t trim_size)
+{
+ assert(is_tracked());
+ assert(get_payload_size() > trim_size);
+ [[maybe_unused]] auto target_size = get_payload_size() - trim_size;
+ return p_cursor->trim_value(get_context(t), trim_size)
+#ifndef NDEBUG
+ .si_then([this, target_size] {
+ assert(target_size == get_payload_size());
+ })
+#endif
+ ;
+}
+
+const value_header_t* Value::read_value_header() const
+{
+ auto ret = p_cursor->read_value_header(vb.get_header_magic());
+ assert(ret->payload_size <= vb.get_max_value_payload_size());
+ return ret;
+}
+
+std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+Value::do_prepare_mutate_payload(Transaction& t)
+{
+ return p_cursor->prepare_mutate_value_payload(get_context(t));
+}
+
+laddr_t Value::get_hint() const
+{
+ return p_cursor->get_key_view(vb.get_header_magic()).get_hint();
+}
+
+std::unique_ptr<ValueDeltaRecorder>
+build_value_recorder_by_type(ceph::bufferlist& encoded,
+ const value_magic_t& magic)
+{
+ std::unique_ptr<ValueDeltaRecorder> ret;
+ switch (magic) {
+ case value_magic_t::ONODE:
+ ret = std::make_unique<FLTreeOnode::Recorder>(encoded);
+ break;
+ case value_magic_t::TEST_UNBOUND:
+ ret = std::make_unique<UnboundedValue::Recorder>(encoded);
+ break;
+ case value_magic_t::TEST_BOUNDED:
+ ret = std::make_unique<BoundedValue::Recorder>(encoded);
+ break;
+ case value_magic_t::TEST_EXTENDED:
+ ret = std::make_unique<ExtendedValue::Recorder>(encoded);
+ break;
+ default:
+ ret = nullptr;
+ break;
+ }
+ assert(!ret || ret->get_header_magic() == magic);
+ return ret;
+}
+
+void validate_tree_config(const tree_conf_t& conf)
+{
+ ceph_assert(conf.max_ns_size <
+ string_key_view_t::VALID_UPPER_BOUND);
+ ceph_assert(conf.max_oid_size <
+ string_key_view_t::VALID_UPPER_BOUND);
+ ceph_assert(is_valid_node_size(conf.internal_node_size));
+ ceph_assert(is_valid_node_size(conf.leaf_node_size));
+
+ if (conf.do_split_check) {
+ // In hopes of complying with 3 * (oid + ns) + 2 * value < node
+ //
+ // see node_layout.h for NODE_BLOCK_SIZE considerations
+ //
+ // The below calculations also consider the internal indexing overhead in
+ // order to be accurate, so the equation has become:
+ // node-header-size + 2 * max-full-insert-size +
+ // max-ns/oid-split-overhead <= node-size
+
+ auto obj = ghobject_t{shard_id_t{0}, 0, 0, "", "", 0, 0};
+ key_hobj_t key(obj);
+ auto max_str_size = conf.max_ns_size + conf.max_oid_size;
+#define _STAGE_T(NodeType) node_to_stage_t<typename NodeType::node_stage_t>
+#define NXT_T(StageType) staged<typename StageType::next_param_t>
+
+ laddr_t i_value{0};
+ auto insert_size_2 =
+ _STAGE_T(InternalNode0)::insert_size(key, i_value);
+ auto insert_size_0 =
+ NXT_T(NXT_T(_STAGE_T(InternalNode0)))::insert_size(key, i_value);
+ unsigned internal_size_bound = sizeof(node_header_t) +
+ (insert_size_2 + max_str_size) * 2 +
+ (insert_size_2 - insert_size_0 + max_str_size);
+ ceph_assert(internal_size_bound <= conf.internal_node_size);
+
+ value_config_t l_value;
+ l_value.payload_size = conf.max_value_payload_size;
+ insert_size_2 =
+ _STAGE_T(LeafNode0)::insert_size(key, l_value);
+ insert_size_0 =
+ NXT_T(NXT_T(_STAGE_T(LeafNode0)))::insert_size(key, l_value);
+ unsigned leaf_size_bound = sizeof(node_header_t) +
+ (insert_size_2 + max_str_size) * 2 +
+ (insert_size_2 - insert_size_0 + max_str_size);
+ ceph_assert(leaf_size_bound <= conf.leaf_node_size);
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/value.h b/src/crimson/os/seastore/onode_manager/staged-fltree/value.h
new file mode 100644
index 000000000..d9f0c231a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/value.h
@@ -0,0 +1,337 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "include/buffer.h"
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+#include "node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+// value size up to 64 KiB
+using value_size_t = uint16_t;
+enum class value_magic_t : uint8_t {
+ ONODE = 0x52,
+ TEST_UNBOUND,
+ TEST_BOUNDED,
+ TEST_EXTENDED,
+};
+inline std::ostream& operator<<(std::ostream& os, const value_magic_t& magic) {
+ switch (magic) {
+ case value_magic_t::ONODE:
+ return os << "ONODE";
+ case value_magic_t::TEST_UNBOUND:
+ return os << "TEST_UNBOUND";
+ case value_magic_t::TEST_BOUNDED:
+ return os << "TEST_BOUNDED";
+ case value_magic_t::TEST_EXTENDED:
+ return os << "TEST_EXTENDED";
+ default:
+ return os << "UNKNOWN(" << static_cast<unsigned>(magic) << ")";
+ }
+}
+
+/**
+ * value_config_t
+ *
+ * Parameters to create a value.
+ */
+struct value_config_t {
+ value_magic_t magic;
+ value_size_t payload_size;
+
+ value_size_t allocation_size() const;
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(magic, encoded);
+ ceph::encode(payload_size, encoded);
+ }
+
+ static value_config_t decode(ceph::bufferlist::const_iterator& delta) {
+ value_magic_t magic;
+ ceph::decode(magic, delta);
+ value_size_t payload_size;
+ ceph::decode(payload_size, delta);
+ return {magic, payload_size};
+ }
+};
+inline std::ostream& operator<<(std::ostream& os, const value_config_t& conf) {
+ return os << "ValueConf(" << conf.magic
+ << ", " << conf.payload_size << "B)";
+}
+
+/**
+ * value_header_t
+ *
+ * The header structure in value layout.
+ *
+ * Value layout:
+ *
+ * # <- alloc size -> #
+ * # header | payload #
+ */
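+//
+// For instance, value_header_t packs to 3 bytes (1-byte magic + 2-byte
+// payload_size), so a value created with payload_size == 256 reports
+// allocation_size() == 259, and get_payload() points right after the header.
+//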
+struct value_header_t {
+ value_magic_t magic;
+ value_size_t payload_size;
+
+ bool operator==(const value_header_t& rhs) const {
+ return (magic == rhs.magic && payload_size == rhs.payload_size);
+ }
+ bool operator!=(const value_header_t& rhs) const {
+ return !(*this == rhs);
+ }
+
+ value_size_t allocation_size() const {
+ return payload_size + sizeof(value_header_t);
+ }
+
+ const char* get_payload() const {
+ return reinterpret_cast<const char*>(this) + sizeof(value_header_t);
+ }
+
+ NodeExtentMutable get_payload_mutable(NodeExtentMutable& node) const {
+ return node.get_mutable_absolute(get_payload(), payload_size);
+ }
+
+ char* get_payload() {
+ return reinterpret_cast<char*>(this) + sizeof(value_header_t);
+ }
+
+ void initiate(NodeExtentMutable& mut, const value_config_t& config) {
+ value_header_t header{config.magic, config.payload_size};
+ mut.copy_in_absolute(this, header);
+ mut.set_absolute(get_payload(), 0, config.payload_size);
+ }
+
+ static value_size_t estimate_allocation_size(value_size_t payload_size) {
+ return payload_size + sizeof(value_header_t);
+ }
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const value_header_t& header) {
+ return os << "Value(" << header.magic
+ << ", " << header.payload_size << "B)";
+}
+
+inline value_size_t value_config_t::allocation_size() const {
+ return value_header_t::estimate_allocation_size(payload_size);
+}
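+
+// For example, since value_header_t is packed (1-byte magic + 2-byte
+// payload_size), a 64-byte payload occupies 3 + 64 = 67 bytes of allocation.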
+
+/**
+ * ValueDeltaRecorder
+ *
+ * An abstracted class to handle user-defined value delta encode, decode and
+ * replay.
+ */
+class ValueDeltaRecorder {
+ public:
+ virtual ~ValueDeltaRecorder() = default;
+ ValueDeltaRecorder(const ValueDeltaRecorder&) = delete;
+ ValueDeltaRecorder(ValueDeltaRecorder&&) = delete;
+ ValueDeltaRecorder& operator=(const ValueDeltaRecorder&) = delete;
+ ValueDeltaRecorder& operator=(ValueDeltaRecorder&&) = delete;
+
+ /// Returns the value header magic for validation purposes.
+ virtual value_magic_t get_header_magic() const = 0;
+
+ /// Called by DeltaRecorderT to apply user-defined value delta.
+ virtual void apply_value_delta(ceph::bufferlist::const_iterator&,
+ NodeExtentMutable&,
+ laddr_t) = 0;
+
+ protected:
+ ValueDeltaRecorder(ceph::bufferlist& encoded) : encoded{encoded} {}
+
+ /// Get the delta buffer to encode user-defined value delta.
+ ceph::bufferlist& get_encoded(NodeExtentMutable&);
+
+ private:
+ ceph::bufferlist& encoded;
+};
+
+/**
+ * tree_conf_t
+ *
+ * Hard limits and compile-time configurations.
+ */
+struct tree_conf_t {
+ value_magic_t value_magic;
+ string_size_t max_ns_size;
+ string_size_t max_oid_size;
+ value_size_t max_value_payload_size;
+ extent_len_t internal_node_size;
+ extent_len_t leaf_node_size;
+ bool do_split_check = true;
+};
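+
+// An illustrative (hypothetical) configuration; real value implementations
+// define their own TREE_CONF with values chosen for their workload:
+//   constexpr tree_conf_t example_conf{
+//     value_magic_t::TEST_BOUNDED,  // value_magic
+//     2048,                         // max_ns_size
+//     2048,                         // max_oid_size
+//     256,                          // max_value_payload_size
+//     4096,                         // internal_node_size
+//     4096,                         // leaf_node_size
+//   };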
+
+class tree_cursor_t;
+/**
+ * Value
+ *
+ * Value is a stateless view of the underlying value header and payload content
+ * stored in a tree leaf node, with support for user-defined value deltas and
+ * for extending and trimming the underlying payload data (not implemented
+ * yet).
+ *
+ * In the current implementation, we don't guarantee any alignment for the
+ * value payload due to the unaligned node layout and the corresponding merge
+ * and split operations.
+ */
+class Value {
+ public:
+ virtual ~Value();
+ Value(const Value&) = default;
+ Value(Value&&) = default;
+ Value& operator=(const Value&) = delete;
+ Value& operator=(Value&&) = delete;
+
+ /// Returns whether the Value is still tracked in the tree.
+ bool is_tracked() const;
+
+ /// Invalidate the Value before submitting transaction.
+ void invalidate();
+
+ /// Returns the value payload size.
+ value_size_t get_payload_size() const {
+ assert(is_tracked());
+ return read_value_header()->payload_size;
+ }
+
+ laddr_t get_hint() const;
+
+ bool operator==(const Value& v) const { return p_cursor == v.p_cursor; }
+ bool operator!=(const Value& v) const { return !(*this == v); }
+
+ protected:
+ Value(NodeExtentManager&, const ValueBuilder&, Ref<tree_cursor_t>&);
+
+ /// Extends the payload size.
+ eagain_ifuture<> extend(Transaction&, value_size_t extend_size);
+
+ /// Trim and shrink the payload.
+ eagain_ifuture<> trim(Transaction&, value_size_t trim_size);
+
+ /// Get the permission to mutate the payload with the optional value recorder.
+ template <typename PayloadT, typename ValueDeltaRecorderT>
+ std::pair<NodeExtentMutable&, ValueDeltaRecorderT*>
+ prepare_mutate_payload(Transaction& t) {
+ assert(is_tracked());
+ assert(sizeof(PayloadT) <= get_payload_size());
+
+ auto value_mutable = do_prepare_mutate_payload(t);
+ assert(value_mutable.first.get_write() ==
+ const_cast<const Value*>(this)->template read_payload<char>());
+ assert(value_mutable.first.get_length() == get_payload_size());
+ return {value_mutable.first,
+ static_cast<ValueDeltaRecorderT*>(value_mutable.second)};
+ }
+
+ /// Get the latest payload pointer for read.
+ template <typename PayloadT>
+ const PayloadT* read_payload() const {
+ assert(is_tracked());
+ // see Value documentation
+ static_assert(alignof(PayloadT) == 1);
+ assert(sizeof(PayloadT) <= get_payload_size());
+ return reinterpret_cast<const PayloadT*>(read_value_header()->get_payload());
+ }
+
+ private:
+ const value_header_t* read_value_header() const;
+ context_t get_context(Transaction& t) {
+ return {nm, vb, t};
+ }
+
+ std::pair<NodeExtentMutable&, ValueDeltaRecorder*>
+ do_prepare_mutate_payload(Transaction&);
+
+ NodeExtentManager& nm;
+ const ValueBuilder& vb;
+ Ref<tree_cursor_t> p_cursor;
+
+ template <typename ValueImpl>
+ friend class Btree;
+};
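+
+// A hypothetical subclass sketch (the payload struct and names are made up
+// for illustration; real values define their own packed payload layouts):
+//   struct counter_payload_t { uint64_t count; } __attribute__((packed));
+//   class CounterValue final : public Value {
+//    public:
+//     using Value::Value;
+//     uint64_t read_count() const {
+//       return read_payload<counter_payload_t>()->count;
+//     }
+//   };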
+
+/**
+ * ValueBuilder
+ *
+ * For tree nodes to build values without the need to depend on the actual
+ * implementation.
+ */
+struct ValueBuilder {
+ virtual value_magic_t get_header_magic() const = 0;
+ virtual string_size_t get_max_ns_size() const = 0;
+ virtual string_size_t get_max_oid_size() const = 0;
+ virtual value_size_t get_max_value_payload_size() const = 0;
+ virtual extent_len_t get_internal_node_size() const = 0;
+ virtual extent_len_t get_leaf_node_size() const = 0;
+ virtual std::unique_ptr<ValueDeltaRecorder>
+ build_value_recorder(ceph::bufferlist&) const = 0;
+};
+
+/**
+ * ValueBuilderImpl
+ *
+ * The concrete ValueBuilder implementation in Btree.
+ */
+template <typename ValueImpl>
+struct ValueBuilderImpl final : public ValueBuilder {
+ ValueBuilderImpl() {
+ validate_tree_config(ValueImpl::TREE_CONF);
+ }
+
+ value_magic_t get_header_magic() const override {
+ return ValueImpl::TREE_CONF.value_magic;
+ }
+ string_size_t get_max_ns_size() const override {
+ return ValueImpl::TREE_CONF.max_ns_size;
+ }
+ string_size_t get_max_oid_size() const override {
+ return ValueImpl::TREE_CONF.max_oid_size;
+ }
+ value_size_t get_max_value_payload_size() const override {
+ return ValueImpl::TREE_CONF.max_value_payload_size;
+ }
+ extent_len_t get_internal_node_size() const override {
+ return ValueImpl::TREE_CONF.internal_node_size;
+ }
+ extent_len_t get_leaf_node_size() const override {
+ return ValueImpl::TREE_CONF.leaf_node_size;
+ }
+
+ std::unique_ptr<ValueDeltaRecorder>
+ build_value_recorder(ceph::bufferlist& encoded) const override {
+ std::unique_ptr<ValueDeltaRecorder> ret =
+ std::make_unique<typename ValueImpl::Recorder>(encoded);
+ assert(ret->get_header_magic() == get_header_magic());
+ return ret;
+ }
+
+ ValueImpl build_value(NodeExtentManager& nm,
+ const ValueBuilder& vb,
+ Ref<tree_cursor_t>& p_cursor) const {
+ assert(vb.get_header_magic() == get_header_magic());
+ return ValueImpl(nm, vb, p_cursor);
+ }
+};
+
+void validate_tree_config(const tree_conf_t& conf);
+
+/**
+ * Get the value recorder by type (the magic value) when the ValueBuilder is
+ * unavailable.
+ */
+std::unique_ptr<ValueDeltaRecorder>
+build_value_recorder_by_type(ceph::bufferlist& encoded, const value_magic_t& magic);
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::onode::value_config_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::onode::value_header_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/ordering_handle.h b/src/crimson/os/seastore/ordering_handle.h
new file mode 100644
index 000000000..a7802fda3
--- /dev/null
+++ b/src/crimson/os/seastore/ordering_handle.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/shared_mutex.hh>
+
+#include "crimson/common/operation.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::os::seastore {
+
+struct WritePipeline {
+ struct ReserveProjectedUsage : OrderedExclusivePhaseT<ReserveProjectedUsage> {
+ constexpr static auto type_name = "WritePipeline::reserve_projected_usage";
+ } reserve_projected_usage;
+ struct OolWrites : UnorderedStageT<OolWrites> {
+ constexpr static auto type_name = "UnorderedStage::ool_writes_stage";
+ } ool_writes;
+ struct Prepare : OrderedExclusivePhaseT<Prepare> {
+ constexpr static auto type_name = "WritePipeline::prepare_phase";
+ } prepare;
+ struct DeviceSubmission : OrderedConcurrentPhaseT<DeviceSubmission> {
+ constexpr static auto type_name = "WritePipeline::device_submission_phase";
+ } device_submission;
+ struct Finalize : OrderedExclusivePhaseT<Finalize> {
+ constexpr static auto type_name = "WritePipeline::finalize_phase";
+ } finalize;
+
+ using BlockingEvents = std::tuple<
+ ReserveProjectedUsage::BlockingEvent,
+ OolWrites::BlockingEvent,
+ Prepare::BlockingEvent,
+ DeviceSubmission::BlockingEvent,
+ Finalize::BlockingEvent
+ >;
+};
+
+/**
+ * PlaceholderOperation
+ *
+ * Once seastore is more complete, I expect to update the externally
+ * facing interfaces to permit passing the osd level operation through.
+ * Until then (and for tests likely permanently) we'll use this unregistered
+ * placeholder for the pipeline phases necessary for journal correctness.
+ */
+class PlaceholderOperation : public crimson::osd::PhasedOperationT<PlaceholderOperation> {
+public:
+ constexpr static auto type = 0U;
+ constexpr static auto type_name =
+ "crimson::os::seastore::PlaceholderOperation";
+
+ static PlaceholderOperation::IRef create() {
+ return IRef{new PlaceholderOperation()};
+ }
+
+ PipelineHandle handle;
+ WritePipeline::BlockingEvents tracking_events;
+
+ PipelineHandle& get_handle() {
+ return handle;
+ }
+private:
+ void dump_detail(ceph::Formatter *f) const final {}
+ void print(std::ostream &) const final {}
+};
+
+struct OperationProxy {
+ OperationRef op;
+ OperationProxy(OperationRef op) : op(std::move(op)) {}
+
+ virtual seastar::future<> enter(WritePipeline::ReserveProjectedUsage&) = 0;
+ virtual seastar::future<> enter(WritePipeline::OolWrites&) = 0;
+ virtual seastar::future<> enter(WritePipeline::Prepare&) = 0;
+ virtual seastar::future<> enter(WritePipeline::DeviceSubmission&) = 0;
+ virtual seastar::future<> enter(WritePipeline::Finalize&) = 0;
+
+ virtual void exit() = 0;
+ virtual seastar::future<> complete() = 0;
+
+ virtual ~OperationProxy() = default;
+};
+
+template <typename OpT>
+struct OperationProxyT : OperationProxy {
+ OperationProxyT(typename OpT::IRef op) : OperationProxy(op) {}
+
+ OpT* that() {
+ return static_cast<OpT*>(op.get());
+ }
+ const OpT* that() const {
+ return static_cast<const OpT*>(op.get());
+ }
+
+ seastar::future<> enter(WritePipeline::ReserveProjectedUsage& s) final {
+ return that()->enter_stage(s);
+ }
+ seastar::future<> enter(WritePipeline::OolWrites& s) final {
+ return that()->enter_stage(s);
+ }
+ seastar::future<> enter(WritePipeline::Prepare& s) final {
+ return that()->enter_stage(s);
+ }
+ seastar::future<> enter(WritePipeline::DeviceSubmission& s) final {
+ return that()->enter_stage(s);
+ }
+ seastar::future<> enter(WritePipeline::Finalize& s) final {
+ return that()->enter_stage(s);
+ }
+
+ void exit() final {
+ return that()->handle.exit();
+ }
+ seastar::future<> complete() final {
+ return that()->handle.complete();
+ }
+};
+
+struct OrderingHandle {
+ // we can easily optimize this dynamic allocation out, as all concrete
+ // proxy types are supposed to have exactly the same size.
+ std::unique_ptr<OperationProxy> op;
+ seastar::shared_mutex *collection_ordering_lock = nullptr;
+
+ // in the future we might add further constructors / templates for type
+ // erasure while extracting the location of tracking events.
+ OrderingHandle(std::unique_ptr<OperationProxy> op) : op(std::move(op)) {}
+ OrderingHandle(OrderingHandle &&other)
+ : op(std::move(other.op)),
+ collection_ordering_lock(other.collection_ordering_lock) {
+ other.collection_ordering_lock = nullptr;
+ }
+
+ seastar::future<> take_collection_lock(seastar::shared_mutex &mutex) {
+ ceph_assert(!collection_ordering_lock);
+ collection_ordering_lock = &mutex;
+ return collection_ordering_lock->lock();
+ }
+
+ void maybe_release_collection_lock() {
+ if (collection_ordering_lock) {
+ collection_ordering_lock->unlock();
+ collection_ordering_lock = nullptr;
+ }
+ }
+
+ template <typename T>
+ seastar::future<> enter(T &t) {
+ return op->enter(t);
+ }
+
+ void exit() {
+ op->exit();
+ }
+
+ seastar::future<> complete() {
+ return op->complete();
+ }
+
+ ~OrderingHandle() {
+ maybe_release_collection_lock();
+ }
+};
+
+inline OrderingHandle get_dummy_ordering_handle() {
+ using PlaceholderOpProxy = OperationProxyT<PlaceholderOperation>;
+ return OrderingHandle{
+ std::make_unique<PlaceholderOpProxy>(PlaceholderOperation::create())};
+}
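+
+// Intended usage sketch (illustrative; error handling omitted): the caller
+// owns a WritePipeline and walks its stages in declaration order, e.g.
+//   auto handle = get_dummy_ordering_handle();
+//   return handle.enter(pipeline.reserve_projected_usage
+//   ).then([&] { return handle.enter(pipeline.ool_writes); }
+//   ).then([&] { /* ... remaining stages ... */ }
+//   ).then([&] { return handle.complete(); });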
+
+} // namespace crimson::os::seastore
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<os::seastore::PlaceholderOperation> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+} // namespace crimson
+
diff --git a/src/crimson/os/seastore/random_block_manager.cc b/src/crimson/os/seastore/random_block_manager.cc
new file mode 100644
index 000000000..749edc97f
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/random_block_manager/nvme_block_device.h"
+#include "crimson/os/seastore/random_block_manager/rbm_device.h"
+
+namespace crimson::os::seastore {
+
+seastar::future<random_block_device::RBMDeviceRef>
+get_rb_device(
+ const std::string &device)
+{
+ return seastar::make_ready_future<random_block_device::RBMDeviceRef>(
+ std::make_unique<
+ random_block_device::nvme::NVMeBlockDevice
+ >(device + "/block"));
+}
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager.h b/src/crimson/os/seastore/random_block_manager.h
new file mode 100644
index 000000000..d9be1b5e6
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "include/buffer_fwd.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/transaction.h"
+
+#include "crimson/common/layout.h"
+#include "include/buffer.h"
+#include "crimson/os/seastore/device.h"
+
+namespace crimson::os::seastore {
+
+struct rbm_shard_info_t {
+ std::size_t size = 0;
+ uint64_t start_offset = 0;
+
+ DENC(rbm_shard_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.size, p);
+ denc(v.start_offset, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct rbm_metadata_header_t {
+ size_t size = 0;
+ size_t block_size = 0;
+ uint64_t feature = 0;
+ uint64_t journal_size = 0;
+ checksum_t crc = 0;
+ device_config_t config;
+ unsigned int shard_num = 0;
+ std::vector<rbm_shard_info_t> shard_infos;
+
+ DENC(rbm_metadata_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.size, p);
+ denc(v.block_size, p);
+ denc(v.feature, p);
+
+ denc(v.journal_size, p);
+ denc(v.crc, p);
+ denc(v.config, p);
+ denc(v.shard_num, p);
+ denc(v.shard_infos, p);
+ DENC_FINISH(p);
+ }
+
+ void validate() const {
+ ceph_assert(shard_num == seastar::smp::count);
+ ceph_assert(block_size > 0);
+ for (unsigned int i = 0; i < seastar::smp::count; i ++) {
+ ceph_assert(shard_infos[i].size > block_size &&
+ shard_infos[i].size % block_size == 0);
+ ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX);
+ ceph_assert(journal_size > 0 &&
+ journal_size % block_size == 0);
+ ceph_assert(shard_infos[i].start_offset < size &&
+ shard_infos[i].start_offset % block_size == 0);
+ }
+ ceph_assert(config.spec.magic != 0);
+ ceph_assert(get_default_backend_of_device(config.spec.dtype) ==
+ backend_type_t::RANDOM_BLOCK);
+ ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID);
+ }
+};
+
+enum class rbm_extent_state_t {
+ FREE, // not allocated
+ RESERVED, // extent is reserved by alloc_new_extent, but is not persistent
+ ALLOCATED, // extent is persistent
+};
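+
+// Expected lifecycle sketch (the authoritative transitions live in the
+// allocator implementation):
+//   FREE --alloc_extent()--> RESERVED --complete_allocation()--> ALLOCATED
+//   RESERVED/ALLOCATED --free_extent()/mark_space_free()--> FREE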
+
+class Device;
+using rbm_abs_addr = uint64_t;
+constexpr rbm_abs_addr RBM_START_ADDRESS = 0;
+class RandomBlockManager {
+public:
+
+ using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual read_ertr::future<> read(paddr_t addr, bufferptr &buffer) = 0;
+
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::ebadf,
+ crimson::ct_error::enospc,
+ crimson::ct_error::erange
+ >;
+ virtual write_ertr::future<> write(paddr_t addr, bufferptr &buf) = 0;
+
+ using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual open_ertr::future<> open() = 0;
+
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg>;
+ virtual close_ertr::future<> close() = 0;
+
+ using allocate_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enospc
+ >;
+ using allocate_ret = allocate_ertr::future<paddr_t>;
+ // allocator, return start addr of allocated blocks
+ virtual paddr_t alloc_extent(size_t size) = 0;
+
+ virtual void mark_space_used(paddr_t paddr, size_t len) = 0;
+ virtual void mark_space_free(paddr_t paddr, size_t len) = 0;
+
+ virtual void complete_allocation(paddr_t addr, size_t size) = 0;
+
+ virtual size_t get_size() const = 0;
+ virtual extent_len_t get_block_size() const = 0;
+ virtual uint64_t get_free_blocks() const = 0;
+ virtual device_id_t get_device_id() const = 0;
+ virtual const seastore_meta_t &get_meta() const = 0;
+ virtual Device* get_device() = 0;
+ virtual paddr_t get_start() = 0;
+ virtual rbm_extent_state_t get_extent_state(paddr_t addr, size_t size) = 0;
+ virtual size_t get_journal_size() const = 0;
+ virtual ~RandomBlockManager() {}
+};
+using RandomBlockManagerRef = std::unique_ptr<RandomBlockManager>;
+
+inline rbm_abs_addr convert_paddr_to_abs_addr(const paddr_t& paddr) {
+ const blk_paddr_t& blk_addr = paddr.as_blk_paddr();
+ return blk_addr.get_device_off();
+}
+
+inline paddr_t convert_abs_addr_to_paddr(rbm_abs_addr addr, device_id_t d_id) {
+ return paddr_t::make_blk_paddr(d_id, addr);
+}
+
+namespace random_block_device {
+ class RBMDevice;
+}
+
+seastar::future<std::unique_ptr<random_block_device::RBMDevice>>
+ get_rb_device(const std::string &device);
+
+std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header);
+std::ostream &operator<<(std::ostream &out, const rbm_shard_info_t &shard);
+}
+
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::rbm_shard_info_t
+)
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::rbm_metadata_header_t
+)
+
+#if FMT_VERSION >= 90000
+template<> struct fmt::formatter<crimson::os::seastore::rbm_metadata_header_t> : fmt::ostream_formatter {};
+template<> struct fmt::formatter<crimson::os::seastore::rbm_shard_info_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/random_block_manager/avlallocator.cc b/src/crimson/os/seastore/random_block_manager/avlallocator.cc
new file mode 100644
index 000000000..28137a23d
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/avlallocator.cc
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+//
+#include "avlallocator.h"
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_device);
+
+namespace crimson::os::seastore {
+
+void AvlAllocator::mark_extent_used(rbm_abs_addr addr, size_t size)
+{
+ LOG_PREFIX(AvlAllocator::mark_extent_used);
+ DEBUG("addr: {}, size: {}, avail: {}", addr, size, available_size);
+ _remove_from_tree(addr, size);
+}
+
+void AvlAllocator::init(rbm_abs_addr addr, size_t size, size_t b_size)
+{
+ LOG_PREFIX(AvlAllocator::init);
+ DEBUG("addr: {}, size: {}", addr, size);
+ auto r = new extent_range_t{ addr, addr + size };
+ extent_tree.insert(*r);
+ extent_size_tree.insert(*r);
+ available_size = size;
+ block_size = b_size;
+ total_size = size;
+ base_addr = addr;
+}
+
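+// _remove_from_tree() carves [start, start + size) out of the free range that
+// contains it; four cases follow from whether the containing range has
+// left-over space before `start` and/or after `end`:
+//   both      : shrink the range to its left part and insert a new right part
+//   left only : shrink the range to end at `start`
+//   right only: shrink the range to begin at `end`
+//   neither   : erase the range entirely
+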
+void AvlAllocator::_remove_from_tree(rbm_abs_addr start, rbm_abs_addr size)
+{
+ LOG_PREFIX(AvlAllocator::_remove_from_tree);
+ rbm_abs_addr end = start + size;
+
+ ceph_assert(size != 0);
+ ceph_assert(size <= available_size);
+
+ auto rs = extent_tree.find(extent_range_t{start, end}, extent_tree.key_comp());
+ DEBUG("rs start: {}, rs end: {}", rs->start, rs->end);
+ ceph_assert(rs != extent_tree.end());
+ ceph_assert(rs->start <= start);
+ ceph_assert(rs->end >= end);
+
+ bool left_over = (rs->start != start);
+ bool right_over = (rs->end != end);
+
+ _extent_size_tree_rm(*rs);
+
+ if (left_over && right_over) {
+ auto old_right_end = rs->end;
+ auto insert_pos = rs;
+ ceph_assert(insert_pos != extent_tree.end());
+ ++insert_pos;
+ rs->end = start;
+
+ auto r = new extent_range_t{end, old_right_end};
+ extent_tree.insert_before(insert_pos, *r);
+ extent_size_tree.insert(*r);
+ available_size += r->length();
+ _extent_size_tree_try_insert(*rs);
+ } else if (left_over) {
+ assert(is_aligned(start, block_size));
+ rs->end = start;
+ _extent_size_tree_try_insert(*rs);
+ } else if (right_over) {
+ assert(is_aligned(end, block_size));
+ rs->start = end;
+ _extent_size_tree_try_insert(*rs);
+ } else {
+ extent_tree.erase_and_dispose(rs, dispose_rs{});
+ }
+}
+
+rbm_abs_addr AvlAllocator::find_block(size_t size)
+{
+ const auto comp = extent_size_tree.key_comp();
+ auto iter = extent_size_tree.lower_bound(
+ extent_range_t{base_addr, base_addr + size}, comp);
+ for (; iter != extent_size_tree.end(); ++iter) {
+ assert(is_aligned(iter->start, block_size));
+ rbm_abs_addr off = iter->start;
+ if (off + size <= iter->end) {
+ return off;
+ }
+ }
+ // one past the end of the managed range signals allocation failure
+ return base_addr + total_size;
+}
+
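+// _add_to_tree() returns [start, start + size) to the free space, coalescing
+// with the neighbouring range that ends at `start`, the one that begins at
+// `end`, both (erasing the left neighbour), or inserting a new range.
+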
+void AvlAllocator::_add_to_tree(rbm_abs_addr start, rbm_abs_addr size)
+{
+ LOG_PREFIX(AvlAllocator::_add_to_tree);
+ ceph_assert(size != 0);
+ DEBUG("addr: {}, size: {}", start, size);
+
+ rbm_abs_addr end = start + size;
+
+ auto rs_after = extent_tree.upper_bound(extent_range_t{start, end},
+ extent_tree.key_comp());
+
+ auto rs_before = extent_tree.end();
+ if (rs_after != extent_tree.begin()) {
+ rs_before = std::prev(rs_after);
+ }
+
+ bool merge_before = (rs_before != extent_tree.end() && rs_before->end == start);
+ bool merge_after = (rs_after != extent_tree.end() && rs_after->start == end);
+
+ if (merge_before && merge_after) {
+ _extent_size_tree_rm(*rs_before);
+ _extent_size_tree_rm(*rs_after);
+ rs_after->start = rs_before->start;
+ extent_tree.erase_and_dispose(rs_before, dispose_rs{});
+ _extent_size_tree_try_insert(*rs_after);
+ } else if (merge_before) {
+ _extent_size_tree_rm(*rs_before);
+ rs_before->end = end;
+ _extent_size_tree_try_insert(*rs_before);
+ } else if (merge_after) {
+ _extent_size_tree_rm(*rs_after);
+ rs_after->start = start;
+ _extent_size_tree_try_insert(*rs_after);
+ } else {
+ auto r = new extent_range_t{start, end};
+ extent_tree.insert(*r);
+ extent_size_tree.insert(*r);
+ available_size += r->length();
+ }
+}
+
+std::optional<interval_set<rbm_abs_addr>> AvlAllocator::alloc_extent(
+ size_t size)
+{
+ LOG_PREFIX(AvlAllocator::alloc_extent);
+ if (available_size < size) {
+ return std::nullopt;
+ }
+ if (extent_size_tree.empty()) {
+ return std::nullopt;
+ }
+ ceph_assert(size > 0);
+ ceph_assert(is_aligned(size, block_size));
+
+ interval_set<rbm_abs_addr> result;
+
+ auto try_to_alloc_block = [this, &result, FNAME] (uint64_t alloc_size) -> uint64_t
+ {
+ rbm_abs_addr start = find_block(alloc_size);
+ if (start != base_addr + total_size) {
+ _remove_from_tree(start, alloc_size);
+ DEBUG("allocate addr: {}, allocate size: {}, available size: {}",
+ start, alloc_size, available_size);
+ result.insert(start, alloc_size);
+ return alloc_size;
+ }
+ return 0;
+ };
+
+ auto alloc = std::min(max_alloc_size, size);
+ rbm_abs_addr ret = try_to_alloc_block(alloc);
+ if (ret == 0) {
+ return std::nullopt;
+ }
+
+ assert(!result.empty());
+ assert(result.num_intervals() == 1);
+ for (auto p : result) {
+ INFO("result start: {}, end: {}", p.first, p.first + p.second);
+ if (detailed) {
+ assert(!reserved_extent_tracker.contains(p.first, p.second));
+ reserved_extent_tracker.insert(p.first, p.second);
+ }
+ }
+ return result;
+}
+
+void AvlAllocator::free_extent(rbm_abs_addr addr, size_t size)
+{
+ assert(total_size);
+ assert(total_size > available_size);
+ _add_to_tree(addr, size);
+ if (detailed && reserved_extent_tracker.contains(addr, size)) {
+ reserved_extent_tracker.erase(addr, size);
+ }
+}
+
+bool AvlAllocator::is_free_extent(rbm_abs_addr start, size_t size)
+{
+ rbm_abs_addr end = start + size;
+ ceph_assert(size != 0);
+ if (start < base_addr || base_addr + total_size < end) {
+ return false;
+ }
+
+ auto rs = extent_tree.find(extent_range_t{start, end}, extent_tree.key_comp());
+ if (rs != extent_tree.end() && rs->start <= start && rs->end >= end) {
+ return true;
+ }
+ return false;
+}
+}
diff --git a/src/crimson/os/seastore/random_block_manager/avlallocator.h b/src/crimson/os/seastore/random_block_manager/avlallocator.h
new file mode 100644
index 000000000..d1a4fabca
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/avlallocator.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include "extent_allocator.h"
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/transaction.h"
+#include <string.h>
+#include "include/buffer.h"
+
+#include <boost/intrusive/avl_set.hpp>
+#include <optional>
+#include <vector>
+
+namespace crimson::os::seastore {
+
+struct extent_range_t {
+ rbm_abs_addr start;
+ rbm_abs_addr end;
+
+ extent_range_t(rbm_abs_addr start, rbm_abs_addr end) :
+ start(start), end(end)
+ {}
+
+ struct before_t {
+ template<typename KeyLeft, typename KeyRight>
+ bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
+ return lhs.end <= rhs.start;
+ }
+ };
+ boost::intrusive::avl_set_member_hook<> offset_hook;
+
+ struct shorter_t {
+ template<typename KeyType>
+ bool operator()(const extent_range_t& lhs, const KeyType& rhs) const {
+ auto lhs_size = lhs.length();
+ auto rhs_size = rhs.end - rhs.start;
+ if (lhs_size < rhs_size) {
+ return true;
+ } else if (lhs_size > rhs_size) {
+ return false;
+ } else {
+ return lhs.start < rhs.start;
+ }
+ }
+ };
+
+ size_t length() const {
+ return end - start;
+ }
+ boost::intrusive::avl_set_member_hook<> size_hook;
+};
+
+/*
+ * This is a simplified version of BlueStore's AvlAllocator.
+ */
+class AvlAllocator : public ExtentAllocator {
+public:
+ AvlAllocator(bool detailed) :
+ detailed(detailed) {}
+ std::optional<interval_set<rbm_abs_addr>> alloc_extent(
+ size_t size) final;
+
+ void free_extent(rbm_abs_addr addr, size_t size) final;
+ void mark_extent_used(rbm_abs_addr addr, size_t size) final;
+ void init(rbm_abs_addr addr, size_t size, size_t b_size);
+
+ struct dispose_rs {
+ void operator()(extent_range_t* p)
+ {
+ delete p;
+ }
+ };
+
+ ~AvlAllocator() {
+ close();
+ }
+
+ void close() {
+ if (!detailed) {
+ assert(reserved_extent_tracker.size() == 0);
+ }
+ extent_size_tree.clear();
+ extent_tree.clear_and_dispose(dispose_rs{});
+ total_size = 0;
+ block_size = 0;
+ available_size = 0;
+ base_addr = 0;
+ }
+
+ uint64_t get_available_size() const final {
+ return available_size;
+ }
+
+ uint64_t get_max_alloc_size() const final {
+ return max_alloc_size;
+ }
+
+ bool is_free_extent(rbm_abs_addr start, size_t size);
+
+ void complete_allocation(rbm_abs_addr start, size_t size) final {
+ if (detailed) {
+ assert(reserved_extent_tracker.contains(start, size));
+ reserved_extent_tracker.erase(start, size);
+ }
+ }
+
+ bool is_reserved_extent(rbm_abs_addr start, size_t size) {
+ if (detailed) {
+ return reserved_extent_tracker.contains(start, size);
+ }
+ return false;
+ }
+
+ rbm_extent_state_t get_extent_state(rbm_abs_addr addr, size_t size) final {
+ if (is_reserved_extent(addr, size)) {
+ return rbm_extent_state_t::RESERVED;
+ } else if (is_free_extent(addr, size)) {
+ return rbm_extent_state_t::FREE;
+ }
+ return rbm_extent_state_t::ALLOCATED;
+ }
+
+private:
+ void _add_to_tree(rbm_abs_addr start, size_t size);
+
+ void _extent_size_tree_rm(extent_range_t& r) {
+ ceph_assert(available_size >= r.length());
+ available_size -= r.length();
+ extent_size_tree.erase(r);
+ }
+
+ void _extent_size_tree_try_insert(extent_range_t& r) {
+ extent_size_tree.insert(r);
+ available_size += r.length();
+ }
+
+ void _remove_from_tree(rbm_abs_addr start, rbm_abs_addr size);
+ rbm_abs_addr find_block(size_t size);
+
+ using extent_tree_t =
+ boost::intrusive::avl_set<
+ extent_range_t,
+ boost::intrusive::compare<extent_range_t::before_t>,
+ boost::intrusive::member_hook<
+ extent_range_t,
+ boost::intrusive::avl_set_member_hook<>,
+ &extent_range_t::offset_hook>>;
+ extent_tree_t extent_tree;
+
+ using extent_size_tree_t =
+ boost::intrusive::avl_set<
+ extent_range_t,
+ boost::intrusive::compare<extent_range_t::shorter_t>,
+ boost::intrusive::member_hook<
+ extent_range_t,
+ boost::intrusive::avl_set_member_hook<>,
+ &extent_range_t::size_hook>>;
+ extent_size_tree_t extent_size_tree;
+
+ uint64_t block_size = 0;
+ uint64_t available_size = 0;
+ uint64_t total_size = 0;
+ uint64_t base_addr = 0;
+ uint64_t max_alloc_size = 4 << 20;
+ bool detailed;
+ interval_set<rbm_abs_addr> reserved_extent_tracker;
+};
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
new file mode 100644
index 000000000..511b70a2e
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/os/seastore/logging.h"
+
+#include "include/buffer.h"
+#include "rbm_device.h"
+#include "include/interval_set.h"
+#include "include/intarith.h"
+#include "block_rb_manager.h"
+
+SET_SUBSYS(seastore_device);
+
+namespace crimson::os::seastore {
+
+device_config_t get_rbm_ephemeral_device_config(
+ std::size_t index, std::size_t num_devices)
+{
+ assert(num_devices > index);
+ magic_t magic = 0xfffa;
+ auto type = device_type_t::RANDOM_BLOCK_EPHEMERAL;
+ bool is_major_device;
+ secondary_device_set_t secondary_devices;
+ if (index == 0) {
+ is_major_device = true;
+ for (std::size_t secondary_index = index + 1;
+ secondary_index < num_devices;
+ ++secondary_index) {
+ device_id_t secondary_id = static_cast<device_id_t>(secondary_index);
+ secondary_devices.insert({
+ secondary_index, device_spec_t{magic, type, secondary_id}
+ });
+ }
+ } else { // index > 0
+ is_major_device = false;
+ }
+
+ device_id_t id = static_cast<device_id_t>(DEVICE_ID_RANDOM_BLOCK_MIN + index);
+ seastore_meta_t meta = {};
+ return {is_major_device,
+ device_spec_t{magic, type, id},
+ meta,
+ secondary_devices};
+}
+
+paddr_t BlockRBManager::alloc_extent(size_t size)
+{
+ LOG_PREFIX(BlockRBManager::alloc_extent);
+ assert(allocator);
+ auto alloc = allocator->alloc_extent(size);
+ ceph_assert(alloc.has_value());
+ ceph_assert((*alloc).num_intervals() == 1);
+ auto extent = (*alloc).begin();
+ ceph_assert(size == extent.get_len());
+ paddr_t paddr = convert_abs_addr_to_paddr(
+ extent.get_start(),
+ device->get_device_id());
+ DEBUG("allocated addr: {}, size: {}, requested size: {}",
+ paddr, extent.get_len(), size);
+ return paddr;
+}
+
+void BlockRBManager::complete_allocation(
+ paddr_t paddr, size_t size)
+{
+ assert(allocator);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr);
+ allocator->complete_allocation(addr, size);
+}
+
+BlockRBManager::open_ertr::future<> BlockRBManager::open()
+{
+ assert(device);
+ assert(device->get_available_size() > 0);
+ assert(device->get_block_size() > 0);
+ auto ool_start = get_start_rbm_addr();
+ allocator->init(
+ ool_start,
+ device->get_shard_end() -
+ ool_start,
+ device->get_block_size());
+ return open_ertr::now();
+}
+
+BlockRBManager::write_ertr::future<> BlockRBManager::write(
+ paddr_t paddr,
+ bufferptr &bptr)
+{
+ LOG_PREFIX(BlockRBManager::write);
+ ceph_assert(device);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr);
+ rbm_abs_addr start = device->get_shard_start();
+ rbm_abs_addr end = device->get_shard_end();
+ if (addr < start || addr + bptr.length() > end) {
+ ERROR("out of range: start {}, end {}, addr {}, length {}",
+ start, end, addr, bptr.length());
+ return crimson::ct_error::erange::make();
+ }
+ bufferptr bp = bufferptr(ceph::buffer::create_page_aligned(bptr.length()));
+ bp.copy_in(0, bptr.length(), bptr.c_str());
+ return device->write(
+ addr,
+ std::move(bp));
+}
+
+BlockRBManager::read_ertr::future<> BlockRBManager::read(
+ paddr_t paddr,
+ bufferptr &bptr)
+{
+ LOG_PREFIX(BlockRBManager::read);
+ ceph_assert(device);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr);
+ rbm_abs_addr start = device->get_shard_start();
+ rbm_abs_addr end = device->get_shard_end();
+ if (addr < start || addr + bptr.length() > end) {
+ ERROR("out of range: start {}, end {}, addr {}, length {}",
+ start, end, addr, bptr.length());
+ return crimson::ct_error::erange::make();
+ }
+ return device->read(
+ addr,
+ bptr);
+}
+
+BlockRBManager::close_ertr::future<> BlockRBManager::close()
+{
+ ceph_assert(device);
+ allocator->close();
+ return device->close();
+}
+
+BlockRBManager::write_ertr::future<> BlockRBManager::write(
+ rbm_abs_addr addr,
+ bufferlist &bl)
+{
+ LOG_PREFIX(BlockRBManager::write);
+ ceph_assert(device);
+ bufferptr bptr;
+ try {
+ bptr = bufferptr(ceph::buffer::create_page_aligned(bl.length()));
+ auto iter = bl.cbegin();
+ iter.copy(bl.length(), bptr.c_str());
+ } catch (const std::exception &e) {
+ DEBUG("write: exception creating aligned buffer {}", e);
+ ceph_assert(0 == "unhandled exception");
+ }
+ return device->write(
+ addr,
+ std::move(bptr));
+}
+
+std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header)
+{
+ out << " rbm_metadata_header_t(size=" << header.size
+ << ", block_size=" << header.block_size
+ << ", feature=" << header.feature
+ << ", journal_size=" << header.journal_size
+ << ", crc=" << header.crc
+ << ", config=" << header.config
+ << ", shard_num=" << header.shard_num;
+ for (auto p : header.shard_infos) {
+ out << p;
+ }
+ return out << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const rbm_shard_info_t &shard)
+{
+ out << " rbm_shard_info_t(size=" << shard.size
+ << ", start_offset=" << shard.start_offset;
+ return out << ")";
+}
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.h b/src/crimson/os/seastore/random_block_manager/block_rb_manager.h
new file mode 100644
index 000000000..b686820d0
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "include/buffer_fwd.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/transaction.h"
+#include "rbm_device.h"
+#include "crimson/os/seastore/random_block_manager.h"
+
+#include "crimson/common/layout.h"
+#include "include/buffer.h"
+#include "include/uuid.h"
+#include "avlallocator.h"
+
+
+namespace crimson::os::seastore {
+
+using RBMDevice = random_block_device::RBMDevice;
+using RBMDeviceRef = std::unique_ptr<RBMDevice>;
+
+device_config_t get_rbm_ephemeral_device_config(
+ std::size_t index, std::size_t num_devices);
+
+class BlockRBManager final : public RandomBlockManager {
+public:
+ /*
+ * Ondisk layout (TODO)
+ *
+ * ---------------------------------------------------------------------------
+ * | rbm_metadata_header_t | metadata | ... | data blocks |
+ * ---------------------------------------------------------------------------
+ */
+
+ read_ertr::future<> read(paddr_t addr, bufferptr &buffer) final;
+ write_ertr::future<> write(paddr_t addr, bufferptr &buf) final;
+ open_ertr::future<> open() final;
+ close_ertr::future<> close() final;
+
+ /*
+ * alloc_extent
+ *
+ * The role of this function is to find the free blocks the transaction requires.
+ * To do so, alloc_extent() looks into both the in-memory allocator
+ * and the free-bitmap blocks.
+ *
+ * TODO: multiple allocation
+ *
+ */
+ paddr_t alloc_extent(size_t size) final; // allocator, return blocks
+
+ void complete_allocation(paddr_t addr, size_t size) final;
+
+ size_t get_start_rbm_addr() const {
+ return device->get_shard_journal_start() + device->get_journal_size();
+ }
+ size_t get_size() const final {
+ return device->get_shard_end() - get_start_rbm_addr();
+ };
+ extent_len_t get_block_size() const final { return device->get_block_size(); }
+
+ BlockRBManager(RBMDevice * device, std::string path, bool detailed)
+ : device(device), path(path) {
+ allocator.reset(new AvlAllocator(detailed));
+ }
+
+ write_ertr::future<> write(rbm_abs_addr addr, bufferlist &bl);
+
+ device_id_t get_device_id() const final {
+ assert(device);
+ return device->get_device_id();
+ }
+
+ uint64_t get_free_blocks() const final {
+ // TODO: return correct free blocks after block allocator is introduced
+ assert(device);
+ return get_size() / get_block_size();
+ }
+ const seastore_meta_t &get_meta() const final {
+ return device->get_meta();
+ }
+ RBMDevice* get_device() {
+ return device;
+ }
+
+ void mark_space_used(paddr_t paddr, size_t len) final {
+ assert(allocator);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr);
+ assert(addr >= get_start_rbm_addr() &&
+ addr + len <= device->get_shard_end());
+ allocator->mark_extent_used(addr, len);
+ }
+
+ void mark_space_free(paddr_t paddr, size_t len) final {
+ assert(allocator);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr);
+ assert(addr >= get_start_rbm_addr() &&
+ addr + len <= device->get_shard_end());
+ allocator->free_extent(addr, len);
+ }
+
+ paddr_t get_start() final {
+ return convert_abs_addr_to_paddr(
+ get_start_rbm_addr(),
+ device->get_device_id());
+ }
+
+ rbm_extent_state_t get_extent_state(paddr_t paddr, size_t size) final {
+ assert(allocator);
+ rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr);
+ assert(addr >= get_start_rbm_addr() &&
+ addr + size <= device->get_shard_end());
+ return allocator->get_extent_state(addr, size);
+ }
+
+ size_t get_journal_size() const final {
+ return device->get_journal_size();
+ }
+
+private:
+ /*
+ * this contains the number of bitmap blocks, free blocks and
+ * rbm specific information
+ */
+ ExtentAllocatorRef allocator;
+ RBMDevice * device;
+ std::string path;
+ int stream_id; // for multi-stream
+};
+using BlockRBManagerRef = std::unique_ptr<BlockRBManager>;
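+
+// A rough allocation/write flow sketch (error handling and transaction
+// plumbing omitted; `device` is assumed to point at an already-mounted
+// RBMDevice):
+//   BlockRBManager rbm(device, path, false /* detailed */);
+//   return rbm.open().safe_then([&rbm, bptr]() mutable {
+//     paddr_t paddr = rbm.alloc_extent(4096);    // extent becomes RESERVED
+//     return rbm.write(paddr, bptr).safe_then([&rbm, paddr] {
+//       rbm.complete_allocation(paddr, 4096);    // extent becomes ALLOCATED
+//     });
+//   });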
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager/extent_allocator.h b/src/crimson/os/seastore/random_block_manager/extent_allocator.h
new file mode 100644
index 000000000..8a3e62c6d
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/extent_allocator.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "include/interval_set.h"
+
+namespace crimson::os::seastore {
+
+class ExtentAllocator {
+public:
+ /**
+ * alloc_extent
+ *
+ * Allocate a contiguous region of the given size.
+ * Note that the initial state of the extent is RESERVED after alloc_extent().
+ * See rbm_extent_state_t in random_block_manager.h
+ *
+ * @param size
+ * @return nullopt or the address range (rbm_abs_addr, len)
+ */
+ virtual std::optional<interval_set<rbm_abs_addr>> alloc_extent(
+ size_t size) = 0;
+ /**
+ * free_extent
+ *
+ * Free the given region.
+ *
+ * @param rbm_abs_addr
+ * @param size
+ */
+ virtual void free_extent(rbm_abs_addr addr, size_t size) = 0;
+ /**
+ * mark_extent_used
+ *
+ * This marks the given region as used without going through alloc_extent().
+ *
+ * @param rbm_abs_addr
+ * @param size
+ */
+ virtual void mark_extent_used(rbm_abs_addr addr, size_t size) = 0;
+ /**
+ * init
+ *
+ * Initialize the address space the ExtentAllocator will manage
+ *
+ * @param start address (rbm_abs_addr)
+ * @param total size
+ * @param block size
+ */
+ virtual void init(rbm_abs_addr addr, size_t size, size_t b_size) = 0;
+ virtual uint64_t get_available_size() const = 0;
+ virtual uint64_t get_max_alloc_size() const = 0;
+ virtual void close() = 0;
+ /**
+ * complete_allocation
+ *
+ * This changes the extent's state from RESERVED to ALLOCATED.
+ *
+ * @param start address
+ * @param size
+ */
+ virtual void complete_allocation(rbm_abs_addr start, size_t size) = 0;
+ virtual rbm_extent_state_t get_extent_state(rbm_abs_addr addr, size_t size) = 0;
+ virtual ~ExtentAllocator() {}
+};
+using ExtentAllocatorRef = std::unique_ptr<ExtentAllocator>;
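+
+// Expected call sequence sketch for an implementation such as AvlAllocator
+// (addresses are rbm_abs_addr, sizes in bytes and block-size aligned):
+//   alloc.init(base, total_size, block_size);
+//   if (auto extents = alloc.alloc_extent(block_size)) {  // now RESERVED
+//     for (auto [addr, len] : *extents) {
+//       alloc.complete_allocation(addr, len);              // now ALLOCATED
+//     }
+//   }
+//   alloc.free_extent(addr, len);                          // later, back to FREE
+//   alloc.close();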
+
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc b/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc
new file mode 100644
index 000000000..6437f06a4
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc
@@ -0,0 +1,280 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <fcntl.h>
+
+#include "crimson/common/log.h"
+#include "crimson/common/errorator-loop.h"
+
+#include "include/buffer.h"
+#include "rbm_device.h"
+#include "nvme_block_device.h"
+#include "block_rb_manager.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_seastore_tm);
+ }
+}
+
+namespace crimson::os::seastore::random_block_device::nvme {
+
+NVMeBlockDevice::mkfs_ret NVMeBlockDevice::mkfs(device_config_t config) {
+ using crimson::common::get_conf;
+ return shard_devices.local().do_primary_mkfs(config,
+ seastar::smp::count,
+ get_conf<Option::size_t>("seastore_cbjournal_size")
+ );
+}
+
+open_ertr::future<> NVMeBlockDevice::open(
+ const std::string &in_path,
+ seastar::open_flags mode) {
+ return seastar::do_with(in_path, [this, mode](auto& in_path) {
+ return seastar::file_stat(in_path).then([this, mode, in_path](auto stat) {
+ return seastar::open_file_dma(in_path, mode).then([=, this](auto file) {
+ device = std::move(file);
+ logger().debug("open");
+ // Get SSD's features from identify_controller and namespace command.
+ // Do identify_controller first, and then identify_namespace.
+ return identify_controller(device).safe_then([this, in_path, mode](
+ auto id_controller_data) {
+ support_multistream = id_controller_data.oacs.support_directives;
+ if (support_multistream) {
+ stream_id_count = WRITE_LIFE_MAX;
+ }
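+ // AWUPF is a 0-based value in the identify data (per the NVMe spec),
+ // hence the +1 before it is converted to bytes below.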
+ awupf = id_controller_data.awupf + 1;
+ return identify_namespace(device).safe_then([this, in_path, mode] (
+ auto id_namespace_data) {
+ atomic_write_unit = awupf * super.block_size;
+ data_protection_type = id_namespace_data.dps.protection_type;
+ data_protection_enabled = (data_protection_type > 0);
+ if (id_namespace_data.nsfeat.opterf == 1){
+ // NPWG and NPWA are 0-based values
+ write_granularity = super.block_size * (id_namespace_data.npwg + 1);
+ write_alignment = super.block_size * (id_namespace_data.npwa + 1);
+ }
+ return open_for_io(in_path, mode);
+ });
+ }).handle_error(crimson::ct_error::input_output_error::handle([this, in_path, mode]{
+ logger().error("open: id ctrlr failed. open without ioctl");
+ return open_for_io(in_path, mode);
+ }), crimson::ct_error::pass_further_all{});
+ });
+ });
+ });
+}
+
+open_ertr::future<> NVMeBlockDevice::open_for_io(
+ const std::string& in_path,
+ seastar::open_flags mode) {
+ io_device.resize(stream_id_count);
+ return seastar::do_for_each(io_device, [=, this](auto &target_device) {
+ return seastar::open_file_dma(in_path, mode).then([this](
+ auto file) {
+ assert(io_device.size() > stream_index_to_open);
+ io_device[stream_index_to_open] = std::move(file);
+ return io_device[stream_index_to_open].fcntl(
+ F_SET_FILE_RW_HINT,
+ (uintptr_t)&stream_index_to_open).then([this](auto ret) {
+ stream_index_to_open++;
+ return seastar::now();
+ });
+ });
+ });
+}
+
+NVMeBlockDevice::mount_ret NVMeBlockDevice::mount()
+{
+ logger().debug(" mount ");
+ return shard_devices.invoke_on_all([](auto &local_device) {
+ return local_device.do_shard_mount(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in RBMDevice::do_mount"
+ });
+ });
+}
+
+write_ertr::future<> NVMeBlockDevice::write(
+ uint64_t offset,
+ bufferptr &&bptr,
+ uint16_t stream) {
+ logger().debug(
+ "block: write offset {} len {}",
+ offset,
+ bptr.length());
+ auto length = bptr.length();
+
+ assert((length % super.block_size) == 0);
+ uint16_t supported_stream = stream;
+ if (stream >= stream_id_count) {
+ supported_stream = WRITE_LIFE_NOT_SET;
+ }
+ return seastar::do_with(
+ std::move(bptr),
+ [this, offset, length, supported_stream] (auto& bptr) {
+ return io_device[supported_stream].dma_write(
+ offset, bptr.c_str(), length).handle_exception(
+ [](auto e) -> write_ertr::future<size_t> {
+ logger().error("write: dma_write got error{}", e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length](auto result) -> write_ertr::future<> {
+ if (result != length) {
+ logger().error("write: dma_write got error with not proper length");
+ return crimson::ct_error::input_output_error::make();
+ }
+ return write_ertr::now();
+ });
+ });
+}
+
+read_ertr::future<> NVMeBlockDevice::read(
+ uint64_t offset,
+ bufferptr &bptr) {
+ logger().debug(
+ "block: read offset {} len {}",
+ offset,
+ bptr.length());
+ auto length = bptr.length();
+
+ assert((length % super.block_size) == 0);
+
+ return device.dma_read(offset, bptr.c_str(), length).handle_exception(
+ [](auto e) -> read_ertr::future<size_t> {
+ logger().error("read: dma_read got error{}", e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length](auto result) -> read_ertr::future<> {
+ if (result != length) {
+ logger().error("read: dma_read got error with not proper length");
+ return crimson::ct_error::input_output_error::make();
+ }
+ return read_ertr::now();
+ });
+}
+
+write_ertr::future<> NVMeBlockDevice::writev(
+ uint64_t offset,
+ ceph::bufferlist bl,
+ uint16_t stream) {
+ logger().debug(
+ "block: write offset {} len {}",
+ offset,
+ bl.length());
+
+ uint16_t supported_stream = stream;
+ if (stream >= stream_id_count) {
+ supported_stream = WRITE_LIFE_NOT_SET;
+ }
+ bl.rebuild_aligned(super.block_size);
+
+ return seastar::do_with(
+ bl.prepare_iovs(),
+ std::move(bl),
+ [this, supported_stream, offset](auto& iovs, auto& bl)
+ {
+ return write_ertr::parallel_for_each(
+ iovs,
+ [this, supported_stream, offset](auto& p) mutable
+ {
+ auto off = offset + p.offset;
+ auto len = p.length;
+ auto& iov = p.iov;
+ return io_device[supported_stream].dma_write(off, std::move(iov)
+ ).handle_exception(
+ [this, off, len](auto e) -> write_ertr::future<size_t>
+ {
+ logger().error("{} poffset={}~{} dma_write got error -- {}",
+ device_id_printer_t{get_device_id()}, off, len, e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([this, off, len](size_t written) -> write_ertr::future<> {
+ if (written != len) {
+ logger().error("{} poffset={}~{} dma_write len={} inconsistent",
+ device_id_printer_t{get_device_id()}, off, len, written);
+ return crimson::ct_error::input_output_error::make();
+ }
+ return write_ertr::now();
+ });
+ });
+ });
+}
+
+Device::close_ertr::future<> NVMeBlockDevice::close() {
+ logger().debug(" close ");
+ stream_index_to_open = WRITE_LIFE_NOT_SET;
+ return device.close().then([this]() {
+ return seastar::do_for_each(io_device, [](auto target_device) {
+ return target_device.close();
+ });
+ });
+}
+
+nvme_command_ertr::future<nvme_identify_controller_data_t>
+NVMeBlockDevice::identify_controller(seastar::file f) {
+ return seastar::do_with(
+ nvme_admin_command_t(),
+ nvme_identify_controller_data_t(),
+ [this, f](auto &admin_command, auto &data) {
+ admin_command.common.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+ admin_command.common.addr = (uint64_t)&data;
+ admin_command.common.data_len = sizeof(data);
+ admin_command.identify.cns = nvme_identify_command_t::CNS_CONTROLLER;
+
+ return pass_admin(admin_command, f).safe_then([&data](auto status) {
+ return seastar::make_ready_future<nvme_identify_controller_data_t>(
+ std::move(data));
+ });
+ });
+}
+
+discard_ertr::future<> NVMeBlockDevice::discard(uint64_t offset, uint64_t len) {
+ return device.discard(offset, len);
+}
+
+nvme_command_ertr::future<nvme_identify_namespace_data_t>
+NVMeBlockDevice::identify_namespace(seastar::file f) {
+ return get_nsid(f).safe_then([this, f](auto nsid) {
+ return seastar::do_with(
+ nvme_admin_command_t(),
+ nvme_identify_namespace_data_t(),
+ [this, nsid, f](auto &admin_command, auto &data) {
+ admin_command.common.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+ admin_command.common.addr = (uint64_t)&data;
+ admin_command.common.data_len = sizeof(data);
+ admin_command.common.nsid = nsid;
+ admin_command.identify.cns = nvme_identify_command_t::CNS_NAMESPACE;
+
+ return pass_admin(admin_command, f).safe_then([&data](auto status){
+ return seastar::make_ready_future<nvme_identify_namespace_data_t>(
+ std::move(data));
+ });
+ });
+ });
+}
+
+nvme_command_ertr::future<int> NVMeBlockDevice::get_nsid(seastar::file f) {
+ return f.ioctl(NVME_IOCTL_ID, nullptr).handle_exception(
+ [](auto e)->nvme_command_ertr::future<int> {
+ logger().error("pass_admin: ioctl failed");
+ return crimson::ct_error::input_output_error::make();
+ });
+}
+
+nvme_command_ertr::future<int> NVMeBlockDevice::pass_admin(
+ nvme_admin_command_t& admin_cmd, seastar::file f) {
+ return f.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception(
+ [](auto e)->nvme_command_ertr::future<int> {
+ logger().error("pass_admin: ioctl failed");
+ return crimson::ct_error::input_output_error::make();
+ });
+}
+
+nvme_command_ertr::future<int> NVMeBlockDevice::pass_through_io(
+ nvme_io_command_t& io_cmd) {
+ return device.ioctl(NVME_IOCTL_IO_CMD, &io_cmd);
+}
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager/nvme_block_device.h b/src/crimson/os/seastore/random_block_manager/nvme_block_device.h
new file mode 100644
index 000000000..ed8f99be8
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/nvme_block_device.h
@@ -0,0 +1,360 @@
+//-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <seastar/core/file.hh>
+#include <linux/nvme_ioctl.h>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/common/layout.h"
+#include "rbm_device.h"
+
+namespace ceph {
+ namespace buffer {
+ class bufferptr;
+ }
+}
+
+namespace crimson::os::seastore::random_block_device::nvme {
+/*
+ * NVMe protocol structures (nvme_XX, identify_XX)
+ *
+ * All structures related to the NVMe protocol follow NVMe protocol v1.4
+ * (the latest at the time of writing). NVMe is a protocol for fast interfacing
+ * between the host and an SSD device. We selectively adopted features among
+ * the various NVMe features to ease implementation. NVMeBlockDevice also
+ * provides generic command submission APIs for IO and Admin commands; please
+ * use pass_through_io() and pass_admin() to issue them.
+ *
+ * For more information about the NVMe protocol, refer to https://nvmexpress.org/
+ */
+struct nvme_identify_command_t {
+ uint32_t common_dw[10];
+
+ uint32_t cns : 8;
+ uint32_t reserved : 8;
+ uint32_t cnt_id : 16;
+
+ static const uint8_t CNS_NAMESPACE = 0x00;
+ static const uint8_t CNS_CONTROLLER = 0x01;
+};
+
+struct nvme_admin_command_t {
+ union {
+ nvme_passthru_cmd common;
+ nvme_identify_command_t identify;
+ };
+
+ static const uint8_t OPCODE_IDENTIFY = 0x06;
+};
+
+// Optional Admin Command Support (OACS)
+// Indicates optional commands are supported by SSD or not
+struct oacs_t {
+ uint16_t unused : 5;
+ uint16_t support_directives : 1; // Support multi-stream
+ uint16_t unused2 : 10;
+};
+
+struct nvme_identify_controller_data_t {
+ union {
+ struct {
+ uint8_t unused[256]; // [255:0]
+ oacs_t oacs; // [257:256]
+ uint8_t unused2[270]; // [527:258]
+ uint16_t awupf; // [529:528]
+ };
+ uint8_t raw[4096];
+ };
+};
+
+// End-to-end Data Protection Capabilities (DPC)
+// Indicates type of E2E data protection supported by SSD
+struct dpc_t {
+ uint8_t support_type1 : 1;
+ uint8_t support_type2 : 1;
+ uint8_t support_type3 : 1;
+ uint8_t support_first_meta : 1;
+ uint8_t support_last_meta : 1;
+ uint8_t reserved : 3;
+};
+
+// End-to-end Data Protection Type Settings (DPS)
+// Indicates enabled type of E2E data protection
+struct dps_t {
+ uint8_t protection_type : 3;
+ uint8_t protection_info : 1;
+ uint8_t reserved : 4;
+};
+
+// Namespace Features (NSFEAT)
+// Indicates features of namespace
+struct nsfeat_t {
+ uint8_t thinp : 1;
+ uint8_t nsabp : 1;
+ uint8_t dae : 1;
+ uint8_t uid_reuse : 1;
+ uint8_t opterf : 1; // Support NPWG, NPWA
+ uint8_t reserved : 3;
+};
+
+// LBA Format (LBAF)
+// Indicates LBA format (metadata size, data size, performance)
+struct lbaf_t {
+ uint32_t ms : 16;
+ uint32_t lbads : 8;
+ uint32_t rp : 2;
+ uint32_t reserved : 6;
+};
+
+struct nvme_identify_namespace_data_t {
+ union {
+ struct {
+ uint8_t unused[24]; // [23:0]
+ nsfeat_t nsfeat; // [24]
+ uint8_t unused2[3]; // [27:25]
+ dpc_t dpc; // [28]
+ dps_t dps; // [29]
+ uint8_t unused3[34]; // [63:30]
+ uint16_t npwg; // [65:64]
+ uint16_t npwa; // [67:66]
+ uint8_t unused4[60]; // [127:68]
+ lbaf_t lbaf0; // [131:128]
+ };
+ uint8_t raw[4096];
+ };
+};
+
+struct nvme_rw_command_t {
+ uint32_t common_dw[10];
+
+ uint64_t s_lba;
+
+ uint32_t nlb : 16; // 0's based value
+ uint32_t reserved : 4;
+ uint32_t d_type : 4;
+ uint32_t reserved2 : 2;
+ uint32_t prinfo_prchk : 3;
+ uint32_t prinfo_pract : 1;
+ uint32_t fua : 1;
+ uint32_t lr : 1;
+
+ uint32_t reserved3 : 16;
+ uint32_t dspec : 16;
+
+ static const uint32_t DTYPE_STREAM = 1;
+};
+
+struct nvme_io_command_t {
+ union {
+ nvme_passthru_cmd common;
+ nvme_rw_command_t rw;
+ };
+ static const uint8_t OPCODE_WRITE = 0x01;
+ static const uint8_t OPCODE_READ = 0x02; // per the NVMe spec (Write=0x01, Read=0x02)
+};
+
+/*
+ * Implementation of NVMeBlockDevice with POSIX APIs
+ *
+ * NVMeBlockDevice provides NVMe SSD interfaces through POSIX APIs, which are
+ * generally available in most operating environments.
+ */
+class NVMeBlockDevice : public RBMDevice {
+public:
+
+ /*
+ * NVMe device size-related attributes
+ *
+ * size : total size of the device in bytes.
+ *
+ * block_size : IO unit size in bytes. The caller should keep every IO command
+ * aligned to the block size.
+ *
+ * preferred_write_granularity (PWG), preferred_write_alignment (PWA) : IO unit
+ * size for writes in bytes. The caller should issue every write IO as a
+ * multiple of PWG, with the starting address aligned to PWA. Available only if
+ * the NVMe device supports NVMe protocol 1.4 or later.
+ *
+ * atomic_write_unit : the maximum size of a write whose atomicity is
+ * guaranteed by the SSD even on power failure. A write equal to or smaller
+ * than atomic_write_unit does not require fsync().
+ */
+
+ NVMeBlockDevice(std::string device_path) : device_path(device_path) {}
+ ~NVMeBlockDevice() = default;
+
+ open_ertr::future<> open(
+ const std::string &in_path,
+ seastar::open_flags mode) override;
+
+ write_ertr::future<> write(
+ uint64_t offset,
+ bufferptr &&bptr,
+ uint16_t stream = 0) override;
+
+ using RBMDevice::read;
+ read_ertr::future<> read(
+ uint64_t offset,
+ bufferptr &bptr) final;
+
+ close_ertr::future<> close() override;
+
+ discard_ertr::future<> discard(
+ uint64_t offset,
+ uint64_t len) override;
+
+ mount_ret mount() final;
+
+ mkfs_ret mkfs(device_config_t config) final;
+
+ write_ertr::future<> writev(
+ uint64_t offset,
+ ceph::bufferlist bl,
+ uint16_t stream = 0) final;
+
+ stat_device_ret stat_device() final {
+ return seastar::file_stat(device_path, seastar::follow_symlink::yes
+ ).handle_exception([](auto e) -> stat_device_ret {
+ return crimson::ct_error::input_output_error::make();
+ }).then([this](auto stat) {
+ return seastar::open_file_dma(
+ device_path,
+ seastar::open_flags::rw | seastar::open_flags::dsync
+ ).then([this, stat](auto file) mutable {
+ return file.size().then([this, stat, file](auto size) mutable {
+ stat.size = size;
+ return identify_namespace(file
+ ).safe_then([stat] (auto id_namespace_data) mutable {
+ // LBA format provides LBA size which is power of 2. LBA is the
+ // minimum size of read and write.
+ stat.block_size = (1 << id_namespace_data.lbaf0.lbads);
+ if (stat.block_size < RBM_SUPERBLOCK_SIZE) {
+ stat.block_size = RBM_SUPERBLOCK_SIZE;
+ }
+ return stat_device_ret(
+ read_ertr::ready_future_marker{},
+ stat
+ );
+ }).handle_error(crimson::ct_error::input_output_error::handle(
+ [stat]{
+ return stat_device_ret(
+ read_ertr::ready_future_marker{},
+ stat
+ );
+ }), crimson::ct_error::pass_further_all{});
+ }).safe_then([file](auto st) mutable {
+ return file.close(
+ ).then([st] {
+ return stat_device_ret(
+ read_ertr::ready_future_marker{},
+ st
+ );
+ });
+ });
+ });
+ });
+ }
+
+ std::string get_device_path() const final {
+ return device_path;
+ }
+
+ seastar::future<> start() final {
+ return shard_devices.start(device_path);
+ }
+
+ seastar::future<> stop() final {
+ return shard_devices.stop();
+ }
+
+ Device& get_sharded_device() final {
+ return shard_devices.local();
+ }
+
+ uint64_t get_preffered_write_granularity() const { return write_granularity; }
+ uint64_t get_preffered_write_alignment() const { return write_alignment; }
+ uint64_t get_atomic_write_unit() const { return atomic_write_unit; }
+ /*
+ * End-to-End Data Protection
+ *
+ * The NVMe device tracks data integrity, similar to a checksum. The client
+ * can offload checksumming to the NVMe device to reduce CPU utilization. If
+ * data protection is enabled, a checksum is calculated on every write and
+ * used to verify the data on every read.
+ */
+ bool is_data_protection_enabled() const { return data_protection_enabled; }
+
+ /*
+ * Data Health
+ *
+ * Returns the list of LBAs whose data is close to becoming corrupted. The
+ * caller can overwrite, unmap, or rewrite the data to protect it.
+ */
+ virtual nvme_command_ertr::future<std::list<uint64_t>> get_data_health() {
+ std::list<uint64_t> fragile_lbas;
+ return nvme_command_ertr::future<std::list<uint64_t>>(
+ nvme_command_ertr::ready_future_marker{},
+ fragile_lbas
+ );
+ }
+
+ /*
+ * Recovery Level
+ *
+ * Regulates the intensity of SSD-internal data recovery. The caller can get
+ * better read latency with a lower level.
+ */
+ virtual nvme_command_ertr::future<> set_data_recovery_level(
+ uint32_t level) { return nvme_command_ertr::now(); }
+ /*
+ * For passing an NVMe IO or Admin command through to the SSD.
+ * The caller can construct and execute its own NVMe command.
+ */
+ nvme_command_ertr::future<int> pass_admin(
+ nvme_admin_command_t& admin_cmd, seastar::file f);
+ nvme_command_ertr::future<int> pass_through_io(
+ nvme_io_command_t& io_cmd);
+
+ bool support_multistream = false;
+ uint8_t data_protection_type = 0;
+
+ /*
+ * Predictable Latency
+ *
+ * An NVMe device can guarantee IO latency within a pre-defined time window.
+ * This functionality will be investigated later.
+ */
+
+private:
+ // identify_controller/namespace are used to get SSD internal information such
+ // as supported features, NPWG and NPWA
+ nvme_command_ertr::future<nvme_identify_controller_data_t>
+ identify_controller(seastar::file f);
+ nvme_command_ertr::future<nvme_identify_namespace_data_t>
+ identify_namespace(seastar::file f);
+ nvme_command_ertr::future<int> get_nsid(seastar::file f);
+ open_ertr::future<> open_for_io(
+ const std::string& in_path,
+ seastar::open_flags mode);
+
+ seastar::file device;
+ std::vector<seastar::file> io_device;
+ uint32_t stream_index_to_open = WRITE_LIFE_NOT_SET;
+ uint32_t stream_id_count = 1; // streams are disabled by default.
+ uint32_t awupf = 0;
+
+ uint64_t write_granularity = 4096;
+ uint64_t write_alignment = 4096;
+ uint32_t atomic_write_unit = 4096;
+
+ bool data_protection_enabled = false;
+ std::string device_path;
+ seastar::sharded<NVMeBlockDevice> shard_devices;
+};
+
+}
diff --git a/src/crimson/os/seastore/random_block_manager/rbm_device.cc b/src/crimson/os/seastore/random_block_manager/rbm_device.cc
new file mode 100644
index 000000000..cea6c30a7
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/rbm_device.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <fcntl.h>
+
+#include "crimson/common/log.h"
+#include "crimson/common/errorator-loop.h"
+
+#include "include/buffer.h"
+#include "rbm_device.h"
+#include "nvme_block_device.h"
+#include "block_rb_manager.h"
+
+namespace crimson::os::seastore::random_block_device {
+#include "crimson/os/seastore/logging.h"
+SET_SUBSYS(seastore_device);
+
+RBMDevice::mkfs_ret RBMDevice::do_primary_mkfs(device_config_t config,
+ int shard_num, size_t journal_size) {
+ LOG_PREFIX(RBMDevice::do_primary_mkfs);
+ return stat_device(
+ ).handle_error(
+ mkfs_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error stat_device in RBMDevice::do_primary_mkfs"}
+ ).safe_then(
+ [this, FNAME, config=std::move(config), shard_num, journal_size](auto st) {
+ super.block_size = st.block_size;
+ super.size = st.size;
+ super.feature |= RBM_BITMAP_BLOCK_CRC;
+ super.config = std::move(config);
+ super.journal_size = journal_size;
+ ceph_assert_always(super.journal_size > 0);
+ ceph_assert_always(super.size >= super.journal_size);
+ ceph_assert_always(shard_num > 0);
+
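+ // Partition the device into shard_num equally sized, block-aligned regions;
+ // each shard later places its own journal and data inside its region, so
+ // every shard region must be larger than the journal size.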
+ std::vector<rbm_shard_info_t> shard_infos(shard_num);
+ for (int i = 0; i < shard_num; i++) {
+ uint64_t aligned_size =
+ (super.size / shard_num) -
+ ((super.size / shard_num) % super.block_size);
+ shard_infos[i].size = aligned_size;
+ shard_infos[i].start_offset = i * aligned_size;
+ assert(shard_infos[i].size > super.journal_size);
+ }
+ super.shard_infos = shard_infos;
+ super.shard_num = shard_num;
+ shard_info = shard_infos[seastar::this_shard_id()];
+ DEBUG("super {} ", super);
+
+ // write super block
+ return open(get_device_path(),
+ seastar::open_flags::rw | seastar::open_flags::dsync
+ ).handle_error(
+ mkfs_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error open in RBMDevice::do_primary_mkfs"}
+ ).safe_then([this] {
+ return write_rbm_header(
+ ).safe_then([this] {
+ return close();
+ }).handle_error(
+ mkfs_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error write_rbm_header in RBMDevice::do_primary_mkfs"
+ });
+ });
+ });
+}
+
+write_ertr::future<> RBMDevice::write_rbm_header()
+{
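+ // Encode the superblock twice: first with crc = 0 to compute the checksum
+ // over the remaining fields (skipped when the device provides end-to-end
+ // data protection), then again with the final crc, and write the result
+ // into a single block-aligned buffer at RBM_START_ADDRESS.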
+ bufferlist meta_b_header;
+ super.crc = 0;
+ encode(super, meta_b_header);
+ // If the NVMe device supports data protection, a CRC checksum is not
+ // required: the device is expected to generate and store the checksum
+ // internally, saving the CPU overhead of computing the CRC.
+ if (is_data_protection_enabled()) {
+ super.crc = -1;
+ } else {
+ super.crc = meta_b_header.crc32c(-1);
+ }
+
+ bufferlist bl;
+ encode(super, bl);
+ auto iter = bl.begin();
+ auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size));
+ assert(bl.length() < super.block_size);
+ iter.copy(bl.length(), bp.c_str());
+ return write(RBM_START_ADDRESS, std::move(bp));
+}
+
+read_ertr::future<rbm_metadata_header_t> RBMDevice::read_rbm_header(
+ rbm_abs_addr addr)
+{
+ LOG_PREFIX(RBMDevice::read_rbm_header);
+ assert(super.block_size > 0);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(super.block_size)),
+ [this, addr, FNAME](auto &bptr) {
+ return read(
+ addr,
+ bptr
+ ).safe_then([length=bptr.length(), this, bptr, FNAME]()
+ -> read_ertr::future<rbm_metadata_header_t> {
+ bufferlist bl;
+ bl.append(bptr);
+ auto p = bl.cbegin();
+ rbm_metadata_header_t super_block;
+ try {
+ decode(super_block, p);
+ }
+ catch (ceph::buffer::error& e) {
+ DEBUG("read_rbm_header: unable to decode rbm super block {}",
+ e.what());
+ return crimson::ct_error::enoent::make();
+ }
+ checksum_t crc = super_block.crc;
+ bufferlist meta_b_header;
+ super_block.crc = 0;
+ encode(super_block, meta_b_header);
+ assert(ceph::encoded_sizeof<rbm_metadata_header_t>(super_block) <
+ super_block.block_size);
+
+ // Do CRC verification only if data protection is not enabled.
+ if (is_data_protection_enabled() == false) {
+ if (meta_b_header.crc32c(-1) != crc) {
+ DEBUG("bad crc on super block, expected {} != actual {} ",
+ meta_b_header.crc32c(-1), crc);
+ return crimson::ct_error::input_output_error::make();
+ }
+ } else {
+ ceph_assert_always(crc == (checksum_t)-1);
+ }
+ super_block.crc = crc;
+ super = super_block;
+ DEBUG("got {} ", super);
+ return read_ertr::future<rbm_metadata_header_t>(
+ read_ertr::ready_future_marker{},
+ super_block
+ );
+ });
+ });
+}
+
+RBMDevice::mount_ret RBMDevice::do_shard_mount()
+{
+ return open(get_device_path(),
+ seastar::open_flags::rw | seastar::open_flags::dsync
+ ).safe_then([this] {
+ return stat_device(
+ ).handle_error(
+ mount_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error stat_device in RBMDevice::do_shard_mount"}
+ ).safe_then([this](auto st) {
+ assert(st.block_size > 0);
+ super.block_size = st.block_size;
+ return read_rbm_header(RBM_START_ADDRESS
+ ).safe_then([this](auto s) {
+ LOG_PREFIX(RBMDevice::do_shard_mount);
+ shard_info = s.shard_infos[seastar::this_shard_id()];
+ INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
+ s.validate();
+ return seastar::now();
+ });
+ });
+ }).handle_error(
+ mount_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error mount in RBMDevice::do_shard_mount"}
+ );
+}
+
+EphemeralRBMDeviceRef create_test_ephemeral(uint64_t journal_size, uint64_t data_size) {
+ return EphemeralRBMDeviceRef(
+ new EphemeralRBMDevice(journal_size + data_size +
+ random_block_device::RBMDevice::get_shard_reserved_size(),
+ EphemeralRBMDevice::TEST_BLOCK_SIZE));
+}
+
+open_ertr::future<> EphemeralRBMDevice::open(
+ const std::string &in_path,
+ seastar::open_flags mode) {
+ LOG_PREFIX(EphemeralRBMDevice::open);
+ if (buf) {
+ return open_ertr::now();
+ }
+
+ DEBUG(
+ "Initializing test memory device {}",
+ size);
+
+ void* addr = ::mmap(
+ nullptr,
+ size,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+ -1,
+ 0);
+
+ ceph_assert(addr != MAP_FAILED);
+ buf = (char*)addr;
+
+ ::memset(buf, 0, size);
+ return open_ertr::now();
+}
+
+write_ertr::future<> EphemeralRBMDevice::write(
+ uint64_t offset,
+ bufferptr &&bptr,
+ uint16_t stream) {
+ LOG_PREFIX(EphemeralRBMDevice::write);
+ ceph_assert(buf);
+ DEBUG(
+ "EphemeralRBMDevice: write offset {} len {}",
+ offset,
+ bptr.length());
+
+ ::memcpy(buf + offset, bptr.c_str(), bptr.length());
+
+ return write_ertr::now();
+}
+
+read_ertr::future<> EphemeralRBMDevice::read(
+ uint64_t offset,
+ bufferptr &bptr) {
+ LOG_PREFIX(EphemeralRBMDevice::read);
+ ceph_assert(buf);
+ DEBUG(
+ "EphemeralRBMDevice: read offset {} len {}",
+ offset,
+ bptr.length());
+
+ bptr.copy_in(0, bptr.length(), buf + offset);
+ return read_ertr::now();
+}
+
+Device::close_ertr::future<> EphemeralRBMDevice::close() {
+ LOG_PREFIX(EphemeralRBMDevice::close);
+ DEBUG(" close ");
+ return close_ertr::now();
+}
+
+write_ertr::future<> EphemeralRBMDevice::writev(
+ uint64_t offset,
+ ceph::bufferlist bl,
+ uint16_t stream) {
+ LOG_PREFIX(EphemeralRBMDevice::writev);
+ ceph_assert(buf);
+ DEBUG(
+ "EphemeralRBMDevice: write offset {} len {}",
+ offset,
+ bl.length());
+
+ bl.begin().copy(bl.length(), buf + offset);
+ return write_ertr::now();
+}
+
+EphemeralRBMDevice::mount_ret EphemeralRBMDevice::mount() {
+ return do_shard_mount();
+}
+
+EphemeralRBMDevice::mkfs_ret EphemeralRBMDevice::mkfs(device_config_t config) {
+ return do_primary_mkfs(config, 1, DEFAULT_TEST_CBJOURNAL_SIZE);
+}
+
+}
+
diff --git a/src/crimson/os/seastore/random_block_manager/rbm_device.h b/src/crimson/os/seastore/random_block_manager/rbm_device.h
new file mode 100644
index 000000000..501d9f913
--- /dev/null
+++ b/src/crimson/os/seastore/random_block_manager/rbm_device.h
@@ -0,0 +1,261 @@
+//-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/device.h"
+
+namespace ceph {
+ namespace buffer {
+ class bufferptr;
+ }
+}
+
+namespace crimson::os::seastore::random_block_device {
+
+// from blk/BlockDevice.h
+#if defined(__linux__)
+#if !defined(F_SET_FILE_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
+#endif
+// These values match Linux definition
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+#define WRITE_LIFE_NOT_SET 0 // No hint information set
+#define WRITE_LIFE_NONE 1 // No hints about write life time
+#define WRITE_LIFE_SHORT 2 // Data written has a short life time
+#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time
+#define WRITE_LIFE_LONG 4 // Data written has a long life time
+#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time
+#define WRITE_LIFE_MAX 6
+#else
+// On systems that don't have WRITE_LIFE_*, use only one FD
+// and treat all files equally.
+#define WRITE_LIFE_NOT_SET 0 // No hint information set
+#define WRITE_LIFE_NONE 0 // No hints about write life time
+#define WRITE_LIFE_SHORT 0 // Data written has a short life time
+#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time
+#define WRITE_LIFE_LONG 0 // Data written has a long life time
+#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time
+#define WRITE_LIFE_MAX 1
+#endif
+
+using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+
+using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::ebadf,
+ crimson::ct_error::enospc>;
+
+using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+
+using nvme_command_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+using discard_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096;
+enum {
+ // TODO: This allows the device to manage crc on a block by itself
+ RBM_NVME_END_TO_END_PROTECTION = 1,
+ RBM_BITMAP_BLOCK_CRC = 2,
+};
+
+class RBMDevice : public Device {
+public:
+ using Device::read;
+ read_ertr::future<> read (
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final {
+ uint64_t rbm_addr = convert_paddr_to_abs_addr(addr);
+ return read(rbm_addr, out);
+ }
+protected:
+ rbm_metadata_header_t super;
+ rbm_shard_info_t shard_info;
+public:
+ RBMDevice() {}
+ virtual ~RBMDevice() = default;
+
+ template <typename T>
+ static std::unique_ptr<T> create() {
+ return std::make_unique<T>();
+ }
+
+ device_id_t get_device_id() const {
+ return super.config.spec.id;
+ }
+
+ magic_t get_magic() const final {
+ return super.config.spec.magic;
+ }
+
+ device_type_t get_device_type() const final {
+ return device_type_t::RANDOM_BLOCK_SSD;
+ }
+
+ backend_type_t get_backend_type() const final {
+ return backend_type_t::RANDOM_BLOCK;
+ }
+
+ const seastore_meta_t &get_meta() const final {
+ return super.config.meta;
+ }
+
+ secondary_device_set_t& get_secondary_devices() final {
+ return super.config.secondary_devices;
+ }
+ std::size_t get_available_size() const { return super.size; }
+ extent_len_t get_block_size() const { return super.block_size; }
+
+ virtual read_ertr::future<> read(
+ uint64_t offset,
+ bufferptr &bptr) = 0;
+
+ /*
+ * Multi-stream write
+ *
+ * Gives the device a hint for grouping data with similar lifetimes. Data
+ * written with the same stream value will be placed together in the SSD for
+ * better write performance.
+ */
+ virtual write_ertr::future<> write(
+ uint64_t offset,
+ bufferptr &&bptr,
+ uint16_t stream = 0) = 0;
+
+ virtual discard_ertr::future<> discard(
+ uint64_t offset,
+ uint64_t len) { return seastar::now(); }
+
+ virtual open_ertr::future<> open(
+ const std::string& path,
+ seastar::open_flags mode) = 0;
+
+ virtual write_ertr::future<> writev(
+ uint64_t offset,
+ ceph::bufferlist bl,
+ uint16_t stream = 0) = 0;
+
+ bool is_data_protection_enabled() const { return false; }
+
+ mkfs_ret do_mkfs(device_config_t);
+
+ // shard 0 mkfs
+ mkfs_ret do_primary_mkfs(device_config_t, int shard_num, size_t journal_size);
+
+ mount_ret do_mount();
+
+ mount_ret do_shard_mount();
+
+ write_ertr::future<> write_rbm_header();
+
+ read_ertr::future<rbm_metadata_header_t> read_rbm_header(rbm_abs_addr addr);
+
+ using stat_device_ret =
+ read_ertr::future<seastar::stat_data>;
+ virtual stat_device_ret stat_device() = 0;
+
+ virtual std::string get_device_path() const = 0;
+
+ uint64_t get_journal_size() const {
+ return super.journal_size;
+ }
+
+ static rbm_abs_addr get_shard_reserved_size() {
+ return RBM_SUPERBLOCK_SIZE;
+ }
+
+ rbm_abs_addr get_shard_journal_start() {
+ return shard_info.start_offset + get_shard_reserved_size();
+ }
+
+ uint64_t get_shard_start() const {
+ return shard_info.start_offset;
+ }
+
+ uint64_t get_shard_end() const {
+ return shard_info.start_offset + shard_info.size;
+ }
+};
+using RBMDeviceRef = std::unique_ptr<RBMDevice>;
+
+constexpr uint64_t DEFAULT_TEST_CBJOURNAL_SIZE = 1 << 26;
+
+class EphemeralRBMDevice : public RBMDevice {
+public:
+ uint64_t size = 0;
+ uint64_t block_size = 0;
+ constexpr static uint32_t TEST_BLOCK_SIZE = 4096;
+
+ EphemeralRBMDevice(size_t size, uint64_t block_size) :
+ size(size), block_size(block_size), buf(nullptr) {
+ }
+ ~EphemeralRBMDevice() {
+ if (buf) {
+ ::munmap(buf, size);
+ buf = nullptr;
+ }
+ }
+
+ std::size_t get_available_size() const final { return size; }
+ extent_len_t get_block_size() const final { return block_size; }
+
+ mount_ret mount() final;
+ mkfs_ret mkfs(device_config_t config) final;
+
+ open_ertr::future<> open(
+ const std::string &in_path,
+ seastar::open_flags mode) override;
+
+ write_ertr::future<> write(
+ uint64_t offset,
+ bufferptr &&bptr,
+ uint16_t stream = 0) override;
+
+ using RBMDevice::read;
+ read_ertr::future<> read(
+ uint64_t offset,
+ bufferptr &bptr) override;
+
+ close_ertr::future<> close() override;
+
+ write_ertr::future<> writev(
+ uint64_t offset,
+ ceph::bufferlist bl,
+ uint16_t stream = 0) final;
+
+ stat_device_ret stat_device() final {
+ seastar::stat_data stat;
+ stat.block_size = block_size;
+ stat.size = size;
+ return stat_device_ret(
+ read_ertr::ready_future_marker{},
+ stat
+ );
+ }
+
+ std::string get_device_path() const final {
+ return "";
+ }
+
+ char *buf;
+};
+using EphemeralRBMDeviceRef = std::unique_ptr<EphemeralRBMDevice>;
+EphemeralRBMDeviceRef create_test_ephemeral(
+ uint64_t journal_size = DEFAULT_TEST_CBJOURNAL_SIZE,
+ uint64_t data_size = DEFAULT_TEST_CBJOURNAL_SIZE);
+
+}
diff --git a/src/crimson/os/seastore/randomblock_manager_group.h b/src/crimson/os/seastore/randomblock_manager_group.h
new file mode 100644
index 000000000..77d9cf797
--- /dev/null
+++ b/src/crimson/os/seastore/randomblock_manager_group.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <set>
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/random_block_manager.h"
+#include "crimson/os/seastore/random_block_manager/block_rb_manager.h"
+
+namespace crimson::os::seastore {
+
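+/**
+ * RBMDeviceGroup
+ *
+ * Holds the RandomBlockManager instances that back a store, indexed by
+ * device id, and exposes group-wide properties (block size, metadata) taken
+ * from the device with the lowest id.
+ */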
+class RBMDeviceGroup {
+public:
+ RBMDeviceGroup() {
+ rb_devices.resize(DEVICE_ID_MAX);
+ }
+
+ const std::set<device_id_t>& get_device_ids() const {
+ return device_ids;
+ }
+
+ std::vector<RandomBlockManager*> get_rb_managers() const {
+ assert(device_ids.size());
+ std::vector<RandomBlockManager*> ret;
+ for (auto& device_id : device_ids) {
+ auto rb_device = rb_devices[device_id].get();
+ assert(rb_device->get_device_id() == device_id);
+ ret.emplace_back(rb_device);
+ }
+ return ret;
+ }
+
+ void add_rb_manager(RandomBlockManagerRef rbm) {
+ auto device_id = rbm->get_device_id();
+ ceph_assert(!has_device(device_id));
+ rb_devices[device_id] = std::move(rbm);
+ device_ids.insert(device_id);
+ }
+
+ void reset() {
+ rb_devices.clear();
+ rb_devices.resize(DEVICE_ID_MAX);
+ device_ids.clear();
+ }
+
+ auto get_block_size() const {
+ assert(device_ids.size());
+ return rb_devices[*device_ids.begin()]->get_block_size();
+ }
+
+ const seastore_meta_t &get_meta() const {
+ assert(device_ids.size());
+ return rb_devices[*device_ids.begin()]->get_meta();
+ }
+
+private:
+ bool has_device(device_id_t id) const {
+ assert(id <= DEVICE_ID_MAX_VALID);
+ return device_ids.count(id) >= 1;
+ }
+
+ std::vector<RandomBlockManagerRef> rb_devices;
+ std::set<device_id_t> device_ids;
+};
+
+using RBMDeviceGroupRef = std::unique_ptr<RBMDeviceGroup>;
+
+}
diff --git a/src/crimson/os/seastore/record_scanner.cc b/src/crimson/os/seastore/record_scanner.cc
new file mode 100644
index 000000000..9778bbb77
--- /dev/null
+++ b/src/crimson/os/seastore/record_scanner.cc
@@ -0,0 +1,239 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/os/seastore/record_scanner.h"
+
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_journal);
+
+namespace crimson::os::seastore {
+
+RecordScanner::scan_valid_records_ret
+RecordScanner::scan_valid_records(
+ scan_valid_records_cursor &cursor,
+ segment_nonce_t nonce,
+ size_t budget,
+ found_record_handler_t &handler)
+{
+ LOG_PREFIX(RecordScanner::scan_valid_records);
+ initialize_cursor(cursor);
+ DEBUG("starting at {}, budget={}", cursor, budget);
+ auto retref = std::make_unique<size_t>(0);
+ auto &budget_used = *retref;
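+ // The scan proceeds in two phases within a single repeat loop: while valid
+ // record group headers are still being found, read and queue the next
+ // header and consume any queued groups already covered by last_committed;
+ // once an invalid header marks the end, validate and consume the remaining
+ // pending groups. The loop stops when the cursor completes or the budget
+ // is exhausted.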
+ return crimson::repeat(
+ [=, &cursor, &budget_used, &handler, this]() mutable
+ -> scan_valid_records_ertr::future<seastar::stop_iteration> {
+ return [=, &handler, &cursor, &budget_used, this] {
+ if (!cursor.last_valid_header_found) {
+ return read_validate_record_metadata(cursor, nonce
+ ).safe_then([=, &cursor](auto md) {
+ if (!md) {
+ cursor.last_valid_header_found = true;
+ if (cursor.is_complete()) {
+ INFO("complete at {}, invalid record group metadata",
+ cursor);
+ } else {
+ DEBUG("found invalid record group metadata at {}, "
+ "processing {} pending record groups",
+ cursor.seq,
+ cursor.pending_record_groups.size());
+ }
+ return scan_valid_records_ertr::now();
+ } else {
+ auto& [header, md_bl] = *md;
+ DEBUG("found valid {} at {}", header, cursor.seq);
+ cursor.emplace_record_group(header, std::move(md_bl));
+ return scan_valid_records_ertr::now();
+ }
+ }).safe_then([=, &cursor, &budget_used, &handler, this] {
+ DEBUG("processing committed record groups until {}, {} pending",
+ cursor.last_committed,
+ cursor.pending_record_groups.size());
+ return crimson::repeat(
+ [=, &budget_used, &cursor, &handler, this] {
+ if (cursor.pending_record_groups.empty()) {
+ /* This is only possible if the segment is empty.
+ * A record's last_committed must be prior to its own
+ * location, since the record cannot yet have been committed
+ * at its own time of submission. Thus, the most recently
+ * read record must always fall after cursor.last_committed */
+ return scan_valid_records_ertr::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::yes);
+ }
+ auto &next = cursor.pending_record_groups.front();
+ journal_seq_t next_seq = {cursor.seq.segment_seq, next.offset};
+ if (cursor.last_committed == JOURNAL_SEQ_NULL ||
+ next_seq > cursor.last_committed) {
+ return scan_valid_records_ertr::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::yes);
+ }
+ return consume_next_records(cursor, handler, budget_used
+ ).safe_then([] {
+ return scan_valid_records_ertr::make_ready_future<
+ seastar::stop_iteration>(seastar::stop_iteration::no);
+ });
+ });
+ });
+ } else {
+ assert(!cursor.pending_record_groups.empty());
+ auto &next = cursor.pending_record_groups.front();
+ return read_validate_data(next.offset, next.header
+ ).safe_then([this, FNAME, &budget_used, &cursor, &handler, &next](auto valid) {
+ if (!valid) {
+ INFO("complete at {}, invalid record group data at {}, {}",
+ cursor, next.offset, next.header);
+ cursor.pending_record_groups.clear();
+ return scan_valid_records_ertr::now();
+ }
+ return consume_next_records(cursor, handler, budget_used);
+ });
+ }
+ }().safe_then([=, &budget_used, &cursor] {
+ if (cursor.is_complete() || budget_used >= budget) {
+ DEBUG("finish at {}, budget_used={}, budget={}",
+ cursor, budget_used, budget);
+ return seastar::stop_iteration::yes;
+ } else {
+ return seastar::stop_iteration::no;
+ }
+ });
+ }).safe_then([retref=std::move(retref)]() mutable -> scan_valid_records_ret {
+ return scan_valid_records_ret(
+ scan_valid_records_ertr::ready_future_marker{},
+ std::move(*retref));
+ });
+}
+
+RecordScanner::read_validate_record_metadata_ret
+RecordScanner::read_validate_record_metadata(
+ scan_valid_records_cursor &cursor,
+ segment_nonce_t nonce)
+{
+ LOG_PREFIX(RecordScanner::read_validate_record_metadata);
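+ // After checking that a header block still fits within the segment, read
+ // one block at the cursor for the record group header, sanity-check its
+ // lengths and alignment, read the rest of mdlength if the metadata spans
+ // more than one block, and finally verify the metadata checksum.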
+ paddr_t start = cursor.seq.offset;
+ auto block_size = cursor.get_block_size();
+ if (get_segment_off(cursor.seq.offset) + block_size > get_segment_end_offset(cursor.seq.offset)) {
+ DEBUG("failed -- record group header block {}~4096 > segment_size {}",
+ start, get_segment_end_offset(cursor.seq.offset));
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ TRACE("reading record group header block {}~4096", start);
+ return read(start, block_size
+ ).safe_then([=](bufferptr bptr) mutable
+ -> read_validate_record_metadata_ret {
+ bufferlist bl;
+ bl.append(bptr);
+ auto maybe_header = try_decode_records_header(bl, nonce);
+ if (!maybe_header.has_value()) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+
+ auto& header = *maybe_header;
+ if (header.mdlength < block_size ||
+ header.mdlength % block_size != 0 ||
+ header.dlength % block_size != 0 ||
+ (header.committed_to != JOURNAL_SEQ_NULL &&
+ get_segment_off(header.committed_to.offset) %
+ cursor.get_block_size() != 0) ||
+ (get_segment_off(cursor.seq.offset) + header.mdlength + header.dlength >
+ get_segment_end_offset(cursor.seq.offset))) {
+ ERROR("failed, invalid record group header {}", header);
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ if (is_record_segment_seq_invalid(cursor, header)) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+
+ if (header.mdlength == block_size) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::make_pair(std::move(header), std::move(bl))
+ );
+ }
+
+ paddr_t rest_start = cursor.seq.offset.add_offset(block_size);
+ auto rest_len = header.mdlength - block_size;
+ TRACE("reading record group header rest {}~{}", rest_start, rest_len);
+ return read(rest_start, rest_len
+ ).safe_then([header=std::move(header), bl=std::move(bl)
+ ](auto&& bptail) mutable {
+ bl.push_back(bptail);
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::make_pair(std::move(header), std::move(bl)));
+ });
+ }).safe_then([](auto p) {
+ if (p && validate_records_metadata(p->second)) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::move(*p)
+ );
+ } else {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ });
+
+}
+
+RecordScanner::read_validate_data_ret RecordScanner::read_validate_data(
+ paddr_t record_base,
+ const record_group_header_t &header)
+{
+ LOG_PREFIX(RecordScanner::read_validate_data);
+ auto data_addr = record_base.add_offset(header.mdlength);
+ TRACE("reading record group data blocks {}~{}", data_addr, header.dlength);
+ return read(
+ data_addr,
+ header.dlength
+ ).safe_then([=, &header](auto bptr) {
+ bufferlist bl;
+ bl.append(bptr);
+ return validate_records_data(header, bl);
+ });
+}
+
+RecordScanner::consume_record_group_ertr::future<>
+RecordScanner::consume_next_records(
+ scan_valid_records_cursor& cursor,
+ found_record_handler_t& handler,
+ std::size_t& budget_used)
+{
+ LOG_PREFIX(RecordScanner::consume_next_records);
+ auto& next = cursor.pending_record_groups.front();
+ auto total_length = next.header.dlength + next.header.mdlength;
+ budget_used += total_length;
+ auto locator = record_locator_t{
+ next.offset.add_offset(next.header.mdlength),
+ write_result_t{
+ journal_seq_t{
+ cursor.seq.segment_seq,
+ next.offset
+ },
+ total_length
+ }
+ };
+ DEBUG("processing {} at {}, budget_used={}",
+ next.header, locator, budget_used);
+ return handler(
+ locator,
+ next.header,
+ next.mdbuffer
+ ).safe_then([FNAME, &cursor] {
+ cursor.pop_record_group();
+ if (cursor.is_complete()) {
+ INFO("complete at {}, no more record group", cursor);
+ }
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/record_scanner.h b/src/crimson/os/seastore/record_scanner.h
new file mode 100644
index 000000000..2cbc7c562
--- /dev/null
+++ b/src/crimson/os/seastore/record_scanner.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+
+namespace crimson::os::seastore {
+
+class RecordScanner {
+public:
+ using read_ertr = SegmentManager::read_ertr;
+ using scan_valid_records_ertr = read_ertr;
+ using scan_valid_records_ret = scan_valid_records_ertr::future<
+ size_t>;
+ using found_record_handler_t = std::function<
+ scan_valid_records_ertr::future<>(
+ record_locator_t record_locator,
+ // callee may assume header and bl will remain valid until
+ // returned future resolves
+ const record_group_header_t &header,
+ const bufferlist &mdbuf)>;
+ scan_valid_records_ret scan_valid_records(
+ scan_valid_records_cursor &cursor, ///< [in, out] cursor, updated during call
+ segment_nonce_t nonce, ///< [in] nonce for segment
+ size_t budget, ///< [in] max budget to use
+ found_record_handler_t &handler ///< [in] handler for records
+ ); ///< @return used budget
+
+ device_off_t get_segment_off(paddr_t addr) const {
+ if (addr.get_addr_type() == paddr_types_t::SEGMENT) {
+ auto& seg_addr = addr.as_seg_paddr();
+ return seg_addr.get_segment_off();
+ }
+ assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK);
+ auto& blk_addr = addr.as_blk_paddr();
+ return blk_addr.get_device_off();
+ }
+
+protected:
+ /// read record metadata for record starting at start
+ using read_validate_record_metadata_ertr = read_ertr;
+ using read_validate_record_metadata_ret =
+ read_validate_record_metadata_ertr::future<
+ std::optional<std::pair<record_group_header_t, bufferlist>>
+ >;
+ read_validate_record_metadata_ret read_validate_record_metadata(
+ scan_valid_records_cursor &cursor,
+ segment_nonce_t nonce);
+
+ /// read and validate data
+ using read_validate_data_ertr = read_ertr;
+ using read_validate_data_ret = read_validate_data_ertr::future<bool>;
+ read_validate_data_ret read_validate_data(
+ paddr_t record_base,
+ const record_group_header_t &header ///< caller must ensure lifetime through
+ /// future resolution
+ );
+
+ virtual bool is_record_segment_seq_invalid(scan_valid_records_cursor &cursor,
+ record_group_header_t &h) = 0;
+
+ virtual int64_t get_segment_end_offset(paddr_t addr) = 0;
+
+ using read_ret = read_ertr::future<bufferptr>;
+ virtual read_ret read(paddr_t start, size_t len) = 0;
+
+ using consume_record_group_ertr = scan_valid_records_ertr;
+ consume_record_group_ertr::future<> consume_next_records(
+ scan_valid_records_cursor& cursor,
+ found_record_handler_t& handler,
+ std::size_t& budget_used);
+
+ virtual void initialize_cursor(scan_valid_records_cursor &cursor) = 0;
+
+ virtual ~RecordScanner() {}
+
+};
+
+}
diff --git a/src/crimson/os/seastore/root_block.cc b/src/crimson/os/seastore/root_block.cc
new file mode 100644
index 000000000..dc928e81b
--- /dev/null
+++ b/src/crimson/os/seastore/root_block.cc
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/root_block.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/backref/backref_tree_node.h"
+
+namespace crimson::os::seastore {
+
+void RootBlock::on_replace_prior(Transaction &t) {
+ if (!lba_root_node) {
+ auto &prior = static_cast<RootBlock&>(*get_prior_instance());
+ lba_root_node = prior.lba_root_node;
+ if (lba_root_node) {
+ ((lba_manager::btree::LBANode*)lba_root_node)->root_block = this;
+ }
+ }
+ if (!backref_root_node) {
+ auto &prior = static_cast<RootBlock&>(*get_prior_instance());
+ backref_root_node = prior.backref_root_node;
+ if (backref_root_node) {
+ ((backref::BackrefNode*)backref_root_node)->root_block = this;
+ }
+ }
+}
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h
new file mode 100644
index 000000000..0e45519ce
--- /dev/null
+++ b/src/crimson/os/seastore/root_block.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * RootBlock
+ *
+ * Holds the physical addresses of all metadata roots.
+ * In-memory values may be
+ * - absolute: reference to block which predates the current transaction
+ * - record_relative: reference to block updated in this transaction
+ * if !pending()
+ *
+ * Journal replay only considers deltas and must always discover the most
+ * recent value for the RootBlock. Because the contents of root_t above are
+ * very small, it's simplest to stash the entire root_t value into the delta
+ * and never actually write the RootBlock to a physical location (safe since
+ * nothing references the location of the RootBlock).
+ *
+ * As a result, Cache treats the root differently in a few ways including:
+ * - state will only ever be DIRTY or MUTATION_PENDING
+ * - RootBlock's never show up in the transaction fresh or dirty lists --
+ * there's a special Transaction::root member for when the root needs to
+ * be mutated.
+ *
+ * TODO: Journal trimming will need to be aware of the most recent RootBlock
+ * delta location, or, even easier, just always write one out with the
+ * mutation which changes the journal trim bound.
+ */
+struct RootBlock : CachedExtent {
+ constexpr static extent_len_t SIZE = 4<<10;
+ using Ref = TCachedExtentRef<RootBlock>;
+
+ root_t root;
+
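+ // Raw (non-owning) pointers to the in-memory LBA/backref root tree nodes;
+ // on_replace_prior() copies them from the prior RootBlock and re-points
+ // their root_block references at the new instance.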
+ CachedExtent* lba_root_node = nullptr;
+ CachedExtent* backref_root_node = nullptr;
+
+ RootBlock() : CachedExtent(zero_length_t()) {};
+
+ RootBlock(const RootBlock &rhs)
+ : CachedExtent(rhs),
+ root(rhs.root),
+ lba_root_node(nullptr),
+ backref_root_node(nullptr)
+ {}
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ return CachedExtentRef(new RootBlock(*this));
+ };
+
+ static constexpr extent_types_t TYPE = extent_types_t::ROOT;
+ extent_types_t get_type() const final {
+ return extent_types_t::ROOT;
+ }
+
+ void on_replace_prior(Transaction &t) final;
+
+ /// dumps root as delta
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ ceph::buffer::ptr bptr(sizeof(root_t));
+ *reinterpret_cast<root_t*>(bptr.c_str()) = root;
+ bl.append(bptr);
+ return bl;
+ }
+
+ /// overwrites root
+ void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length() == sizeof(root_t));
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ root = *reinterpret_cast<const root_t*>(bl.front().c_str());
+ root.adjust_addrs_from_base(base);
+ }
+
+ /// Patches relative addrs in memory based on record commit addr
+ void on_delta_write(paddr_t record_block_offset) final {
+ root.adjust_addrs_from_base(record_block_offset);
+ }
+
+ complete_load_ertr::future<> complete_load() final {
+ ceph_abort_msg("Root is only written via deltas");
+ }
+
+ void on_initial_write() final {
+ ceph_abort_msg("Root is only written via deltas");
+ }
+
+ root_t &get_root() { return root; }
+
+ std::ostream &print_detail(std::ostream &out) const final {
+ return out << ", root_block(lba_root_node=" << (void*)lba_root_node
+ << ", backref_root_node=" << (void*)backref_root_node
+ << ")";
+ }
+};
+using RootBlockRef = RootBlock::Ref;
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::RootBlock> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
new file mode 100644
index 000000000..897a063e0
--- /dev/null
+++ b/src/crimson/os/seastore/seastore.cc
@@ -0,0 +1,2135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include <algorithm>
+
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/fstream.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/shared_mutex.hh>
+
+#include "common/safe_io.h"
+#include "include/stringify.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/buffer_io.h"
+
+#include "crimson/os/futurized_collection.h"
+
+#include "crimson/os/seastore/backref_manager.h"
+#include "crimson/os/seastore/async_cleaner.h"
+#include "crimson/os/seastore/collection_manager/flat_collection_manager.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h"
+#include "crimson/os/seastore/onode_manager.h"
+#include "crimson/os/seastore/object_data_handler.h"
+
+
+using std::string;
+using crimson::common::local_conf;
+
+template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
+ : fmt::formatter<std::string_view> {
+ using op_type_t = crimson::os::seastore::op_type_t;
+ // parse is inherited from formatter<string_view>.
+ template <typename FormatContext>
+ auto format(op_type_t op, FormatContext& ctx) {
+ std::string_view name = "unknown";
+ switch (op) {
+ case op_type_t::TRANSACTION:
+ name = "transaction";
+ break;
+ case op_type_t::READ:
+ name = "read";
+ break;
+ case op_type_t::WRITE:
+ name = "write";
+ break;
+ case op_type_t::GET_ATTR:
+ name = "get_attr";
+ break;
+ case op_type_t::GET_ATTRS:
+ name = "get_attrs";
+ break;
+ case op_type_t::STAT:
+ name = "stat";
+ break;
+ case op_type_t::OMAP_GET_VALUES:
+ name = "omap_get_values";
+ break;
+ case op_type_t::OMAP_LIST:
+ name = "omap_list";
+ break;
+ case op_type_t::MAX:
+ name = "unknown";
+ break;
+ }
+ return formatter<string_view>::format(name, ctx);
+ }
+};
+
+SET_SUBSYS(seastore);
+
+namespace crimson::os::seastore {
+
+class FileMDStore final : public SeaStore::MDStore {
+ std::string root;
+public:
+ FileMDStore(const std::string& root) : root(root) {}
+
+ write_meta_ret write_meta(
+ const std::string& key, const std::string& value) final {
+ std::string path = fmt::format("{}/{}", root, key);
+ ceph::bufferlist bl;
+ bl.append(value + "\n");
+ return crimson::write_file(std::move(bl), path);
+ }
+
+ read_meta_ret read_meta(const std::string& key) final {
+ std::string path = fmt::format("{}/{}", root, key);
+ return seastar::file_exists(
+ path
+ ).then([path] (bool exist) {
+ if (exist) {
+ return crimson::read_file(path)
+ .then([] (auto tmp_buf) {
+ std::string v = {tmp_buf.get(), tmp_buf.size()};
+ std::size_t pos = v.find("\n");
+ std::string str = v.substr(0, pos);
+ return seastar::make_ready_future<std::optional<std::string>>(str);
+ });
+ } else {
+ return seastar::make_ready_future<std::optional<std::string>>(std::nullopt);
+ }
+ });
+ }
+};
+
+using crimson::common::get_conf;
+
+SeaStore::Shard::Shard(
+ std::string root,
+ Device* dev,
+ bool is_test)
+ :root(root),
+ max_object_size(
+ get_conf<uint64_t>("seastore_default_max_object_size")),
+ is_test(is_test),
+ throttler(
+ get_conf<uint64_t>("seastore_max_concurrent_transactions"))
+{
+ device = &(dev->get_sharded_device());
+ register_metrics();
+}
+
+SeaStore::SeaStore(
+ const std::string& root,
+ MDStoreRef mdstore)
+ : root(root),
+ mdstore(std::move(mdstore))
+{
+}
+
+SeaStore::~SeaStore() = default;
+
+void SeaStore::Shard::register_metrics()
+{
+ namespace sm = seastar::metrics;
+ using op_type_t = crimson::os::seastore::op_type_t;
+ std::pair<op_type_t, sm::label_instance> labels_by_op_type[] = {
+ {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")},
+ {op_type_t::READ, sm::label_instance("latency", "READ")},
+ {op_type_t::WRITE, sm::label_instance("latency", "WRITE")},
+ {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")},
+ {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")},
+ {op_type_t::STAT, sm::label_instance("latency", "STAT")},
+ {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")},
+ {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")},
+ };
+
+ for (auto& [op_type, label] : labels_by_op_type) {
+ auto desc = fmt::format("latency of seastore operation (optype={})",
+ op_type);
+ metrics.add_group(
+ "seastore",
+ {
+ sm::make_histogram(
+ "op_lat", [this, op_type=op_type] {
+ return get_latency(op_type);
+ },
+ sm::description(desc),
+ {label}
+ ),
+ }
+ );
+ }
+
+ metrics.add_group(
+ "seastore",
+ {
+ sm::make_gauge(
+ "concurrent_transactions",
+ [this] {
+ return throttler.get_current();
+ },
+ sm::description("transactions that are running inside seastore")
+ ),
+ sm::make_gauge(
+ "pending_transactions",
+ [this] {
+ return throttler.get_pending();
+ },
+ sm::description("transactions waiting to get "
+ "through seastore's throttler")
+ )
+ }
+ );
+}
+
+seastar::future<> SeaStore::start()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+#ifndef NDEBUG
+ bool is_test = true;
+#else
+ bool is_test = false;
+#endif
+ using crimson::common::get_conf;
+ std::string type = get_conf<std::string>("seastore_main_device_type");
+ device_type_t d_type = string_to_device_type(type);
+ assert(d_type == device_type_t::SSD ||
+ d_type == device_type_t::RANDOM_BLOCK_SSD);
+
+ ceph_assert(root != "");
+ return Device::make_device(root, d_type
+ ).then([this](DeviceRef device_obj) {
+ device = std::move(device_obj);
+ return device->start();
+ }).then([this, is_test] {
+ ceph_assert(device);
+ return shard_stores.start(root, device.get(), is_test);
+ });
+}
+
+seastar::future<> SeaStore::test_start(DeviceRef device_obj)
+{
+ ceph_assert(device_obj);
+ ceph_assert(root == "");
+ device = std::move(device_obj);
+ return shard_stores.start_single(root, device.get(), true);
+}
+
+seastar::future<> SeaStore::stop()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return seastar::do_for_each(secondaries, [](auto& sec_dev) {
+ return sec_dev->stop();
+ }).then([this] {
+ secondaries.clear();
+ if (device) {
+ return device->stop();
+ } else {
+ return seastar::now();
+ }
+ }).then([this] {
+ return shard_stores.stop();
+ });
+}
+
+SeaStore::mount_ertr::future<> SeaStore::test_mount()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.local().mount_managers();
+}
+
+SeaStore::mount_ertr::future<> SeaStore::mount()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return device->mount(
+ ).safe_then([this] {
+ auto sec_devices = device->get_sharded_device().get_secondary_devices();
+ return crimson::do_for_each(sec_devices, [this](auto& device_entry) {
+ device_id_t id = device_entry.first;
+ magic_t magic = device_entry.second.magic;
+ device_type_t dtype = device_entry.second.dtype;
+ std::string path =
+ fmt::format("{}/block.{}.{}", root, dtype, std::to_string(id));
+ return Device::make_device(path, dtype
+ ).then([this, path, magic](DeviceRef sec_dev) {
+ return sec_dev->start(
+ ).then([this, magic, sec_dev = std::move(sec_dev)]() mutable {
+ return sec_dev->mount(
+ ).safe_then([this, sec_dev=std::move(sec_dev), magic]() mutable {
+ boost::ignore_unused(magic); // avoid clang warning;
+ assert(sec_dev->get_sharded_device().get_magic() == magic);
+ secondaries.emplace_back(std::move(sec_dev));
+ });
+ }).safe_then([this] {
+ return set_secondaries();
+ });
+ });
+ }).safe_then([this] {
+ return shard_stores.invoke_on_all([](auto &local_store) {
+ return local_store.mount_managers();
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::mount"
+ }
+ );
+}
+
+seastar::future<> SeaStore::Shard::mount_managers()
+{
+ init_managers();
+ return transaction_manager->mount(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in mount_managers"
+ });
+}
+
+seastar::future<> SeaStore::umount()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.invoke_on_all([](auto &local_store) {
+ return local_store.umount();
+ });
+}
+
+seastar::future<> SeaStore::Shard::umount()
+{
+ return [this] {
+ if (transaction_manager) {
+ return transaction_manager->close();
+ } else {
+ return TransactionManager::close_ertr::now();
+ }
+ }().safe_then([this] {
+ return crimson::do_for_each(
+ secondaries,
+ [](auto& sec_dev) -> SegmentManager::close_ertr::future<>
+ {
+ return sec_dev->close();
+ });
+ }).safe_then([this] {
+ return device->close();
+ }).safe_then([this] {
+ secondaries.clear();
+ transaction_manager.reset();
+ collection_manager.reset();
+ onode_manager.reset();
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::umount"
+ }
+ );
+}
+
+seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ LOG_PREFIX(SeaStore::write_fsid);
+ return read_meta("fsid").then([this, FNAME, new_osd_fsid] (auto tuple) {
+ auto [ret, fsid] = tuple;
+ std::string str_fsid = stringify(new_osd_fsid);
+ if (ret == -1) {
+ return write_meta("fsid", stringify(new_osd_fsid));
+ } else if (ret == 0 && fsid != str_fsid) {
+ ERROR("on-disk fsid {} != provided {}",
+ fsid, stringify(new_osd_fsid));
+ throw std::runtime_error("store fsid error");
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<>
+SeaStore::Shard::mkfs_managers()
+{
+ init_managers();
+ return transaction_manager->mkfs(
+ ).safe_then([this] {
+ init_managers();
+ return transaction_manager->mount();
+ }).safe_then([this] {
+ return repeat_eagain([this] {
+ return transaction_manager->with_transaction_intr(
+ Transaction::src_t::MUTATE,
+ "mkfs_seastore",
+ [this](auto& t)
+ {
+ return onode_manager->mkfs(t
+ ).si_then([this, &t] {
+ return collection_manager->mkfs(t);
+ }).si_then([this, &t](auto coll_root) {
+ transaction_manager->write_collection_root(
+ t, coll_root);
+ return transaction_manager->submit_transaction(t);
+ });
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in Shard::mkfs_managers"
+ }
+ );
+}
+
+seastar::future<> SeaStore::set_secondaries()
+{
+ auto sec_dev_ite = secondaries.rbegin();
+ Device* sec_dev = sec_dev_ite->get();
+ return shard_stores.invoke_on_all([sec_dev](auto &local_store) {
+ local_store.set_secondaries(sec_dev->get_sharded_device());
+ });
+}
+
+SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) {
+ auto [done, value] = tuple;
+ if (done == 0) {
+ return seastar::now();
+ }
+ return shard_stores.local().mkfs_managers(
+ ).then([this, new_osd_fsid] {
+ return prepare_meta(new_osd_fsid);
+ });
+ });
+}
+
+seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return write_fsid(new_osd_fsid).then([this] {
+ return read_meta("type").then([this] (auto tuple) {
+ auto [ret, type] = tuple;
+ if (ret == 0 && type == "seastore") {
+ return seastar::now();
+ } else if (ret == 0 && type != "seastore") {
+ LOG_PREFIX(SeaStore::prepare_meta);
+ ERROR("expected seastore, but type is {}", type);
+ throw std::runtime_error("store type error");
+ } else {
+ return write_meta("type", "seastore");
+ }
+ });
+ }).then([this] {
+ return write_meta("mkfs_done", "yes");
+ });
+}
+
+SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) {
+ auto [done, value] = tuple;
+ if (done == 0) {
+ return seastar::now();
+ } else {
+ return seastar::do_with(
+ secondary_device_set_t(),
+ [this, new_osd_fsid](auto& sds) {
+ auto fut = seastar::now();
+ LOG_PREFIX(SeaStore::mkfs);
+ DEBUG("root: {}", root);
+ if (!root.empty()) {
+ fut = seastar::open_directory(root
+ ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable {
+ std::unique_ptr<seastar::file> root_f =
+ std::make_unique<seastar::file>(std::move(rdir));
+ auto sub = root_f->list_directory(
+ [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<>
+ {
+ LOG_PREFIX(SeaStore::mkfs);
+ DEBUG("found file: {}", de.name);
+ if (de.name.find("block.") == 0
+ && de.name.length() > 6 /* 6 for "block." */) {
+ std::string entry_name = de.name;
+ auto dtype_end = entry_name.find_first_of('.', 6);
+ device_type_t dtype =
+ string_to_device_type(
+ entry_name.substr(6, dtype_end - 6));
+ if (dtype == device_type_t::NONE) {
+ // invalid device type
+ return seastar::now();
+ }
+ auto id = std::stoi(entry_name.substr(dtype_end + 1));
+ std::string path = fmt::format("{}/{}", root, entry_name);
+ return Device::make_device(path, dtype
+ ).then([this, &sds, id, dtype, new_osd_fsid](DeviceRef sec_dev) {
+ auto p_sec_dev = sec_dev.get();
+ secondaries.emplace_back(std::move(sec_dev));
+ return p_sec_dev->start(
+ ).then([&sds, id, dtype, new_osd_fsid, p_sec_dev]() {
+ magic_t magic = (magic_t)std::rand();
+ sds.emplace(
+ (device_id_t)id,
+ device_spec_t{magic, dtype, (device_id_t)id});
+ return p_sec_dev->mkfs(device_config_t::create_secondary(
+ new_osd_fsid, id, dtype, magic)
+ ).handle_error(crimson::ct_error::assert_all{"not possible"});
+ });
+ }).then([this] {
+ return set_secondaries();
+ });
+ }
+ return seastar::now();
+ });
+ return sub.done().then([root_f=std::move(root_f)] {});
+ });
+ }
+ return fut.then([this, &sds, new_osd_fsid] {
+ device_id_t id = 0;
+ device_type_t d_type = device->get_device_type();
+ assert(d_type == device_type_t::SSD ||
+ d_type == device_type_t::RANDOM_BLOCK_SSD);
+ if (d_type == device_type_t::RANDOM_BLOCK_SSD) {
+ id = static_cast<device_id_t>(DEVICE_ID_RANDOM_BLOCK_MIN);
+ }
+
+ return device->mkfs(
+ device_config_t::create_primary(new_osd_fsid, id, d_type, sds)
+ );
+ }).safe_then([this] {
+ return crimson::do_for_each(secondaries, [](auto& sec_dev) {
+ return sec_dev->mount();
+ });
+ });
+ }).safe_then([this] {
+ return device->mount();
+ }).safe_then([this] {
+ return shard_stores.invoke_on_all([] (auto &local_store) {
+ return local_store.mkfs_managers();
+ });
+ }).safe_then([this, new_osd_fsid] {
+ return prepare_meta(new_osd_fsid);
+ }).safe_then([this] {
+ return umount();
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::mkfs"
+ }
+ );
+ }
+ });
+}
+
+using coll_core_t = FuturizedStore::coll_core_t;
+seastar::future<std::vector<coll_core_t>>
+SeaStore::list_collections()
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.map([](auto &local_store) {
+ return local_store.list_collections();
+ }).then([](std::vector<std::vector<coll_core_t>> results) {
+ std::vector<coll_core_t> collections;
+ for (auto& colls : results) {
+ collections.insert(collections.end(), colls.begin(), colls.end());
+ }
+ return seastar::make_ready_future<std::vector<coll_core_t>>(
+ std::move(collections));
+ });
+}
+
+store_statfs_t SeaStore::Shard::stat() const
+{
+ return transaction_manager->store_stat();
+}
+
+seastar::future<store_statfs_t> SeaStore::stat() const
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ LOG_PREFIX(SeaStore::stat);
+ DEBUG("");
+ return shard_stores.map_reduce0(
+ [](const SeaStore::Shard &local_store) {
+ return local_store.stat();
+ },
+ store_statfs_t(),
+ [](auto &&ss, auto &&ret) {
+ ss.add(ret);
+ return std::move(ss);
+ }
+ ).then([](store_statfs_t ss) {
+ return seastar::make_ready_future<store_statfs_t>(std::move(ss));
+ });
+}
+
+TransactionManager::read_extent_iertr::future<std::optional<unsigned>>
+SeaStore::Shard::get_coll_bits(CollectionRef ch, Transaction &t) const
+{
+ return transaction_manager->read_collection_root(t)
+ .si_then([this, ch, &t](auto coll_root) {
+ return collection_manager->list(coll_root, t);
+ }).si_then([ch](auto colls) {
+ auto it = std::find_if(colls.begin(), colls.end(),
+ [ch](const std::pair<coll_t, coll_info_t>& element) {
+ return element.first == ch->get_cid();
+ });
+ if (it != colls.end()) {
+ return TransactionManager::read_extent_iertr::make_ready_future<
+ std::optional<unsigned>>(it->second.split_bits);
+ } else {
+ return TransactionManager::read_extent_iertr::make_ready_future<
+ std::optional<unsigned>>(std::nullopt);
+ }
+ });
+}
+
+col_obj_ranges_t
+SeaStore::get_objs_range(CollectionRef ch, unsigned bits)
+{
+ col_obj_ranges_t obj_ranges;
+ spg_t pgid;
+ constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
+ const std::string_view MAX_NSPACE = "\xff";
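+ // For PG collections, onodes are keyed by the bit-reversed PG hash: the
+ // object range is [reverse_hash, reverse_hash + 2^(32 - bits)) and a
+ // parallel range with pool = -2 - pool covers the PG's temp objects.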
+ if (ch->get_cid().is_pg(&pgid)) {
+ obj_ranges.obj_begin.shard_id = pgid.shard;
+ obj_ranges.temp_begin = obj_ranges.obj_begin;
+
+ obj_ranges.obj_begin.hobj.pool = pgid.pool();
+ obj_ranges.temp_begin.hobj.pool = -2ll - pgid.pool();
+
+ obj_ranges.obj_end = obj_ranges.obj_begin;
+ obj_ranges.temp_end = obj_ranges.temp_begin;
+
+ uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
+ obj_ranges.obj_begin.hobj.set_bitwise_key_u32(reverse_hash);
+ obj_ranges.temp_begin.hobj.set_bitwise_key_u32(reverse_hash);
+
+ uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
+ if (end_hash > MAX_HASH) {
+ // make sure end hobj is even greater than the maximum possible hobj
+ obj_ranges.obj_end.hobj.set_bitwise_key_u32(MAX_HASH);
+ obj_ranges.temp_end.hobj.set_bitwise_key_u32(MAX_HASH);
+ obj_ranges.obj_end.hobj.nspace = MAX_NSPACE;
+ } else {
+ obj_ranges.obj_end.hobj.set_bitwise_key_u32(end_hash);
+ obj_ranges.temp_end.hobj.set_bitwise_key_u32(end_hash);
+ }
+ } else {
+ obj_ranges.obj_begin.shard_id = shard_id_t::NO_SHARD;
+ obj_ranges.obj_begin.hobj.pool = -1ull;
+
+ obj_ranges.obj_end = obj_ranges.obj_begin;
+ obj_ranges.obj_begin.hobj.set_bitwise_key_u32(0);
+ obj_ranges.obj_end.hobj.set_bitwise_key_u32(MAX_HASH);
+ obj_ranges.obj_end.hobj.nspace = MAX_NSPACE;
+ // no separate temp section
+ obj_ranges.temp_begin = obj_ranges.obj_end;
+ obj_ranges.temp_end = obj_ranges.obj_end;
+ }
+
+ obj_ranges.obj_begin.generation = 0;
+ obj_ranges.obj_end.generation = 0;
+ obj_ranges.temp_begin.generation = 0;
+ obj_ranges.temp_end.generation = 0;
+ return obj_ranges;
+}
+
+static std::list<std::pair<ghobject_t, ghobject_t>>
+get_ranges(CollectionRef ch,
+ ghobject_t start,
+ ghobject_t end,
+ col_obj_ranges_t obj_ranges)
+{
+ ceph_assert(start <= end);
+ std::list<std::pair<ghobject_t, ghobject_t>> ranges;
+ if (start < obj_ranges.temp_end) {
+ ranges.emplace_back(
+ std::max(obj_ranges.temp_begin, start),
+ std::min(obj_ranges.temp_end, end));
+ }
+ if (end > obj_ranges.obj_begin) {
+ ranges.emplace_back(
+ std::max(obj_ranges.obj_begin, start),
+ std::min(obj_ranges.obj_end, end));
+ }
+ return ranges;
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+SeaStore::Shard::list_objects(CollectionRef ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const
+{
+ ceph_assert(start <= end);
+ using list_iertr = OnodeManager::list_onodes_iertr;
+ using RetType = typename OnodeManager::list_onodes_bare_ret;
+ return seastar::do_with(
+ RetType(std::vector<ghobject_t>(), start),
+ std::move(limit),
+ [this, ch, start, end](auto& ret, auto& limit) {
+ return repeat_eagain([this, ch, start, end, &limit, &ret] {
+ return transaction_manager->with_transaction_intr(
+ Transaction::src_t::READ,
+ "list_objects",
+ [this, ch, start, end, &limit, &ret](auto &t)
+ {
+ return get_coll_bits(
+ ch, t
+ ).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) {
+ if (!bits) {
+ return list_iertr::make_ready_future<
+ OnodeManager::list_onodes_bare_ret
+ >(std::make_tuple(
+ std::vector<ghobject_t>(),
+ ghobject_t::get_max()));
+ } else {
+ auto filter = SeaStore::get_objs_range(ch, *bits);
+ using list_iertr = OnodeManager::list_onodes_iertr;
+ using repeat_ret = list_iertr::future<seastar::stop_iteration>;
+ return trans_intr::repeat(
+ [this, &t, &ret, &limit,
+ filter, ranges = get_ranges(ch, start, end, filter)
+ ]() mutable -> repeat_ret {
+ if (limit == 0 || ranges.empty()) {
+ return list_iertr::make_ready_future<
+ seastar::stop_iteration
+ >(seastar::stop_iteration::yes);
+ }
+ auto ite = ranges.begin();
+ auto pstart = ite->first;
+ auto pend = ite->second;
+ ranges.pop_front();
+ return onode_manager->list_onodes(
+ t, pstart, pend, limit
+ ).si_then([&limit, &ret, pend](auto &&_ret) mutable {
+ auto &next_objects = std::get<0>(_ret);
+ auto &ret_objects = std::get<0>(ret);
+ ret_objects.insert(
+ ret_objects.end(),
+ next_objects.begin(),
+ next_objects.end());
+ std::get<1>(ret) = std::get<1>(_ret);
+ assert(limit >= next_objects.size());
+ limit -= next_objects.size();
+ assert(limit == 0 ||
+ std::get<1>(_ret) == pend ||
+ std::get<1>(_ret) == ghobject_t::get_max());
+ return list_iertr::make_ready_future<
+ seastar::stop_iteration
+ >(seastar::stop_iteration::no);
+ });
+ }).si_then([&ret] {
+ return list_iertr::make_ready_future<
+ OnodeManager::list_onodes_bare_ret>(std::move(ret));
+ });
+ }
+ });
+ }).safe_then([&ret](auto&& _ret) {
+ ret = std::move(_ret);
+ });
+ }).safe_then([&ret] {
+ return std::move(ret);
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::list_objects"
+ }
+ );
+ });
+}
+
+seastar::future<CollectionRef>
+SeaStore::Shard::create_new_collection(const coll_t& cid)
+{
+ LOG_PREFIX(SeaStore::create_new_collection);
+ DEBUG("{}", cid);
+ return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
+}
+
+seastar::future<CollectionRef>
+SeaStore::Shard::open_collection(const coll_t& cid)
+{
+ LOG_PREFIX(SeaStore::open_collection);
+ DEBUG("{}", cid);
+ return list_collections().then([cid, this] (auto colls_cores) {
+ if (auto found = std::find(colls_cores.begin(),
+ colls_cores.end(),
+ std::make_pair(cid, seastar::this_shard_id()));
+ found != colls_cores.end()) {
+ return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
+ } else {
+ return seastar::make_ready_future<CollectionRef>();
+ }
+ });
+}
+
+seastar::future<std::vector<coll_core_t>>
+SeaStore::Shard::list_collections()
+{
+ return seastar::do_with(
+ std::vector<coll_core_t>(),
+ [this](auto &ret) {
+ return repeat_eagain([this, &ret] {
+ return transaction_manager->with_transaction_intr(
+ Transaction::src_t::READ,
+ "list_collections",
+ [this, &ret](auto& t)
+ {
+ return transaction_manager->read_collection_root(t
+ ).si_then([this, &t](auto coll_root) {
+ return collection_manager->list(coll_root, t);
+ }).si_then([&ret](auto colls) {
+ ret.resize(colls.size());
+ std::transform(
+ colls.begin(), colls.end(), ret.begin(),
+ [](auto p) {
+ return std::make_pair(p.first, seastar::this_shard_id());
+ });
+ });
+ });
+ }).safe_then([&ret] {
+ return seastar::make_ready_future<std::vector<coll_core_t>>(ret);
+ });
+ }
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::list_collections"
+ }
+ );
+}
+
+SeaStore::Shard::read_errorator::future<ceph::bufferlist>
+SeaStore::Shard::read(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ LOG_PREFIX(SeaStore::read);
+ DEBUG("oid {} offset {} len {}", oid, offset, len);
+ return repeat_with_onode<ceph::bufferlist>(
+ ch,
+ oid,
+ Transaction::src_t::READ,
+ "read_obj",
+ op_type_t::READ,
+ [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret {
+ size_t size = onode.get_layout().size;
+
+ if (offset >= size) {
+ return seastar::make_ready_future<ceph::bufferlist>();
+ }
+
+ size_t corrected_len = (len == 0) ?
+ size - offset :
+ std::min(size - offset, len);
+
+ return ObjectDataHandler(max_object_size).read(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ t,
+ onode,
+ },
+ offset,
+ corrected_len);
+ });
+}
+
+SeaStore::Shard::read_errorator::future<ceph::bufferlist>
+SeaStore::Shard::readv(
+ CollectionRef ch,
+ const ghobject_t& _oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ return seastar::do_with(
+ _oid,
+ ceph::bufferlist{},
+ [=, this, &m](auto &oid, auto &ret) {
+ return crimson::do_for_each(
+ m,
+ [=, this, &oid, &ret](auto &p) {
+ return read(
+ ch, oid, p.first, p.second, op_flags
+ ).safe_then([&ret](auto bl) {
+ ret.claim_append(bl);
+ });
+ }).safe_then([&ret] {
+ return read_errorator::make_ready_future<ceph::bufferlist>
+ (std::move(ret));
+ });
+ });
+}
+
+using crimson::os::seastore::omap_manager::BtreeOMapManager;
+
+SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
+SeaStore::Shard::get_attr(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ std::string_view name) const
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ LOG_PREFIX(SeaStore::get_attr);
+ DEBUG("{} {}", c->get_cid(), oid);
+ return repeat_with_onode<ceph::bufferlist>(
+ c,
+ oid,
+ Transaction::src_t::READ,
+ "get_attr",
+ op_type_t::GET_ATTR,
+ [=, this](auto &t, auto& onode) -> _omap_get_value_ret {
+ auto& layout = onode.get_layout();
+ if (name == OI_ATTR && layout.oi_size) {
+ ceph::bufferlist bl;
+ bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ if (name == SS_ATTR && layout.ss_size) {
+ ceph::bufferlist bl;
+ bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ return _omap_get_value(
+ t,
+ layout.xattr_root.get(
+ onode.get_metadata_hint(device->get_block_size())),
+ name);
+ }
+ ).handle_error(crimson::ct_error::input_output_error::handle([FNAME] {
+ ERROR("EIO when getting attrs");
+ abort();
+ }), crimson::ct_error::pass_further_all{});
+}
+
+SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t>
+SeaStore::Shard::get_attrs(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ LOG_PREFIX(SeaStore::get_attrs);
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ DEBUG("{} {}", c->get_cid(), oid);
+ return repeat_with_onode<attrs_t>(
+ c,
+ oid,
+ Transaction::src_t::READ,
+ "get_attrs",
+ op_type_t::GET_ATTRS,
+ [=, this](auto &t, auto& onode) {
+ auto& layout = onode.get_layout();
+ return omap_list(onode, layout.xattr_root, t, std::nullopt,
+ OMapManager::omap_list_config_t().with_inclusive(false, false)
+ ).si_then([&layout](auto p) {
+ auto& attrs = std::get<1>(p);
+ ceph::bufferlist bl;
+ if (layout.oi_size) {
+ bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
+ attrs.emplace(OI_ATTR, std::move(bl));
+ }
+ if (layout.ss_size) {
+ bl.clear();
+ bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
+ attrs.emplace(SS_ATTR, std::move(bl));
+ }
+ return seastar::make_ready_future<omap_values_t>(std::move(attrs));
+ });
+ }
+ ).handle_error(crimson::ct_error::input_output_error::handle([FNAME] {
+ ERROR("EIO when getting attrs");
+ abort();
+ }), crimson::ct_error::pass_further_all{});
+}
+
+seastar::future<struct stat> SeaStore::Shard::stat(
+ CollectionRef c,
+ const ghobject_t& oid)
+{
+ LOG_PREFIX(SeaStore::stat);
+ return repeat_with_onode<struct stat>(
+ c,
+ oid,
+ Transaction::src_t::READ,
+ "stat",
+ op_type_t::STAT,
+ [=, this](auto &t, auto &onode) {
+ struct stat st;
+ auto &olayout = onode.get_layout();
+ st.st_size = olayout.size;
+ st.st_blksize = device->get_block_size();
+ st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
+ st.st_nlink = 1;
+ DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size);
+ return seastar::make_ready_future<struct stat>(st);
+ }
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::stat"
+ }
+ );
+}
+
+SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
+SeaStore::Shard::omap_get_header(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY);
+}
+
+SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t>
+SeaStore::Shard::omap_get_values(
+ CollectionRef ch,
+ const ghobject_t &oid,
+ const omap_keys_t &keys)
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ return repeat_with_onode<omap_values_t>(
+ c,
+ oid,
+ Transaction::src_t::READ,
+ "omap_get_values",
+ op_type_t::OMAP_GET_VALUES,
+ [this, keys](auto &t, auto &onode) {
+ omap_root_t omap_root = onode.get_layout().omap_root.get(
+ onode.get_metadata_hint(device->get_block_size()));
+ return _omap_get_values(
+ t,
+ std::move(omap_root),
+ keys);
+ });
+}
+
+SeaStore::Shard::_omap_get_value_ret
+SeaStore::Shard::_omap_get_value(
+ Transaction &t,
+ omap_root_t &&root,
+ std::string_view key) const
+{
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ std::move(root),
+ std::string(key),
+ [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret {
+ if (root.is_null()) {
+ return crimson::ct_error::enodata::make();
+ }
+ return manager.omap_get_value(root, t, key
+ ).si_then([](auto opt) -> _omap_get_value_ret {
+ if (!opt) {
+ return crimson::ct_error::enodata::make();
+ }
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt));
+ });
+ }
+ );
+}
+
+SeaStore::Shard::_omap_get_values_ret
+SeaStore::Shard::_omap_get_values(
+ Transaction &t,
+ omap_root_t &&omap_root,
+ const omap_keys_t &keys) const
+{
+ if (omap_root.is_null()) {
+ return seastar::make_ready_future<omap_values_t>();
+ }
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ std::move(omap_root),
+ omap_values_t(),
+ [&](auto &manager, auto &root, auto &ret) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [&](auto &key) {
+ return manager.omap_get_value(
+ root,
+ t,
+ key
+ ).si_then([&ret, &key](auto &&p) {
+ if (p) {
+ bufferlist bl;
+ bl.append(*p);
+ ret.emplace(
+ std::move(key),
+ std::move(bl));
+ }
+ return seastar::now();
+ });
+ }
+ ).si_then([&ret] {
+ return std::move(ret);
+ });
+ }
+ );
+}
+
+SeaStore::Shard::omap_list_ret
+SeaStore::Shard::omap_list(
+ Onode &onode,
+ const omap_root_le_t& omap_root,
+ Transaction& t,
+ const std::optional<std::string>& start,
+ OMapManager::omap_list_config_t config) const
+{
+ auto root = omap_root.get(
+ onode.get_metadata_hint(device->get_block_size()));
+ if (root.is_null()) {
+ return seastar::make_ready_future<omap_list_bare_ret>(
+ true, omap_values_t{}
+ );
+ }
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ root,
+ start,
+ std::optional<std::string>(std::nullopt),
+ [&t, config](auto &manager, auto &root, auto &start, auto &end) {
+ return manager.omap_list(root, t, start, end, config);
+ });
+}
+
+SeaStore::Shard::omap_get_values_ret_t
+SeaStore::Shard::omap_get_values(
+ CollectionRef ch,
+ const ghobject_t &oid,
+ const std::optional<string> &start)
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ LOG_PREFIX(SeaStore::omap_get_values);
+ DEBUG("{} {}", c->get_cid(), oid);
+ using ret_bare_t = std::tuple<bool, SeaStore::Shard::omap_values_t>;
+ return repeat_with_onode<ret_bare_t>(
+ c,
+ oid,
+ Transaction::src_t::READ,
+ "omap_list",
+ op_type_t::OMAP_LIST,
+ [this, start](auto &t, auto &onode) {
+ return omap_list(
+ onode,
+ onode.get_layout().omap_root,
+ t,
+ start,
+ OMapManager::omap_list_config_t().with_inclusive(false, false));
+ });
+}
+
+SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap(
+ Transaction &t,
+ Onode &onode,
+ uint64_t off,
+ uint64_t len) const
+{
+ return seastar::do_with(
+ ObjectDataHandler(max_object_size),
+ [=, this, &t, &onode] (auto &objhandler) {
+ return objhandler.fiemap(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ t,
+ onode,
+ },
+ off,
+ len);
+ });
+}
+
+SeaStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>>
+SeaStore::Shard::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ LOG_PREFIX(SeaStore::fiemap);
+ DEBUG("oid: {}, off: {}, len: {} ", oid, off, len);
+ return repeat_with_onode<std::map<uint64_t, uint64_t>>(
+ ch,
+ oid,
+ Transaction::src_t::READ,
+ "fiemap_read",
+ op_type_t::READ,
+ [=, this](auto &t, auto &onode) -> _fiemap_ret {
+ size_t size = onode.get_layout().size;
+ if (off >= size) {
+ INFOT("fiemap offset is over onode size!", t);
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>();
+ }
+ size_t adjust_len = (len == 0) ?
+ size - off:
+ std::min(size - off, len);
+ return _fiemap(t, onode, off, adjust_len);
+ });
+}
+
+void SeaStore::Shard::on_error(ceph::os::Transaction &t) {
+ LOG_PREFIX(SeaStore::on_error);
+ ERROR(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ ERROR("{}", str.str());
+ abort();
+}
+
+seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
+ CollectionRef _ch,
+ ceph::os::Transaction&& _t)
+{
+ // repeat_with_internal_context ensures ordering via collection lock
+ return repeat_with_internal_context(
+ _ch,
+ std::move(_t),
+ Transaction::src_t::MUTATE,
+ "do_transaction",
+ op_type_t::TRANSACTION,
+ [this](auto &ctx) {
+ return with_trans_intr(*ctx.transaction, [&, this](auto &t) {
+#ifndef NDEBUG
+ LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks);
+ TRACET(" transaction dump:\n", t);
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ ctx.ext_transaction.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ TRACET("{}", t, str.str());
+#endif
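+ // onodes caches the onode resolved for each op's oid in this transaction;
+ // d_onodes collects the onodes to be written back via write_dirty()
+ // before the transaction is submitted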
+ return seastar::do_with(
+ std::vector<OnodeRef>(ctx.iter.objects.size()),
+ std::vector<OnodeRef>(ctx.iter.objects.size()),
+ [this, &ctx](auto& onodes, auto& d_onodes) mutable {
+ return trans_intr::repeat(
+ [this, &ctx, &onodes, &d_onodes]() mutable
+ -> tm_iertr::future<seastar::stop_iteration>
+ {
+ if (ctx.iter.have_op()) {
+ return _do_transaction_step(
+ ctx, ctx.ch, onodes, d_onodes, ctx.iter
+ ).si_then([] {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ } else {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ };
+ }).si_then([this, &ctx, &d_onodes] {
+ return onode_manager->write_dirty(*ctx.transaction, d_onodes);
+ });
+ }).si_then([this, &ctx] {
+ return transaction_manager->submit_transaction(*ctx.transaction);
+ });
+ });
+ });
+}
+
+
+seastar::future<> SeaStore::Shard::flush(CollectionRef ch)
+{
+ return seastar::do_with(
+ get_dummy_ordering_handle(),
+ [this, ch](auto &handle) {
+ return handle.take_collection_lock(
+ static_cast<SeastoreCollection&>(*ch).ordering_lock
+ ).then([this, &handle] {
+ return transaction_manager->flush(handle);
+ });
+ });
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_do_transaction_step(
+ internal_context_t &ctx,
+ CollectionRef &col,
+ std::vector<OnodeRef> &onodes,
+ std::vector<OnodeRef> &d_onodes,
+ ceph::os::Transaction::iterator &i)
+{
+ auto op = i.decode_op();
+
+ using ceph::os::Transaction;
+ if (op->op == Transaction::OP_NOP)
+ return tm_iertr::now();
+
+ switch (op->op) {
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ return _remove_collection(ctx, cid);
+ }
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ return _create_collection(ctx, cid, op->split_bits);
+ }
+ case Transaction::OP_COLL_HINT:
+ {
+ ceph::bufferlist hint;
+ i.decode_bl(hint);
+ return tm_iertr::now();
+ }
+ }
+
+ using onode_iertr = OnodeManager::get_onode_iertr::extend<
+ crimson::ct_error::value_too_large>;
+ auto fut = onode_iertr::make_ready_future<OnodeRef>(OnodeRef());
+ bool create = false;
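+ // these ops implicitly create the object if it does not exist yet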
+ if (op->op == Transaction::OP_TOUCH ||
+ op->op == Transaction::OP_CREATE ||
+ op->op == Transaction::OP_WRITE ||
+ op->op == Transaction::OP_ZERO) {
+ create = true;
+ }
+ if (!onodes[op->oid]) {
+ if (!create) {
+ fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid));
+ } else {
+ fut = onode_manager->get_or_create_onode(
+ *ctx.transaction, i.get_oid(op->oid));
+ }
+ }
+ return fut.si_then([&, op](auto get_onode) {
+ OnodeRef &o = onodes[op->oid];
+ if (!o) {
+ assert(get_onode);
+ o = get_onode;
+ d_onodes[op->oid] = get_onode;
+ }
+ if (op->op == Transaction::OP_CLONE && !d_onodes[op->dest_oid]) {
+ //TODO: use when_all_succeed after making onode tree
+ // support parallel extents loading
+ return onode_manager->get_or_create_onode(
+ *ctx.transaction, i.get_oid(op->dest_oid)
+ ).si_then([&, op](auto dest_onode) {
+ assert(dest_onode);
+ auto &d_o = onodes[op->dest_oid];
+ assert(!d_o);
+ assert(!d_onodes[op->dest_oid]);
+ d_o = dest_onode;
+ d_onodes[op->dest_oid] = dest_onode;
+ return seastar::now();
+ });
+ } else {
+ return OnodeManager::get_or_create_onode_iertr::now();
+ }
+ }).si_then([&, op, this]() -> tm_ret {
+ LOG_PREFIX(SeaStore::_do_transaction_step);
+ try {
+ switch (op->op) {
+ case Transaction::OP_REMOVE:
+ {
+ TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid));
+ return _remove(ctx, onodes[op->oid]
+ ).si_then([&onodes, &d_onodes, op] {
+ onodes[op->oid].reset();
+ d_onodes[op->oid].reset();
+ });
+ }
+ case Transaction::OP_CREATE:
+ case Transaction::OP_TOUCH:
+ {
+ return _touch(ctx, onodes[op->oid]);
+ }
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ return _write(
+ ctx, onodes[op->oid], off, len, std::move(bl),
+ fadvise_flags);
+ }
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ return _truncate(ctx, onodes[op->oid], off);
+ }
+ case Transaction::OP_SETATTR:
+ {
+ std::string name = i.decode_string();
+ std::map<std::string, bufferlist> to_set;
+ ceph::bufferlist& bl = to_set[name];
+ i.decode_bl(bl);
+ return _setattrs(ctx, onodes[op->oid], std::move(to_set));
+ }
+ case Transaction::OP_SETATTRS:
+ {
+ std::map<std::string, bufferlist> to_set;
+ i.decode_attrset(to_set);
+ return _setattrs(ctx, onodes[op->oid], std::move(to_set));
+ }
+ case Transaction::OP_RMATTR:
+ {
+ std::string name = i.decode_string();
+ return _rmattr(ctx, onodes[op->oid], name);
+ }
+ case Transaction::OP_RMATTRS:
+ {
+ return _rmattrs(ctx, onodes[op->oid]);
+ }
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ std::map<std::string, ceph::bufferlist> aset;
+ i.decode_attrset(aset);
+ return _omap_set_values(ctx, onodes[op->oid], std::move(aset));
+ }
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ return _omap_set_header(ctx, onodes[op->oid], std::move(bl));
+ }
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ omap_keys_t keys;
+ i.decode_keyset(keys);
+ return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys));
+ }
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ return _omap_rmkeyrange(
+ ctx, onodes[op->oid],
+ std::move(first), std::move(last));
+ }
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ return _omap_clear(ctx, onodes[op->oid]);
+ }
+ case Transaction::OP_ZERO:
+ {
+ objaddr_t off = op->off;
+ extent_len_t len = op->len;
+ return _zero(ctx, onodes[op->oid], off, len);
+ }
+ case Transaction::OP_SETALLOCHINT:
+ {
+ // TODO
+ return tm_iertr::now();
+ }
+ case Transaction::OP_CLONE:
+ {
+ TRACET("cloning {} to {}",
+ *ctx.transaction,
+ i.get_oid(op->oid),
+ i.get_oid(op->dest_oid));
+ return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]);
+ }
+ default:
+ ERROR("bad op {}", static_cast<unsigned>(op->op));
+ return crimson::ct_error::input_output_error::make();
+ }
+ } catch (std::exception &e) {
+ ERROR("got exception {}", e);
+ return crimson::ct_error::input_output_error::make();
+ }
+ }).handle_error_interruptible(
+ tm_iertr::pass_further{},
+ crimson::ct_error::enoent::handle([op] {
+ //OMAP_CLEAR, TRUNCATE, REMOVE etc ops will tolerate absent onode.
+ if (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD ||
+ op->op == Transaction::OP_SETATTR ||
+ op->op == Transaction::OP_SETATTRS ||
+ op->op == Transaction::OP_RMATTR ||
+ op->op == Transaction::OP_OMAP_SETKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+ op->op == Transaction::OP_OMAP_SETHEADER) {
+ ceph_abort_msg("unexpected enoent error");
+ }
+ return seastar::now();
+ }),
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::do_transaction_step"
+ }
+ );
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_remove(
+ internal_context_t &ctx,
+ OnodeRef &onode)
+{
+ LOG_PREFIX(SeaStore::_remove);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
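+ // clear the object's omap (if any), then its data, then erase the onode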
+ auto fut = BtreeOMapManager::omap_clear_iertr::now();
+ auto omap_root = onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size()));
+ if (omap_root.get_location() != L_ADDR_NULL) {
+ fut = seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size())),
+ [&ctx, onode](auto &omap_manager, auto &omap_root) {
+ return omap_manager.omap_clear(
+ omap_root,
+ *ctx.transaction
+ );
+ });
+ }
+ return fut.si_then([this, &ctx, onode] {
+ return seastar::do_with(
+ ObjectDataHandler(max_object_size),
+ [=, this, &ctx](auto &objhandler) {
+ return objhandler.clear(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ *ctx.transaction,
+ *onode,
+ });
+ });
+ }).si_then([this, &ctx, onode]() mutable {
+ return onode_manager->erase_onode(*ctx.transaction, onode);
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all(
+ "Invalid error in SeaStore::_remove"
+ )
+ );
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_touch(
+ internal_context_t &ctx,
+ OnodeRef &onode)
+{
+ LOG_PREFIX(SeaStore::_touch);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
+ return tm_iertr::now();
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_write(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ uint64_t offset, size_t len,
+ ceph::bufferlist &&_bl,
+ uint32_t fadvise_flags)
+{
+ LOG_PREFIX(SeaStore::_write);
+ DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
+ {
+ auto &object_size = onode->get_mutable_layout(*ctx.transaction).size;
+ object_size = std::max<uint64_t>(
+ offset + len,
+ object_size);
+ }
+ return seastar::do_with(
+ std::move(_bl),
+ ObjectDataHandler(max_object_size),
+ [=, this, &ctx, &onode](auto &bl, auto &objhandler) {
+ return objhandler.write(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ *ctx.transaction,
+ *onode,
+ },
+ offset,
+ bl);
+ });
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_clone(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ OnodeRef &d_onode)
+{
+ LOG_PREFIX(SeaStore::_clone);
+ DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode);
+ return seastar::do_with(
+ ObjectDataHandler(max_object_size),
+ [this, &ctx, &onode, &d_onode](auto &objHandler) {
+ //TODO: currently, we only care about object data, leaving cloning
+ // of xattr/omap for future work
+ auto &object_size = onode->get_layout().size;
+ auto &d_object_size = d_onode->get_mutable_layout(*ctx.transaction).size;
+ d_object_size = object_size;
+ return objHandler.clone(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ *ctx.transaction,
+ *onode,
+ d_onode.get()});
+ });
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_zero(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ objaddr_t offset,
+ extent_len_t len)
+{
+ LOG_PREFIX(SeaStore::_zero);
+ DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
+ if (offset + len >= max_object_size) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ auto &object_size = onode->get_mutable_layout(*ctx.transaction).size;
+ object_size = std::max<uint64_t>(offset + len, object_size);
+ return seastar::do_with(
+ ObjectDataHandler(max_object_size),
+ [=, this, &ctx, &onode](auto &objhandler) {
+ return objhandler.zero(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ *ctx.transaction,
+ *onode,
+ },
+ offset,
+ len);
+ });
+}
+
+SeaStore::Shard::omap_set_kvs_ret
+SeaStore::Shard::_omap_set_kvs(
+ OnodeRef &onode,
+ const omap_root_le_t& omap_root,
+ Transaction& t,
+ omap_root_le_t& mutable_omap_root,
+ std::map<std::string, ceph::bufferlist>&& kvs)
+{
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ omap_root.get(onode->get_metadata_hint(device->get_block_size())),
+ [&, keys=std::move(kvs)](auto &omap_manager, auto &root) {
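+ // create the omap tree root lazily on the first key insertion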
+ tm_iertr::future<> maybe_create_root =
+ !root.is_null() ?
+ tm_iertr::now() :
+ omap_manager.initialize_omap(
+ t, onode->get_metadata_hint(device->get_block_size())
+ ).si_then([&root](auto new_root) {
+ root = new_root;
+ });
+ return maybe_create_root.si_then(
+ [&, keys=std::move(keys)]() mutable {
+ return omap_manager.omap_set_keys(root, t, std::move(keys));
+ }).si_then([&] {
+ return tm_iertr::make_ready_future<omap_root_t>(std::move(root));
+ }).si_then([&mutable_omap_root](auto root) {
+ if (root.must_update()) {
+ mutable_omap_root.update(root);
+ }
+ });
+ }
+ );
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_omap_set_values(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::map<std::string, ceph::bufferlist> &&aset)
+{
+ LOG_PREFIX(SeaStore::_omap_set_values);
+ DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size());
+ return _omap_set_kvs(
+ onode,
+ onode->get_layout().omap_root,
+ *ctx.transaction,
+ onode->get_mutable_layout(*ctx.transaction).omap_root,
+ std::move(aset));
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_omap_set_header(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ ceph::bufferlist &&header)
+{
+ LOG_PREFIX(SeaStore::_omap_set_header);
+ DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length());
+ std::map<std::string, bufferlist> to_set;
+ to_set[OMAP_HEADER_XATTR_KEY] = header;
+ return _setattrs(ctx, onode, std::move(to_set));
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_omap_clear(
+ internal_context_t &ctx,
+ OnodeRef &onode)
+{
+ LOG_PREFIX(SeaStore::_omap_clear);
+ DEBUGT("{}", *ctx.transaction, *onode);
+ return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY))
+ .si_then([this, &ctx, &onode]() -> tm_ret {
+ if (auto omap_root = onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size()));
+ omap_root.is_null()) {
+ return seastar::now();
+ } else {
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size())),
+ [&ctx, &onode](
+ auto &omap_manager,
+ auto &omap_root) {
+ return omap_manager.omap_clear(
+ omap_root,
+ *ctx.transaction)
+ .si_then([&] {
+ if (omap_root.must_update()) {
+ onode->get_mutable_layout(*ctx.transaction
+ ).omap_root.update(omap_root);
+ }
+ });
+ });
+ }
+ });
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_omap_rmkeys(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ omap_keys_t &&keys)
+{
+ LOG_PREFIX(SeaStore::_omap_rmkeys);
+ DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size());
+ auto omap_root = onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size()));
+ if (omap_root.is_null()) {
+ return seastar::now();
+ } else {
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size())),
+ std::move(keys),
+ [&ctx, &onode](
+ auto &omap_manager,
+ auto &omap_root,
+ auto &keys) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [&](auto &p) {
+ return omap_manager.omap_rm_key(
+ omap_root,
+ *ctx.transaction,
+ p);
+ }
+ ).si_then([&] {
+ if (omap_root.must_update()) {
+ onode->get_mutable_layout(*ctx.transaction
+ ).omap_root.update(omap_root);
+ }
+ });
+ }
+ );
+ }
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_omap_rmkeyrange(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::string first,
+ std::string last)
+{
+ LOG_PREFIX(SeaStore::_omap_rmkeyrange);
+ DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last);
+ if (first > last) {
+ ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last);
+ ceph_abort();
+ }
+ auto omap_root = onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size()));
+ if (omap_root.is_null()) {
+ return seastar::now();
+ } else {
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ onode->get_layout().omap_root.get(
+ onode->get_metadata_hint(device->get_block_size())),
+ std::move(first),
+ std::move(last),
+ [&ctx, &onode](
+ auto &omap_manager,
+ auto &omap_root,
+ auto &first,
+ auto &last) {
+ auto config = OMapManager::omap_list_config_t()
+ .with_inclusive(true, false)
+ .without_max();
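+ // removes keys in [first, last): first inclusive, last exclusive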
+ return omap_manager.omap_rm_key_range(
+ omap_root,
+ *ctx.transaction,
+ first,
+ last,
+ config
+ ).si_then([&] {
+ if (omap_root.must_update()) {
+ onode->get_mutable_layout(*ctx.transaction
+ ).omap_root.update(omap_root);
+ }
+ });
+ });
+ }
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_truncate(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ uint64_t size)
+{
+ LOG_PREFIX(SeaStore::_truncate);
+ DEBUGT("onode={} size={}", *ctx.transaction, *onode, size);
+ onode->get_mutable_layout(*ctx.transaction).size = size;
+ return seastar::do_with(
+ ObjectDataHandler(max_object_size),
+ [=, this, &ctx, &onode](auto &objhandler) {
+ return objhandler.truncate(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ *ctx.transaction,
+ *onode
+ },
+ size);
+ });
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_setattrs(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::map<std::string, bufferlist>&& aset)
+{
+ LOG_PREFIX(SeaStore::_setattrs);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
+
+ auto fut = tm_iertr::now();
+ auto& layout = onode->get_mutable_layout(*ctx.transaction);
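+ // OI_ATTR and SS_ATTR are stored inline in the onode layout when they
+ // fit; whatever remains in aset falls through to the xattr omap below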
+ if (auto it = aset.find(OI_ATTR); it != aset.end()) {
+ auto& val = it->second;
+ if (likely(val.length() <= onode_layout_t::MAX_OI_LENGTH)) {
+ maybe_inline_memcpy(
+ &layout.oi[0],
+ val.c_str(),
+ val.length(),
+ onode_layout_t::MAX_OI_LENGTH);
+
+ if (!layout.oi_size) {
+ // if oi was not in the layout, it probably exists in the omap,
+ // need to remove it first
+ fut = _xattr_rmattr(ctx, onode, OI_ATTR);
+ }
+ layout.oi_size = val.length();
+ aset.erase(it);
+ } else {
+ layout.oi_size = 0;
+ }
+ }
+
+ if (auto it = aset.find(SS_ATTR); it != aset.end()) {
+ auto& val = it->second;
+ if (likely(val.length() <= onode_layout_t::MAX_SS_LENGTH)) {
+ maybe_inline_memcpy(
+ &layout.ss[0],
+ val.c_str(),
+ val.length(),
+ onode_layout_t::MAX_SS_LENGTH);
+
+ if (!layout.ss_size) {
+ fut = _xattr_rmattr(ctx, onode, SS_ATTR);
+ }
+ layout.ss_size = val.length();
+
+ aset.erase(it);
+ } else {
+ layout.ss_size = 0;
+ }
+ }
+
+ if (aset.empty()) {
+ return fut;
+ }
+
+ return fut.si_then(
+ [this, onode, &ctx, &layout,
+ aset=std::move(aset)]() mutable {
+ return _omap_set_kvs(
+ onode,
+ onode->get_layout().xattr_root,
+ *ctx.transaction,
+ layout.xattr_root,
+ std::move(aset));
+ });
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_rmattr(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::string name)
+{
+ LOG_PREFIX(SeaStore::_rmattr);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
+ auto& layout = onode->get_mutable_layout(*ctx.transaction);
+ if ((name == OI_ATTR) && (layout.oi_size > 0)) {
+ memset(&layout.oi[0], 0, layout.oi_size);
+ layout.oi_size = 0;
+ return tm_iertr::now();
+ } else if ((name == SS_ATTR) && (layout.ss_size > 0)) {
+ memset(&layout.ss[0], 0, layout.ss_size);
+ layout.ss_size = 0;
+ return tm_iertr::now();
+ } else {
+ return _xattr_rmattr(
+ ctx,
+ onode,
+ std::move(name));
+ }
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_xattr_rmattr(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::string &&name)
+{
+ LOG_PREFIX(SeaStore::_xattr_rmattr);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
+ auto xattr_root = onode->get_layout().xattr_root.get(
+ onode->get_metadata_hint(device->get_block_size()));
+ if (xattr_root.is_null()) {
+ return seastar::now();
+ } else {
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ onode->get_layout().xattr_root.get(
+ onode->get_metadata_hint(device->get_block_size())),
+ std::move(name),
+ [&ctx, &onode](auto &omap_manager, auto &xattr_root, auto &name) {
+ return omap_manager.omap_rm_key(xattr_root, *ctx.transaction, name)
+ .si_then([&] {
+ if (xattr_root.must_update()) {
+ onode->get_mutable_layout(*ctx.transaction
+ ).xattr_root.update(xattr_root);
+ }
+ });
+ });
+ }
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_rmattrs(
+ internal_context_t &ctx,
+ OnodeRef &onode)
+{
+ LOG_PREFIX(SeaStore::_rmattrs);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
+ auto& layout = onode->get_mutable_layout(*ctx.transaction);
+ memset(&layout.oi[0], 0, layout.oi_size);
+ layout.oi_size = 0;
+ memset(&layout.ss[0], 0, layout.ss_size);
+ layout.ss_size = 0;
+ return _xattr_clear(ctx, onode);
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_xattr_clear(
+ internal_context_t &ctx,
+ OnodeRef &onode)
+{
+ LOG_PREFIX(SeaStore::_xattr_clear);
+ DEBUGT("onode={}", *ctx.transaction, *onode);
+ auto xattr_root = onode->get_layout().xattr_root.get(
+ onode->get_metadata_hint(device->get_block_size()));
+ if (xattr_root.is_null()) {
+ return seastar::now();
+ } else {
+ return seastar::do_with(
+ BtreeOMapManager(*transaction_manager),
+ onode->get_layout().xattr_root.get(
+ onode->get_metadata_hint(device->get_block_size())),
+ [&ctx, &onode](auto &omap_manager, auto &xattr_root) {
+ return omap_manager.omap_clear(xattr_root, *ctx.transaction)
+ .si_then([&] {
+ if (xattr_root.must_update()) {
+ onode->get_mutable_layout(*ctx.transaction
+ ).xattr_root.update(xattr_root);
+ }
+ });
+ });
+ }
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_create_collection(
+ internal_context_t &ctx,
+ const coll_t& cid, int bits)
+{
+ return transaction_manager->read_collection_root(
+ *ctx.transaction
+ ).si_then([=, this, &ctx](auto _cmroot) {
+ return seastar::do_with(
+ _cmroot,
+ [=, this, &ctx](auto &cmroot) {
+ return collection_manager->create(
+ cmroot,
+ *ctx.transaction,
+ cid,
+ bits
+ ).si_then([this, &ctx, &cmroot] {
+ if (cmroot.must_update()) {
+ transaction_manager->write_collection_root(
+ *ctx.transaction,
+ cmroot);
+ }
+ });
+ }
+ );
+ }).handle_error_interruptible(
+ tm_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::_create_collection"
+ }
+ );
+}
+
+SeaStore::Shard::tm_ret
+SeaStore::Shard::_remove_collection(
+ internal_context_t &ctx,
+ const coll_t& cid)
+{
+ return transaction_manager->read_collection_root(
+ *ctx.transaction
+ ).si_then([=, this, &ctx](auto _cmroot) {
+ return seastar::do_with(
+ _cmroot,
+ [=, this, &ctx](auto &cmroot) {
+ return collection_manager->remove(
+ cmroot,
+ *ctx.transaction,
+ cid
+ ).si_then([this, &ctx, &cmroot] {
+ // param here denotes whether it already existed, probably error
+ if (cmroot.must_update()) {
+ transaction_manager->write_collection_root(
+ *ctx.transaction,
+ cmroot);
+ }
+ });
+ });
+ }).handle_error_interruptible(
+ tm_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::_remove_collection"
+ }
+ );
+}
+
+boost::intrusive_ptr<SeastoreCollection>
+SeaStore::Shard::_get_collection(const coll_t& cid)
+{
+ return new SeastoreCollection{cid};
+}
+
+seastar::future<> SeaStore::Shard::write_meta(
+ const std::string& key,
+ const std::string& value)
+{
+ LOG_PREFIX(SeaStore::write_meta);
+ DEBUG("key: {}; value: {}", key, value);
+ return seastar::do_with(
+ key, value,
+ [this, FNAME](auto& key, auto& value) {
+ return repeat_eagain([this, FNAME, &key, &value] {
+ return transaction_manager->with_transaction_intr(
+ Transaction::src_t::MUTATE,
+ "write_meta",
+ [this, FNAME, &key, &value](auto& t)
+ {
+ DEBUGT("Have transaction, key: {}; value: {}", t, key, value);
+ return transaction_manager->update_root_meta(
+ t, key, value
+ ).si_then([this, &t] {
+ return transaction_manager->submit_transaction(t);
+ });
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
+ );
+}
+
+seastar::future<std::tuple<int, std::string>>
+SeaStore::read_meta(const std::string& key)
+{
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ LOG_PREFIX(SeaStore::read_meta);
+ DEBUG("key: {}", key);
+ return mdstore->read_meta(key).safe_then([](auto v) {
+ if (v) {
+ return std::make_tuple(0, std::move(*v));
+ } else {
+ return std::make_tuple(-1, std::string(""));
+ }
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in SeaStore::read_meta"
+ }
+ );
+}
+
+uuid_d SeaStore::Shard::get_fsid() const
+{
+ return device->get_meta().seastore_id;
+}
+
+void SeaStore::Shard::init_managers()
+{
+ transaction_manager.reset();
+ collection_manager.reset();
+ onode_manager.reset();
+
+ transaction_manager = make_transaction_manager(
+ device, secondaries, is_test);
+ collection_manager = std::make_unique<collection_manager::FlatCollectionManager>(
+ *transaction_manager);
+ onode_manager = std::make_unique<crimson::os::seastore::onode::FLTreeOnodeManager>(
+ *transaction_manager);
+}
+
+std::unique_ptr<SeaStore> make_seastore(
+ const std::string &device)
+{
+ auto mdstore = std::make_unique<FileMDStore>(device);
+ return std::make_unique<SeaStore>(
+ device,
+ std::move(mdstore));
+}
+
+std::unique_ptr<SeaStore> make_test_seastore(
+ SeaStore::MDStoreRef mdstore)
+{
+ return std::make_unique<SeaStore>(
+ "",
+ std::move(mdstore));
+}
+
+}
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
new file mode 100644
index 000000000..876fadca8
--- /dev/null
+++ b/src/crimson/os/seastore/seastore.h
@@ -0,0 +1,531 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <typeinfo>
+#include <vector>
+
+#include <optional>
+#include <seastar/core/future.hh>
+#include <seastar/core/metrics_types.hh>
+
+#include "include/uuid.h"
+
+#include "os/Transaction.h"
+#include "crimson/common/throttle.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+#include "crimson/os/seastore/device.h"
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/onode_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/collection_manager.h"
+#include "crimson/os/seastore/object_data_handler.h"
+
+namespace crimson::os::seastore {
+
+class Onode;
+using OnodeRef = boost::intrusive_ptr<Onode>;
+class TransactionManager;
+
+enum class op_type_t : uint8_t {
+ TRANSACTION = 0,
+ READ,
+ WRITE,
+ GET_ATTR,
+ GET_ATTRS,
+ STAT,
+ OMAP_GET_VALUES,
+ OMAP_LIST,
+ MAX
+};
+
+class SeastoreCollection final : public FuturizedCollection {
+public:
+ template <typename... T>
+ SeastoreCollection(T&&... args) :
+ FuturizedCollection(std::forward<T>(args)...) {}
+
+ seastar::shared_mutex ordering_lock;
+};
+
+/**
+ * col_obj_ranges_t
+ *
+ * Represents the two ghobject_t ranges spanned by a PG collection.
+ * Temp objects will be within [temp_begin, temp_end) and normal objects
+ * will be in [obj_begin, obj_end).
+ */
+struct col_obj_ranges_t {
+ ghobject_t temp_begin;
+ ghobject_t temp_end;
+ ghobject_t obj_begin;
+ ghobject_t obj_end;
+};
+
+class SeaStore final : public FuturizedStore {
+public:
+ class MDStore {
+ public:
+ using base_iertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+
+ using write_meta_ertr = base_iertr;
+ using write_meta_ret = write_meta_ertr::future<>;
+ virtual write_meta_ret write_meta(
+ const std::string &key,
+ const std::string &val
+ ) = 0;
+
+ using read_meta_ertr = base_iertr;
+ using read_meta_ret = read_meta_ertr::future<std::optional<std::string>>;
+ virtual read_meta_ret read_meta(const std::string &key) = 0;
+
+ virtual ~MDStore() {}
+ };
+ using MDStoreRef = std::unique_ptr<MDStore>;
+
+ class Shard : public FuturizedStore::Shard {
+ public:
+ Shard(
+ std::string root,
+ Device* device,
+ bool is_test);
+ ~Shard() = default;
+
+ seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+
+ read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+
+ get_attr_errorator::future<ceph::bufferlist> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>;
+ using omap_get_values_ret_t = read_errorator::future<
+ omap_get_values_ret_bare_t>;
+ omap_get_values_ret_t omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
+
+ get_attr_errorator::future<bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+
+ seastar::future<> do_transaction_no_callbacks(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn) final;
+
+ /* Note, flush() machinery must go through the same pipeline
+ * stages and locks as do_transaction. */
+ seastar::future<> flush(CollectionRef ch) final;
+
+ read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) final;
+
+ unsigned get_max_attr_name_length() const final {
+ return 256;
+ }
+
+ // only exposed to SeaStore
+ public:
+ seastar::future<> umount();
+ // init managers and mount transaction_manager
+ seastar::future<> mount_managers();
+
+ void set_secondaries(Device& sec_dev) {
+ secondaries.emplace_back(&sec_dev);
+ }
+
+ using coll_core_t = FuturizedStore::coll_core_t;
+ seastar::future<std::vector<coll_core_t>> list_collections();
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value);
+
+ store_statfs_t stat() const;
+
+ uuid_d get_fsid() const;
+
+ seastar::future<> mkfs_managers();
+
+ void init_managers();
+
+ private:
+ struct internal_context_t {
+ CollectionRef ch;
+ ceph::os::Transaction ext_transaction;
+
+ internal_context_t(
+ CollectionRef ch,
+ ceph::os::Transaction &&_ext_transaction,
+ TransactionRef &&transaction)
+ : ch(ch), ext_transaction(std::move(_ext_transaction)),
+ transaction(std::move(transaction)),
+ iter(ext_transaction.begin()) {}
+
+ TransactionRef transaction;
+
+ ceph::os::Transaction::iterator iter;
+ std::chrono::steady_clock::time_point begin_timestamp = std::chrono::steady_clock::now();
+
+ void reset_preserve_handle(TransactionManager &tm) {
+ tm.reset_transaction_preserve_handle(*transaction);
+ iter = ext_transaction.begin();
+ }
+ };
+
+ TransactionManager::read_extent_iertr::future<std::optional<unsigned>>
+ get_coll_bits(CollectionRef ch, Transaction &t) const;
+
+ static void on_error(ceph::os::Transaction &t);
+
+ template <typename F>
+ auto repeat_with_internal_context(
+ CollectionRef ch,
+ ceph::os::Transaction &&t,
+ Transaction::src_t src,
+ const char* tname,
+ op_type_t op_type,
+ F &&f) {
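+ // take the collection ordering lock, throttle concurrent transactions,
+ // and retry the body on conflict (eagain); any other error aborts via
+ // on_error()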
+ return seastar::do_with(
+ internal_context_t(
+ ch, std::move(t),
+ transaction_manager->create_transaction(src, tname)),
+ std::forward<F>(f),
+ [this, op_type](auto &ctx, auto &f) {
+ return ctx.transaction->get_handle().take_collection_lock(
+ static_cast<SeastoreCollection&>(*(ctx.ch)).ordering_lock
+ ).then([this] {
+ return throttler.get(1);
+ }).then([&, this] {
+ return repeat_eagain([&, this] {
+ ctx.reset_preserve_handle(*transaction_manager);
+ return std::invoke(f, ctx);
+ }).handle_error(
+ crimson::ct_error::eagain::pass_further{},
+ crimson::ct_error::all_same_way([&ctx](auto e) {
+ on_error(ctx.ext_transaction);
+ })
+ );
+ }).then([this, op_type, &ctx] {
+ add_latency_sample(op_type,
+ std::chrono::steady_clock::now() - ctx.begin_timestamp);
+ }).finally([this] {
+ throttler.put();
+ });
+ });
+ }
+
+ template <typename Ret, typename F>
+ auto repeat_with_onode(
+ CollectionRef ch,
+ const ghobject_t &oid,
+ Transaction::src_t src,
+ const char* tname,
+ op_type_t op_type,
+ F &&f) const {
+ auto begin_time = std::chrono::steady_clock::now();
+ return seastar::do_with(
+ oid, Ret{}, std::forward<F>(f),
+ [this, src, op_type, begin_time, tname
+ ](auto &oid, auto &ret, auto &f)
+ {
+ return repeat_eagain([&, this, src, tname] {
+ return transaction_manager->with_transaction_intr(
+ src,
+ tname,
+ [&, this](auto& t)
+ {
+ return onode_manager->get_onode(t, oid
+ ).si_then([&](auto onode) {
+ return seastar::do_with(std::move(onode), [&](auto& onode) {
+ return f(t, *onode);
+ });
+ }).si_then([&ret](auto _ret) {
+ ret = _ret;
+ });
+ });
+ }).safe_then([&ret, op_type, begin_time, this] {
+ const_cast<Shard*>(this)->add_latency_sample(op_type,
+ std::chrono::steady_clock::now() - begin_time);
+ return seastar::make_ready_future<Ret>(ret);
+ });
+ });
+ }
+
+ using _fiemap_ret = ObjectDataHandler::fiemap_ret;
+ _fiemap_ret _fiemap(
+ Transaction &t,
+ Onode &onode,
+ uint64_t off,
+ uint64_t len) const;
+
+ using _omap_get_value_iertr = OMapManager::base_iertr::extend<
+ crimson::ct_error::enodata
+ >;
+ using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>;
+ _omap_get_value_ret _omap_get_value(
+ Transaction &t,
+ omap_root_t &&root,
+ std::string_view key) const;
+
+ using _omap_get_values_iertr = OMapManager::base_iertr;
+ using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>;
+ _omap_get_values_ret _omap_get_values(
+ Transaction &t,
+ omap_root_t &&root,
+ const omap_keys_t &keys) const;
+
+ friend class SeaStoreOmapIterator;
+
+ using omap_list_bare_ret = OMapManager::omap_list_bare_ret;
+ using omap_list_ret = OMapManager::omap_list_ret;
+ omap_list_ret omap_list(
+ Onode &onode,
+ const omap_root_le_t& omap_root,
+ Transaction& t,
+ const std::optional<std::string>& start,
+ OMapManager::omap_list_config_t config) const;
+
+ using tm_iertr = TransactionManager::base_iertr;
+ using tm_ret = tm_iertr::future<>;
+ tm_ret _do_transaction_step(
+ internal_context_t &ctx,
+ CollectionRef &col,
+ std::vector<OnodeRef> &onodes,
+ std::vector<OnodeRef> &d_onodes,
+ ceph::os::Transaction::iterator &i);
+
+ tm_ret _remove(
+ internal_context_t &ctx,
+ OnodeRef &onode);
+ tm_ret _touch(
+ internal_context_t &ctx,
+ OnodeRef &onode);
+ tm_ret _write(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ uint64_t offset, size_t len,
+ ceph::bufferlist &&bl,
+ uint32_t fadvise_flags);
+ tm_ret _clone(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ OnodeRef &d_onode);
+ tm_ret _zero(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ objaddr_t offset, extent_len_t len);
+ tm_ret _omap_set_values(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::map<std::string, ceph::bufferlist> &&aset);
+ tm_ret _omap_set_header(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ ceph::bufferlist &&header);
+ tm_ret _omap_clear(
+ internal_context_t &ctx,
+ OnodeRef &onode);
+ tm_ret _omap_rmkeys(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ omap_keys_t &&aset);
+ tm_ret _omap_rmkeyrange(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::string first,
+ std::string last);
+ tm_ret _truncate(
+ internal_context_t &ctx,
+ OnodeRef &onode, uint64_t size);
+ tm_ret _setattrs(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::map<std::string,bufferlist>&& aset);
+ tm_ret _rmattr(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::string name);
+ tm_ret _rmattrs(
+ internal_context_t &ctx,
+ OnodeRef &onode);
+ tm_ret _xattr_rmattr(
+ internal_context_t &ctx,
+ OnodeRef &onode,
+ std::string &&name);
+ tm_ret _xattr_clear(
+ internal_context_t &ctx,
+ OnodeRef &onode);
+ tm_ret _create_collection(
+ internal_context_t &ctx,
+ const coll_t& cid, int bits);
+ tm_ret _remove_collection(
+ internal_context_t &ctx,
+ const coll_t& cid);
+ using omap_set_kvs_ret = tm_iertr::future<>;
+ omap_set_kvs_ret _omap_set_kvs(
+ OnodeRef &onode,
+ const omap_root_le_t& omap_root,
+ Transaction& t,
+ omap_root_le_t& mutable_omap_root,
+ std::map<std::string, ceph::bufferlist>&& kvs);
+
+ boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid);
+
+ static constexpr auto LAT_MAX = static_cast<std::size_t>(op_type_t::MAX);
+
+ struct {
+ std::array<seastar::metrics::histogram, LAT_MAX> op_lat;
+ } stats;
+
+ seastar::metrics::histogram& get_latency(
+ op_type_t op_type) {
+ assert(static_cast<std::size_t>(op_type) < stats.op_lat.size());
+ return stats.op_lat[static_cast<std::size_t>(op_type)];
+ }
+
+ void add_latency_sample(op_type_t op_type,
+ std::chrono::steady_clock::duration dur) {
+ seastar::metrics::histogram& lat = get_latency(op_type);
+ lat.sample_count++;
+ lat.sample_sum += std::chrono::duration_cast<std::chrono::milliseconds>(dur).count();
+ }
+
+ private:
+ std::string root;
+ Device* device;
+ const uint32_t max_object_size;
+ bool is_test;
+
+ std::vector<Device*> secondaries;
+ TransactionManagerRef transaction_manager;
+ CollectionManagerRef collection_manager;
+ OnodeManagerRef onode_manager;
+
+ common::Throttle throttler;
+
+ seastar::metrics::metric_group metrics;
+ void register_metrics();
+ };
+
+public:
+ SeaStore(
+ const std::string& root,
+ MDStoreRef mdstore);
+ ~SeaStore();
+
+ seastar::future<> start() final;
+ seastar::future<> stop() final;
+
+ mount_ertr::future<> mount() final;
+ seastar::future<> umount() final;
+
+ mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
+ seastar::future<store_statfs_t> stat() const final;
+
+ uuid_d get_fsid() const final {
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.local().get_fsid();
+ }
+
+ seastar::future<> write_meta(
+ const std::string& key,
+ const std::string& value) final {
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return shard_stores.local().write_meta(
+ key, value).then([this, key, value] {
+ return mdstore->write_meta(key, value);
+ }).handle_error(
+ crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
+ );
+ }
+
+ seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
+
+ seastar::future<std::vector<coll_core_t>> list_collections() final;
+
+ FuturizedStore::Shard& get_sharded_store() final {
+ return shard_stores.local();
+ }
+
+ static col_obj_ranges_t
+ get_objs_range(CollectionRef ch, unsigned bits);
+
+// for test
+public:
+ mount_ertr::future<> test_mount();
+ mkfs_ertr::future<> test_mkfs(uuid_d new_osd_fsid);
+
+ DeviceRef get_primary_device_ref() {
+ return std::move(device);
+ }
+
+ seastar::future<> test_start(DeviceRef dev);
+
+private:
+ seastar::future<> write_fsid(uuid_d new_osd_fsid);
+
+ seastar::future<> prepare_meta(uuid_d new_osd_fsid);
+
+ seastar::future<> set_secondaries();
+
+private:
+ std::string root;
+ MDStoreRef mdstore;
+ DeviceRef device;
+ std::vector<DeviceRef> secondaries;
+ seastar::sharded<SeaStore::Shard> shard_stores;
+};
+
+std::unique_ptr<SeaStore> make_seastore(
+ const std::string &device);
+
+std::unique_ptr<SeaStore> make_test_seastore(
+ SeaStore::MDStoreRef mdstore);
+}
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
new file mode 100644
index 000000000..0acfdb74e
--- /dev/null
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -0,0 +1,874 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/common/log.h"
+
+namespace {
+
+seastar::logger& journal_logger() {
+ return crimson::get_logger(ceph_subsys_seastore_journal);
+}
+
+}
+
+namespace crimson::os::seastore {
+
+bool is_aligned(uint64_t offset, uint64_t alignment)
+{
+ return (offset % alignment) == 0;
+}
+
+std::ostream& operator<<(std::ostream &out, const omap_root_t &root)
+{
+ return out << "omap_root{addr=" << root.addr
+ << ", depth=" << root.depth
+ << ", hint=" << root.hint
+ << ", mutated=" << root.mutated
+ << "}";
+}
+
+std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta)
+{
+ return out << meta.seastore_id;
+}
+
+std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id)
+{
+ auto _id = id.id;
+ if (_id == DEVICE_ID_NULL) {
+ return out << "Dev(NULL)";
+ } else if (_id == DEVICE_ID_RECORD_RELATIVE) {
+ return out << "Dev(RR)";
+ } else if (_id == DEVICE_ID_BLOCK_RELATIVE) {
+ return out << "Dev(BR)";
+ } else if (_id == DEVICE_ID_DELAYED) {
+ return out << "Dev(DELAYED)";
+ } else if (_id == DEVICE_ID_FAKE) {
+ return out << "Dev(FAKE)";
+ } else if (_id == DEVICE_ID_ZERO) {
+ return out << "Dev(ZERO)";
+ } else if (_id == DEVICE_ID_ROOT) {
+ return out << "Dev(ROOT)";
+ } else {
+ return out << "Dev(" << (unsigned)_id << ")";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const segment_id_t &segment)
+{
+ if (segment == NULL_SEG_ID) {
+ return out << "Seg[NULL]";
+ } else {
+ return out << "Seg[" << device_id_printer_t{segment.device_id()}
+ << "," << segment.device_segment_id()
+ << "]";
+ }
+}
+
+std::ostream& operator<<(std::ostream& out, segment_type_t t)
+{
+ switch(t) {
+ case segment_type_t::JOURNAL:
+ return out << "JOURNAL";
+ case segment_type_t::OOL:
+ return out << "OOL";
+ case segment_type_t::NULL_SEG:
+ return out << "NULL_SEG";
+ default:
+ return out << "INVALID_SEGMENT_TYPE!";
+ }
+}
+
+std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq)
+{
+ if (seq.seq == NULL_SEG_SEQ) {
+ return out << "sseq(NULL)";
+ } else {
+ return out << "sseq(" << seq.seq << ")";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr)
+{
+ if (pladdr.is_laddr()) {
+ return out << pladdr.get_laddr();
+ } else {
+ return out << pladdr.get_paddr();
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const paddr_t &rhs)
+{
+ auto id = rhs.get_device_id();
+ out << "paddr<";
+ if (rhs == P_ADDR_NULL) {
+ out << "NULL";
+ } else if (rhs == P_ADDR_MIN) {
+ out << "MIN";
+ } else if (rhs == P_ADDR_ZERO) {
+ out << "ZERO";
+ } else if (has_device_off(id)) {
+ auto &s = rhs.as_res_paddr();
+ out << device_id_printer_t{id}
+ << ","
+ << s.get_device_off();
+ } else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) {
+ auto &s = rhs.as_seg_paddr();
+ out << s.get_segment_id()
+ << ","
+ << s.get_segment_off();
+ } else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) {
+ auto &s = rhs.as_blk_paddr();
+ out << device_id_printer_t{s.get_device_id()}
+ << ","
+ << s.get_device_off();
+ } else {
+ out << "INVALID!";
+ }
+ return out << ">";
+}
+
+journal_seq_t journal_seq_t::add_offset(
+ journal_type_t type,
+ device_off_t off,
+ device_off_t roll_start,
+ device_off_t roll_size) const
+{
+ assert(offset.is_absolute());
+ assert(off <= DEVICE_OFF_MAX && off >= DEVICE_OFF_MIN);
+ assert(roll_start >= 0);
+ assert(roll_size > 0);
+
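+ // advance (or rewind) by off bytes: the device offset rolls within
+ // [roll_start, roll_start + roll_size) and the segment sequence is
+ // bumped (or decremented) on each wrap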
+ segment_seq_t jseq = segment_seq;
+ device_off_t joff;
+ if (type == journal_type_t::SEGMENTED) {
+ joff = offset.as_seg_paddr().get_segment_off();
+ } else {
+ assert(type == journal_type_t::RANDOM_BLOCK);
+ auto boff = offset.as_blk_paddr().get_device_off();
+ joff = boff;
+ }
+ auto roll_end = roll_start + roll_size;
+ assert(joff >= roll_start);
+ assert(joff <= roll_end);
+
+ if (off >= 0) {
+ device_off_t new_jseq = jseq + (off / roll_size);
+ joff += (off % roll_size);
+ if (joff >= roll_end) {
+ ++new_jseq;
+ joff -= roll_size;
+ }
+ assert(new_jseq < MAX_SEG_SEQ);
+ jseq = static_cast<segment_seq_t>(new_jseq);
+ } else {
+ device_off_t mod = (-off) / roll_size;
+ joff -= ((-off) % roll_size);
+ if (joff < roll_start) {
+ ++mod;
+ joff += roll_size;
+ }
+ if (jseq >= mod) {
+ jseq -= mod;
+ } else {
+ return JOURNAL_SEQ_MIN;
+ }
+ }
+ assert(joff >= roll_start);
+ assert(joff < roll_end);
+ return journal_seq_t{jseq, make_block_relative_paddr(joff)};
+}
+
+device_off_t journal_seq_t::relative_to(
+ journal_type_t type,
+ const journal_seq_t& r,
+ device_off_t roll_start,
+ device_off_t roll_size) const
+{
+ assert(offset.is_absolute());
+ assert(r.offset.is_absolute());
+ assert(roll_start >= 0);
+ assert(roll_size > 0);
+
+ device_off_t ret = static_cast<device_off_t>(segment_seq) - r.segment_seq;
+ ret *= roll_size;
+ if (type == journal_type_t::SEGMENTED) {
+ ret += (static_cast<device_off_t>(offset.as_seg_paddr().get_segment_off()) -
+ static_cast<device_off_t>(r.offset.as_seg_paddr().get_segment_off()));
+ } else {
+ assert(type == journal_type_t::RANDOM_BLOCK);
+ ret += offset.as_blk_paddr().get_device_off() -
+ r.offset.as_blk_paddr().get_device_off();
+ }
+ assert(ret <= DEVICE_OFF_MAX && ret >= DEVICE_OFF_MIN);
+ return ret;
+}
+
+std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq)
+{
+ if (seq == JOURNAL_SEQ_NULL) {
+ return out << "JOURNAL_SEQ_NULL";
+ } else if (seq == JOURNAL_SEQ_MIN) {
+ return out << "JOURNAL_SEQ_MIN";
+ } else {
+ return out << "jseq("
+ << segment_seq_printer_t{seq.segment_seq}
+ << ", " << seq.offset
+ << ")";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, extent_types_t t)
+{
+ switch (t) {
+ case extent_types_t::ROOT:
+ return out << "ROOT";
+ case extent_types_t::LADDR_INTERNAL:
+ return out << "LADDR_INTERNAL";
+ case extent_types_t::LADDR_LEAF:
+ return out << "LADDR_LEAF";
+ case extent_types_t::DINK_LADDR_LEAF:
+ return out << "LADDR_LEAF";
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ return out << "ONODE_BLOCK_STAGED";
+ case extent_types_t::OMAP_INNER:
+ return out << "OMAP_INNER";
+ case extent_types_t::OMAP_LEAF:
+ return out << "OMAP_LEAF";
+ case extent_types_t::COLL_BLOCK:
+ return out << "COLL_BLOCK";
+ case extent_types_t::OBJECT_DATA_BLOCK:
+ return out << "OBJECT_DATA_BLOCK";
+ case extent_types_t::RETIRED_PLACEHOLDER:
+ return out << "RETIRED_PLACEHOLDER";
+ case extent_types_t::TEST_BLOCK:
+ return out << "TEST_BLOCK";
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return out << "TEST_BLOCK_PHYSICAL";
+ case extent_types_t::BACKREF_INTERNAL:
+ return out << "BACKREF_INTERNAL";
+ case extent_types_t::BACKREF_LEAF:
+ return out << "BACKREF_LEAF";
+ case extent_types_t::NONE:
+ return out << "NONE";
+ default:
+ return out << "UNKNOWN";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, rewrite_gen_printer_t gen)
+{
+ if (gen.gen == NULL_GENERATION) {
+ return out << "GEN_NULL";
+ } else if (gen.gen == INIT_GENERATION) {
+ return out << "GEN_INIT";
+ } else if (gen.gen == INLINE_GENERATION) {
+ return out << "GEN_INL";
+ } else if (gen.gen == OOL_GENERATION) {
+ return out << "GEN_OOL";
+ } else if (gen.gen > REWRITE_GENERATIONS) {
+ return out << "GEN_INVALID(" << (unsigned)gen.gen << ")!";
+ } else {
+ return out << "GEN(" << (unsigned)gen.gen << ")";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, data_category_t c)
+{
+ switch (c) {
+ case data_category_t::METADATA:
+ return out << "MD";
+ case data_category_t::DATA:
+ return out << "DATA";
+ default:
+ return out << "INVALID_CATEGORY!";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, sea_time_point_printer_t tp)
+{
+ if (tp.tp == NULL_TIME) {
+ return out << "tp(NULL)";
+ }
+ auto time = seastar::lowres_system_clock::to_time_t(tp.tp);
+ char buf[32];
+ std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&time));
+ return out << "tp(" << buf << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, mod_time_point_printer_t tp) {
+ auto time = mod_to_timepoint(tp.tp);
+ return out << "mod_" << sea_time_point_printer_t{time};
+}
+
+std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs)
+{
+ bool first = false;
+ for (auto &i: rhs) {
+ out << (first ? ',' : '[') << '(' << i.first << ',' << i.second << ')';
+ first = true;
+ }
+ return out << ']';
+}
+std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs)
+{
+ bool first = false;
+ for (auto &i: rhs) {
+ out << (first ? ',' : '[') << '(' << i.first << ',' << i.second << ')';
+ first = true;
+ }
+ return out << ']';
+}
+
+std::ostream &operator<<(std::ostream &out, const delta_info_t &delta)
+{
+ return out << "delta_info_t("
+ << "type: " << delta.type
+ << ", paddr: " << delta.paddr
+ << ", laddr: " << delta.laddr
+ << ", prev_crc: " << delta.prev_crc
+ << ", final_crc: " << delta.final_crc
+ << ", length: " << delta.length
+ << ", pversion: " << delta.pversion
+ << ", ext_seq: " << delta.ext_seq
+ << ", seg_type: " << delta.seg_type
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const journal_tail_delta_t &delta)
+{
+ return out << "journal_tail_delta_t("
+ << "alloc_tail=" << delta.alloc_tail
+ << ", dirty_tail=" << delta.dirty_tail
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const extent_info_t &info)
+{
+ return out << "extent_info_t("
+ << "type: " << info.type
+ << ", addr: " << info.addr
+ << ", len: " << info.len
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
+{
+ return out << "segment_header_t("
+ << header.physical_segment_id
+ << " " << header.type
+ << " " << segment_seq_printer_t{header.segment_seq}
+ << " " << header.category
+ << " " << rewrite_gen_printer_t{header.generation}
+ << ", dirty_tail=" << header.dirty_tail
+ << ", alloc_tail=" << header.alloc_tail
+ << ", segment_nonce=" << header.segment_nonce
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail)
+{
+ return out << "segment_tail_t("
+ << tail.physical_segment_id
+ << " " << tail.type
+ << " " << segment_seq_printer_t{tail.segment_seq}
+ << ", segment_nonce=" << tail.segment_nonce
+ << ", modify_time=" << mod_time_point_printer_t{tail.modify_time}
+ << ", num_extents=" << tail.num_extents
+ << ")";
+}
+
+extent_len_t record_size_t::get_raw_mdlength() const
+{
+ // an empty record is allowed to be submitted
+ return plain_mdlength +
+ ceph::encoded_sizeof_bounded<record_header_t>();
+}
+
+void record_size_t::account_extent(extent_len_t extent_len)
+{
+ assert(extent_len);
+ plain_mdlength += ceph::encoded_sizeof_bounded<extent_info_t>();
+ dlength += extent_len;
+}
+
+void record_size_t::account(const delta_info_t& delta)
+{
+ assert(delta.bl.length());
+ plain_mdlength += ceph::encoded_sizeof(delta);
+}
+
+std::ostream &operator<<(std::ostream &os, transaction_type_t type)
+{
+ switch (type) {
+ case transaction_type_t::MUTATE:
+ return os << "MUTATE";
+ case transaction_type_t::READ:
+ return os << "READ";
+ case transaction_type_t::TRIM_DIRTY:
+ return os << "TRIM_DIRTY";
+ case transaction_type_t::TRIM_ALLOC:
+ return os << "TRIM_ALLOC";
+ case transaction_type_t::CLEANER_MAIN:
+ return os << "CLEANER_MAIN";
+ case transaction_type_t::CLEANER_COLD:
+ return os << "CLEANER_COLD";
+ case transaction_type_t::MAX:
+ return os << "TRANS_TYPE_NULL";
+ default:
+ return os << "INVALID_TRANS_TYPE("
+ << static_cast<std::size_t>(type)
+ << ")";
+ }
+}
+
+std::ostream &operator<<(std::ostream& out, const record_size_t& rsize)
+{
+ return out << "record_size_t("
+ << "raw_md=" << rsize.get_raw_mdlength()
+ << ", data=" << rsize.dlength
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream& out, const record_t& r)
+{
+ return out << "record_t("
+ << "type=" << r.type
+ << ", num_extents=" << r.extents.size()
+ << ", num_deltas=" << r.deltas.size()
+ << ", modify_time=" << sea_time_point_printer_t{r.modify_time}
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream& out, const record_header_t& r)
+{
+ return out << "record_header_t("
+ << "type=" << r.type
+ << ", num_extents=" << r.extents
+ << ", num_deltas=" << r.deltas
+ << ", modify_time=" << mod_time_point_printer_t{r.modify_time}
+ << ")";
+}
+
+std::ostream& operator<<(std::ostream& out, const record_group_header_t& h)
+{
+ return out << "record_group_header_t("
+ << "num_records=" << h.records
+ << ", mdlength=" << h.mdlength
+ << ", dlength=" << h.dlength
+ << ", nonce=" << h.segment_nonce
+ << ", committed_to=" << h.committed_to
+ << ", data_crc=" << h.data_crc
+ << ")";
+}
+
+extent_len_t record_group_size_t::get_raw_mdlength() const
+{
+ return plain_mdlength +
+ sizeof(checksum_t) +
+ ceph::encoded_sizeof_bounded<record_group_header_t>();
+}
+
+void record_group_size_t::account(
+ const record_size_t& rsize,
+ extent_len_t _block_size)
+{
+ // an empty record is allowed to be submitted
+ assert(_block_size > 0);
+ assert(rsize.dlength % _block_size == 0);
+ assert(block_size == 0 || block_size == _block_size);
+ plain_mdlength += rsize.get_raw_mdlength();
+ dlength += rsize.dlength;
+ block_size = _block_size;
+}
+
+std::ostream& operator<<(std::ostream& out, const record_group_size_t& size)
+{
+ return out << "record_group_size_t("
+ << "raw_md=" << size.get_raw_mdlength()
+ << ", data=" << size.dlength
+ << ", block_size=" << size.block_size
+ << ", fullness=" << size.get_fullness()
+ << ")";
+}
+
+std::ostream& operator<<(std::ostream& out, const record_group_t& rg)
+{
+ return out << "record_group_t("
+ << "num_records=" << rg.records.size()
+ << ", " << rg.size
+ << ")";
+}
+
+ceph::bufferlist encode_record(
+ record_t&& record,
+ extent_len_t block_size,
+ const journal_seq_t& committed_to,
+ segment_nonce_t current_segment_nonce)
+{
+ record_group_t record_group(std::move(record), block_size);
+ return encode_records(
+ record_group,
+ committed_to,
+ current_segment_nonce);
+}
+
+ceph::bufferlist encode_records(
+ record_group_t& record_group,
+ const journal_seq_t& committed_to,
+ segment_nonce_t current_segment_nonce)
+{
+ assert(record_group.size.block_size > 0);
+ assert(record_group.records.size() > 0);
+
+ bufferlist data_bl;
+ for (auto& r: record_group.records) {
+ for (auto& i: r.extents) {
+ assert(i.bl.length());
+ data_bl.append(i.bl);
+ }
+ }
+
+ bufferlist bl;
+ record_group_header_t header{
+ static_cast<extent_len_t>(record_group.records.size()),
+ record_group.size.get_mdlength(),
+ record_group.size.dlength,
+ current_segment_nonce,
+ committed_to,
+ data_bl.crc32c(-1)
+ };
+ encode(header, bl);
+
+ auto metadata_crc_filler = bl.append_hole(sizeof(checksum_t));
+
+ for (auto& r: record_group.records) {
+ record_header_t rheader{
+ r.type,
+ (extent_len_t)r.deltas.size(),
+ (extent_len_t)r.extents.size(),
+ timepoint_to_mod(r.modify_time)
+ };
+ encode(rheader, bl);
+ }
+ for (auto& r: record_group.records) {
+ for (const auto& i: r.extents) {
+ encode(extent_info_t(i), bl);
+ }
+ }
+ for (auto& r: record_group.records) {
+ for (const auto& i: r.deltas) {
+ encode(i, bl);
+ }
+ }
+ ceph_assert(bl.length() == record_group.size.get_raw_mdlength());
+
+ auto aligned_mdlength = record_group.size.get_mdlength();
+ if (bl.length() != aligned_mdlength) {
+ assert(bl.length() < aligned_mdlength);
+ bl.append_zero(aligned_mdlength - bl.length());
+ }
+
+ auto bliter = bl.cbegin();
+ auto metadata_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<record_group_header_t>(),
+ -1);
+ bliter += sizeof(checksum_t); /* metadata crc hole */
+ metadata_crc = bliter.crc32c(
+ bliter.get_remaining(),
+ metadata_crc);
+ ceph_le32 metadata_crc_le;
+ metadata_crc_le = metadata_crc;
+ metadata_crc_filler.copy_in(
+ sizeof(checksum_t),
+ reinterpret_cast<const char *>(&metadata_crc_le));
+
+ bl.claim_append(data_bl);
+ ceph_assert(bl.length() == record_group.size.get_encoded_length());
+
+ record_group.clear();
+ return bl;
+}
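+// Layout of the buffer produced above (summary for reference, assuming the
+// individual encoders keep their current bounded sizes):
+//   | record_group_header_t | metadata crc | record_header_t x records |
+//   | extent_info_t x total extents | delta_info_t x total deltas |
+//   | zero padding up to get_mdlength() | extent data (dlength) |
+// The metadata crc covers the whole metadata region except the crc hole
+// itself; the data crc stored in the header covers only the data region.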
+
+std::optional<record_group_header_t>
+try_decode_records_header(
+ const ceph::bufferlist& header_bl,
+ segment_nonce_t expected_nonce)
+{
+ auto bp = header_bl.cbegin();
+ record_group_header_t header;
+ try {
+ decode(header, bp);
+ } catch (ceph::buffer::error &e) {
+ journal_logger().debug(
+ "try_decode_records_header: failed, "
+ "cannot decode record_group_header_t, got {}.",
+ e.what());
+ return std::nullopt;
+ }
+ if (header.segment_nonce != expected_nonce) {
+ journal_logger().debug(
+ "try_decode_records_header: failed, record_group_header nonce mismatch, "
+ "read {}, expected {}!",
+ header.segment_nonce,
+ expected_nonce);
+ return std::nullopt;
+ }
+ return header;
+}
+
+bool validate_records_metadata(
+ const ceph::bufferlist& md_bl)
+{
+ auto bliter = md_bl.cbegin();
+ auto test_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<record_group_header_t>(),
+ -1);
+ ceph_le32 recorded_crc_le;
+ decode(recorded_crc_le, bliter);
+ uint32_t recorded_crc = recorded_crc_le;
+ test_crc = bliter.crc32c(
+ bliter.get_remaining(),
+ test_crc);
+ bool success = (test_crc == recorded_crc);
+ if (!success) {
+ journal_logger().debug(
+ "validate_records_metadata: failed, metadata crc mismatch.");
+ }
+ return success;
+}
+
+bool validate_records_data(
+ const record_group_header_t& header,
+ const ceph::bufferlist& data_bl)
+{
+ bool success = (data_bl.crc32c(-1) == header.data_crc);
+ if (!success) {
+ journal_logger().debug(
+ "validate_records_data: failed, data crc mismatch!");
+ }
+ return success;
+}
+
+std::optional<std::vector<record_header_t>>
+try_decode_record_headers(
+ const record_group_header_t& header,
+ const ceph::bufferlist& md_bl)
+{
+ auto bliter = md_bl.cbegin();
+ bliter += ceph::encoded_sizeof_bounded<record_group_header_t>();
+ bliter += sizeof(checksum_t); /* metadata crc hole */
+ std::vector<record_header_t> record_headers(header.records);
+ for (auto &&i: record_headers) {
+ try {
+ decode(i, bliter);
+ } catch (ceph::buffer::error &e) {
+ journal_logger().debug(
+ "try_decode_record_headers: failed, "
+ "cannot decode record_header_t, got {}.",
+ e.what());
+ return std::nullopt;
+ }
+ }
+ return record_headers;
+}
+
+std::optional<std::vector<record_extent_infos_t> >
+try_decode_extent_infos(
+ const record_group_header_t& header,
+ const ceph::bufferlist& md_bl)
+{
+ auto maybe_headers = try_decode_record_headers(header, md_bl);
+ if (!maybe_headers) {
+ return std::nullopt;
+ }
+
+ auto bliter = md_bl.cbegin();
+ bliter += ceph::encoded_sizeof_bounded<record_group_header_t>();
+ bliter += sizeof(checksum_t); /* metadata crc hole */
+ bliter += (ceph::encoded_sizeof_bounded<record_header_t>() *
+ maybe_headers->size());
+
+ std::vector<record_extent_infos_t> record_extent_infos(
+ maybe_headers->size());
+ auto result_iter = record_extent_infos.begin();
+ for (auto& h: *maybe_headers) {
+ result_iter->header = h;
+ result_iter->extent_infos.resize(h.extents);
+ for (auto& i: result_iter->extent_infos) {
+ try {
+ decode(i, bliter);
+ } catch (ceph::buffer::error &e) {
+ journal_logger().debug(
+ "try_decode_extent_infos: failed, "
+ "cannot decode extent_info_t, got {}.",
+ e.what());
+ return std::nullopt;
+ }
+ }
+ ++result_iter;
+ }
+ return record_extent_infos;
+}
+
+std::optional<std::vector<record_deltas_t> >
+try_decode_deltas(
+ const record_group_header_t& header,
+ const ceph::bufferlist& md_bl,
+ paddr_t record_block_base)
+{
+ auto maybe_record_extent_infos = try_decode_extent_infos(header, md_bl);
+ if (!maybe_record_extent_infos) {
+ return std::nullopt;
+ }
+
+ auto bliter = md_bl.cbegin();
+ bliter += ceph::encoded_sizeof_bounded<record_group_header_t>();
+ bliter += sizeof(checksum_t); /* metadata crc hole */
+ bliter += (ceph::encoded_sizeof_bounded<record_header_t>() *
+ maybe_record_extent_infos->size());
+ for (auto& r: *maybe_record_extent_infos) {
+ bliter += (ceph::encoded_sizeof_bounded<extent_info_t>() *
+ r.extent_infos.size());
+ }
+
+ std::vector<record_deltas_t> record_deltas(
+ maybe_record_extent_infos->size());
+ auto result_iter = record_deltas.begin();
+ for (auto& r: *maybe_record_extent_infos) {
+ result_iter->record_block_base = record_block_base;
+ result_iter->deltas.resize(r.header.deltas);
+ for (auto& i: result_iter->deltas) {
+ try {
+ decode(i.second, bliter);
+ i.first = mod_to_timepoint(r.header.modify_time);
+ } catch (ceph::buffer::error &e) {
+ journal_logger().debug(
+ "try_decode_deltas: failed, "
+ "cannot decode delta_info_t, got {}.",
+ e.what());
+ return std::nullopt;
+ }
+ }
+ for (auto& i: r.extent_infos) {
+ record_block_base = record_block_base.add_offset(i.len);
+ }
+ ++result_iter;
+ }
+ return record_deltas;
+}
+
+std::ostream& operator<<(std::ostream& out, placement_hint_t h)
+{
+ switch (h) {
+ case placement_hint_t::HOT:
+ return out << "Hint(HOT)";
+ case placement_hint_t::COLD:
+ return out << "Hint(COLD)";
+ case placement_hint_t::REWRITE:
+ return out << "Hint(REWRITE)";
+ case PLACEMENT_HINT_NULL:
+ return out << "Hint(NULL)";
+ default:
+ return out << "INVALID_PLACEMENT_HINT_TYPE!";
+ }
+}
+
+bool can_delay_allocation(device_type_t type) {
+ // Some device types (e.g. PMEM) may not support delayed allocation in the
+ // future; all currently supported device types do.
+ return true;
+}
+
+device_type_t string_to_device_type(std::string type) {
+ if (type == "HDD") {
+ return device_type_t::HDD;
+ }
+ if (type == "SSD") {
+ return device_type_t::SSD;
+ }
+ if (type == "ZBD") {
+ return device_type_t::ZBD;
+ }
+ if (type == "RANDOM_BLOCK_SSD") {
+ return device_type_t::RANDOM_BLOCK_SSD;
+ }
+ return device_type_t::NONE;
+}
+
+std::ostream& operator<<(std::ostream& out, device_type_t t)
+{
+ switch (t) {
+ case device_type_t::NONE:
+ return out << "NONE";
+ case device_type_t::HDD:
+ return out << "HDD";
+ case device_type_t::SSD:
+ return out << "SSD";
+ case device_type_t::ZBD:
+ return out << "ZBD";
+ case device_type_t::EPHEMERAL_COLD:
+ return out << "EPHEMERAL_COLD";
+ case device_type_t::EPHEMERAL_MAIN:
+ return out << "EPHEMERAL_MAIN";
+ case device_type_t::RANDOM_BLOCK_SSD:
+ return out << "RANDOM_BLOCK_SSD";
+ case device_type_t::RANDOM_BLOCK_EPHEMERAL:
+ return out << "RANDOM_BLOCK_EPHEMERAL";
+ default:
+ return out << "INVALID_DEVICE_TYPE!";
+ }
+}
+
+std::ostream& operator<<(std::ostream& out, backend_type_t btype) {
+ if (btype == backend_type_t::SEGMENTED) {
+ return out << "SEGMENTED";
+ } else {
+ return out << "RANDOM_BLOCK";
+ }
+}
+
+std::ostream& operator<<(std::ostream& out, const write_result_t& w)
+{
+ return out << "write_result_t("
+ << "start=" << w.start_seq
+ << ", length=" << w.length
+ << ")";
+}
+
+std::ostream& operator<<(std::ostream& out, const record_locator_t& l)
+{
+ return out << "record_locator_t("
+ << "block_base=" << l.record_block_base
+ << ", " << l.write_result
+ << ")";
+}
+
+void scan_valid_records_cursor::emplace_record_group(
+ const record_group_header_t& header, ceph::bufferlist&& md_bl)
+{
+ auto new_committed_to = header.committed_to;
+ ceph_assert(last_committed == JOURNAL_SEQ_NULL ||
+ last_committed <= new_committed_to);
+ last_committed = new_committed_to;
+ pending_record_groups.emplace_back(
+ seq.offset,
+ header,
+ std::move(md_bl));
+ increment_seq(header.dlength + header.mdlength);
+ ceph_assert(new_committed_to == JOURNAL_SEQ_NULL ||
+ new_committed_to < seq);
+}
+
+std::ostream& operator<<(std::ostream& out, const scan_valid_records_cursor& c)
+{
+ return out << "cursor(last_valid_header_found=" << c.last_valid_header_found
+ << ", seq=" << c.seq
+ << ", last_committed=" << c.last_committed
+ << ", pending_record_groups=" << c.pending_record_groups.size()
+ << ", num_consumed_records=" << c.num_consumed_records
+ << ")";
+}
+
+}
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
new file mode 100644
index 000000000..0b4ad8536
--- /dev/null
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -0,0 +1,2254 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <limits>
+#include <numeric>
+#include <optional>
+#include <iostream>
+#include <vector>
+#include <boost/core/ignore_unused.hpp>
+
+#include <seastar/core/lowres_clock.hh>
+
+#include "include/byteorder.h"
+#include "include/denc.h"
+#include "include/buffer.h"
+#include "include/intarith.h"
+#include "include/interval_set.h"
+#include "include/uuid.h"
+
+namespace crimson::os::seastore {
+
+/* the special xattr key "omap_header" is used to store the omap header */
+ const std::string OMAP_HEADER_XATTR_KEY = "omap_header";
+
+using transaction_id_t = uint64_t;
+constexpr transaction_id_t TRANS_ID_NULL = 0;
+
+/*
+ * Note: a NULL value is usually both the default and the max value.
+ */
+
+using depth_t = uint32_t;
+using depth_le_t = ceph_le32;
+
+inline depth_le_t init_depth_le(uint32_t i) {
+ return ceph_le32(i);
+}
+
+using checksum_t = uint32_t;
+
+// Immutable metadata for seastore to set at mkfs time
+struct seastore_meta_t {
+ uuid_d seastore_id;
+
+ DENC(seastore_meta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.seastore_id, p);
+ DENC_FINISH(p);
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta);
+
+bool is_aligned(uint64_t offset, uint64_t alignment);
+
+// identifies a specific physical device within seastore
+using device_id_t = uint8_t;
+
+constexpr auto DEVICE_ID_BITS = std::numeric_limits<device_id_t>::digits;
+
+constexpr device_id_t DEVICE_ID_MAX = std::numeric_limits<device_id_t>::max();
+constexpr device_id_t DEVICE_ID_NULL = DEVICE_ID_MAX;
+constexpr device_id_t DEVICE_ID_RECORD_RELATIVE = DEVICE_ID_MAX - 1;
+constexpr device_id_t DEVICE_ID_BLOCK_RELATIVE = DEVICE_ID_MAX - 2;
+constexpr device_id_t DEVICE_ID_DELAYED = DEVICE_ID_MAX - 3;
+// for tests which generate fake paddrs
+constexpr device_id_t DEVICE_ID_FAKE = DEVICE_ID_MAX - 4;
+constexpr device_id_t DEVICE_ID_ZERO = DEVICE_ID_MAX - 5;
+constexpr device_id_t DEVICE_ID_ROOT = DEVICE_ID_MAX - 6;
+constexpr device_id_t DEVICE_ID_MAX_VALID = DEVICE_ID_MAX - 7;
+constexpr device_id_t DEVICE_ID_MAX_VALID_SEGMENT = DEVICE_ID_MAX >> 1;
+constexpr device_id_t DEVICE_ID_SEGMENTED_MIN = 0;
+constexpr device_id_t DEVICE_ID_RANDOM_BLOCK_MIN =
+ 1 << (std::numeric_limits<device_id_t>::digits - 1);
+
+struct device_id_printer_t {
+ device_id_t id;
+};
+
+std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id);
+
+// 1 bit in paddr_t to identify the absolute physical address type
+enum class paddr_types_t {
+ SEGMENT = 0,
+ RANDOM_BLOCK = 1,
+ RESERVED = 2
+};
+
+constexpr paddr_types_t device_id_to_paddr_type(device_id_t id) {
+ if (id > DEVICE_ID_MAX_VALID) {
+ return paddr_types_t::RESERVED;
+ } else if ((id & 0x80) == 0) {
+ return paddr_types_t::SEGMENT;
+ } else {
+ return paddr_types_t::RANDOM_BLOCK;
+ }
+}
+
+constexpr bool has_device_off(device_id_t id) {
+ return id == DEVICE_ID_RECORD_RELATIVE ||
+ id == DEVICE_ID_BLOCK_RELATIVE ||
+ id == DEVICE_ID_DELAYED ||
+ id == DEVICE_ID_FAKE ||
+ id == DEVICE_ID_ROOT;
+}
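+// Summary of the resulting device-id space (derived from the constants and
+// helpers above, for reference only):
+//   [0x00, 0x7f]                 -> SEGMENT devices
+//   [0x80, DEVICE_ID_MAX_VALID]  -> RANDOM_BLOCK devices
+//   (DEVICE_ID_MAX_VALID, 0xff]  -> reserved ids (ROOT, ZERO, FAKE, DELAYED,
+//                                   BLOCK/RECORD_RELATIVE, NULL)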
+
+// internal segment id type of segment_id_t below, with the top
+// "DEVICE_ID_BITS" bits representing the device id of the segment.
+using internal_segment_id_t = uint32_t;
+constexpr auto SEGMENT_ID_BITS = std::numeric_limits<internal_segment_id_t>::digits;
+
+// segment ids without a device id encapsulated
+using device_segment_id_t = uint32_t;
+constexpr auto DEVICE_SEGMENT_ID_BITS = SEGMENT_ID_BITS - DEVICE_ID_BITS;
+constexpr device_segment_id_t DEVICE_SEGMENT_ID_MAX = (1 << DEVICE_SEGMENT_ID_BITS) - 1;
+
+// Identifies a segment location on disk, see SegmentManager.
+struct segment_id_t {
+public:
+ // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
+ segment_id_t()
+ : segment_id_t(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX) {}
+
+ segment_id_t(device_id_t id, device_segment_id_t _segment)
+ : segment_id_t(make_internal(id, _segment)) {}
+
+ segment_id_t(internal_segment_id_t _segment)
+ : segment(_segment) {
+ assert(device_id_to_paddr_type(device_id()) == paddr_types_t::SEGMENT);
+ }
+
+ [[gnu::always_inline]]
+ constexpr device_id_t device_id() const {
+ return static_cast<device_id_t>(segment >> DEVICE_SEGMENT_ID_BITS);
+ }
+
+ [[gnu::always_inline]]
+ constexpr device_segment_id_t device_segment_id() const {
+ constexpr internal_segment_id_t _SEGMENT_ID_MASK = (1u << DEVICE_SEGMENT_ID_BITS) - 1;
+ return segment & _SEGMENT_ID_MASK;
+ }
+
+ bool operator==(const segment_id_t& other) const {
+ return segment == other.segment;
+ }
+ bool operator!=(const segment_id_t& other) const {
+ return segment != other.segment;
+ }
+ bool operator<(const segment_id_t& other) const {
+ return segment < other.segment;
+ }
+ bool operator<=(const segment_id_t& other) const {
+ return segment <= other.segment;
+ }
+ bool operator>(const segment_id_t& other) const {
+ return segment > other.segment;
+ }
+ bool operator>=(const segment_id_t& other) const {
+ return segment >= other.segment;
+ }
+
+ DENC(segment_id_t, v, p) {
+ denc(v.segment, p);
+ }
+
+ static constexpr segment_id_t create_const(
+ device_id_t id, device_segment_id_t segment) {
+ return segment_id_t(id, segment, const_t{});
+ }
+
+private:
+ struct const_t {};
+ constexpr segment_id_t(device_id_t id, device_segment_id_t _segment, const_t)
+ : segment(make_internal(id, _segment)) {}
+
+ constexpr static inline internal_segment_id_t make_internal(
+ device_id_t d_id,
+ device_segment_id_t s_id) {
+ return static_cast<internal_segment_id_t>(s_id) |
+ (static_cast<internal_segment_id_t>(d_id) << DEVICE_SEGMENT_ID_BITS);
+ }
+
+ internal_segment_id_t segment;
+
+ friend struct segment_id_le_t;
+ friend struct paddr_t;
+};
+
+std::ostream &operator<<(std::ostream &out, const segment_id_t&);
+
+// ondisk type of segment_id_t
+struct __attribute((packed)) segment_id_le_t {
+ ceph_le32 segment = ceph_le32(segment_id_t().segment);
+
+ segment_id_le_t(const segment_id_t id) :
+ segment(ceph_le32(id.segment)) {}
+
+ operator segment_id_t() const {
+ return segment_id_t(segment);
+ }
+};
+
+constexpr segment_id_t MIN_SEG_ID = segment_id_t::create_const(0, 0);
+// segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
+constexpr segment_id_t MAX_SEG_ID =
+ segment_id_t::create_const(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX);
+constexpr segment_id_t NULL_SEG_ID = MAX_SEG_ID;
+
+/* Monotonically increasing segment seq, uniquely identifies
+ * the incarnation of a segment */
+using segment_seq_t = uint64_t;
+static constexpr segment_seq_t MAX_SEG_SEQ =
+ std::numeric_limits<segment_seq_t>::max();
+static constexpr segment_seq_t NULL_SEG_SEQ = MAX_SEG_SEQ;
+
+enum class segment_type_t : uint8_t {
+ JOURNAL = 0,
+ OOL,
+ NULL_SEG,
+};
+
+std::ostream& operator<<(std::ostream& out, segment_type_t t);
+
+struct segment_seq_printer_t {
+ segment_seq_t seq;
+};
+
+std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq);
+
+/**
+ * segment_map_t
+ *
+ * Compact templated mapping from a segment_id_t to a value type.
+ */
+template <typename T>
+class segment_map_t {
+public:
+ segment_map_t() {
+ // initialize the top-level vector with zero-length vectors to indicate
+ // that the corresponding devices have not been added yet
+ device_to_segments.resize(DEVICE_ID_MAX_VALID);
+ }
+ void add_device(device_id_t device, std::size_t segments, const T& init) {
+ ceph_assert(device <= DEVICE_ID_MAX_VALID);
+ ceph_assert(device_to_segments[device].size() == 0);
+ ceph_assert(segments > 0);
+ device_to_segments[device].resize(segments, init);
+ total_segments += segments;
+ }
+ void clear() {
+ device_to_segments.clear();
+ device_to_segments.resize(DEVICE_ID_MAX_VALID);
+ total_segments = 0;
+ }
+
+ T& operator[](segment_id_t id) {
+ assert(id.device_segment_id() < device_to_segments[id.device_id()].size());
+ return device_to_segments[id.device_id()][id.device_segment_id()];
+ }
+ const T& operator[](segment_id_t id) const {
+ assert(id.device_segment_id() < device_to_segments[id.device_id()].size());
+ return device_to_segments[id.device_id()][id.device_segment_id()];
+ }
+
+ bool contains(segment_id_t id) {
+ bool b = id.device_id() < device_to_segments.size();
+ if (!b) {
+ return b;
+ }
+ b = id.device_segment_id() < device_to_segments[id.device_id()].size();
+ return b;
+ }
+
+ auto begin() {
+ return iterator<false>::lower_bound(*this, 0, 0);
+ }
+ auto begin() const {
+ return iterator<true>::lower_bound(*this, 0, 0);
+ }
+
+ auto end() {
+ return iterator<false>::end_iterator(*this);
+ }
+ auto end() const {
+ return iterator<true>::end_iterator(*this);
+ }
+
+ auto device_begin(device_id_t id) {
+ auto ret = iterator<false>::lower_bound(*this, id, 0);
+ assert(ret->first.device_id() == id);
+ return ret;
+ }
+ auto device_end(device_id_t id) {
+ return iterator<false>::lower_bound(*this, id + 1, 0);
+ }
+
+ size_t size() const {
+ return total_segments;
+ }
+
+private:
+ template <bool is_const = false>
+ class iterator {
+ /// points at set being iterated over
+ std::conditional_t<
+ is_const,
+ const segment_map_t &,
+ segment_map_t &> parent;
+
+ /// points at current device, or DEVICE_ID_MAX_VALID if is_end()
+ device_id_t device_id;
+
+ /// segment at which we are pointing, 0 if is_end()
+ device_segment_id_t device_segment_id;
+
+ /// holds referent for operator* and operator-> when !is_end()
+ std::optional<
+ std::pair<
+ const segment_id_t,
+ std::conditional_t<is_const, const T&, T&>
+ >> current;
+
+ bool is_end() const {
+ return device_id == DEVICE_ID_MAX_VALID;
+ }
+
+ void find_valid() {
+ assert(!is_end());
+ auto &device_vec = parent.device_to_segments[device_id];
+ if (device_vec.size() == 0 ||
+ device_segment_id == device_vec.size()) {
+ while (++device_id < DEVICE_ID_MAX_VALID &&
+ parent.device_to_segments[device_id].size() == 0);
+ device_segment_id = 0;
+ }
+ if (is_end()) {
+ current = std::nullopt;
+ } else {
+ current.emplace(
+ segment_id_t{device_id, device_segment_id},
+ parent.device_to_segments[device_id][device_segment_id]
+ );
+ }
+ }
+
+ iterator(
+ decltype(parent) &parent,
+ device_id_t device_id,
+ device_segment_id_t device_segment_id)
+ : parent(parent), device_id(device_id),
+ device_segment_id(device_segment_id) {}
+
+ public:
+ static iterator lower_bound(
+ decltype(parent) &parent,
+ device_id_t device_id,
+ device_segment_id_t device_segment_id) {
+ if (device_id == DEVICE_ID_MAX_VALID) {
+ return end_iterator(parent);
+ } else {
+ auto ret = iterator{parent, device_id, device_segment_id};
+ ret.find_valid();
+ return ret;
+ }
+ }
+
+ static iterator end_iterator(
+ decltype(parent) &parent) {
+ return iterator{parent, DEVICE_ID_MAX_VALID, 0};
+ }
+
+ iterator<is_const>& operator++() {
+ assert(!is_end());
+ ++device_segment_id;
+ find_valid();
+ return *this;
+ }
+
+ bool operator==(iterator<is_const> rit) {
+ return (device_id == rit.device_id &&
+ device_segment_id == rit.device_segment_id);
+ }
+
+ bool operator!=(iterator<is_const> rit) {
+ return !(*this == rit);
+ }
+
+ template <bool c = is_const, std::enable_if_t<c, int> = 0>
+ const std::pair<const segment_id_t, const T&> *operator->() {
+ assert(!is_end());
+ return &*current;
+ }
+ template <bool c = is_const, std::enable_if_t<!c, int> = 0>
+ std::pair<const segment_id_t, T&> *operator->() {
+ assert(!is_end());
+ return &*current;
+ }
+
+ using reference = std::conditional_t<
+ is_const, const std::pair<const segment_id_t, const T&>&,
+ std::pair<const segment_id_t, T&>&>;
+ reference operator*() {
+ assert(!is_end());
+ return *current;
+ }
+ };
+
+ /**
+ * device_to_segments
+ *
+ * device -> segment -> T mapping. device_to_segments[d].size() > 0 iff
+ * device <d> has been added.
+ */
+ std::vector<std::vector<T>> device_to_segments;
+
+ /// total number of added segments
+ size_t total_segments = 0;
+};
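+// Minimal usage sketch (illustrative only; the value type and the numbers are
+// hypothetical):
+//   segment_map_t<std::size_t> free_bytes;
+//   free_bytes.add_device(0, 128, 0);       // device 0 with 128 segments
+//   free_bytes[segment_id_t{0, 5}] = 4096;  // index by segment_id_t
+//   for (auto [sid, bytes] : free_bytes) {} // iterates added devices only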
+
+/**
+ * paddr_t
+ *
+ * <segment, offset> offset on disk, see SegmentManager
+ *
+ * May be absolute, record_relative, or block_relative.
+ *
+ * Blocks get read independently of the surrounding record,
+ * so paddrs embedded directly within a block need to refer
+ * to other blocks within the same record by a block_relative
+ * addr relative to the block's own offset. By contrast,
+ * deltas to existing blocks need to use record_relative
+ * addrs relative to the first block of the record.
+ *
+ * Fresh extents during a transaction are referred to by
+ * record_relative paddrs.
+ */
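+// Illustrative example (not part of the header's contract): an intra-record
+// pointer stored as make_block_relative_paddr(0x200) is resolved at read time
+// against the absolute paddr of its record's first block via
+// rel.maybe_relative_to(record_base), which adds the 0x200 offset to the
+// base; record_relative addrs are resolved analogously with
+// add_record_relative().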
+
+using internal_paddr_t = uint64_t;
+constexpr auto PADDR_BITS = std::numeric_limits<internal_paddr_t>::digits;
+
+/**
+ * device_off_t
+ *
+ * Offset within a device, may be negative for relative offsets.
+ */
+using device_off_t = int64_t;
+using u_device_off_t = uint64_t;
+constexpr auto DEVICE_OFF_BITS = PADDR_BITS - DEVICE_ID_BITS;
+constexpr auto DEVICE_OFF_MAX =
+ std::numeric_limits<device_off_t>::max() >> DEVICE_ID_BITS;
+constexpr auto DEVICE_OFF_MIN = -(DEVICE_OFF_MAX + 1);
+
+/**
+ * segment_off_t
+ *
+ * Offset within a segment on disk, may be negative for relative offsets.
+ */
+using segment_off_t = int32_t;
+using u_segment_off_t = uint32_t;
+constexpr auto SEGMENT_OFF_MAX = std::numeric_limits<segment_off_t>::max();
+constexpr auto SEGMENT_OFF_MIN = std::numeric_limits<segment_off_t>::min();
+constexpr auto SEGMENT_OFF_BITS = std::numeric_limits<u_segment_off_t>::digits;
+static_assert(PADDR_BITS == SEGMENT_ID_BITS + SEGMENT_OFF_BITS);
+
+constexpr auto DEVICE_ID_MASK =
+ ((internal_paddr_t(1) << DEVICE_ID_BITS) - 1) << DEVICE_OFF_BITS;
+constexpr auto DEVICE_OFF_MASK =
+ std::numeric_limits<u_device_off_t>::max() >> DEVICE_ID_BITS;
+constexpr auto SEGMENT_ID_MASK =
+ ((internal_paddr_t(1) << SEGMENT_ID_BITS) - 1) << SEGMENT_OFF_BITS;
+constexpr auto SEGMENT_OFF_MASK =
+ (internal_paddr_t(1) << SEGMENT_OFF_BITS) - 1;
+
+constexpr internal_paddr_t encode_device_off(device_off_t off) {
+ return static_cast<internal_paddr_t>(off) & DEVICE_OFF_MASK;
+}
+
+constexpr device_off_t decode_device_off(internal_paddr_t addr) {
+ if (addr & (1ull << (DEVICE_OFF_BITS - 1))) {
+ return static_cast<device_off_t>(addr | DEVICE_ID_MASK);
+ } else {
+ return static_cast<device_off_t>(addr & DEVICE_OFF_MASK);
+ }
+}
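+// Example round trip (illustrative): with DEVICE_OFF_BITS == 56,
+// encode_device_off(-4096) keeps the low 56 bits of the two's-complement
+// value (0x00ff'ffff'ffff'f000); decode_device_off() sees bit 55 set and
+// sign-extends by OR-ing the device-id bits back in, recovering -4096.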
+
+struct seg_paddr_t;
+struct blk_paddr_t;
+struct res_paddr_t;
+struct pladdr_t;
+struct paddr_t {
+public:
+ // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
+ paddr_t() : paddr_t(DEVICE_ID_MAX, device_off_t(0)) {}
+
+ static paddr_t make_seg_paddr(
+ segment_id_t seg,
+ segment_off_t offset) {
+ return paddr_t(seg, offset);
+ }
+
+ static paddr_t make_seg_paddr(
+ device_id_t device,
+ device_segment_id_t seg,
+ segment_off_t offset) {
+ return paddr_t(segment_id_t(device, seg), offset);
+ }
+
+ static paddr_t make_blk_paddr(
+ device_id_t device,
+ device_off_t offset) {
+ assert(device_id_to_paddr_type(device) == paddr_types_t::RANDOM_BLOCK);
+ return paddr_t(device, offset);
+ }
+
+ static paddr_t make_res_paddr(
+ device_id_t device,
+ device_off_t offset) {
+ assert(device_id_to_paddr_type(device) == paddr_types_t::RESERVED);
+ return paddr_t(device, offset);
+ }
+
+ void swap(paddr_t &other) {
+ std::swap(internal_paddr, other.internal_paddr);
+ }
+
+ device_id_t get_device_id() const {
+ return static_cast<device_id_t>(internal_paddr >> DEVICE_OFF_BITS);
+ }
+
+ paddr_types_t get_addr_type() const {
+ return device_id_to_paddr_type(get_device_id());
+ }
+
+ paddr_t add_offset(device_off_t o) const;
+
+ paddr_t add_relative(paddr_t o) const;
+
+ paddr_t add_block_relative(paddr_t o) const {
+ // special version mainly for documentation purposes
+ assert(o.is_block_relative());
+ return add_relative(o);
+ }
+
+ paddr_t add_record_relative(paddr_t o) const {
+ // special version mainly for documentation purposes
+ assert(o.is_record_relative());
+ return add_relative(o);
+ }
+
+ /**
+ * maybe_relative_to
+ *
+ * Helper for the case where an in-memory paddr_t may be
+ * either block_relative or absolute (not record_relative).
+ *
+ * base must be either absolute or record_relative.
+ */
+ paddr_t maybe_relative_to(paddr_t base) const {
+ assert(!base.is_block_relative());
+ if (is_block_relative()) {
+ return base.add_block_relative(*this);
+ } else {
+ return *this;
+ }
+ }
+
+ /**
+ * block_relative_to
+ *
+ * Only defined for record_relative paddr_ts. Yields a
+ * block_relative address.
+ */
+ paddr_t block_relative_to(paddr_t rhs) const;
+
+ // To be compatible with laddr_t operator+
+ paddr_t operator+(device_off_t o) const {
+ return add_offset(o);
+ }
+
+ seg_paddr_t& as_seg_paddr();
+ const seg_paddr_t& as_seg_paddr() const;
+ blk_paddr_t& as_blk_paddr();
+ const blk_paddr_t& as_blk_paddr() const;
+ res_paddr_t& as_res_paddr();
+ const res_paddr_t& as_res_paddr() const;
+
+ bool is_delayed() const {
+ return get_device_id() == DEVICE_ID_DELAYED;
+ }
+ bool is_block_relative() const {
+ return get_device_id() == DEVICE_ID_BLOCK_RELATIVE;
+ }
+ bool is_record_relative() const {
+ return get_device_id() == DEVICE_ID_RECORD_RELATIVE;
+ }
+ bool is_relative() const {
+ return is_block_relative() || is_record_relative();
+ }
+ /// Denotes special null addr
+ bool is_null() const {
+ return get_device_id() == DEVICE_ID_NULL;
+ }
+ /// Denotes special zero addr
+ bool is_zero() const {
+ return get_device_id() == DEVICE_ID_ZERO;
+ }
+ /// Denotes the root addr
+ bool is_root() const {
+ return get_device_id() == DEVICE_ID_ROOT;
+ }
+
+ /**
+ * is_real
+ *
+ * indicates whether addr reflects a physical location, absolute, relative,
+ * or delayed. FAKE segments also count as real so as to reflect the way in
+ * which unit tests use them.
+ */
+ bool is_real() const {
+ return !is_zero() && !is_null() && !is_root();
+ }
+
+ bool is_absolute() const {
+ return get_addr_type() != paddr_types_t::RESERVED;
+ }
+
+ bool is_fake() const {
+ return get_device_id() == DEVICE_ID_FAKE;
+ }
+
+ auto operator<=>(const paddr_t &) const = default;
+
+ DENC(paddr_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.internal_paddr, p);
+ DENC_FINISH(p);
+ }
+
+ constexpr static paddr_t create_const(
+ device_id_t d_id, device_off_t offset) {
+ return paddr_t(d_id, offset, const_construct_t());
+ }
+
+protected:
+ internal_paddr_t internal_paddr;
+
+private:
+ // as seg
+ paddr_t(segment_id_t seg, segment_off_t offset)
+ : paddr_t((static_cast<internal_paddr_t>(seg.segment) << SEGMENT_OFF_BITS) |
+ static_cast<u_segment_off_t>(offset)) {}
+
+ // as blk or res
+ paddr_t(device_id_t d_id, device_off_t offset)
+ : paddr_t((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) |
+ encode_device_off(offset)) {
+ assert(offset >= DEVICE_OFF_MIN);
+ assert(offset <= DEVICE_OFF_MAX);
+ assert(get_addr_type() != paddr_types_t::SEGMENT);
+ }
+
+ paddr_t(internal_paddr_t val);
+
+ struct const_construct_t {};
+ constexpr paddr_t(device_id_t d_id, device_off_t offset, const_construct_t)
+ : internal_paddr((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) |
+ static_cast<u_device_off_t>(offset)) {}
+
+ friend struct paddr_le_t;
+ friend struct pladdr_le_t;
+
+};
+
+std::ostream &operator<<(std::ostream &out, const paddr_t &rhs);
+
+struct seg_paddr_t : public paddr_t {
+ seg_paddr_t(const seg_paddr_t&) = delete;
+ seg_paddr_t(seg_paddr_t&) = delete;
+ seg_paddr_t& operator=(const seg_paddr_t&) = delete;
+ seg_paddr_t& operator=(seg_paddr_t&) = delete;
+
+ segment_id_t get_segment_id() const {
+ return segment_id_t(static_cast<internal_segment_id_t>(
+ internal_paddr >> SEGMENT_OFF_BITS));
+ }
+
+ segment_off_t get_segment_off() const {
+ return segment_off_t(internal_paddr & SEGMENT_OFF_MASK);
+ }
+
+ void set_segment_off(segment_off_t off) {
+ assert(off >= 0);
+ internal_paddr = (internal_paddr & SEGMENT_ID_MASK);
+ internal_paddr |= static_cast<u_segment_off_t>(off);
+ }
+
+ paddr_t add_offset(device_off_t o) const {
+ device_off_t off = get_segment_off() + o;
+ assert(off >= 0);
+ assert(off <= SEGMENT_OFF_MAX);
+ return paddr_t::make_seg_paddr(
+ get_segment_id(), static_cast<segment_off_t>(off));
+ }
+};
+
+struct blk_paddr_t : public paddr_t {
+ blk_paddr_t(const blk_paddr_t&) = delete;
+ blk_paddr_t(blk_paddr_t&) = delete;
+ blk_paddr_t& operator=(const blk_paddr_t&) = delete;
+ blk_paddr_t& operator=(blk_paddr_t&) = delete;
+
+ device_off_t get_device_off() const {
+ return decode_device_off(internal_paddr);
+ }
+
+ void set_device_off(device_off_t off) {
+ assert(off >= 0);
+ assert(off <= DEVICE_OFF_MAX);
+ internal_paddr = (internal_paddr & DEVICE_ID_MASK);
+ internal_paddr |= encode_device_off(off);
+ }
+
+ paddr_t add_offset(device_off_t o) const {
+ assert(o >= DEVICE_OFF_MIN);
+ assert(o <= DEVICE_OFF_MAX);
+ auto off = get_device_off() + o;
+ return paddr_t::make_blk_paddr(get_device_id(), off);
+ }
+};
+
+struct res_paddr_t : public paddr_t {
+ res_paddr_t(const res_paddr_t&) = delete;
+ res_paddr_t(res_paddr_t&) = delete;
+ res_paddr_t& operator=(const res_paddr_t&) = delete;
+ res_paddr_t& operator=(res_paddr_t&) = delete;
+
+ device_off_t get_device_off() const {
+ return decode_device_off(internal_paddr);
+ }
+
+ void set_device_off(device_off_t off) {
+ assert(has_device_off(get_device_id()));
+ assert(off >= DEVICE_OFF_MIN);
+ assert(off <= DEVICE_OFF_MAX);
+ internal_paddr = (internal_paddr & DEVICE_ID_MASK);
+ internal_paddr |= encode_device_off(off);
+ }
+
+ paddr_t add_offset(device_off_t o) const {
+ assert(has_device_off(get_device_id()));
+ assert(o >= DEVICE_OFF_MIN);
+ assert(o <= DEVICE_OFF_MAX);
+ auto off = get_device_off() + o;
+ return paddr_t::make_res_paddr(get_device_id(), off);
+ }
+
+ paddr_t block_relative_to(const res_paddr_t &rhs) const {
+ assert(rhs.is_record_relative() && is_record_relative());
+ auto off = get_device_off() - rhs.get_device_off();
+ return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off);
+ }
+};
+
+constexpr paddr_t P_ADDR_MIN = paddr_t::create_const(0, 0);
+// P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
+constexpr paddr_t P_ADDR_MAX = paddr_t::create_const(DEVICE_ID_MAX, 0);
+constexpr paddr_t P_ADDR_NULL = P_ADDR_MAX;
+constexpr paddr_t P_ADDR_ZERO = paddr_t::create_const(DEVICE_ID_ZERO, 0);
+constexpr paddr_t P_ADDR_ROOT = paddr_t::create_const(DEVICE_ID_ROOT, 0);
+
+inline paddr_t make_record_relative_paddr(device_off_t off) {
+ return paddr_t::make_res_paddr(DEVICE_ID_RECORD_RELATIVE, off);
+}
+inline paddr_t make_block_relative_paddr(device_off_t off) {
+ return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off);
+}
+inline paddr_t make_fake_paddr(device_off_t off) {
+ return paddr_t::make_res_paddr(DEVICE_ID_FAKE, off);
+}
+inline paddr_t make_delayed_temp_paddr(device_off_t off) {
+ return paddr_t::make_res_paddr(DEVICE_ID_DELAYED, off);
+}
+
+inline const seg_paddr_t& paddr_t::as_seg_paddr() const {
+ assert(get_addr_type() == paddr_types_t::SEGMENT);
+ return *static_cast<const seg_paddr_t*>(this);
+}
+
+inline seg_paddr_t& paddr_t::as_seg_paddr() {
+ assert(get_addr_type() == paddr_types_t::SEGMENT);
+ return *static_cast<seg_paddr_t*>(this);
+}
+
+inline const blk_paddr_t& paddr_t::as_blk_paddr() const {
+ assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK);
+ return *static_cast<const blk_paddr_t*>(this);
+}
+
+inline blk_paddr_t& paddr_t::as_blk_paddr() {
+ assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK);
+ return *static_cast<blk_paddr_t*>(this);
+}
+
+inline const res_paddr_t& paddr_t::as_res_paddr() const {
+ assert(get_addr_type() == paddr_types_t::RESERVED);
+ return *static_cast<const res_paddr_t*>(this);
+}
+
+inline res_paddr_t& paddr_t::as_res_paddr() {
+ assert(get_addr_type() == paddr_types_t::RESERVED);
+ return *static_cast<res_paddr_t*>(this);
+}
+
+inline paddr_t::paddr_t(internal_paddr_t val) : internal_paddr(val) {
+#ifndef NDEBUG
+ auto type = get_addr_type();
+ if (type == paddr_types_t::SEGMENT) {
+ assert(as_seg_paddr().get_segment_off() >= 0);
+ } else if (type == paddr_types_t::RANDOM_BLOCK) {
+ assert(as_blk_paddr().get_device_off() >= 0);
+ } else {
+ assert(type == paddr_types_t::RESERVED);
+ if (!has_device_off(get_device_id())) {
+ assert(as_res_paddr().get_device_off() == 0);
+ }
+ }
+#endif
+}
+
+#define PADDR_OPERATION(a_type, base, func) \
+ if (get_addr_type() == a_type) { \
+ return static_cast<const base*>(this)->func; \
+ }
+
+inline paddr_t paddr_t::add_offset(device_off_t o) const {
+ PADDR_OPERATION(paddr_types_t::SEGMENT, seg_paddr_t, add_offset(o))
+ PADDR_OPERATION(paddr_types_t::RANDOM_BLOCK, blk_paddr_t, add_offset(o))
+ PADDR_OPERATION(paddr_types_t::RESERVED, res_paddr_t, add_offset(o))
+ ceph_assert(0 == "not supported type");
+ return P_ADDR_NULL;
+}
+
+inline paddr_t paddr_t::add_relative(paddr_t o) const {
+ assert(o.is_relative());
+ auto &res_o = o.as_res_paddr();
+ return add_offset(res_o.get_device_off());
+}
+
+inline paddr_t paddr_t::block_relative_to(paddr_t rhs) const {
+ return as_res_paddr().block_relative_to(rhs.as_res_paddr());
+}
+
+struct __attribute((packed)) paddr_le_t {
+ ceph_le64 internal_paddr =
+ ceph_le64(P_ADDR_NULL.internal_paddr);
+
+ using orig_type = paddr_t;
+
+ paddr_le_t() = default;
+ paddr_le_t(const paddr_t &addr) : internal_paddr(ceph_le64(addr.internal_paddr)) {}
+
+ operator paddr_t() const {
+ return paddr_t{internal_paddr};
+ }
+};
+
+using objaddr_t = uint32_t;
+constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max();
+constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX;
+
+enum class placement_hint_t {
+ HOT = 0, // The default user hint that expects mutations or retirement
+ COLD, // Expect no mutations and no retirement in the near future
+ REWRITE, // Hint for the internal rewrites
+ NUM_HINTS // Constant for number of hints or as NULL
+};
+
+constexpr auto PLACEMENT_HINT_NULL = placement_hint_t::NUM_HINTS;
+
+std::ostream& operator<<(std::ostream& out, placement_hint_t h);
+
+enum class device_type_t : uint8_t {
+ NONE = 0,
+ HDD,
+ SSD,
+ ZBD, // ZNS SSD or SMR HDD
+ EPHEMERAL_COLD,
+ EPHEMERAL_MAIN,
+ RANDOM_BLOCK_SSD,
+ RANDOM_BLOCK_EPHEMERAL,
+ NUM_TYPES
+};
+
+std::ostream& operator<<(std::ostream& out, device_type_t t);
+
+bool can_delay_allocation(device_type_t type);
+device_type_t string_to_device_type(std::string type);
+
+enum class backend_type_t {
+ SEGMENTED, // SegmentManager: SSD, ZBD, HDD
+ RANDOM_BLOCK // RBMDevice: RANDOM_BLOCK_SSD
+};
+
+std::ostream& operator<<(std::ostream& out, backend_type_t);
+using journal_type_t = backend_type_t;
+
+constexpr backend_type_t get_default_backend_of_device(device_type_t dtype) {
+ assert(dtype != device_type_t::NONE &&
+ dtype != device_type_t::NUM_TYPES);
+ if (dtype >= device_type_t::HDD &&
+ dtype <= device_type_t::EPHEMERAL_MAIN) {
+ return backend_type_t::SEGMENTED;
+ } else {
+ return backend_type_t::RANDOM_BLOCK;
+ }
+}
+
+/**
+ * Monotonically increasing identifier for the location of a
+ * journal_record.
+ */
+// JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
+struct journal_seq_t {
+ segment_seq_t segment_seq = NULL_SEG_SEQ;
+ paddr_t offset = P_ADDR_NULL;
+
+ void swap(journal_seq_t &other) {
+ std::swap(segment_seq, other.segment_seq);
+ std::swap(offset, other.offset);
+ }
+
+ // produces a pseudo journal_seq_t relative to this by offset
+ journal_seq_t add_offset(
+ journal_type_t type,
+ device_off_t off,
+ device_off_t roll_start,
+ device_off_t roll_size) const;
+
+ device_off_t relative_to(
+ journal_type_t type,
+ const journal_seq_t& r,
+ device_off_t roll_start,
+ device_off_t roll_size) const;
+
+ DENC(journal_seq_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment_seq, p);
+ denc(v.offset, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const journal_seq_t &o) const { return cmp(o) == 0; }
+ bool operator!=(const journal_seq_t &o) const { return cmp(o) != 0; }
+ bool operator<(const journal_seq_t &o) const { return cmp(o) < 0; }
+ bool operator<=(const journal_seq_t &o) const { return cmp(o) <= 0; }
+ bool operator>(const journal_seq_t &o) const { return cmp(o) > 0; }
+ bool operator>=(const journal_seq_t &o) const { return cmp(o) >= 0; }
+
+private:
+ int cmp(const journal_seq_t &other) const {
+ if (segment_seq > other.segment_seq) {
+ return 1;
+ } else if (segment_seq < other.segment_seq) {
+ return -1;
+ }
+ using ret_t = std::pair<device_off_t, segment_id_t>;
+ auto to_pair = [](const paddr_t &addr) -> ret_t {
+ if (addr.get_addr_type() == paddr_types_t::SEGMENT) {
+ auto &seg_addr = addr.as_seg_paddr();
+ return ret_t(seg_addr.get_segment_off(), seg_addr.get_segment_id());
+ } else if (addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK) {
+ auto &blk_addr = addr.as_blk_paddr();
+ return ret_t(blk_addr.get_device_off(), MAX_SEG_ID);
+ } else if (addr.get_addr_type() == paddr_types_t::RESERVED) {
+ auto &res_addr = addr.as_res_paddr();
+ return ret_t(res_addr.get_device_off(), MAX_SEG_ID);
+ } else {
+ assert(0 == "impossible");
+ return ret_t(0, MAX_SEG_ID);
+ }
+ };
+ auto left = to_pair(offset);
+ auto right = to_pair(other.offset);
+ if (left > right) {
+ return 1;
+ } else if (left < right) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+};
+
+std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq);
+
+constexpr journal_seq_t JOURNAL_SEQ_MIN{
+ 0,
+ P_ADDR_MIN
+};
+constexpr journal_seq_t JOURNAL_SEQ_MAX{
+ MAX_SEG_SEQ,
+ P_ADDR_MAX
+};
+// JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
+constexpr journal_seq_t JOURNAL_SEQ_NULL = JOURNAL_SEQ_MAX;
+
+// logical addr, see LBAManager, TransactionManager
+using laddr_t = uint64_t;
+constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min();
+constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max();
+constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX;
+constexpr laddr_t L_ADDR_ROOT = L_ADDR_MAX - 1;
+constexpr laddr_t L_ADDR_LBAT = L_ADDR_MAX - 2;
+
+struct __attribute((packed)) laddr_le_t {
+ ceph_le64 laddr = ceph_le64(L_ADDR_NULL);
+
+ using orig_type = laddr_t;
+
+ laddr_le_t() = default;
+ laddr_le_t(const laddr_le_t &) = default;
+ explicit laddr_le_t(const laddr_t &addr)
+ : laddr(ceph_le64(addr)) {}
+
+ operator laddr_t() const {
+ return laddr_t(laddr);
+ }
+ laddr_le_t& operator=(laddr_t addr) {
+ ceph_le64 val;
+ val = addr;
+ laddr = val;
+ return *this;
+ }
+};
+
+constexpr uint64_t PL_ADDR_NULL = std::numeric_limits<uint64_t>::max();
+
+struct pladdr_t {
+ std::variant<laddr_t, paddr_t> pladdr;
+
+ pladdr_t() = default;
+ pladdr_t(const pladdr_t &) = default;
+ pladdr_t(laddr_t laddr)
+ : pladdr(laddr) {}
+ pladdr_t(paddr_t paddr)
+ : pladdr(paddr) {}
+
+ bool is_laddr() const {
+ return pladdr.index() == 0;
+ }
+
+ bool is_paddr() const {
+ return pladdr.index() == 1;
+ }
+
+ pladdr_t& operator=(paddr_t paddr) {
+ pladdr = paddr;
+ return *this;
+ }
+
+ pladdr_t& operator=(laddr_t laddr) {
+ pladdr = laddr;
+ return *this;
+ }
+
+ bool operator==(const pladdr_t &) const = default;
+
+ paddr_t get_paddr() const {
+ assert(pladdr.index() == 1);
+ return paddr_t(std::get<1>(pladdr));
+ }
+
+ laddr_t get_laddr() const {
+ assert(pladdr.index() == 0);
+ return laddr_t(std::get<0>(pladdr));
+ }
+
+};
+
+std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr);
+
+enum class addr_type_t : uint8_t {
+ PADDR=0,
+ LADDR=1,
+ MAX=2 // or NONE
+};
+
+struct __attribute((packed)) pladdr_le_t {
+ ceph_le64 pladdr = ceph_le64(PL_ADDR_NULL);
+ addr_type_t addr_type = addr_type_t::MAX;
+
+ pladdr_le_t() = default;
+ pladdr_le_t(const pladdr_le_t &) = default;
+ explicit pladdr_le_t(const pladdr_t &addr)
+ : pladdr(
+ ceph_le64(
+ addr.is_laddr() ?
+ std::get<0>(addr.pladdr) :
+ std::get<1>(addr.pladdr).internal_paddr)),
+ addr_type(
+ addr.is_laddr() ?
+ addr_type_t::LADDR :
+ addr_type_t::PADDR)
+ {}
+
+ operator pladdr_t() const {
+ if (addr_type == addr_type_t::LADDR) {
+ return pladdr_t(laddr_t(pladdr));
+ } else {
+ assert(addr_type == addr_type_t::PADDR);
+ return pladdr_t(paddr_t(pladdr));
+ }
+ }
+};
+
+template <typename T>
+struct min_max_t {};
+
+template <>
+struct min_max_t<laddr_t> {
+ static constexpr laddr_t max = L_ADDR_MAX;
+ static constexpr laddr_t min = L_ADDR_MIN;
+ static constexpr laddr_t null = L_ADDR_NULL;
+};
+
+template <>
+struct min_max_t<paddr_t> {
+ static constexpr paddr_t max = P_ADDR_MAX;
+ static constexpr paddr_t min = P_ADDR_MIN;
+ static constexpr paddr_t null = P_ADDR_NULL;
+};
+
+// logical offset, see LBAManager, TransactionManager
+using extent_len_t = uint32_t;
+constexpr extent_len_t EXTENT_LEN_MAX =
+ std::numeric_limits<extent_len_t>::max();
+
+using extent_len_le_t = ceph_le32;
+inline extent_len_le_t init_extent_len_le(extent_len_t len) {
+ return ceph_le32(len);
+}
+
+struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> {
+ template <typename... T>
+ laddr_list_t(T&&... args)
+ : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {}
+};
+struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> {
+ template <typename... T>
+ paddr_list_t(T&&... args)
+ : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {}
+};
+
+std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs);
+std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs);
+
+/* identifies the type of extent, used for interpreting deltas and managing
+ * writeback.
+ *
+ * Note that any new extent type needs to be added to
+ * Cache::get_extent_by_type in cache.cc
+ */
+enum class extent_types_t : uint8_t {
+ ROOT = 0,
+ LADDR_INTERNAL = 1,
+ LADDR_LEAF = 2,
+ DINK_LADDR_LEAF = 3, // should only be used for unit tests
+ OMAP_INNER = 4,
+ OMAP_LEAF = 5,
+ ONODE_BLOCK_STAGED = 6,
+ COLL_BLOCK = 7,
+ OBJECT_DATA_BLOCK = 8,
+ RETIRED_PLACEHOLDER = 9,
+ // the following two types are not extent types,
+ // they are just used to indicate paddr allocation deltas
+ ALLOC_INFO = 10,
+ JOURNAL_TAIL = 11,
+ // Test Block Types
+ TEST_BLOCK = 12,
+ TEST_BLOCK_PHYSICAL = 13,
+ BACKREF_INTERNAL = 14,
+ BACKREF_LEAF = 15,
+ // None and the number of valid extent_types_t
+ NONE = 16,
+};
+using extent_types_le_t = uint8_t;
+constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
+
+constexpr size_t BACKREF_NODE_SIZE = 4096;
+
+std::ostream &operator<<(std::ostream &out, extent_types_t t);
+
+constexpr bool is_logical_type(extent_types_t type) {
+ switch (type) {
+ case extent_types_t::ROOT:
+ case extent_types_t::LADDR_INTERNAL:
+ case extent_types_t::LADDR_LEAF:
+ case extent_types_t::BACKREF_INTERNAL:
+ case extent_types_t::BACKREF_LEAF:
+ return false;
+ default:
+ return true;
+ }
+}
+
+constexpr bool is_retired_placeholder(extent_types_t type)
+{
+ return type == extent_types_t::RETIRED_PLACEHOLDER;
+}
+
+constexpr bool is_lba_node(extent_types_t type)
+{
+ return type == extent_types_t::LADDR_INTERNAL ||
+ type == extent_types_t::LADDR_LEAF ||
+ type == extent_types_t::DINK_LADDR_LEAF;
+}
+
+constexpr bool is_backref_node(extent_types_t type)
+{
+ return type == extent_types_t::BACKREF_INTERNAL ||
+ type == extent_types_t::BACKREF_LEAF;
+}
+
+constexpr bool is_lba_backref_node(extent_types_t type)
+{
+ return is_lba_node(type) || is_backref_node(type);
+}
+
+std::ostream &operator<<(std::ostream &out, extent_types_t t);
+
+/**
+ * rewrite_gen_t
+ *
+ * The goal is to group similarly aged extents into the same segment for a
+ * better bimodal utilization distribution, and also into the same device
+ * tier. For EPM,
+ * it has the flexibility to make placement decisions by re-assigning the
+ * generation. And each non-inline generation will be statically mapped to a
+ * writer in EPM.
+ *
+ * All the fresh and dirty extents start with INIT_GENERATION upon allocation,
+ * and they will be assigned to INLINE/OOL generation by EPM before the initial
+ * writes. After that, the generation can only be increased upon rewrite.
+ *
+ * Note, although EPM can re-assign the generations according to the tiering
+ * status, it cannot decrease the generation for the correctness of space
+ * reservation. It may choose to assign a larger generation if the extent is
+ * hinted cold, or if it wants to evict extents to the cold tier. And it may
+ * choose not to increase the generation if it wants to keep the hot tier as
+ * filled as possible.
+ */
+using rewrite_gen_t = uint8_t;
+
+// INIT_GENERATION requires an EPM decision to become INLINE/OOL_GENERATION
+constexpr rewrite_gen_t INIT_GENERATION = 0;
+constexpr rewrite_gen_t INLINE_GENERATION = 1; // to the journal
+constexpr rewrite_gen_t OOL_GENERATION = 2;
+
+// All the rewritten extents start with MIN_REWRITE_GENERATION
+constexpr rewrite_gen_t MIN_REWRITE_GENERATION = 3;
+// without a cold tier, the largest generation is less than MIN_COLD_GENERATION
+constexpr rewrite_gen_t MIN_COLD_GENERATION = 5;
+constexpr rewrite_gen_t MAX_REWRITE_GENERATION = 7;
+constexpr rewrite_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1;
+constexpr rewrite_gen_t NULL_GENERATION =
+ std::numeric_limits<rewrite_gen_t>::max();
+
+struct rewrite_gen_printer_t {
+ rewrite_gen_t gen;
+};
+
+std::ostream &operator<<(std::ostream &out, rewrite_gen_printer_t gen);
+
+constexpr std::size_t generation_to_writer(rewrite_gen_t gen) {
+ // caller to assert the gen is in the reasonable range
+ return gen - OOL_GENERATION;
+}
+
+// before EPM decision
+constexpr bool is_target_rewrite_generation(rewrite_gen_t gen) {
+ return gen == INIT_GENERATION ||
+ (gen >= MIN_REWRITE_GENERATION &&
+ gen <= REWRITE_GENERATIONS);
+}
+
+// after EPM decision
+constexpr bool is_rewrite_generation(rewrite_gen_t gen) {
+ return gen >= INLINE_GENERATION &&
+ gen < REWRITE_GENERATIONS;
+}
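+// Illustrative lifecycle (derived from the constants above): a fresh extent
+// starts at INIT_GENERATION(0), EPM assigns INLINE_GENERATION(1) (journal) or
+// OOL_GENERATION(2) before the first write, and each rewrite lands in
+// [MIN_REWRITE_GENERATION(3), MAX_REWRITE_GENERATION(7)]; generations at or
+// above MIN_COLD_GENERATION(5) are only used with a cold tier configured.
+// generation_to_writer() then maps each non-inline generation to a writer
+// index, e.g. OOL_GENERATION -> 0, MIN_REWRITE_GENERATION -> 1.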
+
+enum class data_category_t : uint8_t {
+ METADATA = 0,
+ DATA,
+ NUM
+};
+
+std::ostream &operator<<(std::ostream &out, data_category_t c);
+
+constexpr data_category_t get_extent_category(extent_types_t type) {
+ if (type == extent_types_t::OBJECT_DATA_BLOCK ||
+ type == extent_types_t::TEST_BLOCK) {
+ return data_category_t::DATA;
+ } else {
+ return data_category_t::METADATA;
+ }
+}
+
+// type for extent modification time, milliseconds since the epoch
+using sea_time_point = seastar::lowres_system_clock::time_point;
+using sea_duration = seastar::lowres_system_clock::duration;
+using mod_time_point_t = int64_t;
+
+constexpr mod_time_point_t
+timepoint_to_mod(const sea_time_point &t) {
+ return std::chrono::duration_cast<std::chrono::milliseconds>(
+ t.time_since_epoch()).count();
+}
+
+constexpr sea_time_point
+mod_to_timepoint(mod_time_point_t t) {
+ return sea_time_point(std::chrono::duration_cast<sea_duration>(
+ std::chrono::milliseconds(t)));
+}
+
+constexpr auto NULL_TIME = sea_time_point();
+constexpr auto NULL_MOD_TIME = timepoint_to_mod(NULL_TIME);
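+// Illustrative sketch: the two helpers above are expected to round-trip at
+// millisecond granularity, e.g.
+//   auto now = seastar::lowres_system_clock::now();
+//   mod_time_point_t m = timepoint_to_mod(now);  // ms since the epoch
+//   sea_time_point back = mod_to_timepoint(m);   // 'now' truncated to ms
+//   assert(timepoint_to_mod(back) == m);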
+
+struct sea_time_point_printer_t {
+ sea_time_point tp;
+};
+std::ostream &operator<<(std::ostream &out, sea_time_point_printer_t tp);
+
+struct mod_time_point_printer_t {
+ mod_time_point_t tp;
+};
+std::ostream &operator<<(std::ostream &out, mod_time_point_printer_t tp);
+
+constexpr sea_time_point
+get_average_time(const sea_time_point& t1, std::size_t n1,
+ const sea_time_point& t2, std::size_t n2) {
+ assert(t1 != NULL_TIME);
+ assert(t2 != NULL_TIME);
+ auto new_size = n1 + n2;
+ assert(new_size > 0);
+ auto c1 = t1.time_since_epoch().count();
+ auto c2 = t2.time_since_epoch().count();
+ auto c_ret = c1 / new_size * n1 + c2 / new_size * n2;
+ return sea_time_point(sea_duration(c_ret));
+}
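+// Illustrative sketch: get_average_time() computes a weighted average while
+// dividing before multiplying to avoid overflowing the tick counter. Assuming
+// millisecond ticks for simplicity, t1=1000ms with n1=3 and t2=2000ms with
+// n2=1 yield 1000/4*3 + 2000/4*1 = 750 + 500 = 1250ms (subject to the integer
+// truncation of each division).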
+
+/* description of a new physical extent */
+struct extent_t {
+ extent_types_t type; ///< type of extent
+ laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical)
+ ceph::bufferlist bl; ///< payload, bl.length() == length, aligned
+};
+
+using extent_version_t = uint32_t;
+
+/* description of a mutation to a physical extent */
+struct delta_info_t {
+ extent_types_t type = extent_types_t::NONE; ///< delta type
+ paddr_t paddr; ///< physical address
+ laddr_t laddr = L_ADDR_NULL; ///< logical address
+ uint32_t prev_crc = 0;
+ uint32_t final_crc = 0;
+ extent_len_t length = 0; ///< extent length
+ extent_version_t pversion; ///< prior version
+ segment_seq_t ext_seq; ///< seq of the extent's segment
+ segment_type_t seg_type;
+ ceph::bufferlist bl; ///< payload
+
+ DENC(delta_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.paddr, p);
+ denc(v.laddr, p);
+ denc(v.prev_crc, p);
+ denc(v.final_crc, p);
+ denc(v.length, p);
+ denc(v.pversion, p);
+ denc(v.ext_seq, p);
+ denc(v.seg_type, p);
+ denc(v.bl, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const delta_info_t &rhs) const {
+ return (
+ type == rhs.type &&
+ paddr == rhs.paddr &&
+ laddr == rhs.laddr &&
+ prev_crc == rhs.prev_crc &&
+ final_crc == rhs.final_crc &&
+ length == rhs.length &&
+ pversion == rhs.pversion &&
+ ext_seq == rhs.ext_seq &&
+ bl == rhs.bl
+ );
+ }
+};
+
+std::ostream &operator<<(std::ostream &out, const delta_info_t &delta);
+
+/* contains the latest journal tail information */
+struct journal_tail_delta_t {
+ journal_seq_t alloc_tail;
+ journal_seq_t dirty_tail;
+
+ DENC(journal_tail_delta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.alloc_tail, p);
+ denc(v.dirty_tail, p);
+ DENC_FINISH(p);
+ }
+};
+
+std::ostream &operator<<(std::ostream &out, const journal_tail_delta_t &delta);
+
+class object_data_t {
+ laddr_t reserved_data_base = L_ADDR_NULL;
+ extent_len_t reserved_data_len = 0;
+
+ bool dirty = false;
+public:
+ object_data_t(
+ laddr_t reserved_data_base,
+ extent_len_t reserved_data_len)
+ : reserved_data_base(reserved_data_base),
+ reserved_data_len(reserved_data_len) {}
+
+ laddr_t get_reserved_data_base() const {
+ return reserved_data_base;
+ }
+
+ extent_len_t get_reserved_data_len() const {
+ return reserved_data_len;
+ }
+
+ bool is_null() const {
+ return reserved_data_base == L_ADDR_NULL;
+ }
+
+ bool must_update() const {
+ return dirty;
+ }
+
+ void update_reserved(
+ laddr_t base,
+ extent_len_t len) {
+ dirty = true;
+ reserved_data_base = base;
+ reserved_data_len = len;
+ }
+
+ void update_len(
+ extent_len_t len) {
+ dirty = true;
+ reserved_data_len = len;
+ }
+
+ void clear() {
+ dirty = true;
+ reserved_data_base = L_ADDR_NULL;
+ reserved_data_len = 0;
+ }
+};
+
+struct __attribute__((packed)) object_data_le_t {
+ laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL);
+ extent_len_le_t reserved_data_len = init_extent_len_le(0);
+
+ void update(const object_data_t &nroot) {
+ reserved_data_base = nroot.get_reserved_data_base();
+ reserved_data_len = init_extent_len_le(nroot.get_reserved_data_len());
+ }
+
+ object_data_t get() const {
+ return object_data_t(
+ reserved_data_base,
+ reserved_data_len);
+ }
+};
+
+struct omap_root_t {
+ laddr_t addr = L_ADDR_NULL;
+ depth_t depth = 0;
+ laddr_t hint = L_ADDR_MIN;
+ bool mutated = false;
+
+ omap_root_t() = default;
+ omap_root_t(laddr_t addr, depth_t depth, laddr_t addr_min)
+ : addr(addr),
+ depth(depth),
+ hint(addr_min) {}
+
+ omap_root_t(const omap_root_t &o) = default;
+ omap_root_t(omap_root_t &&o) = default;
+ omap_root_t &operator=(const omap_root_t &o) = default;
+ omap_root_t &operator=(omap_root_t &&o) = default;
+
+ bool is_null() const {
+ return addr == L_ADDR_NULL;
+ }
+
+ bool must_update() const {
+ return mutated;
+ }
+
+ void update(laddr_t _addr, depth_t _depth, laddr_t _hint) {
+ mutated = true;
+ addr = _addr;
+ depth = _depth;
+ hint = _hint;
+ }
+
+ laddr_t get_location() const {
+ return addr;
+ }
+
+ depth_t get_depth() const {
+ return depth;
+ }
+
+ laddr_t get_hint() const {
+ return hint;
+ }
+};
+std::ostream &operator<<(std::ostream &out, const omap_root_t &root);
+
+class __attribute__((packed)) omap_root_le_t {
+ laddr_le_t addr = laddr_le_t(L_ADDR_NULL);
+ depth_le_t depth = init_depth_le(0);
+
+public:
+ omap_root_le_t() = default;
+
+ omap_root_le_t(laddr_t addr, depth_t depth)
+ : addr(addr), depth(init_depth_le(depth)) {}
+
+ omap_root_le_t(const omap_root_le_t &o) = default;
+ omap_root_le_t(omap_root_le_t &&o) = default;
+ omap_root_le_t &operator=(const omap_root_le_t &o) = default;
+ omap_root_le_t &operator=(omap_root_le_t &&o) = default;
+
+ void update(const omap_root_t &nroot) {
+ addr = nroot.get_location();
+ depth = init_depth_le(nroot.get_depth());
+ }
+
+ omap_root_t get(laddr_t hint) const {
+ return omap_root_t(addr, depth, hint);
+ }
+};
+
+/**
+ * phy_tree_root_t
+ */
+class __attribute__((packed)) phy_tree_root_t {
+ paddr_le_t root_addr;
+ depth_le_t depth = init_extent_len_le(0);
+
+public:
+ phy_tree_root_t() = default;
+
+ phy_tree_root_t(paddr_t addr, depth_t depth)
+ : root_addr(addr), depth(init_depth_le(depth)) {}
+
+ phy_tree_root_t(const phy_tree_root_t &o) = default;
+ phy_tree_root_t(phy_tree_root_t &&o) = default;
+ phy_tree_root_t &operator=(const phy_tree_root_t &o) = default;
+ phy_tree_root_t &operator=(phy_tree_root_t &&o) = default;
+
+ paddr_t get_location() const {
+ return root_addr;
+ }
+
+ void set_location(paddr_t location) {
+ root_addr = location;
+ }
+
+ depth_t get_depth() const {
+ return depth;
+ }
+
+ void set_depth(depth_t ndepth) {
+ depth = ndepth;
+ }
+
+ void adjust_addrs_from_base(paddr_t base) {
+ paddr_t _root_addr = root_addr;
+ if (_root_addr.is_relative()) {
+ root_addr = base.add_record_relative(_root_addr);
+ }
+ }
+};
+
+class coll_root_t {
+ laddr_t addr = L_ADDR_NULL;
+ extent_len_t size = 0;
+
+ bool mutated = false;
+
+public:
+ coll_root_t() = default;
+ coll_root_t(laddr_t addr, extent_len_t size) : addr(addr), size(size) {}
+
+ coll_root_t(const coll_root_t &o) = default;
+ coll_root_t(coll_root_t &&o) = default;
+ coll_root_t &operator=(const coll_root_t &o) = default;
+ coll_root_t &operator=(coll_root_t &&o) = default;
+
+ bool must_update() const {
+ return mutated;
+ }
+
+ void update(laddr_t _addr, extent_len_t _s) {
+ mutated = true;
+ addr = _addr;
+ size = _s;
+ }
+
+ laddr_t get_location() const {
+ return addr;
+ }
+
+ extent_len_t get_size() const {
+ return size;
+ }
+};
+
+/**
+ * coll_root_le_t
+ *
+ * Information for locating the CollectionManager data, to be embedded
+ * in the root block.
+ */
+class __attribute__((packed)) coll_root_le_t {
+ laddr_le_t addr;
+ extent_len_le_t size = init_extent_len_le(0);
+
+public:
+ coll_root_le_t() = default;
+
+ coll_root_le_t(laddr_t laddr, extent_len_t size)
+ : addr(laddr), size(init_extent_len_le(size)) {}
+
+
+ coll_root_le_t(const coll_root_le_t &o) = default;
+ coll_root_le_t(coll_root_le_t &&o) = default;
+ coll_root_le_t &operator=(const coll_root_le_t &o) = default;
+ coll_root_le_t &operator=(coll_root_le_t &&o) = default;
+
+ void update(const coll_root_t &nroot) {
+ addr = nroot.get_location();
+ size = init_extent_len_le(nroot.get_size());
+ }
+
+ coll_root_t get() const {
+ return coll_root_t(addr, size);
+ }
+};
+
+using lba_root_t = phy_tree_root_t;
+using backref_root_t = phy_tree_root_t;
+
+/**
+ * root_t
+ *
+ * Contains information required to find metadata roots.
+ * TODO: generalize this to permit more than one lba_manager implementation
+ */
+struct __attribute__((packed)) root_t {
+ using meta_t = std::map<std::string, std::string>;
+
+ static constexpr int MAX_META_LENGTH = 1024;
+
+ backref_root_t backref_root;
+ lba_root_t lba_root;
+ laddr_le_t onode_root;
+ coll_root_le_t collection_root;
+
+ char meta[MAX_META_LENGTH];
+
+ root_t() {
+ set_meta(meta_t{});
+ }
+
+ void adjust_addrs_from_base(paddr_t base) {
+ lba_root.adjust_addrs_from_base(base);
+ backref_root.adjust_addrs_from_base(base);
+ }
+
+ meta_t get_meta() {
+ bufferlist bl;
+ bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
+ meta_t ret;
+ auto iter = bl.cbegin();
+ decode(ret, iter);
+ return ret;
+ }
+
+ void set_meta(const meta_t &m) {
+ ceph::bufferlist bl;
+ encode(m, bl);
+ ceph_assert(bl.length() < MAX_META_LENGTH);
+ bl.rebuild();
+ auto &bptr = bl.front();
+ ::memset(meta, 0, MAX_META_LENGTH);
+ ::memcpy(meta, bptr.c_str(), bl.length());
+ }
+};
+
+struct alloc_blk_t {
+ alloc_blk_t(
+ paddr_t paddr,
+ laddr_t laddr,
+ extent_len_t len,
+ extent_types_t type)
+ : paddr(paddr), laddr(laddr), len(len), type(type)
+ {}
+
+ explicit alloc_blk_t() = default;
+
+ paddr_t paddr = P_ADDR_NULL;
+ laddr_t laddr = L_ADDR_NULL;
+ extent_len_t len = 0;
+ extent_types_t type = extent_types_t::ROOT;
+ DENC(alloc_blk_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.paddr, p);
+ denc(v.laddr, p);
+ denc(v.len, p);
+ denc(v.type, p);
+ DENC_FINISH(p);
+ }
+};
+
+// use absolute address
+struct alloc_delta_t {
+ enum class op_types_t : uint8_t {
+ NONE = 0,
+ SET = 1,
+ CLEAR = 2
+ };
+ std::vector<alloc_blk_t> alloc_blk_ranges;
+ op_types_t op = op_types_t::NONE;
+
+ alloc_delta_t() = default;
+
+ DENC(alloc_delta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.alloc_blk_ranges, p);
+ denc(v.op, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct extent_info_t {
+ extent_types_t type = extent_types_t::NONE;
+ laddr_t addr = L_ADDR_NULL;
+ extent_len_t len = 0;
+
+ extent_info_t() = default;
+ extent_info_t(const extent_t &et)
+ : type(et.type), addr(et.addr),
+ len(et.bl.length())
+ {}
+
+ DENC(extent_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.addr, p);
+ denc(v.len, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const extent_info_t &header);
+
+using segment_nonce_t = uint32_t;
+
+/**
+ * Segment header
+ *
+ * Every segment contains an encoded segment_header_t in its first block.
+ * Our strategy for finding the journal replay point is:
+ * 1) Find the segment with the highest journal_segment_seq
+ * 2) Get dirty_tail and alloc_tail from the segment header
+ * 3) Scan forward to update tails from journal_tail_delta_t
+ * 4) Replay from the latest tails
+ */
+struct segment_header_t {
+ segment_seq_t segment_seq;
+ segment_id_t physical_segment_id; // debugging
+
+ journal_seq_t dirty_tail;
+ journal_seq_t alloc_tail;
+ segment_nonce_t segment_nonce;
+
+ segment_type_t type;
+
+ data_category_t category;
+ rewrite_gen_t generation;
+
+ segment_type_t get_type() const {
+ return type;
+ }
+
+ DENC(segment_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment_seq, p);
+ denc(v.physical_segment_id, p);
+ denc(v.dirty_tail, p);
+ denc(v.alloc_tail, p);
+ denc(v.segment_nonce, p);
+ denc(v.type, p);
+ denc(v.category, p);
+ denc(v.generation, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const segment_header_t &header);
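+// Illustrative walk-through of the replay strategy documented above: suppose
+// the segment with the highest journal_segment_seq records dirty_tail=D0 and
+// alloc_tail=A0 in its header; scanning forward may find journal_tail_delta_t
+// entries that advance those tails to D1/A1, and replay then proceeds from the
+// updated tails.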
+
+struct segment_tail_t {
+ segment_seq_t segment_seq;
+ segment_id_t physical_segment_id; // debugging
+
+ segment_nonce_t segment_nonce;
+
+ segment_type_t type;
+
+ mod_time_point_t modify_time;
+ std::size_t num_extents;
+
+ segment_type_t get_type() const {
+ return type;
+ }
+
+ DENC(segment_tail_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment_seq, p);
+ denc(v.physical_segment_id, p);
+ denc(v.segment_nonce, p);
+ denc(v.type, p);
+ denc(v.modify_time, p);
+ denc(v.num_extents, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail);
+
+enum class transaction_type_t : uint8_t {
+ MUTATE = 0,
+ READ, // including weak and non-weak read transactions
+ TRIM_DIRTY,
+ TRIM_ALLOC,
+ CLEANER_MAIN,
+ CLEANER_COLD,
+ MAX
+};
+
+static constexpr auto TRANSACTION_TYPE_NULL = transaction_type_t::MAX;
+
+static constexpr auto TRANSACTION_TYPE_MAX = static_cast<std::size_t>(
+ transaction_type_t::MAX);
+
+std::ostream &operator<<(std::ostream &os, transaction_type_t type);
+
+constexpr bool is_valid_transaction(transaction_type_t type) {
+ return type < transaction_type_t::MAX;
+}
+
+constexpr bool is_background_transaction(transaction_type_t type) {
+ return (type >= transaction_type_t::TRIM_DIRTY &&
+ type < transaction_type_t::MAX);
+}
+
+constexpr bool is_trim_transaction(transaction_type_t type) {
+ return (type == transaction_type_t::TRIM_DIRTY ||
+ type == transaction_type_t::TRIM_ALLOC);
+}
+
+struct record_size_t {
+ extent_len_t plain_mdlength = 0; // mdlength without the record header
+ extent_len_t dlength = 0;
+
+ extent_len_t get_raw_mdlength() const;
+
+ bool is_empty() const {
+ return plain_mdlength == 0 &&
+ dlength == 0;
+ }
+
+ void account_extent(extent_len_t extent_len);
+
+ void account(const extent_t& extent) {
+ account_extent(extent.bl.length());
+ }
+
+ void account(const delta_info_t& delta);
+
+ bool operator==(const record_size_t &) const = default;
+};
+std::ostream &operator<<(std::ostream&, const record_size_t&);
+
+struct record_t {
+ transaction_type_t type = TRANSACTION_TYPE_NULL;
+ std::vector<extent_t> extents;
+ std::vector<delta_info_t> deltas;
+ record_size_t size;
+ sea_time_point modify_time = NULL_TIME;
+
+ record_t(transaction_type_t type) : type{type} { }
+
+ // unit test only
+ record_t() {
+ type = transaction_type_t::MUTATE;
+ }
+
+ // unit test only
+ record_t(std::vector<extent_t>&& _extents,
+ std::vector<delta_info_t>&& _deltas) {
+ auto modify_time = seastar::lowres_system_clock::now();
+ for (auto& e: _extents) {
+ push_back(std::move(e), modify_time);
+ }
+ for (auto& d: _deltas) {
+ push_back(std::move(d));
+ }
+ type = transaction_type_t::MUTATE;
+ }
+
+ bool is_empty() const {
+ return extents.size() == 0 &&
+ deltas.size() == 0;
+ }
+
+ std::size_t get_delta_size() const {
+ auto delta_size = std::accumulate(
+ deltas.begin(), deltas.end(), 0,
+ [](uint64_t sum, auto& delta) {
+ return sum + delta.bl.length();
+ }
+ );
+ return delta_size;
+ }
+
+ void push_back(extent_t&& extent, sea_time_point &t) {
+ ceph_assert(t != NULL_TIME);
+ if (extents.size() == 0) {
+ assert(modify_time == NULL_TIME);
+ modify_time = t;
+ } else {
+ modify_time = get_average_time(modify_time, extents.size(), t, 1);
+ }
+ size.account(extent);
+ extents.push_back(std::move(extent));
+ }
+
+ void push_back(delta_info_t&& delta) {
+ size.account(delta);
+ deltas.push_back(std::move(delta));
+ }
+};
+std::ostream &operator<<(std::ostream&, const record_t&);
+
+struct record_header_t {
+ transaction_type_t type;
+ uint32_t deltas; // number of deltas
+ uint32_t extents; // number of extents
+ mod_time_point_t modify_time;
+
+ DENC(record_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.deltas, p);
+ denc(v.extents, p);
+ denc(v.modify_time, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream &operator<<(std::ostream&, const record_header_t&);
+
+struct record_group_header_t {
+ uint32_t records;
+ extent_len_t mdlength; // block aligned, length of metadata
+ extent_len_t dlength; // block aligned, length of data
+ segment_nonce_t segment_nonce;// nonce of containing segment
+ journal_seq_t committed_to; // records prior to committed_to have been
+ // fully written, maybe in another segment.
+ checksum_t data_crc; // crc of data payload
+
+
+ DENC(record_group_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.records, p);
+ denc(v.mdlength, p);
+ denc(v.dlength, p);
+ denc(v.segment_nonce, p);
+ denc(v.committed_to, p);
+ denc(v.data_crc, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream& operator<<(std::ostream&, const record_group_header_t&);
+
+struct record_group_size_t {
+ extent_len_t plain_mdlength = 0; // mdlength without the group header
+ extent_len_t dlength = 0;
+ extent_len_t block_size = 0;
+
+ record_group_size_t() = default;
+ record_group_size_t(
+ const record_size_t& rsize,
+ extent_len_t block_size) {
+ account(rsize, block_size);
+ }
+
+ extent_len_t get_raw_mdlength() const;
+
+ extent_len_t get_mdlength() const {
+ assert(block_size > 0);
+ return p2roundup(get_raw_mdlength(), block_size);
+ }
+
+ extent_len_t get_encoded_length() const {
+ assert(block_size > 0);
+ assert(dlength % block_size == 0);
+ return get_mdlength() + dlength;
+ }
+
+ record_group_size_t get_encoded_length_after(
+ const record_size_t& rsize,
+ extent_len_t block_size) const {
+ record_group_size_t tmp = *this;
+ tmp.account(rsize, block_size);
+ return tmp;
+ }
+
+ double get_fullness() const {
+ assert(block_size > 0);
+ return ((double)(get_raw_mdlength() + dlength) /
+ get_encoded_length());
+ }
+
+ void account(const record_size_t& rsize,
+ extent_len_t block_size);
+
+ bool operator==(const record_group_size_t &) const = default;
+};
+std::ostream& operator<<(std::ostream&, const record_group_size_t&);
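+// Illustrative sketch: with block_size=4096, get_mdlength() rounds the raw
+// metadata length up to a block boundary via p2roundup(), so a group whose raw
+// mdlength is 5000 and whose dlength is 8192 would report get_mdlength()=8192,
+// get_encoded_length()=16384 and get_fullness()=(5000+8192)/16384, i.e.
+// roughly 0.81.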
+
+struct record_group_t {
+ std::vector<record_t> records;
+ record_group_size_t size;
+
+ record_group_t() = default;
+ record_group_t(
+ record_t&& record,
+ extent_len_t block_size) {
+ push_back(std::move(record), block_size);
+ }
+
+ std::size_t get_size() const {
+ return records.size();
+ }
+
+ void push_back(
+ record_t&& record,
+ extent_len_t block_size) {
+ size.account(record.size, block_size);
+ records.push_back(std::move(record));
+ assert(size.get_encoded_length() < SEGMENT_OFF_MAX);
+ }
+
+ void reserve(std::size_t limit) {
+ records.reserve(limit);
+ }
+
+ void clear() {
+ records.clear();
+ size = {};
+ }
+};
+std::ostream& operator<<(std::ostream&, const record_group_t&);
+
+ceph::bufferlist encode_record(
+ record_t&& record,
+ extent_len_t block_size,
+ const journal_seq_t& committed_to,
+ segment_nonce_t current_segment_nonce);
+
+ceph::bufferlist encode_records(
+ record_group_t& record_group,
+ const journal_seq_t& committed_to,
+ segment_nonce_t current_segment_nonce);
+
+std::optional<record_group_header_t>
+try_decode_records_header(
+ const ceph::bufferlist& header_bl,
+ segment_nonce_t expected_nonce);
+
+bool validate_records_metadata(
+ const ceph::bufferlist& md_bl);
+
+bool validate_records_data(
+ const record_group_header_t& header,
+ const ceph::bufferlist& data_bl);
+
+struct record_extent_infos_t {
+ record_header_t header;
+ std::vector<extent_info_t> extent_infos;
+};
+std::optional<std::vector<record_extent_infos_t> >
+try_decode_extent_infos(
+ const record_group_header_t& header,
+ const ceph::bufferlist& md_bl);
+std::optional<std::vector<record_header_t>>
+try_decode_record_headers(
+ const record_group_header_t& header,
+ const ceph::bufferlist& md_bl);
+
+struct record_deltas_t {
+ paddr_t record_block_base;
+ std::vector<std::pair<sea_time_point, delta_info_t>> deltas;
+};
+std::optional<std::vector<record_deltas_t> >
+try_decode_deltas(
+ const record_group_header_t& header,
+ const ceph::bufferlist& md_bl,
+ paddr_t record_block_base);
+
+struct write_result_t {
+ journal_seq_t start_seq;
+ extent_len_t length;
+
+ journal_seq_t get_end_seq() const {
+ return journal_seq_t{
+ start_seq.segment_seq,
+ start_seq.offset.add_offset(length)};
+ }
+};
+std::ostream& operator<<(std::ostream&, const write_result_t&);
+
+struct record_locator_t {
+ paddr_t record_block_base;
+ write_result_t write_result;
+};
+std::ostream& operator<<(std::ostream&, const record_locator_t&);
+
+/// scans a segment incrementally to find its end
+struct scan_valid_records_cursor {
+ bool last_valid_header_found = false;
+ journal_seq_t seq;
+ journal_seq_t last_committed;
+ std::size_t num_consumed_records = 0;
+ extent_len_t block_size = 0;
+
+ struct found_record_group_t {
+ paddr_t offset;
+ record_group_header_t header;
+ bufferlist mdbuffer;
+
+ found_record_group_t(
+ paddr_t offset,
+ const record_group_header_t &header,
+ const bufferlist &mdbuffer)
+ : offset(offset), header(header), mdbuffer(mdbuffer) {}
+ };
+ std::deque<found_record_group_t> pending_record_groups;
+
+ bool is_complete() const {
+ return last_valid_header_found && pending_record_groups.empty();
+ }
+
+ segment_id_t get_segment_id() const {
+ return seq.offset.as_seg_paddr().get_segment_id();
+ }
+
+ segment_off_t get_segment_offset() const {
+ return seq.offset.as_seg_paddr().get_segment_off();
+ }
+
+ extent_len_t get_block_size() const {
+ return block_size;
+ }
+
+ void increment_seq(segment_off_t off) {
+ seq.offset = seq.offset.add_offset(off);
+ }
+
+ void emplace_record_group(const record_group_header_t&, ceph::bufferlist&&);
+
+ void pop_record_group() {
+ assert(!pending_record_groups.empty());
+ ++num_consumed_records;
+ pending_record_groups.pop_front();
+ }
+
+ scan_valid_records_cursor(
+ journal_seq_t seq)
+ : seq(seq) {}
+};
+std::ostream& operator<<(std::ostream&, const scan_valid_records_cursor&);
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_blk_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t)
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::data_category_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::delta_info_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::device_id_printer_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::extent_types_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::journal_seq_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::journal_tail_delta_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::laddr_list_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::omap_root_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::paddr_list_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::paddr_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::pladdr_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::placement_hint_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::device_type_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::record_group_header_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::record_group_size_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::record_header_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::record_locator_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::record_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::rewrite_gen_printer_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::scan_valid_records_cursor> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::sea_time_point_printer_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::segment_header_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::segment_id_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::segment_seq_printer_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::segment_tail_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::segment_type_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::transaction_type_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::write_result_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<ceph::buffer::list> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc
new file mode 100644
index 000000000..1be9cce5f
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager.cc
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/segment_manager/block.h"
+#include "crimson/os/seastore/logging.h"
+
+#ifdef HAVE_ZNS
+#include "crimson/os/seastore/segment_manager/zbd.h"
+SET_SUBSYS(seastore_device);
+#endif
+
+
+namespace crimson::os::seastore {
+
+std::ostream& operator<<(std::ostream& out, const block_shard_info_t& sf)
+{
+ out << "("
+ << "size=" << sf.size
+ << ", segments=" <<sf.segments
+ << ", tracker_offset=" <<sf.tracker_offset
+ << ", first_segment_offset=" <<sf.first_segment_offset
+ <<")";
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const block_sm_superblock_t& sb)
+{
+ out << "superblock("
+ << "shard_num=" << sb.shard_num
+ << ", segment_size=" << sb.segment_size
+ << ", block_size=" << sb.block_size
+ << ", shard_info:";
+ for (auto &sf : sb.shard_infos) {
+ out << sf
+ << ",";
+ }
+ out << "config=" << sb.config
+ << ")";
+ return out;
+}
+
+std::ostream& operator<<(std::ostream &out, Segment::segment_state_t s)
+{
+ using state_t = Segment::segment_state_t;
+ switch (s) {
+ case state_t::EMPTY:
+ return out << "EMPTY";
+ case state_t::OPEN:
+ return out << "OPEN";
+ case state_t::CLOSED:
+ return out << "CLOSED";
+ default:
+ return out << "INVALID_SEGMENT_STATE!";
+ }
+}
+
+seastar::future<crimson::os::seastore::SegmentManagerRef>
+SegmentManager::get_segment_manager(
+ const std::string &device, device_type_t dtype)
+{
+#ifdef HAVE_ZNS
+LOG_PREFIX(SegmentManager::get_segment_manager);
+ return seastar::do_with(
+ static_cast<size_t>(0),
+ [FNAME,
+ dtype,
+ device](auto &nr_zones) {
+ return seastar::open_file_dma(
+ device + "/block",
+ seastar::open_flags::rw
+ ).then([FNAME,
+ dtype,
+ device,
+ &nr_zones](auto file) {
+ return seastar::do_with(
+ file,
+ [&nr_zones](auto &f) -> seastar::future<int> {
+ ceph_assert(f);
+ return f.ioctl(BLKGETNRZONES, (void *)&nr_zones);
+ });
+ }).then([FNAME,
+ dtype,
+ device,
+ &nr_zones](auto ret) -> crimson::os::seastore::SegmentManagerRef {
+ crimson::os::seastore::SegmentManagerRef sm;
+ INFO("Found {} zones.", nr_zones);
+ if (nr_zones != 0) {
+ return std::make_unique<
+ segment_manager::zbd::ZBDSegmentManager
+ >(device + "/block");
+ } else {
+ return std::make_unique<
+ segment_manager::block::BlockSegmentManager
+ >(device + "/block", dtype);
+ }
+ });
+ });
+#else
+ return seastar::make_ready_future<crimson::os::seastore::SegmentManagerRef>(
+ std::make_unique<
+ segment_manager::block::BlockSegmentManager
+ >(device + "/block", dtype));
+#endif
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h
new file mode 100644
index 000000000..719fa6075
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager.h
@@ -0,0 +1,216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/osd/exceptions.h"
+#include "device.h"
+
+namespace crimson::os::seastore {
+
+using std::vector;
+struct block_shard_info_t {
+ std::size_t size;
+ std::size_t segments;
+ uint64_t tracker_offset;
+ uint64_t first_segment_offset;
+
+ DENC(block_shard_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.size, p);
+ denc(v.segments, p);
+ denc(v.tracker_offset, p);
+ denc(v.first_segment_offset, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct block_sm_superblock_t {
+ unsigned int shard_num = 0;
+ size_t segment_size = 0;
+ size_t block_size = 0;
+
+ std::vector<block_shard_info_t> shard_infos;
+
+ device_config_t config;
+
+ DENC(block_sm_superblock_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.shard_num, p);
+ denc(v.segment_size, p);
+ denc(v.block_size, p);
+ denc(v.shard_infos, p);
+ denc(v.config, p);
+ DENC_FINISH(p);
+ }
+
+ void validate() const {
+ ceph_assert(shard_num == seastar::smp::count);
+ ceph_assert(block_size > 0);
+ ceph_assert(segment_size > 0 &&
+ segment_size % block_size == 0);
+ ceph_assert_always(segment_size <= SEGMENT_OFF_MAX);
+ for (unsigned int i = 0; i < seastar::smp::count; i ++) {
+ ceph_assert(shard_infos[i].size > segment_size &&
+ shard_infos[i].size % block_size == 0);
+ ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX);
+ ceph_assert(shard_infos[i].segments > 0);
+ ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX);
+ ceph_assert(shard_infos[i].tracker_offset > 0 &&
+ shard_infos[i].tracker_offset % block_size == 0);
+ ceph_assert(shard_infos[i].first_segment_offset > shard_infos[i].tracker_offset &&
+ shard_infos[i].first_segment_offset % block_size == 0);
+ }
+ ceph_assert(config.spec.magic != 0);
+ ceph_assert(get_default_backend_of_device(config.spec.dtype) ==
+ backend_type_t::SEGMENTED);
+ ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID);
+ if (!config.major_dev) {
+ ceph_assert(config.secondary_devices.size() == 0);
+ }
+ for (const auto& [k, v] : config.secondary_devices) {
+ ceph_assert(k != config.spec.id);
+ ceph_assert(k <= DEVICE_ID_MAX_VALID);
+ ceph_assert(k == v.id);
+ ceph_assert(v.magic != 0);
+ ceph_assert(v.dtype > device_type_t::NONE);
+ ceph_assert(v.dtype < device_type_t::NUM_TYPES);
+ }
+ }
+};
+
+std::ostream& operator<<(std::ostream&, const block_shard_info_t&);
+std::ostream& operator<<(std::ostream&, const block_sm_superblock_t&);
+
+class Segment : public boost::intrusive_ref_counter<
+ Segment,
+ boost::thread_unsafe_counter>{
+public:
+
+ enum class segment_state_t : uint8_t {
+ EMPTY = 0,
+ OPEN = 1,
+ CLOSED = 2
+ };
+
+ /**
+ * get_segment_id
+ */
+ virtual segment_id_t get_segment_id() const = 0;
+
+ /**
+ * min next write location
+ */
+ virtual segment_off_t get_write_ptr() const = 0;
+
+ /**
+ * max capacity
+ */
+ virtual segment_off_t get_write_capacity() const = 0;
+
+ /**
+ * close
+ *
+ * Closes segment for writes. Won't complete until
+ * outstanding writes to this segment are complete.
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual close_ertr::future<> close() = 0;
+
+
+ /**
+ * write
+ *
+ * @param offset offset of the write; must be aligned to <> and be >= the
+ * write pointer; advances the write pointer
+ * @param bl buffer to write, will be padded if not aligned
+ */
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error, // media error or corruption
+ crimson::ct_error::invarg, // if offset is < write pointer or misaligned
+ crimson::ct_error::ebadf, // segment closed
+ crimson::ct_error::enospc // write exceeds segment size
+ >;
+ virtual write_ertr::future<> write(
+ segment_off_t offset, ceph::bufferlist bl) = 0;
+
+ /**
+ * advance_wp
+ *
+ * advances the segment write pointer; needed when writing at the wp is
+ * strictly enforced, e.g. for ZBD-backed segments
+ * @param offset: advance the write pointer to the given offset
+ */
+ virtual write_ertr::future<> advance_wp(
+ segment_off_t offset) = 0;
+
+ virtual ~Segment() {}
+};
+using SegmentRef = boost::intrusive_ptr<Segment>;
+
+std::ostream& operator<<(std::ostream& out, Segment::segment_state_t);
+
+constexpr size_t PADDR_SIZE = sizeof(paddr_t);
+class SegmentManager;
+
+using SegmentManagerRef = std::unique_ptr<SegmentManager>;
+
+class SegmentManager : public Device {
+public:
+ backend_type_t get_backend_type() const final {
+ return backend_type_t::SEGMENTED;
+ }
+
+ using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual open_ertr::future<SegmentRef> open(segment_id_t id) = 0;
+
+ using release_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual release_ertr::future<> release(segment_id_t id) = 0;
+
+ /* Methods for discovering device geometry, segmentid set, etc */
+ virtual segment_off_t get_segment_size() const = 0;
+ virtual device_segment_id_t get_num_segments() const {
+ ceph_assert(get_available_size() % get_segment_size() == 0);
+ return ((device_segment_id_t)(get_available_size() / get_segment_size()));
+ }
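+ // For illustration: a shard exposing 64GiB of available space with a 64MiB
+ // segment size reports 1024 segments; the assertion requires the available
+ // size to be an exact multiple of the segment size.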
+
+ virtual ~SegmentManager() {}
+
+ static seastar::future<SegmentManagerRef>
+ get_segment_manager(const std::string &device, device_type_t dtype);
+};
+
+}
+
+WRITE_CLASS_DENC(
+ crimson::os::seastore::block_shard_info_t
+)
+WRITE_CLASS_DENC(
+ crimson::os::seastore::block_sm_superblock_t
+)
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::block_shard_info_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::block_sm_superblock_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc
new file mode 100644
index 000000000..4eb8d60b2
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/block.cc
@@ -0,0 +1,810 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <fmt/format.h>
+
+#include <seastar/core/metrics.hh>
+
+#include "include/buffer.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/errorator-loop.h"
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/segment_manager/block.h"
+
+SET_SUBSYS(seastore_device);
+/*
+ * format:
+ * - D<device-id> S<segment-id> offset=<off>~<len> poffset=<off> information
+ * - D<device-id> poffset=<off>~<len> information
+ *
+ * levels:
+ * - INFO: major initialization, closing and segment operations
+ * - DEBUG: INFO details, major read and write operations
+ * - TRACE: DEBUG details
+ */
+
+using segment_state_t = crimson::os::seastore::Segment::segment_state_t;
+
+template <> struct fmt::formatter<segment_state_t>: fmt::formatter<std::string_view> {
+ // parse is inherited from formatter<string_view>.
+ template <typename FormatContext>
+ auto format(segment_state_t s, FormatContext& ctx) {
+ std::string_view name = "unknown";
+ switch (s) {
+ case segment_state_t::EMPTY:
+ name = "empty";
+ break;
+ case segment_state_t::OPEN:
+ name = "open";
+ break;
+ case segment_state_t::CLOSED:
+ name = "closed";
+ break;
+ }
+ return formatter<string_view>::format(name, ctx);
+ }
+};
+
+namespace crimson::os::seastore::segment_manager::block {
+
+static write_ertr::future<> do_write(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset,
+ bufferptr &bptr)
+{
+ LOG_PREFIX(block_do_write);
+ auto len = bptr.length();
+ TRACE("{} poffset={}~{} ...",
+ device_id_printer_t{device_id}, offset, len);
+ return device.dma_write(
+ offset,
+ bptr.c_str(),
+ len
+ ).handle_exception(
+ [FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> {
+ ERROR("{} poffset={}~{} got error -- {}",
+ device_id_printer_t{device_id}, offset, len, e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> {
+ if (result != len) {
+ ERROR("{} poffset={}~{} write len={} inconsistent",
+ device_id_printer_t{device_id}, offset, len, result);
+ return crimson::ct_error::input_output_error::make();
+ }
+ TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
+ return write_ertr::now();
+ });
+}
+
+static write_ertr::future<> do_writev(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset,
+ bufferlist&& bl,
+ size_t block_size)
+{
+ LOG_PREFIX(block_do_writev);
+ TRACE("{} poffset={}~{}, {} buffers",
+ device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers());
+
+ // writev requires each buffer to be aligned to the disk's block
+ // size, so we need to rebuild the bufferlist here
+ bl.rebuild_aligned(block_size);
+
+ return seastar::do_with(
+ bl.prepare_iovs(),
+ std::move(bl),
+ [&device, device_id, offset, FNAME](auto& iovs, auto& bl)
+ {
+ return write_ertr::parallel_for_each(
+ iovs,
+ [&device, device_id, offset, FNAME](auto& p) mutable
+ {
+ auto off = offset + p.offset;
+ auto len = p.length;
+ auto& iov = p.iov;
+ TRACE("{} poffset={}~{} dma_write ...",
+ device_id_printer_t{device_id}, off, len);
+ return device.dma_write(off, std::move(iov)
+ ).handle_exception(
+ [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
+ {
+ ERROR("{} poffset={}~{} dma_write got error -- {}",
+ device_id_printer_t{device_id}, off, len, e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
+ if (written != len) {
+ ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+ device_id_printer_t{device_id}, off, len, written);
+ return crimson::ct_error::input_output_error::make();
+ }
+ TRACE("{} poffset={}~{} dma_write done",
+ device_id_printer_t{device_id}, off, len);
+ return write_ertr::now();
+ });
+ });
+ });
+}
+
+static read_ertr::future<> do_read(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset,
+ size_t len,
+ bufferptr &bptr)
+{
+ LOG_PREFIX(block_do_read);
+ TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len);
+ assert(len <= bptr.length());
+ return device.dma_read(
+ offset,
+ bptr.c_str(),
+ len
+ ).handle_exception(
+ //FIXME: this is a little tricky: since seastar::future<T>::handle_exception()
+ // returns seastar::future<T>, to return a crimson::ct_error we have to create
+ // a seastar::future<T> holding that crimson::ct_error. This won't be necessary
+ // once seastar::future<T>::handle_exception() returns seastar::futurize_t<T>
+ [FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t>
+ {
+ ERROR("{} poffset={}~{} got error -- {}",
+ device_id_printer_t{device_id}, offset, len, e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> {
+ if (result != len) {
+ ERROR("{} poffset={}~{} read len={} inconsistent",
+ device_id_printer_t{device_id}, offset, len, result);
+ return crimson::ct_error::input_output_error::make();
+ }
+ TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
+ return read_ertr::now();
+ });
+}
+
+write_ertr::future<>
+SegmentStateTracker::write_out(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset)
+{
+ LOG_PREFIX(SegmentStateTracker::write_out);
+ DEBUG("{} poffset={}~{}",
+ device_id_printer_t{device_id}, offset, bptr.length());
+ return do_write(device_id, device, offset, bptr);
+}
+
+write_ertr::future<>
+SegmentStateTracker::read_in(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset)
+{
+ LOG_PREFIX(SegmentStateTracker::read_in);
+ DEBUG("{} poffset={}~{}",
+ device_id_printer_t{device_id}, offset, bptr.length());
+ return do_read(
+ device_id,
+ device,
+ offset,
+ bptr.length(),
+ bptr);
+}
+using std::vector;
+static
+block_sm_superblock_t make_superblock(
+ device_id_t device_id,
+ device_config_t sm_config,
+ const seastar::stat_data &data)
+{
+ LOG_PREFIX(block_make_superblock);
+ using crimson::common::get_conf;
+
+ auto config_size = get_conf<Option::size_t>(
+ "seastore_device_size");
+
+ size_t size = (data.size == 0) ? config_size : data.size;
+
+ auto config_segment_size = get_conf<Option::size_t>(
+ "seastore_segment_size");
+ size_t raw_segments = size / config_segment_size;
+ size_t shard_tracker_size = SegmentStateTracker::get_raw_size(
+ raw_segments / seastar::smp::count,
+ data.block_size);
+ size_t total_tracker_size = shard_tracker_size * seastar::smp::count;
+ size_t tracker_off = data.block_size; //superblock
+ size_t segments = (size - tracker_off - total_tracker_size) / config_segment_size;
+ size_t segments_per_shard = segments / seastar::smp::count;
+
+ vector<block_shard_info_t> shard_infos(seastar::smp::count);
+ for (unsigned int i = 0; i < seastar::smp::count; i++) {
+ shard_infos[i].size = segments_per_shard * config_segment_size;
+ shard_infos[i].segments = segments_per_shard;
+ shard_infos[i].tracker_offset = tracker_off + i * shard_tracker_size;
+ shard_infos[i].first_segment_offset = tracker_off + total_tracker_size
+ + i * segments_per_shard * config_segment_size;
+ }
+
+ INFO("{} disk_size={}, segment_size={}, block_size={}",
+ device_id_printer_t{device_id},
+ size,
+ uint64_t(config_segment_size),
+ data.block_size);
+ for (unsigned int i = 0; i < seastar::smp::count; i++) {
+ INFO("shard {} infos:", i, shard_infos[i]);
+ }
+
+ return block_sm_superblock_t{
+ seastar::smp::count,
+ config_segment_size,
+ data.block_size,
+ shard_infos,
+ std::move(sm_config)
+ };
+}
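+// Illustrative layout sketch: the superblock built above describes a device
+// laid out as
+//   [ superblock (1 block) | per-shard trackers | per-shard segment areas ]
+// where tracker_off equals the block size, and each shard's
+// first_segment_offset starts after all trackers, offset by that shard's share
+// of the segment space.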
+
+using check_create_device_ertr = BlockSegmentManager::access_ertr;
+using check_create_device_ret = check_create_device_ertr::future<>;
+static check_create_device_ret check_create_device(
+ const std::string &path,
+ size_t size)
+{
+ LOG_PREFIX(block_check_create_device);
+ INFO("path={}, size={}", path, size);
+ return seastar::open_file_dma(
+ path,
+ seastar::open_flags::exclusive |
+ seastar::open_flags::rw |
+ seastar::open_flags::create
+ ).then([size, FNAME, &path](auto file) {
+ return seastar::do_with(
+ file,
+ [size, FNAME, &path](auto &f) -> seastar::future<>
+ {
+ DEBUG("path={} created, truncating to {}", path, size);
+ ceph_assert(f);
+ return f.truncate(
+ size
+ ).then([&f, size] {
+ return f.allocate(0, size);
+ }).finally([&f] {
+ return f.close();
+ });
+ });
+ }).then_wrapped([&path, FNAME](auto f) -> check_create_device_ret {
+ if (f.failed()) {
+ try {
+ f.get();
+ return seastar::now();
+ } catch (const std::system_error &e) {
+ if (e.code().value() == EEXIST) {
+ ERROR("path={} exists", path);
+ return seastar::now();
+ } else {
+ ERROR("path={} creation error -- {}", path, e);
+ return crimson::ct_error::input_output_error::make();
+ }
+ } catch (...) {
+ ERROR("path={} creation error", path);
+ return crimson::ct_error::input_output_error::make();
+ }
+ }
+
+ DEBUG("path={} complete", path);
+ std::ignore = f.discard_result();
+ return seastar::now();
+ });
+}
+
+using open_device_ret =
+ BlockSegmentManager::access_ertr::future<
+ std::pair<seastar::file, seastar::stat_data>
+ >;
+static
+open_device_ret open_device(
+ const std::string &path)
+{
+ LOG_PREFIX(block_open_device);
+ return seastar::file_stat(path, seastar::follow_symlink::yes
+ ).then([&path, FNAME](auto stat) mutable {
+ return seastar::open_file_dma(
+ path,
+ seastar::open_flags::rw | seastar::open_flags::dsync
+ ).then([stat, &path, FNAME](auto file) mutable {
+ return file.size().then([stat, file, &path, FNAME](auto size) mutable {
+ stat.size = size;
+ INFO("path={} successful, size={}, block_size={}",
+ path, stat.size, stat.block_size);
+ return std::make_pair(file, stat);
+ });
+ });
+ }).handle_exception([FNAME, &path](auto e) -> open_device_ret {
+ ERROR("path={} got error -- {}", path, e);
+ return crimson::ct_error::input_output_error::make();
+ });
+}
+
+
+static
+BlockSegmentManager::access_ertr::future<>
+write_superblock(
+ device_id_t device_id,
+ seastar::file &device,
+ block_sm_superblock_t sb)
+{
+ LOG_PREFIX(block_write_superblock);
+ DEBUG("{} write {}", device_id_printer_t{device_id}, sb);
+ sb.validate();
+ assert(ceph::encoded_sizeof<block_sm_superblock_t>(sb) <
+ sb.block_size);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
+ [=, &device](auto &bp)
+ {
+ bufferlist bl;
+ encode(sb, bl);
+ auto iter = bl.begin();
+ assert(bl.length() < sb.block_size);
+ iter.copy(bl.length(), bp.c_str());
+ return do_write(device_id, device, 0, bp);
+ });
+}
+
+static
+BlockSegmentManager::access_ertr::future<block_sm_superblock_t>
+read_superblock(seastar::file &device, seastar::stat_data sd)
+{
+ LOG_PREFIX(block_read_superblock);
+ DEBUG("reading superblock ...");
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
+ [=, &device](auto &bp)
+ {
+ return do_read(
+ DEVICE_ID_NULL, // unknown
+ device,
+ 0,
+ bp.length(),
+ bp
+ ).safe_then([=, &bp] {
+ bufferlist bl;
+ bl.push_back(bp);
+ block_sm_superblock_t ret;
+ auto bliter = bl.cbegin();
+ try {
+ decode(ret, bliter);
+ } catch (...) {
+ ERROR("got decode error!");
+ ceph_assert(0 == "invalid superblock");
+ }
+ assert(ceph::encoded_sizeof<block_sm_superblock_t>(ret) <
+ sd.block_size);
+ return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>(
+ BlockSegmentManager::access_ertr::ready_future_marker{},
+ ret);
+ });
+ });
+}
+
+BlockSegment::BlockSegment(
+ BlockSegmentManager &manager, segment_id_t id)
+ : manager(manager), id(id) {}
+
+segment_off_t BlockSegment::get_write_capacity() const
+{
+ return manager.get_segment_size();
+}
+
+Segment::close_ertr::future<> BlockSegment::close()
+{
+ return manager.segment_close(id, write_pointer);
+}
+
+Segment::write_ertr::future<> BlockSegment::write(
+ segment_off_t offset, ceph::bufferlist bl)
+{
+ LOG_PREFIX(BlockSegment::write);
+ auto paddr = paddr_t::make_seg_paddr(id, offset);
+ DEBUG("{} offset={}~{} poffset={} ...",
+ id, offset, bl.length(), manager.get_offset(paddr));
+
+ if (offset < write_pointer ||
+ offset % manager.superblock.block_size != 0 ||
+ bl.length() % manager.superblock.block_size != 0) {
+ ERROR("{} offset={}~{} poffset={} invalid write",
+ id, offset, bl.length(), manager.get_offset(paddr));
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (offset + bl.length() > manager.superblock.segment_size) {
+ ERROR("{} offset={}~{} poffset={} write out of the range {}",
+ id, offset, bl.length(), manager.get_offset(paddr),
+ manager.superblock.segment_size);
+ return crimson::ct_error::enospc::make();
+ }
+
+ write_pointer = offset + bl.length();
+ return manager.segment_write(paddr, bl);
+}
+
+Segment::write_ertr::future<> BlockSegment::advance_wp(
+ segment_off_t offset) {
+ return write_ertr::now();
+}
+
+Segment::close_ertr::future<> BlockSegmentManager::segment_close(
+ segment_id_t id, segment_off_t write_pointer)
+{
+ LOG_PREFIX(BlockSegmentManager::segment_close);
+ auto s_id = id.device_segment_id();
+ int unused_bytes = get_segment_size() - write_pointer;
+ INFO("{} unused_bytes={} ...", id, unused_bytes);
+
+ assert(unused_bytes >= 0);
+ assert(id.device_id() == get_device_id());
+ assert(tracker);
+
+ tracker->set(s_id, segment_state_t::CLOSED);
+ ++stats.closed_segments;
+ stats.closed_segments_unused_bytes += unused_bytes;
+ stats.metadata_write.increment(tracker->get_size());
+ return tracker->write_out(
+ get_device_id(), device,
+ shard_info.tracker_offset);
+}
+
+Segment::write_ertr::future<> BlockSegmentManager::segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check)
+{
+ assert(addr.get_device_id() == get_device_id());
+ assert((bl.length() % superblock.block_size) == 0);
+ stats.data_write.increment(bl.length());
+ return do_writev(
+ get_device_id(),
+ device,
+ get_offset(addr),
+ std::move(bl),
+ superblock.block_size);
+}
+
+BlockSegmentManager::~BlockSegmentManager()
+{
+}
+
+BlockSegmentManager::mount_ret BlockSegmentManager::mount()
+{
+ return shard_devices.invoke_on_all([](auto &local_device) {
+ return local_device.shard_mount(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in BlockSegmentManager::mount"
+ });
+ });
+}
+
+BlockSegmentManager::mount_ret BlockSegmentManager::shard_mount()
+{
+ LOG_PREFIX(BlockSegmentManager::shard_mount);
+ return open_device(
+ device_path
+ ).safe_then([=, this](auto p) {
+ device = std::move(p.first);
+ auto sd = p.second;
+ return read_superblock(device, sd);
+ }).safe_then([=, this](auto sb) {
+ set_device_id(sb.config.spec.id);
+ shard_info = sb.shard_infos[seastar::this_shard_id()];
+ INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
+ sb.validate();
+ superblock = sb;
+ stats.data_read.increment(
+ ceph::encoded_sizeof<block_sm_superblock_t>(superblock));
+ tracker = std::make_unique<SegmentStateTracker>(
+ shard_info.segments,
+ superblock.block_size);
+ stats.data_read.increment(tracker->get_size());
+ return tracker->read_in(
+ get_device_id(),
+ device,
+ shard_info.tracker_offset
+ ).safe_then([this] {
+ for (device_segment_id_t i = 0; i < tracker->get_capacity(); ++i) {
+ if (tracker->get(i) == segment_state_t::OPEN) {
+ tracker->set(i, segment_state_t::CLOSED);
+ }
+ }
+ stats.metadata_write.increment(tracker->get_size());
+ return tracker->write_out(
+ get_device_id(), device,
+ shard_info.tracker_offset);
+ });
+ }).safe_then([this, FNAME] {
+ INFO("{} complete", device_id_printer_t{get_device_id()});
+ register_metrics();
+ });
+}
+
+BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(
+ device_config_t sm_config)
+{
+ return shard_devices.local().primary_mkfs(sm_config
+ ).safe_then([this] {
+ return shard_devices.invoke_on_all([](auto &local_device) {
+ return local_device.shard_mkfs(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in BlockSegmentManager::mkfs"
+ });
+ });
+ });
+}
+
+BlockSegmentManager::mkfs_ret BlockSegmentManager::primary_mkfs(
+ device_config_t sm_config)
+{
+ LOG_PREFIX(BlockSegmentManager::primary_mkfs);
+ ceph_assert(sm_config.spec.dtype == superblock.config.spec.dtype);
+ set_device_id(sm_config.spec.id);
+ INFO("{} path={}, {}",
+ device_id_printer_t{get_device_id()}, device_path, sm_config);
+ return seastar::do_with(
+ seastar::file{},
+ seastar::stat_data{},
+ block_sm_superblock_t{},
+ std::unique_ptr<SegmentStateTracker>(),
+ [=, this](auto &device, auto &stat, auto &sb, auto &tracker)
+ {
+ check_create_device_ret maybe_create = check_create_device_ertr::now();
+ using crimson::common::get_conf;
+ if (get_conf<bool>("seastore_block_create")) {
+ auto size = get_conf<Option::size_t>("seastore_device_size");
+ maybe_create = check_create_device(device_path, size);
+ }
+
+ return maybe_create.safe_then([this] {
+ return open_device(device_path);
+ }).safe_then([&, sm_config](auto p) {
+ device = p.first;
+ stat = p.second;
+ sb = make_superblock(get_device_id(), sm_config, stat);
+ stats.metadata_write.increment(
+ ceph::encoded_sizeof<block_sm_superblock_t>(sb));
+ return write_superblock(get_device_id(), device, sb);
+ }).finally([&] {
+ return device.close();
+ }).safe_then([FNAME, this] {
+ INFO("{} complete", device_id_printer_t{get_device_id()});
+ return mkfs_ertr::now();
+ });
+ });
+}
+
+BlockSegmentManager::mkfs_ret BlockSegmentManager::shard_mkfs()
+{
+ LOG_PREFIX(BlockSegmentManager::shard_mkfs);
+ return open_device(
+ device_path
+ ).safe_then([this](auto p) {
+ device = std::move(p.first);
+ auto sd = p.second;
+ return read_superblock(device, sd);
+ }).safe_then([this, FNAME](auto sb) {
+ set_device_id(sb.config.spec.id);
+ shard_info = sb.shard_infos[seastar::this_shard_id()];
+ INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
+ sb.validate();
+ tracker.reset(new SegmentStateTracker(
+ shard_info.segments, sb.block_size));
+ stats.metadata_write.increment(tracker->get_size());
+ return tracker->write_out(
+ get_device_id(), device,
+ shard_info.tracker_offset);
+ }).finally([this] {
+ return device.close();
+ }).safe_then([FNAME, this] {
+ INFO("{} complete", device_id_printer_t{get_device_id()});
+ return mkfs_ertr::now();
+ });
+}
+
+BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close()
+{
+ LOG_PREFIX(BlockSegmentManager::close);
+ INFO("{}", device_id_printer_t{get_device_id()});
+ metrics.clear();
+ return device.close();
+}
+
+SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open(
+ segment_id_t id)
+{
+ LOG_PREFIX(BlockSegmentManager::open);
+ auto s_id = id.device_segment_id();
+ INFO("{} ...", id);
+
+ assert(id.device_id() == get_device_id());
+
+ if (s_id >= get_num_segments()) {
+ ERROR("{} segment-id out of range {}", id, get_num_segments());
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(s_id) != segment_state_t::EMPTY) {
+ ERROR("{} invalid state {} != EMPTY", id, tracker->get(s_id));
+ return crimson::ct_error::invarg::make();
+ }
+
+ tracker->set(s_id, segment_state_t::OPEN);
+ stats.metadata_write.increment(tracker->get_size());
+ return tracker->write_out(
+ get_device_id(), device,
+ shard_info.tracker_offset
+ ).safe_then([this, id, FNAME] {
+ ++stats.opened_segments;
+ DEBUG("{} done", id);
+ return open_ertr::future<SegmentRef>(
+ open_ertr::ready_future_marker{},
+ SegmentRef(new BlockSegment(*this, id)));
+ });
+}
+
+SegmentManager::release_ertr::future<> BlockSegmentManager::release(
+ segment_id_t id)
+{
+ LOG_PREFIX(BlockSegmentManager::release);
+ auto s_id = id.device_segment_id();
+ INFO("{} ...", id);
+
+ assert(id.device_id() == get_device_id());
+
+ if (s_id >= get_num_segments()) {
+ ERROR("{} segment-id out of range {}", id, get_num_segments());
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(s_id) != segment_state_t::CLOSED) {
+ ERROR("{} invalid state {} != CLOSED", id, tracker->get(s_id));
+ return crimson::ct_error::invarg::make();
+ }
+
+ tracker->set(s_id, segment_state_t::EMPTY);
+ ++stats.released_segments;
+ stats.metadata_write.increment(tracker->get_size());
+ return tracker->write_out(
+ get_device_id(), device,
+ shard_info.tracker_offset);
+}
+
+SegmentManager::read_ertr::future<> BlockSegmentManager::read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out)
+{
+ LOG_PREFIX(BlockSegmentManager::read);
+ auto& seg_addr = addr.as_seg_paddr();
+ auto id = seg_addr.get_segment_id();
+ auto s_id = id.device_segment_id();
+ auto s_off = seg_addr.get_segment_off();
+ auto p_off = get_offset(addr);
+ DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off);
+
+ assert(addr.get_device_id() == get_device_id());
+
+ if (s_off % superblock.block_size != 0 ||
+ len % superblock.block_size != 0) {
+ ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (s_id >= get_num_segments()) {
+ ERROR("{} offset={}~{} poffset={} segment-id out of range {}",
+ id, s_off, len, p_off, get_num_segments());
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (s_off + len > superblock.segment_size) {
+ ERROR("{} offset={}~{} poffset={} read out of range {}",
+ id, s_off, len, p_off, superblock.segment_size);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(s_id) == segment_state_t::EMPTY) {
+ // XXX: not an error during scanning,
+ // might need refactor to increase the log level
+ DEBUG("{} offset={}~{} poffset={} invalid state {}",
+ id, s_off, len, p_off, tracker->get(s_id));
+ return crimson::ct_error::enoent::make();
+ }
+
+ stats.data_read.increment(len);
+ return do_read(
+ get_device_id(),
+ device,
+ p_off,
+ len,
+ out);
+}
+
+void BlockSegmentManager::register_metrics()
+{
+ LOG_PREFIX(BlockSegmentManager::register_metrics);
+ DEBUG("{}", device_id_printer_t{get_device_id()});
+ namespace sm = seastar::metrics;
+ std::vector<sm::label_instance> label_instances;
+ label_instances.push_back(sm::label_instance("device_id", get_device_id()));
+ stats.reset();
+ metrics.add_group(
+ "segment_manager",
+ {
+ sm::make_counter(
+ "data_read_num",
+ stats.data_read.num,
+ sm::description("total number of data read"),
+ label_instances
+ ),
+ sm::make_counter(
+ "data_read_bytes",
+ stats.data_read.bytes,
+ sm::description("total bytes of data read"),
+ label_instances
+ ),
+ sm::make_counter(
+ "data_write_num",
+ stats.data_write.num,
+ sm::description("total number of data write"),
+ label_instances
+ ),
+ sm::make_counter(
+ "data_write_bytes",
+ stats.data_write.bytes,
+ sm::description("total bytes of data write"),
+ label_instances
+ ),
+ sm::make_counter(
+ "metadata_write_num",
+ stats.metadata_write.num,
+ sm::description("total number of metadata write"),
+ label_instances
+ ),
+ sm::make_counter(
+ "metadata_write_bytes",
+ stats.metadata_write.bytes,
+ sm::description("total bytes of metadata write"),
+ label_instances
+ ),
+ sm::make_counter(
+ "opened_segments",
+ stats.opened_segments,
+ sm::description("total segments opened"),
+ label_instances
+ ),
+ sm::make_counter(
+ "closed_segments",
+ stats.closed_segments,
+ sm::description("total segments closed"),
+ label_instances
+ ),
+ sm::make_counter(
+ "closed_segments_unused_bytes",
+ stats.closed_segments_unused_bytes,
+ sm::description("total unused bytes of closed segments"),
+ label_instances
+ ),
+ sm::make_counter(
+ "released_segments",
+ stats.released_segments,
+ sm::description("total segments released"),
+ label_instances
+ ),
+ }
+ );
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h
new file mode 100644
index 000000000..495d0d104
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/block.h
@@ -0,0 +1,262 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+
+#include "crimson/common/layout.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore::segment_manager::block {
+
+using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+/**
+ * SegmentStateTracker
+ *
+ * Tracks lifecycle state of each segment using space at the beginning
+ * of the drive.
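+ * One byte of state is kept per segment, so the tracker occupies
+ * get_raw_size() bytes (the segment count rounded up to a whole block).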
+ */
+class SegmentStateTracker {
+ using segment_state_t = Segment::segment_state_t;
+
+ bufferptr bptr;
+
+ using L = absl::container_internal::Layout<uint8_t>;
+ const L layout;
+
+public:
+ static size_t get_raw_size(size_t segments, size_t block_size) {
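+    // One byte of state per segment, rounded up to a whole block so the
+    // on-disk tracker region stays block-aligned.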
+ return p2roundup(segments, block_size);
+ }
+
+ SegmentStateTracker(size_t segments, size_t block_size)
+ : bptr(ceph::buffer::create_page_aligned(
+ get_raw_size(segments, block_size))),
+ layout(bptr.length())
+ {
+ ::memset(
+ bptr.c_str(),
+ static_cast<char>(segment_state_t::EMPTY),
+ bptr.length());
+ }
+
+ size_t get_size() const {
+ return bptr.length();
+ }
+
+ size_t get_capacity() const {
+ return bptr.length();
+ }
+
+ segment_state_t get(device_segment_id_t offset) const {
+ assert(offset < get_capacity());
+ return static_cast<segment_state_t>(
+ layout.template Pointer<0>(
+ bptr.c_str())[offset]);
+ }
+
+ void set(device_segment_id_t offset, segment_state_t state) {
+ assert(offset < get_capacity());
+ layout.template Pointer<0>(bptr.c_str())[offset] =
+ static_cast<uint8_t>(state);
+ }
+
+ write_ertr::future<> write_out(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset);
+
+ read_ertr::future<> read_in(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset);
+};
+
+class BlockSegmentManager;
+class BlockSegment final : public Segment {
+ friend class BlockSegmentManager;
+ BlockSegmentManager &manager;
+ const segment_id_t id;
+ segment_off_t write_pointer = 0;
+public:
+ BlockSegment(BlockSegmentManager &manager, segment_id_t id);
+
+ segment_id_t get_segment_id() const final { return id; }
+ segment_off_t get_write_capacity() const final;
+ segment_off_t get_write_ptr() const final { return write_pointer; }
+ close_ertr::future<> close() final;
+ write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+ write_ertr::future<> advance_wp(segment_off_t offset) final;
+
+ ~BlockSegment() {}
+};
+
+/**
+ * BlockSegmentManager
+ *
+ * Implements SegmentManager on a conventional block device.
+ * SegmentStateTracker uses space at the start of the device to store
+ * state analogous to that of the segments of a ZNS device.
+ */
+class BlockSegmentManager final : public SegmentManager {
+// interfaces used by Device
+public:
+ seastar::future<> start() {
+ return shard_devices.start(device_path, superblock.config.spec.dtype);
+ }
+
+ seastar::future<> stop() {
+ return shard_devices.stop();
+ }
+
+ Device& get_sharded_device() final {
+ return shard_devices.local();
+ }
+ mount_ret mount() final;
+
+ mkfs_ret mkfs(device_config_t) final;
+// interfaces used by each shard device
+public:
+ close_ertr::future<> close();
+
+ BlockSegmentManager(
+ const std::string &path,
+ device_type_t dtype)
+ : device_path(path) {
+ ceph_assert(get_device_type() == device_type_t::NONE);
+ superblock.config.spec.dtype = dtype;
+ }
+
+ ~BlockSegmentManager();
+
+ open_ertr::future<SegmentRef> open(segment_id_t id) final;
+
+ release_ertr::future<> release(segment_id_t id) final;
+
+ read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final;
+
+ device_type_t get_device_type() const final {
+ return superblock.config.spec.dtype;
+ }
+ size_t get_available_size() const final {
+ return shard_info.size;
+ }
+ extent_len_t get_block_size() const {
+ return superblock.block_size;
+ }
+ segment_off_t get_segment_size() const {
+ return superblock.segment_size;
+ }
+
+ device_id_t get_device_id() const final {
+ assert(device_id <= DEVICE_ID_MAX_VALID);
+ return device_id;
+ }
+ secondary_device_set_t& get_secondary_devices() final {
+ return superblock.config.secondary_devices;
+ }
+ // public so tests can bypass segment interface when simpler
+ Segment::write_ertr::future<> segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check=false);
+
+ magic_t get_magic() const final {
+ return superblock.config.spec.magic;
+ }
+
+private:
+ friend class BlockSegment;
+ using segment_state_t = Segment::segment_state_t;
+
+ struct effort_t {
+ uint64_t num = 0;
+ uint64_t bytes = 0;
+
+ void increment(uint64_t read_bytes) {
+ ++num;
+ bytes += read_bytes;
+ }
+ };
+
+ struct {
+ effort_t data_read;
+ effort_t data_write;
+ effort_t metadata_write;
+ uint64_t opened_segments;
+ uint64_t closed_segments;
+ uint64_t closed_segments_unused_bytes;
+ uint64_t released_segments;
+
+ void reset() {
+ data_read = {};
+ data_write = {};
+ metadata_write = {};
+ opened_segments = 0;
+ closed_segments = 0;
+ closed_segments_unused_bytes = 0;
+ released_segments = 0;
+ }
+ } stats;
+
+ void register_metrics();
+ seastar::metrics::metric_group metrics;
+
+ std::string device_path;
+ std::unique_ptr<SegmentStateTracker> tracker;
+ block_shard_info_t shard_info;
+ block_sm_superblock_t superblock;
+ seastar::file device;
+
+ void set_device_id(device_id_t id) {
+ assert(id <= DEVICE_ID_MAX_VALID);
+ assert(device_id == DEVICE_ID_NULL ||
+ device_id == id);
+ device_id = id;
+ }
+ device_id_t device_id = DEVICE_ID_NULL;
+
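+  // Translate a segment paddr into an absolute byte offset on the device:
+  // shard base + segment index * segment size + intra-segment offset.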
+ size_t get_offset(paddr_t addr) {
+ auto& seg_addr = addr.as_seg_paddr();
+ return shard_info.first_segment_offset +
+ (seg_addr.get_segment_id().device_segment_id() * superblock.segment_size) +
+ seg_addr.get_segment_off();
+ }
+
+ const seastore_meta_t &get_meta() const {
+ return superblock.config.meta;
+ }
+
+ std::vector<segment_state_t> segment_state;
+
+ char *buffer = nullptr;
+
+ Segment::close_ertr::future<> segment_close(
+ segment_id_t id, segment_off_t write_pointer);
+
+private:
+ // shard 0 mkfs
+ mkfs_ret primary_mkfs(device_config_t);
+ // all shards mkfs
+ mkfs_ret shard_mkfs();
+ // all shards mount
+ mount_ret shard_mount();
+
+ seastar::sharded<BlockSegmentManager> shard_devices;
+};
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc
new file mode 100644
index 000000000..4a4873afb
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc
@@ -0,0 +1,294 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "seastar/core/sleep.hh"
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_seastore_device);
+ }
+}
+
+namespace crimson::os::seastore::segment_manager {
+
+std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) {
+ return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size
+ << ", segment_size=" << c.segment_size << ")";
+}
+
+EphemeralSegmentManagerRef create_test_ephemeral() {
+ return EphemeralSegmentManagerRef(
+ new EphemeralSegmentManager(DEFAULT_TEST_EPHEMERAL));
+}
+
+device_config_t get_ephemeral_device_config(
+ std::size_t index,
+ std::size_t num_main_devices,
+ std::size_t num_cold_devices)
+{
+ auto num_devices = num_main_devices + num_cold_devices;
+ assert(num_devices > index);
+ auto get_sec_dtype = [num_main_devices](std::size_t idx) {
+ if (idx < num_main_devices) {
+ return device_type_t::EPHEMERAL_MAIN;
+ } else {
+ return device_type_t::EPHEMERAL_COLD;
+ }
+ };
+
+ magic_t magic = 0xabcd;
+ bool is_major_device;
+ secondary_device_set_t secondary_devices;
+ if (index == 0) {
+ is_major_device = true;
+ for (std::size_t secondary_index = index + 1;
+ secondary_index < num_devices;
+ ++secondary_index) {
+ device_id_t secondary_id = static_cast<device_id_t>(secondary_index);
+ secondary_devices.insert({
+ secondary_index,
+ device_spec_t{
+ magic,
+ get_sec_dtype(secondary_index),
+ secondary_id
+ }
+ });
+ }
+ } else { // index > 0
+ is_major_device = false;
+ }
+
+ device_id_t id = static_cast<device_id_t>(index);
+ seastore_meta_t meta = {};
+ return {is_major_device,
+ device_spec_t{
+ magic,
+ get_sec_dtype(index),
+ id
+ },
+ meta,
+ secondary_devices};
+}
+
+EphemeralSegment::EphemeralSegment(
+ EphemeralSegmentManager &manager, segment_id_t id)
+ : manager(manager), id(id) {}
+
+segment_off_t EphemeralSegment::get_write_capacity() const
+{
+ return manager.get_segment_size();
+}
+
+Segment::close_ertr::future<> EphemeralSegment::close()
+{
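+  // The short sleep after each ephemeral operation keeps the backend
+  // asynchronous, loosely mimicking the latency of a real device.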
+ return manager.segment_close(id).safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+Segment::write_ertr::future<> EphemeralSegment::write(
+ segment_off_t offset, ceph::bufferlist bl)
+{
+ if (offset < write_pointer || offset % manager.config.block_size != 0)
+ return crimson::ct_error::invarg::make();
+
+ if (offset + bl.length() > (size_t)manager.get_segment_size())
+ return crimson::ct_error::enospc::make();
+
+ return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
+}
+
+Segment::write_ertr::future<> EphemeralSegment::advance_wp(
+ segment_off_t offset)
+{
+ return write_ertr::now();
+}
+
+Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id)
+{
+ auto s_id = id.device_segment_id();
+ if (segment_state[s_id] != segment_state_t::OPEN)
+ return crimson::ct_error::invarg::make();
+
+ segment_state[s_id] = segment_state_t::CLOSED;
+ return Segment::close_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+EphemeralSegmentManager::mkfs_ret
+EphemeralSegmentManager::mkfs(device_config_t _config)
+{
+ logger().info(
+ "Mkfs ephemeral segment manager with {}",
+ _config);
+ device_config = _config;
+ return mkfs_ertr::now();
+}
+
+Segment::write_ertr::future<> EphemeralSegmentManager::segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check)
+{
+ auto& seg_addr = addr.as_seg_paddr();
+ logger().debug(
+ "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}",
+ seg_addr.get_segment_id(),
+ seg_addr.get_segment_off(),
+ get_offset(addr),
+ bl.length(),
+ bl.crc32c(1));
+ if (!ignore_check && segment_state[seg_addr.get_segment_id().device_segment_id()]
+ != segment_state_t::OPEN)
+ return crimson::ct_error::invarg::make();
+
+ bl.begin().copy(bl.length(), buffer + get_offset(addr));
+ return Segment::write_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+EphemeralSegmentManager::init_ertr::future<> EphemeralSegmentManager::init()
+{
+ logger().info(
+ "Initing ephemeral segment manager with config {}",
+ config);
+
+ if (config.block_size % (4<<10) != 0) {
+ return crimson::ct_error::invarg::make();
+ }
+ if (config.segment_size % config.block_size != 0) {
+ return crimson::ct_error::invarg::make();
+ }
+ if (config.size % config.segment_size != 0) {
+ return crimson::ct_error::invarg::make();
+ }
+
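+  // Back the "device" with an anonymous shared mapping; segment reads and
+  // writes below are plain memory copies into/out of this buffer.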
+ void* addr = ::mmap(
+ nullptr,
+ config.size,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+ -1,
+ 0);
+
+ segment_state.resize(config.size / config.segment_size, segment_state_t::EMPTY);
+
+ if (addr == MAP_FAILED)
+ return crimson::ct_error::enospc::make();
+
+ buffer = (char*)addr;
+
+ ::memset(buffer, 0, config.size);
+ return init_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+EphemeralSegmentManager::~EphemeralSegmentManager()
+{
+ if (buffer) {
+ ::munmap(buffer, config.size);
+ }
+}
+
+void EphemeralSegmentManager::remount()
+{
+ for (auto &i : segment_state) {
+ if (i == Segment::segment_state_t::OPEN)
+ i = Segment::segment_state_t::CLOSED;
+ }
+}
+
+SegmentManager::open_ertr::future<SegmentRef> EphemeralSegmentManager::open(
+ segment_id_t id)
+{
+ auto s_id = id.device_segment_id();
+ if (s_id >= get_num_segments()) {
+ logger().error("EphemeralSegmentManager::open: invalid segment {}", id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (segment_state[s_id] != segment_state_t::EMPTY) {
+ logger().error("EphemeralSegmentManager::open: segment {} not empty", id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ segment_state[s_id] = segment_state_t::OPEN;
+ return open_ertr::make_ready_future<SegmentRef>(new EphemeralSegment(*this, id));
+}
+
+SegmentManager::release_ertr::future<> EphemeralSegmentManager::release(
+ segment_id_t id)
+{
+ auto s_id = id.device_segment_id();
+ logger().debug("EphemeralSegmentManager::release: {}", id);
+
+ if (s_id >= get_num_segments()) {
+ logger().error(
+ "EphemeralSegmentManager::release: invalid segment {}",
+ id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (segment_state[s_id] != segment_state_t::CLOSED) {
+ logger().error(
+ "EphemeralSegmentManager::release: segment id {} not closed",
+ id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ ::memset(buffer + get_offset(paddr_t::make_seg_paddr(id, 0)), 0, config.segment_size);
+ segment_state[s_id] = segment_state_t::EMPTY;
+ return release_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out)
+{
+ auto& seg_addr = addr.as_seg_paddr();
+ if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) {
+ logger().error(
+ "EphemeralSegmentManager::read: invalid segment {}",
+ addr);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (seg_addr.get_segment_off() + len > config.segment_size) {
+ logger().error(
+ "EphemeralSegmentManager::read: invalid offset {}~{}!",
+ addr,
+ len);
+ return crimson::ct_error::invarg::make();
+ }
+
+ out.copy_in(0, len, buffer + get_offset(addr));
+
+ bufferlist bl;
+ bl.push_back(out);
+ logger().debug(
+ "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}",
+ seg_addr.get_segment_id().device_segment_id(),
+ seg_addr.get_segment_off(),
+ get_offset(addr),
+ len,
+ bl.begin().crc32c(len, 1));
+
+ return read_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h
new file mode 100644
index 000000000..d7a3eb4a7
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.h
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+
+namespace crimson::os::seastore::segment_manager {
+
+class EphemeralSegmentManager;
+using EphemeralSegmentManagerRef = std::unique_ptr<EphemeralSegmentManager>;
+
+struct ephemeral_config_t {
+ size_t size = 0;
+ size_t block_size = 0;
+ size_t segment_size = 0;
+
+ void validate() const {
+ ceph_assert_always(size > 0);
+ ceph_assert_always(size <= DEVICE_OFF_MAX);
+ ceph_assert_always(segment_size > 0);
+ ceph_assert_always(segment_size <= SEGMENT_OFF_MAX);
+ ceph_assert_always(size / segment_size > 0);
+ ceph_assert_always(size / segment_size <= DEVICE_SEGMENT_ID_MAX);
+ }
+};
+
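+// 1GiB device, 4KiB blocks, 8MiB segments -- the defaults used by
+// create_test_ephemeral().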
+constexpr ephemeral_config_t DEFAULT_TEST_EPHEMERAL = {
+ 1 << 30,
+ 4 << 10,
+ 8 << 20
+};
+
+std::ostream &operator<<(std::ostream &, const ephemeral_config_t &);
+
+EphemeralSegmentManagerRef create_test_ephemeral();
+
+device_config_t get_ephemeral_device_config(
+ std::size_t index,
+ std::size_t num_main_devices,
+ std::size_t num_cold_devices);
+
+class EphemeralSegment final : public Segment {
+ friend class EphemeralSegmentManager;
+ EphemeralSegmentManager &manager;
+ const segment_id_t id;
+ segment_off_t write_pointer = 0;
+public:
+ EphemeralSegment(EphemeralSegmentManager &manager, segment_id_t id);
+
+ segment_id_t get_segment_id() const final { return id; }
+ segment_off_t get_write_capacity() const final;
+ segment_off_t get_write_ptr() const final { return write_pointer; }
+ close_ertr::future<> close() final;
+ write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+ write_ertr::future<> advance_wp(segment_off_t offset) final;
+
+ ~EphemeralSegment() {}
+};
+
+class EphemeralSegmentManager final : public SegmentManager {
+ friend class EphemeralSegment;
+ using segment_state_t = Segment::segment_state_t;
+
+ const ephemeral_config_t config;
+ std::optional<device_config_t> device_config;
+
+ device_type_t get_device_type() const final {
+ assert(device_config);
+ return device_config->spec.dtype;
+ }
+
+ size_t get_offset(paddr_t addr) {
+ auto& seg_addr = addr.as_seg_paddr();
+ return (seg_addr.get_segment_id().device_segment_id() * config.segment_size) +
+ seg_addr.get_segment_off();
+ }
+
+ std::vector<segment_state_t> segment_state;
+
+ char *buffer = nullptr;
+
+ Segment::close_ertr::future<> segment_close(segment_id_t id);
+
+public:
+ EphemeralSegmentManager(
+ ephemeral_config_t config)
+ : config(config) {
+ config.validate();
+ }
+
+ ~EphemeralSegmentManager();
+
+ close_ertr::future<> close() final {
+ return close_ertr::now();
+ }
+
+ device_id_t get_device_id() const final {
+ assert(device_config);
+ return device_config->spec.id;
+ }
+
+ mount_ret mount() final {
+ return mount_ertr::now();
+ }
+
+ mkfs_ret mkfs(device_config_t) final;
+
+ open_ertr::future<SegmentRef> open(segment_id_t id) final;
+
+ release_ertr::future<> release(segment_id_t id) final;
+
+ read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final;
+
+ size_t get_available_size() const final {
+ return config.size;
+ }
+ extent_len_t get_block_size() const final {
+ return config.block_size;
+ }
+ segment_off_t get_segment_size() const final {
+ return config.segment_size;
+ }
+
+ const seastore_meta_t &get_meta() const final {
+ assert(device_config);
+ return device_config->meta;
+ }
+
+ secondary_device_set_t& get_secondary_devices() final {
+ assert(device_config);
+ return device_config->secondary_devices;
+ }
+
+ magic_t get_magic() const final {
+ return device_config->spec.magic;
+ }
+
+ using init_ertr = crimson::errorator<
+ crimson::ct_error::enospc,
+ crimson::ct_error::invarg>;
+ init_ertr::future<> init();
+
+ void remount();
+
+ // public so tests can bypass segment interface when simpler
+ Segment::write_ertr::future<> segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check=false);
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::segment_manager::ephemeral_config_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/segment_manager/zbd.cc b/src/crimson/os/seastore/segment_manager/zbd.cc
new file mode 100644
index 000000000..88521a947
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/zbd.cc
@@ -0,0 +1,823 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+#include <linux/blkzoned.h>
+
+#include <fmt/format.h>
+#include "crimson/os/seastore/segment_manager/zbd.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/os/seastore/logging.h"
+#include "crimson/common/errorator-loop.h"
+#include "include/buffer.h"
+
+SET_SUBSYS(seastore_device);
+
+#define SECT_SHIFT 9
+#define RESERVED_ZONES 1
+// limit the max padding buf size to 4MB (4194304 bytes)
+#define MAX_PADDING_SIZE 4194304
+
+using z_op = crimson::os::seastore::segment_manager::zbd::zone_op;
+template <> struct fmt::formatter<z_op>: fmt::formatter<std::string_view> {
+ template <typename FormatContext>
+ auto format(z_op s, FormatContext& ctx) {
+ std::string_view name = "Unknown";
+ switch (s) {
+ using enum z_op;
+ case OPEN:
+ name = "BLKOPENZONE";
+ break;
+ case FINISH:
+ name = "BLKFINISHZONE";
+ break;
+ case CLOSE:
+ name = "BLKCLOSEZONE";
+ break;
+ case RESET:
+ name = "BLKRESETZONE";
+ break;
+ }
+ return formatter<string_view>::format(name, ctx);
+ }
+};
+
+namespace crimson::os::seastore::segment_manager::zbd {
+
+using open_device_ret = ZBDSegmentManager::access_ertr::future<
+ std::pair<seastar::file, seastar::stat_data>>;
+static open_device_ret open_device(
+ const std::string &path,
+ seastar::open_flags mode)
+{
+ LOG_PREFIX(ZBDSegmentManager::open_device);
+ return seastar::file_stat(
+ path, seastar::follow_symlink::yes
+ ).then([FNAME, mode, &path](auto stat) mutable {
+ return seastar::open_file_dma(path, mode).then([=](auto file) {
+ DEBUG("open of device {} successful, size {}",
+ path,
+ stat.size);
+ return std::make_pair(file, stat);
+ });
+ }).handle_exception(
+ [FNAME](auto e) -> open_device_ret {
+ ERROR("got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }
+ );
+}
+
+static zbd_sm_metadata_t make_metadata(
+ uint64_t total_size,
+ seastore_meta_t meta,
+ const seastar::stat_data &data,
+ size_t zone_size_sectors,
+ size_t zone_capacity_sectors,
+ size_t nr_cnv_zones,
+ size_t num_zones)
+{
+ LOG_PREFIX(ZBDSegmentManager::make_metadata);
+
+ // Using only SWR zones in a SMR drive, for now
+ auto skipped_zones = RESERVED_ZONES + nr_cnv_zones;
+ assert(num_zones > skipped_zones);
+
+ // TODO: support Option::size_t seastore_segment_size
+ // to allow zones_per_segment > 1 with striping.
+ size_t zone_size = zone_size_sectors << SECT_SHIFT;
+ assert(total_size == num_zones * zone_size);
+ size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT;
+ size_t segment_size = zone_size;
+ size_t zones_per_segment = segment_size / zone_size;
+ size_t segments = (num_zones - skipped_zones) / zones_per_segment;
+ size_t per_shard_segments = segments / seastar::smp::count;
+ size_t available_size = zone_capacity * segments;
+ size_t per_shard_available_size = zone_capacity * per_shard_segments;
+
+
+ WARN("Ignoring configuration values for device and segment size");
+ INFO(
+ "device size: {}, available size: {}, block size: {}, allocated size: {},"
+ " total zones {}, zone size: {}, zone capacity: {},"
+ " total segments: {}, zones per segment: {}, segment size: {}"
+ " conv zones: {}, swr zones: {}, per shard segments: {}"
+ " per shard available size: {}",
+ total_size,
+ available_size,
+ data.block_size,
+ data.allocated_size,
+ num_zones,
+ zone_size,
+ zone_capacity,
+ segments,
+ zones_per_segment,
+ zone_capacity * zones_per_segment,
+ nr_cnv_zones,
+ num_zones - nr_cnv_zones,
+ per_shard_segments,
+ per_shard_available_size);
+
+ std::vector<zbd_shard_info_t> shard_infos(seastar::smp::count);
+ for (unsigned int i = 0; i < seastar::smp::count; i++) {
+ shard_infos[i].size = per_shard_available_size;
+ shard_infos[i].segments = per_shard_segments;
+ shard_infos[i].first_segment_offset = zone_size * skipped_zones
+ + i * segment_size * per_shard_segments;
+ INFO("First segment offset for shard {} is: {}",
+ i, shard_infos[i].first_segment_offset);
+ }
+
+ zbd_sm_metadata_t ret = zbd_sm_metadata_t{
+ seastar::smp::count,
+ segment_size,
+ zone_capacity * zones_per_segment,
+ zones_per_segment,
+ zone_capacity,
+ data.block_size,
+ zone_size,
+ shard_infos,
+ meta};
+ ret.validate();
+ return ret;
+}
+
+struct ZoneReport {
+ struct blk_zone_report *hdr;
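+  // blk_zone_report is followed in memory by nr_zones blk_zone entries,
+  // hence the manual allocation of header plus array below.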
+ ZoneReport(int nr_zones)
+ : hdr((blk_zone_report *)malloc(
+      sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))) {}
+ ~ZoneReport(){
+ free(hdr);
+ }
+ ZoneReport(const ZoneReport &) = delete;
+ ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) {
+ rhs.hdr = nullptr;
+ }
+};
+
+static seastar::future<size_t> get_blk_dev_size(
+ seastar::file &device)
+{
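+  // BLKGETSIZE reports the device size in 512-byte sectors; shift by
+  // SECT_SHIFT to convert to bytes.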
+ return seastar::do_with(
+ (uint64_t)0,
+ [&](auto& size_sects) {
+ return device.ioctl(
+ BLKGETSIZE,
+ (void *)&size_sects
+ ).then([&](int ret) {
+ ceph_assert(size_sects);
+ size_t size = size_sects << SECT_SHIFT;
+ return seastar::make_ready_future<size_t>(size);
+ });
+ });
+}
+
+// zone_size should be in 512B sectors
+static seastar::future<> reset_device(
+ seastar::file &device,
+ uint64_t zone_size_sects,
+ uint64_t nr_zones)
+{
+ return seastar::do_with(
+ blk_zone_range{},
+ [&, nr_zones, zone_size_sects](auto &range) {
+ range.sector = 0;
+ range.nr_sectors = zone_size_sects * nr_zones;
+ return device.ioctl(
+ BLKRESETZONE,
+ &range
+ ).then([&](int ret){
+ return seastar::now();
+ });
+ }
+ );
+}
+
+static seastar::future<size_t> get_zone_capacity(
+ seastar::file &device,
+ uint32_t nr_zones)
+{
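+  // Issue a zone report and take the capacity of the first zone as
+  // representative; SWR zones on the device are assumed to be uniform.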
+ return seastar::do_with(
+ ZoneReport(nr_zones),
+ [&](auto &zr) {
+ zr.hdr->sector = 0;
+ zr.hdr->nr_zones = nr_zones;
+ return device.ioctl(
+ BLKREPORTZONE,
+ zr.hdr
+ ).then([&](int ret) {
+ return seastar::make_ready_future<size_t>(zr.hdr->zones[0].capacity);
+ });
+ }
+ );
+}
+
+// Get the number of conventional zones on an SMR HDD; these zones are
+// randomly writable and don't respond to zone operations.
+static seastar::future<size_t> get_nr_cnv_zones(
+ seastar::file &device,
+ uint32_t nr_zones)
+{
+ return seastar::do_with(
+ ZoneReport(nr_zones),
+ [&](auto &zr) {
+ zr.hdr->sector = 0;
+ zr.hdr->nr_zones = nr_zones;
+ return device.ioctl(
+ BLKREPORTZONE,
+ zr.hdr
+ ).then([&, nr_zones](int ret) {
+ size_t cnv_zones = 0;
+ for (uint32_t i = 0; i < nr_zones; i++) {
+ if (zr.hdr->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
+ cnv_zones++;
+ }
+ return seastar::make_ready_future<size_t>(cnv_zones);
+ });
+ }
+ );
+}
+
+
+static write_ertr::future<> do_write(
+ seastar::file &device,
+ uint64_t offset,
+ bufferptr &bptr)
+{
+ LOG_PREFIX(ZBDSegmentManager::do_write);
+ DEBUG("offset {} len {}",
+ offset,
+ bptr.length());
+ return device.dma_write(
+ offset,
+ bptr.c_str(),
+ bptr.length()
+ ).handle_exception(
+ [FNAME](auto e) -> write_ertr::future<size_t> {
+ ERROR("dma_write got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }
+ ).then([length = bptr.length()](auto result) -> write_ertr::future<> {
+ if (result != length) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return write_ertr::now();
+ });
+}
+
+static write_ertr::future<> do_writev(
+ device_id_t device_id,
+ seastar::file &device,
+ uint64_t offset,
+ bufferlist&& bl,
+ size_t block_size)
+{
+ LOG_PREFIX(ZBDSegmentManager::do_writev);
+ DEBUG("{} offset {} len {}",
+ device_id_printer_t{device_id}, offset, bl.length());
+  // writev requires each buffer to be aligned to the disk's block
+  // size, so rebuild the bufferlist here
+ bl.rebuild_aligned(block_size);
+
+ return seastar::do_with(
+ bl.prepare_iovs(),
+ std::move(bl),
+ [&device, device_id, offset, FNAME](auto& iovs, auto& bl)
+ {
+ return write_ertr::parallel_for_each(
+ iovs,
+ [&device, device_id, offset, FNAME](auto& p)
+ {
+ auto off = offset + p.offset;
+ auto len = p.length;
+ auto& iov = p.iov;
+ DEBUG("{} poffset={}~{} dma_write ...",
+ device_id_printer_t{device_id},
+ off, len);
+ return device.dma_write(off, std::move(iov)
+ ).handle_exception(
+ [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
+ {
+ ERROR("{} poffset={}~{} dma_write got error -- {}",
+ device_id_printer_t{device_id}, off, len, e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
+ if (written != len) {
+ ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+ device_id_printer_t{device_id}, off, len, written);
+ return crimson::ct_error::input_output_error::make();
+ }
+ DEBUG("{} poffset={}~{} dma_write done",
+ device_id_printer_t{device_id},
+ off, len);
+ return write_ertr::now();
+ });
+ });
+ });
+}
+
+static ZBDSegmentManager::access_ertr::future<>
+write_metadata(seastar::file &device, zbd_sm_metadata_t sb)
+{
+ assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() <
+ sb.block_size);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
+ [=, &device](auto &bp) {
+ LOG_PREFIX(ZBDSegmentManager::write_metadata);
+ DEBUG("block_size {}", sb.block_size);
+ bufferlist bl;
+ encode(sb, bl);
+ auto iter = bl.begin();
+ assert(bl.length() < sb.block_size);
+ DEBUG("buffer length {}", bl.length());
+ iter.copy(bl.length(), bp.c_str());
+ DEBUG("doing writeout");
+ return do_write(device, 0, bp);
+ });
+}
+
+static read_ertr::future<> do_read(
+ seastar::file &device,
+ uint64_t offset,
+ size_t len,
+ bufferptr &bptr)
+{
+ LOG_PREFIX(ZBDSegmentManager::do_read);
+ assert(len <= bptr.length());
+ DEBUG("offset {} len {}",
+ offset,
+ len);
+ return device.dma_read(
+ offset,
+ bptr.c_str(),
+ len
+ ).handle_exception(
+ [FNAME](auto e) -> read_ertr::future<size_t> {
+ ERROR("dma_read got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }
+ ).then([len](auto result) -> read_ertr::future<> {
+ if (result != len) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return read_ertr::now();
+ });
+}
+
+static
+ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>
+read_metadata(seastar::file &device, seastar::stat_data sd)
+{
+ assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() <
+ sd.block_size);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
+ [=, &device](auto &bp) {
+ return do_read(
+ device,
+ 0,
+ bp.length(),
+ bp
+ ).safe_then([=, &bp] {
+ bufferlist bl;
+ bl.push_back(bp);
+ zbd_sm_metadata_t ret;
+ auto bliter = bl.cbegin();
+ decode(ret, bliter);
+ ret.validate();
+ return ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>(
+ ZBDSegmentManager::access_ertr::ready_future_marker{},
+ ret);
+ });
+ });
+}
+
+ZBDSegmentManager::mount_ret ZBDSegmentManager::mount()
+{
+ return shard_devices.invoke_on_all([](auto &local_device) {
+ return local_device.shard_mount(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in ZBDSegmentManager::mount"
+ });
+ });
+}
+
+ZBDSegmentManager::mount_ret ZBDSegmentManager::shard_mount()
+{
+ return open_device(
+ device_path, seastar::open_flags::rw
+ ).safe_then([=, this](auto p) {
+ device = std::move(p.first);
+ auto sd = p.second;
+ return read_metadata(device, sd);
+ }).safe_then([=, this](auto meta){
+ shard_info = meta.shard_infos[seastar::this_shard_id()];
+ metadata = meta;
+ return mount_ertr::now();
+ });
+}
+
+ZBDSegmentManager::mkfs_ret ZBDSegmentManager::mkfs(
+ device_config_t config)
+{
+ return shard_devices.local().primary_mkfs(config
+ ).safe_then([this] {
+ return shard_devices.invoke_on_all([](auto &local_device) {
+ return local_device.shard_mkfs(
+ ).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid error in ZBDSegmentManager::mkfs"
+ });
+ });
+ });
+}
+
+ZBDSegmentManager::mkfs_ret ZBDSegmentManager::primary_mkfs(
+ device_config_t config)
+{
+ LOG_PREFIX(ZBDSegmentManager::primary_mkfs);
+ INFO("starting, device_path {}", device_path);
+ return seastar::do_with(
+ seastar::file{},
+ seastar::stat_data{},
+ zbd_sm_metadata_t{},
+ size_t(),
+ size_t(),
+ size_t(),
+ size_t(),
+ [=, this]
+ (auto &device,
+ auto &stat,
+ auto &sb,
+ auto &zone_size_sects,
+ auto &nr_zones,
+ auto &size,
+ auto &nr_cnv_zones) {
+ return open_device(
+ device_path,
+ seastar::open_flags::rw
+ ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size, &nr_cnv_zones](auto p) {
+ device = p.first;
+ stat = p.second;
+ return device.ioctl(
+ BLKGETNRZONES,
+ (void *)&nr_zones
+ ).then([&](int ret) {
+ if (nr_zones == 0) {
+ return seastar::make_exception_future<int>(
+ std::system_error(std::make_error_code(std::errc::io_error)));
+ }
+ return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects);
+ }).then([&](int ret) {
+ ceph_assert(zone_size_sects);
+ return reset_device(device, zone_size_sects, nr_zones);
+ }).then([&] {
+ return get_blk_dev_size(device);
+ }).then([&](auto devsize) {
+ size = devsize;
+ return get_nr_cnv_zones(device, nr_zones);
+ }).then([&](auto cnv_zones) {
+ DEBUG("Found {} conventional zones", cnv_zones);
+ nr_cnv_zones = cnv_zones;
+ return get_zone_capacity(device, nr_zones);
+ }).then([&, FNAME, config](auto zone_capacity_sects) {
+ ceph_assert(zone_capacity_sects);
+ DEBUG("zone_size in sectors {}, zone_capacity in sectors {}",
+ zone_size_sects, zone_capacity_sects);
+ sb = make_metadata(
+ size,
+ config.meta,
+ stat,
+ zone_size_sects,
+ zone_capacity_sects,
+ nr_cnv_zones,
+ nr_zones);
+ metadata = sb;
+ stats.metadata_write.increment(
+ ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>());
+ DEBUG("Wrote to stats.");
+ return write_metadata(device, sb);
+ }).finally([&, FNAME] {
+ DEBUG("Closing device.");
+ return device.close();
+ }).safe_then([FNAME] {
+ DEBUG("Returning from mkfs.");
+ return mkfs_ertr::now();
+ });
+ });
+ });
+}
+
+ZBDSegmentManager::mkfs_ret ZBDSegmentManager::shard_mkfs()
+{
+ LOG_PREFIX(ZBDSegmentManager::shard_mkfs);
+ INFO("starting, device_path {}", device_path);
+ return open_device(
+ device_path, seastar::open_flags::rw
+ ).safe_then([=, this](auto p) {
+ device = std::move(p.first);
+ auto sd = p.second;
+ return read_metadata(device, sd);
+ }).safe_then([=, this](auto meta){
+ shard_info = meta.shard_infos[seastar::this_shard_id()];
+ metadata = meta;
+ return device.close();
+ }).safe_then([FNAME] {
+ DEBUG("Returning from shard_mkfs.");
+ return mkfs_ertr::now();
+ });
+}
+
+// Return range of sectors to operate on.
+struct blk_zone_range make_range(
+ segment_id_t id,
+ size_t segment_size,
+ size_t first_segment_offset)
+{
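+  // Convert byte-based segment size/offset into 512B sectors (SECT_SHIFT)
+  // as expected by the zoned block device ioctls.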
+ return blk_zone_range{
+ (id.device_segment_id() * (segment_size >> SECT_SHIFT)
+ + (first_segment_offset >> SECT_SHIFT)),
+ (segment_size >> SECT_SHIFT)
+ };
+}
+
+using blk_zone_op_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+using blk_zone_op_ret = blk_zone_op_ertr::future<>;
+blk_zone_op_ret blk_zone_op(seastar::file &device,
+ blk_zone_range &range,
+ zone_op op) {
+ LOG_PREFIX(ZBDSegmentManager::blk_zone_op);
+
+ unsigned long ioctl_op = 0;
+ switch (op) {
+ using enum zone_op;
+ case OPEN:
+ ioctl_op = BLKOPENZONE;
+ break;
+ case FINISH:
+ ioctl_op = BLKFINISHZONE;
+ break;
+ case RESET:
+ ioctl_op = BLKRESETZONE;
+ break;
+ case CLOSE:
+ ioctl_op = BLKCLOSEZONE;
+ break;
+ default:
+ ERROR("Invalid zone operation {}", op);
+ ceph_assert(ioctl_op);
+ }
+
+ return device.ioctl(
+ ioctl_op,
+ &range
+ ).then_wrapped([=](auto f) -> blk_zone_op_ret {
+ if (f.failed()) {
+ ERROR("{} ioctl failed", op);
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ int ret = f.get();
+ if (ret == 0) {
+ return seastar::now();
+ } else {
+ ERROR("{} ioctl failed with return code {}", op, ret);
+ return crimson::ct_error::input_output_error::make();
+ }
+ }
+ });
+}
+
+ZBDSegmentManager::open_ertr::future<SegmentRef> ZBDSegmentManager::open(
+ segment_id_t id)
+{
+ LOG_PREFIX(ZBDSegmentManager::open);
+ return seastar::do_with(
+ blk_zone_range{},
+ [=, this](auto &range) {
+ range = make_range(
+ id,
+ metadata.segment_size,
+ shard_info.first_segment_offset);
+ return blk_zone_op(
+ device,
+ range,
+ zone_op::OPEN
+ );
+ }
+ ).safe_then([=, this] {
+ DEBUG("segment {}, open successful", id);
+ return open_ertr::future<SegmentRef>(
+ open_ertr::ready_future_marker{},
+ SegmentRef(new ZBDSegment(*this, id))
+ );
+ });
+}
+
+ZBDSegmentManager::release_ertr::future<> ZBDSegmentManager::release(
+ segment_id_t id)
+{
+ LOG_PREFIX(ZBDSegmentManager::release);
+ DEBUG("Resetting zone/segment {}", id);
+ return seastar::do_with(
+ blk_zone_range{},
+ [=, this](auto &range) {
+ range = make_range(
+ id,
+ metadata.segment_size,
+ shard_info.first_segment_offset);
+ return blk_zone_op(
+ device,
+ range,
+ zone_op::RESET
+ );
+ }
+ ).safe_then([=] {
+ DEBUG("segment release successful");
+ return release_ertr::now();
+ });
+}
+
+SegmentManager::read_ertr::future<> ZBDSegmentManager::read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out)
+{
+ LOG_PREFIX(ZBDSegmentManager::read);
+ auto& seg_addr = addr.as_seg_paddr();
+ if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) {
+ ERROR("invalid segment {}",
+ seg_addr.get_segment_id().device_segment_id());
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (seg_addr.get_segment_off() + len > metadata.segment_capacity) {
+ ERROR("invalid read offset {}, len {}",
+ addr,
+ len);
+ return crimson::ct_error::invarg::make();
+ }
+ return do_read(
+ device,
+ get_offset(addr),
+ len,
+ out);
+}
+
+Segment::close_ertr::future<> ZBDSegmentManager::segment_close(
+ segment_id_t id, segment_off_t write_pointer)
+{
+ LOG_PREFIX(ZBDSegmentManager::segment_close);
+ return seastar::do_with(
+ blk_zone_range{},
+ [=, this](auto &range) {
+ range = make_range(
+ id,
+ metadata.segment_size,
+ shard_info.first_segment_offset);
+ return blk_zone_op(
+ device,
+ range,
+ zone_op::FINISH
+ );
+ }
+ ).safe_then([=] {
+ DEBUG("zone finish successful");
+ return Segment::close_ertr::now();
+ });
+}
+
+Segment::write_ertr::future<> ZBDSegmentManager::segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check)
+{
+ LOG_PREFIX(ZBDSegmentManager::segment_write);
+ assert(addr.get_device_id() == get_device_id());
+ assert((bl.length() % metadata.block_size) == 0);
+ auto& seg_addr = addr.as_seg_paddr();
+ DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
+ seg_addr.get_segment_id(),
+ seg_addr.get_segment_off(),
+ get_offset(addr),
+ bl.length());
+ stats.data_write.increment(bl.length());
+ return do_writev(
+ get_device_id(),
+ device,
+ get_offset(addr),
+ std::move(bl),
+ metadata.block_size);
+}
+
+device_id_t ZBDSegmentManager::get_device_id() const
+{
+ return metadata.device_id;
+};
+
+secondary_device_set_t& ZBDSegmentManager::get_secondary_devices()
+{
+ return metadata.secondary_devices;
+};
+
+magic_t ZBDSegmentManager::get_magic() const
+{
+ return metadata.magic;
+};
+
+segment_off_t ZBDSegment::get_write_capacity() const
+{
+ return manager.get_segment_size();
+}
+
+SegmentManager::close_ertr::future<> ZBDSegmentManager::close()
+{
+ if (device) {
+ return device.close();
+ }
+ return seastar::now();
+}
+
+Segment::close_ertr::future<> ZBDSegment::close()
+{
+ return manager.segment_close(id, write_pointer);
+}
+
+Segment::write_ertr::future<> ZBDSegment::write(
+ segment_off_t offset, ceph::bufferlist bl)
+{
+ LOG_PREFIX(ZBDSegment::write);
+ if (offset != write_pointer || offset % manager.metadata.block_size != 0) {
+ ERROR("Segment offset and zone write pointer mismatch. "
+ "segment {} segment-offset {} write pointer {}",
+ id, offset, write_pointer);
+ return crimson::ct_error::invarg::make();
+ }
+ if (offset + bl.length() > manager.metadata.segment_capacity) {
+ return crimson::ct_error::enospc::make();
+ }
+
+ write_pointer = offset + bl.length();
+ return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
+}
+
+Segment::write_ertr::future<> ZBDSegment::write_padding_bytes(
+ size_t padding_bytes)
+{
+ LOG_PREFIX(ZBDSegment::write_padding_bytes);
+ DEBUG("Writing {} padding bytes to segment {} at wp {}",
+ padding_bytes, id, write_pointer);
+
+ return crimson::repeat([FNAME, padding_bytes, this] () mutable {
+ size_t bufsize = 0;
+ if (padding_bytes >= MAX_PADDING_SIZE) {
+ bufsize = MAX_PADDING_SIZE;
+ } else {
+ bufsize = padding_bytes;
+ }
+
+ padding_bytes -= bufsize;
+ bufferptr bp(ceph::buffer::create_page_aligned(bufsize));
+ bp.zero();
+ bufferlist padd_bl;
+ padd_bl.append(bp);
+ return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() {
+ if (padding_bytes == 0) {
+ return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::yes);
+ } else {
+ return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::no);
+ }
+ });
+ });
+}
+
+// Advance write pointer, to given offset.
+Segment::write_ertr::future<> ZBDSegment::advance_wp(
+ segment_off_t offset)
+{
+ LOG_PREFIX(ZBDSegment::advance_wp);
+
+ DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
+ if (offset < write_pointer) {
+ return crimson::ct_error::invarg::make();
+ }
+
+ size_t padding_bytes = offset - write_pointer;
+
+ if (padding_bytes == 0) {
+ return write_ertr::now();
+ }
+
+ assert(padding_bytes % manager.metadata.block_size == 0);
+
+ return write_padding_bytes(padding_bytes);
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/zbd.h b/src/crimson/os/seastore/segment_manager/zbd.h
new file mode 100644
index 000000000..c18f46336
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/zbd.h
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <linux/blkzoned.h>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+
+#include "crimson/common/layout.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "include/uuid.h"
+
+namespace crimson::os::seastore::segment_manager::zbd {
+
+ struct zbd_shard_info_t {
+ size_t size = 0;
+ size_t segments = 0;
+ size_t first_segment_offset = 0;
+
+ DENC(zbd_shard_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.size, p);
+ denc(v.segments, p);
+ denc(v.first_segment_offset, p);
+ DENC_FINISH(p);
+ }
+ };
+
+ struct zbd_sm_metadata_t {
+ unsigned int shard_num = 0;
+ size_t segment_size = 0;
+ size_t segment_capacity = 0;
+ size_t zones_per_segment = 0;
+ size_t zone_capacity = 0;
+ size_t block_size = 0;
+ size_t zone_size = 0;
+
+ std::vector<zbd_shard_info_t> shard_infos;
+
+ seastore_meta_t meta;
+
+ bool major_dev = false;
+ magic_t magic = 0;
+ device_type_t dtype = device_type_t::NONE;
+ device_id_t device_id = 0;
+ secondary_device_set_t secondary_devices;
+
+ DENC(zbd_sm_metadata_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.shard_num, p);
+ denc(v.segment_size, p);
+ denc(v.segment_capacity, p);
+ denc(v.zones_per_segment, p);
+ denc(v.zone_capacity, p);
+ denc(v.block_size, p);
+ denc(v.zone_size, p);
+ denc(v.shard_infos, p);
+ denc(v.meta, p);
+ denc(v.magic, p);
+ denc(v.dtype, p);
+ denc(v.device_id, p);
+ if (v.major_dev) {
+ denc(v.secondary_devices, p);
+ }
+ DENC_FINISH(p);
+ }
+
+ void validate() const {
+ ceph_assert_always(shard_num == seastar::smp::count);
+ for (unsigned int i = 0; i < seastar::smp::count; i++) {
+ ceph_assert_always(shard_infos[i].size > 0);
+ ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX);
+ ceph_assert_always(shard_infos[i].segments > 0);
+ ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX);
+ }
+ ceph_assert_always(segment_capacity > 0);
+ ceph_assert_always(segment_capacity <= SEGMENT_OFF_MAX);
+ }
+ };
+
+ using write_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+ using read_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+
+ enum class zone_op {
+ OPEN,
+ FINISH,
+ CLOSE,
+ RESET,
+ };
+
+ class ZBDSegmentManager;
+
+ class ZBDSegment final : public Segment {
+ public:
+ ZBDSegment(ZBDSegmentManager &man, segment_id_t i) : manager(man), id(i){};
+
+ segment_id_t get_segment_id() const final { return id; }
+ segment_off_t get_write_capacity() const final;
+ segment_off_t get_write_ptr() const final { return write_pointer; }
+ close_ertr::future<> close() final;
+ write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+ write_ertr::future<> advance_wp(segment_off_t offset) final;
+
+ ~ZBDSegment() {}
+ private:
+ friend class ZBDSegmentManager;
+ ZBDSegmentManager &manager;
+ const segment_id_t id;
+ segment_off_t write_pointer = 0;
+ write_ertr::future<> write_padding_bytes(size_t padding_bytes);
+ };
+
+ class ZBDSegmentManager final : public SegmentManager{
+ // interfaces used by Device
+ public:
+ seastar::future<> start() {
+ return shard_devices.start(device_path);
+ }
+
+ seastar::future<> stop() {
+ return shard_devices.stop();
+ }
+
+ Device& get_sharded_device() final {
+ return shard_devices.local();
+ }
+
+ mount_ret mount() final;
+ mkfs_ret mkfs(device_config_t meta) final;
+
+ ZBDSegmentManager(const std::string &path) : device_path(path) {}
+
+ ~ZBDSegmentManager() final = default;
+
+ //interfaces used by each shard device
+ public:
+ open_ertr::future<SegmentRef> open(segment_id_t id) final;
+ close_ertr::future<> close() final;
+
+ release_ertr::future<> release(segment_id_t id) final;
+
+ read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final;
+
+ device_type_t get_device_type() const final {
+ return device_type_t::ZBD;
+ }
+
+ size_t get_available_size() const final {
+ return shard_info.size;
+ };
+
+ extent_len_t get_block_size() const final {
+ return metadata.block_size;
+ };
+
+ segment_off_t get_segment_size() const final {
+ return metadata.segment_capacity;
+ };
+
+ const seastore_meta_t &get_meta() const {
+ return metadata.meta;
+ };
+
+ device_id_t get_device_id() const final;
+
+ secondary_device_set_t& get_secondary_devices() final;
+
+ magic_t get_magic() const final;
+
+ Segment::write_ertr::future<> segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check=false);
+
+ private:
+ friend class ZBDSegment;
+ std::string device_path;
+ zbd_shard_info_t shard_info;
+ zbd_sm_metadata_t metadata;
+ seastar::file device;
+ uint32_t nr_zones;
+ struct effort_t {
+ uint64_t num = 0;
+ uint64_t bytes = 0;
+
+ void increment(uint64_t read_bytes) {
+ ++num;
+ bytes += read_bytes;
+ }
+ };
+
+ struct zbd_sm_stats {
+ effort_t data_read = {};
+ effort_t data_write = {};
+ effort_t metadata_write = {};
+ uint64_t opened_segments = 0;
+ uint64_t closed_segments = 0;
+ uint64_t closed_segments_unused_bytes = 0;
+ uint64_t released_segments = 0;
+
+ void reset() {
+ *this = zbd_sm_stats{};
+ }
+ } stats;
+
+ void register_metrics();
+ seastar::metrics::metric_group metrics;
+
+ Segment::close_ertr::future<> segment_close(
+ segment_id_t id, segment_off_t write_pointer);
+
+ uint64_t get_offset(paddr_t addr) {
+ auto& seg_addr = addr.as_seg_paddr();
+ return (shard_info.first_segment_offset +
+ (seg_addr.get_segment_id().device_segment_id() *
+ metadata.segment_size)) + seg_addr.get_segment_off();
+ }
+ private:
+ // shard 0 mkfs
+ mkfs_ret primary_mkfs(device_config_t meta);
+ // all shards mkfs
+ mkfs_ret shard_mkfs();
+
+ mount_ret shard_mount();
+
+ seastar::sharded<ZBDSegmentManager> shard_devices;
+ };
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::segment_manager::zbd::zbd_shard_info_t
+)
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::segment_manager::zbd::zbd_sm_metadata_t
+)
diff --git a/src/crimson/os/seastore/segment_manager_group.cc b/src/crimson/os/seastore/segment_manager_group.cc
new file mode 100644
index 000000000..332b794b7
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager_group.cc
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/os/seastore/segment_manager_group.h"
+
+#include "crimson/os/seastore/logging.h"
+
+SET_SUBSYS(seastore_journal);
+
+namespace crimson::os::seastore {
+
+SegmentManagerGroup::read_segment_tail_ret
+SegmentManagerGroup::read_segment_tail(segment_id_t segment)
+{
+ assert(has_device(segment.device_id()));
+ auto& segment_manager = *segment_managers[segment.device_id()];
+ return segment_manager.read(
+ paddr_t::make_seg_paddr(
+ segment,
+ segment_manager.get_segment_size() - get_rounded_tail_length()),
+ get_rounded_tail_length()
+ ).handle_error(
+ read_segment_header_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SegmentManagerGroup::read_segment_tail"
+ }
+ ).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_tail_ret {
+ LOG_PREFIX(SegmentManagerGroup::read_segment_tail);
+ DEBUG("segment {} bptr size {}", segment, bptr.length());
+
+ segment_tail_t tail;
+ bufferlist bl;
+ bl.push_back(bptr);
+
+ DEBUG("segment {} block crc {}",
+ segment,
+ bl.begin().crc32c(segment_manager.get_block_size(), 0));
+
+ auto bp = bl.cbegin();
+ try {
+ decode(tail, bp);
+ } catch (ceph::buffer::error &e) {
+ DEBUG("segment {} unable to decode tail, skipping -- {}",
+ segment, e.what());
+ return crimson::ct_error::enodata::make();
+ }
+ DEBUG("segment {} tail {}", segment, tail);
+ return read_segment_tail_ret(
+ read_segment_tail_ertr::ready_future_marker{},
+ tail);
+ });
+}
+
+SegmentManagerGroup::read_segment_header_ret
+SegmentManagerGroup::read_segment_header(segment_id_t segment)
+{
+ assert(has_device(segment.device_id()));
+ auto& segment_manager = *segment_managers[segment.device_id()];
+ return segment_manager.read(
+ paddr_t::make_seg_paddr(segment, 0),
+ get_rounded_header_length()
+ ).handle_error(
+ read_segment_header_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in SegmentManagerGroup::read_segment_header"
+ }
+ ).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_header_ret {
+ LOG_PREFIX(SegmentManagerGroup::read_segment_header);
+ DEBUG("segment {} bptr size {}", segment, bptr.length());
+
+ segment_header_t header;
+ bufferlist bl;
+ bl.push_back(bptr);
+
+ DEBUG("segment {} block crc {}",
+ segment,
+ bl.begin().crc32c(segment_manager.get_block_size(), 0));
+
+ auto bp = bl.cbegin();
+ try {
+ decode(header, bp);
+ } catch (ceph::buffer::error &e) {
+ DEBUG("segment {} unable to decode header, skipping -- {}",
+ segment, e.what());
+ return crimson::ct_error::enodata::make();
+ }
+ DEBUG("segment {} header {}", segment, header);
+ return read_segment_header_ret(
+ read_segment_header_ertr::ready_future_marker{},
+ header);
+ });
+}
+
+void SegmentManagerGroup::initialize_cursor(
+ scan_valid_records_cursor &cursor)
+{
+ LOG_PREFIX(SegmentManagerGroup::initialize_cursor);
+ assert(has_device(cursor.get_segment_id().device_id()));
+ auto& segment_manager =
+ *segment_managers[cursor.get_segment_id().device_id()];
+ if (cursor.get_segment_offset() == 0) {
+ INFO("start to scan segment {}", cursor.get_segment_id());
+ cursor.increment_seq(segment_manager.get_block_size());
+ }
+ cursor.block_size = segment_manager.get_block_size();
+}
+
+SegmentManagerGroup::read_ret
+SegmentManagerGroup::read(paddr_t start, size_t len)
+{
+ LOG_PREFIX(SegmentManagerGroup::read);
+ assert(has_device(start.get_device_id()));
+ auto& segment_manager = *segment_managers[start.get_device_id()];
+ TRACE("reading data {}~{}", start, len);
+ return segment_manager.read(
+ start,
+ len
+ ).safe_then([](auto bptr) {
+ return read_ret(
+ read_ertr::ready_future_marker{},
+ std::move(bptr)
+ );
+ });
+}
+
+SegmentManagerGroup::find_journal_segment_headers_ret
+SegmentManagerGroup::find_journal_segment_headers()
+{
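+  // Scan every segment on every device; segments whose headers are missing
+  // or undecodable (enoent/enodata) are skipped, only I/O errors propagate.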
+ return seastar::do_with(
+ get_segment_managers(),
+ find_journal_segment_headers_ret_bare{},
+ [this](auto &sms, auto& ret) -> find_journal_segment_headers_ret
+ {
+ return crimson::do_for_each(sms,
+ [this, &ret](SegmentManager *sm)
+ {
+ LOG_PREFIX(SegmentManagerGroup::find_journal_segment_headers);
+ auto device_id = sm->get_device_id();
+ auto num_segments = sm->get_num_segments();
+ DEBUG("processing {} with {} segments",
+ device_id_printer_t{device_id}, num_segments);
+ return crimson::do_for_each(
+ boost::counting_iterator<device_segment_id_t>(0),
+ boost::counting_iterator<device_segment_id_t>(num_segments),
+ [this, &ret, device_id](device_segment_id_t d_segment_id)
+ {
+ segment_id_t segment_id{device_id, d_segment_id};
+ return read_segment_header(segment_id
+ ).safe_then([segment_id, &ret](auto &&header) {
+ if (header.get_type() == segment_type_t::JOURNAL) {
+ ret.emplace_back(std::make_pair(segment_id, std::move(header)));
+ }
+ }).handle_error(
+ crimson::ct_error::enoent::handle([](auto) {
+ return find_journal_segment_headers_ertr::now();
+ }),
+ crimson::ct_error::enodata::handle([](auto) {
+ return find_journal_segment_headers_ertr::now();
+ }),
+ crimson::ct_error::input_output_error::pass_further{}
+ );
+ });
+ }).safe_then([&ret]() mutable {
+ return find_journal_segment_headers_ret{
+ find_journal_segment_headers_ertr::ready_future_marker{},
+ std::move(ret)};
+ });
+ });
+}
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/segment_manager_group.h b/src/crimson/os/seastore/segment_manager_group.h
new file mode 100644
index 000000000..f193b5eed
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager_group.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <set>
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/record_scanner.h"
+
+namespace crimson::os::seastore {
+
+class SegmentManagerGroup : public RecordScanner {
+public:
+ SegmentManagerGroup() {
+ segment_managers.resize(DEVICE_ID_MAX, nullptr);
+ }
+
+ const std::set<device_id_t>& get_device_ids() const {
+ return device_ids;
+ }
+
+ std::vector<SegmentManager*> get_segment_managers() const {
+ assert(device_ids.size());
+ std::vector<SegmentManager*> ret;
+ for (auto& device_id : device_ids) {
+ auto segment_manager = segment_managers[device_id];
+ assert(segment_manager->get_device_id() == device_id);
+ ret.emplace_back(segment_manager);
+ }
+ return ret;
+ }
+
+ void add_segment_manager(SegmentManager* segment_manager) {
+ auto device_id = segment_manager->get_device_id();
+ ceph_assert(!has_device(device_id));
+ if (!device_ids.empty()) {
+ auto existing_id = *device_ids.begin();
+ ceph_assert(segment_managers[existing_id]->get_device_type()
+ == segment_manager->get_device_type());
+ }
+ segment_managers[device_id] = segment_manager;
+ device_ids.insert(device_id);
+ }
+
+ void reset() {
+ segment_managers.clear();
+ segment_managers.resize(DEVICE_ID_MAX, nullptr);
+ device_ids.clear();
+ }
+
+ /**
+ * get device info
+ *
+   * All segment managers in the group are assumed to share the following information.
+ */
+ extent_len_t get_block_size() const {
+ assert(device_ids.size());
+ return segment_managers[*device_ids.begin()]->get_block_size();
+ }
+
+ segment_off_t get_segment_size() const {
+ assert(device_ids.size());
+ return segment_managers[*device_ids.begin()]->get_segment_size();
+ }
+
+ const seastore_meta_t &get_meta() const {
+ assert(device_ids.size());
+ return segment_managers[*device_ids.begin()]->get_meta();
+ }
+
+ std::size_t get_rounded_header_length() const {
+ return p2roundup(
+ ceph::encoded_sizeof_bounded<segment_header_t>(),
+ (std::size_t)get_block_size());
+ }
+
+ std::size_t get_rounded_tail_length() const {
+ return p2roundup(
+ ceph::encoded_sizeof_bounded<segment_tail_t>(),
+ (std::size_t)get_block_size());
+ }
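+
+ // Worked example (illustrative assumption, not taken from the source): with
+ // a 4096-byte block size and an encoded segment_header_t of roughly 200
+ // bytes, p2roundup(200, 4096) == 4096, so the header occupies exactly one
+ // block; the tail length is rounded the same way.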
+
+ using read_segment_header_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::enodata,
+ crimson::ct_error::input_output_error
+ >;
+ using read_segment_header_ret = read_segment_header_ertr::future<
+ segment_header_t>;
+ read_segment_header_ret read_segment_header(segment_id_t segment);
+
+ using read_segment_tail_ertr = read_segment_header_ertr;
+ using read_segment_tail_ret = read_segment_tail_ertr::future<
+ segment_tail_t>;
+ read_segment_tail_ret read_segment_tail(segment_id_t segment);
+
+ /*
+ * read journal segment headers
+ */
+ using find_journal_segment_headers_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using find_journal_segment_headers_ret_bare = std::vector<
+ std::pair<segment_id_t, segment_header_t>>;
+ using find_journal_segment_headers_ret = find_journal_segment_headers_ertr::future<
+ find_journal_segment_headers_ret_bare>;
+ find_journal_segment_headers_ret find_journal_segment_headers();
+
+ using open_ertr = SegmentManager::open_ertr;
+ open_ertr::future<SegmentRef> open(segment_id_t id) {
+ assert(has_device(id.device_id()));
+ return segment_managers[id.device_id()]->open(id);
+ }
+
+ using release_ertr = SegmentManager::release_ertr;
+ release_ertr::future<> release_segment(segment_id_t id) {
+ assert(has_device(id.device_id()));
+ return segment_managers[id.device_id()]->release(id);
+ }
+
+private:
+ bool has_device(device_id_t id) const {
+ assert(id <= DEVICE_ID_MAX_VALID);
+ return device_ids.count(id) >= 1;
+ }
+
+ void initialize_cursor(scan_valid_records_cursor &cursor) final;
+
+ read_ret read(paddr_t start, size_t len) final;
+
+ bool is_record_segment_seq_invalid(scan_valid_records_cursor &cursor,
+ record_group_header_t &header) final {
+ return false;
+ }
+
+ int64_t get_segment_end_offset(paddr_t addr) final {
+ auto& seg_addr = addr.as_seg_paddr();
+ auto& segment_manager = *segment_managers[seg_addr.get_segment_id().device_id()];
+ return static_cast<int64_t>(segment_manager.get_segment_size());
+ }
+
+ std::vector<SegmentManager*> segment_managers;
+ std::set<device_id_t> device_ids;
+};
+
+using SegmentManagerGroupRef = std::unique_ptr<SegmentManagerGroup>;
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/segment_seq_allocator.h b/src/crimson/os/seastore/segment_seq_allocator.h
new file mode 100644
index 000000000..28c81bf32
--- /dev/null
+++ b/src/crimson/os/seastore/segment_seq_allocator.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+class AsyncCleaner;
+}
+
+namespace crimson::os::seastore::journal {
+class SegmentedJournal;
+}
+
+namespace crimson::os::seastore {
+
+class SegmentSeqAllocator {
+public:
+ SegmentSeqAllocator(segment_type_t type)
+ : type(type) {}
+ segment_seq_t get_and_inc_next_segment_seq() {
+ return next_segment_seq++;
+ }
+private:
+ void set_next_segment_seq(segment_seq_t seq) {
+ LOG_PREFIX(SegmentSeqAllocator::set_next_segment_seq);
+ SUBDEBUG(
+ seastore_journal,
+ "{}, next={}, cur={}",
+ type,
+ segment_seq_printer_t{seq},
+ segment_seq_printer_t{next_segment_seq});
+ assert(type == segment_type_t::JOURNAL
+ ? seq >= next_segment_seq
+ : true);
+ if (seq > next_segment_seq)
+ next_segment_seq = seq;
+ }
+ segment_seq_t next_segment_seq = 0;
+ segment_type_t type = segment_type_t::NULL_SEG;
+ friend class journal::SegmentedJournal;
+ friend class SegmentCleaner;
+};
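+
+// Behavioural sketch (illustrative only, not part of the header): sequence
+// numbers are handed out monotonically, and set_next_segment_seq() can only
+// move the counter forward.
+//   SegmentSeqAllocator alloc(segment_type_t::JOURNAL);
+//   alloc.get_and_inc_next_segment_seq(); // -> 0
+//   alloc.get_and_inc_next_segment_seq(); // -> 1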
+
+using SegmentSeqAllocatorRef =
+ std::unique_ptr<SegmentSeqAllocator>;
+
+};
diff --git a/src/crimson/os/seastore/transaction.cc b/src/crimson/os/seastore/transaction.cc
new file mode 100644
index 000000000..4cab476c3
--- /dev/null
+++ b/src/crimson/os/seastore/transaction.cc
@@ -0,0 +1,8 @@
+#include "transaction.h"
+#include "crimson/common/interruptible_future.h"
+
+namespace crimson::interruptible {
+template
+thread_local interrupt_cond_t<::crimson::os::seastore::TransactionConflictCondition>
+interrupt_cond<::crimson::os::seastore::TransactionConflictCondition>;
+}
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
new file mode 100644
index 000000000..d423196fe
--- /dev/null
+++ b/src/crimson/os/seastore/transaction.h
@@ -0,0 +1,653 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive/list.hpp>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/ordering_handle.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/root_block.h"
+
+namespace crimson::os::seastore {
+
+class SeaStore;
+class Transaction;
+
+struct io_stat_t {
+ uint64_t num = 0;
+ uint64_t bytes = 0;
+
+ bool is_clear() const {
+ return (num == 0 && bytes == 0);
+ }
+
+ void increment(uint64_t _bytes) {
+ ++num;
+ bytes += _bytes;
+ }
+
+ void increment_stat(const io_stat_t& stat) {
+ num += stat.num;
+ bytes += stat.bytes;
+ }
+};
+inline std::ostream& operator<<(std::ostream& out, const io_stat_t& stat) {
+ return out << stat.num << "(" << stat.bytes << "B)";
+}
+
+struct version_stat_t {
+ uint64_t num = 0;
+ uint64_t version = 0;
+
+ bool is_clear() const {
+ return (num == 0 && version == 0);
+ }
+
+ void increment(extent_version_t v) {
+ ++num;
+ version += v;
+ }
+
+ void increment_stat(const version_stat_t& stat) {
+ num += stat.num;
+ version += stat.version;
+ }
+};
+
+/**
+ * Transaction
+ *
+ * Representation of in-progress mutation. Used exclusively through Cache methods.
+ *
+ * Transaction log levels:
+ * seastore_t
+ * - DEBUG: transaction create, conflict, commit events
+ * - TRACE: DEBUG details
+ * - seastore_cache logs
+ */
+class Transaction {
+public:
+ using Ref = std::unique_ptr<Transaction>;
+ using on_destruct_func_t = std::function<void(Transaction&)>;
+ enum class get_extent_ret {
+ PRESENT,
+ ABSENT,
+ RETIRED
+ };
+ get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) {
+ LOG_PREFIX(Transaction::get_extent);
+ // it's possible that both write_set and retired_set contain
+ // this addr at the same time when addr is absolute and the
+ // corresponding extent is used to map existing extent on disk.
+ // So search write_set first.
+ if (auto iter = write_set.find_offset(addr);
+ iter != write_set.end()) {
+ if (out)
+ *out = CachedExtentRef(&*iter);
+ SUBTRACET(seastore_cache, "{} is present in write_set -- {}",
+ *this, addr, *iter);
+ assert((*out)->is_valid());
+ return get_extent_ret::PRESENT;
+ } else if (retired_set.count(addr)) {
+ return get_extent_ret::RETIRED;
+ } else if (
+ auto iter = read_set.find(addr);
+ iter != read_set.end()) {
+ // placeholder in read-set should be in the retired-set
+ // at the same time.
+ assert(iter->ref->get_type() != extent_types_t::RETIRED_PLACEHOLDER);
+ if (out)
+ *out = iter->ref;
+ SUBTRACET(seastore_cache, "{} is present in read_set -- {}",
+ *this, addr, *(iter->ref));
+ return get_extent_ret::PRESENT;
+ } else {
+ return get_extent_ret::ABSENT;
+ }
+ }
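+
+ // A minimal caller-side sketch (illustrative only; `t`, `addr` and the
+ // branch bodies are assumptions, not part of this header):
+ //   CachedExtentRef ext;
+ //   switch (t.get_extent(addr, &ext)) {
+ //   case Transaction::get_extent_ret::PRESENT: /* use ext */       break;
+ //   case Transaction::get_extent_ret::RETIRED: /* retired in t */  break;
+ //   case Transaction::get_extent_ret::ABSENT:  /* ask the Cache */ break;
+ //   }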
+
+ void add_to_retired_set(CachedExtentRef ref) {
+ ceph_assert(!is_weak());
+ if (ref->is_exist_clean() ||
+ ref->is_exist_mutation_pending()) {
+ existing_block_stats.dec(ref);
+ ref->set_invalid(*this);
+ write_set.erase(*ref);
+ } else if (ref->is_initial_pending()) {
+ ref->set_invalid(*this);
+ write_set.erase(*ref);
+ } else if (ref->is_mutation_pending()) {
+ ref->set_invalid(*this);
+ write_set.erase(*ref);
+ assert(ref->prior_instance);
+ retired_set.insert(ref->prior_instance);
+ assert(read_set.count(ref->prior_instance->get_paddr()));
+ ref->prior_instance.reset();
+ } else {
+ // && retired_set.count(ref->get_paddr()) == 0
+ // If it's already in the set, insert here will be a noop,
+ // which is what we want.
+ retired_set.insert(ref);
+ }
+ }
+
+ void add_to_read_set(CachedExtentRef ref) {
+ if (is_weak()) return;
+
+ assert(ref->is_valid());
+
+ auto it = ref->transactions.lower_bound(
+ this, read_set_item_t<Transaction>::trans_cmp_t());
+ if (it != ref->transactions.end() && it->t == this) return;
+
+ auto [iter, inserted] = read_set.emplace(this, ref);
+ ceph_assert(inserted);
+ ref->transactions.insert_before(
+ it, const_cast<read_set_item_t<Transaction>&>(*iter));
+ }
+
+ void add_fresh_extent(
+ CachedExtentRef ref) {
+ ceph_assert(!is_weak());
+ if (ref->is_exist_clean()) {
+ existing_block_stats.inc(ref);
+ existing_block_list.push_back(ref);
+ } else if (ref->get_paddr().is_delayed()) {
+ assert(ref->get_paddr() == make_delayed_temp_paddr(0));
+ assert(ref->is_logical());
+ ref->set_paddr(make_delayed_temp_paddr(delayed_temp_offset));
+ delayed_temp_offset += ref->get_length();
+ delayed_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>());
+ fresh_block_stats.increment(ref->get_length());
+ } else if (ref->get_paddr().is_absolute()) {
+ pre_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>());
+ fresh_block_stats.increment(ref->get_length());
+ } else {
+ if (likely(ref->get_paddr() == make_record_relative_paddr(0))) {
+ ref->set_paddr(make_record_relative_paddr(offset));
+ } else {
+ ceph_assert(ref->get_paddr().is_fake());
+ }
+ offset += ref->get_length();
+ inline_block_list.push_back(ref);
+ fresh_block_stats.increment(ref->get_length());
+ }
+ write_set.insert(*ref);
+ if (is_backref_node(ref->get_type()))
+ fresh_backref_extents++;
+ }
+
+ uint64_t get_num_fresh_backref() const {
+ return fresh_backref_extents;
+ }
+
+ void mark_delayed_extent_inline(LogicalCachedExtentRef& ref) {
+ write_set.erase(*ref);
+ assert(ref->get_paddr().is_delayed());
+ ref->set_paddr(make_record_relative_paddr(offset),
+ /* need_update_mapping: */ true);
+ offset += ref->get_length();
+ inline_block_list.push_back(ref);
+ write_set.insert(*ref);
+ }
+
+ void mark_delayed_extent_ool(LogicalCachedExtentRef& ref) {
+ written_ool_block_list.push_back(ref);
+ }
+
+ void update_delayed_ool_extent_addr(LogicalCachedExtentRef& ref,
+ paddr_t final_addr) {
+ write_set.erase(*ref);
+ assert(ref->get_paddr().is_delayed());
+ ref->set_paddr(final_addr, /* need_update_mapping: */ true);
+ assert(!ref->get_paddr().is_null());
+ assert(!ref->is_inline());
+ write_set.insert(*ref);
+ }
+
+ void mark_allocated_extent_ool(LogicalCachedExtentRef& ref) {
+ assert(ref->get_paddr().is_absolute());
+ assert(!ref->is_inline());
+ written_ool_block_list.push_back(ref);
+ }
+
+ void add_mutated_extent(CachedExtentRef ref) {
+ ceph_assert(!is_weak());
+ assert(ref->is_exist_mutation_pending() ||
+ read_set.count(ref->prior_instance->get_paddr()));
+ mutated_block_list.push_back(ref);
+ if (!ref->is_exist_mutation_pending()) {
+ write_set.insert(*ref);
+ } else {
+ assert(write_set.find_offset(ref->get_paddr()) !=
+ write_set.end());
+ }
+ }
+
+ void replace_placeholder(CachedExtent& placeholder, CachedExtent& extent) {
+ ceph_assert(!is_weak());
+
+ assert(placeholder.get_type() == extent_types_t::RETIRED_PLACEHOLDER);
+ assert(extent.get_type() != extent_types_t::RETIRED_PLACEHOLDER);
+ assert(extent.get_type() != extent_types_t::ROOT);
+ assert(extent.get_paddr() == placeholder.get_paddr());
+ {
+ auto where = read_set.find(placeholder.get_paddr());
+ assert(where != read_set.end());
+ assert(where->ref.get() == &placeholder);
+ where = read_set.erase(where);
+ auto it = read_set.emplace_hint(where, this, &extent);
+ extent.transactions.insert(const_cast<read_set_item_t<Transaction>&>(*it));
+ }
+ {
+ auto where = retired_set.find(&placeholder);
+ assert(where != retired_set.end());
+ assert(where->get() == &placeholder);
+ where = retired_set.erase(where);
+ retired_set.emplace_hint(where, &extent);
+ }
+ }
+
+ auto get_delayed_alloc_list() {
+ std::list<LogicalCachedExtentRef> ret;
+ for (auto& extent : delayed_alloc_list) {
+ // delayed extents may be invalidated
+ if (extent->is_valid()) {
+ ret.push_back(std::move(extent));
+ } else {
+ ++num_delayed_invalid_extents;
+ }
+ }
+ delayed_alloc_list.clear();
+ return ret;
+ }
+
+ auto get_valid_pre_alloc_list() {
+ std::list<LogicalCachedExtentRef> ret;
+ assert(num_allocated_invalid_extents == 0);
+ for (auto& extent : pre_alloc_list) {
+ if (extent->is_valid()) {
+ ret.push_back(extent);
+ } else {
+ ++num_allocated_invalid_extents;
+ }
+ }
+ return ret;
+ }
+
+ const auto &get_inline_block_list() {
+ return inline_block_list;
+ }
+
+ const auto &get_mutated_block_list() {
+ return mutated_block_list;
+ }
+
+ const auto &get_existing_block_list() {
+ return existing_block_list;
+ }
+
+ const auto &get_retired_set() {
+ return retired_set;
+ }
+
+ bool is_retired(paddr_t paddr, extent_len_t len) {
+ if (retired_set.empty()) {
+ return false;
+ }
+ auto iter = retired_set.lower_bound(paddr);
+ if (iter == retired_set.end() ||
+ (*iter)->get_paddr() > paddr) {
+ assert(iter != retired_set.begin());
+ --iter;
+ }
+ auto retired_paddr = (*iter)->get_paddr();
+ auto retired_length = (*iter)->get_length();
+ return retired_paddr <= paddr &&
+ retired_paddr.add_offset(retired_length) >= paddr.add_offset(len);
+ }
+
+ template <typename F>
+ auto for_each_fresh_block(F &&f) const {
+ std::for_each(written_ool_block_list.begin(), written_ool_block_list.end(), f);
+ std::for_each(inline_block_list.begin(), inline_block_list.end(), f);
+ }
+
+ const io_stat_t& get_fresh_block_stats() const {
+ return fresh_block_stats;
+ }
+
+ using src_t = transaction_type_t;
+ src_t get_src() const {
+ return src;
+ }
+
+ bool is_weak() const {
+ return weak;
+ }
+
+ void test_set_conflict() {
+ conflicted = true;
+ }
+
+ bool is_conflicted() const {
+ return conflicted;
+ }
+
+ auto &get_handle() {
+ return handle;
+ }
+
+ Transaction(
+ OrderingHandle &&handle,
+ bool weak,
+ src_t src,
+ journal_seq_t initiated_after,
+ on_destruct_func_t&& f,
+ transaction_id_t trans_id
+ ) : weak(weak),
+ handle(std::move(handle)),
+ on_destruct(std::move(f)),
+ src(src),
+ trans_id(trans_id)
+ {}
+
+ void invalidate_clear_write_set() {
+ for (auto &&i: write_set) {
+ i.set_invalid(*this);
+ }
+ write_set.clear();
+ }
+
+ ~Transaction() {
+ on_destruct(*this);
+ invalidate_clear_write_set();
+ }
+
+ friend class crimson::os::seastore::SeaStore;
+ friend class TransactionConflictCondition;
+
+ void reset_preserve_handle(journal_seq_t initiated_after) {
+ root.reset();
+ offset = 0;
+ delayed_temp_offset = 0;
+ read_set.clear();
+ fresh_backref_extents = 0;
+ invalidate_clear_write_set();
+ mutated_block_list.clear();
+ fresh_block_stats = {};
+ num_delayed_invalid_extents = 0;
+ num_allocated_invalid_extents = 0;
+ delayed_alloc_list.clear();
+ inline_block_list.clear();
+ written_ool_block_list.clear();
+ pre_alloc_list.clear();
+ retired_set.clear();
+ existing_block_list.clear();
+ existing_block_stats = {};
+ onode_tree_stats = {};
+ omap_tree_stats = {};
+ lba_tree_stats = {};
+ backref_tree_stats = {};
+ ool_write_stats = {};
+ rewrite_version_stats = {};
+ conflicted = false;
+ if (!has_reset) {
+ has_reset = true;
+ }
+ }
+
+ bool did_reset() const {
+ return has_reset;
+ }
+
+ struct tree_stats_t {
+ uint64_t depth = 0;
+ uint64_t num_inserts = 0;
+ uint64_t num_erases = 0;
+ uint64_t num_updates = 0;
+ int64_t extents_num_delta = 0;
+
+ bool is_clear() const {
+ return (depth == 0 &&
+ num_inserts == 0 &&
+ num_erases == 0 &&
+ num_updates == 0 &&
+ extents_num_delta == 0);
+ }
+ };
+ tree_stats_t& get_onode_tree_stats() {
+ return onode_tree_stats;
+ }
+ tree_stats_t& get_omap_tree_stats() {
+ return omap_tree_stats;
+ }
+ tree_stats_t& get_lba_tree_stats() {
+ return lba_tree_stats;
+ }
+ tree_stats_t& get_backref_tree_stats() {
+ return backref_tree_stats;
+ }
+
+ struct ool_write_stats_t {
+ io_stat_t extents;
+ uint64_t md_bytes = 0;
+ uint64_t num_records = 0;
+
+ uint64_t get_data_bytes() const {
+ return extents.bytes;
+ }
+
+ bool is_clear() const {
+ return (extents.is_clear() &&
+ md_bytes == 0 &&
+ num_records == 0);
+ }
+ };
+ ool_write_stats_t& get_ool_write_stats() {
+ return ool_write_stats;
+ }
+ version_stat_t& get_rewrite_version_stats() {
+ return rewrite_version_stats;
+ }
+
+ struct existing_block_stats_t {
+ uint64_t valid_num = 0;
+ uint64_t clean_num = 0;
+ uint64_t mutated_num = 0;
+ void inc(const CachedExtentRef &ref) {
+ valid_num++;
+ if (ref->is_exist_clean()) {
+ clean_num++;
+ } else {
+ mutated_num++;
+ }
+ }
+ void dec(const CachedExtentRef &ref) {
+ valid_num--;
+ if (ref->is_exist_clean()) {
+ clean_num--;
+ } else {
+ mutated_num--;
+ }
+ }
+ };
+ existing_block_stats_t& get_existing_block_stats() {
+ return existing_block_stats;
+ }
+
+ transaction_id_t get_trans_id() const {
+ return trans_id;
+ }
+
+private:
+ friend class Cache;
+ friend Ref make_test_transaction();
+
+ /**
+ * If set, *this may not be used to perform writes and will not provide
+ * consistency, allowing operations that use it to avoid maintaining a read_set.
+ */
+ const bool weak;
+
+ RootBlockRef root; ///< ref to root if read or written by transaction
+
+ device_off_t offset = 0; ///< relative offset of next block
+ device_off_t delayed_temp_offset = 0;
+
+ /**
+ * read_set
+ *
+ * Holds a reference (with a refcount) to every extent read via *this.
+ * Submitting a transaction mutating any contained extent/addr will
+ * invalidate *this.
+ */
+ read_set_t<Transaction> read_set; ///< set of extents read by paddr
+
+ uint64_t fresh_backref_extents = 0; // counter of new backref extents
+
+ /**
+ * write_set
+ *
+ * Contains a reference (without a refcount) to every extent mutated
+ * as part of *this. No contained extent may be referenced outside
+ * of *this. Every contained extent will be in one of inline_block_list,
+ * written_ool_block_list and/or pre_alloc_list, mutated_block_list,
+ * or delayed_alloc_list.
+ */
+ ExtentIndex write_set;
+
+ /**
+ * lists of fresh blocks, holds refcounts, subset of write_set
+ */
+ io_stat_t fresh_block_stats;
+ uint64_t num_delayed_invalid_extents = 0;
+ uint64_t num_allocated_invalid_extents = 0;
+ /// blocks that will be committed with journal record inline
+ std::list<CachedExtentRef> inline_block_list;
+ /// blocks that will be committed with out-of-line record
+ std::list<CachedExtentRef> written_ool_block_list;
+ /// blocks with delayed allocation, may become inline or ool above
+ std::list<LogicalCachedExtentRef> delayed_alloc_list;
+
+ /// Extents with pre-allocated addresses,
+ /// will be added to written_ool_block_list after write
+ std::list<LogicalCachedExtentRef> pre_alloc_list;
+
+ /// list of mutated blocks, holds refcounts, subset of write_set
+ std::list<CachedExtentRef> mutated_block_list;
+
+ /// partial blocks of extents on disk, with data and refcounts
+ std::list<CachedExtentRef> existing_block_list;
+ existing_block_stats_t existing_block_stats;
+
+ /**
+ * retire_set
+ *
+ * Set of extents retired by *this.
+ */
+ pextent_set_t retired_set;
+
+ /// stats to collect when commit or invalidate
+ tree_stats_t onode_tree_stats;
+ tree_stats_t omap_tree_stats; // exclude omap tree depth
+ tree_stats_t lba_tree_stats;
+ tree_stats_t backref_tree_stats;
+ ool_write_stats_t ool_write_stats;
+ version_stat_t rewrite_version_stats;
+
+ bool conflicted = false;
+
+ bool has_reset = false;
+
+ OrderingHandle handle;
+
+ on_destruct_func_t on_destruct;
+
+ const src_t src;
+
+ transaction_id_t trans_id = TRANS_ID_NULL;
+};
+using TransactionRef = Transaction::Ref;
+
+/// Should only be used with dummy staged-fltree node extent manager
+inline TransactionRef make_test_transaction() {
+ static transaction_id_t next_id = 0;
+ return std::make_unique<Transaction>(
+ get_dummy_ordering_handle(),
+ false,
+ Transaction::src_t::MUTATE,
+ JOURNAL_SEQ_NULL,
+ [](Transaction&) {},
+ ++next_id
+ );
+}
+
+struct TransactionConflictCondition {
+ class transaction_conflict final : public std::exception {
+ public:
+ const char* what() const noexcept final {
+ return "transaction conflict detected";
+ }
+ };
+
+public:
+ TransactionConflictCondition(Transaction &t) : t(t) {}
+
+ template <typename Fut>
+ std::optional<Fut> may_interrupt() {
+ if (t.conflicted) {
+ return seastar::futurize<Fut>::make_exception_future(
+ transaction_conflict());
+ } else {
+ return std::optional<Fut>();
+ }
+ }
+
+ template <typename T>
+ static constexpr bool is_interruption_v =
+ std::is_same_v<T, transaction_conflict>;
+
+
+ static bool is_interruption(std::exception_ptr& eptr) {
+ return *eptr.__cxa_exception_type() == typeid(transaction_conflict);
+ }
+
+private:
+ Transaction &t;
+};
+
+using trans_intr = crimson::interruptible::interruptor<
+ TransactionConflictCondition
+ >;
+
+template <typename E>
+using trans_iertr =
+ crimson::interruptible::interruptible_errorator<
+ TransactionConflictCondition,
+ E
+ >;
+
+template <typename F, typename... Args>
+auto with_trans_intr(Transaction &t, F &&f, Args&&... args) {
+ return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>(
+ std::move(f),
+ TransactionConflictCondition(t),
+ t,
+ std::forward<Args>(args)...);
+}
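+
+// A minimal usage sketch (illustrative only; `tm`, `laddr` and `SomeExtent`
+// are assumptions, not part of this header):
+//   auto fut = with_trans_intr(*tref, [&](auto &t) {
+//     return tm.read_extent<SomeExtent>(t, laddr);
+//   });
+// A detected conflict is converted into crimson::ct_error::eagain on the
+// returned errorated future; callers typically retry (see repeat_eagain in
+// transaction_manager.h).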
+
+template <typename T>
+using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>;
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::io_stat_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
new file mode 100644
index 000000000..ad8e5f1a6
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -0,0 +1,759 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "include/denc.h"
+#include "include/intarith.h"
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/journal.h"
+#include "crimson/os/seastore/journal/circular_bounded_journal.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/random_block_manager/rbm_device.h"
+
+/*
+ * TransactionManager logs
+ *
+ * levels:
+ * - INFO: major initiation, closing operations
+ * - DEBUG: major extent related operations, INFO details
+ * - TRACE: DEBUG details
+ * - seastore_t logs
+ */
+SET_SUBSYS(seastore_tm);
+
+namespace crimson::os::seastore {
+
+TransactionManager::TransactionManager(
+ JournalRef _journal,
+ CacheRef _cache,
+ LBAManagerRef _lba_manager,
+ ExtentPlacementManagerRef &&_epm,
+ BackrefManagerRef&& _backref_manager)
+ : cache(std::move(_cache)),
+ lba_manager(std::move(_lba_manager)),
+ journal(std::move(_journal)),
+ epm(std::move(_epm)),
+ backref_manager(std::move(_backref_manager))
+{
+ epm->set_extent_callback(this);
+ journal->set_write_pipeline(&write_pipeline);
+}
+
+TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
+{
+ LOG_PREFIX(TransactionManager::mkfs);
+ INFO("enter");
+ return epm->mount(
+ ).safe_then([this] {
+ return journal->open_for_mkfs();
+ }).safe_then([this](auto start_seq) {
+ journal->get_trimmer().update_journal_tails(start_seq, start_seq);
+ journal->get_trimmer().set_journal_head(start_seq);
+ return epm->open_for_write();
+ }).safe_then([this, FNAME]() {
+ return with_transaction_intr(
+ Transaction::src_t::MUTATE,
+ "mkfs_tm",
+ [this, FNAME](auto& t)
+ {
+ cache->init();
+ return cache->mkfs(t
+ ).si_then([this, &t] {
+ return lba_manager->mkfs(t);
+ }).si_then([this, &t] {
+ return backref_manager->mkfs(t);
+ }).si_then([this, FNAME, &t] {
+ INFOT("submitting mkfs transaction", t);
+ return submit_transaction_direct(t);
+ });
+ }).handle_error(
+ crimson::ct_error::eagain::handle([] {
+ ceph_assert(0 == "eagain impossible");
+ return mkfs_ertr::now();
+ }),
+ mkfs_ertr::pass_further{}
+ );
+ }).safe_then([this] {
+ return close();
+ }).safe_then([FNAME] {
+ INFO("completed");
+ });
+}
+
+TransactionManager::mount_ertr::future<> TransactionManager::mount()
+{
+ LOG_PREFIX(TransactionManager::mount);
+ INFO("enter");
+ cache->init();
+ return epm->mount(
+ ).safe_then([this] {
+ return journal->replay(
+ [this](
+ const auto &offsets,
+ const auto &e,
+ const journal_seq_t &dirty_tail,
+ const journal_seq_t &alloc_tail,
+ sea_time_point modify_time)
+ {
+ auto start_seq = offsets.write_result.start_seq;
+ return cache->replay_delta(
+ start_seq,
+ offsets.record_block_base,
+ e,
+ dirty_tail,
+ alloc_tail,
+ modify_time);
+ });
+ }).safe_then([this] {
+ return journal->open_for_mount();
+ }).safe_then([this](auto start_seq) {
+ journal->get_trimmer().set_journal_head(start_seq);
+ return with_transaction_weak(
+ "mount",
+ [this](auto &t)
+ {
+ return cache->init_cached_extents(t, [this](auto &t, auto &e) {
+ if (is_backref_node(e->get_type())) {
+ return backref_manager->init_cached_extent(t, e);
+ } else {
+ return lba_manager->init_cached_extent(t, e);
+ }
+ }).si_then([this, &t] {
+ epm->start_scan_space();
+ return backref_manager->scan_mapped_space(
+ t,
+ [this](
+ paddr_t paddr,
+ paddr_t backref_key,
+ extent_len_t len,
+ extent_types_t type,
+ laddr_t laddr) {
+ if (is_backref_node(type)) {
+ assert(laddr == L_ADDR_NULL);
+ assert(backref_key != P_ADDR_NULL);
+ backref_manager->cache_new_backref_extent(paddr, backref_key, type);
+ cache->update_tree_extents_num(type, 1);
+ epm->mark_space_used(paddr, len);
+ } else if (laddr == L_ADDR_NULL) {
+ assert(backref_key == P_ADDR_NULL);
+ cache->update_tree_extents_num(type, -1);
+ epm->mark_space_free(paddr, len);
+ } else {
+ assert(backref_key == P_ADDR_NULL);
+ cache->update_tree_extents_num(type, 1);
+ epm->mark_space_used(paddr, len);
+ }
+ });
+ });
+ });
+ }).safe_then([this] {
+ return epm->open_for_write();
+ }).safe_then([FNAME, this] {
+ epm->start_background();
+ INFO("completed");
+ }).handle_error(
+ mount_ertr::pass_further{},
+ crimson::ct_error::all_same_way([] {
+ ceph_assert(0 == "unhandled error");
+ return mount_ertr::now();
+ })
+ );
+}
+
+TransactionManager::close_ertr::future<> TransactionManager::close() {
+ LOG_PREFIX(TransactionManager::close);
+ INFO("enter");
+ return epm->stop_background(
+ ).then([this] {
+ return cache->close();
+ }).safe_then([this] {
+ cache->dump_contents();
+ return journal->close();
+ }).safe_then([this] {
+ return epm->close();
+ }).safe_then([FNAME] {
+ INFO("completed");
+ return seastar::now();
+ });
+}
+
+TransactionManager::ref_ret TransactionManager::inc_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref)
+{
+ LOG_PREFIX(TransactionManager::inc_ref);
+ TRACET("{}", t, *ref);
+ return lba_manager->incref_extent(t, ref->get_laddr()
+ ).si_then([FNAME, ref, &t](auto result) {
+ DEBUGT("extent refcount is incremented to {} -- {}",
+ t, result.refcount, *ref);
+ return result.refcount;
+ }).handle_error_interruptible(
+ ref_iertr::pass_further{},
+ ct_error::all_same_way([](auto e) {
+ ceph_assert(0 == "unhandled error, TODO");
+ }));
+}
+
+TransactionManager::ref_ret TransactionManager::inc_ref(
+ Transaction &t,
+ laddr_t offset)
+{
+ LOG_PREFIX(TransactionManager::inc_ref);
+ TRACET("{}", t, offset);
+ return lba_manager->incref_extent(t, offset
+ ).si_then([FNAME, offset, &t](auto result) {
+ DEBUGT("extent refcount is incremented to {} -- {}~{}, {}",
+ t, result.refcount, offset, result.length, result.addr);
+ return result.refcount;
+ });
+}
+
+TransactionManager::ref_ret TransactionManager::dec_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref)
+{
+ LOG_PREFIX(TransactionManager::dec_ref);
+ TRACET("{}", t, *ref);
+ return lba_manager->decref_extent(t, ref->get_laddr(), true
+ ).si_then([this, FNAME, &t, ref](auto result) {
+ DEBUGT("extent refcount is decremented to {} -- {}",
+ t, result.refcount, *ref);
+ if (result.refcount == 0) {
+ cache->retire_extent(t, ref);
+ }
+ return result.refcount;
+ });
+}
+
+TransactionManager::ref_ret TransactionManager::_dec_ref(
+ Transaction &t,
+ laddr_t offset,
+ bool cascade_remove)
+{
+ LOG_PREFIX(TransactionManager::_dec_ref);
+ TRACET("{}", t, offset);
+ return lba_manager->decref_extent(t, offset, cascade_remove
+ ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret {
+ DEBUGT("extent refcount is decremented to {} -- {}~{}, {}",
+ t, result.refcount, offset, result.length, result.addr);
+ auto fut = ref_iertr::now();
+ if (result.refcount == 0) {
+ if (result.addr.is_paddr() &&
+ !result.addr.get_paddr().is_zero()) {
+ fut = cache->retire_extent_addr(
+ t, result.addr.get_paddr(), result.length);
+ }
+ }
+
+ return fut.si_then([result=std::move(result)] {
+ return result.refcount;
+ });
+ });
+}
+
+TransactionManager::refs_ret TransactionManager::dec_ref(
+ Transaction &t,
+ std::vector<laddr_t> offsets)
+{
+ LOG_PREFIX(TransactionManager::dec_ref);
+ DEBUG("{} offsets", offsets.size());
+ return seastar::do_with(std::move(offsets), std::vector<unsigned>(),
+ [this, &t] (auto &&offsets, auto &refcnt) {
+ return trans_intr::do_for_each(offsets.begin(), offsets.end(),
+ [this, &t, &refcnt] (auto &laddr) {
+ return this->dec_ref(t, laddr).si_then([&refcnt] (auto ref) {
+ refcnt.push_back(ref);
+ return ref_iertr::now();
+ });
+ }).si_then([&refcnt] {
+ return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnt));
+ });
+ });
+}
+
+TransactionManager::submit_transaction_iertr::future<>
+TransactionManager::submit_transaction(
+ Transaction &t)
+{
+ LOG_PREFIX(TransactionManager::submit_transaction);
+ SUBTRACET(seastore_t, "start", t);
+ return trans_intr::make_interruptible(
+ t.get_handle().enter(write_pipeline.reserve_projected_usage)
+ ).then_interruptible([this, FNAME, &t] {
+ auto dispatch_result = epm->dispatch_delayed_extents(t);
+ auto projected_usage = dispatch_result.usage;
+ SUBTRACET(seastore_t, "waiting for projected_usage: {}", t, projected_usage);
+ return trans_intr::make_interruptible(
+ epm->reserve_projected_usage(projected_usage)
+ ).then_interruptible([this, &t, dispatch_result = std::move(dispatch_result)] {
+ return do_submit_transaction(t, std::move(dispatch_result));
+ }).finally([this, FNAME, projected_usage, &t] {
+ SUBTRACET(seastore_t, "releasing projected_usage: {}", t, projected_usage);
+ epm->release_projected_usage(projected_usage);
+ });
+ });
+}
+
+TransactionManager::submit_transaction_direct_ret
+TransactionManager::submit_transaction_direct(
+ Transaction &tref,
+ std::optional<journal_seq_t> trim_alloc_to)
+{
+ return do_submit_transaction(
+ tref,
+ epm->dispatch_delayed_extents(tref),
+ trim_alloc_to);
+}
+
+TransactionManager::submit_transaction_direct_ret
+TransactionManager::do_submit_transaction(
+ Transaction &tref,
+ ExtentPlacementManager::dispatch_result_t dispatch_result,
+ std::optional<journal_seq_t> trim_alloc_to)
+{
+ LOG_PREFIX(TransactionManager::do_submit_transaction);
+ SUBTRACET(seastore_t, "start", tref);
+ return trans_intr::make_interruptible(
+ tref.get_handle().enter(write_pipeline.ool_writes)
+ ).then_interruptible([this, FNAME, &tref,
+ dispatch_result = std::move(dispatch_result)] {
+ return seastar::do_with(std::move(dispatch_result),
+ [this, FNAME, &tref](auto &dispatch_result) {
+ return epm->write_delayed_ool_extents(tref, dispatch_result.alloc_map
+ ).si_then([this, FNAME, &tref, &dispatch_result] {
+ SUBTRACET(seastore_t, "update delayed extent mappings", tref);
+ return lba_manager->update_mappings(tref, dispatch_result.delayed_extents);
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all("invalid error")
+ );
+ });
+ }).si_then([this, FNAME, &tref] {
+ auto allocated_extents = tref.get_valid_pre_alloc_list();
+ auto num_extents = allocated_extents.size();
+ SUBTRACET(seastore_t, "process {} allocated extents", tref, num_extents);
+ return epm->write_preallocated_ool_extents(tref, allocated_extents
+ ).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all("invalid error")
+ );
+ }).si_then([this, FNAME, &tref] {
+ SUBTRACET(seastore_t, "about to prepare", tref);
+ return tref.get_handle().enter(write_pipeline.prepare);
+ }).si_then([this, FNAME, &tref, trim_alloc_to=std::move(trim_alloc_to)]() mutable
+ -> submit_transaction_iertr::future<> {
+ if (trim_alloc_to && *trim_alloc_to != JOURNAL_SEQ_NULL) {
+ cache->trim_backref_bufs(*trim_alloc_to);
+ }
+
+ auto record = cache->prepare_record(
+ tref,
+ journal->get_trimmer().get_journal_head(),
+ journal->get_trimmer().get_dirty_tail());
+
+ tref.get_handle().maybe_release_collection_lock();
+
+ SUBTRACET(seastore_t, "about to submit to journal", tref);
+ return journal->submit_record(std::move(record), tref.get_handle()
+ ).safe_then([this, FNAME, &tref](auto submit_result) mutable {
+ SUBDEBUGT(seastore_t, "committed with {}", tref, submit_result);
+ auto start_seq = submit_result.write_result.start_seq;
+ journal->get_trimmer().set_journal_head(start_seq);
+ cache->complete_commit(
+ tref,
+ submit_result.record_block_base,
+ start_seq);
+
+ std::vector<CachedExtentRef> lba_to_clear;
+ std::vector<CachedExtentRef> backref_to_clear;
+ lba_to_clear.reserve(tref.get_retired_set().size());
+ backref_to_clear.reserve(tref.get_retired_set().size());
+ for (auto &e: tref.get_retired_set()) {
+ if (e->is_logical() || is_lba_node(e->get_type()))
+ lba_to_clear.push_back(e);
+ else if (is_backref_node(e->get_type()))
+ backref_to_clear.push_back(e);
+ }
+
+ journal->get_trimmer().update_journal_tails(
+ cache->get_oldest_dirty_from().value_or(start_seq),
+ cache->get_oldest_backref_dirty_from().value_or(start_seq));
+ return journal->finish_commit(tref.get_src()
+ ).then([&tref] {
+ return tref.get_handle().complete();
+ });
+ }).handle_error(
+ submit_transaction_iertr::pass_further{},
+ crimson::ct_error::all_same_way([](auto e) {
+ ceph_assert(0 == "Hit error submitting to journal");
+ })
+ );
+ }).finally([&tref]() {
+ tref.get_handle().exit();
+ });
+}
+
+seastar::future<> TransactionManager::flush(OrderingHandle &handle)
+{
+ LOG_PREFIX(TransactionManager::flush);
+ SUBDEBUG(seastore_t, "H{} start", (void*)&handle);
+ return handle.enter(write_pipeline.reserve_projected_usage
+ ).then([this, &handle] {
+ return handle.enter(write_pipeline.ool_writes);
+ }).then([this, &handle] {
+ return handle.enter(write_pipeline.prepare);
+ }).then([this, &handle] {
+ handle.maybe_release_collection_lock();
+ return journal->flush(handle);
+ }).then([FNAME, &handle] {
+ SUBDEBUG(seastore_t, "H{} completed", (void*)&handle);
+ });
+}
+
+TransactionManager::get_next_dirty_extents_ret
+TransactionManager::get_next_dirty_extents(
+ Transaction &t,
+ journal_seq_t seq,
+ size_t max_bytes)
+{
+ LOG_PREFIX(TransactionManager::get_next_dirty_extents);
+ DEBUGT("max_bytes={}B, seq={}", t, max_bytes, seq);
+ return cache->get_next_dirty_extents(t, seq, max_bytes);
+}
+
+TransactionManager::rewrite_extent_ret
+TransactionManager::rewrite_logical_extent(
+ Transaction& t,
+ LogicalCachedExtentRef extent)
+{
+ LOG_PREFIX(TransactionManager::rewrite_logical_extent);
+ if (extent->has_been_invalidated()) {
+ ERRORT("extent has been invalidated -- {}", t, *extent);
+ ceph_abort();
+ }
+ TRACET("rewriting extent -- {}", t, *extent);
+
+ auto lextent = extent->cast<LogicalCachedExtent>();
+ cache->retire_extent(t, extent);
+ auto nlextent = cache->alloc_new_extent_by_type(
+ t,
+ lextent->get_type(),
+ lextent->get_length(),
+ lextent->get_user_hint(),
+ // get target rewrite generation
+ lextent->get_rewrite_generation())->cast<LogicalCachedExtent>();
+ lextent->get_bptr().copy_out(
+ 0,
+ lextent->get_length(),
+ nlextent->get_bptr().c_str());
+ nlextent->set_laddr(lextent->get_laddr());
+ nlextent->set_modify_time(lextent->get_modify_time());
+
+ DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent);
+
+ /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
+ * extents since we're going to do it again once we either do the ool write
+ * or allocate a relative inline addr. TODO: refactor AsyncCleaner to
+ * avoid this complication. */
+ return lba_manager->update_mapping(
+ t,
+ lextent->get_laddr(),
+ lextent->get_paddr(),
+ nlextent->get_paddr(),
+ nlextent.get());
+}
+
+TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent,
+ rewrite_gen_t target_generation,
+ sea_time_point modify_time)
+{
+ LOG_PREFIX(TransactionManager::rewrite_extent);
+
+ {
+ auto updated = cache->update_extent_from_transaction(t, extent);
+ if (!updated) {
+ DEBUGT("extent is already retired, skipping -- {}", t, *extent);
+ return rewrite_extent_iertr::now();
+ }
+ extent = updated;
+ ceph_assert(!extent->is_pending_io());
+ }
+
+ assert(extent->is_valid() && !extent->is_initial_pending());
+ if (extent->is_dirty()) {
+ extent->set_target_rewrite_generation(INIT_GENERATION);
+ } else {
+ extent->set_target_rewrite_generation(target_generation);
+ ceph_assert(modify_time != NULL_TIME);
+ extent->set_modify_time(modify_time);
+ }
+
+ t.get_rewrite_version_stats().increment(extent->get_version());
+
+ if (is_backref_node(extent->get_type())) {
+ DEBUGT("rewriting backref extent -- {}", t, *extent);
+ return backref_manager->rewrite_extent(t, extent);
+ }
+
+ if (extent->get_type() == extent_types_t::ROOT) {
+ DEBUGT("rewriting root extent -- {}", t, *extent);
+ cache->duplicate_for_write(t, extent);
+ return rewrite_extent_iertr::now();
+ }
+
+ if (extent->is_logical()) {
+ return rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>());
+ } else {
+ DEBUGT("rewriting physical extent -- {}", t, *extent);
+ return lba_manager->rewrite_extent(t, extent);
+ }
+}
+
+TransactionManager::get_extents_if_live_ret
+TransactionManager::get_extents_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t paddr,
+ laddr_t laddr,
+ extent_len_t len)
+{
+ LOG_PREFIX(TransactionManager::get_extents_if_live);
+ TRACET("{} {}~{} {}", t, type, laddr, len, paddr);
+
+ // Checking liveness this way only works for segment-backed paddrs,
+ // as parallel transactions may split the extent at the same time.
+ ceph_assert(paddr.get_addr_type() == paddr_types_t::SEGMENT);
+
+ return cache->get_extent_if_cached(t, paddr, type
+ ).si_then([=, this, &t](auto extent)
+ -> get_extents_if_live_ret {
+ if (extent && extent->get_length() == len) {
+ DEBUGT("{} {}~{} {} is live in cache -- {}",
+ t, type, laddr, len, paddr, *extent);
+ std::list<CachedExtentRef> res;
+ res.emplace_back(std::move(extent));
+ return get_extents_if_live_ret(
+ interruptible::ready_future_marker{},
+ res);
+ }
+
+ if (is_logical_type(type)) {
+ return lba_manager->get_mappings(
+ t,
+ laddr,
+ len
+ ).si_then([=, this, &t](lba_pin_list_t pin_list) {
+ return seastar::do_with(
+ std::list<CachedExtentRef>(),
+ [=, this, &t, pin_list=std::move(pin_list)](
+ std::list<CachedExtentRef> &list) mutable
+ {
+ auto paddr_seg_id = paddr.as_seg_paddr().get_segment_id();
+ return trans_intr::parallel_for_each(
+ pin_list,
+ [=, this, &list, &t](
+ LBAMappingRef &pin) -> Cache::get_extent_iertr::future<>
+ {
+ auto pin_paddr = pin->get_val();
+ auto &pin_seg_paddr = pin_paddr.as_seg_paddr();
+ auto pin_paddr_seg_id = pin_seg_paddr.get_segment_id();
+ auto pin_len = pin->get_length();
+ if (pin_paddr_seg_id != paddr_seg_id) {
+ return seastar::now();
+ }
+ // Only extent split can happen during the lookup
+ ceph_assert(pin_seg_paddr >= paddr &&
+ pin_seg_paddr.add_offset(pin_len) <= paddr.add_offset(len));
+ return read_pin_by_type(t, std::move(pin), type
+ ).si_then([&list](auto ret) {
+ list.emplace_back(std::move(ret));
+ return seastar::now();
+ });
+ }).si_then([&list] {
+ return get_extents_if_live_ret(
+ interruptible::ready_future_marker{},
+ std::move(list));
+ });
+ });
+ }).handle_error_interruptible(crimson::ct_error::enoent::handle([] {
+ return get_extents_if_live_ret(
+ interruptible::ready_future_marker{},
+ std::list<CachedExtentRef>());
+ }), crimson::ct_error::pass_further_all{});
+ } else {
+ return lba_manager->get_physical_extent_if_live(
+ t,
+ type,
+ paddr,
+ laddr,
+ len
+ ).si_then([=, &t](auto ret) {
+ std::list<CachedExtentRef> res;
+ if (ret) {
+ DEBUGT("{} {}~{} {} is live as physical extent -- {}",
+ t, type, laddr, len, paddr, *ret);
+ res.emplace_back(std::move(ret));
+ } else {
+ DEBUGT("{} {}~{} {} is not live as physical extent",
+ t, type, laddr, len, paddr);
+ }
+ return get_extents_if_live_ret(
+ interruptible::ready_future_marker{},
+ std::move(res));
+ });
+ }
+ });
+}
+
+TransactionManager::~TransactionManager() {}
+
+TransactionManagerRef make_transaction_manager(
+ Device *primary_device,
+ const std::vector<Device*> &secondary_devices,
+ bool is_test)
+{
+ auto epm = std::make_unique<ExtentPlacementManager>();
+ auto cache = std::make_unique<Cache>(*epm);
+ auto lba_manager = lba_manager::create_lba_manager(*cache);
+ auto sms = std::make_unique<SegmentManagerGroup>();
+ auto rbs = std::make_unique<RBMDeviceGroup>();
+ auto backref_manager = create_backref_manager(*cache);
+ SegmentManagerGroupRef cold_sms = nullptr;
+ std::vector<SegmentProvider*> segment_providers_by_id{DEVICE_ID_MAX, nullptr};
+
+ auto p_backend_type = primary_device->get_backend_type();
+
+ if (p_backend_type == backend_type_t::SEGMENTED) {
+ auto dtype = primary_device->get_device_type();
+ ceph_assert(dtype != device_type_t::HDD &&
+ dtype != device_type_t::EPHEMERAL_COLD);
+ sms->add_segment_manager(static_cast<SegmentManager*>(primary_device));
+ } else {
+ auto rbm = std::make_unique<BlockRBManager>(
+ static_cast<RBMDevice*>(primary_device), "", is_test);
+ rbs->add_rb_manager(std::move(rbm));
+ }
+
+ for (auto &p_dev : secondary_devices) {
+ if (p_dev->get_backend_type() == backend_type_t::SEGMENTED) {
+ if (p_dev->get_device_type() == primary_device->get_device_type()) {
+ sms->add_segment_manager(static_cast<SegmentManager*>(p_dev));
+ } else {
+ if (!cold_sms) {
+ cold_sms = std::make_unique<SegmentManagerGroup>();
+ }
+ cold_sms->add_segment_manager(static_cast<SegmentManager*>(p_dev));
+ }
+ } else {
+ auto rbm = std::make_unique<BlockRBManager>(
+ static_cast<RBMDevice*>(p_dev), "", is_test);
+ rbs->add_rb_manager(std::move(rbm));
+ }
+ }
+
+ auto journal_type = p_backend_type;
+ device_off_t roll_size;
+ device_off_t roll_start;
+ if (journal_type == journal_type_t::SEGMENTED) {
+ roll_size = static_cast<SegmentManager*>(primary_device)->get_segment_size();
+ roll_start = 0;
+ } else {
+ roll_size = static_cast<random_block_device::RBMDevice*>(primary_device)
+ ->get_journal_size() - primary_device->get_block_size();
+ // see CircularBoundedJournal::get_records_start()
+ roll_start = static_cast<random_block_device::RBMDevice*>(primary_device)
+ ->get_shard_journal_start() + primary_device->get_block_size();
+ ceph_assert_always(roll_size <= DEVICE_OFF_MAX);
+ ceph_assert_always((std::size_t)roll_size + roll_start <=
+ primary_device->get_available_size());
+ }
+ ceph_assert(roll_size % primary_device->get_block_size() == 0);
+ ceph_assert(roll_start % primary_device->get_block_size() == 0);
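+ // Worked example (illustrative numbers, not from the source): with a
+ // 4096-byte block size, a 64MiB circular-bounded journal and
+ // shard_journal_start == 0, roll_start is 4096 and roll_size is
+ // 64MiB - 4096; both stay block aligned, satisfying the asserts above.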
+
+ bool cleaner_is_detailed;
+ SegmentCleaner::config_t cleaner_config;
+ JournalTrimmerImpl::config_t trimmer_config;
+ if (is_test) {
+ cleaner_is_detailed = true;
+ cleaner_config = SegmentCleaner::config_t::get_test();
+ trimmer_config = JournalTrimmerImpl::config_t::get_test(
+ roll_size, journal_type);
+ } else {
+ cleaner_is_detailed = false;
+ cleaner_config = SegmentCleaner::config_t::get_default();
+ trimmer_config = JournalTrimmerImpl::config_t::get_default(
+ roll_size, journal_type);
+ }
+
+ auto journal_trimmer = JournalTrimmerImpl::create(
+ *backref_manager, trimmer_config,
+ journal_type, roll_start, roll_size);
+
+ AsyncCleanerRef cleaner;
+ JournalRef journal;
+
+ SegmentCleanerRef cold_segment_cleaner = nullptr;
+
+ if (cold_sms) {
+ cold_segment_cleaner = SegmentCleaner::create(
+ cleaner_config,
+ std::move(cold_sms),
+ *backref_manager,
+ epm->get_ool_segment_seq_allocator(),
+ cleaner_is_detailed,
+ /* is_cold = */ true);
+ if (journal_type == journal_type_t::SEGMENTED) {
+ for (auto id : cold_segment_cleaner->get_device_ids()) {
+ segment_providers_by_id[id] =
+ static_cast<SegmentProvider*>(cold_segment_cleaner.get());
+ }
+ }
+ }
+
+ if (journal_type == journal_type_t::SEGMENTED) {
+ cleaner = SegmentCleaner::create(
+ cleaner_config,
+ std::move(sms),
+ *backref_manager,
+ epm->get_ool_segment_seq_allocator(),
+ cleaner_is_detailed);
+ auto segment_cleaner = static_cast<SegmentCleaner*>(cleaner.get());
+ for (auto id : segment_cleaner->get_device_ids()) {
+ segment_providers_by_id[id] =
+ static_cast<SegmentProvider*>(segment_cleaner);
+ }
+ segment_cleaner->set_journal_trimmer(*journal_trimmer);
+ journal = journal::make_segmented(
+ *segment_cleaner,
+ *journal_trimmer);
+ } else {
+ cleaner = RBMCleaner::create(
+ std::move(rbs),
+ *backref_manager,
+ cleaner_is_detailed);
+ journal = journal::make_circularbounded(
+ *journal_trimmer,
+ static_cast<random_block_device::RBMDevice*>(primary_device),
+ "");
+ }
+
+ cache->set_segment_providers(std::move(segment_providers_by_id));
+
+ epm->init(std::move(journal_trimmer),
+ std::move(cleaner),
+ std::move(cold_segment_cleaner));
+ epm->set_primary_device(primary_device);
+
+ return std::make_unique<TransactionManager>(
+ std::move(journal),
+ std::move(cache),
+ std::move(lba_manager),
+ std::move(epm),
+ std::move(backref_manager));
+}
+
+}
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
new file mode 100644
index 000000000..dd1898ba7
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -0,0 +1,928 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <optional>
+#include <vector>
+#include <utility>
+#include <functional>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/logging.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/backref_manager.h"
+#include "crimson/os/seastore/journal.h"
+#include "crimson/os/seastore/extent_placement_manager.h"
+#include "crimson/os/seastore/device.h"
+
+namespace crimson::os::seastore {
+class Journal;
+
+template <typename F>
+auto repeat_eagain(F &&f) {
+ return seastar::do_with(
+ std::forward<F>(f),
+ [](auto &f)
+ {
+ return crimson::repeat([&f] {
+ return std::invoke(f
+ ).safe_then([] {
+ return seastar::stop_iteration::yes;
+ }).handle_error(
+ [](const crimson::ct_error::eagain &e) {
+ return seastar::stop_iteration::no;
+ },
+ crimson::ct_error::pass_further_all{}
+ );
+ });
+ });
+}
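+
+// A minimal sketch of the intended retry pattern (illustrative only; `tm`,
+// transaction creation and the lambda body are assumptions, not part of
+// this header):
+//   repeat_eagain([&] {
+//     auto tref = ...; // obtain a fresh Transaction for each attempt
+//     return with_trans_intr(*tref, [&](auto &t) {
+//       // read/mutate extents, then:
+//       return tm.submit_transaction(t);
+//     });
+//   });
+// Each conflict surfaces as ct_error::eagain, which repeat_eagain absorbs by
+// invoking the callable again until it finishes without eagain.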
+
+/**
+ * TransactionManager
+ *
+ * Abstraction hiding reading and writing to persistence.
+ * Exposes transaction based interface with read isolation.
+ */
+class TransactionManager : public ExtentCallbackInterface {
+public:
+ TransactionManager(
+ JournalRef journal,
+ CacheRef cache,
+ LBAManagerRef lba_manager,
+ ExtentPlacementManagerRef &&epm,
+ BackrefManagerRef&& backref_manager);
+
+ /// Writes initial metadata to disk
+ using mkfs_ertr = base_ertr;
+ mkfs_ertr::future<> mkfs();
+
+ /// Reads initial metadata from disk
+ using mount_ertr = base_ertr;
+ mount_ertr::future<> mount();
+
+ /// Closes transaction_manager
+ using close_ertr = base_ertr;
+ close_ertr::future<> close();
+
+ /// Resets transaction
+ void reset_transaction_preserve_handle(Transaction &t) {
+ return cache->reset_transaction_preserve_handle(t);
+ }
+
+ /**
+ * get_pin
+ *
+ * Get the logical pin at offset
+ */
+ using get_pin_iertr = LBAManager::get_mapping_iertr;
+ using get_pin_ret = LBAManager::get_mapping_iertr::future<LBAMappingRef>;
+ get_pin_ret get_pin(
+ Transaction &t,
+ laddr_t offset) {
+ LOG_PREFIX(TransactionManager::get_pin);
+ SUBTRACET(seastore_tm, "{}", t, offset);
+ return lba_manager->get_mapping(t, offset);
+ }
+
+ /**
+ * get_pins
+ *
+ * Get logical pins overlapping offset~length
+ */
+ using get_pins_iertr = LBAManager::get_mappings_iertr;
+ using get_pins_ret = get_pins_iertr::future<lba_pin_list_t>;
+ get_pins_ret get_pins(
+ Transaction &t,
+ laddr_t offset,
+ extent_len_t length) {
+ LOG_PREFIX(TransactionManager::get_pins);
+ SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length);
+ return lba_manager->get_mappings(
+ t, offset, length);
+ }
+
+ /**
+ * read_extent
+ *
+ * Read extent of type T at offset~length
+ */
+ using read_extent_iertr = get_pin_iertr;
+ template <typename T>
+ using read_extent_ret = read_extent_iertr::future<
+ TCachedExtentRef<T>>;
+ template <typename T>
+ read_extent_ret<T> read_extent(
+ Transaction &t,
+ laddr_t offset,
+ extent_len_t length) {
+ LOG_PREFIX(TransactionManager::read_extent);
+ SUBTRACET(seastore_tm, "{}~{}", t, offset, length);
+ return get_pin(
+ t, offset
+ ).si_then([this, FNAME, &t, offset, length] (auto pin)
+ -> read_extent_ret<T> {
+ if (length != pin->get_length() || !pin->get_val().is_real()) {
+ SUBERRORT(seastore_tm,
+ "offset {} len {} got wrong pin {}",
+ t, offset, length, *pin);
+ ceph_assert(0 == "Should be impossible");
+ }
+ return this->read_pin<T>(t, std::move(pin));
+ });
+ }
+
+ /**
+ * read_extent
+ *
+ * Read extent of type T at offset
+ */
+ template <typename T>
+ read_extent_ret<T> read_extent(
+ Transaction &t,
+ laddr_t offset) {
+ LOG_PREFIX(TransactionManager::read_extent);
+ SUBTRACET(seastore_tm, "{}", t, offset);
+ return get_pin(
+ t, offset
+ ).si_then([this, FNAME, &t, offset] (auto pin)
+ -> read_extent_ret<T> {
+ if (!pin->get_val().is_real()) {
+ SUBERRORT(seastore_tm,
+ "offset {} got wrong pin {}",
+ t, offset, *pin);
+ ceph_assert(0 == "Should be impossible");
+ }
+ return this->read_pin<T>(t, std::move(pin));
+ });
+ }
+
+ template <typename T>
+ base_iertr::future<TCachedExtentRef<T>> read_pin(
+ Transaction &t,
+ LBAMappingRef pin)
+ {
+ auto v = pin->get_logical_extent(t);
+ if (v.has_child()) {
+ return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
+#ifndef NDEBUG
+ auto lextent = extent->template cast<LogicalCachedExtent>();
+ auto pin_laddr = pin->get_key();
+ if (pin->is_indirect()) {
+ pin_laddr = pin->get_intermediate_base();
+ }
+ assert(lextent->get_laddr() == pin_laddr);
+#endif
+ return extent->template cast<T>();
+ });
+ } else {
+ return pin_to_extent<T>(t, std::move(pin));
+ }
+ }
+
+ base_iertr::future<LogicalCachedExtentRef> read_pin_by_type(
+ Transaction &t,
+ LBAMappingRef pin,
+ extent_types_t type)
+ {
+ auto v = pin->get_logical_extent(t);
+ if (v.has_child()) {
+ return std::move(v.get_child_fut());
+ } else {
+ return pin_to_extent_by_type(t, std::move(pin), type);
+ }
+ }
+
+ /// Obtain mutable copy of extent
+ LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
+ LOG_PREFIX(TransactionManager::get_mutable_extent);
+ auto ret = cache->duplicate_for_write(
+ t,
+ ref)->cast<LogicalCachedExtent>();
+ if (!ret->has_laddr()) {
+ SUBDEBUGT(seastore_tm,
+ "duplicating extent for write -- {} -> {}",
+ t,
+ *ref,
+ *ret);
+ ret->set_laddr(ref->get_laddr());
+ } else {
+ SUBTRACET(seastore_tm,
+ "extent is already duplicated -- {}",
+ t,
+ *ref);
+ assert(ref->is_mutable());
+ assert(&*ref == &*ret);
+ }
+ return ret;
+ }
+
+
+ using ref_iertr = LBAManager::ref_iertr;
+ using ref_ret = ref_iertr::future<unsigned>;
+
+ /// Add refcount for ref
+ ref_ret inc_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref);
+
+ /// Add refcount for offset
+ ref_ret inc_ref(
+ Transaction &t,
+ laddr_t offset);
+
+ /// Remove refcount for ref
+ ref_ret dec_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref);
+
+ /// Remove refcount for offset
+ ref_ret dec_ref(
+ Transaction &t,
+ laddr_t offset) {
+ return _dec_ref(t, offset, true);
+ }
+
+ /// remove refcount for list of offset
+ using refs_ret = ref_iertr::future<std::vector<unsigned>>;
+ refs_ret dec_ref(
+ Transaction &t,
+ std::vector<laddr_t> offsets);
+
+ /**
+ * alloc_extent
+ *
+ * Allocates a new block of type T over the lowest lba range of size len
+ * at or above laddr_hint.
+ */
+ using alloc_extent_iertr = LBAManager::alloc_extent_iertr;
+ template <typename T>
+ using alloc_extent_ret = alloc_extent_iertr::future<TCachedExtentRef<T>>;
+ template <typename T>
+ alloc_extent_ret<T> alloc_extent(
+ Transaction &t,
+ laddr_t laddr_hint,
+ extent_len_t len,
+ placement_hint_t placement_hint = placement_hint_t::HOT) {
+ LOG_PREFIX(TransactionManager::alloc_extent);
+ SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
+ t, T::TYPE, len, placement_hint, laddr_hint);
+ ceph_assert(is_aligned(laddr_hint, epm->get_block_size()));
+ auto ext = cache->alloc_new_extent<T>(
+ t,
+ len,
+ placement_hint,
+ INIT_GENERATION);
+ return lba_manager->alloc_extent(
+ t,
+ laddr_hint,
+ len,
+ ext->get_paddr(),
+ *ext
+ ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable {
+ LOG_PREFIX(TransactionManager::alloc_extent);
+ SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint);
+ return alloc_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ext));
+ });
+ }
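+
+ // A minimal caller-side sketch (illustrative only; the extent type, hint
+ // and length are assumptions, not part of this header):
+ //   tm.alloc_extent<ObjectDataBlock>(t, laddr_hint, 4096
+ //   ).si_then([](auto extent) {
+ //     // extent is a TCachedExtentRef<ObjectDataBlock>; its laddr and
+ //     // paddr have been reserved within the transaction.
+ //   });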
+
+ /**
+ * remap_pin
+ *
+ * Remap original extent to new extents.
+ * Return the pins of new extent.
+ */
+ struct remap_entry {
+ extent_len_t offset;
+ extent_len_t len;
+ remap_entry(extent_len_t _offset, extent_len_t _len) {
+ offset = _offset;
+ len = _len;
+ }
+ };
+ using remap_pin_iertr = base_iertr;
+ template <std::size_t N>
+ using remap_pin_ret = remap_pin_iertr::future<std::array<LBAMappingRef, N>>;
+ template <typename T, std::size_t N>
+ remap_pin_ret<N> remap_pin(
+ Transaction &t,
+ LBAMappingRef &&pin,
+ std::array<remap_entry, N> remaps) {
+
+#ifndef NDEBUG
+ std::sort(remaps.begin(), remaps.end(),
+ [](remap_entry x, remap_entry y) {
+ return x.offset < y.offset;
+ });
+ auto original_len = pin->get_length();
+ extent_len_t total_remap_len = 0;
+ extent_len_t last_offset = 0;
+ extent_len_t last_len = 0;
+
+ for (auto &remap : remaps) {
+ auto remap_offset = remap.offset;
+ auto remap_len = remap.len;
+ total_remap_len += remap.len;
+ ceph_assert(remap_offset >= (last_offset + last_len));
+ last_offset = remap_offset;
+ last_len = remap_len;
+ }
+ ceph_assert(total_remap_len < original_len);
+#endif
+
+ // FIXME: paddr can be absolute and pending
+ ceph_assert(pin->get_val().is_absolute());
+ return cache->get_extent_if_cached(
+ t, pin->get_val(), T::TYPE
+ ).si_then([this, &t, remaps,
+ original_laddr = pin->get_key(),
+ intermediate_base = pin->is_indirect()
+ ? pin->get_intermediate_base()
+ : L_ADDR_NULL,
+ intermediate_key = pin->is_indirect()
+ ? pin->get_intermediate_key()
+ : L_ADDR_NULL,
+ original_paddr = pin->get_val(),
+ original_len = pin->get_length()](auto ext) mutable {
+ std::optional<ceph::bufferptr> original_bptr;
+ LOG_PREFIX(TransactionManager::remap_pin);
+ SUBDEBUGT(seastore_tm,
+ "original laddr: {}, original paddr: {}, original length: {},"
+ " intermediate_base: {}, intermediate_key: {},"
+ " remap to {} extents",
+ t, original_laddr, original_paddr, original_len,
+ intermediate_base, intermediate_key, remaps.size());
+ ceph_assert(
+ (intermediate_base == L_ADDR_NULL)
+ == (intermediate_key == L_ADDR_NULL));
+ if (ext) {
+ // FIXME: cannot and will not remap a dirty extent for now.
+ ceph_assert(!ext->is_dirty());
+ ceph_assert(!ext->is_mutable());
+ ceph_assert(ext->get_length() >= original_len);
+ ceph_assert(ext->get_paddr() == original_paddr);
+ original_bptr = ext->get_bptr();
+ }
+ return seastar::do_with(
+ std::array<LBAMappingRef, N>(),
+ 0,
+ std::move(original_bptr),
+ std::vector<remap_entry>(remaps.begin(), remaps.end()),
+ [this, &t, original_laddr, original_paddr,
+ original_len, intermediate_base, intermediate_key]
+ (auto &ret, auto &count, auto &original_bptr, auto &remaps) {
+ return _dec_ref(t, original_laddr, false
+ ).si_then([this, &t, &original_bptr, &ret, &count,
+ &remaps, intermediate_base, intermediate_key,
+ original_laddr, original_paddr, original_len](auto) {
+ return trans_intr::do_for_each(
+ remaps.begin(),
+ remaps.end(),
+ [this, &t, &original_bptr, &ret,
+ &count, intermediate_base, intermediate_key,
+ original_laddr, original_paddr, original_len](auto &remap) {
+ LOG_PREFIX(TransactionManager::remap_pin);
+ auto remap_offset = remap.offset;
+ auto remap_len = remap.len;
+ auto remap_laddr = original_laddr + remap_offset;
+ auto remap_paddr = original_paddr.add_offset(remap_offset);
+ ceph_assert(remap_len < original_len);
+ ceph_assert(remap_offset + remap_len <= original_len);
+ ceph_assert(remap_len != 0);
+ ceph_assert(remap_offset % cache->get_block_size() == 0);
+ ceph_assert(remap_len % cache->get_block_size() == 0);
+ SUBDEBUGT(seastore_tm,
+ "remap laddr: {}, remap paddr: {}, remap length: {}", t,
+ remap_laddr, remap_paddr, remap_len);
+ auto remapped_intermediate_key = intermediate_key;
+ if (remapped_intermediate_key != L_ADDR_NULL) {
+ assert(intermediate_base != L_ADDR_NULL);
+ remapped_intermediate_key += remap_offset;
+ }
+ return alloc_remapped_extent<T>(
+ t,
+ remap_laddr,
+ remap_paddr,
+ remap_len,
+ original_laddr,
+ intermediate_base,
+ remapped_intermediate_key,
+ std::move(original_bptr)
+ ).si_then([&ret, &count, remap_laddr](auto &&npin) {
+ ceph_assert(npin->get_key() == remap_laddr);
+ ret[count++] = std::move(npin);
+ });
+ });
+ }).si_then([this, &t, intermediate_base, intermediate_key] {
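+          // an indirect original mapping was split into N indirect mappings,
+          // so take N-1 extra references on the intermediate base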
+ if (N > 1 && intermediate_key != L_ADDR_NULL) {
+ return lba_manager->incref_extent(
+ t, intermediate_base, N - 1
+ ).si_then([](auto) {
+ return seastar::now();
+ });
+ }
+ return LBAManager::ref_iertr::now();
+ }).handle_error_interruptible(
+ remap_pin_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "TransactionManager::remap_pin hit invalid error"
+ }
+ ).si_then([&ret, &count] {
+ ceph_assert(count == N);
+ return remap_pin_iertr::make_ready_future<
+ std::array<LBAMappingRef, N>>(std::move(ret));
+ });
+ });
+ });
+ }
+
+ using reserve_extent_iertr = alloc_extent_iertr;
+ using reserve_extent_ret = reserve_extent_iertr::future<LBAMappingRef>;
+ reserve_extent_ret reserve_region(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len) {
+ LOG_PREFIX(TransactionManager::reserve_region);
+ SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint);
+ ceph_assert(is_aligned(hint, epm->get_block_size()));
+ return lba_manager->reserve_region(
+ t,
+ hint,
+ len);
+ }
+
+ /*
+ * clone_pin
+ *
+   * create an indirect lba mapping pointing to the physical
+   * lba mapping whose key is intermediate_key. Refer to btree_lba_manager.h
+   * for the definitions of "indirect lba mapping" and "physical lba mapping".
+ *
+ */
+ using clone_extent_iertr = alloc_extent_iertr;
+ using clone_extent_ret = clone_extent_iertr::future<LBAMappingRef>;
+ clone_extent_ret clone_pin(
+ Transaction &t,
+ laddr_t hint,
+ const LBAMapping &mapping) {
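+    // if the source mapping is already indirect, point the clone at its
+    // intermediate (physical) mapping rather than at the indirect mapping
+    // itself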
+ auto intermediate_key =
+ mapping.is_indirect()
+ ? mapping.get_intermediate_key()
+ : mapping.get_key();
+ auto intermediate_base =
+ mapping.is_indirect()
+ ? mapping.get_intermediate_base()
+ : mapping.get_key();
+
+ LOG_PREFIX(TransactionManager::clone_pin);
+ SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}",
+ t, mapping.get_length(), hint, intermediate_key);
+ ceph_assert(is_aligned(hint, epm->get_block_size()));
+ return lba_manager->clone_extent(
+ t,
+ hint,
+ mapping.get_length(),
+ intermediate_key,
+ mapping.get_val(),
+ intermediate_key
+ ).si_then([this, &t, intermediate_base](auto pin) {
+ return inc_ref(t, intermediate_base
+ ).si_then([pin=std::move(pin)](auto) mutable {
+ return std::move(pin);
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further(),
+ crimson::ct_error::assert_all("not possible")
+ );
+ });
+ }
+
+ /* alloc_extents
+ *
+   * allocates num new blocks of type T, each of length len.
+ */
+ using alloc_extents_iertr = alloc_extent_iertr;
+ template<class T>
+ alloc_extents_iertr::future<std::vector<TCachedExtentRef<T>>>
+ alloc_extents(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ int num) {
+ LOG_PREFIX(TransactionManager::alloc_extents);
+ SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}",
+ t, len, hint, num);
+ return seastar::do_with(std::vector<TCachedExtentRef<T>>(),
+ [this, &t, hint, len, num] (auto &extents) {
+ return trans_intr::do_for_each(
+ boost::make_counting_iterator(0),
+ boost::make_counting_iterator(num),
+ [this, &t, len, hint, &extents] (auto i) {
+ return alloc_extent<T>(t, hint, len).si_then(
+ [&extents](auto &&node) {
+ extents.push_back(node);
+ });
+ }).si_then([&extents] {
+ return alloc_extents_iertr::make_ready_future
+ <std::vector<TCachedExtentRef<T>>>(std::move(extents));
+ });
+ });
+ }
+
+ /**
+ * submit_transaction
+ *
+ * Atomically submits transaction to persistence
+ */
+ using submit_transaction_iertr = base_iertr;
+ submit_transaction_iertr::future<> submit_transaction(Transaction &);
+
+ /**
+ * flush
+ *
+ * Block until all outstanding IOs on handle are committed.
+ * Note, flush() machinery must go through the same pipeline
+ * stages and locks as submit_transaction.
+ */
+ seastar::future<> flush(OrderingHandle &handle);
+
+ /*
+ * ExtentCallbackInterface
+ */
+
+  /// a weak transaction should be of type READ
+ TransactionRef create_transaction(
+ Transaction::src_t src,
+ const char* name,
+ bool is_weak=false) final {
+ return cache->create_transaction(src, name, is_weak);
+ }
+
+ using ExtentCallbackInterface::submit_transaction_direct_ret;
+ submit_transaction_direct_ret submit_transaction_direct(
+ Transaction &t,
+ std::optional<journal_seq_t> seq_to_trim = std::nullopt) final;
+
+ using ExtentCallbackInterface::get_next_dirty_extents_ret;
+ get_next_dirty_extents_ret get_next_dirty_extents(
+ Transaction &t,
+ journal_seq_t seq,
+ size_t max_bytes) final;
+
+ using ExtentCallbackInterface::rewrite_extent_ret;
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent,
+ rewrite_gen_t target_generation,
+ sea_time_point modify_time) final;
+
+ using ExtentCallbackInterface::get_extents_if_live_ret;
+ get_extents_if_live_ret get_extents_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t paddr,
+ laddr_t laddr,
+ extent_len_t len) final;
+
+ /**
+ * read_root_meta
+ *
+ * Read root block meta entry for key.
+ */
+ using read_root_meta_iertr = base_iertr;
+ using read_root_meta_bare = std::optional<std::string>;
+ using read_root_meta_ret = read_root_meta_iertr::future<
+ read_root_meta_bare>;
+ read_root_meta_ret read_root_meta(
+ Transaction &t,
+ const std::string &key) {
+ return cache->get_root(
+ t
+ ).si_then([&key, &t](auto root) {
+ LOG_PREFIX(TransactionManager::read_root_meta);
+ auto meta = root->root.get_meta();
+ auto iter = meta.find(key);
+ if (iter == meta.end()) {
+ SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key);
+ return seastar::make_ready_future<read_root_meta_bare>(std::nullopt);
+ } else {
+ SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second);
+ return seastar::make_ready_future<read_root_meta_bare>(iter->second);
+ }
+ });
+ }
+
+ /**
+ * update_root_meta
+ *
+ * Update root block meta entry for key to value.
+ */
+ using update_root_meta_iertr = base_iertr;
+ using update_root_meta_ret = update_root_meta_iertr::future<>;
+ update_root_meta_ret update_root_meta(
+ Transaction& t,
+ const std::string& key,
+ const std::string& value) {
+ LOG_PREFIX(TransactionManager::update_root_meta);
+ SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value);
+ return cache->get_root(
+ t
+ ).si_then([this, &t, &key, &value](RootBlockRef root) {
+ root = cache->duplicate_for_write(t, root)->cast<RootBlock>();
+
+ auto meta = root->root.get_meta();
+ meta[key] = value;
+
+ root->root.set_meta(meta);
+ return seastar::now();
+ });
+ }
+
+ /**
+ * read_onode_root
+ *
+ * Get onode-tree root logical address
+ */
+ using read_onode_root_iertr = base_iertr;
+ using read_onode_root_ret = read_onode_root_iertr::future<laddr_t>;
+ read_onode_root_ret read_onode_root(Transaction &t) {
+ return cache->get_root(t).si_then([&t](auto croot) {
+ LOG_PREFIX(TransactionManager::read_onode_root);
+ laddr_t ret = croot->get_root().onode_root;
+ SUBTRACET(seastore_tm, "{}", t, ret);
+ return ret;
+ });
+ }
+
+ /**
+ * write_onode_root
+ *
+ * Write onode-tree root logical address, must be called after read.
+ */
+ void write_onode_root(Transaction &t, laddr_t addr) {
+ LOG_PREFIX(TransactionManager::write_onode_root);
+ SUBDEBUGT(seastore_tm, "{}", t, addr);
+ auto croot = cache->get_root_fast(t);
+ croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>();
+ croot->get_root().onode_root = addr;
+ }
+
+ /**
+ * read_collection_root
+ *
+ * Get collection root addr
+ */
+ using read_collection_root_iertr = base_iertr;
+ using read_collection_root_ret = read_collection_root_iertr::future<
+ coll_root_t>;
+ read_collection_root_ret read_collection_root(Transaction &t) {
+ return cache->get_root(t).si_then([&t](auto croot) {
+ LOG_PREFIX(TransactionManager::read_collection_root);
+ auto ret = croot->get_root().collection_root.get();
+ SUBTRACET(seastore_tm, "{}~{}",
+ t, ret.get_location(), ret.get_size());
+ return ret;
+ });
+ }
+
+ /**
+ * write_collection_root
+ *
+ * Update collection root addr
+ */
+ void write_collection_root(Transaction &t, coll_root_t cmroot) {
+ LOG_PREFIX(TransactionManager::write_collection_root);
+ SUBDEBUGT(seastore_tm, "{}~{}",
+ t, cmroot.get_location(), cmroot.get_size());
+ auto croot = cache->get_root_fast(t);
+ croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>();
+ croot->get_root().collection_root.update(cmroot);
+ }
+
+ extent_len_t get_block_size() const {
+ return epm->get_block_size();
+ }
+
+ store_statfs_t store_stat() const {
+ return epm->get_stat();
+ }
+
+ ~TransactionManager();
+
+private:
+ friend class Transaction;
+
+ CacheRef cache;
+ LBAManagerRef lba_manager;
+ JournalRef journal;
+ ExtentPlacementManagerRef epm;
+ BackrefManagerRef backref_manager;
+
+ WritePipeline write_pipeline;
+
+ rewrite_extent_ret rewrite_logical_extent(
+ Transaction& t,
+ LogicalCachedExtentRef extent);
+
+ submit_transaction_direct_ret do_submit_transaction(
+ Transaction &t,
+ ExtentPlacementManager::dispatch_result_t dispatch_result,
+ std::optional<journal_seq_t> seq_to_trim = std::nullopt);
+
+ /// Remove refcount for offset
+ ref_ret _dec_ref(
+ Transaction &t,
+ laddr_t offset,
+ bool cascade_remove);
+
+ /**
+ * pin_to_extent
+ *
+ * Get extent mapped at pin.
+ */
+ using pin_to_extent_iertr = base_iertr;
+ template <typename T>
+ using pin_to_extent_ret = pin_to_extent_iertr::future<
+ TCachedExtentRef<T>>;
+ template <typename T>
+ pin_to_extent_ret<T> pin_to_extent(
+ Transaction &t,
+ LBAMappingRef pin) {
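+    // load the extent backing this pin through the cache; the on-create
+    // callback links the extent as a child of the pin and records its
+    // (possibly intermediate) laddr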
+ LOG_PREFIX(TransactionManager::pin_to_extent);
+ SUBTRACET(seastore_tm, "getting extent {}", t, *pin);
+ static_assert(is_logical_type(T::TYPE));
+ using ret = pin_to_extent_ret<T>;
+ auto &pref = *pin;
+ return cache->get_absent_extent<T>(
+ t,
+ pref.get_val(),
+ pref.is_indirect() ?
+ pref.get_intermediate_length() :
+ pref.get_length(),
+ [pin=std::move(pin)]
+ (T &extent) mutable {
+ assert(!extent.has_laddr());
+ assert(!extent.has_been_invalidated());
+ assert(!pin->has_been_invalidated());
+ assert(pin->get_parent());
+ pin->link_child(&extent);
+ extent.maybe_set_intermediate_laddr(*pin);
+ }
+ ).si_then([FNAME, &t](auto ref) mutable -> ret {
+ SUBTRACET(seastore_tm, "got extent -- {}", t, *ref);
+ assert(ref->is_fully_loaded());
+ return pin_to_extent_ret<T>(
+ interruptible::ready_future_marker{},
+ std::move(ref));
+ });
+ }
+
+ /**
+ * pin_to_extent_by_type
+ *
+   * Get extent mapped at pin, dispatched on the runtime extent type.
+ */
+ using pin_to_extent_by_type_ret = pin_to_extent_iertr::future<
+ LogicalCachedExtentRef>;
+ pin_to_extent_by_type_ret pin_to_extent_by_type(
+ Transaction &t,
+ LBAMappingRef pin,
+ extent_types_t type)
+ {
+ LOG_PREFIX(TransactionManager::pin_to_extent_by_type);
+ SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type);
+ assert(is_logical_type(type));
+ auto &pref = *pin;
+ return cache->get_absent_extent_by_type(
+ t,
+ type,
+ pref.get_val(),
+ pref.get_key(),
+ pref.is_indirect() ?
+ pref.get_intermediate_length() :
+ pref.get_length(),
+ [pin=std::move(pin)](CachedExtent &extent) mutable {
+ auto &lextent = static_cast<LogicalCachedExtent&>(extent);
+ assert(!lextent.has_laddr());
+ assert(!lextent.has_been_invalidated());
+ assert(!pin->has_been_invalidated());
+ assert(pin->get_parent());
+ assert(!pin->get_parent()->is_pending());
+ pin->link_child(&lextent);
+ lextent.maybe_set_intermediate_laddr(*pin);
+ }
+ ).si_then([FNAME, &t](auto ref) {
+ SUBTRACET(seastore_tm, "got extent -- {}", t, *ref);
+ assert(ref->is_fully_loaded());
+ return pin_to_extent_by_type_ret(
+ interruptible::ready_future_marker{},
+ std::move(ref->template cast<LogicalCachedExtent>()));
+ });
+ }
+
+ /**
+ * alloc_remapped_extent
+ *
+   * Allocates a new extent at the given remap_paddr, which must be
+   * absolute, and uses original_bptr to fill the new extent if the buffer
+   * is present; otherwise the new extent is not filled from disk.
+   * Returns the lba mapping of the new extent.
+   *
+   * Callers must ensure that the end laddr of the remapped extent does not
+   * exceed the end laddr of the original extent.
+ */
+ using alloc_remapped_extent_iertr =
+ alloc_extent_iertr::extend_ertr<Device::read_ertr>;
+ using alloc_remapped_extent_ret =
+ alloc_remapped_extent_iertr::future<LBAMappingRef>;
+ template <typename T>
+ alloc_remapped_extent_ret alloc_remapped_extent(
+ Transaction &t,
+ laddr_t remap_laddr,
+ paddr_t remap_paddr,
+ extent_len_t remap_length,
+ laddr_t original_laddr,
+ laddr_t intermediate_base,
+ laddr_t intermediate_key,
+ std::optional<ceph::bufferptr> &&original_bptr) {
+ LOG_PREFIX(TransactionManager::alloc_remapped_extent);
+ SUBDEBUG(seastore_tm, "alloc remapped extent: remap_laddr: {}, "
+ "remap_paddr: {}, remap_length: {}, has data in cache: {} ",
+ remap_laddr, remap_paddr, remap_length,
+ original_bptr.has_value() ? "true":"false");
+ TCachedExtentRef<T> ext;
+ auto fut = LBAManager::alloc_extent_iertr::make_ready_future<
+ LBAMappingRef>();
+ assert((intermediate_key == L_ADDR_NULL)
+ == (intermediate_base == L_ADDR_NULL));
+ if (intermediate_key == L_ADDR_NULL) {
+ // remapping direct mapping
+ ext = cache->alloc_remapped_extent<T>(
+ t,
+ remap_laddr,
+ remap_paddr,
+ remap_length,
+ original_laddr,
+ std::move(original_bptr));
+ fut = lba_manager->alloc_extent(
+ t, remap_laddr, remap_length, remap_paddr, *ext);
+ } else {
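+      // remapping an indirect mapping: only clone a new indirect lba entry
+      // pointing at the intermediate mapping; no new cached extent is created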
+ fut = lba_manager->clone_extent(
+ t,
+ remap_laddr,
+ remap_length,
+ intermediate_key,
+ remap_paddr,
+ intermediate_base);
+ }
+ return fut.si_then([remap_laddr, remap_length, remap_paddr](auto &&ref) {
+ assert(ref->get_key() == remap_laddr);
+ assert(ref->get_val() == remap_paddr);
+ assert(ref->get_length() == remap_length);
+ return alloc_remapped_extent_iertr::make_ready_future
+ <LBAMappingRef>(std::move(ref));
+ });
+ }
+
+public:
+ // Testing interfaces
+ auto get_epm() {
+ return epm.get();
+ }
+
+ auto get_lba_manager() {
+ return lba_manager.get();
+ }
+
+ auto get_backref_manager() {
+ return backref_manager.get();
+ }
+
+ auto get_cache() {
+ return cache.get();
+ }
+ auto get_journal() {
+ return journal.get();
+ }
+};
+using TransactionManagerRef = std::unique_ptr<TransactionManager>;
+
+TransactionManagerRef make_transaction_manager(
+ Device *primary_device,
+ const std::vector<Device*> &secondary_devices,
+ bool is_test);
+}
diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt
new file mode 100644
index 000000000..f521e0244
--- /dev/null
+++ b/src/crimson/osd/CMakeLists.txt
@@ -0,0 +1,72 @@
+add_executable(crimson-osd
+ backfill_state.cc
+ ec_backend.cc
+ heartbeat.cc
+ lsan_suppressions.cc
+ main.cc
+ main_config_bootstrap_helpers.cc
+ osd.cc
+ osd_meta.cc
+ pg.cc
+ pg_backend.cc
+ pg_meta.cc
+ replicated_backend.cc
+ shard_services.cc
+ pg_shard_manager.cc
+ object_context.cc
+ object_context_loader.cc
+ ops_executer.cc
+ osd_operation.cc
+ osd_operations/client_request.cc
+ osd_operations/client_request_common.cc
+ osd_operations/internal_client_request.cc
+ osd_operations/peering_event.cc
+ osd_operations/pg_advance_map.cc
+ osd_operations/replicated_request.cc
+ osd_operations/logmissing_request.cc
+ osd_operations/logmissing_request_reply.cc
+ osd_operations/background_recovery.cc
+ osd_operations/recovery_subrequest.cc
+ osd_operations/snaptrim_event.cc
+ pg_recovery.cc
+ recovery_backend.cc
+ replicated_recovery_backend.cc
+ scheduler/scheduler.cc
+ scheduler/mclock_scheduler.cc
+ osdmap_gate.cc
+ pg_activation_blocker.cc
+ pg_map.cc
+ pg_interval_interrupt_condition.cc
+ objclass.cc
+ ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
+ watch.cc
+ )
+if(HAS_VTA)
+ set_source_files_properties(main.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+target_link_libraries(crimson-osd
+ crimson-admin
+ crimson-common
+ crimson-os
+ crimson
+ fmt::fmt
+ Boost::MPL
+ dmclock::dmclock)
+set_target_properties(crimson-osd PROPERTIES
+ POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE})
+install(TARGETS crimson-osd DESTINATION bin)
+if(WITH_TESTS)
+ add_dependencies(tests crimson-osd)
+endif()
diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h
new file mode 100644
index 000000000..b2f2562c0
--- /dev/null
+++ b/src/crimson/osd/acked_peers.h
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+
+namespace crimson::osd {
+ struct peer_shard_t {
+ pg_shard_t shard;
+ eversion_t last_complete_ondisk;
+ };
+ using acked_peers_t = std::vector<peer_shard_t>;
+}
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
new file mode 100644
index 000000000..683dc6ea6
--- /dev/null
+++ b/src/crimson/osd/backfill_facades.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg.h"
+#include "osd/PeeringState.h"
+
+namespace crimson::osd {
+
+// PeeringFacade -- main implementation of the BackfillState::PeeringFacade
+// interface. The abstraction exists to decouple BackfillState from
+// PeeringState, and thus cut dependencies in unit testing. The second
+// implementation is BackfillFixture::PeeringFacade and sits in
+// test_backfill.cc.
+struct PeeringFacade final : BackfillState::PeeringFacade {
+ PeeringState& peering_state;
+
+ hobject_t earliest_backfill() const override {
+ return peering_state.earliest_backfill();
+ }
+
+ const std::set<pg_shard_t>& get_backfill_targets() const override {
+ return peering_state.get_backfill_targets();
+ }
+
+ const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override {
+ return peering_state.get_peer_info(peer).last_backfill;
+ }
+
+ const eversion_t& get_last_update() const override {
+ return peering_state.get_info().last_update;
+ }
+
+ const eversion_t& get_log_tail() const override {
+ return peering_state.get_info().log_tail;
+ }
+
+ void scan_log_after(eversion_t v, scan_log_func_t f) const override {
+ peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f));
+ }
+
+ bool is_backfill_target(pg_shard_t peer) const override {
+ return peering_state.is_backfill_target(peer);
+ }
+ void update_complete_backfill_object_stats(const hobject_t &hoid,
+ const pg_stat_t &stats) override {
+ peering_state.update_complete_backfill_object_stats(hoid, stats);
+ }
+
+ bool is_backfilling() const override {
+ return peering_state.is_backfilling();
+ }
+
+ PeeringFacade(PeeringState& peering_state)
+ : peering_state(peering_state) {
+ }
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct PGFacade final : BackfillState::PGFacade {
+ PG& pg;
+
+ const eversion_t& get_projected_last_update() const override {
+ return pg.projected_last_update;
+ }
+
+ PGFacade(PG& pg) : pg(pg) {}
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
new file mode 100644
index 000000000..46a270ffe
--- /dev/null
+++ b/src/crimson/osd/backfill_state.cc
@@ -0,0 +1,558 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <boost/type_index.hpp>
+#include <fmt/ranges.h>
+#include "common/hobject_fmt.h"
+#include "crimson/osd/backfill_state.h"
+#include "osd/osd_types_fmt.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+BackfillState::BackfillState(
+ BackfillState::BackfillListener& backfill_listener,
+ std::unique_ptr<BackfillState::PeeringFacade> peering_state,
+ std::unique_ptr<BackfillState::PGFacade> pg)
+ : backfill_machine(*this,
+ backfill_listener,
+ std::move(peering_state),
+ std::move(pg)),
+ progress_tracker(
+ std::make_unique<BackfillState::ProgressTracker>(backfill_machine))
+{
+ logger().debug("{}:{}", __func__, __LINE__);
+ backfill_machine.initiate();
+}
+
+template <class S>
+BackfillState::StateHelper<S>::StateHelper()
+{
+ logger().debug("enter {}",
+ boost::typeindex::type_id<S>().pretty_name());
+}
+
+template <class S>
+BackfillState::StateHelper<S>::~StateHelper()
+{
+ logger().debug("exit {}",
+ boost::typeindex::type_id<S>().pretty_name());
+}
+
+BackfillState::~BackfillState() = default;
+
+BackfillState::BackfillMachine::BackfillMachine(
+ BackfillState& backfill_state,
+ BackfillState::BackfillListener& backfill_listener,
+ std::unique_ptr<BackfillState::PeeringFacade> peering_state,
+ std::unique_ptr<BackfillState::PGFacade> pg)
+ : backfill_state(backfill_state),
+ backfill_listener(backfill_listener),
+ peering_state(std::move(peering_state)),
+ pg(std::move(pg))
+{}
+
+BackfillState::BackfillMachine::~BackfillMachine() = default;
+
+BackfillState::Initial::Initial(my_context ctx)
+ : my_base(ctx)
+{
+ backfill_state().last_backfill_started = peering_state().earliest_backfill();
+ logger().debug("{}: bft={} from {}",
+ __func__, peering_state().get_backfill_targets(),
+ backfill_state().last_backfill_started);
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ logger().debug("{}: target shard {} from {}",
+ __func__, bt, peering_state().get_peer_last_backfill(bt));
+ }
+ ceph_assert(peering_state().get_backfill_targets().size());
+ ceph_assert(!backfill_state().last_backfill_started.is_max());
+}
+
+boost::statechart::result
+BackfillState::Initial::react(const BackfillState::Triggered& evt)
+{
+ logger().debug("{}: backfill triggered", __func__);
+ ceph_assert(backfill_state().last_backfill_started == \
+ peering_state().earliest_backfill());
+ ceph_assert(peering_state().is_backfilling());
+ // initialize BackfillIntervals
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ backfill_state().peer_backfill_info[bt].reset(
+ peering_state().get_peer_last_backfill(bt));
+ }
+ backfill_state().backfill_info.reset(backfill_state().last_backfill_started);
+ if (Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ logger().debug("{}: switching to Done state", __func__);
+ return transit<BackfillState::Done>();
+ } else {
+ logger().debug("{}: switching to Enqueuing state", __func__);
+ return transit<BackfillState::Enqueuing>();
+ }
+}
+
+
+// -- Enqueuing
+void BackfillState::Enqueuing::maybe_update_range()
+{
+ if (auto& primary_bi = backfill_state().backfill_info;
+ primary_bi.version >= pg().get_projected_last_update()) {
+ logger().info("{}: bi is current", __func__);
+ ceph_assert(primary_bi.version == pg().get_projected_last_update());
+ } else if (primary_bi.version >= peering_state().get_log_tail()) {
+#if 0
+ if (peering_state().get_pg_log().get_log().empty() &&
+ pg().get_projected_log().empty()) {
+ /* Because we don't move log_tail on split, the log might be
+ * empty even if log_tail != last_update. However, the only
+ * way to get here with an empty log is if log_tail is actually
+ * eversion_t(), because otherwise the entry which changed
+ * last_update since the last scan would have to be present.
+ */
+ ceph_assert(primary_bi.version == eversion_t());
+ return;
+ }
+#endif
+ logger().debug("{}: bi is old, ({}) can be updated with log to {}",
+ __func__,
+ primary_bi.version,
+ pg().get_projected_last_update());
+ logger().debug("{}: scanning pg log first", __func__);
+ peering_state().scan_log_after(primary_bi.version,
+ [&](const pg_log_entry_t& e) {
+ logger().debug("maybe_update_range(lambda): updating from version {}",
+ e.version);
+ if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) {
+ if (e.is_update()) {
+ logger().debug("maybe_update_range(lambda): {} updated to ver {}",
+ e.soid, e.version);
+ primary_bi.objects.erase(e.soid);
+ primary_bi.objects.insert(std::make_pair(e.soid,
+ e.version));
+ } else if (e.is_delete()) {
+ logger().debug("maybe_update_range(lambda): {} removed",
+ e.soid);
+ primary_bi.objects.erase(e.soid);
+ }
+ }
+ });
+ primary_bi.version = pg().get_projected_last_update();
+ } else {
+ ceph_abort_msg(
+ "scan_range should have raised primary_bi.version past log_tail");
+ }
+}
+
+void BackfillState::Enqueuing::trim_backfill_infos()
+{
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ backfill_state().peer_backfill_info[bt].trim_to(
+ std::max(peering_state().get_peer_last_backfill(bt),
+ backfill_state().last_backfill_started));
+ }
+ backfill_state().backfill_info.trim_to(
+ backfill_state().last_backfill_started);
+}
+
+/* static */ bool BackfillState::Enqueuing::all_enqueued(
+ const PeeringFacade& peering_state,
+ const BackfillInterval& backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+{
+ const bool all_local_enqueued = \
+ backfill_info.extends_to_end() && backfill_info.empty();
+ const bool all_peer_enqueued = std::all_of(
+ std::begin(peer_backfill_info),
+ std::end(peer_backfill_info),
+ [] (const auto& kv) {
+ [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv;
+ return peer_backfill_info.extends_to_end() && peer_backfill_info.empty();
+ });
+ return all_local_enqueued && all_peer_enqueued;
+}
+
+hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
+ const auto iter = peer_backfill_info.find(bt);
+ ceph_assert(iter != peer_backfill_info.end());
+ e = std::min(e, iter->second.begin);
+ }
+ return e;
+}
+
+bool BackfillState::Enqueuing::should_rescan_replicas(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const
+{
+ const auto& targets = peering_state().get_backfill_targets();
+ return std::any_of(std::begin(targets), std::end(targets),
+ [&] (const auto& bt) {
+ return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt),
+ backfill_info);
+ });
+}
+
+bool BackfillState::Enqueuing::should_rescan_primary(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const
+{
+ return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
+ !backfill_info.extends_to_end();
+}
+
+void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
+ BackfillState::Enqueuing::result_t&& result,
+ hobject_t& last_backfill_started,
+ std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+{
+ std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
+ [&peer_backfill_info] (const auto& bt) {
+ peer_backfill_info.at(bt).pop_front();
+ });
+ last_backfill_started = std::move(result.new_last_backfill_started);
+}
+
+BackfillState::Enqueuing::result_t
+BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
+{
+ // set `new_last_backfill_started` to `check`
+ result_t result { {}, check };
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ const auto& pbi = backfill_state().peer_backfill_info.at(bt);
+ if (pbi.begin == check) {
+ result.pbi_targets.insert(bt);
+ const auto& version = pbi.objects.begin()->second;
+ backfill_state().progress_tracker->enqueue_drop(pbi.begin);
+ backfill_listener().enqueue_drop(bt, pbi.begin, version);
+ }
+ }
+ logger().debug("{}: BACKFILL removing {} from peers {}",
+ __func__, check, result.pbi_targets);
+ ceph_assert(!result.pbi_targets.empty());
+ return result;
+}
+
+BackfillState::Enqueuing::result_t
+BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
+{
+ logger().debug("{}: check={}", __func__, check);
+ const auto& primary_bi = backfill_state().backfill_info;
+ result_t result { {}, primary_bi.begin };
+
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
+
+ // Find all check peers that have the wrong version
+ if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
+ check == primary_bi.begin && check == peer_bi.begin) {
+ if(peer_bi.objects.begin()->second != obj_v &&
+ backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
+ backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ } else {
+ // it's fine, keep it! OR already recovering
+ }
+ result.pbi_targets.insert(bt);
+ } else {
+      // Only include peers whose backfill line we've caught up to;
+      // otherwise they only appear to be missing this object
+      // because their peer_bi.begin > backfill_info.begin.
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) &&
+ backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
+ backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ }
+ }
+ }
+ return result;
+}
+
+bool BackfillState::Enqueuing::Enqueuing::all_emptied(
+ const BackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+{
+ const auto& targets = peering_state().get_backfill_targets();
+ const auto replicas_emptied =
+ std::all_of(std::begin(targets), std::end(targets),
+ [&] (const auto& bt) {
+ return peer_backfill_info.at(bt).empty();
+ });
+ return local_backfill_info.empty() && replicas_emptied;
+}
+
+BackfillState::Enqueuing::Enqueuing(my_context ctx)
+ : my_base(ctx)
+{
+ auto& primary_bi = backfill_state().backfill_info;
+
+ // update our local interval to cope with recent changes
+ primary_bi.begin = backfill_state().last_backfill_started;
+ if (primary_bi.version < peering_state().get_log_tail()) {
+ // it might be that the OSD is so flooded with modifying operations
+ // that backfill will be spinning here over and over. For the sake
+    // of performance and complexity we don't synchronize with the entire PG;
+    // something similar can happen in the classical OSD.
+ logger().warn("{}: bi is old, rescanning of local backfill_info",
+ __func__);
+ post_event(RequestPrimaryScanning{});
+ return;
+ } else {
+ maybe_update_range();
+ }
+ trim_backfill_infos();
+
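+  // main loop: walk the object namespace in hobject_t order, issuing drops
+  // for objects the peers have but the primary no longer does, and pushes
+  // for objects that are missing or stale on the peers, until the scanned
+  // intervals are drained or the op budget runs out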
+ while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ if (!backfill_listener().budget_available()) {
+ post_event(RequestWaiting{});
+ return;
+ } else if (should_rescan_replicas(backfill_state().peer_backfill_info,
+ primary_bi)) {
+ // Count simultaneous scans as a single op and let those complete
+ post_event(RequestReplicasScanning{});
+ return;
+ }
+ // Get object within set of peers to operate on and the set of targets
+ // for which that object applies.
+ if (const hobject_t check = \
+ earliest_peer_backfill(backfill_state().peer_backfill_info);
+ check < primary_bi.begin) {
+ // Don't increment ops here because deletions
+ // are cheap and not replied to unlike real recovery_ops,
+ // and we can't increment ops without requeueing ourself
+ // for recovery.
+ auto result = remove_on_peers(check);
+ trim_backfilled_object_from_intervals(std::move(result),
+ backfill_state().last_backfill_started,
+ backfill_state().peer_backfill_info);
+ } else {
+ auto result = update_on_peers(check);
+ trim_backfilled_object_from_intervals(std::move(result),
+ backfill_state().last_backfill_started,
+ backfill_state().peer_backfill_info);
+ primary_bi.pop_front();
+ }
+ backfill_listener().maybe_flush();
+ }
+
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+    // need to grab another chunk of the object namespace and restart
+ // the queueing.
+ logger().debug("{}: reached end for current local chunk",
+ __func__);
+ post_event(RequestPrimaryScanning{});
+ } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+ post_event(RequestDone{});
+ } else {
+ logger().debug("{}: reached end for both local and all peers "
+ "but still has in-flight operations", __func__);
+ post_event(RequestWaiting{});
+ }
+}
+
+// -- PrimaryScanning
+BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx)
+ : my_base(ctx)
+{
+ backfill_state().backfill_info.version = peering_state().get_last_update();
+ backfill_listener().request_primary_scan(
+ backfill_state().backfill_info.begin);
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(PrimaryScanned evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state().backfill_info = std::move(evt.result);
+ return transit<Enqueuing>();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(ObjectPushed evt)
+{
+ logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ return discard_event();
+}
+
+// -- ReplicasScanning
+bool BackfillState::ReplicasScanning::replica_needs_scan(
+ const BackfillInterval& replica_backfill_info,
+ const BackfillInterval& local_backfill_info)
+{
+ return replica_backfill_info.empty() && \
+ replica_backfill_info.begin <= local_backfill_info.begin && \
+ !replica_backfill_info.extends_to_end();
+}
+
+BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx)
+ : my_base(ctx)
+{
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ if (const auto& pbi = backfill_state().peer_backfill_info.at(bt);
+ replica_needs_scan(pbi, backfill_state().backfill_info)) {
+ logger().debug("{}: scanning peer osd.{} from {}",
+ __func__, bt, pbi.end);
+ backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{});
+
+ ceph_assert(waiting_on_backfill.find(bt) == \
+ waiting_on_backfill.end());
+ waiting_on_backfill.insert(bt);
+ }
+ }
+ ceph_assert(!waiting_on_backfill.empty());
+ // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
+}
+
+#if 0
+BackfillState::ReplicasScanning::~ReplicasScanning()
+{
+ // TODO: finish_recovery_op(hobject_t::get_max());
+}
+#endif
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ReplicaScanned evt)
+{
+ logger().debug("{}: got scan result from osd={}, result={}",
+ __func__, evt.from, evt.result);
+ // TODO: maybe we'll be able to move waiting_on_backfill from
+ // the machine to the state.
+ ceph_assert(peering_state().is_backfill_target(evt.from));
+ if (waiting_on_backfill.erase(evt.from)) {
+ backfill_state().peer_backfill_info[evt.from] = std::move(evt.result);
+ if (waiting_on_backfill.empty()) {
+ ceph_assert(backfill_state().peer_backfill_info.size() == \
+ peering_state().get_backfill_targets().size());
+ return transit<Enqueuing>();
+ }
+ } else {
+    // we canceled backfill for a while because a peer was too full, and this
+    // is an extra response from a non-too-full peer
+ logger().debug("{}: canceled backfill (too full?)", __func__);
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ObjectPushed evt)
+{
+ logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ return discard_event();
+}
+
+
+// -- Waiting
+BackfillState::Waiting::Waiting(my_context ctx)
+ : my_base(ctx)
+{
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(ObjectPushed evt)
+{
+ logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ if (!Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ return transit<Enqueuing>();
+ } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+ return transit<Done>();
+ } else {
+ // we still have something to wait on
+ logger().debug("Waiting::react() on ObjectPushed; still waiting");
+ return discard_event();
+ }
+}
+
+// -- Done
+BackfillState::Done::Done(my_context ctx)
+ : my_base(ctx)
+{
+ logger().info("{}: backfill is done", __func__);
+ backfill_listener().backfilled();
+}
+
+// -- Crashed
+BackfillState::Crashed::Crashed()
+{
+ ceph_abort_msg("{}: this should not happen");
+}
+
+// ProgressTracker is an intermediary between the BackfillListener and
+// BackfillMachine + its states. All requests to push or drop an object
+// are directed through it. The same happens with notifications about
+// completing given operations which are generated by BackfillListener
+// and dispatched as e.g. ObjectPushed events.
+// This allows ProgressTracker to track the list of in-flight operations,
+// which is essential for deciding whether the entire machine should
+// switch from Waiting to Done or stay in Waiting.
+// ProgressTracker also coordinates .last_backfill_started and stats
+// updates.
+bool BackfillState::ProgressTracker::tracked_objects_completed() const
+{
+ return registry.empty();
+}
+
+bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj)
+{
+ [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace(
+ obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt});
+ return first_seen;
+}
+
+void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj)
+{
+ registry.try_emplace(
+ obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}});
+}
+
+void BackfillState::ProgressTracker::complete_to(
+ const hobject_t& obj,
+ const pg_stat_t& stats)
+{
+ logger().debug("{}: obj={}",
+ __func__, obj);
+ if (auto completion_iter = registry.find(obj);
+ completion_iter != std::end(registry)) {
+ completion_iter->second = \
+ registry_item_t{ op_stage_t::completed_push, stats };
+ } else {
+ ceph_abort_msg("completing untracked object shall not happen");
+ }
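+  // flush the contiguous prefix of completed pushes and enqueued drops from
+  // the registry, updating backfill object stats for each; stop at the first
+  // push that is still in flight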
+ for (auto it = std::begin(registry);
+ it != std::end(registry) &&
+ it->second.stage != op_stage_t::enqueued_push;
+ it = registry.erase(it)) {
+ auto& [soid, item] = *it;
+ assert(item.stats);
+ peering_state().update_complete_backfill_object_stats(
+ soid,
+ *item.stats);
+ }
+ if (Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info) &&
+ tracked_objects_completed()) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ } else {
+ backfill_listener().update_peers_last_backfill(obj);
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
new file mode 100644
index 000000000..4bd2991fb
--- /dev/null
+++ b/src/crimson/osd/backfill_state.h
@@ -0,0 +1,382 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <optional>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "osd/recovery_types.h"
+
+namespace crimson::osd {
+
+namespace sc = boost::statechart;
+
+struct BackfillState {
+ struct BackfillListener;
+ struct PeeringFacade;
+ struct PGFacade;
+
+ // events comes first
+ struct PrimaryScanned : sc::event<PrimaryScanned> {
+ BackfillInterval result;
+ PrimaryScanned(BackfillInterval&& result)
+ : result(std::move(result)) {
+ }
+ };
+
+ struct ReplicaScanned : sc::event<ReplicaScanned> {
+ pg_shard_t from;
+ BackfillInterval result;
+ ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
+ : from(std::move(from)),
+ result(std::move(result)) {
+ }
+ };
+
+ struct ObjectPushed : sc::event<ObjectPushed> {
+ // TODO: implement replica management; I don't want to follow
+ // current convention where the backend layer is responsible
+ // for tracking replicas.
+ hobject_t object;
+ pg_stat_t stat;
+ ObjectPushed(hobject_t object)
+ : object(std::move(object)) {
+ }
+ };
+
+ struct Triggered : sc::event<Triggered> {
+ };
+
+private:
+ // internal events
+ struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> {
+ };
+
+ struct RequestReplicasScanning : sc::event<RequestReplicasScanning> {
+ };
+
+ struct RequestWaiting : sc::event<RequestWaiting> {
+ };
+
+ struct RequestDone : sc::event<RequestDone> {
+ };
+
+ class ProgressTracker;
+
+public:
+
+ struct Initial;
+ struct Enqueuing;
+ struct PrimaryScanning;
+ struct ReplicasScanning;
+ struct Waiting;
+ struct Done;
+
+ struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> {
+ BackfillMachine(BackfillState& backfill_state,
+ BackfillListener& backfill_listener,
+ std::unique_ptr<PeeringFacade> peering_state,
+ std::unique_ptr<PGFacade> pg);
+ ~BackfillMachine();
+ BackfillState& backfill_state;
+ BackfillListener& backfill_listener;
+ std::unique_ptr<PeeringFacade> peering_state;
+ std::unique_ptr<PGFacade> pg;
+ };
+
+private:
+ template <class S>
+ struct StateHelper {
+ StateHelper();
+ ~StateHelper();
+
+ BackfillState& backfill_state() {
+ return static_cast<S*>(this) \
+ ->template context<BackfillMachine>().backfill_state;
+ }
+ BackfillListener& backfill_listener() {
+ return static_cast<S*>(this) \
+ ->template context<BackfillMachine>().backfill_listener;
+ }
+ PeeringFacade& peering_state() {
+ return *static_cast<S*>(this) \
+ ->template context<BackfillMachine>().peering_state;
+ }
+ PGFacade& pg() {
+ return *static_cast<S*>(this)->template context<BackfillMachine>().pg;
+ }
+
+ const PeeringFacade& peering_state() const {
+ return *static_cast<const S*>(this) \
+ ->template context<BackfillMachine>().peering_state;
+ }
+ const BackfillState& backfill_state() const {
+ return static_cast<const S*>(this) \
+ ->template context<BackfillMachine>().backfill_state;
+ }
+ };
+
+public:
+
+ // states
+ struct Crashed : sc::simple_state<Crashed, BackfillMachine>,
+ StateHelper<Crashed> {
+ explicit Crashed();
+ };
+
+ struct Initial : sc::state<Initial, BackfillMachine>,
+ StateHelper<Initial> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<Triggered>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Initial(my_context);
+ // initialize after triggering backfill by on_activate_complete().
+ // transit to Enqueuing.
+ sc::result react(const Triggered&);
+ };
+
+ struct Enqueuing : sc::state<Enqueuing, BackfillMachine>,
+ StateHelper<Enqueuing> {
+ using reactions = boost::mpl::list<
+ sc::transition<RequestPrimaryScanning, PrimaryScanning>,
+ sc::transition<RequestReplicasScanning, ReplicasScanning>,
+ sc::transition<RequestWaiting, Waiting>,
+ sc::transition<RequestDone, Done>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Enqueuing(my_context);
+
+ // indicate whether there is any remaining work to do when it comes
+ // to comparing the hobject_t namespace between primary and replicas.
+ // true doesn't necessarily mean backfill is done -- there could be
+ // in-flight pushes or drops which had been enqueued but aren't
+ // completed yet.
+ static bool all_enqueued(
+ const PeeringFacade& peering_state,
+ const BackfillInterval& backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+
+ private:
+ void maybe_update_range();
+ void trim_backfill_infos();
+
+ // these methods take BackfillIntervals instead of extracting them from
+ // the state to emphasize the relationships across the main loop.
+ bool all_emptied(
+ const BackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ hobject_t earliest_peer_backfill(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ bool should_rescan_replicas(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const;
+    // indicate whether a particular acting primary needs to be scanned again
+    // to process the next piece of the hobject_t namespace.
+    // the logic is analogous to replica_needs_scan(). See comments there.
+ bool should_rescan_primary(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const;
+
+ // the result_t is intermediary between {remove,update}_on_peers() and
+ // updating BackfillIntervals in trim_backfilled_object_from_intervals.
+ // This step is important because it affects the main loop's condition,
+ // and thus deserves to be exposed instead of being called deeply from
+ // {remove,update}_on_peers().
+ struct [[nodiscard]] result_t {
+ std::set<pg_shard_t> pbi_targets;
+ hobject_t new_last_backfill_started;
+ };
+ void trim_backfilled_object_from_intervals(
+ result_t&&,
+ hobject_t& last_backfill_started,
+ std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ result_t remove_on_peers(const hobject_t& check);
+ result_t update_on_peers(const hobject_t& check);
+ };
+
+ struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>,
+ StateHelper<PrimaryScanning> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::custom_reaction<PrimaryScanned>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit PrimaryScanning(my_context);
+ sc::result react(ObjectPushed);
+ // collect scanning result and transit to Enqueuing.
+ sc::result react(PrimaryScanned);
+ };
+
+ struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
+ StateHelper<ReplicasScanning> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::custom_reaction<ReplicaScanned>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit ReplicasScanning(my_context);
+ // collect scanning result; if all results are collected, transition
+ // to Enqueuing will happen.
+ sc::result react(ObjectPushed);
+ sc::result react(ReplicaScanned);
+
+ // indicate whether a particular peer should be scanned to retrieve
+    // BackfillInterval for a new range of the hobject_t namespace.
+    // true when bi.objects is exhausted, the replica bi's end is not MAX,
+    // and the primary bi's begin is further than the replica's one.
+ static bool replica_needs_scan(
+ const BackfillInterval& replica_backfill_info,
+ const BackfillInterval& local_backfill_info);
+
+ private:
+ std::set<pg_shard_t> waiting_on_backfill;
+ };
+
+ struct Waiting : sc::state<Waiting, BackfillMachine>,
+ StateHelper<Waiting> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Waiting(my_context);
+ sc::result react(ObjectPushed);
+ };
+
+ struct Done : sc::state<Done, BackfillMachine>,
+ StateHelper<Done> {
+ using reactions = boost::mpl::list<
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Done(my_context);
+ };
+
+ BackfillState(BackfillListener& backfill_listener,
+ std::unique_ptr<PeeringFacade> peering_state,
+ std::unique_ptr<PGFacade> pg);
+ ~BackfillState();
+
+ void process_event(
+ boost::intrusive_ptr<const sc::event_base> evt) {
+ backfill_machine.process_event(*std::move(evt));
+ }
+
+ hobject_t get_last_backfill_started() const {
+ return last_backfill_started;
+ }
+private:
+ hobject_t last_backfill_started;
+ BackfillInterval backfill_info;
+ std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ BackfillMachine backfill_machine;
+ std::unique_ptr<ProgressTracker> progress_tracker;
+};
+
+// BackfillListener -- an interface used by the backfill FSM to request
+// low-level services like issuing `MOSDPGPush` or `MOSDPGBackfillRemove`.
+// The goals behind the interface are: 1) unit-testability; 2) the possibility
+// of retrofitting the classical OSD with BackfillState. For the second reason we
+// never use `seastar::future` -- instead responses to the requests are
+// conveyed as events; see ObjectPushed as an example.
+struct BackfillState::BackfillListener {
+ virtual void request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end) = 0;
+
+ virtual void request_primary_scan(
+ const hobject_t& begin) = 0;
+
+ virtual void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v) = 0;
+
+ virtual void enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v) = 0;
+
+ virtual void maybe_flush() = 0;
+
+ virtual void update_peers_last_backfill(
+ const hobject_t& new_last_backfill) = 0;
+
+ virtual bool budget_available() const = 0;
+
+ virtual void backfilled() = 0;
+
+ virtual ~BackfillListener() = default;
+};
+
+// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying
+// the interface of PeeringState. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct BackfillState::PeeringFacade {
+ virtual hobject_t earliest_backfill() const = 0;
+ virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0;
+ virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0;
+ virtual const eversion_t& get_last_update() const = 0;
+ virtual const eversion_t& get_log_tail() const = 0;
+
+ // the performance impact of `std::function` has not been considered yet.
+  // If there is any evidence (e.g. from profiling) of its significance, we
+  // can switch back to the template variant.
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>;
+ virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0;
+
+ virtual bool is_backfill_target(pg_shard_t peer) const = 0;
+ virtual void update_complete_backfill_object_stats(const hobject_t &hoid,
+ const pg_stat_t &stats) = 0;
+ virtual bool is_backfilling() const = 0;
+ virtual ~PeeringFacade() {}
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct BackfillState::PGFacade {
+ virtual const eversion_t& get_projected_last_update() const = 0;
+ virtual ~PGFacade() {}
+};
+
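+// ProgressTracker keeps the registry of in-flight pushes and drops enqueued
+// by the Enqueuing state, so the machine can tell when everything that was
+// enqueued has actually completed; see the longer comment in
+// backfill_state.cc.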
+class BackfillState::ProgressTracker {
+ // TODO: apply_stat,
+ enum class op_stage_t {
+ enqueued_push,
+ enqueued_drop,
+ completed_push,
+ };
+
+ struct registry_item_t {
+ op_stage_t stage;
+ std::optional<pg_stat_t> stats;
+ };
+
+ BackfillMachine& backfill_machine;
+ std::map<hobject_t, registry_item_t> registry;
+
+ BackfillState& backfill_state() {
+ return backfill_machine.backfill_state;
+ }
+ PeeringFacade& peering_state() {
+ return *backfill_machine.peering_state;
+ }
+ BackfillListener& backfill_listener() {
+ return backfill_machine.backfill_listener;
+ }
+
+public:
+ ProgressTracker(BackfillMachine& backfill_machine)
+ : backfill_machine(backfill_machine) {
+ }
+
+ bool tracked_objects_completed() const;
+
+ bool enqueue_push(const hobject_t&);
+ void enqueue_drop(const hobject_t&);
+ void complete_to(const hobject_t&, const pg_stat_t&);
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc
new file mode 100644
index 000000000..d555d6cdc
--- /dev/null
+++ b/src/crimson/osd/ec_backend.cc
@@ -0,0 +1,37 @@
+#include "ec_backend.h"
+
+#include "crimson/osd/shard_services.h"
+
+ECBackend::ECBackend(shard_id_t shard,
+ ECBackend::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t&,
+ uint64_t,
+ DoutPrefixProvider &dpp)
+ : PGBackend{shard, coll, shard_services, dpp}
+{
+ // todo
+}
+
+ECBackend::ll_read_ierrorator::future<ceph::bufferlist>
+ECBackend::_read(const hobject_t& hoid,
+ const uint64_t off,
+ const uint64_t len,
+ const uint32_t flags)
+{
+ // todo
+ return seastar::make_ready_future<bufferlist>();
+}
+
+ECBackend::rep_op_fut_t
+ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ // todo
+ return {seastar::now(),
+ seastar::make_ready_future<crimson::osd::acked_peers_t>()};
+}
diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h
new file mode 100644
index 000000000..3dbcc4def
--- /dev/null
+++ b/src/crimson/osd/ec_backend.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+#include "include/buffer_fwd.h"
+#include "osd/osd_types.h"
+#include "pg_backend.h"
+
+class ECBackend : public PGBackend
+{
+public:
+ ECBackend(shard_id_t shard,
+ CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ uint64_t stripe_width,
+ DoutPrefixProvider &dpp);
+ seastar::future<> stop() final {
+ return seastar::now();
+ }
+ void on_actingset_changed(bool same_primary) final {}
+private:
+ ll_read_ierrorator::future<ceph::bufferlist>
+ _read(const hobject_t& hoid, uint64_t off, uint64_t len, uint32_t flags) override;
+ rep_op_fut_t
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& req,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) final;
+ CollectionRef coll;
+ crimson::os::FuturizedStore::Shard* store;
+ seastar::future<> request_committed(const osd_reqid_t& reqid,
+ const eversion_t& version) final {
+ return seastar::now();
+ }
+};
diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h
new file mode 100644
index 000000000..2783ed252
--- /dev/null
+++ b/src/crimson/osd/exceptions.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <system_error>
+
+#include "crimson/common/errorator.h"
+
+namespace crimson::osd {
+class error : private std::system_error {
+public:
+ error(const std::errc ec)
+ : system_error(std::make_error_code(ec)) {
+ }
+
+ using system_error::code;
+ using system_error::what;
+
+ friend error make_error(int ret);
+
+private:
+ error(const int ret) noexcept
+ : system_error(ret, std::system_category()) {
+ }
+};
+
+inline error make_error(const int ret) {
+ return error{ret};
+}
+
+struct object_not_found : public error {
+ object_not_found() : error(std::errc::no_such_file_or_directory) {}
+};
+
+struct invalid_argument : public error {
+ invalid_argument() : error(std::errc::invalid_argument) {}
+};
+
+// FIXME: error handling
+struct permission_denied : public error {
+ permission_denied() : error(std::errc::operation_not_permitted) {}
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc
new file mode 100644
index 000000000..266e56533
--- /dev/null
+++ b/src/crimson/osd/heartbeat.cc
@@ -0,0 +1,819 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "heartbeat.h"
+
+#include <boost/range/join.hpp>
+#include <fmt/chrono.h>
+#include <fmt/os.h>
+
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/formatter.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/mon/MonClient.h"
+
+#include "osd/OSDMap.h"
+
+using std::set;
+using std::string;
+using crimson::common::local_conf;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+Heartbeat::Heartbeat(osd_id_t whoami,
+ crimson::osd::ShardServices& service,
+ crimson::mon::Client& monc,
+ crimson::net::Messenger &front_msgr,
+ crimson::net::Messenger &back_msgr)
+ : whoami{whoami},
+ service{service},
+ monc{monc},
+ front_msgr{front_msgr},
+ back_msgr{back_msgr},
+ // do this in background
+ timer{[this] {
+ heartbeat_check();
+ (void)send_heartbeats();
+ }},
+ failing_peers{*this}
+{}
+
+seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs,
+ entity_addrvec_t back_addrs)
+{
+ logger().info("heartbeat: start front_addrs={}, back_addrs={}",
+ front_addrs, back_addrs);
+ // i only care about the address, so any unused port would work
+ for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) {
+ addr.set_port(0);
+ }
+
+ using crimson::net::SocketPolicy;
+ front_msgr.set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossy_client(0));
+ back_msgr.set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossy_client(0));
+ return seastar::when_all_succeed(start_messenger(front_msgr,
+ front_addrs),
+ start_messenger(back_msgr,
+ back_addrs))
+ .then_unpack([this] {
+ timer.arm_periodic(
+ std::chrono::seconds(local_conf()->osd_heartbeat_interval));
+ });
+}
+
+seastar::future<>
+Heartbeat::start_messenger(crimson::net::Messenger& msgr,
+ const entity_addrvec_t& addrs)
+{
+ return msgr.bind(addrs).safe_then([this, &msgr]() mutable {
+ return msgr.start({this});
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [addrs] (const std::error_code& e) {
+ logger().error("heartbeat messenger bind({}): {}", addrs, e);
+ ceph_abort();
+ }));
+}
+
+seastar::future<> Heartbeat::stop()
+{
+ logger().info("{}", __func__);
+ timer.cancel();
+ front_msgr.stop();
+ back_msgr.stop();
+ return gate.close().then([this] {
+ return seastar::when_all_succeed(front_msgr.shutdown(),
+ back_msgr.shutdown());
+ }).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+const entity_addrvec_t& Heartbeat::get_front_addrs() const
+{
+ return front_msgr.get_myaddrs();
+}
+
+const entity_addrvec_t& Heartbeat::get_back_addrs() const
+{
+ return back_msgr.get_myaddrs();
+}
+
+crimson::net::Messenger& Heartbeat::get_front_msgr() const
+{
+ return front_msgr;
+}
+
+crimson::net::Messenger& Heartbeat::get_back_msgr() const
+{
+ return back_msgr;
+}
+
+void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch)
+{
+ assert(whoami != _peer);
+ auto [iter, added] = peers.try_emplace(_peer, *this, _peer);
+ auto& peer = iter->second;
+ peer.set_epoch_added(epoch);
+}
+
+Heartbeat::osds_t Heartbeat::remove_down_peers()
+{
+ osds_t old_osds; // osds not added in this epoch
+ for (auto i = peers.begin(); i != peers.end(); ) {
+ auto osdmap = service.get_map();
+ const auto& [osd, peer] = *i;
+ if (!osdmap->is_up(osd)) {
+ i = peers.erase(i);
+ } else {
+ if (peer.get_epoch_added() < osdmap->get_epoch()) {
+ old_osds.push_back(osd);
+ }
+ ++i;
+ }
+ }
+ return old_osds;
+}
+
+void Heartbeat::add_reporter_peers(int whoami)
+{
+ auto osdmap = service.get_map();
+ // include next and previous up osds to ensure we have a fully-connected set
+ set<int> want;
+ if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) {
+ want.insert(next);
+ }
+ if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) {
+ want.insert(prev);
+ }
+  // make sure we have at least **min_down** osds coming from different
+  // subtrees (e.g., hosts) for fast failure detection.
+ auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters");
+ auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level");
+ osdmap->get_random_up_osds_by_subtree(
+ whoami, subtree, min_down, want, &want);
+ auto epoch = osdmap->get_epoch();
+ for (int osd : want) {
+ add_peer(osd, epoch);
+ };
+}
+
+void Heartbeat::update_peers(int whoami)
+{
+ const auto min_peers = static_cast<size_t>(
+ local_conf().get_val<int64_t>("osd_heartbeat_min_peers"));
+ add_reporter_peers(whoami);
+ auto extra = remove_down_peers();
+ // too many?
+ for (auto& osd : extra) {
+ if (peers.size() <= min_peers) {
+ break;
+ }
+ remove_peer(osd);
+ }
+ // or too few?
+ auto osdmap = service.get_map();
+ auto epoch = osdmap->get_epoch();
+ for (auto next = osdmap->get_next_up_osd_after(whoami);
+ peers.size() < min_peers && next >= 0 && next != whoami;
+ next = osdmap->get_next_up_osd_after(next)) {
+ add_peer(next, epoch);
+ }
+}
+
+Heartbeat::osds_t Heartbeat::get_peers() const
+{
+ osds_t osds;
+ osds.reserve(peers.size());
+ for (auto& peer : peers) {
+ osds.push_back(peer.first);
+ }
+ return osds;
+}
+
+void Heartbeat::remove_peer(osd_id_t peer)
+{
+ assert(peers.count(peer) == 1);
+ peers.erase(peer);
+}
+
+std::optional<seastar::future<>>
+Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ switch (m->get_type()) {
+ case MSG_OSD_PING:
+ return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_reset(conn, is_replace);
+ }
+}
+
+void Heartbeat::ms_handle_connect(
+ crimson::net::ConnectionRef conn,
+ seastar::shard_id prv_shard)
+{
+ ceph_assert_always(seastar::this_shard_id() == prv_shard);
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_connect(conn);
+ }
+}
+
+void Heartbeat::ms_handle_accept(
+ crimson::net::ConnectionRef conn,
+ seastar::shard_id prv_shard,
+ bool is_replace)
+{
+ ceph_assert_always(seastar::this_shard_id() == prv_shard);
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_accept(conn, is_replace);
+ }
+}
+
+seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ switch (m->op) {
+ case MOSDPing::PING:
+ return handle_ping(conn, m);
+ case MOSDPing::PING_REPLY:
+ return handle_reply(conn, m);
+ case MOSDPing::YOU_DIED:
+ return handle_you_died();
+ default:
+ return seastar::now();
+ }
+}
+
+seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ auto min_message = static_cast<uint32_t>(
+ local_conf()->osd_heartbeat_min_size);
+ auto reply =
+ crimson::make_message<MOSDPing>(
+ m->fsid,
+ service.get_map()->get_epoch(),
+ MOSDPing::PING_REPLY,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ service.get_mnow(),
+ service.get_up_epoch(),
+ min_message);
+ return conn->send(std::move(reply)
+ ).then([this, m, conn] {
+ return maybe_share_osdmap(conn, m);
+ });
+}
+
+seastar::future<> Heartbeat::maybe_share_osdmap(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ const osd_id_t from = m->get_source().num();
+ const epoch_t current_osdmap_epoch = service.get_map()->get_epoch();
+ auto found = peers.find(from);
+ if (found == peers.end()) {
+ return seastar::now();
+ }
+ auto& peer = found->second;
+
+ if (m->map_epoch > peer.get_projected_epoch()) {
+ logger().debug("{} updating peer {} session's projected_epoch"
+ "from {} to ping map epoch of {}",
+ __func__, from, peer.get_projected_epoch(),
+ m->map_epoch);
+ peer.set_projected_epoch(m->map_epoch);
+ }
+
+ if (current_osdmap_epoch <= peer.get_projected_epoch()) {
+ logger().debug("{} peer {} projected_epoch {} is already later "
+ "than our osdmap epoch of {}",
+ __func__ , from, peer.get_projected_epoch(),
+ current_osdmap_epoch);
+ return seastar::now();
+ }
+
+ const epoch_t send_from = peer.get_projected_epoch();
+ logger().debug("{} sending peer {} peer maps from projected epoch {} through "
+ "local osdmap epoch {}",
+ __func__,
+ from,
+ send_from,
+ current_osdmap_epoch);
+ peer.set_projected_epoch(current_osdmap_epoch);
+ return service.send_incremental_map_to_osd(from, send_from);
+}
+
+seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ const osd_id_t from = m->get_source().num();
+ auto found = peers.find(from);
+ if (found == peers.end()) {
+ // stale reply
+ return seastar::now();
+ }
+ auto& peer = found->second;
+ return peer.handle_reply(conn, m
+ ).then([this, conn, m] {
+ return maybe_share_osdmap(conn, m);
+ });
+}
+
+seastar::future<> Heartbeat::handle_you_died()
+{
+ // TODO: ask for newer osdmap
+ return seastar::now();
+}
+
+void Heartbeat::heartbeat_check()
+{
+ failure_queue_t failure_queue;
+ const auto now = clock::now();
+ for (const auto& [osd, peer] : peers) {
+ auto failed_since = peer.failed_since(now);
+ if (!clock::is_zero(failed_since)) {
+ failure_queue.emplace(osd, failed_since);
+ }
+ }
+ if (!failure_queue.empty()) {
+    // send_failures can run in the background, because
+    // 1. When send_failures returns, the messages may not have been
+    //    fully sent yet, which might look risky: if the OSD shuts
+    //    down, the remaining part of the sending operation could
+    //    reference OSD and Heartbeat instances that have already
+    //    been deleted. However, the remaining work of that sending
+    //    operation holds no reference back to the OSD or Heartbeat
+    //    instances, so it does not actually carry that risk.
+    // 2. Messages are sent in order, so even if a later check finds
+    //    a previously "failed" peer to be healthy again, the "still
+    //    alive" message is guaranteed to be sent after the earlier
+    //    "osd failure" message that reported it, which is totally
+    //    safe.
+ (void)send_failures(std::move(failure_queue));
+ }
+}
+
+seastar::future<> Heartbeat::send_heartbeats()
+{
+ const auto mnow = service.get_mnow();
+ const auto now = clock::now();
+
+ std::vector<seastar::future<>> futures;
+ for (auto& [osd, peer] : peers) {
+ peer.send_heartbeat(now, mnow, futures);
+ }
+ return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue)
+{
+ std::vector<seastar::future<>> futures;
+ const auto now = clock::now();
+ for (auto [osd, failed_since] : failure_queue) {
+ failing_peers.add_pending(osd, failed_since, now, futures);
+ }
+
+ return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+void Heartbeat::print(std::ostream& out) const
+{
+ out << "heartbeat";
+}
+
+Heartbeat::Connection::~Connection()
+{
+ if (conn) {
+ conn->mark_down();
+ }
+}
+
+bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const
+{
+ return (conn && conn == _conn);
+}
+
+bool Heartbeat::Connection::accepted(
+ crimson::net::ConnectionRef accepted_conn,
+ bool is_replace)
+{
+ ceph_assert(accepted_conn);
+ ceph_assert(accepted_conn != conn);
+ if (accepted_conn->get_peer_addr() != listener.get_peer_addr(type)) {
+ return false;
+ }
+
+ if (is_replace) {
+ logger().info("Heartbeat::Connection::accepted(): "
+ "{} racing", *this);
+ racing_detected = true;
+ }
+ if (conn) {
+ // there is no assumption about the ordering of the reset and accept
+ // events for the 2 racing connections.
+ if (is_connected) {
+ logger().warn("Heartbeat::Connection::accepted(): "
+ "{} is accepted while connected, is_replace={}",
+ *this, is_replace);
+ conn->mark_down();
+ set_unconnected();
+ }
+ }
+ conn = accepted_conn;
+ set_connected();
+ return true;
+}
+
+void Heartbeat::Connection::reset(bool is_replace)
+{
+ if (is_replace) {
+ logger().info("Heartbeat::Connection::reset(): "
+ "{} racing, waiting for the replacing accept",
+ *this);
+ racing_detected = true;
+ }
+
+ if (is_connected) {
+ set_unconnected();
+ } else {
+ conn = nullptr;
+ }
+
+ if (is_replace) {
+ // waiting for the replacing accept event
+ } else if (!racing_detected || is_winner_side) {
+ connect();
+ } else { // racing_detected && !is_winner_side
+ logger().info("Heartbeat::Connection::reset(): "
+ "{} racing detected and lose, "
+ "waiting for peer connect me", *this);
+ }
+}
+
+seastar::future<> Heartbeat::Connection::send(MessageURef msg)
+{
+ assert(is_connected);
+ return conn->send(std::move(msg));
+}
+
+void Heartbeat::Connection::validate()
+{
+ assert(is_connected);
+ auto peer_addr = listener.get_peer_addr(type);
+ if (conn->get_peer_addr() != peer_addr) {
+ logger().info("Heartbeat::Connection::validate(): "
+ "{} has new address {} over {}, reset",
+ *this, peer_addr, conn->get_peer_addr());
+ conn->mark_down();
+ racing_detected = false;
+ reset();
+ }
+}
+
+void Heartbeat::Connection::retry()
+{
+ racing_detected = false;
+ if (!is_connected) {
+ if (conn) {
+ conn->mark_down();
+ reset();
+ } else {
+ connect();
+ }
+ }
+}
+
+void Heartbeat::Connection::set_connected()
+{
+ assert(conn);
+ assert(!is_connected);
+ ceph_assert(conn->is_connected());
+ is_connected = true;
+ listener.increase_connected();
+}
+
+void Heartbeat::Connection::set_unconnected()
+{
+ assert(conn);
+ assert(is_connected);
+ conn = nullptr;
+ is_connected = false;
+ listener.decrease_connected();
+}
+
+void Heartbeat::Connection::connect()
+{
+ assert(!conn);
+ auto addr = listener.get_peer_addr(type);
+ conn = msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer));
+ if (conn->is_connected()) {
+ set_connected();
+ }
+}
+
+Heartbeat::clock::time_point
+Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const
+{
+ if (do_health_screen(now) == health_state::UNHEALTHY) {
+ auto oldest_deadline = ping_history.begin()->second.deadline;
+ auto failed_since = std::min(last_rx_back, last_rx_front);
+ if (clock::is_zero(failed_since)) {
+ logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+ "ever on either front or back, first ping sent {} "
+ "(oldest deadline {})",
+ peer, first_tx, oldest_deadline);
+ failed_since = first_tx;
+ } else {
+ logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+ "since back {} front {} (oldest deadline {})",
+ peer, last_rx_back, last_rx_front, oldest_deadline);
+ }
+ return failed_since;
+ } else {
+ return clock::zero();
+ }
+}
+
+void Heartbeat::Session::set_inactive_history(clock::time_point now)
+{
+ assert(!connected);
+ if (ping_history.empty()) {
+ const utime_t sent_stamp{now};
+ const auto deadline =
+ now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+ ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+ } else { // the entry is already added
+ assert(ping_history.size() == 1);
+ }
+}
+
+Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer)
+ : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer},
+ con_front(peer, heartbeat.whoami > peer, Connection::type_t::front,
+ heartbeat.front_msgr, *this),
+ con_back(peer, heartbeat.whoami > peer, Connection::type_t::back,
+ heartbeat.back_msgr, *this)
+{
+ logger().info("Heartbeat::Peer: osd.{} added", peer);
+}
+
+Heartbeat::Peer::~Peer()
+{
+ logger().info("Heartbeat::Peer: osd.{} removed", peer);
+}
+
+void Heartbeat::Peer::send_heartbeat(
+ clock::time_point now, ceph::signedspan mnow,
+ std::vector<seastar::future<>>& futures)
+{
+ session.set_tx(now);
+ if (session.is_started()) {
+ do_send_heartbeat(now, mnow, &futures);
+ for_each_conn([] (auto& conn) {
+ conn.validate();
+ });
+ } else {
+ // we should send MOSDPing but still cannot at this moment
+ if (pending_send) {
+      // we have already been pending for an entire heartbeat interval
+ logger().warn("Heartbeat::Peer::send_heartbeat(): "
+ "heartbeat to osd.{} is still pending...", peer);
+ for_each_conn([] (auto& conn) {
+ conn.retry();
+ });
+ } else {
+ logger().info("Heartbeat::Peer::send_heartbeat(): "
+ "heartbeat to osd.{} is pending send...", peer);
+ session.set_inactive_history(now);
+ pending_send = true;
+ }
+ }
+}
+
+void Heartbeat::Peer::handle_reset(
+ crimson::net::ConnectionRef conn, bool is_replace)
+{
+ int cnt = 0;
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.matches(conn)) {
+ ++cnt;
+ _conn.reset(is_replace);
+ }
+ });
+
+ if (cnt == 0) {
+ logger().info("Heartbeat::Peer::handle_reset(): {} ignores conn, is_replace={} -- {}",
+ *this, is_replace, *conn);
+ } else if (cnt > 1) {
+ logger().error("Heartbeat::Peer::handle_reset(): {} handles conn {} times -- {}",
+ *this, cnt, *conn);
+ }
+}
+
+void Heartbeat::Peer::handle_connect(crimson::net::ConnectionRef conn)
+{
+ int cnt = 0;
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.matches(conn)) {
+ ++cnt;
+ _conn.connected();
+ }
+ });
+
+ if (cnt == 0) {
+ logger().error("Heartbeat::Peer::handle_connect(): {} ignores conn -- {}",
+ *this, *conn);
+ conn->mark_down();
+ } else if (cnt > 1) {
+ logger().error("Heartbeat::Peer::handle_connect(): {} handles conn {} times -- {}",
+ *this, cnt, *conn);
+ }
+}
+
+void Heartbeat::Peer::handle_accept(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ int cnt = 0;
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.accepted(conn, is_replace)) {
+ ++cnt;
+ }
+ });
+
+ if (cnt == 0) {
+ logger().warn("Heartbeat::Peer::handle_accept(): {} ignores conn -- {}",
+ *this, *conn);
+ } else if (cnt > 1) {
+ logger().error("Heartbeat::Peer::handle_accept(): {} handles conn {} times -- {}",
+ *this, cnt, *conn);
+ }
+}
+
+seastar::future<> Heartbeat::Peer::handle_reply(
+ crimson::net::ConnectionRef conn, Ref<MOSDPing> m)
+{
+ if (!session.is_started()) {
+ // we haven't sent any ping yet
+ return seastar::now();
+ }
+ type_t type;
+ if (con_front.matches(conn)) {
+ type = type_t::front;
+ } else if (con_back.matches(conn)) {
+ type = type_t::back;
+ } else {
+ return seastar::now();
+ }
+ const auto now = clock::now();
+ if (session.on_pong(m->ping_stamp, type, now)) {
+ if (session.do_health_screen(now) == Session::health_state::HEALTHY) {
+ return heartbeat.failing_peers.cancel_one(peer);
+ }
+ }
+ return seastar::now();
+}
+
+entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type)
+{
+ const auto osdmap = heartbeat.service.get_map();
+ if (type == type_t::front) {
+ return osdmap->get_hb_front_addrs(peer).front();
+ } else {
+ return osdmap->get_hb_back_addrs(peer).front();
+ }
+}
+
+void Heartbeat::Peer::on_connected()
+{
+ logger().info("Heartbeat::Peer: osd.{} connected (send={})",
+ peer, pending_send);
+ session.on_connected();
+ if (pending_send) {
+ pending_send = false;
+ do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr);
+ }
+}
+
+void Heartbeat::Peer::on_disconnected()
+{
+ logger().info("Heartbeat::Peer: osd.{} disconnected", peer);
+ session.on_disconnected();
+}
+
+void Heartbeat::Peer::do_send_heartbeat(
+ Heartbeat::clock::time_point now,
+ ceph::signedspan mnow,
+ std::vector<seastar::future<>>* futures)
+{
+ const utime_t sent_stamp{now};
+ const auto deadline =
+ now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+ session.on_ping(sent_stamp, deadline);
+ for_each_conn([&, this] (auto& conn) {
+ auto min_message = static_cast<uint32_t>(
+ local_conf()->osd_heartbeat_min_size);
+ auto ping = crimson::make_message<MOSDPing>(
+ heartbeat.monc.get_fsid(),
+ heartbeat.service.get_map()->get_epoch(),
+ MOSDPing::PING,
+ sent_stamp,
+ mnow,
+ mnow,
+ heartbeat.service.get_up_epoch(),
+ min_message);
+ if (futures) {
+ futures->push_back(conn.send(std::move(ping)));
+ }
+ });
+}
+
+bool Heartbeat::FailingPeers::add_pending(
+ osd_id_t peer,
+ clock::time_point failed_since,
+ clock::time_point now,
+ std::vector<seastar::future<>>& futures)
+{
+ if (failure_pending.count(peer)) {
+ return false;
+ }
+ auto failed_for = std::chrono::duration_cast<std::chrono::seconds>(
+ now - failed_since).count();
+ auto osdmap = heartbeat.service.get_map();
+ auto failure_report =
+ crimson::make_message<MOSDFailure>(heartbeat.monc.get_fsid(),
+ peer,
+ osdmap->get_addrs(peer),
+ static_cast<int>(failed_for),
+ osdmap->get_epoch());
+ failure_pending.emplace(peer, failure_info_t{failed_since,
+ osdmap->get_addrs(peer)});
+ futures.push_back(heartbeat.monc.send_message(std::move(failure_report)));
+ logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for);
+ return true;
+}
+
+seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer)
+{
+ if (auto pending = failure_pending.find(peer);
+ pending != failure_pending.end()) {
+ auto fut = send_still_alive(peer, pending->second.addrs);
+ failure_pending.erase(peer);
+ return fut;
+ }
+ return seastar::now();
+}
+
+seastar::future<>
+Heartbeat::FailingPeers::send_still_alive(
+ osd_id_t osd, const entity_addrvec_t& addrs)
+{
+ auto still_alive = crimson::make_message<MOSDFailure>(
+ heartbeat.monc.get_fsid(),
+ osd,
+ addrs,
+ 0,
+ heartbeat.service.get_map()->get_epoch(),
+ MOSDFailure::FLAG_ALIVE);
+ logger().info("{}: osd.{}", __func__, osd);
+ return heartbeat.monc.send_message(std::move(still_alive));
+}
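Peer::handle_reply() above feeds each pong into Session::on_pong() (defined in heartbeat.h below), where every outstanding ping starts with two expected acknowledgements, one per front/back connection. A self-contained sketch of that bookkeeping, using plain integers instead of utime_t (illustrative only, not crimson API):

#include <cassert>
#include <iterator>
#include <map>

struct reply_state { int unacknowledged = 2; };   // front + back

int main()
{
  std::map<unsigned, reply_state> history;        // keyed by send timestamp
  history.emplace(1u, reply_state{});             // ping sent at t=1
  history.emplace(2u, reply_state{});             // ping sent at t=2

  auto on_pong = [&](unsigned stamp) {
    auto it = history.find(stamp);
    if (it == history.end()) {
      return;                                     // stale pong, already pruned
    }
    if (--it->second.unacknowledged == 0) {
      // both connections replied: drop this entry and everything older
      history.erase(history.begin(), std::next(it));
    }
  };

  on_pong(1);                                     // front reply for t=1
  on_pong(1);                                     // back reply for t=1
  assert(history.size() == 1 && history.begin()->first == 2u);
  return 0;
}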
diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h
new file mode 100644
index 000000000..f5da45118
--- /dev/null
+++ b/src/crimson/osd/heartbeat.h
@@ -0,0 +1,461 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <seastar/core/future.hh>
+#include "common/ceph_time.h"
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+
+class MOSDPing;
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+
+class Heartbeat : public crimson::net::Dispatcher {
+public:
+ using osd_id_t = int;
+
+ Heartbeat(osd_id_t whoami,
+ crimson::osd::ShardServices& service,
+ crimson::mon::Client& monc,
+ crimson::net::Messenger &front_msgr,
+ crimson::net::Messenger &back_msgr);
+
+ seastar::future<> start(entity_addrvec_t front,
+ entity_addrvec_t back);
+ seastar::future<> stop();
+
+ using osds_t = std::vector<osd_id_t>;
+ void add_peer(osd_id_t peer, epoch_t epoch);
+ void update_peers(int whoami);
+ void remove_peer(osd_id_t peer);
+ osds_t get_peers() const;
+
+ const entity_addrvec_t& get_front_addrs() const;
+ const entity_addrvec_t& get_back_addrs() const;
+
+ crimson::net::Messenger &get_front_msgr() const;
+ crimson::net::Messenger &get_back_msgr() const;
+
+ // Dispatcher methods
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef conn, MessageRef m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override;
+ void ms_handle_connect(crimson::net::ConnectionRef conn, seastar::shard_id) override;
+ void ms_handle_accept(crimson::net::ConnectionRef conn, seastar::shard_id, bool is_replace) override;
+
+ void print(std::ostream&) const;
+private:
+ seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_you_died();
+
+ /// remove down OSDs
+ /// @return peers not added in this epoch
+ osds_t remove_down_peers();
+ /// add enough reporters for fast failure detection
+ void add_reporter_peers(int whoami);
+
+ seastar::future<> start_messenger(crimson::net::Messenger& msgr,
+ const entity_addrvec_t& addrs);
+ seastar::future<> maybe_share_osdmap(crimson::net::ConnectionRef,
+ Ref<MOSDPing> m);
+private:
+ const osd_id_t whoami;
+ crimson::osd::ShardServices& service;
+ crimson::mon::Client& monc;
+ crimson::net::Messenger &front_msgr;
+ crimson::net::Messenger &back_msgr;
+
+ seastar::timer<seastar::lowres_clock> timer;
+ // use real_clock so it can be converted to utime_t
+ using clock = ceph::coarse_real_clock;
+
+ class ConnectionListener;
+ class Connection;
+ class Session;
+ class Peer;
+ using peers_map_t = std::map<osd_id_t, Peer>;
+ peers_map_t peers;
+
+ // osds which are considered failed
+  // osd_id => the last time that both front and back pings were acked
+  //   (or, if neither was ever acked, the time the first ping was sent);
+  // used for calculating how long the OSD has been unresponsive
+ using failure_queue_t = std::map<osd_id_t, clock::time_point>;
+ seastar::future<> send_failures(failure_queue_t&& failure_queue);
+ seastar::future<> send_heartbeats();
+ void heartbeat_check();
+
+  // osds we've reported to the monitor as failed, but which are not marked
+  //   down yet
+ crimson::common::Gated gate;
+
+ class FailingPeers {
+ public:
+ FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {}
+ bool add_pending(osd_id_t peer,
+ clock::time_point failed_since,
+ clock::time_point now,
+ std::vector<seastar::future<>>& futures);
+ seastar::future<> cancel_one(osd_id_t peer);
+
+ private:
+ seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&);
+
+ Heartbeat& heartbeat;
+
+ struct failure_info_t {
+ clock::time_point failed_since;
+ entity_addrvec_t addrs;
+ };
+ std::map<osd_id_t, failure_info_t> failure_pending;
+ } failing_peers;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) {
+ hb.print(out);
+ return out;
+}
+
+/*
+ * Event-driven interface for Heartbeat::Peer to be notified when both hb_front
+ * and hb_back are connected, or when either connection is lost.
+ */
+class Heartbeat::ConnectionListener {
+ public:
+ ConnectionListener(size_t connections) : connections{connections} {}
+
+ void increase_connected() {
+ assert(connected < connections);
+ ++connected;
+ if (connected == connections) {
+ on_connected();
+ }
+ }
+ void decrease_connected() {
+ assert(connected > 0);
+ if (connected == connections) {
+ on_disconnected();
+ }
+ --connected;
+ }
+ enum class type_t { front, back };
+ virtual entity_addr_t get_peer_addr(type_t) = 0;
+
+ protected:
+ virtual void on_connected() = 0;
+ virtual void on_disconnected() = 0;
+
+ private:
+ const size_t connections;
+ size_t connected = 0;
+};
+
+class Heartbeat::Connection {
+ public:
+ using type_t = ConnectionListener::type_t;
+ Connection(osd_id_t peer, bool is_winner_side, type_t type,
+ crimson::net::Messenger& msgr,
+ ConnectionListener& listener)
+ : peer{peer}, type{type},
+ msgr{msgr}, listener{listener},
+ is_winner_side{is_winner_side} {
+ connect();
+ }
+ Connection(const Connection&) = delete;
+ Connection(Connection&&) = delete;
+ Connection& operator=(const Connection&) = delete;
+ Connection& operator=(Connection&&) = delete;
+
+ ~Connection();
+
+ bool matches(crimson::net::ConnectionRef _conn) const;
+ void connected() {
+ set_connected();
+ }
+ bool accepted(crimson::net::ConnectionRef, bool is_replace);
+ void reset(bool is_replace=false);
+ seastar::future<> send(MessageURef msg);
+ void validate();
+ // retry connection if still pending
+ void retry();
+
+ private:
+ void set_connected();
+ void set_unconnected();
+ void connect();
+
+ const osd_id_t peer;
+ const type_t type;
+ crimson::net::Messenger& msgr;
+ ConnectionListener& listener;
+
+/*
+ * Resolve the following racing when both me and peer are trying to connect
+ * each other symmetrically, under SocketPolicy::lossy_client:
+ *
+ * OSD.A OSD.B
+ * - -
+ * |-[1]----> <----[2]-|
+ * \ /
+ * \ /
+ * delay.. X delay..
+ * / \
+ * |-[1]x> / \ <x[2]-|
+ * |<-[2]--- ---[1]->|
+ * |(reset#1) (reset#2)|
+ * |(reconnectB) (reconnectA)|
+ * |-[2]---> <---[1]-|
+ * delay.. delay..
+ * (remote close populated)
+ * |-[2]x> <x[1]-|
+ * |(reset#2) (reset#1)|
+ * | ... ... |
+ * (dead loop!)
+ *
+ * Our solution is to remember if such racing was happened recently, and
+ * establish connection asymmetrically only from the winner side whose osd-id
+ * is larger.
+ */
+ const bool is_winner_side;
+ bool racing_detected = false;
+
+ crimson::net::ConnectionRef conn;
+ bool is_connected = false;
+
+ friend std::ostream& operator<<(std::ostream& os, const Connection& c) {
+ if (c.type == type_t::front) {
+ return os << "con_front(osd." << c.peer << ")";
+ } else {
+ return os << "con_back(osd." << c.peer << ")";
+ }
+ }
+};
+
+/*
+ * Track the ping history and ping replies (the pongs) from the same session;
+ * clean up the history once hb_front or hb_back loses its connection, and
+ * restart the session once both connections are connected again.
+ *
+ * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back
+ * loses connection, because we would end up with the following deadloop:
+ *
+ * OSD.A OSD.B
+ * - -
+ * hb_front reset <--(network)--- hb_front close
+ * | ^
+ * | |
+ * remove Peer B (dead loop!) remove Peer A
+ * | |
+ * V |
+ * hb_back close ----(network)---> hb_back reset
+ */
+class Heartbeat::Session {
+ public:
+ Session(osd_id_t peer) : peer{peer} {}
+
+ void set_epoch_added(epoch_t epoch_) { epoch = epoch_; }
+ epoch_t get_epoch_added() const { return epoch; }
+
+ void set_projected_epoch(epoch_t epoch_) { projected_epoch = epoch_; }
+ epoch_t get_projected_epoch() const { return projected_epoch; }
+
+ bool is_started() const { return connected; }
+ bool pinged() const {
+ if (clock::is_zero(first_tx)) {
+ // i can never receive a pong without sending any ping message first.
+ assert(clock::is_zero(last_rx_front) &&
+ clock::is_zero(last_rx_back));
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ enum class health_state {
+ UNKNOWN,
+ UNHEALTHY,
+ HEALTHY,
+ };
+ health_state do_health_screen(clock::time_point now) const {
+ if (!pinged()) {
+      // we are neither healthy nor unhealthy because we haven't sent anything yet
+ return health_state::UNKNOWN;
+ } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) {
+ return health_state::UNHEALTHY;
+ } else if (!clock::is_zero(last_rx_front) &&
+ !clock::is_zero(last_rx_back)) {
+      // only declare healthy once we have received the first
+      // replies from both front/back connections
+ return health_state::HEALTHY;
+ } else {
+ return health_state::UNKNOWN;
+ }
+ }
+
+ clock::time_point failed_since(clock::time_point now) const;
+
+ void set_tx(clock::time_point now) {
+ if (!pinged()) {
+ first_tx = now;
+ }
+ last_tx = now;
+ }
+
+ void on_connected() {
+ assert(!connected);
+ connected = true;
+ ping_history.clear();
+ }
+
+ void on_ping(const utime_t& sent_stamp,
+ const clock::time_point& deadline) {
+ assert(connected);
+ [[maybe_unused]] auto [reply, added] =
+ ping_history.emplace(sent_stamp, reply_t{deadline, 2});
+ }
+
+ bool on_pong(const utime_t& ping_stamp,
+ Connection::type_t type,
+ clock::time_point now) {
+ assert(connected);
+ auto ping = ping_history.find(ping_stamp);
+ if (ping == ping_history.end()) {
+ // old replies, deprecated by newly sent pings.
+ return false;
+ }
+ auto& unacked = ping->second.unacknowledged;
+ assert(unacked);
+ if (type == Connection::type_t::front) {
+ last_rx_front = now;
+ unacked--;
+ } else {
+ last_rx_back = now;
+ unacked--;
+ }
+ if (unacked == 0) {
+ ping_history.erase(ping_history.begin(), ++ping);
+ }
+ return true;
+ }
+
+ void on_disconnected() {
+ assert(connected);
+ connected = false;
+ if (!ping_history.empty()) {
+ // we lost our ping_history of the last session, but still need to keep
+ // the oldest deadline for unhealthy check.
+ auto oldest = ping_history.begin();
+ auto sent_stamp = oldest->first;
+ auto deadline = oldest->second.deadline;
+ ping_history.clear();
+ ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+ }
+ }
+
+ // maintain an entry in ping_history for unhealthy check
+ void set_inactive_history(clock::time_point);
+
+ private:
+ const osd_id_t peer;
+ bool connected = false;
+ // time we sent our first ping request
+ clock::time_point first_tx;
+ // last time we sent a ping request
+ clock::time_point last_tx;
+ // last time we got a ping reply on the front side
+ clock::time_point last_rx_front;
+ // last time we got a ping reply on the back side
+ clock::time_point last_rx_back;
+ // most recent epoch we wanted this peer
+ epoch_t epoch; // rename me to epoch_added
+ // epoch we expect peer to be at once our sent incrementals are processed
+ epoch_t projected_epoch = 0;
+
+ struct reply_t {
+ clock::time_point deadline;
+ // one sent over front conn, another sent over back conn
+ uint8_t unacknowledged = 0;
+ };
+  // history of inflight pings, ordered by the timestamp at which we sent them
+ std::map<utime_t, reply_t> ping_history;
+};
+
+class Heartbeat::Peer final : private Heartbeat::ConnectionListener {
+ public:
+ Peer(Heartbeat&, osd_id_t);
+ ~Peer();
+ Peer(Peer&&) = delete;
+ Peer(const Peer&) = delete;
+ Peer& operator=(Peer&&) = delete;
+ Peer& operator=(const Peer&) = delete;
+
+ // set/get the epoch at which the peer was added
+ void set_epoch_added(epoch_t epoch) { session.set_epoch_added(epoch); }
+ epoch_t get_epoch_added() const { return session.get_epoch_added(); }
+
+ void set_projected_epoch(epoch_t epoch) { session.set_projected_epoch(epoch); }
+ epoch_t get_projected_epoch() const { return session.get_projected_epoch(); }
+
+ // if failure, return time_point since last active
+ // else, return clock::zero()
+ clock::time_point failed_since(clock::time_point now) const {
+ return session.failed_since(now);
+ }
+ void send_heartbeat(
+ clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&);
+ seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>);
+
+ void handle_reset(crimson::net::ConnectionRef conn, bool is_replace);
+
+ void handle_connect(crimson::net::ConnectionRef conn);
+
+ void handle_accept(crimson::net::ConnectionRef conn, bool is_replace);
+
+ private:
+ entity_addr_t get_peer_addr(type_t type) override;
+ void on_connected() override;
+ void on_disconnected() override;
+ void do_send_heartbeat(
+ clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*);
+
+ template <typename Func>
+ void for_each_conn(Func&& f) {
+ f(con_front);
+ f(con_back);
+ }
+
+ Heartbeat& heartbeat;
+ const osd_id_t peer;
+ Session session;
+  // whether we still need to send a heartbeat once the session is connected
+ bool pending_send = false;
+ Connection con_front;
+ Connection con_back;
+
+ friend std::ostream& operator<<(std::ostream& os, const Peer& p) {
+ return os << "peer(osd." << p.peer << ")";
+ }
+};
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<Heartbeat> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<Heartbeat::Connection> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<Heartbeat::Peer> : fmt::ostream_formatter {};
+#endif
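The race-resolution comment inside Heartbeat::Connection above boils down to a single tie-break: once a connect/accept race has been observed, only the side with the larger osd id re-establishes the connection, which is the same whoami > peer comparison Heartbeat::Peer uses to compute is_winner_side. A one-function sketch of that rule (the helper name is hypothetical):

// Illustrative only; mirrors Connection::reset()'s decision to call connect().
inline bool should_reconnect(int whoami, int peer, bool racing_detected)
{
  // no race observed: either side may reconnect freely;
  // race observed: only the winner (larger osd id) reconnects, while the
  // loser waits for the peer to connect to it
  return !racing_detected || whoami > peer;
}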
diff --git a/src/crimson/osd/lsan_suppressions.cc b/src/crimson/osd/lsan_suppressions.cc
new file mode 100644
index 000000000..53b7eb630
--- /dev/null
+++ b/src/crimson/osd/lsan_suppressions.cc
@@ -0,0 +1,20 @@
+#ifndef _NDEBUG
+// The callbacks we define here will be called from the sanitizer runtime, but
+// aren't referenced from the crimson-osd executable. We must ensure that those
+// callbacks are not sanitizer-instrumented, and that they aren't stripped by
+// the linker.
+#define SANITIZER_HOOK_ATTRIBUTE \
+ extern "C" \
+ __attribute__((no_sanitize("address", "thread", "undefined"))) \
+ __attribute__((visibility("default"))) \
+ __attribute__((used))
+
+static char kLSanDefaultSuppressions[] =
+ "leak:InitModule\n"
+ "leak:MallocExtension::Initialize\n"
+ "leak:MallocExtension::Register\n";
+
+SANITIZER_HOOK_ATTRIBUTE const char *__lsan_default_suppressions() {
+ return kLSanDefaultSuppressions;
+}
+#endif // ! _NDEBUG
diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc
new file mode 100644
index 000000000..1e817415d
--- /dev/null
+++ b/src/crimson/osd/main.cc
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <fstream>
+#include <random>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/print.hh>
+#include <seastar/core/prometheus.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/http/httpd.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/util/closeable.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "auth/KeyRing.h"
+#include "common/ceph_argparse.h"
+#include "common/config_tracker.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/fatal_signal.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/stop_signal.h"
+#include "crimson/osd/main_config_bootstrap_helpers.h"
+#include "global/pidfile.h"
+#include "osd.h"
+
+using namespace std::literals;
+namespace bpo = boost::program_options;
+using crimson::common::local_conf;
+using crimson::common::sharded_conf;
+using crimson::common::sharded_perf_coll;
+
+static seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+
+seastar::future<> make_keyring()
+{
+ const auto path = local_conf().get_val<std::string>("keyring");
+ return seastar::file_exists(path).then([path](bool exists) {
+ KeyRing keyring;
+ EntityName name{local_conf()->name};
+ EntityAuth auth;
+ if (exists &&
+ keyring.load(nullptr, path) == 0 &&
+ keyring.get_auth(name, auth)) {
+ fmt::print(std::cerr, "already have key in keyring: {}\n", path);
+ return seastar::now();
+ } else {
+ CephContext temp_cct{};
+ auth.key.create(&temp_cct, CEPH_CRYPTO_AES);
+ keyring.add(name, auth);
+ bufferlist bl;
+ keyring.encode_plaintext(bl);
+ const auto permissions = (seastar::file_permissions::user_read |
+ seastar::file_permissions::user_write);
+ return crimson::write_file(std::move(bl), path, permissions);
+ }
+ }).handle_exception_type([path](const std::filesystem::filesystem_error& e) {
+ fmt::print(std::cerr, "FATAL: writing new keyring to {}: {}\n", path, e.what());
+ throw e;
+ });
+}
+
+static std::ofstream maybe_set_logger()
+{
+ std::ofstream log_file_stream;
+ if (auto log_file = local_conf()->log_file; !log_file.empty()) {
+ log_file_stream.open(log_file, std::ios::app | std::ios::out);
+ try {
+ seastar::throw_system_error_on(log_file_stream.fail());
+ } catch (const std::system_error& e) {
+ ceph_abort_msg(fmt::format("unable to open log file: {}", e.what()));
+ }
+ logger().set_ostream(log_file_stream);
+ }
+ return log_file_stream;
+}
+
+int main(int argc, const char* argv[])
+{
+ auto early_config_result = crimson::osd::get_early_config(argc, argv);
+ if (!early_config_result.has_value()) {
+ int r = early_config_result.error();
+ std::cerr << "do_early_config returned error: " << r << std::endl;
+ return r;
+ }
+ auto &early_config = early_config_result.value();
+
+ auto seastar_n_early_args = early_config.get_early_args();
+ auto config_proxy_args = early_config.get_ceph_args();
+
+ seastar::app_template::config app_cfg;
+ app_cfg.name = "Crimson";
+ app_cfg.auto_handle_sigint_sigterm = false;
+ seastar::app_template app(std::move(app_cfg));
+ app.add_options()
+ ("mkkey", "generate a new secret key. "
+ "This is normally used in combination with --mkfs")
+ ("mkfs", "create a [new] data directory")
+ ("debug", "enable debug output on all loggers")
+ ("trace", "enable trace output on all loggers")
+ ("osdspec-affinity", bpo::value<std::string>()->default_value(std::string{}),
+ "set affinity to an osdspec")
+ ("prometheus_port", bpo::value<uint16_t>()->default_value(0),
+ "Prometheus port. Set to zero to disable")
+ ("prometheus_address", bpo::value<std::string>()->default_value("0.0.0.0"),
+ "Prometheus listening address")
+ ("prometheus_prefix", bpo::value<std::string>()->default_value("osd"),
+ "Prometheus metrics prefix");
+
+ try {
+ return app.run(
+ seastar_n_early_args.size(),
+ const_cast<char**>(seastar_n_early_args.data()),
+ [&] {
+ auto& config = app.configuration();
+ return seastar::async([&] {
+ try {
+ FatalSignal fatal_signal;
+ seastar_apps_lib::stop_signal should_stop;
+ if (config.count("debug")) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
+ }
+ if (config.count("trace")) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::trace
+ );
+ }
+ sharded_conf().start(
+ early_config.init_params.name, early_config.cluster_name).get();
+ local_conf().start().get();
+ auto stop_conf = seastar::deferred_stop(sharded_conf());
+ sharded_perf_coll().start().get();
+ auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll());
+ local_conf().parse_config_files(early_config.conf_file_list).get();
+ local_conf().parse_env().get();
+ local_conf().parse_argv(config_proxy_args).get();
+ auto log_file_stream = maybe_set_logger();
+ auto reset_logger = seastar::defer([] {
+ logger().set_ostream(std::cerr);
+ });
+ if (const auto ret = pidfile_write(local_conf()->pid_file);
+ ret == -EACCES || ret == -EAGAIN) {
+ ceph_abort_msg(
+ "likely there is another crimson-osd instance with the same id");
+ } else if (ret < 0) {
+ ceph_abort_msg(fmt::format("pidfile_write failed with {} {}",
+ ret, cpp_strerror(-ret)));
+ }
+          // just ignore SIGHUP, we don't reread settings. keep in mind signals
+          // handled by seastar must be blocked for alien threads (see AlienStore).
+ seastar::engine().handle_signal(SIGHUP, [] {});
+
+ // start prometheus API server
+ seastar::httpd::http_server_control prom_server;
+ std::any stop_prometheus;
+ if (uint16_t prom_port = config["prometheus_port"].as<uint16_t>();
+ prom_port != 0) {
+ prom_server.start("prometheus").get();
+ stop_prometheus = seastar::make_shared(seastar::deferred_stop(prom_server));
+
+ seastar::prometheus::config prom_config;
+ prom_config.prefix = config["prometheus_prefix"].as<std::string>();
+ seastar::prometheus::start(prom_server, prom_config).get();
+ seastar::net::inet_address prom_addr(config["prometheus_address"].as<std::string>());
+ prom_server.listen(seastar::socket_address{prom_addr, prom_port})
+ .handle_exception([=] (auto ep) {
+ std::cerr << seastar::format("Could not start Prometheus API server on {}:{}: {}\n",
+ prom_addr, prom_port, ep);
+ return seastar::make_exception_future(ep);
+ }).get();
+ }
+
+ const int whoami = std::stoi(local_conf()->name.get_id());
+ const auto nonce = crimson::osd::get_nonce();
+ crimson::net::MessengerRef cluster_msgr, client_msgr;
+ crimson::net::MessengerRef hb_front_msgr, hb_back_msgr;
+ for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s),
+ make_pair(std::ref(client_msgr), "client"s),
+ make_pair(std::ref(hb_front_msgr), "hb_front"s),
+ make_pair(std::ref(hb_back_msgr), "hb_back"s)}) {
+ msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami),
+ name,
+ nonce,
+ true);
+ }
+ auto store = crimson::os::FuturizedStore::create(
+ local_conf().get_val<std::string>("osd_objectstore"),
+ local_conf().get_val<std::string>("osd_data"),
+ local_conf().get_config_values());
+
+ crimson::osd::OSD osd(
+ whoami, nonce, std::ref(should_stop.abort_source()),
+ std::ref(*store), cluster_msgr, client_msgr,
+ hb_front_msgr, hb_back_msgr);
+
+ if (config.count("mkkey")) {
+ make_keyring().get();
+ }
+ if (local_conf()->no_mon_config) {
+ logger().info("bypassing the config fetch due to --no-mon-config");
+ } else {
+ crimson::osd::populate_config_from_mon().get();
+ }
+ if (config.count("mkfs")) {
+ auto osd_uuid = local_conf().get_val<uuid_d>("osd_uuid");
+ if (osd_uuid.is_zero()) {
+ // use a random osd uuid if not specified
+ osd_uuid.generate_random();
+ }
+ osd.mkfs(
+ *store,
+ whoami,
+ osd_uuid,
+ local_conf().get_val<uuid_d>("fsid"),
+ config["osdspec-affinity"].as<std::string>()).get();
+ }
+ if (config.count("mkkey") || config.count("mkfs")) {
+ return EXIT_SUCCESS;
+ } else {
+ osd.start().get();
+ }
+ logger().info("crimson startup completed");
+ should_stop.wait().get();
+ logger().info("crimson shutting down");
+ osd.stop().get();
+ // stop()s registered using defer() are called here
+ } catch (...) {
+ logger().error("startup failed: {}", std::current_exception());
+ return EXIT_FAILURE;
+ }
+ logger().info("crimson shutdown complete");
+ return EXIT_SUCCESS;
+ });
+ });
+ } catch (...) {
+ fmt::print(std::cerr, "FATAL: Exception during startup, aborting: {}\n", std::current_exception());
+ return EXIT_FAILURE;
+ }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "make -j4 \
+ * -C ../../../build \
+ * crimson-osd"
+ * End:
+ */
diff --git a/src/crimson/osd/main_config_bootstrap_helpers.cc b/src/crimson/osd/main_config_bootstrap_helpers.cc
new file mode 100644
index 000000000..807fd1591
--- /dev/null
+++ b/src/crimson/osd/main_config_bootstrap_helpers.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/main_config_bootstrap_helpers.h"
+
+#include <seastar/core/print.hh>
+#include <seastar/core/prometheus.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/http/httpd.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/util/closeable.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "common/ceph_argparse.h"
+#include "common/config_tracker.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/fatal_signal.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/main_config_bootstrap_helpers.h"
+
+using namespace std::literals;
+using crimson::common::local_conf;
+using crimson::common::sharded_conf;
+using crimson::common::sharded_perf_coll;
+
+static seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+
+namespace crimson::osd {
+
+void usage(const char* prog)
+{
+ std::cout << "usage: " << prog << std::endl;
+ generic_server_usage();
+}
+
+
+seastar::future<> populate_config_from_mon()
+{
+ logger().info("populating config from monitor");
+ // i don't have any client before joining the cluster, so no need to have
+ // a proper auth handler
+ class DummyAuthHandler : public crimson::common::AuthHandler {
+ public:
+ void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps)
+ {}
+ };
+ return seastar::async([] {
+ auto auth_handler = std::make_unique<DummyAuthHandler>();
+ auto msgr = crimson::net::Messenger::create(entity_name_t::CLIENT(),
+ "temp_mon_client",
+ get_nonce(),
+ true);
+ crimson::mon::Client monc{*msgr, *auth_handler};
+ msgr->set_auth_client(&monc);
+ msgr->start({&monc}).get();
+ auto stop_msgr = seastar::defer([&] {
+ msgr->stop();
+ msgr->shutdown().get();
+ });
+ monc.start().handle_exception([] (auto ep) {
+ fmt::print(std::cerr, "FATAL: unable to connect to cluster: {}\n", ep);
+ return seastar::make_exception_future<>(ep);
+ }).get();
+ auto stop_monc = seastar::defer([&] {
+ monc.stop().get();
+ });
+ monc.sub_want("config", 0, 0);
+ monc.renew_subs().get();
+ // wait for monmap and config
+ monc.wait_for_config().get();
+ auto fsid = monc.get_fsid().to_string();
+ local_conf().set_val("fsid", fsid).get();
+ logger().debug("{}: got config from monitor, fsid {}", __func__, fsid);
+ });
+}
+
+static tl::expected<early_config_t, int>
+_get_early_config(int argc, const char *argv[])
+{
+ early_config_t ret;
+
+  // pull the ceph config options off of early_args
+ std::vector<const char *> early_args;
+ early_args.insert(
+ std::end(early_args),
+ argv, argv + argc);
+
+ ret.init_params = ceph_argparse_early_args(
+ early_args,
+ CEPH_ENTITY_TYPE_OSD,
+ &ret.cluster_name,
+ &ret.conf_file_list);
+
+ if (ceph_argparse_need_usage(early_args)) {
+ usage(argv[0]);
+ exit(0);
+ }
+
+ seastar::app_template::config app_cfg;
+ app_cfg.name = "Crimson-startup";
+ app_cfg.auto_handle_sigint_sigterm = false;
+ seastar::app_template app(std::move(app_cfg));
+ const char *bootstrap_args[] = { argv[0], "--smp", "1" };
+ int r = app.run(
+ sizeof(bootstrap_args) / sizeof(bootstrap_args[0]),
+ const_cast<char**>(bootstrap_args),
+ [argc, argv, &ret, &early_args] {
+ return seastar::async([argc, argv, &ret, &early_args] {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug);
+ sharded_conf().start(
+ ret.init_params.name, ret.cluster_name).get();
+ local_conf().start().get();
+ auto stop_conf = seastar::deferred_stop(sharded_conf());
+
+ sharded_perf_coll().start().get();
+ auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll());
+
+ local_conf().parse_env().get();
+ local_conf().parse_argv(early_args).get();
+ local_conf().parse_config_files(ret.conf_file_list).get();
+
+ if (local_conf()->no_mon_config) {
+ logger().info("bypassing the config fetch due to --no-mon-config");
+ } else {
+ populate_config_from_mon().get();
+ }
+
+ // get ceph configs
+ std::set_difference(
+ argv, argv + argc,
+ std::begin(early_args),
+ std::end(early_args),
+ std::back_inserter(ret.ceph_args));
+
+ ret.early_args.insert(
+ std::end(ret.early_args),
+ std::begin(early_args),
+ std::end(early_args));
+
+ if (auto found = std::find_if(
+ std::begin(early_args),
+ std::end(early_args),
+ [](auto* arg) { return "--smp"sv == arg; });
+ found == std::end(early_args)) {
+
+ // Set --smp based on crimson_seastar_smp config option
+ ret.early_args.emplace_back("--smp");
+
+ auto smp_config = local_conf().get_val<uint64_t>(
+ "crimson_seastar_smp");
+
+ ret.early_args.emplace_back(fmt::format("{}", smp_config));
+ logger().info("get_early_config: set --smp {}", smp_config);
+ }
+ return 0;
+ });
+ });
+ if (r < 0) {
+ return tl::unexpected(r);
+ }
+ return ret;
+}
+
+/* get_early_config handles obtaining config parameters required prior
+ * to reactor startup. Most deployment mechanisms (cephadm for one)
+ * rely on pulling configs from the monitor rather than shipping around
+ * config files, so this process needs to support pulling config options
+ * from the monitors.
+ *
+ * Of particular interest are config params related to the seastar
+ * reactor itself which can't be modified after the reactor has been
+ * started -- like the number of cores to use (smp::count). Contacting
+ * the monitors, however, requires a MonClient, which in turn needs a
+ * running reactor.
+ *
+ * Unfortunately, seastar doesn't clean up thread local state
+ * associated with seastar::smp task queues etc, so we can't
+ * start a reactor, stop it, and restart it in the same thread
+ * without an impractical amount of cleanup in seastar.
+ *
+ * More unfortunately, starting a reactor in a separate thread
+ * and then joining the thread still doesn't avoid all global state;
+ * I observed tasks from the previous reactor incarnation nevertheless
+ * continuing to run in the new one, resulting in a crash as they access
+ * freed memory.
+ *
+ * The approach taken here, therefore, is to actually fork, start a
+ * reactor in the child process, encode the resulting early_config_t,
+ * and send it back to the parent process.
+ */
+tl::expected<early_config_t, int>
+get_early_config(int argc, const char *argv[])
+{
+ int pipes[2];
+ int r = pipe2(pipes, 0);
+ if (r < 0) {
+ std::cerr << "get_early_config: failed to create pipes: "
+ << -errno << std::endl;
+ return tl::unexpected(-errno);
+ }
+
+ pid_t worker = fork();
+ if (worker < 0) {
+ close(pipes[0]);
+ close(pipes[1]);
+ std::cerr << "get_early_config: failed to fork: "
+ << -errno << std::endl;
+ return tl::unexpected(-errno);
+ } else if (worker == 0) { // child
+ close(pipes[0]);
+ auto ret = _get_early_config(argc, argv);
+ if (ret.has_value()) {
+ bufferlist bl;
+ ::encode(ret.value(), bl);
+ r = bl.write_fd(pipes[1]);
+ close(pipes[1]);
+ if (r < 0) {
+ std::cerr << "get_early_config: child failed to write_fd: "
+ << r << std::endl;
+ exit(-r);
+ } else {
+ exit(0);
+ }
+ } else {
+ std::cerr << "get_early_config: child failed: "
+ << -ret.error() << std::endl;
+ exit(-ret.error());
+ }
+ return tl::unexpected(-1);
+ } else { // parent
+ close(pipes[1]);
+
+ bufferlist bl;
+ early_config_t ret;
+ while ((r = bl.read_fd(pipes[0], 1024)) > 0);
+ close(pipes[0]);
+
+    // ignore error, we'll propagate errors based on read and decode
+ waitpid(worker, nullptr, 0);
+
+ if (r < 0) {
+ std::cerr << "get_early_config: parent failed to read from pipe: "
+ << r << std::endl;
+ return tl::unexpected(r);
+ }
+ try {
+ auto bliter = bl.cbegin();
+ ::decode(ret, bliter);
+ return ret;
+ } catch (...) {
+ std::cerr << "get_early_config: parent failed to decode" << std::endl;
+ return tl::unexpected(-EINVAL);
+ }
+ }
+}
+
+}
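The block comment above get_early_config() explains why the bootstrap work runs in a forked child that reports back through a pipe. A generic, self-contained sketch of that pattern, using plain POSIX calls and a string payload rather than the encoded early_config_t used by the real code:

#include <sys/wait.h>
#include <unistd.h>
#include <cerrno>
#include <string>

// Illustrative only: run some work in a child process and hand its result
// back to the parent over a pipe, mirroring the shape of get_early_config().
int run_in_child_and_report(std::string& result_out)
{
  int fds[2];
  if (pipe(fds) < 0) {
    return -errno;
  }
  pid_t child = fork();
  if (child < 0) {
    close(fds[0]);
    close(fds[1]);
    return -errno;
  }
  if (child == 0) {                     // child: do the work, write the result
    close(fds[0]);
    const char msg[] = "result-from-child";
    (void)!write(fds[1], msg, sizeof(msg));
    close(fds[1]);
    _exit(0);
  }
  close(fds[1]);                        // parent: read whatever the child wrote
  char buf[128] = {};
  ssize_t n = read(fds[0], buf, sizeof(buf) - 1);
  close(fds[0]);
  waitpid(child, nullptr, 0);
  if (n <= 0) {
    return n < 0 ? -errno : -EIO;
  }
  result_out.assign(buf);
  return 0;
}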
diff --git a/src/crimson/osd/main_config_bootstrap_helpers.h b/src/crimson/osd/main_config_bootstrap_helpers.h
new file mode 100644
index 000000000..7c6131d17
--- /dev/null
+++ b/src/crimson/osd/main_config_bootstrap_helpers.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <fstream>
+#include <random>
+
+#include <seastar/core/future.hh>
+
+#include "common/ceph_argparse.h"
+#include "include/expected.hpp"
+
+namespace crimson::osd {
+
+void usage(const char* prog);
+
+inline uint64_t get_nonce()
+{
+ if (auto pid = getpid(); pid == 1 || std::getenv("CEPH_USE_RANDOM_NONCE")) {
+ // we're running in a container; use a random number instead!
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ return std::uniform_int_distribution<uint64_t>{}(rng);
+ } else {
+ return pid;
+ }
+}
+
+seastar::future<> populate_config_from_mon();
+
+struct early_config_t {
+ std::vector<std::string> early_args;
+ std::vector<std::string> ceph_args;
+
+ std::string cluster_name{"ceph"};
+ std::string conf_file_list;
+ CephInitParameters init_params{CEPH_ENTITY_TYPE_OSD};
+
+  /// The returned vector of pointers must not outlive the input vector in
+ auto to_ptr_vector(const std::vector<std::string> &in) {
+ std::vector<const char *> ret;
+ ret.reserve(in.size());
+ std::transform(
+ std::begin(in), std::end(in),
+ std::back_inserter(ret),
+ [](const auto &str) { return str.c_str(); });
+ return ret;
+ }
+
+ std::vector<const char *> get_early_args() {
+ return to_ptr_vector(early_args);
+ }
+
+ std::vector<const char *> get_ceph_args() {
+ return to_ptr_vector(ceph_args);
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(early_args, bl);
+ encode(ceph_args, bl);
+ encode(cluster_name, bl);
+ encode(conf_file_list, bl);
+ encode(init_params, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(early_args, bl);
+ decode(ceph_args, bl);
+ decode(cluster_name, bl);
+ decode(conf_file_list, bl);
+ decode(init_params, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+/**
+ * get_early_config
+ *
+ * Compile initial configuration information from command line arguments,
+ * config files, and monitors.
+ *
+ * This implementation forks off a worker process to do this work and must
+ * therefore be called very early in main(). (See implementation for an
+ * explanation).
+ */
+tl::expected<early_config_t, int>
+get_early_config(int argc, const char *argv[]);
+
+}
+
+WRITE_CLASS_ENCODER(crimson::osd::early_config_t)
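Since WRITE_CLASS_ENCODER generates free encode()/decode() overloads for early_config_t, the child-to-parent handoff reduces to a bufferlist round trip. A minimal sketch with made-up field values (illustrative only, not a claim about real deployments or the exact overload set in every tree):

#include "crimson/osd/main_config_bootstrap_helpers.h"

void early_config_roundtrip_example()
{
  crimson::osd::early_config_t in;
  in.early_args = {"--smp", "4"};
  in.ceph_args  = {"--osd-data", "/var/lib/ceph/osd/ceph-0"};

  ceph::buffer::list bl;
  encode(in, bl);                 // overload generated by WRITE_CLASS_ENCODER

  crimson::osd::early_config_t out;
  auto it = bl.cbegin();
  decode(out, it);
  // out now holds the same args; in the real flow bl is written to a pipe by
  // the forked child and read back by the parent (see the .cc file above).
}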
diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc
new file mode 100644
index 000000000..4cc9d7336
--- /dev/null
+++ b/src/crimson/osd/objclass.cc
@@ -0,0 +1,584 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdarg>
+#include <cstring>
+#include <boost/container/small_vector.hpp>
+#include "common/ceph_context.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "crimson/common/config_proxy.h"
+#include "common/debug.h"
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/pg_backend.h"
+
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+
+#include "auth/Crypto.h"
+#include "common/armor.h"
+
+using std::map;
+using std::string;
+
+#define dout_context ClassHandler::get_instance().cct
+
+static constexpr int dout_subsys = ceph_subsys_objclass;
+
+static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op)
+{
+  // we can expect the memory under `ret` to still be valid after
+  // executing the osd op as we're running inside the `seastar::thread`
+  // created for us by `seastar::async` in `::do_op_call()`.
+ int ret = 0;
+ using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator;
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op)
+ .handle_error_interruptible(
+ osd_op_errorator::all_same_way([&ret] (const std::error_code& err) {
+ assert(err.value() > 0);
+ ret = -err.value();
+ return seastar::now();
+ })).get(); // we're blocking here which requires `seastar::thread`.
+ return ret;
+}
+
+int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
+ char *indata, int datalen,
+ char **outdata, int *outdatalen)
+{
+// FIXME, HACK: this is for testing only. Let's use the dynamic linker to
+// verify our dependencies
+ return 0;
+}
+
+int cls_getxattr(cls_method_context_t hctx,
+ const char *name,
+ char **outdata,
+ int *outdatalen)
+{
+ return 0;
+}
+
+int cls_setxattr(cls_method_context_t hctx,
+ const char *name,
+ const char *value,
+ int val_len)
+{
+ return 0;
+}
+
+int cls_read(cls_method_context_t hctx,
+ int ofs, int len,
+ char **outdata,
+ int *outdatalen)
+{
+ return 0;
+}
+
+int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
+{
+ assert(origin);
+
+ try {
+ const auto& message = \
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+ *origin = message.get_orig_source_inst();
+ return 0;
+ } catch (crimson::osd::error& e) {
+ return -e.code().value();
+ }
+}
+
+int cls_cxx_create(cls_method_context_t hctx, const bool exclusive)
+{
+ OSDOp op{CEPH_OSD_OP_CREATE};
+ op.op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_remove(cls_method_context_t hctx)
+{
+ OSDOp op{CEPH_OSD_OP_DELETE};
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime)
+{
+ OSDOp op{CEPH_OSD_OP_STAT};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ utime_t ut;
+ uint64_t s;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ if (size) {
+ *size = s;
+ }
+ if (mtime) {
+ *mtime = ut.sec();
+ }
+ return 0;
+}
+
+int cls_cxx_stat2(cls_method_context_t hctx,
+ uint64_t *size,
+ ceph::real_time *mtime)
+{
+ OSDOp op{CEPH_OSD_OP_STAT};
+ if (const int ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ uint64_t dummy_size;
+ real_time dummy_mtime;
+ uint64_t& out_size = size ? *size : dummy_size;
+ real_time& out_mtime = mtime ? *mtime : dummy_mtime;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(out_size, iter);
+ decode(out_mtime, iter);
+ return 0;
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+}
+
+int cls_cxx_read2(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *outbl,
+ uint32_t op_flags)
+{
+ OSDOp op{CEPH_OSD_OP_SYNC_READ};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = len;
+ op.op.flags = op_flags;
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_write2(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *inbl,
+ uint32_t op_flags)
+{
+ OSDOp op{CEPH_OSD_OP_WRITE};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = len;
+ op.op.flags = op_flags;
+ op.indata = *inbl;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl)
+{
+ OSDOp op{CEPH_OSD_OP_WRITEFULL};
+ op.op.extent.offset = 0;
+ op.op.extent.length = inbl->length();
+ op.indata = *inbl;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_replace(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *inbl)
+{
+ {
+ OSDOp top{CEPH_OSD_OP_TRUNCATE};
+ top.op.extent.offset = 0;
+ top.op.extent.length = 0;
+ if (const auto ret = execute_osd_op(hctx, top); ret < 0) {
+ return ret;
+ }
+ }
+
+ {
+ OSDOp wop{CEPH_OSD_OP_WRITE};
+ wop.op.extent.offset = ofs;
+ wop.op.extent.length = len;
+ wop.indata = *inbl;
+ if (const auto ret = execute_osd_op(hctx, wop); ret < 0) {
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int cls_cxx_truncate(cls_method_context_t hctx, int ofs)
+{
+ OSDOp op{CEPH_OSD_OP_TRUNCATE};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = 0;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len)
+{
+ OSDOp op{CEPH_OSD_OP_ZERO};
+ op.op.extent.offset = offset;
+ op.op.extent.length = len;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_getxattr(cls_method_context_t hctx,
+ const char *name,
+ bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_GETXATTR};
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_getxattrs(cls_method_context_t hctx,
+ map<string, bufferlist> *attrset)
+{
+ OSDOp op{CEPH_OSD_OP_GETXATTRS};
+ if (const int ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*attrset, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_setxattr(cls_method_context_t hctx,
+ const char *name,
+ bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_SETXATTR};
+ op.op.xattr.name_len = std::strlen(name);
+ op.op.xattr.value_len = inbl->length();
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(*inbl);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid)
+{
+ OSDOp op{CEPH_OSD_OP_ROLLBACK};
+ op.op.snap.snapid = snapid;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_get_all_vals(cls_method_context_t hctx,
+ map<string, bufferlist>* vals,
+ bool *more)
+{
+ return 0;
+}
+
+int cls_cxx_map_get_keys(cls_method_context_t hctx,
+ const std::string& start_obj,
+ const uint64_t max_to_get,
+ std::set<std::string>* const keys,
+ bool* const more)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETKEYS};
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*keys, iter);
+ decode(*more, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return keys->size();
+}
+
+int cls_cxx_map_get_vals(cls_method_context_t hctx,
+ const std::string& start_obj,
+ const std::string& filter_prefix,
+ const uint64_t max_to_get,
+ std::map<std::string, ceph::bufferlist> *vals,
+ bool* const more)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALS};
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ encode(filter_prefix, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx,
+ const std::set<std::string> &keys,
+ std::map<std::string, ceph::bufferlist> *vals)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS};
+ encode(keys, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*vals, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETHEADER};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return 0;
+}
+
+int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const string &key,
+ bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS};
+ {
+ std::set<std::string> k{key};
+ encode(k, op.indata);
+ }
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ std::map<std::string, ceph::bufferlist> m;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(m, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ if (auto iter = std::begin(m); iter != std::end(m)) {
+ *outbl = std::move(iter->second);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const string &key,
+ bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETVALS};
+ {
+ std::map<std::string, ceph::bufferlist> m;
+ m[key] = *inbl;
+ encode(m, op.indata);
+ }
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_set_vals(cls_method_context_t hctx,
+ const std::map<string, ceph::bufferlist> *map)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETVALS};
+ encode(*map, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_clear(cls_method_context_t hctx)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPCLEAR};
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETHEADER};
+ op.indata = std::move(*inbl);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_remove_range(cls_method_context_t hctx,
+ const std::string& key_begin,
+ const std::string& key_end)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE};
+ encode(key_begin, op.indata);
+ encode(key_end, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPRMKEYS};
+ std::vector<string> to_rm{key};
+ encode(to_rm, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_list_watchers(cls_method_context_t hctx,
+ obj_list_watch_response_t *watchers)
+{
+ OSDOp op{CEPH_OSD_OP_LIST_WATCHERS};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*watchers, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return 0;
+}
+
+uint64_t cls_current_version(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ return ox->get_last_user_version();
+}
+
+
+int cls_current_subop_num(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+  // in contrast to the classical OSD, crimson doesn't count OP_CALL and
+  // OP_STAT, which seems fine given how the plugins we care about use
+  // this part of the API.
+ return ox->get_processed_rw_ops_num();
+}
+
+uint64_t cls_get_features(cls_method_context_t hctx)
+{
+ return 0;
+}
+
+uint64_t cls_get_client_features(cls_method_context_t hctx)
+{
+ try {
+ const auto& message = \
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+ return message.get_features();
+ } catch (crimson::osd::error& e) {
+ return -e.code().value();
+ }
+}
+
+uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ return ox->get_pool_stripe_width();
+}
+
+ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx)
+{
+ // FIXME
+ return ceph_release_t::nautilus;
+}
+
+ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx)
+{
+ // FIXME
+ return ceph_release_t::nautilus;
+}
+
+const ConfigProxy& cls_get_config(cls_method_context_t hctx)
+{
+ return crimson::common::local_conf();
+}
+
+const object_info_t& cls_get_object_info(cls_method_context_t hctx)
+{
+ return reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_object_info();
+}
+
+int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ auto obc = ox->get_obc();
+ if (!obc->obs.exists ||
+ (obc->obs.oi.is_whiteout() &&
+ obc->ssc->snapset.clones.empty())) {
+ return -ENOENT;
+ }
+ *snap_seq = obc->ssc->snapset.seq;
+ return 0;
+}
+
+int cls_cxx_chunk_write_and_set(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *write_inbl,
+ uint32_t op_flags,
+ bufferlist *set_inbl,
+ int set_len)
+{
+ return 0;
+}
+
+int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid)
+{
+ return 0;
+}
+
+uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) {
+ // FIXME
+ return 4096;
+}
+
+int cls_cxx_gather(cls_method_context_t hctx, const std::set<std::string> &src_objs, const std::string& pool,
+ const char *cls, const char *method, bufferlist& inbl)
+{
+ return 0;
+}
+
+int cls_cxx_get_gathered_data(cls_method_context_t hctx, std::map<std::string, bufferlist> *results)
+{
+ return 0;
+}
+
+// although at first glance the implementation looks the same as in
+// the classical OSD, it's different because of how the dout macro expands.
+int cls_log(int level, const char *format, ...)
+{
+ size_t size = 256;
+ va_list ap;
+ while (1) {
+ boost::container::small_vector<char, 256> buf(size);
+ va_start(ap, format);
+ int n = vsnprintf(buf.data(), size, format, ap);
+ va_end(ap);
+#define MAX_SIZE 8196UL
+ if ((n > -1 && static_cast<size_t>(n) < size) || size > MAX_SIZE) {
+ dout(ceph::dout::need_dynamic(level)) << buf.data() << dendl;
+ return n;
+ }
+ size *= 2;
+ }
+}
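+
+// Illustrative sketch, not part of the original file: an object-class
+// plugin calls cls_log() with printf-style formatting, e.g.
+//
+//   cls_log(10, "processing object, ret=%d", ret);
+//
+// and the message ends up in the OSD log at the requested debug level.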
diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc
new file mode 100644
index 000000000..1ea701c22
--- /dev/null
+++ b/src/crimson/osd/object_context.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/object_context.h"
+
+#include <fmt/ranges.h>
+
+#include "common/Formatter.h"
+#include "crimson/common/config_proxy.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf)
+{
+ obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size"));
+ conf.add_observer(this);
+}
+
+ObjectContextRegistry::~ObjectContextRegistry()
+{
+  // purge the cache to avoid leaks and complaints from LSan
+ obc_lru.set_target_size(0UL);
+}
+
+const char** ObjectContextRegistry::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "crimson_osd_obc_lru_size",
+ nullptr
+ };
+ return KEYS;
+}
+
+void ObjectContextRegistry::handle_conf_change(
+ const crimson::common::ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size"));
+}
+
+std::optional<hobject_t> resolve_oid(
+ const SnapSet &ss,
+ const hobject_t &oid)
+{
+  logger().debug("{} oid.snap={}, head snapset.seq={}",
+ __func__, oid.snap, ss.seq);
+ if (oid.snap > ss.seq) {
+ // Because oid.snap > ss.seq, we are trying to read from a snapshot
+ // taken after the most recent write to this object. Read from head.
+ return oid.get_head();
+ } else {
+ // which clone would it be?
+ auto clone = std::lower_bound(
+ begin(ss.clones), end(ss.clones),
+ oid.snap);
+ if (clone == end(ss.clones)) {
+ // Doesn't exist, > last clone, < ss.seq
+ return std::nullopt;
+ }
+ auto citer = ss.clone_snaps.find(*clone);
+ // TODO: how do we want to handle this kind of logic error?
+ ceph_assert(citer != ss.clone_snaps.end());
+
+ if (std::find(
+ citer->second.begin(),
+ citer->second.end(),
+ oid.snap) == citer->second.end()) {
+ logger().debug("{} {} does not contain {} -- DNE",
+ __func__, ss.clone_snaps, oid.snap);
+ return std::nullopt;
+ } else {
+ auto soid = oid;
+ soid.snap = *clone;
+ return std::optional<hobject_t>(soid);
+ }
+ }
+}
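+
+// Illustrative worked example, not part of the original file (the snap ids
+// are made up): for a head whose SnapSet has seq=8, clones=[2,5,8] and
+// clone_snaps={2:[1,2], 5:[3,4,5], 8:[6,7,8]}:
+//  - resolve_oid(ss, oid with snap=9) -> the head (9 > seq, so read head)
+//  - resolve_oid(ss, oid with snap=4) -> clone 5 (lower_bound lands on 5
+//    and clone_snaps[5] contains 4)
+//  - a snap id that the matching clone's clone_snaps does not list
+//    resolves to std::nullopt, i.e. the object did not exist then.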
+
+}
diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h
new file mode 100644
index 000000000..8abf6d3f7
--- /dev/null
+++ b/src/crimson/osd/object_context.h
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <optional>
+#include <utility>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "common/intrusive_lru.h"
+#include "osd/object_state.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/tri_mutex.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::common {
+ class ConfigProxy;
+}
+
+namespace crimson::osd {
+
+class Watch;
+struct SnapSetContext;
+using SnapSetContextRef = boost::intrusive_ptr<SnapSetContext>;
+
+template <typename OBC>
+struct obc_to_hoid {
+ using type = hobject_t;
+ const type &operator()(const OBC &obc) {
+ return obc.obs.oi.soid;
+ }
+};
+
+struct SnapSetContext :
+ public boost::intrusive_ref_counter<SnapSetContext,
+ boost::thread_unsafe_counter>
+{
+ hobject_t oid;
+ SnapSet snapset;
+ bool exists = false;
+ /**
+ * exists
+ *
+   * Because ObjectContexts are cached, we need to be able to express the case
+   * where the object to which a cached ObjectContext refers does not exist.
+   * ObjectContexts for yet-to-be-created objects are initialized with exists=false.
+ * The ObjectContext for a deleted object will have exists set to false until it falls
+ * out of cache (or another write recreates the object).
+ */
+ explicit SnapSetContext(const hobject_t& o) :
+ oid(o), exists(false) {}
+};
+
+class ObjectContext : public ceph::common::intrusive_lru_base<
+ ceph::common::intrusive_lru_config<
+ hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>>
+{
+public:
+ ObjectState obs;
+ SnapSetContextRef ssc;
+  // the watch / notify machinery stays away from the hot, frequently
+  // exercised paths. std::map is used here mostly for developer
+  // convenience.
+ using watch_key_t = std::pair<uint64_t, entity_name_t>;
+ std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers;
+
+ ObjectContext(hobject_t hoid) : obs(std::move(hoid)) {}
+
+ const hobject_t &get_oid() const {
+ return obs.oi.soid;
+ }
+
+ bool is_head() const {
+ return get_oid().is_head();
+ }
+
+ hobject_t get_head_oid() const {
+ return get_oid().get_head();
+ }
+
+ const SnapSet &get_head_ss() const {
+ ceph_assert(is_head());
+ ceph_assert(ssc);
+ return ssc->snapset;
+ }
+
+ void set_head_state(ObjectState &&_obs, SnapSetContextRef &&_ssc) {
+ ceph_assert(is_head());
+ obs = std::move(_obs);
+ ssc = std::move(_ssc);
+ }
+
+ void set_clone_state(ObjectState &&_obs) {
+ ceph_assert(!is_head());
+ obs = std::move(_obs);
+ }
+
+ /// pass the provided exception to any waiting consumers of this ObjectContext
+ template<typename Exception>
+ void interrupt(Exception ex) {
+ lock.abort(std::move(ex));
+ if (recovery_read_marker) {
+ drop_recovery_read();
+ }
+ }
+
+private:
+ tri_mutex lock;
+ bool recovery_read_marker = false;
+
+ template <typename Lock, typename Func>
+ auto _with_lock(Lock&& lock, Func&& func) {
+ Ref obc = this;
+ return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable {
+ return seastar::futurize_invoke(func).finally([&lock, obc] {
+ lock.unlock();
+ });
+ });
+ }
+
+ boost::intrusive::list_member_hook<> list_hook;
+ uint64_t list_link_cnt = 0;
+
+public:
+
+ template <typename ListType>
+ void append_to(ListType& list) {
+ if (list_link_cnt++ == 0) {
+ list.push_back(*this);
+ }
+ }
+
+ template <typename ListType>
+ void remove_from(ListType&& list) {
+ assert(list_link_cnt > 0);
+ if (--list_link_cnt == 0) {
+ list.erase(std::decay_t<ListType>::s_iterator_to(*this));
+ }
+ }
+
+ using obc_accessing_option_t = boost::intrusive::member_hook<
+ ObjectContext,
+ boost::intrusive::list_member_hook<>,
+ &ObjectContext::list_hook>;
+
+ template<RWState::State Type, typename InterruptCond = void, typename Func>
+ auto with_lock(Func&& func) {
+ if constexpr (!std::is_void_v<InterruptCond>) {
+ auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func));
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.for_write(), std::move(wrapper));
+ case RWState::RWREAD:
+ return _with_lock(lock.for_read(), std::move(wrapper));
+ case RWState::RWEXCL:
+ return _with_lock(lock.for_excl(), std::move(wrapper));
+ case RWState::RWNONE:
+ return seastar::futurize_invoke(std::move(wrapper));
+ default:
+ assert(0 == "noop");
+ }
+ } else {
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.for_write(), std::forward<Func>(func));
+ case RWState::RWREAD:
+ return _with_lock(lock.for_read(), std::forward<Func>(func));
+ case RWState::RWEXCL:
+ return _with_lock(lock.for_excl(), std::forward<Func>(func));
+ case RWState::RWNONE:
+ return seastar::futurize_invoke(std::forward<Func>(func));
+ default:
+ assert(0 == "noop");
+ }
+ }
+ }
+ template<RWState::State Type, typename InterruptCond = void, typename Func>
+ auto with_promoted_lock(Func&& func) {
+ if constexpr (!std::is_void_v<InterruptCond>) {
+ auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func));
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.excl_from_write(), std::move(wrapper));
+ case RWState::RWREAD:
+ return _with_lock(lock.excl_from_read(), std::move(wrapper));
+ case RWState::RWEXCL:
+ return _with_lock(lock.excl_from_excl(), std::move(wrapper));
+ case RWState::RWNONE:
+ return _with_lock(lock.for_excl(), std::move(wrapper));
+ default:
+ assert(0 == "noop");
+ }
+ } else {
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.excl_from_write(), std::forward<Func>(func));
+ case RWState::RWREAD:
+ return _with_lock(lock.excl_from_read(), std::forward<Func>(func));
+ case RWState::RWEXCL:
+ return _with_lock(lock.excl_from_excl(), std::forward<Func>(func));
+ case RWState::RWNONE:
+ return _with_lock(lock.for_excl(), std::forward<Func>(func));
+ default:
+ assert(0 == "noop");
+ }
+ }
+ }
+
+ bool empty() const {
+ return !lock.is_acquired();
+ }
+ bool is_request_pending() const {
+ return lock.is_acquired();
+ }
+
+ bool get_recovery_read() {
+ if (lock.try_lock_for_read()) {
+ recovery_read_marker = true;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ void wait_recovery_read() {
+ assert(lock.get_readers() > 0);
+ recovery_read_marker = true;
+ }
+ void drop_recovery_read() {
+ assert(recovery_read_marker);
+ recovery_read_marker = false;
+ }
+ bool maybe_get_excl() {
+ return lock.try_lock_for_excl();
+ }
+};
+using ObjectContextRef = ObjectContext::Ref;
+
+class ObjectContextRegistry : public md_config_obs_t {
+ ObjectContext::lru_t obc_lru;
+
+public:
+ ObjectContextRegistry(crimson::common::ConfigProxy &conf);
+ ~ObjectContextRegistry();
+
+ std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) {
+ return obc_lru.get_or_create(hoid);
+ }
+ ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) {
+ return obc_lru.get(hoid);
+ }
+
+ void clear_range(const hobject_t &from,
+ const hobject_t &to) {
+ obc_lru.clear_range(from, to);
+ }
+
+ template <class F>
+ void for_each(F&& f) {
+ obc_lru.for_each(std::forward<F>(f));
+ }
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const crimson::common::ConfigProxy& conf,
+ const std::set <std::string> &changed) final;
+};
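+
+// Illustrative sketch, not part of the original header (`registry` and
+// `hoid` stand for an ObjectContextRegistry and an hobject_t in the
+// calling scope): get_cached_obc() returns both the ObjectContext and
+// whether it was already cached, which callers use to decide whether the
+// object metadata still needs to be loaded from the store:
+//
+//   auto [obc, existed] = registry.get_cached_obc(hoid);
+//   if (!existed) {
+//     // cache miss: the obc is freshly created and must be populated
+//   }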
+
+std::optional<hobject_t> resolve_oid(const SnapSet &ss,
+ const hobject_t &oid);
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc
new file mode 100644
index 000000000..0a4d74c0d
--- /dev/null
+++ b/src/crimson/osd/object_context_loader.cc
@@ -0,0 +1,232 @@
+#include "crimson/osd/object_context_loader.h"
+#include "osd/osd_types_fmt.h"
+
+SET_SUBSYS(osd);
+
+namespace crimson::osd {
+
+using crimson::common::local_conf;
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_head_obc(ObjectContextRef obc,
+ bool existed,
+ with_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_head_obc);
+ DEBUGDPP("object {}", dpp, obc->get_oid());
+ assert(obc->is_head());
+ obc->append_to(obc_set_accessing);
+ return obc->with_lock<State, IOInterruptCondition>(
+ [existed=existed, obc=obc, func=std::move(func), this] {
+ return get_or_load_obc<State>(obc, existed)
+ .safe_then_interruptible(
+ [func = std::move(func)](auto obc) {
+ return std::move(func)(std::move(obc));
+ });
+ }).finally([FNAME, this, obc=std::move(obc)] {
+ DEBUGDPP("released object {}", dpp, obc->get_oid());
+ obc->remove_from(obc_set_accessing);
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc(hobject_t oid,
+ with_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc);
+ assert(!oid.is_head());
+ return with_obc<RWState::RWREAD>(
+ oid.get_head(),
+ [FNAME, oid, func=std::move(func), this](auto head) mutable
+ -> load_obc_iertr::future<> {
+ if (!head->obs.exists) {
+ ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid);
+ return load_obc_iertr::future<>{
+ crimson::ct_error::enoent::make()
+ };
+ }
+ return this->with_clone_obc_only<State>(std::move(head),
+ oid,
+ std::move(func));
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc_only(ObjectContextRef head,
+ hobject_t oid,
+ with_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc_only);
+ auto coid = resolve_oid(head->get_head_ss(), oid);
+ if (!coid) {
+ ERRORDPP("clone {} not found", dpp, oid);
+ return load_obc_iertr::future<>{
+ crimson::ct_error::enoent::make()
+ };
+ }
+ auto [clone, existed] = obc_registry.get_cached_obc(*coid);
+ return clone->template with_lock<State, IOInterruptCondition>(
+ [existed=existed, clone=std::move(clone),
+ func=std::move(func), head=std::move(head), this]()
+ -> load_obc_iertr::future<> {
+ auto loaded = get_or_load_obc<State>(clone, existed);
+ return loaded.safe_then_interruptible(
+ [func = std::move(func)](auto clone) {
+ return std::move(func)(std::move(clone));
+ });
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc_direct(
+ hobject_t oid,
+ with_both_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc_direct);
+ assert(!oid.is_head());
+ return with_obc<RWState::RWREAD>(
+ oid.get_head(),
+ [FNAME, oid, func=std::move(func), this](auto head) mutable
+ -> load_obc_iertr::future<> {
+ if (!head->obs.exists) {
+ ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid);
+ return load_obc_iertr::future<>{
+ crimson::ct_error::enoent::make()
+ };
+ }
+#ifndef NDEBUG
+ auto &ss = head->get_head_ss();
+ auto cit = std::find(
+ std::begin(ss.clones), std::end(ss.clones), oid.snap);
+ assert(cit != std::end(ss.clones));
+#endif
+ auto [clone, existed] = obc_registry.get_cached_obc(oid);
+ return clone->template with_lock<State, IOInterruptCondition>(
+ [existed=existed, clone=std::move(clone),
+ func=std::move(func), head=std::move(head), this]()
+ -> load_obc_iertr::future<> {
+ auto loaded = get_or_load_obc<State>(clone, existed);
+ return loaded.safe_then_interruptible(
+ [func = std::move(func), head=std::move(head)](auto clone) {
+ return std::move(func)(std::move(head), std::move(clone));
+ });
+ });
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc(hobject_t oid,
+ with_obc_func_t&& func)
+ {
+ if (oid.is_head()) {
+ auto [obc, existed] =
+ obc_registry.get_cached_obc(std::move(oid));
+ return with_head_obc<State>(std::move(obc),
+ existed,
+ std::move(func));
+ } else {
+ return with_clone_obc<State>(oid, std::move(func));
+ }
+ }
+
+ ObjectContextLoader::load_obc_iertr::future<ObjectContextRef>
+ ObjectContextLoader::load_obc(ObjectContextRef obc)
+ {
+ LOG_PREFIX(ObjectContextLoader::load_obc);
+ return backend.load_metadata(obc->get_oid())
+ .safe_then_interruptible(
+ [FNAME, this, obc=std::move(obc)](auto md)
+ -> load_obc_ertr::future<ObjectContextRef> {
+ const hobject_t& oid = md->os.oi.soid;
+ DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid);
+ if (oid.is_head()) {
+ if (!md->ssc) {
+ ERRORDPP("oid {} missing snapsetcontext", dpp, oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc->set_head_state(std::move(md->os),
+ std::move(md->ssc));
+ } else {
+ obc->set_clone_state(std::move(md->os));
+ }
+ DEBUGDPP("returning obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid);
+ return load_obc_ertr::make_ready_future<ObjectContextRef>(obc);
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<ObjectContextRef>
+ ObjectContextLoader::get_or_load_obc(ObjectContextRef obc,
+ bool existed)
+ {
+ LOG_PREFIX(ObjectContextLoader::get_or_load_obc);
+ auto loaded =
+ load_obc_iertr::make_ready_future<ObjectContextRef>(obc);
+ if (existed) {
+ DEBUGDPP("cache hit on {}", dpp, obc->get_oid());
+ } else {
+ DEBUGDPP("cache miss on {}", dpp, obc->get_oid());
+ loaded =
+ obc->template with_promoted_lock<State, IOInterruptCondition>(
+ [obc, this] {
+ return load_obc(obc);
+ });
+ }
+ return loaded;
+ }
+
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::reload_obc(ObjectContext& obc) const
+ {
+ LOG_PREFIX(ObjectContextLoader::reload_obc);
+ assert(obc.is_head());
+ return backend.load_metadata(obc.get_oid())
+ .safe_then_interruptible<false>(
+ [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> {
+ DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid());
+ if (!md->ssc) {
+ ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid());
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc.set_head_state(std::move(md->os), std::move(md->ssc));
+ return load_obc_ertr::now();
+ });
+ }
+
+ void ObjectContextLoader::notify_on_change(bool is_primary)
+ {
+ LOG_PREFIX(ObjectContextLoader::notify_on_change);
+ DEBUGDPP("is_primary: {}", dpp, is_primary);
+ for (auto& obc : obc_set_accessing) {
+ DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid());
+ obc.interrupt(::crimson::common::actingset_changed(is_primary));
+ }
+ }
+
+  // explicitly instantiate the template specializations that are used
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc_direct<RWState::RWWRITE>(
+ hobject_t,
+ with_both_obc_func_t&&);
+}
diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h
new file mode 100644
index 000000000..3ab7f6ad8
--- /dev/null
+++ b/src/crimson/osd/object_context_loader.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <seastar/core/future.hh>
+#include "crimson/common/errorator.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/pg_backend.h"
+
+namespace crimson::osd {
+class ObjectContextLoader {
+public:
+ using obc_accessing_list_t = boost::intrusive::list<
+ ObjectContext,
+ ObjectContext::obc_accessing_option_t>;
+
+ ObjectContextLoader(
+ ObjectContextRegistry& _obc_services,
+ PGBackend& _backend,
+ DoutPrefixProvider& dpp)
+ : obc_registry{_obc_services},
+ backend{_backend},
+ dpp{dpp}
+ {}
+
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::object_corrupted>;
+ using load_obc_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_obc_ertr>;
+
+ using with_obc_func_t =
+ std::function<load_obc_iertr::future<> (ObjectContextRef)>;
+
+ using with_both_obc_func_t =
+ std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>;
+
+ // Use this variant by default
+ template<RWState::State State>
+ load_obc_iertr::future<> with_obc(hobject_t oid,
+ with_obc_func_t&& func);
+
+ // Use this variant in the case where the head object
+ // obc is already locked and only the clone obc is needed.
+ // Avoid nesting with_head_obc() calls by using with_clone_obc()
+ // with an already locked head.
+ template<RWState::State State>
+ load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head,
+ hobject_t oid,
+ with_obc_func_t&& func);
+
+ // Use this variant in the case where both the head
+ // object *and* the matching clone object are being used
+ // in func.
+ template<RWState::State State>
+ load_obc_iertr::future<> with_clone_obc_direct(
+ hobject_t oid,
+ with_both_obc_func_t&& func);
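+
+  // Illustrative usage sketch, not part of the original header (`loader`
+  // and `oid` stand for an ObjectContextLoader and an hobject_t in the
+  // calling scope):
+  //
+  //   return loader.with_obc<RWState::RWREAD>(
+  //     oid,
+  //     [](ObjectContextRef obc) -> ObjectContextLoader::load_obc_iertr::future<> {
+  //       // the obc lock for the requested RWState is held here
+  //       return seastar::now();
+  //     });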
+
+ load_obc_iertr::future<> reload_obc(ObjectContext& obc) const;
+
+ void notify_on_change(bool is_primary);
+
+private:
+ ObjectContextRegistry& obc_registry;
+ PGBackend& backend;
+ DoutPrefixProvider& dpp;
+ obc_accessing_list_t obc_set_accessing;
+
+ template<RWState::State State>
+ load_obc_iertr::future<> with_clone_obc(hobject_t oid,
+ with_obc_func_t&& func);
+
+ template<RWState::State State>
+ load_obc_iertr::future<> with_head_obc(ObjectContextRef obc,
+ bool existed,
+ with_obc_func_t&& func);
+
+ template<RWState::State State>
+ load_obc_iertr::future<ObjectContextRef>
+ get_or_load_obc(ObjectContextRef obc,
+ bool existed);
+
+ load_obc_iertr::future<ObjectContextRef>
+ load_obc(ObjectContextRef obc);
+};
+}
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
new file mode 100644
index 000000000..040870203
--- /dev/null
+++ b/src/crimson/osd/ops_executer.cc
@@ -0,0 +1,1461 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ops_executer.h"
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm_ext/push_back.hpp>
+#include <boost/range/algorithm/max_element.hpp>
+#include <boost/range/numeric.hpp>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/thread.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/watch.h"
+#include "osd/ClassHandler.h"
+#include "osd/SnapMapper.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+OpsExecuter::call_ierrorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op)
+{
+ std::string cname, mname;
+ ceph::bufferlist indata;
+ try {
+ auto bp = std::begin(osd_op.indata);
+ bp.copy(osd_op.op.cls.class_len, cname);
+ bp.copy(osd_op.op.cls.method_len, mname);
+ bp.copy(osd_op.op.cls.indata_len, indata);
+ } catch (buffer::error&) {
+ logger().warn("call unable to decode class + method + indata");
+ return crimson::ct_error::invarg::make();
+ }
+
+  // NOTE: opening a class can actually result in dlopen(), and thus
+  // block the entire reactor. Thanks to ClassHandler's cache this is
+  // supposed to be extremely infrequent.
+ ClassHandler::ClassData* cls;
+ int r = ClassHandler::get_instance().open_class(cname, &cls);
+ if (r) {
+ logger().warn("class {} open got {}", cname, cpp_strerror(r));
+ if (r == -ENOENT) {
+ return crimson::ct_error::operation_not_supported::make();
+ } else if (r == -EPERM) {
+ // propagate permission errors
+ return crimson::ct_error::permission_denied::make();
+ }
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ ClassHandler::ClassMethod* method = cls->get_method(mname);
+ if (!method) {
+ logger().warn("call method {}.{} does not exist", cname, mname);
+ return crimson::ct_error::operation_not_supported::make();
+ }
+
+ const auto flags = method->get_flags();
+ if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) {
+ return crimson::ct_error::enoent::make();
+ }
+
+#if 0
+ if (flags & CLS_METHOD_WR) {
+ ctx->user_modify = true;
+ }
+#endif
+
+ logger().debug("calling method {}.{}, num_read={}, num_write={}",
+ cname, mname, num_read, num_write);
+ const auto prev_rd = num_read;
+ const auto prev_wr = num_write;
+ return interruptor::async(
+ [this, method, indata=std::move(indata)]() mutable {
+ ceph::bufferlist outdata;
+ auto cls_context = reinterpret_cast<cls_method_context_t>(this);
+ const auto ret = method->exec(cls_context, indata, outdata);
+ return std::make_pair(ret, std::move(outdata));
+ }
+ ).then_interruptible(
+ [this, prev_rd, prev_wr, &osd_op, flags]
+ (auto outcome) -> call_errorator::future<> {
+ auto& [ret, outdata] = outcome;
+ osd_op.rval = ret;
+
+ logger().debug("do_op_call: method returned ret={}, outdata.length()={}"
+ " while num_read={}, num_write={}",
+ ret, outdata.length(), num_read, num_write);
+ if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
+ logger().error("method tried to read object but is not marked RD");
+ osd_op.rval = -EIO;
+ return crimson::ct_error::input_output_error::make();
+ }
+ if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
+ logger().error("method tried to update object but is not marked WR");
+ osd_op.rval = -EIO;
+ return crimson::ct_error::input_output_error::make();
+ }
+ // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`,
+ // grep for `ignore_out_data`.
+ using crimson::common::local_conf;
+ if (op_info.allows_returnvec() &&
+ op_info.may_write() &&
+ ret >= 0 &&
+ outdata.length() > local_conf()->osd_max_write_op_reply_len) {
+        // the justification for this limit is to avoid inflating the pg log.
+        // that's also why we don't worry about pure reads.
+ logger().error("outdata overflow due to .length()={}, limit={}",
+ outdata.length(),
+ local_conf()->osd_max_write_op_reply_len);
+ osd_op.rval = -EOVERFLOW;
+ return crimson::ct_error::value_too_large::make();
+ }
+      // for write calls we never return data except on errors or RETURNVEC.
+      // please refer to cls/cls_hello.cc for details.
+ if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) {
+ osd_op.op.extent.length = outdata.length();
+ osd_op.outdata.claim_append(outdata);
+ }
+ if (ret < 0) {
+ return crimson::stateful_ec{
+ std::error_code(-ret, std::generic_category()) };
+ } else {
+ return seastar::now();
+ }
+ }
+ );
+}
+
+static watch_info_t create_watch_info(const OSDOp& osd_op,
+ const OpsExecuter::ExecutableMessage& msg,
+ entity_addr_t peer_addr)
+{
+ using crimson::common::local_conf;
+ const uint32_t timeout =
+ osd_op.op.watch.timeout == 0 ? local_conf()->osd_client_watch_timeout
+ : osd_op.op.watch.timeout;
+ return {
+ osd_op.op.watch.cookie,
+ timeout,
+ peer_addr
+ };
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}", __func__);
+ struct connect_ctx_t {
+ ObjectContext::watch_key_t key;
+ crimson::net::ConnectionRef conn;
+ watch_info_t info;
+
+ connect_ctx_t(
+ const OSDOp& osd_op,
+ const ExecutableMessage& msg,
+ crimson::net::ConnectionRef conn)
+ : key(osd_op.op.watch.cookie, msg.get_reqid().name),
+ conn(conn),
+ info(create_watch_info(osd_op, msg, conn->get_peer_addr())) {
+ }
+ };
+
+ return with_effect_on_obc(
+ connect_ctx_t{ osd_op, get_message(), conn },
+ [&](auto& ctx) {
+ const auto& entity = ctx.key.second;
+ auto [it, emplaced] =
+ os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info));
+ if (emplaced) {
+ logger().info("registered new watch {} by {}", it->second, entity);
+ txn.nop();
+ } else {
+ logger().info("found existing watch {} by {}", it->second, entity);
+ }
+ return seastar::now();
+ },
+ [](auto&& ctx, ObjectContextRef obc, Ref<PG> pg) {
+ assert(pg);
+ auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr);
+ if (emplaced) {
+ const auto& [cookie, entity] = ctx.key;
+ it->second = crimson::osd::Watch::create(
+ obc, ctx.info, entity, std::move(pg));
+ logger().info("op_effect: added new watcher: {}", ctx.key);
+ } else {
+ logger().info("op_effect: found existing watcher: {}", ctx.key);
+ }
+ return it->second->connect(std::move(ctx.conn), true /* will_ping */);
+ }
+ );
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_reconnect(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ const entity_name_t& entity = get_message().get_reqid().name;
+ const auto& cookie = osd_op.op.watch.cookie;
+ if (!os.oi.watchers.count(std::make_pair(cookie, entity))) {
+ return crimson::ct_error::not_connected::make();
+ } else {
+ logger().info("found existing watch by {}", entity);
+ return do_op_watch_subop_watch(osd_op, os, txn);
+ }
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_unwatch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().info("{}", __func__);
+
+ struct disconnect_ctx_t {
+ ObjectContext::watch_key_t key;
+ disconnect_ctx_t(const OSDOp& osd_op, const ExecutableMessage& msg)
+ : key(osd_op.op.watch.cookie, msg.get_reqid().name) {
+ }
+ };
+ return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() },
+ [&] (auto& ctx) {
+ const auto& entity = ctx.key.second;
+ if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) {
+ logger().info("removed watch {} by {}", nh.mapped(), entity);
+ txn.nop();
+ } else {
+ logger().info("can't remove: no watch by {}", entity);
+ }
+ return seastar::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) {
+ if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) {
+ return seastar::do_with(std::move(nh.mapped()),
+ [ctx](auto&& watcher) {
+ logger().info("op_effect: disconnect watcher {}", ctx.key);
+ return watcher->remove();
+ });
+ } else {
+ logger().info("op_effect: disconnect failed to find watcher {}", ctx.key);
+ return seastar::now();
+ }
+ });
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_ping(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ const entity_name_t& entity = get_message().get_reqid().name;
+ const auto& cookie = osd_op.op.watch.cookie;
+ const auto key = std::make_pair(cookie, entity);
+
+  // Note: WATCH with PING doesn't cause may_write() to return true,
+  // so if there is nothing else in the transaction, this is going
+  // to run do_osd_op_effects, but not write out a log entry.
+ if (!os.oi.watchers.count(key)) {
+ return crimson::ct_error::not_connected::make();
+ }
+ auto it = obc->watchers.find(key);
+ if (it == std::end(obc->watchers) || !it->second->is_connected()) {
+ return crimson::ct_error::timed_out::make();
+ }
+ logger().info("found existing watch by {}", entity);
+ it->second->got_ping(ceph_clock_now());
+ return seastar::now();
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}", __func__);
+ if (!os.exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ switch (osd_op.op.watch.op) {
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return do_op_watch_subop_watch(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return do_op_watch_subop_reconnect(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_PING:
+ return do_op_watch_subop_ping(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return do_op_watch_subop_unwatch(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_LEGACY_WATCH:
+ logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH");
+ return crimson::ct_error::invarg::make();
+ }
+ logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op);
+ return crimson::ct_error::invarg::make();
+}
+
+static uint64_t get_next_notify_id(epoch_t e)
+{
+ // FIXME
+ static std::uint64_t next_notify_id = 0;
+ return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++));
+}
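+
+// Illustrative example, not part of the original file: with epoch e=0x12
+// and the counter at 7, the id is (0x12ULL << 32) | 7 == 0x0000001200000007
+// -- the epoch in the high 32 bits, the running counter in the low bits.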
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch());
+
+ if (!os.exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ struct notify_ctx_t {
+ crimson::net::ConnectionRef conn;
+ notify_info_t ninfo;
+ const uint64_t client_gid;
+ const epoch_t epoch;
+
+ notify_ctx_t(const ExecutableMessage& msg,
+ crimson::net::ConnectionRef conn)
+ : conn(conn),
+ client_gid(msg.get_reqid().name.num()),
+ epoch(msg.get_map_epoch()) {
+ }
+ };
+ return with_effect_on_obc(
+ notify_ctx_t{ get_message(), conn },
+ [&](auto& ctx) {
+ try {
+ auto bp = osd_op.indata.cbegin();
+ uint32_t ver; // obsolete
+ ceph::decode(ver, bp);
+ ceph::decode(ctx.ninfo.timeout, bp);
+ ceph::decode(ctx.ninfo.bl, bp);
+ } catch (const buffer::error&) {
+ ctx.ninfo.timeout = 0;
+ }
+ if (!ctx.ninfo.timeout) {
+ using crimson::common::local_conf;
+ ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout;
+ }
+ ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch);
+ ctx.ninfo.cookie = osd_op.op.notify.cookie;
+ // return our unique notify id to the client
+ ceph::encode(ctx.ninfo.notify_id, osd_op.outdata);
+ return seastar::now();
+ },
+ [](auto&& ctx, ObjectContextRef obc, Ref<PG>) {
+ auto alive_watchers = obc->watchers | boost::adaptors::map_values
+ | boost::adaptors::filtered(
+ [] (const auto& w) {
+ // FIXME: filter as for the `is_ping` in `Watch::start_notify`
+ return w->is_alive();
+ });
+ return crimson::osd::Notify::create_n_propagate(
+ std::begin(alive_watchers),
+ std::end(alive_watchers),
+ std::move(ctx.conn),
+ ctx.ninfo,
+ ctx.client_gid,
+ obc->obs.oi.user_version);
+ }
+ );
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_list_watchers(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}", __func__);
+
+ obj_list_watch_response_t response;
+ for (const auto& [key, info] : os.oi.watchers) {
+ logger().debug("{}: key cookie={}, entity={}",
+ __func__, key.first, key.second);
+ assert(key.first == info.cookie);
+ assert(key.second.is_client());
+ response.entries.emplace_back(watch_item_t{
+ key.second, info.cookie, info.timeout_seconds, info.addr});
+ }
+ response.encode(osd_op.outdata, get_message().get_features());
+ return watch_ierrorator::now();
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify_ack(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}", __func__);
+
+ struct notifyack_ctx_t {
+ const entity_name_t entity;
+ uint64_t watch_cookie;
+ uint64_t notify_id;
+ ceph::bufferlist reply_bl;
+
+ notifyack_ctx_t(const ExecutableMessage& msg)
+ : entity(msg.get_reqid().name) {
+ }
+ };
+ return with_effect_on_obc(notifyack_ctx_t{ get_message() },
+ [&] (auto& ctx) -> watch_errorator::future<> {
+ try {
+ auto bp = osd_op.indata.cbegin();
+ ceph::decode(ctx.notify_id, bp);
+ ceph::decode(ctx.watch_cookie, bp);
+ if (!bp.end()) {
+ ceph::decode(ctx.reply_bl, bp);
+ }
+ } catch (const buffer::error&) {
+ // here we behave differently than ceph-osd. For historical reasons,
+ // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`.
+ // crimson just returns EINVAL if the data cannot be decoded.
+ return crimson::ct_error::invarg::make();
+ }
+ return watch_errorator::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) {
+ logger().info("notify_ack watch_cookie={}, notify_id={}",
+ ctx.watch_cookie, ctx.notify_id);
+ return seastar::do_for_each(obc->watchers,
+ [ctx=std::move(ctx)] (auto& kv) {
+ const auto& [key, watchp] = kv;
+ static_assert(
+ std::is_same_v<std::decay_t<decltype(watchp)>,
+ seastar::shared_ptr<crimson::osd::Watch>>);
+ auto& [cookie, entity] = key;
+ if (ctx.entity != entity) {
+ logger().debug("skipping watch {}; entity name {} != {}",
+ key, entity, ctx.entity);
+ return seastar::now();
+ }
+ if (ctx.watch_cookie != cookie) {
+ logger().debug("skipping watch {}; cookie {} != {}",
+ key, ctx.watch_cookie, cookie);
+ return seastar::now();
+ }
+ logger().info("acking notify on watch {}", key);
+ return watchp->notify_ack(ctx.notify_id, ctx.reply_bl);
+ });
+ });
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+template <class Func>
+auto OpsExecuter::do_const_op(Func&& f) {
+ // TODO: pass backend as read-only
+ return std::forward<Func>(f)(pg->get_backend(), std::as_const(obc->obs));
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+template <class Func>
+auto OpsExecuter::do_write_op(Func&& f, OpsExecuter::modified_by m) {
+ ++num_write;
+ if (!osd_op_params) {
+ osd_op_params.emplace();
+ fill_op_params_bump_pg_version();
+ }
+ user_modify = (m == modified_by::user);
+ return std::forward<Func>(f)(pg->get_backend(), obc->obs, txn);
+}
+OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ if (!osd_op.op.assert_ver.ver) {
+ return crimson::ct_error::invarg::make();
+ } else if (osd_op.op.assert_ver.ver < os.oi.user_version) {
+ return crimson::ct_error::erange::make();
+ } else if (osd_op.op.assert_ver.ver > os.oi.user_version) {
+ return crimson::ct_error::value_too_large::make();
+ }
+ return seastar::now();
+}
+
+OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps(
+ OSDOp& osd_op,
+ const ObjectState& os,
+ const SnapSet& ss)
+{
+ obj_list_snap_response_t resp;
+ resp.clones.reserve(ss.clones.size() + 1);
+ for (auto &clone: ss.clones) {
+ clone_info ci;
+ ci.cloneid = clone;
+
+ {
+ auto p = ss.clone_snaps.find(clone);
+ if (p == ss.clone_snaps.end()) {
+ logger().error(
+ "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "clone_snaps, missing clone {}",
+ os.oi.soid,
+ clone);
+ return crimson::ct_error::invarg::make();
+ }
+ ci.snaps.reserve(p->second.size());
+ ci.snaps.insert(ci.snaps.end(), p->second.rbegin(), p->second.rend());
+ }
+
+ {
+ auto p = ss.clone_overlap.find(clone);
+ if (p == ss.clone_overlap.end()) {
+ logger().error(
+ "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "clone_overlap, missing clone {}",
+ os.oi.soid,
+ clone);
+ return crimson::ct_error::invarg::make();
+ }
+ ci.overlap.reserve(p->second.num_intervals());
+ ci.overlap.insert(ci.overlap.end(), p->second.begin(), p->second.end());
+ }
+
+ {
+ auto p = ss.clone_size.find(clone);
+ if (p == ss.clone_size.end()) {
+ logger().error(
+ "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "clone_size, missing clone {}",
+ os.oi.soid,
+ clone);
+ return crimson::ct_error::invarg::make();
+ }
+ ci.size = p->second;
+ }
+ resp.clones.push_back(std::move(ci));
+ }
+
+ if (!os.oi.is_whiteout()) {
+ clone_info ci;
+ ci.cloneid = CEPH_NOSNAP;
+ ci.size = os.oi.size;
+ resp.clones.push_back(std::move(ci));
+ }
+ resp.seq = ss.seq;
+ logger().error(
+ "OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}",
+ os.oi.soid,
+ resp.clones.size());
+ resp.encode(osd_op.outdata);
+ return read_ierrorator::now();
+}
+
+OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator>
+OpsExecuter::execute_op(OSDOp& osd_op)
+{
+ return do_execute_op(osd_op).handle_error_interruptible(
+ osd_op_errorator::all_same_way([&osd_op](auto e, auto&& e_raw)
+ -> OpsExecuter::osd_op_errorator::future<> {
+ // All ops except for CMPEXT should have rval set to -e.value(),
+ // CMPEXT sets rval itself and shouldn't be overridden.
+ if (e.value() != ct_error::cmp_fail_error_value) {
+ osd_op.rval = -e.value();
+ }
+ if ((osd_op.op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
+ e.value() != EAGAIN && e.value() != EINPROGRESS) {
+ return osd_op_errorator::now();
+ } else {
+ return std::move(e_raw);
+ }
+ }));
+}
+
+OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator>
+OpsExecuter::do_execute_op(OSDOp& osd_op)
+{
+ // TODO: dispatch via call table?
+ // TODO: we might want to find a way to unify both input and output
+ // of each op.
+ logger().debug(
+ "handling op {} on object {}",
+ ceph_osd_op_name(osd_op.op.op),
+ get_target());
+ switch (const ceph_osd_op& op = osd_op.op; op.op) {
+ case CEPH_OSD_OP_SYNC_READ:
+ [[fallthrough]];
+ case CEPH_OSD_OP_READ:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.read(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_SPARSE_READ:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.sparse_read(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_CHECKSUM:
+ return do_read_op([&osd_op](auto& backend, const auto& os) {
+ return backend.checksum(os, osd_op);
+ });
+ case CEPH_OSD_OP_CMPEXT:
+ return do_read_op([&osd_op](auto& backend, const auto& os) {
+ return backend.cmp_ext(os, osd_op);
+ });
+ case CEPH_OSD_OP_GETXATTR:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.getxattr(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_GETXATTRS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.get_xattrs(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_CMPXATTR:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.cmp_xattr(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_RMXATTR:
+ return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.rm_xattr(os, osd_op, txn);
+ });
+ case CEPH_OSD_OP_CREATE:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.create(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_WRITE:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.write(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_WRITESAME:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.write_same(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_WRITEFULL:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.writefull(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_ROLLBACK:
+ return do_write_op([this, &head=obc,
+ &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.rollback(os, osd_op, txn, *osd_op_params, delta_stats,
+ head, pg->obc_loader);
+ });
+ case CEPH_OSD_OP_APPEND:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.append(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_TRUNCATE:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ // FIXME: rework needed. Move this out to do_write_op(), introduce
+ // do_write_op_no_user_modify()...
+ return backend.truncate(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_ZERO:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.zero(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_SETALLOCHINT:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.set_allochint(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_SETXATTR:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.setxattr(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_DELETE:
+ {
+ bool whiteout = false;
+ if (!obc->ssc->snapset.clones.empty() ||
+ (snapc.snaps.size() && // there are snaps
+ snapc.snaps[0] > obc->ssc->snapset.seq)) { // existing obj is old
+ logger().debug("{} has or will have clones, will whiteout {}",
+ __func__, obc->obs.oi.soid);
+ whiteout = true;
+ }
+ return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) {
+ return backend.remove(os, txn, delta_stats, whiteout);
+ });
+ }
+ case CEPH_OSD_OP_CALL:
+ return this->do_op_call(osd_op);
+ case CEPH_OSD_OP_STAT:
+ // note: stat does not require RD
+ return do_const_op([this, &osd_op] (/* const */auto& backend, const auto& os) {
+ return backend.stat(os, osd_op, delta_stats);
+ });
+
+ case CEPH_OSD_OP_TMAPPUT:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.tmapput(os, osd_op, txn, delta_stats, *osd_op_params);
+ });
+ case CEPH_OSD_OP_TMAPUP:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto &txn) {
+ return backend.tmapup(os, osd_op, txn, delta_stats, *osd_op_params);
+ });
+ case CEPH_OSD_OP_TMAPGET:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.tmapget(os, osd_op, delta_stats);
+ });
+
+ // OMAP
+ case CEPH_OSD_OP_OMAPGETKEYS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_keys(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPGETVALS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_vals(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAP_CMP:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_cmp(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPGETHEADER:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_header(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_vals_by_keys(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPSETVALS:
+#if 0
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_set_vals(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPSETHEADER:
+#if 0
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_set_header(os, osd_op, txn, *osd_op_params,
+ delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPRMKEYRANGE:
+#if 0
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_remove_range(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPRMKEYS:
+ /** TODO: Implement supports_omap()
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }*/
+ return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_remove_key(os, osd_op, txn);
+ });
+ case CEPH_OSD_OP_OMAPCLEAR:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_clear(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+
+ // watch/notify
+ case CEPH_OSD_OP_WATCH:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return do_op_watch(osd_op, os, txn);
+ }, modified_by::sys);
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_op_list_watchers(osd_op, os);
+ });
+ case CEPH_OSD_OP_NOTIFY:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_op_notify(osd_op, os);
+ });
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_op_notify_ack(osd_op, os);
+ });
+ case CEPH_OSD_OP_ASSERT_VER:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_assert_ver(osd_op, os);
+ });
+ case CEPH_OSD_OP_LIST_SNAPS:
+ return do_snapset_op([this, &osd_op](const auto &os, const auto &ss) {
+ return do_list_snaps(osd_op, os, ss);
+ });
+
+ default:
+ logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+ throw std::runtime_error(
+ fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+ }
+}
+
+void OpsExecuter::fill_op_params_bump_pg_version()
+{
+ osd_op_params->req_id = msg->get_reqid();
+ osd_op_params->mtime = msg->get_mtime();
+ osd_op_params->at_version = pg->next_version();
+ osd_op_params->pg_trim_to = pg->get_pg_trim_to();
+ osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk();
+ osd_op_params->last_complete = pg->get_info().last_complete;
+}
+
+std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction(
+ const std::vector<OSDOp>& ops)
+{
+ // let's ensure we don't need to inform SnapMapper about this particular
+ // entry.
+ assert(obc->obs.oi.soid.snap >= CEPH_MAXSNAP);
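+  // (SnapMapper tracks only clone objects, whose snap is a concrete snapid
+  // below CEPH_MAXSNAP; a head object like this one is not mapped.)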
+ std::vector<pg_log_entry_t> log_entries;
+ log_entries.emplace_back(
+ obc->obs.exists ?
+ pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE,
+ obc->obs.oi.soid,
+ osd_op_params->at_version,
+ obc->obs.oi.version,
+ osd_op_params->user_modify ? osd_op_params->at_version.version : 0,
+ osd_op_params->req_id,
+ osd_op_params->mtime,
+ op_info.allows_returnvec() && !ops.empty() ? ops.back().rval.code : 0);
+ if (op_info.allows_returnvec()) {
+ // also the per-op values are recorded in the pg log
+ log_entries.back().set_op_returns(ops);
+ logger().debug("{} op_returns: {}",
+ __func__, log_entries.back().op_returns);
+ }
+ log_entries.back().clean_regions = std::move(osd_op_params->clean_regions);
+ return log_entries;
+}
+
+OpsExecuter::interruptible_future<> OpsExecuter::snap_map_remove(
+ const hobject_t& soid,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}: soid {}", __func__, soid);
+ return interruptor::async([soid, &snap_mapper,
+ _t=osdriver.get_transaction(&txn)]() mutable {
+ const auto r = snap_mapper.remove_oid(soid, &_t);
+ if (r) {
+ logger().error("{}: remove_oid {} failed with {}",
+ __func__, soid, r);
+ }
+    // on removal, tolerate a missing key (possible corruption)
+ assert(r == 0 || r == -ENOENT);
+ });
+}
+
+OpsExecuter::interruptible_future<> OpsExecuter::snap_map_modify(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps);
+ return interruptor::async([soid, snaps, &snap_mapper,
+ _t=osdriver.get_transaction(&txn)]() mutable {
+ assert(std::size(snaps) > 0);
+ [[maybe_unused]] const auto r = snap_mapper.update_snaps(
+ soid, snaps, 0, &_t);
+ assert(r == 0);
+ });
+}
+
+OpsExecuter::interruptible_future<> OpsExecuter::snap_map_clone(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps);
+ return interruptor::async([soid, snaps, &snap_mapper,
+ _t=osdriver.get_transaction(&txn)]() mutable {
+ assert(std::size(snaps) > 0);
+ snap_mapper.add_oid(soid, snaps, &_t);
+ });
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+uint32_t OpsExecuter::get_pool_stripe_width() const {
+ return pg->get_pgpool().info.get_stripe_width();
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+version_t OpsExecuter::get_last_user_version() const
+{
+ return pg->get_last_user_version();
+}
+
+std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone(
+ const SnapContext& snapc,
+ const ObjectState& initial_obs,
+ const SnapSet& initial_snapset,
+ PGBackend& backend,
+ ceph::os::Transaction& txn)
+{
+ const hobject_t& soid = initial_obs.oi.soid;
+ logger().debug("{} {} snapset={} snapc={}",
+ __func__, soid,
+ initial_snapset, snapc);
+
+ auto cloning_ctx = std::make_unique<CloningContext>();
+ cloning_ctx->new_snapset = initial_snapset;
+
+  // clone the object; the clone's snap field is set to the seq of the
+  // SnapContext at the time of its creation.
+ hobject_t coid = soid;
+ coid.snap = snapc.seq;
+
+  // existing snaps are stored in descending order in snapc;
+  // the cloned_snaps vector will hold all the snaps newer than snapset.seq
+ const std::vector<snapid_t> cloned_snaps = [&] {
+ auto last = std::find_if(
+ std::begin(snapc.snaps), std::end(snapc.snaps),
+ [&](snapid_t snap_id) { return snap_id <= initial_snapset.seq; });
+ return std::vector<snapid_t>{std::begin(snapc.snaps), last};
+ }();
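+  // e.g. (illustrative values) with snapc.snaps = [8, 7, 5, 2] and
+  // initial_snapset.seq = 5, find_if() stops at 5 -- the first snap <= seq --
+  // so cloned_snaps ends up as [8, 7].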
+
+ auto [snap_oi, clone_obc] = prepare_clone(coid);
+ // make clone
+ backend.clone(snap_oi, initial_obs, clone_obc->obs, txn);
+
+ delta_stats.num_objects++;
+ if (snap_oi.is_omap()) {
+ delta_stats.num_objects_omap++;
+ }
+ delta_stats.num_object_clones++;
+  // new_snapset will eventually replace the obc's ssc snapset
+  // (see CloningContext::apply_to())
+ cloning_ctx->new_snapset.clones.push_back(coid.snap);
+ cloning_ctx->new_snapset.clone_size[coid.snap] = initial_obs.oi.size;
+ cloning_ctx->new_snapset.clone_snaps[coid.snap] = cloned_snaps;
+
+ // clone_overlap should contain an entry for each clone
+ // (an empty interval_set if there is no overlap)
+ auto &overlap = cloning_ctx->new_snapset.clone_overlap[coid.snap];
+ if (initial_obs.oi.size) {
+ overlap.insert(0, initial_obs.oi.size);
+ }
+
+ // log clone
+ logger().debug("cloning v {} to {} v {} snaps={} snapset={}",
+ initial_obs.oi.version, coid,
+ osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset);
+
+ cloning_ctx->log_entry = {
+ pg_log_entry_t::CLONE,
+ coid,
+ snap_oi.version,
+ initial_obs.oi.version,
+ initial_obs.oi.user_version,
+ osd_reqid_t(),
+ initial_obs.oi.mtime, // will be replaced in `apply_to()`
+ 0
+ };
+ encode(cloned_snaps, cloning_ctx->log_entry.snaps);
+
+ // TODO: update most recent clone_overlap and usage stats
+ return cloning_ctx;
+}
+
+void OpsExecuter::CloningContext::apply_to(
+ std::vector<pg_log_entry_t>& log_entries,
+ ObjectContext& processed_obc) &&
+{
+ log_entry.mtime = processed_obc.obs.oi.mtime;
+ log_entries.emplace_back(std::move(log_entry));
+ processed_obc.ssc->snapset = std::move(new_snapset);
+}
+
+OpsExecuter::interruptible_future<std::vector<pg_log_entry_t>>
+OpsExecuter::flush_clone_metadata(
+ std::vector<pg_log_entry_t>&& log_entries,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ assert(!txn.empty());
+ auto maybe_snap_mapped = interruptor::now();
+ if (cloning_ctx) {
+ std::move(*cloning_ctx).apply_to(log_entries, *obc);
+ const auto& coid = log_entries.back().soid;
+ const auto& cloned_snaps = obc->ssc->snapset.clone_snaps[coid.snap];
+ maybe_snap_mapped = snap_map_clone(
+ coid,
+ std::set<snapid_t>{std::begin(cloned_snaps), std::end(cloned_snaps)},
+ snap_mapper,
+ osdriver,
+ txn);
+ }
+ if (snapc.seq > obc->ssc->snapset.seq) {
+ // update snapset with latest snap context
+ obc->ssc->snapset.seq = snapc.seq;
+ obc->ssc->snapset.snaps.clear();
+ }
+ logger().debug("{} done, initial snapset={}, new snapset={}",
+ __func__, obc->obs.oi.soid, obc->ssc->snapset);
+ return std::move(
+ maybe_snap_mapped
+ ).then_interruptible([log_entries=std::move(log_entries)]() mutable {
+ return interruptor::make_ready_future<std::vector<pg_log_entry_t>>(
+ std::move(log_entries));
+ });
+}
+
+// TODO: make this static
+std::pair<object_info_t, ObjectContextRef> OpsExecuter::prepare_clone(
+ const hobject_t& coid)
+{
+ object_info_t static_snap_oi(coid);
+ static_snap_oi.version = pg->next_version();
+ static_snap_oi.prior_version = obc->obs.oi.version;
+ static_snap_oi.copy_user_bits(obc->obs.oi);
+ if (static_snap_oi.is_whiteout()) {
+ // clone shouldn't be marked as whiteout
+ static_snap_oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+
+ ObjectContextRef clone_obc;
+ if (pg->is_primary()) {
+ // lookup_or_create
+ auto [c_obc, existed] =
+ pg->obc_registry.get_cached_obc(std::move(coid));
+ assert(!existed);
+ c_obc->obs.oi = static_snap_oi;
+ c_obc->obs.exists = true;
+ c_obc->ssc = obc->ssc;
+ logger().debug("clone_obc: {}", c_obc->obs.oi);
+ clone_obc = std::move(c_obc);
+ }
+ return std::make_pair(std::move(static_snap_oi), std::move(clone_obc));
+}
+
+void OpsExecuter::apply_stats()
+{
+ pg->get_peering_state().apply_op_stats(get_target(), delta_stats);
+ pg->publish_stats_to_osd();
+}
+
+OpsExecuter::OpsExecuter(Ref<PG> pg,
+ ObjectContextRef _obc,
+ const OpInfo& op_info,
+ abstracted_msg_t&& msg,
+ crimson::net::ConnectionRef conn,
+ const SnapContext& _snapc)
+ : pg(std::move(pg)),
+ obc(std::move(_obc)),
+ op_info(op_info),
+ msg(std::move(msg)),
+ conn(conn),
+ snapc(_snapc)
+{
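+  // unlike the classic OSD (PrimaryLogPG::make_writeable), the clone -- if
+  // one is needed -- is primed into the transaction up front, before any of
+  // the client's ops run; see execute_clone() above.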
+ if (op_info.may_write() && should_clone(*obc, snapc)) {
+ do_write_op([this](auto& backend, auto& os, auto& txn) {
+ cloning_ctx = execute_clone(std::as_const(snapc),
+ std::as_const(obc->obs),
+ std::as_const(obc->ssc->snapset),
+ backend,
+ txn);
+ });
+ }
+}
+
+static inline std::unique_ptr<const PGLSFilter> get_pgls_filter(
+ const std::string& type,
+ bufferlist::const_iterator& iter)
+{
+ // storing non-const PGLSFilter for the sake of ::init()
+ std::unique_ptr<PGLSFilter> filter;
+ if (type.compare("plain") == 0) {
+ filter = std::make_unique<PGLSPlainFilter>();
+ } else {
+ std::size_t dot = type.find(".");
+ if (dot == type.npos || dot == 0 || dot == type.size() - 1) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ const std::string class_name = type.substr(0, dot);
+ const std::string filter_name = type.substr(dot + 1);
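+    // e.g. a hypothetical type "mycls.myfilter" resolves to object class
+    // "mycls" and the filter named "myfilter" registered by that class.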
+ ClassHandler::ClassData *cls = nullptr;
+ int r = ClassHandler::get_instance().open_class(class_name, &cls);
+ if (r != 0) {
+ logger().warn("can't open class {}: {}", class_name, cpp_strerror(r));
+ if (r == -EPERM) {
+        // propagate permission errors
+ throw crimson::osd::permission_denied{};
+ } else {
+ throw crimson::osd::invalid_argument{};
+ }
+ } else {
+ ceph_assert(cls);
+ }
+
+ ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name);
+ if (class_filter == nullptr) {
+ logger().warn("can't find filter {} in class {}", filter_name, class_name);
+ throw crimson::osd::invalid_argument{};
+ }
+
+ filter.reset(class_filter->fn());
+ if (!filter) {
+ // Object classes are obliged to return us something, but let's
+ // give an error rather than asserting out.
+ logger().warn("buggy class {} failed to construct filter {}",
+ class_name, filter_name);
+ throw crimson::osd::invalid_argument{};
+ }
+ }
+
+ ceph_assert(filter);
+ int r = filter->init(iter);
+ if (r < 0) {
+ logger().warn("error initializing filter {}: {}", type, cpp_strerror(r));
+ throw crimson::osd::invalid_argument{};
+ }
+
+ // successfully constructed and initialized, return it.
+ return filter;
+}
+
+static PG::interruptible_future<hobject_t> pgls_filter(
+ const PGLSFilter& filter,
+ const PGBackend& backend,
+ const hobject_t& sobj)
+{
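+  // resolves to sobj when the object passes the filter and to an empty
+  // (min) hobject_t when it is filtered out; if the filter wants an xattr,
+  // it is fetched first and a missing xattr is handled according to
+  // reject_empty_xattr().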
+ if (const auto xattr = filter.get_xattr(); !xattr.empty()) {
+ logger().debug("pgls_filter: filter is interested in xattr={} for obj={}",
+ xattr, sobj);
+ return backend.getxattr(sobj, std::move(xattr)).safe_then_interruptible(
+ [&filter, sobj] (ceph::bufferlist val) {
+ logger().debug("pgls_filter: got xvalue for obj={}", sobj);
+
+ const bool filtered = filter.filter(sobj, val);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] {
+ logger().debug("pgls_filter: got error for obj={}", sobj);
+
+ if (filter.reject_empty_xattr()) {
+ return seastar::make_ready_future<hobject_t>();
+ }
+ ceph::bufferlist val;
+ const bool filtered = filter.filter(sobj, val);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }));
+ } else {
+ ceph::bufferlist empty_lvalue_bl;
+ const bool filtered = filter.filter(sobj, empty_lvalue_bl);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }
+}
+
+static PG::interruptible_future<ceph::bufferlist> do_pgnls_common(
+ const hobject_t& pg_start,
+ const hobject_t& pg_end,
+ const PGBackend& backend,
+ const hobject_t& lower_bound,
+ const std::string& nspace,
+ const uint64_t limit,
+ const PGLSFilter* const filter)
+{
+ if (!(lower_bound.is_min() ||
+ lower_bound.is_max() ||
+ (lower_bound >= pg_start && lower_bound < pg_end))) {
+ // this should only happen with a buggy client.
+ throw std::invalid_argument("outside of PG bounds");
+ }
+
+ return backend.list_objects(lower_bound, limit).then_interruptible(
+ [&backend, filter, nspace](auto&& ret)
+ -> PG::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> {
+ auto& [objects, next] = ret;
+ auto in_my_namespace = [&nspace](const hobject_t& obj) {
+ using crimson::common::local_conf;
+ if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) {
+ return false;
+ } else if (nspace == librados::all_nspaces) {
+ return true;
+ } else {
+ return obj.get_namespace() == nspace;
+ }
+ };
+ auto to_pglsed = [&backend, filter] (const hobject_t& obj)
+ -> PG::interruptible_future<hobject_t> {
+ // this transformation looks costly. However, I don't have any
+ // reason to think PGLS* operations are critical for, let's say,
+ // general performance.
+ //
+ // from tchaikov: "another way is to use seastar::map_reduce(),
+ // to 1) save the effort to filter the already filtered objects
+ // 2) avoid the space to keep the tuple<bool, object> even if
+ // the object is filtered out".
+ if (filter) {
+ return pgls_filter(*filter, backend, obj);
+ } else {
+ return seastar::make_ready_future<hobject_t>(obj);
+ }
+ };
+
+ auto range = objects | boost::adaptors::filtered(in_my_namespace)
+ | boost::adaptors::transformed(to_pglsed);
+ logger().debug("do_pgnls_common: finishing the 1st stage of pgls");
+ return seastar::when_all_succeed(std::begin(range),
+ std::end(range)).then(
+ [next=std::move(next)] (auto items) mutable {
+ // the sole purpose of this chaining is to pass `next` to 2nd
+ // stage altogether with items
+ logger().debug("do_pgnls_common: 1st done");
+ return seastar::make_ready_future<
+ std::tuple<std::vector<hobject_t>, hobject_t>>(
+ std::move(items), std::move(next));
+ });
+ }).then_interruptible(
+ [pg_end] (auto&& ret) {
+ auto& [items, next] = ret;
+ auto is_matched = [] (const auto& obj) {
+ return !obj.is_min();
+ };
+ auto to_entry = [] (const auto& obj) {
+ return librados::ListObjectImpl{
+ obj.get_namespace(), obj.oid.name, obj.get_key()
+ };
+ };
+
+ pg_nls_response_t response;
+ boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched)
+ | boost::adaptors::transformed(to_entry));
+ response.handle = next.is_max() ? pg_end : next;
+ ceph::bufferlist out;
+ encode(response, out);
+ logger().debug("do_pgnls_common: response.entries.size()= {}",
+ response.entries.size());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+ });
+}
+
+static PG::interruptible_future<> do_pgnls(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ hobject_t lower_bound;
+ try {
+ ceph::decode(lower_bound, osd_op.indata);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGNLS handle");
+ }
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+  const auto pg_end =
+    pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgnls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ nullptr /* no filter */)
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+}
+
+static PG::interruptible_future<> do_pgnls_filtered(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ std::string cname, mname, type;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ ceph::decode(cname, bp);
+ ceph::decode(mname, bp);
+ ceph::decode(type, bp);
+ } catch (const buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ auto filter = get_pgls_filter(type, bp);
+
+ hobject_t lower_bound;
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGNLS_FILTER description");
+ }
+
+ logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+ __func__, cname, mname, type, lower_bound,
+ static_cast<const void*>(filter.get()));
+ return seastar::do_with(std::move(filter),
+ [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgnls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ filter.get())
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+ });
+}
+
+static PG::interruptible_future<ceph::bufferlist> do_pgls_common(
+ const hobject_t& pg_start,
+ const hobject_t& pg_end,
+ const PGBackend& backend,
+ const hobject_t& lower_bound,
+ const std::string& nspace,
+ const uint64_t limit,
+ const PGLSFilter* const filter)
+{
+ if (!(lower_bound.is_min() ||
+ lower_bound.is_max() ||
+ (lower_bound >= pg_start && lower_bound < pg_end))) {
+ // this should only happen with a buggy client.
+ throw std::invalid_argument("outside of PG bounds");
+ }
+
+ using entries_t = decltype(pg_ls_response_t::entries);
+ return backend.list_objects(lower_bound, limit).then_interruptible(
+ [&backend, filter, nspace](auto&& ret) {
+ auto& [objects, next] = ret;
+ return PG::interruptor::when_all(
+ PG::interruptor::map_reduce(std::move(objects),
+ [&backend, filter, nspace](const hobject_t& obj)
+ -> PG::interruptible_future<hobject_t>{
+ if (obj.get_namespace() == nspace) {
+ if (filter) {
+ return pgls_filter(*filter, backend, obj);
+ } else {
+ return seastar::make_ready_future<hobject_t>(obj);
+ }
+ } else {
+ return seastar::make_ready_future<hobject_t>();
+ }
+ },
+ entries_t{},
+ [](entries_t entries, hobject_t obj) {
+ if (!obj.is_min()) {
+ entries.emplace_back(obj.oid, obj.get_key());
+ }
+ return entries;
+ }),
+ seastar::make_ready_future<hobject_t>(next));
+ }).then_interruptible([pg_end](auto&& ret) {
+ auto entries = std::move(std::get<0>(ret).get0());
+ auto next = std::move(std::get<1>(ret).get0());
+ pg_ls_response_t response;
+ response.handle = next.is_max() ? pg_end : next;
+ response.entries = std::move(entries);
+ ceph::bufferlist out;
+ encode(response, out);
+ logger().debug("{}: response.entries.size()=",
+ __func__, response.entries.size());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+ });
+}
+
+static PG::interruptible_future<> do_pgls(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ hobject_t lower_bound;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument{"unable to decode PGLS handle"};
+ }
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end =
+ pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ nullptr /* no filter */)
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+}
+
+static PG::interruptible_future<> do_pgls_filtered(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ std::string cname, mname, type;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ ceph::decode(cname, bp);
+ ceph::decode(mname, bp);
+ ceph::decode(type, bp);
+ } catch (const buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ auto filter = get_pgls_filter(type, bp);
+
+ hobject_t lower_bound;
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGLS_FILTER description");
+ }
+
+ logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+ __func__, cname, mname, type, lower_bound,
+ static_cast<const void*>(filter.get()));
+ return seastar::do_with(std::move(filter),
+ [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ filter.get())
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+ });
+}
+
+PgOpsExecuter::interruptible_future<>
+PgOpsExecuter::execute_op(OSDOp& osd_op)
+{
+ logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op));
+ switch (const ceph_osd_op& op = osd_op.op; op.op) {
+ case CEPH_OSD_OP_PGLS:
+ return do_pgls(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGLS_FILTER:
+ return do_pgls_filtered(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGNLS:
+ return do_pgnls(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ return do_pgnls_filtered(pg, nspace, osd_op);
+ default:
+ logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+ throw std::runtime_error(
+ fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
new file mode 100644
index 000000000..1230b1c5a
--- /dev/null
+++ b/src/crimson/osd/ops_executer.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <fmt/os.h>
+#include <seastar/core/chunked_fifo.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "common/dout.h"
+#include "common/map_cacher.hpp"
+#include "common/static_ptr.h"
+#include "messages/MOSDOp.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/interruptible_future.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/shard_services.h"
+
+struct ObjectState;
+struct OSDOp;
+class OSDriver;
+class SnapMapper;
+
+namespace crimson::osd {
+class PG;
+
+// OpsExecuter -- a class for executing ops targeting a certain object.
+class OpsExecuter : public seastar::enable_lw_shared_from_this<OpsExecuter> {
+ friend class SnapTrimObjSubEvent;
+
+ using call_errorator = crimson::errorator<
+ crimson::stateful_ec,
+ crimson::ct_error::enoent,
+ crimson::ct_error::eexist,
+ crimson::ct_error::enospc,
+ crimson::ct_error::edquot,
+ crimson::ct_error::cmp_fail,
+ crimson::ct_error::eagain,
+ crimson::ct_error::invarg,
+ crimson::ct_error::erange,
+ crimson::ct_error::ecanceled,
+ crimson::ct_error::enametoolong,
+ crimson::ct_error::permission_denied,
+ crimson::ct_error::operation_not_supported,
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::value_too_large,
+ crimson::ct_error::file_too_large>;
+ using read_errorator = PGBackend::read_errorator;
+ using write_ertr = PGBackend::write_ertr;
+ using get_attr_errorator = PGBackend::get_attr_errorator;
+ using watch_errorator = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::invarg,
+ crimson::ct_error::not_connected,
+ crimson::ct_error::timed_out>;
+
+ using call_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, call_errorator>;
+ using read_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, read_errorator>;
+ using write_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, write_ertr>;
+ using get_attr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, get_attr_errorator>;
+ using watch_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, watch_errorator>;
+
+ template <typename Errorator, typename T = void>
+ using interruptible_errorated_future =
+ ::crimson::interruptible::interruptible_errorated_future<
+ IOInterruptCondition, Errorator, T>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<IOInterruptCondition>;
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ IOInterruptCondition, T>;
+
+public:
+ // ExecutableMessage -- an interface class to allow using OpsExecuter
+ // with other message types than just the `MOSDOp`. The type erasure
+ // happens in the ctor of `OpsExecuter`.
+ struct ExecutableMessage {
+ virtual osd_reqid_t get_reqid() const = 0;
+ virtual utime_t get_mtime() const = 0;
+ virtual epoch_t get_map_epoch() const = 0;
+ virtual entity_inst_t get_orig_source_inst() const = 0;
+ virtual uint64_t get_features() const = 0;
+ virtual bool has_flag(uint32_t flag) const = 0;
+ virtual entity_name_t get_source() const = 0;
+ };
+
+ template <class ImplT>
+ class ExecutableMessagePimpl final : ExecutableMessage {
+ const ImplT* pimpl;
+ // In crimson, conn is independently maintained outside Message.
+ const crimson::net::ConnectionRef conn;
+ public:
+ ExecutableMessagePimpl(const ImplT* pimpl,
+ const crimson::net::ConnectionRef conn)
+ : pimpl(pimpl), conn(conn) {
+ }
+
+ osd_reqid_t get_reqid() const final {
+ return pimpl->get_reqid();
+ }
+ bool has_flag(uint32_t flag) const final {
+ return pimpl->has_flag(flag);
+ }
+ utime_t get_mtime() const final {
+ return pimpl->get_mtime();
+ };
+ epoch_t get_map_epoch() const final {
+ return pimpl->get_map_epoch();
+ }
+ entity_inst_t get_orig_source_inst() const final {
+      // We can't get the original source address from the message
+      // since (in crimson) the connection is maintained
+ // outside of the Message.
+ return entity_inst_t(get_source(), conn->get_peer_addr());
+ }
+ entity_name_t get_source() const final {
+ return pimpl->get_source();
+ }
+ uint64_t get_features() const final {
+ return pimpl->get_features();
+ }
+ };
+
+  // because OpsExecuter is a pretty heavy-weight object, we want to ensure
+ // it's not copied nor even moved by accident. Performance is the sole
+ // reason for prohibiting that.
+ OpsExecuter(OpsExecuter&&) = delete;
+ OpsExecuter(const OpsExecuter&) = delete;
+
+ using osd_op_errorator = crimson::compound_errorator_t<
+ call_errorator,
+ read_errorator,
+ write_ertr,
+ get_attr_errorator,
+ watch_errorator,
+ PGBackend::stat_errorator>;
+ using osd_op_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, osd_op_errorator>;
+
+ object_stat_sum_t delta_stats;
+private:
+  // an operation can be divided into two stages: the main one and an
+  // effect-exposing one. The former is performed immediately on call to
+  // `do_osd_op()` while the latter on `submit_changes()` -- after
+  // successfully processing the main stages of all involved operations.
+  // When any stage fails, none of the scheduled effect-exposing stages
+  // will be executed.
+  // When an operation requires this division, some variant of `with_effect()`
+  // should be used.
+ struct effect_t {
+ // an effect can affect PG, i.e. create a watch timeout
+ virtual osd_op_errorator::future<> execute(Ref<PG> pg) = 0;
+ virtual ~effect_t() = default;
+ };
+
+ Ref<PG> pg; // for the sake of object class
+ ObjectContextRef obc;
+ const OpInfo& op_info;
+ using abstracted_msg_t =
+ ceph::static_ptr<ExecutableMessage,
+ sizeof(ExecutableMessagePimpl<void>)>;
+ abstracted_msg_t msg;
+ crimson::net::ConnectionRef conn;
+ std::optional<osd_op_params_t> osd_op_params;
+ bool user_modify = false;
+ ceph::os::Transaction txn;
+
+ size_t num_read = 0; ///< count read ops
+ size_t num_write = 0; ///< count update ops
+
+ SnapContext snapc; // writer snap context
+ struct CloningContext {
+ SnapSet new_snapset;
+ pg_log_entry_t log_entry;
+
+ void apply_to(
+ std::vector<pg_log_entry_t>& log_entries,
+ ObjectContext& processed_obc) &&;
+ };
+ std::unique_ptr<CloningContext> cloning_ctx;
+
+
+ /**
+ * execute_clone
+ *
+ * If snapc contains a snap which occurred logically after the last write
+   * seen by this object (see OpsExecuter::should_clone()), we first need to
+   * make a clone of the object at its current state. execute_clone primes
+   * txn with that clone operation and returns an
+   * OpsExecuter::CloningContext which will allow us to fill in the corresponding
+ * metadata and log_entries once the operations have been processed.
+ *
+ * Note that this strategy differs from classic, which instead performs this
+ * work at the end and reorders the transaction. See
+ * PrimaryLogPG::make_writeable
+ *
+   * @param snapc [in] snapc for this operation (from the client if the
+   *              request supplies one, otherwise from the pool)
+ * @param initial_obs [in] objectstate for the object at operation start
+ * @param initial_snapset [in] snapset for the object at operation start
+ * @param backend [in,out] interface for generating mutations
+ * @param txn [out] transaction for the operation
+ */
+ std::unique_ptr<CloningContext> execute_clone(
+ const SnapContext& snapc,
+ const ObjectState& initial_obs,
+ const SnapSet& initial_snapset,
+ PGBackend& backend,
+ ceph::os::Transaction& txn);
+
+
+ /**
+ * should_clone
+ *
+ * Predicate returning whether a user write with snap context snapc
+ * contains a snap which occurred prior to the most recent write
+ * on the object reflected in initial_obc.
+ *
+ * @param initial_obc [in] obc for object to be mutated
+   * @param snapc [in] snapc for this operation (from the client if the
+   *              request supplies one, otherwise from the pool)
+ */
+ static bool should_clone(
+ const ObjectContext& initial_obc,
+ const SnapContext& snapc) {
+ // clone?
+ return initial_obc.obs.exists // both nominally and...
+ && !initial_obc.obs.oi.is_whiteout() // ... logically exists
+ && snapc.snaps.size() // there are snaps
+ && snapc.snaps[0] > initial_obc.ssc->snapset.seq; // existing obj is old
+ }
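+  // e.g. (illustrative values) with snapset.seq = 5, a write carrying
+  // snapc = {seq=7, snaps=[7, 5, 2]} requires a clone (7 > 5), while
+  // snapc = {seq=5, snaps=[5, 2]} does not (5 > 5 is false).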
+
+ interruptible_future<std::vector<pg_log_entry_t>> flush_clone_metadata(
+ std::vector<pg_log_entry_t>&& log_entries,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+
+ static interruptible_future<> snap_map_remove(
+ const hobject_t& soid,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+ static interruptible_future<> snap_map_modify(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+ static interruptible_future<> snap_map_clone(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+
+  // this gizmo could be wrapped in std::optional for the sake of lazy
+  // initialization. We don't need it for ops that don't have effects.
+ // TODO: verify the init overhead of chunked_fifo
+ seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects;
+
+ template <class Context, class MainFunc, class EffectFunc>
+ auto with_effect_on_obc(
+ Context&& ctx,
+ MainFunc&& main_func,
+ EffectFunc&& effect_func);
+
+ call_ierrorator::future<> do_op_call(OSDOp& osd_op);
+ watch_ierrorator::future<> do_op_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_reconnect(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_unwatch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_ping(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_list_watchers(
+ OSDOp& osd_op,
+ const ObjectState& os);
+ watch_ierrorator::future<> do_op_notify(
+ OSDOp& osd_op,
+ const ObjectState& os);
+ watch_ierrorator::future<> do_op_notify_ack(
+ OSDOp& osd_op,
+ const ObjectState& os);
+ call_errorator::future<> do_assert_ver(
+ OSDOp& osd_op,
+ const ObjectState& os);
+
+ using list_snaps_ertr = read_errorator::extend<
+ crimson::ct_error::invarg>;
+ using list_snaps_iertr = ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ list_snaps_ertr>;
+ list_snaps_iertr::future<> do_list_snaps(
+ OSDOp& osd_op,
+ const ObjectState& os,
+ const SnapSet& ss);
+
+ template <class Func>
+ auto do_const_op(Func&& f);
+
+ template <class Func>
+ auto do_read_op(Func&& f) {
+ ++num_read;
+ // TODO: pass backend as read-only
+ return do_const_op(std::forward<Func>(f));
+ }
+
+ template <class Func>
+ auto do_snapset_op(Func&& f) {
+ ++num_read;
+ return std::invoke(
+ std::forward<Func>(f),
+ std::as_const(obc->obs),
+ std::as_const(obc->ssc->snapset));
+ }
+
+ enum class modified_by {
+ user,
+ sys,
+ };
+
+ template <class Func>
+ auto do_write_op(Func&& f, modified_by m = modified_by::user);
+
+ decltype(auto) dont_do_legacy_op() {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+
+ interruptible_errorated_future<osd_op_errorator>
+ do_execute_op(OSDOp& osd_op);
+
+ OpsExecuter(Ref<PG> pg,
+ ObjectContextRef obc,
+ const OpInfo& op_info,
+ abstracted_msg_t&& msg,
+ crimson::net::ConnectionRef conn,
+ const SnapContext& snapc);
+
+public:
+ template <class MsgT>
+ OpsExecuter(Ref<PG> pg,
+ ObjectContextRef obc,
+ const OpInfo& op_info,
+ const MsgT& msg,
+ crimson::net::ConnectionRef conn,
+ const SnapContext& snapc)
+ : OpsExecuter(
+ std::move(pg),
+ std::move(obc),
+ op_info,
+ abstracted_msg_t{
+ std::in_place_type_t<ExecutableMessagePimpl<MsgT>>{},
+ &msg,
+ conn},
+ conn,
+ snapc) {
+ }
+
+ template <class Func>
+ struct RollbackHelper;
+
+ template <class Func>
+ RollbackHelper<Func> create_rollbacker(Func&& func);
+
+ interruptible_errorated_future<osd_op_errorator>
+ execute_op(OSDOp& osd_op);
+
+ using rep_op_fut_tuple =
+ std::tuple<interruptible_future<>, osd_op_ierrorator::future<>>;
+ using rep_op_fut_t =
+ interruptible_future<rep_op_fut_tuple>;
+ template <typename MutFunc>
+ rep_op_fut_t flush_changes_n_do_ops_effects(
+ const std::vector<OSDOp>& ops,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ MutFunc&& mut_func) &&;
+ std::vector<pg_log_entry_t> prepare_transaction(
+ const std::vector<OSDOp>& ops);
+ void fill_op_params_bump_pg_version();
+
+ ObjectContextRef get_obc() const {
+ return obc;
+ }
+
+ const object_info_t &get_object_info() const {
+ return obc->obs.oi;
+ }
+ const hobject_t &get_target() const {
+ return get_object_info().soid;
+ }
+
+ const auto& get_message() const {
+ return *msg;
+ }
+
+ size_t get_processed_rw_ops_num() const {
+ return num_read + num_write;
+ }
+
+ uint32_t get_pool_stripe_width() const;
+
+ bool has_seen_write() const {
+ return num_write > 0;
+ }
+
+ object_stat_sum_t& get_stats(){
+ return delta_stats;
+ }
+
+ version_t get_last_user_version() const;
+
+ std::pair<object_info_t, ObjectContextRef> prepare_clone(
+ const hobject_t& coid);
+
+ void apply_stats();
+};
+
+template <class Context, class MainFunc, class EffectFunc>
+auto OpsExecuter::with_effect_on_obc(
+ Context&& ctx,
+ MainFunc&& main_func,
+ EffectFunc&& effect_func)
+{
+ using context_t = std::decay_t<Context>;
+  // the language offers implicit conversion to pointer-to-function for a
+  // lambda only when it is capture-less. We enforce this restriction due to
+  // the fact that `flush_changes()` std::moves many of the executer's parts.
+ using allowed_effect_func_t =
+ seastar::future<> (*)(context_t&&, ObjectContextRef, Ref<PG>);
+ static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>,
+ "with_effect function is not allowed to capture");
+ struct task_t final : effect_t {
+ context_t ctx;
+ EffectFunc effect_func;
+ ObjectContextRef obc;
+
+ task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc)
+ : ctx(std::move(ctx)),
+ effect_func(std::move(effect_func)),
+ obc(std::move(obc)) {
+ }
+ osd_op_errorator::future<> execute(Ref<PG> pg) final {
+ return std::move(effect_func)(std::move(ctx),
+ std::move(obc),
+ std::move(pg));
+ }
+ };
+ auto task =
+ std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc);
+ auto& ctx_ref = task->ctx;
+ op_effects.emplace_back(std::move(task));
+ return std::forward<MainFunc>(main_func)(ctx_ref);
+}
+
+template <typename MutFunc>
+OpsExecuter::rep_op_fut_t
+OpsExecuter::flush_changes_n_do_ops_effects(
+ const std::vector<OSDOp>& ops,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ MutFunc&& mut_func) &&
+{
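+  // overall flow: when the transaction is non-empty, flush the clone
+  // metadata, build the pg log entries via prepare_transaction(), and hand
+  // txn/obc/osd_op_params/log_entries over to mut_func, which returns the
+  // (submitted, all_completed) pair; any queued op_effects are executed
+  // only after all_completed resolves successfully.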
+ const bool want_mutate = !txn.empty();
+ // osd_op_params are instantiated by every wr-like operation.
+ assert(osd_op_params || !want_mutate);
+ assert(obc);
+ rep_op_fut_t maybe_mutated =
+ interruptor::make_ready_future<rep_op_fut_tuple>(
+ seastar::now(),
+ interruptor::make_interruptible(osd_op_errorator::now()));
+ if (cloning_ctx) {
+ ceph_assert(want_mutate);
+ }
+ if (want_mutate) {
+ if (user_modify) {
+ osd_op_params->user_at_version = osd_op_params->at_version.version;
+ }
+ maybe_mutated = flush_clone_metadata(
+ prepare_transaction(ops),
+ snap_mapper,
+ osdriver,
+ txn
+ ).then_interruptible([mut_func=std::move(mut_func),
+ this](auto&& log_entries) mutable {
+ auto [submitted, all_completed] =
+ std::forward<MutFunc>(mut_func)(std::move(txn),
+ std::move(obc),
+ std::move(*osd_op_params),
+ std::move(log_entries));
+ return interruptor::make_ready_future<rep_op_fut_tuple>(
+ std::move(submitted),
+ osd_op_ierrorator::future<>(std::move(all_completed)));
+ });
+ }
+ apply_stats();
+
+ if (__builtin_expect(op_effects.empty(), true)) {
+ return maybe_mutated;
+ } else {
+ return maybe_mutated.then_unpack_interruptible(
+      // need an extra pg ref because apply_stats() can be executed after
+      // informing the snap mapper
+ [this, pg=this->pg](auto&& submitted, auto&& all_completed) mutable {
+ return interruptor::make_ready_future<rep_op_fut_tuple>(
+ std::move(submitted),
+ all_completed.safe_then_interruptible([this, pg=std::move(pg)] {
+          // let's do the cleanup of `op_effects` in the destructor
+ return interruptor::do_for_each(op_effects,
+ [pg=std::move(pg)](auto& op_effect) {
+ return op_effect->execute(pg);
+ });
+ }));
+ });
+ }
+}
+
+template <class Func>
+struct OpsExecuter::RollbackHelper {
+ interruptible_future<> rollback_obc_if_modified(const std::error_code& e);
+ ObjectContextRef get_obc() const {
+ assert(ox);
+ return ox->obc;
+ }
+ seastar::lw_shared_ptr<OpsExecuter> ox;
+ Func func;
+};
+
+template <class Func>
+inline OpsExecuter::RollbackHelper<Func>
+OpsExecuter::create_rollbacker(Func&& func) {
+ return {shared_from_this(), std::forward<Func>(func)};
+}
+
+
+template <class Func>
+OpsExecuter::interruptible_future<>
+OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified(
+ const std::error_code& e)
+{
+  // Oops, an operation has failed. do_osd_ops() together with
+  // OpsExecuter already dropped the ObjectStore::Transaction if
+  // there was any. However, this is not enough to completely
+  // roll back, as we gave OpsExecuter the one and only copy of `obc`
+  // we maintain, and we did it for both reading and writing.
+  // Now all modifications must be reverted.
+  //
+  // Let's just reload from the store. Evicting from the shared
+  // LRU would be tricky as the next MOSDOp (the one at the `get_obc`
+  // phase) could actually have already finished the lookup. Fortunately,
+ // this is supposed to live on cold paths, so performance is not
+ // a concern -- simplicity wins.
+ //
+ // The conditional's purpose is to efficiently handle hot errors
+ // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or
+  // CEPH_OSD_OP_OMAP_CMP. These are read-like ops which clients
+  // typically prepend to any write. If OpsExecuter hasn't
+ // seen any modifying operation, `obc` is supposed to be kept
+ // unchanged.
+ assert(ox);
+ const auto need_rollback = ox->has_seen_write();
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{}: object {} got error {}, need_rollback={}",
+ __func__,
+ ox->obc->get_oid(),
+ e,
+ need_rollback);
+ return need_rollback ? func(*ox->obc) : interruptor::now();
+}
+
+// PgOpsExecuter -- a class for executing ops targeting a certain PG.
+class PgOpsExecuter {
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ IOInterruptCondition, T>;
+
+public:
+ PgOpsExecuter(const PG& pg, const MOSDOp& msg)
+ : pg(pg), nspace(msg.get_hobj().nspace) {
+ }
+
+ interruptible_future<> execute_op(OSDOp& osd_op);
+
+private:
+ const PG& pg;
+ const std::string& nspace;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
new file mode 100644
index 000000000..cfe4f54ab
--- /dev/null
+++ b/src/crimson/osd/osd.cc
@@ -0,0 +1,1357 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd.h"
+
+#include <sys/utsname.h>
+
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/join.hpp>
+#include <fmt/format.h>
+#include <fmt/os.h>
+#include <fmt/ostream.h>
+#include <seastar/core/timer.hh>
+
+#include "common/pick_address.h"
+#include "include/util.h"
+
+#include "messages/MCommand.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDPeeringOp.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MPGStats.h"
+
+#include "os/Transaction.h"
+#include "osd/ClassHandler.h"
+#include "osd/OSDCap.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+#include "crimson/admin/osd_admin.h"
+#include "crimson/admin/pg_commands.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/exception.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/heartbeat.h"
+#include "crimson/osd/osd_meta.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_meta.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/crush/CrushLocation.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+ static constexpr int TICK_INTERVAL = 1;
+}
+
+using std::make_unique;
+using std::map;
+using std::pair;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using crimson::common::local_conf;
+using crimson::os::FuturizedStore;
+
+namespace crimson::osd {
+
+OSD::OSD(int id, uint32_t nonce,
+ seastar::abort_source& abort_source,
+ crimson::os::FuturizedStore& store,
+ crimson::net::MessengerRef cluster_msgr,
+ crimson::net::MessengerRef public_msgr,
+ crimson::net::MessengerRef hb_front_msgr,
+ crimson::net::MessengerRef hb_back_msgr)
+ : whoami{id},
+ nonce{nonce},
+ abort_source{abort_source},
+ // do this in background
+ beacon_timer{[this] { (void)send_beacon(); }},
+ cluster_msgr{cluster_msgr},
+ public_msgr{public_msgr},
+ hb_front_msgr{hb_front_msgr},
+ hb_back_msgr{hb_back_msgr},
+ monc{new crimson::mon::Client{*public_msgr, *this}},
+ mgrc{new crimson::mgr::Client{*public_msgr, *this}},
+ store{store},
+ pg_shard_manager{osd_singleton_state,
+ shard_services,
+ pg_to_shard_mappings},
+ // do this in background -- continuation rearms timer when complete
+ tick_timer{[this] {
+ std::ignore = update_heartbeat_peers(
+ ).then([this] {
+ update_stats();
+ tick_timer.arm(
+ std::chrono::seconds(TICK_INTERVAL));
+ });
+ }},
+ asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()},
+ log_client(cluster_msgr.get(), LogClient::NO_FLAGS),
+ clog(log_client.create_channel())
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr),
+ std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) {
+ msgr.get()->set_auth_server(monc.get());
+ msgr.get()->set_auth_client(monc.get());
+ }
+
+ if (local_conf()->osd_open_classes_on_start) {
+ const int r = ClassHandler::get_instance().open_all_classes();
+ if (r) {
+ logger().warn("{} warning: got an error loading one or more classes: {}",
+ __func__, cpp_strerror(r));
+ }
+ }
+ logger().info("{}: nonce is {}", __func__, nonce);
+ monc->set_log_client(&log_client);
+ clog->set_log_to_monitors(true);
+}
+
+OSD::~OSD() = default;
+
+namespace {
+// Initial features in new superblock.
+// Features here are also automatically upgraded
+CompatSet get_osd_initial_compat_set()
+{
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
+ return CompatSet(ceph_osd_feature_compat,
+ ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+}
+
+seastar::future<> OSD::open_meta_coll()
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.get_sharded_store().open_collection(
+ coll_t::meta()
+ ).then([this](auto ch) {
+ pg_shard_manager.init_meta_coll(ch, store.get_sharded_store());
+ return seastar::now();
+ });
+}
+
+seastar::future<OSDMeta> OSD::open_or_create_meta_coll(FuturizedStore &store)
+{
+ return store.get_sharded_store().open_collection(coll_t::meta()).then([&store](auto ch) {
+ if (!ch) {
+ return store.get_sharded_store().create_new_collection(
+ coll_t::meta()
+ ).then([&store](auto ch) {
+ return OSDMeta(ch, store.get_sharded_store());
+ });
+ } else {
+ return seastar::make_ready_future<OSDMeta>(ch, store.get_sharded_store());
+ }
+ });
+}
+
+seastar::future<> OSD::mkfs(
+ FuturizedStore &store,
+ unsigned whoami,
+ uuid_d osd_uuid,
+ uuid_d cluster_fsid,
+ std::string osdspec_affinity)
+{
+ return store.start().then([&store, osd_uuid] {
+ return store.mkfs(osd_uuid).handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ logger().error("error creating empty object store in {}: ({}) {}",
+ local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([&store] {
+ return store.mount().handle_error(
+ crimson::stateful_ec::handle([](const auto& ec) {
+ logger().error("error mounting object store in {}: ({}) {}",
+ local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([&store] {
+ return open_or_create_meta_coll(store);
+ }).then([&store, whoami, cluster_fsid](auto meta_coll) {
+ OSDSuperblock superblock;
+ superblock.cluster_fsid = cluster_fsid;
+ superblock.osd_fsid = store.get_fsid();
+ superblock.whoami = whoami;
+ superblock.compat_features = get_osd_initial_compat_set();
+ return _write_superblock(
+ store, std::move(meta_coll), std::move(superblock));
+ }).then([&store, cluster_fsid] {
+ return store.write_meta("ceph_fsid", cluster_fsid.to_string());
+ }).then([&store] {
+ return store.write_meta("magic", CEPH_OSD_ONDISK_MAGIC);
+ }).then([&store, whoami] {
+ return store.write_meta("whoami", std::to_string(whoami));
+ }).then([&store] {
+ return _write_key_meta(store);
+ }).then([&store, osdspec_affinity=std::move(osdspec_affinity)] {
+ return store.write_meta("osdspec_affinity", osdspec_affinity);
+ }).then([&store] {
+ return store.write_meta("ready", "ready");
+ }).then([&store, whoami, cluster_fsid] {
+ fmt::print("created object store {} for osd.{} fsid {}\n",
+ local_conf().get_val<std::string>("osd_data"),
+ whoami, cluster_fsid);
+ return store.umount();
+ }).then([&store] {
+ return store.stop();
+ });
+}
+
+seastar::future<> OSD::_write_superblock(
+ FuturizedStore &store,
+ OSDMeta meta_coll,
+ OSDSuperblock superblock)
+{
+ return seastar::do_with(
+ std::move(meta_coll),
+ std::move(superblock),
+ [&store](auto &meta_coll, auto &superblock) {
+ return meta_coll.load_superblock(
+ ).safe_then([&superblock](OSDSuperblock&& sb) {
+ if (sb.cluster_fsid != superblock.cluster_fsid) {
+ logger().error("provided cluster fsid {} != superblock's {}",
+ sb.cluster_fsid, superblock.cluster_fsid);
+ throw std::invalid_argument("mismatched fsid");
+ }
+ if (sb.whoami != superblock.whoami) {
+ logger().error("provided osd id {} != superblock's {}",
+ sb.whoami, superblock.whoami);
+ throw std::invalid_argument("mismatched osd id");
+ }
+ }).handle_error(
+ crimson::ct_error::enoent::handle([&store, &meta_coll, &superblock] {
+        // meta collection does not exist yet; create the superblock
+ logger().info(
+ "{} writing superblock cluster_fsid {} osd_fsid {}",
+ "_write_superblock",
+ superblock.cluster_fsid,
+ superblock.osd_fsid);
+ ceph::os::Transaction t;
+ meta_coll.create(t);
+ meta_coll.store_superblock(t, superblock);
+ logger().debug("OSD::_write_superblock: do_transaction...");
+ return store.get_sharded_store().do_transaction(
+ meta_coll.collection(),
+ std::move(t));
+ }),
+      crimson::ct_error::assert_all("_write_superblock error")
+ );
+ });
+}
+
+// this `to_string` sits in the `crimson::osd` namespace, so we don't break
+// the language rule on not overloading in `std::`.
+static std::string to_string(const seastar::temporary_buffer<char>& temp_buf)
+{
+ return {temp_buf.get(), temp_buf.size()};
+}
+
+seastar::future<> OSD::_write_key_meta(FuturizedStore &store)
+{
+
+ if (auto key = local_conf().get_val<std::string>("key"); !std::empty(key)) {
+ return store.write_meta("osd_key", key);
+ } else if (auto keyfile = local_conf().get_val<std::string>("keyfile");
+ !std::empty(keyfile)) {
+ return read_file(keyfile).then([&store](const auto& temp_buf) {
+ // it's on a truly cold path, so don't worry about memcpy.
+ return store.write_meta("osd_key", to_string(temp_buf));
+ }).handle_exception([keyfile] (auto ep) {
+ logger().error("_write_key_meta: failed to handle keyfile {}: {}",
+ keyfile, ep);
+ ceph_abort();
+ });
+ } else {
+ return seastar::now();
+ }
+}
+
+namespace {
+ entity_addrvec_t pick_addresses(int what) {
+ entity_addrvec_t addrs;
+ crimson::common::CephContext cct;
+ // we're interested solely in v2; crimson doesn't do v1
+ const auto flags = what | CEPH_PICK_ADDRESS_MSGR2;
+ if (int r = ::pick_addresses(&cct, flags, &addrs, -1); r < 0) {
+ throw std::runtime_error("failed to pick address");
+ }
+ for (auto addr : addrs.v) {
+ logger().info("picked address {}", addr);
+ }
+ return addrs;
+ }
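+  // if a bound address is still blank (the "any" address), borrow the
+  // concrete IP from a known address of the same family while keeping the
+  // original nonce, type and port; returns the rewritten vector and whether
+  // anything was changed.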
+ std::pair<entity_addrvec_t, bool>
+ replace_unknown_addrs(entity_addrvec_t maybe_unknowns,
+ const entity_addrvec_t& knowns) {
+ bool changed = false;
+ auto maybe_replace = [&](entity_addr_t addr) {
+ if (!addr.is_blank_ip()) {
+ return addr;
+ }
+ for (auto& b : knowns.v) {
+ if (addr.get_family() == b.get_family()) {
+ auto a = b;
+ a.set_nonce(addr.get_nonce());
+ a.set_type(addr.get_type());
+ a.set_port(addr.get_port());
+ changed = true;
+ return a;
+ }
+ }
+ throw std::runtime_error("failed to replace unknown address");
+ };
+ entity_addrvec_t replaced;
+ std::transform(maybe_unknowns.v.begin(),
+ maybe_unknowns.v.end(),
+ std::back_inserter(replaced.v),
+ maybe_replace);
+ return {replaced, changed};
+ }
+}
+
+seastar::future<> OSD::start()
+{
+ logger().info("start");
+
+ startup_time = ceph::mono_clock::now();
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.start().then([this] {
+ return pg_to_shard_mappings.start(0, seastar::smp::count
+ ).then([this] {
+ return osd_singleton_state.start_single(
+ whoami, std::ref(*cluster_msgr), std::ref(*public_msgr),
+ std::ref(*monc), std::ref(*mgrc));
+ }).then([this] {
+ return osd_states.start();
+ }).then([this] {
+ ceph::mono_time startup_time = ceph::mono_clock::now();
+ return shard_services.start(
+ std::ref(osd_singleton_state),
+ std::ref(pg_to_shard_mappings),
+ whoami,
+ startup_time,
+ osd_singleton_state.local().perf,
+ osd_singleton_state.local().recoverystate_perf,
+ std::ref(store),
+ std::ref(osd_states));
+ });
+ }).then([this] {
+ heartbeat.reset(new Heartbeat{
+ whoami, get_shard_services(),
+ *monc, *hb_front_msgr, *hb_back_msgr});
+ return store.mount().handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ logger().error("error mounting object store in {}: ({}) {}",
+ local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([this] {
+ return open_meta_coll();
+ }).then([this] {
+ return pg_shard_manager.get_meta_coll().load_superblock(
+ ).handle_error(
+ crimson::ct_error::assert_all("open_meta_coll error")
+ );
+ }).then([this](OSDSuperblock&& sb) {
+ superblock = std::move(sb);
+ pg_shard_manager.set_superblock(superblock);
+ return pg_shard_manager.get_local_map(superblock.current_epoch);
+ }).then([this](OSDMapService::local_cached_map_t&& map) {
+ osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(map));
+ return pg_shard_manager.update_map(std::move(map));
+ }).then([this] {
+ return shard_services.invoke_on_all([this](auto &local_service) {
+ local_service.local_state.osdmap_gate.got_map(osdmap->get_epoch());
+ });
+ }).then([this] {
+ bind_epoch = osdmap->get_epoch();
+ return pg_shard_manager.load_pgs(store);
+ }).then([this] {
+ uint64_t osd_required =
+ CEPH_FEATURE_UID |
+ CEPH_FEATURE_PGID64 |
+ CEPH_FEATURE_OSDENC;
+ using crimson::net::SocketPolicy;
+
+ public_msgr->set_default_policy(SocketPolicy::stateless_server(0));
+ public_msgr->set_policy(entity_name_t::TYPE_MON,
+ SocketPolicy::lossy_client(osd_required));
+ public_msgr->set_policy(entity_name_t::TYPE_MGR,
+ SocketPolicy::lossy_client(osd_required));
+ public_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::stateless_server(0));
+
+ cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0));
+ cluster_msgr->set_policy(entity_name_t::TYPE_MON,
+ SocketPolicy::lossy_client(0));
+ cluster_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossless_peer(osd_required));
+ cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT,
+ SocketPolicy::stateless_server(0));
+
+ crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()};
+ return seastar::when_all_succeed(
+ cluster_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER))
+ .safe_then([this, dispatchers]() mutable {
+ return cluster_msgr->start(dispatchers);
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("cluster messenger bind(): {}", e);
+ ceph_abort();
+ })),
+ public_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC))
+ .safe_then([this, dispatchers]() mutable {
+ return public_msgr->start(dispatchers);
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("public messenger bind(): {}", e);
+ ceph_abort();
+ })));
+ }).then_unpack([this] {
+ return seastar::when_all_succeed(monc->start(),
+ mgrc->start());
+ }).then_unpack([this] {
+ return _add_me_to_crush();
+ }).then([this] {
+ monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
+ monc->sub_want("mgrmap", 0, 0);
+ monc->sub_want("osdmap", 0, 0);
+ return monc->renew_subs();
+ }).then([this] {
+ if (auto [addrs, changed] =
+ replace_unknown_addrs(cluster_msgr->get_myaddrs(),
+ public_msgr->get_myaddrs()); changed) {
+      logger().debug("replacing unknown addrs of cluster messenger");
+ cluster_msgr->set_myaddrs(addrs);
+ }
+ return heartbeat->start(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC),
+ pick_addresses(CEPH_PICK_ADDRESS_CLUSTER));
+ }).then([this] {
+ // create the admin-socket server, and the objects that register
+ // to handle incoming commands
+ return start_asok_admin();
+ }).then([this] {
+ return log_client.set_fsid(monc->get_fsid());
+ }).then([this] {
+ return start_boot();
+ });
+}
+
+seastar::future<> OSD::start_boot()
+{
+ pg_shard_manager.set_preboot();
+ return monc->get_version("osdmap").then([this](auto&& ret) {
+ auto [newest, oldest] = ret;
+ return _preboot(oldest, newest);
+ });
+}
+
+seastar::future<> OSD::_preboot(version_t oldest, version_t newest)
+{
+ logger().info("osd.{}: _preboot", whoami);
+ if (osdmap->get_epoch() == 0) {
+ logger().info("waiting for initial osdmap");
+ } else if (osdmap->is_destroyed(whoami)) {
+ logger().warn("osdmap says I am destroyed");
+ // provide a small margin so we don't livelock seeing if we
+ // un-destroyed ourselves.
+ if (osdmap->get_epoch() > newest - 1) {
+ throw std::runtime_error("i am destroyed");
+ }
+ } else if (osdmap->is_noup(whoami)) {
+ logger().warn("osdmap NOUP flag is set, waiting for it to clear");
+ } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+    logger().error("osdmap SORTBITWISE flag is NOT set; please set it");
+ } else if (osdmap->require_osd_release < ceph_release_t::octopus) {
+ logger().error("osdmap require_osd_release < octopus; please upgrade to octopus");
+ } else if (false) {
+ // TODO: update mon if current fullness state is different from osdmap
+ } else if (version_t n = local_conf()->osd_map_message_max;
+ osdmap->get_epoch() >= oldest - 1 &&
+ osdmap->get_epoch() + n > newest) {
+ return _send_boot();
+ }
+ // get all the latest maps
+ if (osdmap->get_epoch() + 1 >= oldest) {
+ return get_shard_services().osdmap_subscribe(osdmap->get_epoch() + 1, false);
+ } else {
+ return get_shard_services().osdmap_subscribe(oldest - 1, true);
+ }
+}
+
+seastar::future<> OSD::_send_boot()
+{
+ pg_shard_manager.set_booting();
+
+ entity_addrvec_t public_addrs = public_msgr->get_myaddrs();
+ entity_addrvec_t cluster_addrs = cluster_msgr->get_myaddrs();
+ entity_addrvec_t hb_back_addrs = heartbeat->get_back_addrs();
+ entity_addrvec_t hb_front_addrs = heartbeat->get_front_addrs();
+ if (cluster_msgr->set_addr_unknowns(public_addrs)) {
+ cluster_addrs = cluster_msgr->get_myaddrs();
+ }
+ if (heartbeat->get_back_msgr().set_addr_unknowns(cluster_addrs)) {
+ hb_back_addrs = heartbeat->get_back_addrs();
+ }
+ if (heartbeat->get_front_msgr().set_addr_unknowns(public_addrs)) {
+ hb_front_addrs = heartbeat->get_front_addrs();
+ }
+ logger().info("hb_back_msgr: {}", hb_back_addrs);
+ logger().info("hb_front_msgr: {}", hb_front_addrs);
+ logger().info("cluster_msgr: {}", cluster_addrs);
+
+ auto m = crimson::make_message<MOSDBoot>(superblock,
+ osdmap->get_epoch(),
+ boot_epoch,
+ hb_back_addrs,
+ hb_front_addrs,
+ cluster_addrs,
+ CEPH_FEATURES_ALL);
+ collect_sys_info(&m->metadata, NULL);
+
+ // See OSDMonitor::preprocess_boot, prevents boot without allow_crimson
+ // OSDMap flag
+ m->metadata["osd_type"] = "crimson";
+ return monc->send_message(std::move(m));
+}
+
+seastar::future<> OSD::_add_me_to_crush()
+{
+ if (!local_conf().get_val<bool>("osd_crush_update_on_start")) {
+ return seastar::now();
+ }
+ auto get_weight = [this] {
+ if (auto w = local_conf().get_val<double>("osd_crush_initial_weight");
+ w >= 0) {
+ return seastar::make_ready_future<double>(w);
+ } else {
+ return store.stat().then([](auto st) {
+ auto total = st.total;
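+        // default weight is the store capacity in TiB (total / 2^40):
+        // e.g. a 4 TiB store yields a weight of roughly 4.0; the small
+        // positive floor avoids a zero weight on tiny stores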
+ return seastar::make_ready_future<double>(
+ std::max(.00001,
+ double(total) / double(1ull << 40))); // TB
+ });
+ }
+ };
+ return get_weight().then([this](auto weight) {
+ const crimson::crush::CrushLocation loc;
+ return seastar::do_with(
+ std::move(loc),
+ [this, weight] (crimson::crush::CrushLocation& loc) {
+ return loc.init_on_startup().then([this, weight, &loc]() {
+ logger().info("crush location is {}", loc);
+ string cmd = fmt::format(R"({{
+ "prefix": "osd crush create-or-move",
+ "id": {},
+ "weight": {:.4f},
+ "args": [{}]
+ }})", whoami, weight, loc);
+ return monc->run_command(std::move(cmd), {});
+ });
+ });
+ }).then([](auto&& command_result) {
+ [[maybe_unused]] auto [code, message, out] = std::move(command_result);
+ if (code) {
+      logger().warn("failed to add to crush: {} ({})", message, code);
+      throw std::runtime_error("failed to add to crush");
+ } else {
+ logger().info("added to crush: {}", message);
+ }
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSD::handle_command(
+ crimson::net::ConnectionRef conn,
+ Ref<MCommand> m)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return asok->handle_command(conn, std::move(m));
+}
+
+/*
+  The OSD's AdminSocket object created here has two servers (i.e. blocks of
+  commands to handle) registered to it:
+  - OSD-specific commands, handled by the OSD object itself;
+  - a set of common commands, handled directly by the AdminSocket object.
+*/
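+// Example (paths and command names here are illustrative, not taken from this
+// code): once the server below is running, the registered hooks can be driven
+// through the admin socket, e.g.
+//   ceph daemon /var/run/ceph/ceph-osd.<id>.asok status
+// with the socket path coming from the "admin_socket" option read below.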
+seastar::future<> OSD::start_asok_admin()
+{
+ auto asok_path = local_conf().get_val<std::string>("admin_socket");
+ using namespace crimson::admin;
+ return asok->start(asok_path).then([this] {
+ asok->register_admin_commands();
+ asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this)));
+ asok->register_command(make_asok_hook<SendBeaconHook>(*this));
+ asok->register_command(make_asok_hook<FlushPgStatsHook>(*this));
+ asok->register_command(
+ make_asok_hook<DumpPGStateHistory>(std::as_const(pg_shard_manager)));
+ asok->register_command(make_asok_hook<DumpMetricsHook>());
+ asok->register_command(make_asok_hook<DumpPerfCountersHook>());
+ asok->register_command(make_asok_hook<InjectDataErrorHook>(get_shard_services()));
+ asok->register_command(make_asok_hook<InjectMDataErrorHook>(get_shard_services()));
+ // PG commands
+ asok->register_command(make_asok_hook<pg::QueryCommand>(*this));
+ asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this));
+ // ops commands
+ asok->register_command(
+ make_asok_hook<DumpInFlightOpsHook>(
+ std::as_const(pg_shard_manager)));
+ asok->register_command(
+ make_asok_hook<DumpHistoricOpsHook>(
+ std::as_const(get_shard_services().get_registry())));
+ asok->register_command(
+ make_asok_hook<DumpSlowestHistoricOpsHook>(
+ std::as_const(get_shard_services().get_registry())));
+ asok->register_command(
+ make_asok_hook<DumpRecoveryReservationsHook>(get_shard_services()));
+ });
+}
+
+seastar::future<> OSD::stop()
+{
+ logger().info("stop");
+ beacon_timer.cancel();
+ tick_timer.cancel();
+ // see also OSD::shutdown()
+ return prepare_to_stop().then([this] {
+ return pg_shard_manager.set_stopping();
+ }).then([this] {
+ logger().debug("prepared to stop");
+ public_msgr->stop();
+ cluster_msgr->stop();
+ auto gate_close_fut = gate.close();
+ return asok->stop().then([this] {
+ return heartbeat->stop();
+ }).then([this] {
+ return pg_shard_manager.stop_registries();
+ }).then([this] {
+ return store.umount();
+ }).then([this] {
+ return store.stop();
+ }).then([this] {
+ return pg_shard_manager.stop_pgs();
+ }).then([this] {
+ return monc->stop();
+ }).then([this] {
+ return mgrc->stop();
+ }).then([this] {
+ return shard_services.stop();
+ }).then([this] {
+ return osd_states.stop();
+ }).then([this] {
+ return osd_singleton_state.stop();
+ }).then([this] {
+ return pg_to_shard_mappings.stop();
+ }).then([fut=std::move(gate_close_fut)]() mutable {
+ return std::move(fut);
+ }).then([this] {
+ return when_all_succeed(
+ public_msgr->shutdown(),
+ cluster_msgr->shutdown()).discard_result();
+ }).handle_exception([](auto ep) {
+ logger().error("error while stopping osd: {}", ep);
+ });
+ });
+}
+
+void OSD::dump_status(Formatter* f) const
+{
+ f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
+ f->dump_stream("osd_fsid") << superblock.osd_fsid;
+ f->dump_unsigned("whoami", superblock.whoami);
+ f->dump_string("state", pg_shard_manager.get_osd_state_string());
+ f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_unsigned("cluster_osdmap_trim_lower_bound",
+ superblock.cluster_osdmap_trim_lower_bound);
+ f->dump_unsigned("newest_map", superblock.newest_map);
+ f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs());
+}
+
+void OSD::print(std::ostream& out) const
+{
+ out << "{osd." << superblock.whoami << " "
+ << superblock.osd_fsid << " [" << superblock.oldest_map
+ << "," << superblock.newest_map << "] "
+ << "tlb:" << superblock.cluster_osdmap_trim_lower_bound
+ << " pgs:" << pg_shard_manager.get_num_pgs()
+ << "}";
+}
+
+std::optional<seastar::future<>>
+OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ if (pg_shard_manager.is_stopping()) {
+ return seastar::now();
+ }
+ auto maybe_ret = do_ms_dispatch(conn, std::move(m));
+ if (!maybe_ret.has_value()) {
+ return std::nullopt;
+ }
+
+ gate.dispatch_in_background(
+ __func__, *this, [ret=std::move(maybe_ret.value())]() mutable {
+ return std::move(ret);
+ });
+ return seastar::now();
+}
+
+std::optional<seastar::future<>>
+OSD::do_ms_dispatch(
+ crimson::net::ConnectionRef conn,
+ MessageRef m)
+{
+ if (seastar::this_shard_id() != PRIMARY_CORE) {
+ switch (m->get_type()) {
+ case CEPH_MSG_OSD_MAP:
+ case MSG_COMMAND:
+ case MSG_OSD_MARK_ME_DOWN:
+ // FIXME: order is not guaranteed in this path
+ return conn.get_foreign(
+ ).then([this, m=std::move(m)](auto f_conn) {
+ return seastar::smp::submit_to(PRIMARY_CORE,
+ [f_conn=std::move(f_conn), m=std::move(m), this]() mutable {
+ auto conn = make_local_shared_foreign(std::move(f_conn));
+ auto ret = do_ms_dispatch(conn, std::move(m));
+ assert(ret.has_value());
+ return std::move(ret.value());
+ });
+ });
+ }
+ }
+
+ switch (m->get_type()) {
+ case CEPH_MSG_OSD_MAP:
+ return handle_osd_map(boost::static_pointer_cast<MOSDMap>(m));
+ case CEPH_MSG_OSD_OP:
+ return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m));
+ case MSG_OSD_PG_CREATE2:
+ return handle_pg_create(
+ conn, boost::static_pointer_cast<MOSDPGCreate2>(m));
+ case MSG_COMMAND:
+ return handle_command(conn, boost::static_pointer_cast<MCommand>(m));
+ case MSG_OSD_MARK_ME_DOWN:
+ return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m));
+ case MSG_OSD_PG_PULL:
+ [[fallthrough]];
+ case MSG_OSD_PG_PUSH:
+ [[fallthrough]];
+ case MSG_OSD_PG_PUSH_REPLY:
+ [[fallthrough]];
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ [[fallthrough]];
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ [[fallthrough]];
+ case MSG_OSD_PG_SCAN:
+ [[fallthrough]];
+ case MSG_OSD_PG_BACKFILL:
+ [[fallthrough]];
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m));
+ case MSG_OSD_PG_LEASE:
+ [[fallthrough]];
+ case MSG_OSD_PG_LEASE_ACK:
+ [[fallthrough]];
+ case MSG_OSD_PG_NOTIFY2:
+ [[fallthrough]];
+ case MSG_OSD_PG_INFO2:
+ [[fallthrough]];
+ case MSG_OSD_PG_QUERY2:
+ [[fallthrough]];
+ case MSG_OSD_BACKFILL_RESERVE:
+ [[fallthrough]];
+ case MSG_OSD_RECOVERY_RESERVE:
+ [[fallthrough]];
+ case MSG_OSD_PG_LOG:
+ return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m));
+ case MSG_OSD_REPOP:
+ return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m));
+ case MSG_OSD_REPOPREPLY:
+ return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m));
+ case MSG_OSD_SCRUB2:
+ return handle_scrub(conn, boost::static_pointer_cast<MOSDScrub2>(m));
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ return handle_update_log_missing(conn, boost::static_pointer_cast<
+ MOSDPGUpdateLogMissing>(m));
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ return handle_update_log_missing_reply(conn, boost::static_pointer_cast<
+ MOSDPGUpdateLogMissingReply>(m));
+ default:
+ return std::nullopt;
+ }
+}
+
+void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ // TODO: cleanup the session attached to this connection
+ logger().warn("ms_handle_reset");
+}
+
+void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn)
+{
+ logger().warn("ms_handle_remote_reset");
+}
+
+void OSD::handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps_info)
+{
+ // TODO: store the parsed cap and associate it with the connection
+ if (caps_info.allow_all) {
+ logger().debug("{} {} has all caps", __func__, name);
+ return;
+ }
+ if (caps_info.caps.length() > 0) {
+ auto p = caps_info.caps.cbegin();
+ string str;
+ try {
+ decode(str, p);
+ } catch (ceph::buffer::error& e) {
+ logger().warn("{} {} failed to decode caps string", __func__, name);
+ return;
+ }
+ OSDCap caps;
+ if (caps.parse(str)) {
+ logger().debug("{} {} has caps {}", __func__, name, str);
+ } else {
+ logger().warn("{} {} failed to parse caps {}", __func__, name, str);
+ }
+ }
+}
+
+void OSD::update_stats()
+{
+ osd_stat_seq++;
+ osd_stat.up_from = get_shard_services().get_up_epoch();
+ osd_stat.hb_peers = heartbeat->get_peers();
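+  // pack the up epoch into the high 32 bits and the per-process counter into
+  // the low 32 bits, keeping the reported seq monotonic across restarts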
+ osd_stat.seq = (
+ static_cast<uint64_t>(get_shard_services().get_up_epoch()) << 32
+ ) | osd_stat_seq;
+ gate.dispatch_in_background("statfs", *this, [this] {
+ (void) store.stat().then([this](store_statfs_t&& st) {
+ osd_stat.statfs = st;
+ });
+ });
+}
+
+seastar::future<MessageURef> OSD::get_stats() const
+{
+ // MPGStats::had_map_for is not used since PGMonitor was removed
+ auto m = crimson::make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch());
+ m->osd_stat = osd_stat;
+ return pg_shard_manager.get_pg_stats(
+ ).then([m=std::move(m)](auto &&stats) mutable {
+ m->pg_stat = std::move(stats);
+ return seastar::make_ready_future<MessageURef>(std::move(m));
+ });
+}
+
+uint64_t OSD::send_pg_stats()
+{
+ // mgr client sends the report message in background
+ mgrc->report();
+ return osd_stat.seq;
+}
+
+seastar::future<> OSD::handle_osd_map(Ref<MOSDMap> m)
+{
+ /* Ensure that only one MOSDMap is processed at a time. Allowing concurrent
+ * processing may eventually be worthwhile, but such an implementation would
+ * need to ensure (among other things)
+ * 1. any particular map is only processed once
+ * 2. PGAdvanceMap operations are processed in order for each PG
+ * As map handling is not presently a bottleneck, we stick to this
+ * simpler invariant for now.
+ * See https://tracker.ceph.com/issues/59165
+ */
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return handle_osd_map_lock.lock().then([this, m] {
+ return _handle_osd_map(m);
+ }).finally([this] {
+ return handle_osd_map_lock.unlock();
+ });
+}
+
+seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m)
+{
+ logger().info("handle_osd_map {}", *m);
+ if (m->fsid != superblock.cluster_fsid) {
+ logger().warn("fsid mismatched");
+ return seastar::now();
+ }
+ if (pg_shard_manager.is_initializing()) {
+ logger().warn("i am still initializing");
+ return seastar::now();
+ }
+
+ const auto first = m->get_first();
+ const auto last = m->get_last();
+ logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]",
+ first, last, superblock.newest_map,
+ m->cluster_osdmap_trim_lower_bound, m->newest_map);
+ // make sure there is something new, here, before we bother flushing
+ // the queues and such
+ if (last <= superblock.newest_map) {
+ return seastar::now();
+ }
+ // missing some?
+ bool skip_maps = false;
+ epoch_t start = superblock.newest_map + 1;
+ if (first > start) {
+ logger().info("handle_osd_map message skips epochs {}..{}",
+ start, first - 1);
+ if (m->cluster_osdmap_trim_lower_bound <= start) {
+ return get_shard_services().osdmap_subscribe(start, false);
+ }
+ // always try to get the full range of maps--as many as we can. this
+ // 1- is good to have
+ // 2- is at present the only way to ensure that we get a *full* map as
+ // the first map!
+ if (m->cluster_osdmap_trim_lower_bound < first) {
+ return get_shard_services().osdmap_subscribe(
+ m->cluster_osdmap_trim_lower_bound - 1, true);
+ }
+ skip_maps = true;
+ start = first;
+ }
+
+ return seastar::do_with(ceph::os::Transaction{},
+ [=, this](auto& t) {
+ return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] {
+ // even if this map isn't from a mon, we may have satisfied our subscription
+ monc->sub_got("osdmap", last);
+ if (!superblock.oldest_map || skip_maps) {
+ superblock.oldest_map = first;
+ }
+ superblock.newest_map = last;
+ superblock.current_epoch = last;
+
+ // note in the superblock that we were clean thru the prior epoch
+ if (boot_epoch && boot_epoch >= superblock.mounted) {
+ superblock.mounted = boot_epoch;
+ superblock.clean_thru = last;
+ }
+ pg_shard_manager.get_meta_coll().store_superblock(t, superblock);
+ pg_shard_manager.set_superblock(superblock);
+ logger().debug("OSD::handle_osd_map: do_transaction...");
+ return store.get_sharded_store().do_transaction(
+ pg_shard_manager.get_meta_coll().collection(),
+ std::move(t));
+ });
+ }).then([=, this] {
+ // TODO: write to superblock and commit the transaction
+ return committed_osd_maps(start, last, m);
+ });
+}
+
+seastar::future<> OSD::committed_osd_maps(
+ version_t first,
+ version_t last,
+ Ref<MOSDMap> m)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last);
+ // advance through the new maps
+ return seastar::do_for_each(boost::make_counting_iterator(first),
+ boost::make_counting_iterator(last + 1),
+ [this](epoch_t cur) {
+ return pg_shard_manager.get_local_map(
+ cur
+ ).then([this](OSDMapService::local_cached_map_t&& o) {
+ osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(o));
+ return pg_shard_manager.update_map(std::move(o));
+ }).then([this] {
+ if (get_shard_services().get_up_epoch() == 0 &&
+ osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) {
+ return pg_shard_manager.set_up_epoch(
+ osdmap->get_epoch()
+ ).then([this] {
+ if (!boot_epoch) {
+ boot_epoch = osdmap->get_epoch();
+ }
+ });
+ } else {
+ return seastar::now();
+ }
+ });
+ }).then([m, this] {
+ auto fut = seastar::now();
+ if (osdmap->is_up(whoami)) {
+ const auto up_from = osdmap->get_up_from(whoami);
+ logger().info("osd.{}: map e {} marked me up: up_from {}, bind_epoch {}, state {}",
+ whoami, osdmap->get_epoch(), up_from, bind_epoch,
+ pg_shard_manager.get_osd_state_string());
+ if (bind_epoch < up_from &&
+ osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() &&
+ pg_shard_manager.is_booting()) {
+ logger().info("osd.{}: activating...", whoami);
+ fut = pg_shard_manager.set_active().then([this] {
+ beacon_timer.arm_periodic(
+ std::chrono::seconds(local_conf()->osd_beacon_report_interval));
+ // timer continuation rearms when complete
+ tick_timer.arm(
+ std::chrono::seconds(TICK_INTERVAL));
+ });
+ }
+ } else {
+ if (pg_shard_manager.is_prestop()) {
+ got_stop_ack();
+ return seastar::now();
+ }
+ }
+ return fut.then([this] {
+ return check_osdmap_features().then([this] {
+ // yay!
+ logger().info("osd.{}: committed_osd_maps: broadcasting osdmaps up"
+ " to {} epoch to pgs", whoami, osdmap->get_epoch());
+ return pg_shard_manager.broadcast_map_to_pgs(osdmap->get_epoch());
+ });
+ });
+ }).then([m, this] {
+ if (pg_shard_manager.is_active()) {
+ logger().info("osd.{}: now active", whoami);
+ if (!osdmap->exists(whoami) ||
+ osdmap->is_stop(whoami)) {
+ return shutdown();
+ }
+ if (should_restart()) {
+ return restart();
+ } else {
+ return seastar::now();
+ }
+ } else if (pg_shard_manager.is_preboot()) {
+ logger().info("osd.{}: now preboot", whoami);
+
+ if (m->get_source().is_mon()) {
+ return _preboot(
+ m->cluster_osdmap_trim_lower_bound, m->newest_map);
+ } else {
+ logger().info("osd.{}: start_boot", whoami);
+ return start_boot();
+ }
+ } else {
+ logger().info("osd.{}: now {}", whoami,
+ pg_shard_manager.get_osd_state_string());
+ // XXX
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<> OSD::handle_osd_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> m)
+{
+ return pg_shard_manager.start_pg_operation<ClientRequest>(
+ get_shard_services(),
+ conn,
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_pg_create(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGCreate2> m)
+{
+ return seastar::do_for_each(m->pgs, [this, conn, m](auto& pg) {
+ auto& [pgid, when] = pg;
+ const auto &[created, created_stamp] = when;
+ auto q = m->pg_extra.find(pgid);
+ ceph_assert(q != m->pg_extra.end());
+ auto& [history, pi] = q->second;
+ logger().debug(
+ "{}: {} e{} @{} "
+ "history {} pi {}",
+ __func__, pgid, created, created_stamp,
+ history, pi);
+ if (!pi.empty() &&
+ m->epoch < pi.get_bounds().second) {
+ logger().error(
+ "got pg_create on {} epoch {} "
+ "unmatched past_intervals {} (history {})",
+ pgid, m->epoch,
+ pi, history);
+ return seastar::now();
+ } else {
+ return pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ pg_shard_t(),
+ pgid,
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(pgid, m->epoch, history, pi, true)).second;
+ }
+ });
+}
+
+seastar::future<> OSD::handle_update_log_missing(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissing> m)
+{
+ m->decode_payload();
+ return pg_shard_manager.start_pg_operation<LogMissingRequest>(
+ std::move(conn),
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_update_log_missing_reply(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissingReply> m)
+{
+ m->decode_payload();
+ return pg_shard_manager.start_pg_operation<LogMissingRequestReply>(
+ std::move(conn),
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_rep_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOp> m)
+{
+ m->finish_decode();
+ return pg_shard_manager.start_pg_operation<RepRequest>(
+ std::move(conn),
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_rep_op_reply(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOpReply> m)
+{
+ spg_t pgid = m->get_spg();
+ return pg_shard_manager.with_pg(
+ pgid,
+ [m=std::move(m)](auto &&pg) {
+ if (pg) {
+ m->finish_decode();
+ pg->handle_rep_op_reply(*m);
+ } else {
+ logger().warn("stale reply: {}", *m);
+ }
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSD::handle_scrub(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDScrub2> m)
+{
+ if (m->fsid != superblock.cluster_fsid) {
+ logger().warn("fsid mismatched");
+ return seastar::now();
+ }
+ return seastar::parallel_for_each(std::move(m->scrub_pgs),
+ [m, conn, this](spg_t pgid) {
+ pg_shard_t from_shard{static_cast<int>(m->get_source().num()),
+ pgid.shard};
+ PeeringState::RequestScrub scrub_request{m->deep, m->repair};
+ return pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ from_shard,
+ pgid,
+ PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second;
+ });
+}
+
+seastar::future<> OSD::handle_mark_me_down(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDMarkMeDown> m)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ if (pg_shard_manager.is_prestop()) {
+ got_stop_ack();
+ }
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_recovery_subreq(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp> m)
+{
+ return pg_shard_manager.start_pg_operation<RecoverySubRequest>(
+ conn, std::move(m)).second;
+}
+
+bool OSD::should_restart() const
+{
+ if (!osdmap->is_up(whoami)) {
+ logger().info("map e {} marked osd.{} down",
+ osdmap->get_epoch(), whoami);
+ return true;
+ } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) {
+ logger().error("map e {} had wrong client addr ({} != my {})",
+ osdmap->get_epoch(),
+ osdmap->get_addrs(whoami),
+ public_msgr->get_myaddrs());
+ return true;
+ } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) {
+ logger().error("map e {} had wrong cluster addr ({} != my {})",
+ osdmap->get_epoch(),
+ osdmap->get_cluster_addrs(whoami),
+ cluster_msgr->get_myaddrs());
+ return true;
+ } else {
+ return false;
+ }
+}
+
+seastar::future<> OSD::restart()
+{
+ beacon_timer.cancel();
+ tick_timer.cancel();
+ return pg_shard_manager.set_up_epoch(
+ 0
+ ).then([this] {
+ bind_epoch = osdmap->get_epoch();
+ // TODO: promote to shutdown if being marked down for multiple times
+ // rebind messengers
+ return start_boot();
+ });
+}
+
+seastar::future<> OSD::shutdown()
+{
+ logger().info("shutting down per osdmap");
+ abort_source.request_abort();
+ return seastar::now();
+}
+
+seastar::future<> OSD::send_beacon()
+{
+ if (!pg_shard_manager.is_active()) {
+ return seastar::now();
+ }
+ // FIXME: min lec should be calculated from pg_stat
+ // and should set m->pgs
+ epoch_t min_last_epoch_clean = osdmap->get_epoch();
+ auto m = crimson::make_message<MOSDBeacon>(osdmap->get_epoch(),
+ min_last_epoch_clean,
+ superblock.last_purged_snaps_scrub,
+ local_conf()->osd_beacon_report_interval);
+ return monc->send_message(std::move(m));
+}
+
+seastar::future<> OSD::update_heartbeat_peers()
+{
+ if (!pg_shard_manager.is_active()) {
+    return seastar::now();
+ }
+
+ pg_shard_manager.for_each_pgid([this](auto &pgid) {
+ vector<int> up, acting;
+ osdmap->pg_to_up_acting_osds(pgid.pgid,
+ &up, nullptr,
+ &acting, nullptr);
+ for (int osd : boost::join(up, acting)) {
+ if (osd == CRUSH_ITEM_NONE || osd == whoami) {
+ continue;
+ } else {
+ heartbeat->add_peer(osd, osdmap->get_epoch());
+ }
+ }
+ });
+ heartbeat->update_peers(whoami);
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_peering_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPeeringOp> m)
+{
+ const int from = m->get_source().num();
+ logger().debug("handle_peering_op on {} from {}", m->get_spg(), from);
+ m->set_features(conn->get_features());
+ std::unique_ptr<PGPeeringEvent> evt(m->get_event());
+ return pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ pg_shard_t{from, m->get_spg().shard},
+ m->get_spg(),
+ std::move(*evt)).second;
+}
+
+seastar::future<> OSD::check_osdmap_features()
+{
+ assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.write_meta(
+ "require_osd_release",
+ stringify((int)osdmap->require_osd_release));
+}
+
+seastar::future<> OSD::prepare_to_stop()
+{
+ if (osdmap && osdmap->is_up(whoami)) {
+ pg_shard_manager.set_prestop();
+ const auto timeout =
+ std::chrono::duration_cast<std::chrono::milliseconds>(
+ std::chrono::duration<double>(
+ local_conf().get_val<double>("osd_mon_shutdown_timeout")));
+
+ return seastar::with_timeout(
+ seastar::timer<>::clock::now() + timeout,
+ monc->send_message(
+ crimson::make_message<MOSDMarkMeDown>(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_addrs(whoami),
+ osdmap->get_epoch(),
+ true)).then([this] {
+ return stop_acked.get_future();
+ })
+ ).handle_exception_type(
+ [](seastar::timed_out_error&) {
+ return seastar::now();
+ });
+ }
+ return seastar::now();
+}
+
+}
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
new file mode 100644
index 000000000..10ff60d47
--- /dev/null
+++ b/src/crimson/osd/osd.h
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/timer.hh>
+
+#include "crimson/common/logclient.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/auth_handler.h"
+#include "crimson/common/gated.h"
+#include "crimson/admin/admin_socket.h"
+#include "crimson/common/simple_lru.h"
+#include "crimson/mgr/client.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/pg_shard_manager.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/state.h"
+
+#include "messages/MOSDOp.h"
+#include "osd/PeeringState.h"
+#include "osd/osd_types.h"
+#include "osd/osd_perf_counters.h"
+#include "osd/PGPeeringEvent.h"
+
+class MCommand;
+class MOSDMap;
+class MOSDRepOpReply;
+class MOSDRepOp;
+class MOSDScrub2;
+class OSDMeta;
+class Heartbeat;
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+class PG;
+
+class OSD final : public crimson::net::Dispatcher,
+ private crimson::common::AuthHandler,
+ private crimson::mgr::WithStats {
+ const int whoami;
+ const uint32_t nonce;
+ seastar::abort_source& abort_source;
+ seastar::timer<seastar::lowres_clock> beacon_timer;
+ // talk with osd
+ crimson::net::MessengerRef cluster_msgr;
+ // talk with client/mon/mgr
+ crimson::net::MessengerRef public_msgr;
+
+ // HB Messengers
+ crimson::net::MessengerRef hb_front_msgr;
+ crimson::net::MessengerRef hb_back_msgr;
+
+ std::unique_ptr<crimson::mon::Client> monc;
+ std::unique_ptr<crimson::mgr::Client> mgrc;
+
+ // TODO: use a wrapper for ObjectStore
+ OSDMapService::cached_map_t osdmap;
+ crimson::os::FuturizedStore& store;
+
+ /// _first_ epoch we were marked up (after this process started)
+ epoch_t boot_epoch = 0;
+  /// epoch we last did a bind to new ip:ports
+  epoch_t bind_epoch = 0;
+  /// epoch since which there have been no pending pg creates from the mon
+ epoch_t last_pg_create_epoch = 0;
+
+ ceph::mono_time startup_time;
+
+ seastar::shared_mutex handle_osd_map_lock;
+
+ OSDSuperblock superblock;
+
+ // Dispatcher methods
+ std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final;
+ void ms_handle_remote_reset(crimson::net::ConnectionRef conn) final;
+
+ std::optional<seastar::future<>> do_ms_dispatch(crimson::net::ConnectionRef, MessageRef);
+
+ // mgr::WithStats methods
+ // pg statistics including osd ones
+ osd_stat_t osd_stat;
+ uint32_t osd_stat_seq = 0;
+ void update_stats();
+ seastar::future<MessageURef> get_stats() const final;
+
+ // AuthHandler methods
+ void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps) final;
+
+ seastar::sharded<PGShardMapping> pg_to_shard_mappings;
+ seastar::sharded<OSDSingletonState> osd_singleton_state;
+ seastar::sharded<OSDState> osd_states;
+ seastar::sharded<ShardServices> shard_services;
+
+ crimson::osd::PGShardManager pg_shard_manager;
+
+ std::unique_ptr<Heartbeat> heartbeat;
+ seastar::timer<seastar::lowres_clock> tick_timer;
+
+ // admin-socket
+ seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok;
+
+public:
+ OSD(int id, uint32_t nonce,
+ seastar::abort_source& abort_source,
+ crimson::os::FuturizedStore& store,
+ crimson::net::MessengerRef cluster_msgr,
+ crimson::net::MessengerRef client_msgr,
+ crimson::net::MessengerRef hb_front_msgr,
+ crimson::net::MessengerRef hb_back_msgr);
+ ~OSD() final;
+
+ auto &get_pg_shard_manager() {
+ return pg_shard_manager;
+ }
+
+ seastar::future<> open_meta_coll();
+ static seastar::future<OSDMeta> open_or_create_meta_coll(
+ crimson::os::FuturizedStore &store
+ );
+ static seastar::future<> mkfs(
+ crimson::os::FuturizedStore &store,
+ unsigned whoami,
+ uuid_d osd_uuid,
+ uuid_d cluster_fsid,
+ std::string osdspec_affinity);
+
+ seastar::future<> start();
+ seastar::future<> stop();
+
+ void dump_status(Formatter*) const;
+ void print(std::ostream&) const;
+
+ /// @return the seq id of the pg stats being sent
+ uint64_t send_pg_stats();
+
+ auto &get_shard_services() {
+ return shard_services.local();
+ }
+
+private:
+ static seastar::future<> _write_superblock(
+ crimson::os::FuturizedStore &store,
+ OSDMeta meta,
+ OSDSuperblock superblock);
+ static seastar::future<> _write_key_meta(
+ crimson::os::FuturizedStore &store
+ );
+ seastar::future<> start_boot();
+ seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap);
+ seastar::future<> _send_boot();
+ seastar::future<> _add_me_to_crush();
+
+ seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
+
+ seastar::future<> start_asok_admin();
+
+ void write_superblock(ceph::os::Transaction& t);
+ seastar::future<> read_superblock();
+
+ seastar::future<> handle_osd_map(Ref<MOSDMap> m);
+ seastar::future<> _handle_osd_map(Ref<MOSDMap> m);
+ seastar::future<> handle_pg_create(crimson::net::ConnectionRef conn,
+ Ref<MOSDPGCreate2> m);
+ seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> m);
+ seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOp> m);
+ seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOpReply> m);
+ seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDPeeringOp> m);
+ seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp> m);
+ seastar::future<> handle_scrub(crimson::net::ConnectionRef conn,
+ Ref<MOSDScrub2> m);
+ seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn,
+ Ref<MOSDMarkMeDown> m);
+
+ seastar::future<> committed_osd_maps(version_t first,
+ version_t last,
+ Ref<MOSDMap> m);
+
+ seastar::future<> check_osdmap_features();
+
+ seastar::future<> handle_command(crimson::net::ConnectionRef conn,
+ Ref<MCommand> m);
+ seastar::future<> handle_update_log_missing(crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissing> m);
+ seastar::future<> handle_update_log_missing_reply(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissingReply> m);
+
+private:
+ crimson::common::Gated gate;
+
+ seastar::promise<> stop_acked;
+ void got_stop_ack() {
+ stop_acked.set_value();
+ }
+ seastar::future<> prepare_to_stop();
+ bool should_restart() const;
+ seastar::future<> restart();
+ seastar::future<> shutdown();
+ seastar::future<> update_heartbeat_peers();
+ friend class PGAdvanceMap;
+
+public:
+ seastar::future<> send_beacon();
+
+private:
+ LogClient log_client;
+ LogChannelRef clog;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const OSD& osd) {
+ osd.print(out);
+ return out;
+}
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::OSD> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h
new file mode 100644
index 000000000..69edf94b8
--- /dev/null
+++ b/src/crimson/osd/osd_connection_priv.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+
+namespace crimson::osd {
+
+struct OSDConnectionPriv : public crimson::net::Connection::user_private_t {
+ ConnectionPipeline client_request_conn_pipeline;
+ ConnectionPipeline peering_request_conn_pipeline;
+ ConnectionPipeline replicated_request_conn_pipeline;
+};
+
+static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) {
+ if (!conn->has_user_private()) {
+ conn->set_user_private(std::make_unique<OSDConnectionPriv>());
+ }
+ return static_cast<OSDConnectionPriv&>(conn->get_user_private());
+}
+
+}
diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc
new file mode 100644
index 000000000..e40b2b246
--- /dev/null
+++ b/src/crimson/osd/osd_meta.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_meta.h"
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "os/Transaction.h"
+
+using std::string;
+using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
+
+void OSDMeta::create(ceph::os::Transaction& t)
+{
+ t.create_collection(coll->get_cid(), 0);
+}
+
+void OSDMeta::store_map(ceph::os::Transaction& t,
+ epoch_t e, const bufferlist& m)
+{
+ t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m);
+}
+
+seastar::future<bufferlist> OSDMeta::load_map(epoch_t e)
+{
+ return store.read(coll,
+ osdmap_oid(e), 0, 0,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error(
+ read_errorator::all_same_way([e] {
+ ceph_abort_msg(fmt::format("{} read gave enoent on {}",
+ __func__, osdmap_oid(e)));
+ }));
+}
+
+void OSDMeta::store_superblock(ceph::os::Transaction& t,
+ const OSDSuperblock& superblock)
+{
+ bufferlist bl;
+ encode(superblock, bl);
+ t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl);
+}
+
+OSDMeta::load_superblock_ret OSDMeta::load_superblock()
+{
+ return store.read(
+ coll, superblock_oid(), 0, 0
+ ).safe_then([] (bufferlist&& bl) {
+ auto p = bl.cbegin();
+ OSDSuperblock superblock;
+ decode(superblock, p);
+ return seastar::make_ready_future<OSDSuperblock>(std::move(superblock));
+ });
+}
+
+seastar::future<std::tuple<pg_pool_t,
+ std::string,
+ OSDMeta::ec_profile_t>>
+OSDMeta::load_final_pool_info(int64_t pool) {
+ return store.read(coll, final_pool_info_oid(pool),
+ 0, 0).safe_then([] (bufferlist&& bl) {
+ auto p = bl.cbegin();
+ pg_pool_t pi;
+ string name;
+ ec_profile_t ec_profile;
+ decode(pi, p);
+ decode(name, p);
+ decode(ec_profile, p);
+ return seastar::make_ready_future<std::tuple<pg_pool_t,
+ string,
+ ec_profile_t>>(
+ std::make_tuple(std::move(pi),
+ std::move(name),
+ std::move(ec_profile)));
+ },read_errorator::all_same_way([pool] {
+ throw std::runtime_error(fmt::format("read gave enoent on {}",
+ final_pool_info_oid(pool)));
+ }));
+}
+
+ghobject_t OSDMeta::osdmap_oid(epoch_t epoch)
+{
+ string name = fmt::format("osdmap.{}", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(name), 0)));
+}
+
+ghobject_t OSDMeta::final_pool_info_oid(int64_t pool)
+{
+ string name = fmt::format("final_pool_{}", pool);
+ return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP)));
+}
+
+ghobject_t OSDMeta::superblock_oid()
+{
+ return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)));
+}
diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h
new file mode 100644
index 000000000..652266d9e
--- /dev/null
+++ b/src/crimson/osd/osd_meta.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <seastar/core/future.hh>
+#include "osd/osd_types.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::os {
+ class FuturizedCollection;
+ class FuturizedStore;
+}
+
+/// metadata shared across PGs, or put in another way,
+/// metadata not specific to certain PGs.
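+/// Concretely (see osd_meta.cc) this covers the per-epoch osdmap blobs
+/// ("osdmap.<epoch>"), the final pool info records ("final_pool_<id>") and
+/// the OSD superblock ("osd_superblock").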
+class OSDMeta {
+ template<typename T> using Ref = boost::intrusive_ptr<T>;
+
+ crimson::os::FuturizedStore::Shard& store;
+ Ref<crimson::os::FuturizedCollection> coll;
+
+public:
+ OSDMeta(Ref<crimson::os::FuturizedCollection> coll,
+ crimson::os::FuturizedStore::Shard& store)
+ : store{store}, coll{coll}
+ {}
+
+ auto collection() {
+ return coll;
+ }
+ void create(ceph::os::Transaction& t);
+
+ void store_map(ceph::os::Transaction& t,
+ epoch_t e, const bufferlist& m);
+ seastar::future<bufferlist> load_map(epoch_t e);
+
+ void store_superblock(ceph::os::Transaction& t,
+ const OSDSuperblock& sb);
+
+ using load_superblock_ertr = crimson::os::FuturizedStore::Shard::read_errorator;
+ using load_superblock_ret = load_superblock_ertr::future<OSDSuperblock>;
+ load_superblock_ret load_superblock();
+
+ using ec_profile_t = std::map<std::string, std::string>;
+ seastar::future<std::tuple<pg_pool_t,
+ std::string,
+ ec_profile_t>> load_final_pool_info(int64_t pool);
+private:
+ static ghobject_t osdmap_oid(epoch_t epoch);
+ static ghobject_t final_pool_info_oid(int64_t pool);
+ static ghobject_t superblock_oid();
+};
diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc
new file mode 100644
index 000000000..920fdc114
--- /dev/null
+++ b/src/crimson/osd/osd_operation.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_operation.h"
+#include "common/Formatter.h"
+#include "crimson/common/log.h"
+#include "crimson/osd/osd_operations/client_request.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+void OSDOperationRegistry::do_stop()
+{
+ logger().info("OSDOperationRegistry::{}", __func__);
+ // we need to decouple visiting the registry from destructing
+ // ops because of the auto-unlink feature of boost::intrusive.
+  // the list shouldn't change while iterating due to constraints
+  // on iterator validity.
+ constexpr auto historic_reg_index =
+ static_cast<size_t>(OperationTypeCode::historic_client_request);
+ auto& historic_registry = get_registry<historic_reg_index>();
+ std::vector<ClientRequest::ICRef> to_ref_down;
+ std::transform(std::begin(historic_registry), std::end(historic_registry),
+ std::back_inserter(to_ref_down),
+ [] (const Operation& op) {
+ return ClientRequest::ICRef{
+ static_cast<const ClientRequest*>(&op),
+ /* add_ref= */ false
+ };
+ });
+ last_of_recents = std::end(historic_registry);
+  // to_ref_down goes out of scope here, dropping the references it collected
+}
+
+OSDOperationRegistry::OSDOperationRegistry()
+ : OperationRegistryT(seastar::this_shard_id())
+{
+ constexpr auto historic_reg_index =
+ static_cast<size_t>(OperationTypeCode::historic_client_request);
+ auto& historic_registry = get_registry<historic_reg_index>();
+ last_of_recents = std::begin(historic_registry);
+}
+
+static auto get_duration(const ClientRequest& client_request)
+{
+ // TODO: consider enhancing `CompletionEvent` with computing duration
+  // once -- when it's entered.
+ return client_request.get_completed() - client_request.get_started();
+}
+
+void OSDOperationRegistry::put_historic(const ClientRequest& op)
+{
+  // unlink the op from the client request registry. this is part of the
+  // re-link procedure; the op will finally end up in the historic registry.
+ constexpr auto client_reg_index =
+ static_cast<size_t>(OperationTypeCode::client_request);
+ constexpr auto historic_reg_index =
+ static_cast<size_t>(OperationTypeCode::historic_client_request);
+ auto& client_registry = get_registry<client_reg_index>();
+ auto& historic_registry = get_registry<historic_reg_index>();
+ historic_registry.splice(std::end(historic_registry),
+ client_registry,
+ client_registry.iterator_to(op));
+ ClientRequest::ICRef(
+ &op, /* add_ref= */true
+ ).detach(); // yes, "leak" it for now!
+
+  // check whether the history size limit has been exceeded; if so,
+  // purge the oldest op.
+ // NOTE: Operation uses the auto-unlink feature of boost::intrusive.
+ // NOTE: the cleaning happens in OSDOperationRegistry::do_stop()
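+  // net effect: the historic registry keeps up to osd_op_history_size of the
+  // most recently completed requests plus up to osd_op_history_slow_op_size
+  // of the slowest older ones; anything beyond that is purged below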
+ using crimson::common::local_conf;
+ if (num_recent_ops >= local_conf()->osd_op_history_size) {
+ ++last_of_recents;
+ ++num_slow_ops;
+ } else {
+ ++num_recent_ops;
+ }
+ if (num_slow_ops > local_conf()->osd_op_history_slow_op_size) {
+    // we're interested in keeping the slowest ops. if the slow-op history
+    // is disabled, the list will have only one element, so the full-blown
+    // search boils down to `.front()`.
+ const auto fastest_historic_iter = std::min_element(
+ std::cbegin(historic_registry), last_of_recents,
+ [] (const auto& lop, const auto& rop) {
+ const auto& lclient_request = static_cast<const ClientRequest&>(lop);
+ const auto& rclient_request = static_cast<const ClientRequest&>(rop);
+ return get_duration(lclient_request) < get_duration(rclient_request);
+ });
+ assert(fastest_historic_iter != std::end(historic_registry));
+ const auto& fastest_historic_op =
+ static_cast<const ClientRequest&>(*fastest_historic_iter);
+ historic_registry.erase(fastest_historic_iter);
+ // clear a previously "leaked" op
+ ClientRequest::ICRef(&fastest_historic_op, /* add_ref= */false);
+ --num_slow_ops;
+ }
+}
+
+size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) const
+{
+ const auto& historic_client_registry =
+ get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>();
+ f->open_object_section("op_history");
+ f->dump_int("size", historic_client_registry.size());
+ // TODO: f->dump_int("duration", history_duration.load());
+ // the intrusive list is configured to not store the size
+ size_t ops_count = 0;
+ {
+ f->open_array_section("ops");
+ for (const auto& op : historic_client_registry) {
+ op.dump(f);
+ ++ops_count;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ return ops_count;
+}
+
+size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatter* f) const
+{
+ const auto& historic_client_registry =
+ get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>();
+ f->open_object_section("op_history");
+ f->dump_int("size", historic_client_registry.size());
+ // TODO: f->dump_int("duration", history_duration.load());
+ // the intrusive list is configured to not store the size
+ std::multimap<utime_t,
+ const ClientRequest*,
+ std::greater<utime_t>> sorted_slowest_ops;
+  // iterating over the entire registry, since a slow op could also be
+  // in the "recently added" part.
+ std::transform(std::begin(historic_client_registry),
+ std::end(historic_client_registry),
+ std::inserter(sorted_slowest_ops, std::end(sorted_slowest_ops)),
+ [] (const Operation& op) {
+ const auto& cop = static_cast<const ClientRequest&>(op);
+ return std::make_pair(get_duration(cop), &cop);
+ });
+ f->open_array_section("ops");
+ using crimson::common::local_conf;
+ size_t ops_count = 0;
+ for (auto it = std::begin(sorted_slowest_ops);
+ ops_count < local_conf()->osd_op_history_slow_op_size
+ && it != std::end(sorted_slowest_ops);
+ ++it, ++ops_count)
+ {
+ it->second->dump(f);
+ }
+ f->close_section();
+ return ops_count;
+}
+
+OperationThrottler::OperationThrottler(ConfigProxy &conf)
+ : scheduler(crimson::osd::scheduler::make_scheduler(conf))
+{
+ conf.add_observer(this);
+ update_from_config(conf);
+}
+
+void OperationThrottler::wake()
+{
+ while ((!max_in_progress || in_progress < max_in_progress) &&
+ !scheduler->empty()) {
+ auto item = scheduler->dequeue();
+ item.wake.set_value();
+ ++in_progress;
+ --pending;
+ }
+}
+
+void OperationThrottler::release_throttle()
+{
+ ceph_assert(in_progress > 0);
+ --in_progress;
+ wake();
+}
+
+seastar::future<> OperationThrottler::acquire_throttle(
+ crimson::osd::scheduler::params_t params)
+{
+ crimson::osd::scheduler::item_t item{params, seastar::promise<>()};
+ auto fut = item.wake.get_future();
+ scheduler->enqueue(std::move(item));
+ return fut;
+}
+
+void OperationThrottler::dump_detail(Formatter *f) const
+{
+ f->dump_unsigned("max_in_progress", max_in_progress);
+ f->dump_unsigned("in_progress", in_progress);
+ f->open_object_section("scheduler");
+ {
+ scheduler->dump(*f);
+ }
+ f->close_section();
+}
+
+void OperationThrottler::update_from_config(const ConfigProxy &conf)
+{
+ max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency");
+ wake();
+}
+
+const char** OperationThrottler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "crimson_osd_scheduler_concurrency",
+ NULL
+ };
+ return KEYS;
+}
+
+void OperationThrottler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ update_from_config(conf);
+}
+
+}
diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h
new file mode 100644
index 000000000..8ef44ee9e
--- /dev/null
+++ b/src/crimson/osd/osd_operation.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/operation.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/scheduler/scheduler.h"
+#include "osd/osd_types.h"
+
+namespace crimson::os::seastore {
+ template<class OpT>
+ class OperationProxyT;
+}
+
+namespace crimson::osd {
+
+/// Ordering stages for a class of operations ordered by PG.
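+/// Requests arriving on a connection pass through these phases in declaration
+/// order: await_active, then await_map, then get_pg.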
+struct ConnectionPipeline {
+ struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> {
+ static constexpr auto type_name =
+ "ConnectionPipeline::await_active";
+ } await_active;
+
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name =
+ "ConnectionPipeline::await_map";
+ } await_map;
+
+ struct GetPG : OrderedExclusivePhaseT<GetPG> {
+ static constexpr auto type_name =
+ "ConnectionPipeline::get_pg";
+ } get_pg;
+};
+
+enum class OperationTypeCode {
+ client_request = 0,
+ peering_event,
+ pg_advance_map,
+ pg_creation,
+ replicated_request,
+ background_recovery,
+ background_recovery_sub,
+ internal_client_request,
+ historic_client_request,
+ logmissing_request,
+ logmissing_request_reply,
+ snaptrim_event,
+ snaptrimobj_subevent,
+ last_op
+};
+
+static constexpr const char* const OP_NAMES[] = {
+ "client_request",
+ "peering_event",
+ "pg_advance_map",
+ "pg_creation",
+ "replicated_request",
+ "background_recovery",
+ "background_recovery_sub",
+ "internal_client_request",
+ "historic_client_request",
+ "logmissing_request",
+ "logmissing_request_reply",
+ "snaptrim_event",
+ "snaptrimobj_subevent",
+};
+
+// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry:
+static_assert(
+ (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) ==
+ static_cast<int>(OperationTypeCode::last_op));
+
+struct InterruptibleOperation : Operation {
+ template <typename ValuesT = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, ValuesT>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+};
+
+template <typename T>
+struct OperationT : InterruptibleOperation {
+ static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)];
+ using IRef = boost::intrusive_ptr<T>;
+ using ICRef = boost::intrusive_ptr<const T>;
+
+ unsigned get_type() const final {
+ return static_cast<unsigned>(T::type);
+ }
+
+ const char *get_type_name() const final {
+ return T::type_name;
+ }
+
+ virtual ~OperationT() = default;
+
+private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+};
+
+template <class T>
+class TrackableOperationT : public OperationT<T> {
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+protected:
+ template<class EventT>
+ decltype(auto) get_event() {
+    // all our derived classes are supposed to define the list of tracking
+    // events accessible via `std::get`. This will usually boil down
+    // to an instance of `std::tuple`.
+ return std::get<EventT>(that()->tracking_events);
+ }
+
+ template<class EventT>
+ decltype(auto) get_event() const {
+ return std::get<EventT>(that()->tracking_events);
+ }
+
+ using OperationT<T>::OperationT;
+
+ struct StartEvent : TimeEvent<StartEvent> {};
+ struct CompletionEvent : TimeEvent<CompletionEvent> {};
+
+ template <class EventT, class... Args>
+ void track_event(Args&&... args) {
+    // the idea is to have a visitor-like interface that allows double
+    // dispatch on (backend, blocker type)
+ get_event<EventT>().trigger(*that(), std::forward<Args>(args)...);
+ }
+
+ template <class BlockingEventT, class InterruptorT=void, class F>
+ auto with_blocking_event(F&& f) {
+ auto ret = std::forward<F>(f)(typename BlockingEventT::template Trigger<T>{
+ get_event<BlockingEventT>(), *that()
+ });
+ if constexpr (std::is_same_v<InterruptorT, void>) {
+ return ret;
+ } else {
+ using ret_t = decltype(ret);
+ return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)};
+ }
+ }
+
+public:
+ static constexpr bool is_trackable = true;
+};
+
+template <class T>
+class PhasedOperationT : public TrackableOperationT<T> {
+ using base_t = TrackableOperationT<T>;
+
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+protected:
+ using TrackableOperationT<T>::TrackableOperationT;
+
+ template <class InterruptorT=void, class StageT>
+ auto enter_stage(StageT& stage) {
+ return this->template with_blocking_event<typename StageT::BlockingEvent,
+ InterruptorT>(
+ [&stage, this] (auto&& trigger) {
+        // storing the pipeline handle is delegated so that child classes can
+        // match the lifetime of the pipeline with e.g. ConnectedSocket
+        // (important for ConnectionPipeline).
+ return that()->get_handle().template enter<T>(stage, std::move(trigger));
+ });
+ }
+
+ template <class OpT>
+ friend class crimson::os::seastore::OperationProxyT;
+
+  // PGShardManager::start_pg_operation needs access to enter_stage; we can
+  // make this more sophisticated later on
+ friend class PGShardManager;
+};
+
+/**
+ * Maintains a set of lists of all active ops.
+ */
+struct OSDOperationRegistry : OperationRegistryT<
+ static_cast<size_t>(OperationTypeCode::last_op)
+> {
+ OSDOperationRegistry();
+
+ void do_stop() override;
+
+ void put_historic(const class ClientRequest& op);
+
+ size_t dump_historic_client_requests(ceph::Formatter* f) const;
+ size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const;
+
+private:
+ op_list::const_iterator last_of_recents;
+ size_t num_recent_ops = 0;
+ size_t num_slow_ops = 0;
+};
+/**
+ * Throttles set of currently running operations
+ *
+ * Very primitive currently, assumes all ops are equally
+ * expensive and simply limits the number that can be
+ * concurrently active.
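+ *
+ * The limit is read from the "crimson_osd_scheduler_concurrency" config
+ * option (see update_from_config()); a value of 0 means no throttling.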
+ */
+class OperationThrottler : public BlockerT<OperationThrottler>,
+ private md_config_obs_t {
+ friend BlockerT<OperationThrottler>;
+ static constexpr const char* type_name = "OperationThrottler";
+
+ template <typename OperationT, typename F>
+ auto with_throttle(
+ OperationT* op,
+ crimson::osd::scheduler::params_t params,
+ F &&f) {
+ if (!max_in_progress) return f();
+ return acquire_throttle(params)
+ .then(std::forward<F>(f))
+ .then([this](auto x) {
+ release_throttle();
+ return x;
+ });
+ }
+
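+  // repeatedly runs f under the throttle until the future it returns
+  // resolves to false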
+ template <typename OperationT, typename F>
+ seastar::future<> with_throttle_while(
+ OperationT* op,
+ crimson::osd::scheduler::params_t params,
+ F &&f) {
+ return with_throttle(op, params, f).then([this, params, op, f](bool cont) {
+ return cont ? with_throttle_while(op, params, f) : seastar::now();
+ });
+ }
+
+
+public:
+ OperationThrottler(ConfigProxy &conf);
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+ void update_from_config(const ConfigProxy &conf);
+
+ template <class OpT, class... Args>
+ seastar::future<> with_throttle_while(
+ BlockingEvent::Trigger<OpT>&& trigger,
+ Args&&... args) {
+ return trigger.maybe_record_blocking(
+ with_throttle_while(std::forward<Args>(args)...), *this);
+ }
+
+private:
+ void dump_detail(Formatter *f) const final;
+
+ crimson::osd::scheduler::SchedulerRef scheduler;
+
+ uint64_t max_in_progress = 0;
+ uint64_t in_progress = 0;
+
+ uint64_t pending = 0;
+
+ void wake();
+
+ seastar::future<> acquire_throttle(
+ crimson::osd::scheduler::params_t params);
+
+ void release_throttle();
+};
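+
+// Hedged usage sketch: in this patch BackgroundRecoveryT<T>::start() drives
+// the throttle through ShardServices roughly as follows (`params` and
+// do_recovery() stand in for the real scheduler params and work callback):
+//
+//   return with_blocking_event<OperationThrottler::BlockingEvent>(
+//     [this](auto&& trigger) {
+//       return ss.with_throttle_while(
+//         std::move(trigger), this, params, [this] { return do_recovery(); });
+//     });
+//
+// with_throttle_while() repeats the callback while it keeps resolving to
+// true, taking and releasing one throttle slot per iteration (unless
+// max_in_progress is 0, in which case throttling is bypassed entirely).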
+
+}
diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h
new file mode 100644
index 000000000..4b6dbf4b7
--- /dev/null
+++ b/src/crimson/osd/osd_operation_external_tracking.h
@@ -0,0 +1,307 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/osd_operations/snaptrim_event.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "crimson/osd/pg_map.h"
+
+namespace crimson::osd {
+
+// Just the boilerplate currently; implementing the actual LTTng tracepoints
+// is left for later.
+struct LttngBackend
+ : ClientRequest::StartEvent::Backend,
+ ConnectionPipeline::AwaitActive::BlockingEvent::Backend,
+ ConnectionPipeline::AwaitMap::BlockingEvent::Backend,
+ ConnectionPipeline::GetPG::BlockingEvent::Backend,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ PGMap::PGCreationBlockingEvent::Backend,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend,
+ PGActivationBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
+ ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend,
+ ClientRequest::CompletionEvent::Backend
+{
+ void handle(ClientRequest::StartEvent&,
+ const Operation&) override {}
+
+ void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitActive& blocker) override {
+ }
+
+ void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const OSD_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ConnectionPipeline::GetPG::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::GetPG& blocker) override {
+ }
+
+ void handle(PGMap::PGCreationBlockingEvent&,
+ const Operation&,
+ const PGMap::PGCreationBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const PG_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitForActive& blocker) override {
+ }
+
+ void handle(PGActivationBlocker::BlockingEvent& ev,
+ const Operation& op,
+ const PGActivationBlocker& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::RecoverMissing& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::GetOBC& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::Process& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitRepop& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::SendReply& blocker) override {
+ }
+
+ void handle(ClientRequest::CompletionEvent&,
+ const Operation&) override {}
+};
+
+struct HistoricBackend
+ : ClientRequest::StartEvent::Backend,
+ ConnectionPipeline::AwaitActive::BlockingEvent::Backend,
+ ConnectionPipeline::AwaitMap::BlockingEvent::Backend,
+ ConnectionPipeline::GetPG::BlockingEvent::Backend,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ PGMap::PGCreationBlockingEvent::Backend,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend,
+ PGActivationBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
+ ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend,
+ ClientRequest::CompletionEvent::Backend
+{
+ void handle(ClientRequest::StartEvent&,
+ const Operation&) override {}
+
+ void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitActive& blocker) override {
+ }
+
+ void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const OSD_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ConnectionPipeline::GetPG::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::GetPG& blocker) override {
+ }
+
+ void handle(PGMap::PGCreationBlockingEvent&,
+ const Operation&,
+ const PGMap::PGCreationBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const PG_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitForActive& blocker) override {
+ }
+
+ void handle(PGActivationBlocker::BlockingEvent& ev,
+ const Operation& op,
+ const PGActivationBlocker& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::RecoverMissing& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::GetOBC& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::Process& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitRepop& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::SendReply& blocker) override {
+ }
+
+ static const ClientRequest& to_client_request(const Operation& op) {
+#ifdef NDEBUG
+ return static_cast<const ClientRequest&>(op);
+#else
+ return dynamic_cast<const ClientRequest&>(op);
+#endif
+ }
+
+ void handle(ClientRequest::CompletionEvent&, const Operation& op) override {
+ if (crimson::common::local_conf()->osd_op_history_size) {
+ to_client_request(op).put_historic();
+ }
+ }
+};
+
+} // namespace crimson::osd
+
+namespace crimson {
+
+template <>
+struct EventBackendRegistry<osd::ClientRequest> {
+ static std::tuple<osd::LttngBackend, osd::HistoricBackend> get_backends() {
+ return { {}, {} };
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::RemotePeeringEvent> {
+ static std::tuple<> get_backends() {
+ return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::LocalPeeringEvent> {
+ static std::tuple<> get_backends() {
+ return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::RepRequest> {
+ static std::tuple<> get_backends() {
+ return {/* no external backends */};
+ }
+};
+
+
+template <>
+struct EventBackendRegistry<osd::LogMissingRequest> {
+ static std::tuple<> get_backends() {
+ return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::LogMissingRequestReply> {
+ static std::tuple<> get_backends() {
+ return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::RecoverySubRequest> {
+ static std::tuple<> get_backends() {
+ return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::BackfillRecovery> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::PGAdvanceMap> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::SnapTrimObjSubEvent> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+};
+
+} // namespace crimson
diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc
new file mode 100644
index 000000000..953ec9595
--- /dev/null
+++ b/src/crimson/osd/osd_operations/background_recovery.cc
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "messages/MOSDOp.h"
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/shard_services.h"
+#include "common/Formatter.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::UrgentRecovery> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+
+ template <>
+ struct EventBackendRegistry<osd::PglogBasedRecovery> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+namespace crimson::osd {
+
+template <class T>
+BackgroundRecoveryT<T>::BackgroundRecoveryT(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ crimson::osd::scheduler::scheduler_class_t scheduler_class,
+ float delay)
+ : pg(pg),
+ epoch_started(epoch_started),
+ delay(delay),
+ ss(ss),
+ scheduler_class(scheduler_class)
+{}
+
+template <class T>
+void BackgroundRecoveryT<T>::print(std::ostream &lhs) const
+{
+ lhs << "BackgroundRecovery(" << pg->get_pgid() << ")";
+}
+
+template <class T>
+void BackgroundRecoveryT<T>::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->open_object_section("recovery_detail");
+ {
+ // TODO pg->dump_recovery_state(f);
+ }
+ f->close_section();
+}
+
+template <class T>
+seastar::future<> BackgroundRecoveryT<T>::start()
+{
+ logger().debug("{}: start", *this);
+
+ typename T::IRef ref = static_cast<T*>(this);
+ auto maybe_delay = seastar::now();
+ if (delay) {
+ maybe_delay = seastar::sleep(
+ std::chrono::milliseconds(std::lround(delay * 1000)));
+ }
+ return maybe_delay.then([ref, this] {
+ return this->template with_blocking_event<OperationThrottler::BlockingEvent>(
+ [ref, this] (auto&& trigger) {
+ return ss.with_throttle_while(
+ std::move(trigger),
+ this, get_scheduler_params(), [this] {
+ return T::interruptor::with_interruption([this] {
+ return do_recovery();
+ }, [](std::exception_ptr) {
+ return seastar::make_ready_future<bool>(false);
+ }, pg);
+ }).handle_exception_type([ref, this](const std::system_error& err) {
+ if (err.code() == std::make_error_code(std::errc::interrupted)) {
+ logger().debug("{} recovery interrupted: {}", *pg, err.what());
+ return seastar::now();
+ }
+ return seastar::make_exception_future<>(err);
+ });
+ });
+ });
+}
+
+UrgentRecovery::UrgentRecovery(
+ const hobject_t& soid,
+ const eversion_t& need,
+ Ref<PG> pg,
+ ShardServices& ss,
+ epoch_t epoch_started)
+ : BackgroundRecoveryT{pg, ss, epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::immediate},
+ soid{soid}, need(need)
+{
+}
+
+UrgentRecovery::interruptible_future<bool>
+UrgentRecovery::do_recovery()
+{
+ logger().debug("{}: {}", __func__, *this);
+ if (!pg->has_reset_since(epoch_started)) {
+ return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->get_recovery_handler()->recover_missing(trigger, soid, need);
+ }).then_interruptible([] {
+ return seastar::make_ready_future<bool>(false);
+ });
+ }
+ return seastar::make_ready_future<bool>(false);
+}
+
+void UrgentRecovery::print(std::ostream &lhs) const
+{
+ lhs << "UrgentRecovery(" << pg->get_pgid() << ", "
+ << soid << ", v" << need << ", epoch_started: "
+ << epoch_started << ")";
+}
+
+void UrgentRecovery::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->open_object_section("recovery_detail");
+ {
+ f->dump_stream("oid") << soid;
+ f->dump_stream("version") << need;
+ }
+ f->close_section();
+}
+
+PglogBasedRecovery::PglogBasedRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ const epoch_t epoch_started,
+ float delay)
+ : BackgroundRecoveryT(
+ std::move(pg),
+ ss,
+ epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::background_recovery,
+ delay)
+{}
+
+PglogBasedRecovery::interruptible_future<bool>
+PglogBasedRecovery::do_recovery()
+{
+ if (pg->has_reset_since(epoch_started)) {
+ return seastar::make_ready_future<bool>(false);
+ }
+ return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->get_recovery_handler()->start_recovery_ops(
+ trigger,
+ crimson::common::local_conf()->osd_recovery_max_single_start);
+ });
+}
+
+PGPeeringPipeline &BackfillRecovery::peering_pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+BackfillRecovery::interruptible_future<bool>
+BackfillRecovery::do_recovery()
+{
+ logger().debug("{}", __func__);
+
+ if (pg->has_reset_since(epoch_started)) {
+ logger().debug("{}: pg got reset since epoch_started={}",
+ __func__, epoch_started);
+ return seastar::make_ready_future<bool>(false);
+ }
+ // TODO: limits
+ return enter_stage<interruptor>(
+ // process_event() of our boost::statechart machine is non-reentrant.
+ // with the backfill_pipeline we protect it from a second entry from
+ // the implementation of BackfillListener.
+ // additionally, this stage serves to synchronize with PeeringEvent.
+ peering_pp(*pg).process
+ ).then_interruptible([this] {
+ pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt));
+ return seastar::make_ready_future<bool>(false);
+ });
+}
+
+template class BackgroundRecoveryT<UrgentRecovery>;
+template class BackgroundRecoveryT<PglogBasedRecovery>;
+template class BackgroundRecoveryT<BackfillRecovery>;
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h
new file mode 100644
index 000000000..17f2cd57a
--- /dev/null
+++ b/src/crimson/osd/osd_operations/background_recovery.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/event_base.hpp>
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg.h"
+
+namespace crimson::osd {
+class PG;
+class ShardServices;
+
+template <class T>
+class BackgroundRecoveryT : public PhasedOperationT<T> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::background_recovery;
+
+ BackgroundRecoveryT(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ crimson::osd::scheduler::scheduler_class_t scheduler_class, float delay = 0);
+
+ virtual void print(std::ostream &) const;
+ seastar::future<> start();
+
+protected:
+ Ref<PG> pg;
+ const epoch_t epoch_started;
+ float delay = 0;
+
+private:
+ virtual void dump_detail(Formatter *f) const;
+ crimson::osd::scheduler::params_t get_scheduler_params() const {
+ return {
+ 1, // cost
+ 0, // owner
+ scheduler_class
+ };
+ }
+ using do_recovery_ret_t = typename PhasedOperationT<T>::template interruptible_future<bool>;
+ virtual do_recovery_ret_t do_recovery() = 0;
+ ShardServices &ss;
+ const crimson::osd::scheduler::scheduler_class_t scheduler_class;
+};
+
+/// represent a recovery initiated for serving a client request
+///
+/// unlike @c PglogBasedRecovery and @c BackfillRecovery,
+/// @c UrgentRecovery is not throttled by the scheduler, and it
+/// utilizes @c RecoveryBackend directly to recover the unreadable
+/// object.
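+///
+/// A hedged usage note: in this patch it is started from
+/// CommonClientRequest::do_recover_missing() via
+/// pg->get_shard_services().start_operation<UrgentRecovery>(...) when a
+/// client request touches an unreadable or degraded object and no recovery
+/// for it is already in flight.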
+class UrgentRecovery final : public BackgroundRecoveryT<UrgentRecovery> {
+public:
+ UrgentRecovery(
+ const hobject_t& soid,
+ const eversion_t& need,
+ Ref<PG> pg,
+ ShardServices& ss,
+ epoch_t epoch_started);
+ void print(std::ostream&) const final;
+
+ std::tuple<
+ OperationThrottler::BlockingEvent,
+ RecoveryBackend::RecoveryBlockingEvent
+ > tracking_events;
+
+private:
+ void dump_detail(Formatter* f) const final;
+ interruptible_future<bool> do_recovery() override;
+ const hobject_t soid;
+ const eversion_t need;
+};
+
+class PglogBasedRecovery final : public BackgroundRecoveryT<PglogBasedRecovery> {
+public:
+ PglogBasedRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ float delay = 0);
+
+ std::tuple<
+ OperationThrottler::BlockingEvent,
+ RecoveryBackend::RecoveryBlockingEvent
+ > tracking_events;
+
+private:
+ interruptible_future<bool> do_recovery() override;
+};
+
+class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> {
+public:
+
+ template <class EventT>
+ BackfillRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ const EventT& evt);
+
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ OperationThrottler::BlockingEvent,
+ PGPeeringPipeline::Process::BlockingEvent
+ > tracking_events;
+
+private:
+ boost::intrusive_ptr<const boost::statechart::event_base> evt;
+ PipelineHandle handle;
+
+ static PGPeeringPipeline &peering_pp(PG &pg);
+ interruptible_future<bool> do_recovery() override;
+};
+
+template <class EventT>
+BackfillRecovery::BackfillRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ const epoch_t epoch_started,
+ const EventT& evt)
+ : BackgroundRecoveryT(
+ std::move(pg),
+ ss,
+ epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::background_best_effort),
+ evt(evt.intrusive_from_this())
+{}
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::BackfillRecovery> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::PglogBasedRecovery> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::UrgentRecovery> : fmt::ostream_formatter {};
+template <class T> struct fmt::formatter<crimson::osd::BackgroundRecoveryT<T>> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc
new file mode 100644
index 000000000..9374fbde2
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request.cc
@@ -0,0 +1,388 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "common/Formatter.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "osd/object_state_fmt.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+
+void ClientRequest::Orderer::requeue(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ for (auto &req: list) {
+ logger().debug("{}: {} requeueing {}", __func__, *pg, req);
+ req.reset_instance_handle();
+ std::ignore = req.with_pg_int(shard_services, pg);
+ }
+}
+
+void ClientRequest::Orderer::clear_and_cancel()
+{
+ for (auto i = list.begin(); i != list.end(); ) {
+ logger().debug(
+ "ClientRequest::Orderer::clear_and_cancel: {}",
+ *i);
+ i->complete_request();
+ remove_request(*(i++));
+ }
+}
+
+void ClientRequest::complete_request()
+{
+ track_event<CompletionEvent>();
+ on_complete.set_value();
+}
+
+ClientRequest::ClientRequest(
+ ShardServices &shard_services, crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> &&m)
+ : put_historic_shard_services(&shard_services),
+ conn(std::move(conn)),
+ m(std::move(m)),
+ instance_handle(new instance_handle_t)
+{}
+
+ClientRequest::~ClientRequest()
+{
+ logger().debug("{}: destroying", *this);
+}
+
+void ClientRequest::print(std::ostream &lhs) const
+{
+ lhs << "m=[" << *m << "]";
+}
+
+void ClientRequest::dump_detail(Formatter *f) const
+{
+ logger().debug("{}: dumping", *this);
+ std::apply([f] (auto... event) {
+ (..., event.dump(f));
+ }, tracking_events);
+}
+
+ConnectionPipeline &ClientRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).client_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &ClientRequest::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+bool ClientRequest::is_pg_op() const
+{
+ return std::any_of(
+ begin(m->ops), end(m->ops),
+ [](auto& op) { return ceph_osd_op_type_pg(op.op.op); });
+}
+
+seastar::future<> ClientRequest::with_pg_int(
+ ShardServices &shard_services, Ref<PG> pgref)
+{
+ epoch_t same_interval_since = pgref->get_interval_start_epoch();
+ logger().debug("{} same_interval_since: {}", *this, same_interval_since);
+ if (m->finish_decode()) {
+ m->clear_payload();
+ }
+ const auto this_instance_id = instance_id++;
+ OperationRef opref{this};
+ auto instance_handle = get_instance_handle();
+ auto &ihref = *instance_handle;
+ return interruptor::with_interruption(
+ [this, pgref, this_instance_id, &ihref, &shard_services]() mutable {
+ PG &pg = *pgref;
+ if (pg.can_discard_op(*m)) {
+ return shard_services.send_incremental_map(
+ std::ref(*conn), m->get_map_epoch()
+ ).then([this, this_instance_id, pgref] {
+ logger().debug("{}.{}: discarding", *this, this_instance_id);
+ pgref->client_request_orderer.remove_request(*this);
+ complete_request();
+ return interruptor::now();
+ });
+ }
+ return ihref.enter_stage<interruptor>(client_pp(pg).await_map, *this
+ ).then_interruptible([this, this_instance_id, &pg, &ihref] {
+ logger().debug("{}.{}: after await_map stage", *this, this_instance_id);
+ return ihref.enter_blocker(
+ *this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map,
+ m->get_min_epoch(), nullptr);
+ }).then_interruptible([this, this_instance_id, &pg, &ihref](auto map) {
+ logger().debug("{}.{}: after wait_for_map", *this, this_instance_id);
+ return ihref.enter_stage<interruptor>(client_pp(pg).wait_for_active, *this);
+ }).then_interruptible([this, this_instance_id, &pg, &ihref]() {
+ logger().debug(
+ "{}.{}: after wait_for_active stage", *this, this_instance_id);
+ return ihref.enter_blocker(
+ *this,
+ pg.wait_for_active_blocker,
+ &decltype(pg.wait_for_active_blocker)::wait);
+ }).then_interruptible([this, pgref, this_instance_id, &ihref]() mutable
+ -> interruptible_future<> {
+ logger().debug(
+ "{}.{}: after wait_for_active", *this, this_instance_id);
+ if (is_pg_op()) {
+ return process_pg_op(pgref);
+ } else {
+ return process_op(ihref, pgref);
+ }
+ }).then_interruptible([this, this_instance_id, pgref] {
+ logger().debug("{}.{}: after process*", *this, this_instance_id);
+ pgref->client_request_orderer.remove_request(*this);
+ complete_request();
+ });
+ }, [this, this_instance_id, pgref](std::exception_ptr eptr) {
+ // TODO: better debug output
+ logger().debug("{}.{}: interrupted {}", *this, this_instance_id, eptr);
+ }, pgref).finally(
+ [opref=std::move(opref), pgref=std::move(pgref),
+ instance_handle=std::move(instance_handle), &ihref] {
+ ihref.handle.exit();
+ });
+}
+
+seastar::future<> ClientRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pgref)
+{
+ put_historic_shard_services = &shard_services;
+ pgref->client_request_orderer.add_request(*this);
+ auto ret = on_complete.get_future();
+ std::ignore = with_pg_int(
+ shard_services, std::move(pgref)
+ );
+ return ret;
+}
+
+ClientRequest::interruptible_future<>
+ClientRequest::process_pg_op(
+ Ref<PG> &pg)
+{
+ return pg->do_pg_ops(
+ m
+ ).then_interruptible([this, pg=std::move(pg)](MURef<MOSDOpReply> reply) {
+ return conn->send(std::move(reply));
+ });
+}
+
+auto ClientRequest::reply_op_error(const Ref<PG>& pg, int err)
+{
+ logger().debug("{}: replying with error {}", *this, err);
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), err, pg->get_osdmap_epoch(),
+ m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK),
+ !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
+ reply->set_reply_versions(eversion_t(), 0);
+ reply->set_op_returns(std::vector<pg_log_op_return_item_t>{});
+ return conn->send(std::move(reply));
+}
+
+ClientRequest::interruptible_future<>
+ClientRequest::process_op(instance_handle_t &ihref, Ref<PG> &pg)
+{
+ return ihref.enter_stage<interruptor>(
+ client_pp(*pg).recover_missing,
+ *this
+ ).then_interruptible(
+ [this, pg]() mutable {
+ if (pg->is_primary()) {
+ return do_recover_missing(pg, m->get_hobj());
+ } else {
+ logger().debug("process_op: Skipping do_recover_missing "
+ "on non-primary pg");
+ return interruptor::now();
+ }
+ }).then_interruptible([this, pg, &ihref]() mutable {
+ return pg->already_complete(m->get_reqid()).then_interruptible(
+ [this, pg, &ihref](auto completed) mutable
+ -> PG::load_obc_iertr::future<> {
+ if (completed) {
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), completed->err, pg->get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, false);
+ reply->set_reply_versions(completed->version, completed->user_version);
+ return conn->send(std::move(reply));
+ } else {
+ return ihref.enter_stage<interruptor>(client_pp(*pg).get_obc, *this
+ ).then_interruptible(
+ [this, pg, &ihref]() mutable -> PG::load_obc_iertr::future<> {
+ logger().debug("{}: in get_obc stage", *this);
+ op_info.set_from_op(&*m, *pg->get_osdmap());
+ return pg->with_locked_obc(
+ m->get_hobj(), op_info,
+ [this, pg, &ihref](auto obc) mutable {
+ logger().debug("{}: got obc {}", *this, obc->obs);
+ return ihref.enter_stage<interruptor>(
+ client_pp(*pg).process, *this
+ ).then_interruptible([this, pg, obc, &ihref]() mutable {
+ return do_process(ihref, pg, obc);
+ });
+ });
+ });
+ }
+ });
+ }).handle_error_interruptible(
+ PG::load_obc_ertr::all_same_way([this, pg=std::move(pg)](const auto &code) {
+ logger().error("ClientRequest saw error code {}", code);
+ assert(code.value() > 0);
+ return reply_op_error(pg, -code.value());
+ }));
+}
+
+ClientRequest::interruptible_future<>
+ClientRequest::do_process(
+ instance_handle_t &ihref,
+ Ref<PG>& pg, crimson::osd::ObjectContextRef obc)
+{
+ if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
+ return reply_op_error(pg, -EINVAL);
+ }
+ const pg_pool_t pool = pg->get_pgpool().info;
+ if (pool.has_flag(pg_pool_t::FLAG_EIO)) {
+ // drop op on the floor; the client will handle returning EIO
+ if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) {
+ logger().debug("discarding op due to pool EIO flag");
+ return seastar::now();
+ } else {
+ logger().debug("replying EIO due to pool EIO flag");
+ return reply_op_error(pg, -EIO);
+ }
+ }
+ if (m->get_oid().name.size()
+ > crimson::common::local_conf()->osd_max_object_name_len) {
+ return reply_op_error(pg, -ENAMETOOLONG);
+ } else if (m->get_hobj().get_key().size()
+ > crimson::common::local_conf()->osd_max_object_name_len) {
+ return reply_op_error(pg, -ENAMETOOLONG);
+ } else if (m->get_hobj().nspace.size()
+ > crimson::common::local_conf()->osd_max_object_namespace_len) {
+ return reply_op_error(pg, -ENAMETOOLONG);
+ } else if (m->get_hobj().oid.name.empty()) {
+ return reply_op_error(pg, -EINVAL);
+ } else if (pg->get_osdmap()->is_blocklisted(conn->get_peer_addr())) {
+ logger().info("{} is blocklisted", conn->get_peer_addr());
+ return reply_op_error(pg, -EBLOCKLISTED);
+ }
+
+ if (!obc->obs.exists && !op_info.may_write()) {
+ return reply_op_error(pg, -ENOENT);
+ }
+
+ SnapContext snapc = get_snapc(pg,obc);
+
+ if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
+ snapc.seq < obc->ssc->snapset.seq) {
+ logger().debug("{} ORDERSNAP flag set and snapc seq {}"
+ " < snapset seq {} on {}",
+ __func__, snapc.seq, obc->ssc->snapset.seq,
+ obc->obs.oi.soid);
+ return reply_op_error(pg, -EOLDSNAPC);
+ }
+
+ if (!pg->is_primary()) {
+ // primary can handle both normal ops and balanced reads
+ if (is_misdirected(*pg)) {
+ logger().trace("do_process: dropping misdirected op");
+ return seastar::now();
+ } else if (const hobject_t& hoid = m->get_hobj();
+ !pg->get_peering_state().can_serve_replica_read(hoid)) {
+ logger().debug("{}: unstable write on replica, "
+ "bouncing to primary",
+ __func__);
+ return reply_op_error(pg, -EAGAIN);
+ } else {
+ logger().debug("{}: serving replica read on oid {}",
+ __func__, m->get_hobj());
+ }
+ }
+ return pg->do_osd_ops(m, conn, obc, op_info, snapc).safe_then_unpack_interruptible(
+ [this, pg, &ihref](auto submitted, auto all_completed) mutable {
+ return submitted.then_interruptible([this, pg, &ihref] {
+ return ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this);
+ }).then_interruptible(
+ [this, pg, all_completed=std::move(all_completed), &ihref]() mutable {
+ return all_completed.safe_then_interruptible(
+ [this, pg, &ihref](MURef<MOSDOpReply> reply) {
+ return ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this
+ ).then_interruptible(
+ [this, reply=std::move(reply)]() mutable {
+ logger().debug("{}: sending response", *this);
+ return conn->send(std::move(reply));
+ });
+ }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable {
+ return process_op(ihref, pg);
+ }));
+ });
+ }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable {
+ return process_op(ihref, pg);
+ }));
+}
+
+bool ClientRequest::is_misdirected(const PG& pg) const
+{
+ // otherwise take a closer look
+ if (const int flags = m->get_flags();
+ flags & CEPH_OSD_FLAG_BALANCE_READS ||
+ flags & CEPH_OSD_FLAG_LOCALIZE_READS) {
+ if (!op_info.may_read()) {
+ // no read found, so it can't be a balanced read
+ return true;
+ }
+ if (op_info.may_write() || op_info.may_cache()) {
+ // write op, but we are not the primary
+ return true;
+ }
+ // balanced reads; any replica will do
+ return false;
+ }
+ // neither balanced nor localize reads
+ return true;
+}
+
+void ClientRequest::put_historic() const
+{
+ ceph_assert_always(put_historic_shard_services);
+ put_historic_shard_services->get_registry().put_historic(*this);
+}
+
+const SnapContext ClientRequest::get_snapc(
+ Ref<PG>& pg,
+ crimson::osd::ObjectContextRef obc) const
+{
+ SnapContext snapc;
+ if (op_info.may_write() || op_info.may_cache()) {
+ // snap
+ if (pg->get_pgpool().info.is_pool_snaps_mode()) {
+ // use pool's snapc
+ snapc = pg->get_pgpool().snapc;
+ logger().debug("{} using pool's snapc snaps={}",
+ __func__, snapc.snaps);
+
+ } else {
+ // client specified snapc
+ snapc.seq = m->get_snap_seq();
+ snapc.snaps = m->get_snaps();
+ logger().debug("{} client specified snapc seq={} snaps={}",
+ __func__, snapc.seq, snapc.snaps);
+ }
+ }
+ return snapc;
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
new file mode 100644
index 000000000..b2dce1e87
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "osd/osd_op_util.h"
+#include "crimson/net/Connection.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request_common.h"
+#include "crimson/osd/osd_operations/common/pg_pipeline.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/utility.h"
+#include "messages/MOSDOp.h"
+
+namespace crimson::osd {
+class PG;
+class OSD;
+class ShardServices;
+
+class ClientRequest final : public PhasedOperationT<ClientRequest>,
+ private CommonClientRequest {
+ // Initially set to primary core, updated to pg core after move,
+ // used by put_historic
+ ShardServices *put_historic_shard_services = nullptr;
+
+ crimson::net::ConnectionRef conn;
+ // must be after conn due to ConnectionPipeline's life-time
+ Ref<MOSDOp> m;
+ OpInfo op_info;
+ seastar::promise<> on_complete;
+ unsigned instance_id = 0;
+
+public:
+ class PGPipeline : public CommonPGPipeline {
+ public:
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::await_map";
+ } await_map;
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop";
+ } wait_repop;
+ struct SendReply : OrderedExclusivePhaseT<SendReply> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::send_reply";
+ } send_reply;
+ friend class ClientRequest;
+ friend class LttngBackend;
+ friend class HistoricBackend;
+ friend class RepRequest;
+ friend class LogMissingRequest;
+ friend class LogMissingRequestReply;
+ };
+
+ /**
+ * instance_handle_t
+ *
+ * Client request is, at present, the only Operation which can be requeued.
+ * This is, mostly, fine. However, reusing the PipelineHandle or
+ * BlockingEvent structures before proving that the prior instance has stopped
+ * can create hangs or crashes due to violations of the BlockerT and
+ * PipelineHandle invariants.
+ *
+ * To solve this, we create an instance_handle_t which contains the events
+ * for the portion of execution that can be rerun as well as the
+ * PipelineHandle. ClientRequest::with_pg_int grabs a reference to the current
+ * instance_handle_t and releases its PipelineHandle in the finally block.
+ * On requeue, we create a new instance_handle_t with a fresh PipelineHandle
+ * and events tuple and use it for the next invocation of
+ * with_pg_int.
+ */
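+
+ /*
+ * Hedged sketch of the requeue path described above, using the methods
+ * declared in this class (the surrounding control flow is simplified):
+ *
+ *   // on interval change, Orderer::requeue() does roughly
+ *   req.reset_instance_handle();  // fresh PipelineHandle + event tuple
+ *   std::ignore = req.with_pg_int(shard_services, pg);
+ *
+ * The previous instance_handle_t is kept alive by the finally block of the
+ * prior with_pg_int() invocation until that invocation has exited its
+ * pipeline stage.
+ */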
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+ class instance_handle_t : public boost::intrusive_ref_counter<
+ instance_handle_t, boost::thread_unsafe_counter> {
+ public:
+ // intrusive_ptr because seastar::lw_shared_ptr includes a cpu debug check
+ // that we would fail, since the core on which we allocate the request may
+ // not be the core on which we perform with_pg_int. This is harmless, since
+ // we don't leave any references on the source core, so we simply bypass the
+ // check by using intrusive_ptr instead.
+ using ref_t = boost::intrusive_ptr<instance_handle_t>;
+ PipelineHandle handle;
+
+ std::tuple<
+ PGPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ PGPipeline::RecoverMissing::BlockingEvent,
+ PGPipeline::GetOBC::BlockingEvent,
+ PGPipeline::Process::BlockingEvent,
+ PGPipeline::WaitRepop::BlockingEvent,
+ PGPipeline::SendReply::BlockingEvent,
+ CompletionEvent
+ > pg_tracking_events;
+
+ template <typename BlockingEventT, typename InterruptorT=void, typename F>
+ auto with_blocking_event(F &&f, ClientRequest &op) {
+ auto ret = std::forward<F>(f)(
+ typename BlockingEventT::template Trigger<ClientRequest>{
+ std::get<BlockingEventT>(pg_tracking_events), op
+ });
+ if constexpr (std::is_same_v<InterruptorT, void>) {
+ return ret;
+ } else {
+ using ret_t = decltype(ret);
+ return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)};
+ }
+ }
+
+ template <typename InterruptorT=void, typename StageT>
+ auto enter_stage(StageT &stage, ClientRequest &op) {
+ return this->template with_blocking_event<
+ typename StageT::BlockingEvent,
+ InterruptorT>(
+ [&stage, this](auto &&trigger) {
+ return handle.template enter<ClientRequest>(
+ stage, std::move(trigger));
+ }, op);
+ }
+
+ template <
+ typename InterruptorT=void, typename BlockingObj, typename Method,
+ typename... Args>
+ auto enter_blocker(
+ ClientRequest &op, BlockingObj &obj, Method method, Args&&... args) {
+ return this->template with_blocking_event<
+ typename BlockingObj::Blocker::BlockingEvent,
+ InterruptorT>(
+ [&obj, method,
+ args=std::forward_as_tuple(std::move(args)...)](auto &&trigger) mutable {
+ return apply_method_to_tuple(
+ obj, method,
+ std::tuple_cat(
+ std::forward_as_tuple(std::move(trigger)),
+ std::move(args))
+ );
+ }, op);
+ }
+ };
+ instance_handle_t::ref_t instance_handle;
+ void reset_instance_handle() {
+ instance_handle = new instance_handle_t;
+ }
+ auto get_instance_handle() { return instance_handle; }
+
+ using ordering_hook_t = boost::intrusive::list_member_hook<>;
+ ordering_hook_t ordering_hook;
+ class Orderer {
+ using list_t = boost::intrusive::list<
+ ClientRequest,
+ boost::intrusive::member_hook<
+ ClientRequest,
+ typename ClientRequest::ordering_hook_t,
+ &ClientRequest::ordering_hook>
+ >;
+ list_t list;
+
+ public:
+ void add_request(ClientRequest &request) {
+ assert(!request.ordering_hook.is_linked());
+ intrusive_ptr_add_ref(&request);
+ list.push_back(request);
+ }
+ void remove_request(ClientRequest &request) {
+ assert(request.ordering_hook.is_linked());
+ list.erase(list_t::s_iterator_to(request));
+ intrusive_ptr_release(&request);
+ }
+ void requeue(ShardServices &shard_services, Ref<PG> pg);
+ void clear_and_cancel();
+ };
+ void complete_request();
+
+ static constexpr OperationTypeCode type = OperationTypeCode::client_request;
+
+ ClientRequest(
+ ShardServices &shard_services,
+ crimson::net::ConnectionRef, Ref<MOSDOp> &&m);
+ ~ClientRequest();
+
+ void print(std::ostream &) const final;
+ void dump_detail(Formatter *f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return m->get_spg();
+ }
+ PipelineHandle &get_handle() { return instance_handle->handle; }
+ epoch_t get_epoch() const { return m->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg_int(
+ ShardServices &shard_services, Ref<PG> pg);
+
+public:
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pgref);
+
+private:
+ template <typename FuncT>
+ interruptible_future<> with_sequencer(FuncT&& func);
+ auto reply_op_error(const Ref<PG>& pg, int err);
+
+ interruptible_future<> do_process(
+ instance_handle_t &ihref,
+ Ref<PG>& pg,
+ crimson::osd::ObjectContextRef obc);
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition> process_pg_op(
+ Ref<PG> &pg);
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition> process_op(
+ instance_handle_t &ihref,
+ Ref<PG> &pg);
+ bool is_pg_op() const;
+
+ PGPipeline &client_pp(PG &pg);
+
+ template <typename Errorator>
+ using interruptible_errorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ Errorator>;
+
+ bool is_misdirected(const PG& pg) const;
+
+ const SnapContext get_snapc(
+ Ref<PG>& pg,
+ crimson::osd::ObjectContextRef obc) const;
+
+public:
+
+ friend class LttngBackend;
+ friend class HistoricBackend;
+
+ auto get_started() const {
+ return get_event<StartEvent>().get_timestamp();
+ };
+
+ auto get_completed() const {
+ return get_event<CompletionEvent>().get_timestamp();
+ };
+
+ void put_historic() const;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::ClientRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/client_request_common.cc b/src/crimson/osd/osd_operations/client_request_common.cc
new file mode 100644
index 000000000..cfd22c774
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request_common.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/osd/osd_operations/client_request_common.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+typename InterruptibleOperation::template interruptible_future<>
+CommonClientRequest::do_recover_missing(
+ Ref<PG>& pg, const hobject_t& soid)
+{
+ eversion_t ver;
+ assert(pg->is_primary());
+ logger().debug("{} check for recovery, {}", __func__, soid);
+ if (!pg->is_unreadable_object(soid, &ver) &&
+ !pg->is_degraded_or_backfilling_object(soid)) {
+ return seastar::now();
+ }
+ logger().debug("{} need to wait for recovery, {}", __func__, soid);
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ return pg->get_recovery_backend()->get_recovering(soid).wait_for_recovered();
+ } else {
+ auto [op, fut] =
+ pg->get_shard_services().start_operation<UrgentRecovery>(
+ soid, ver, pg, pg->get_shard_services(), pg->get_osdmap_epoch());
+ return std::move(fut);
+ }
+}
+
+bool CommonClientRequest::should_abort_request(
+ const Operation& op,
+ std::exception_ptr eptr)
+{
+ if (*eptr.__cxa_exception_type() ==
+ typeid(::crimson::common::actingset_changed)) {
+ try {
+ std::rethrow_exception(eptr);
+ } catch(::crimson::common::actingset_changed& e) {
+ if (e.is_primary()) {
+ logger().debug("{} {} operation restart, acting set changed", __func__, op);
+ return false;
+ } else {
+ logger().debug("{} {} operation abort, up primary changed", __func__, op);
+ return true;
+ }
+ }
+ } else {
+ assert(*eptr.__cxa_exception_type() ==
+ typeid(crimson::common::system_shutdown_exception));
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{} {} operation skipped, system shutdown", __func__, op);
+ return true;
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/client_request_common.h b/src/crimson/osd/osd_operations/client_request_common.h
new file mode 100644
index 000000000..6a8a78966
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request_common.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/operation.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+struct CommonClientRequest {
+ static InterruptibleOperation::template interruptible_future<>
+ do_recover_missing(Ref<PG>& pg, const hobject_t& soid);
+
+ static bool should_abort_request(
+ const crimson::Operation& op, std::exception_ptr eptr);
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h
new file mode 100644
index 000000000..58fa07b8b
--- /dev/null
+++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd/osd_op_util.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+class CommonPGPipeline {
+protected:
+ friend class InternalClientRequest;
+ friend class SnapTrimEvent;
+ friend class SnapTrimObjSubEvent;
+
+ struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> {
+ static constexpr auto type_name = "CommonPGPipeline::wait_for_active";
+ } wait_for_active;
+ struct RecoverMissing : OrderedExclusivePhaseT<RecoverMissing> {
+ static constexpr auto type_name = "CommonPGPipeline::recover_missing";
+ } recover_missing;
+ struct GetOBC : OrderedExclusivePhaseT<GetOBC> {
+ static constexpr auto type_name = "CommonPGPipeline::get_obc";
+ } get_obc;
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "CommonPGPipeline::process";
+ } process;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc
new file mode 100644
index 000000000..1e9b842b2
--- /dev/null
+++ b/src/crimson/osd/osd_operations/internal_client_request.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osd_operations/internal_client_request.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::InternalClientRequest> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+
+namespace crimson::osd {
+
+InternalClientRequest::InternalClientRequest(Ref<PG> pg)
+ : pg(std::move(pg))
+{
+ assert(bool(this->pg));
+ assert(this->pg->is_primary());
+}
+
+InternalClientRequest::~InternalClientRequest()
+{
+ logger().debug("{}: destroying", *this);
+}
+
+void InternalClientRequest::print(std::ostream &) const
+{
+}
+
+void InternalClientRequest::dump_detail(Formatter *f) const
+{
+}
+
+CommonPGPipeline& InternalClientRequest::client_pp()
+{
+ return pg->request_pg_pipeline;
+}
+
+seastar::future<> InternalClientRequest::start()
+{
+ track_event<StartEvent>();
+ return crimson::common::handle_system_shutdown([this] {
+ return seastar::repeat([this] {
+ logger().debug("{}: in repeat", *this);
+ return interruptor::with_interruption([this]() mutable {
+ return enter_stage<interruptor>(
+ client_pp().wait_for_active
+ ).then_interruptible([this] {
+ return with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().recover_missing);
+ }).then_interruptible([this] {
+ return do_recover_missing(pg, get_target_oid());
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().get_obc);
+ }).then_interruptible([this] () -> PG::load_obc_iertr::future<> {
+ logger().debug("{}: getting obc lock", *this);
+ return seastar::do_with(create_osd_ops(),
+ [this](auto& osd_ops) mutable {
+ logger().debug("InternalClientRequest: got {} OSDOps to execute",
+ std::size(osd_ops));
+ [[maybe_unused]] const int ret = op_info.set_from_op(
+ std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap());
+ assert(ret == 0);
+ return pg->with_locked_obc(get_target_oid(), op_info,
+ [&osd_ops, this](auto obc) {
+ return enter_stage<interruptor>(client_pp().process
+ ).then_interruptible(
+ [obc=std::move(obc), &osd_ops, this] {
+ return pg->do_osd_ops(
+ std::move(obc),
+ osd_ops,
+ std::as_const(op_info),
+ get_do_osd_ops_params(),
+ [] {
+ return PG::do_osd_ops_iertr::now();
+ },
+ [] (const std::error_code& e) {
+ return PG::do_osd_ops_iertr::now();
+ }
+ ).safe_then_unpack_interruptible(
+ [](auto submitted, auto all_completed) {
+ return all_completed.handle_error_interruptible(
+ crimson::ct_error::eagain::handle([] {
+ return seastar::now();
+ }));
+ }, crimson::ct_error::eagain::handle([] {
+ return interruptor::now();
+ })
+ );
+ });
+ });
+ });
+ }).handle_error_interruptible(PG::load_obc_ertr::all_same_way([] {
+ return seastar::now();
+ })).then_interruptible([] {
+ return seastar::stop_iteration::yes;
+ });
+ }, [this](std::exception_ptr eptr) {
+ if (should_abort_request(*this, std::move(eptr))) {
+ return seastar::stop_iteration::yes;
+ } else {
+ return seastar::stop_iteration::no;
+ }
+ }, pg);
+ }).then([this] {
+ track_event<CompletionEvent>();
+ });
+ });
+}
+
+} // namespace crimson::osd
+
diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h
new file mode 100644
index 000000000..8eed12e05
--- /dev/null
+++ b/src/crimson/osd/osd_operations/internal_client_request.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request_common.h"
+#include "crimson/osd/osd_operations/common/pg_pipeline.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_activation_blocker.h"
+
+namespace crimson::osd {
+
+class InternalClientRequest : public PhasedOperationT<InternalClientRequest>,
+ private CommonClientRequest {
+public:
+ explicit InternalClientRequest(Ref<PG> pg);
+ ~InternalClientRequest();
+
+ // imposed by `ShardService::start_operation<T>(...)`.
+ seastar::future<> start();
+
+protected:
+ virtual const hobject_t& get_target_oid() const = 0;
+ virtual PG::do_osd_ops_params_t get_do_osd_ops_params() const = 0;
+ virtual std::vector<OSDOp> create_osd_ops() = 0;
+
+ const PG& get_pg() const {
+ return *pg;
+ }
+
+private:
+ friend OperationT<InternalClientRequest>;
+
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::internal_client_request;
+
+ void print(std::ostream &) const final;
+ void dump_detail(Formatter *f) const final;
+
+ CommonPGPipeline& client_pp();
+
+ seastar::future<> do_process();
+
+ Ref<PG> pg;
+ OpInfo op_info;
+ PipelineHandle handle;
+
+public:
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ CommonPGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ CommonPGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::Process::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+};
+
+} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::InternalClientRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/logmissing_request.cc b/src/crimson/osd/osd_operations/logmissing_request.cc
new file mode 100644
index 000000000..739b46406
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "logmissing_request.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+LogMissingRequest::LogMissingRequest(crimson::net::ConnectionRef&& conn,
+ Ref<MOSDPGUpdateLogMissing> &&req)
+ : conn{std::move(conn)},
+ req{std::move(req)}
+{}
+
+void LogMissingRequest::print(std::ostream& os) const
+{
+ os << "LogMissingRequest("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void LogMissingRequest::dump_detail(Formatter *f) const
+{
+ f->open_object_section("LogMissingRequest");
+ f->dump_stream("req_tid") << req->get_tid();
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("entries") << req->entries;
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+ConnectionPipeline &LogMissingRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &LogMissingRequest::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+seastar::future<> LogMissingRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: LogMissingRequest::with_pg", *this);
+
+ IRef ref = this;
+ return interruptor::with_interruption([this, pg] {
+ logger().debug("{}: pg present", *this);
+ return this->template enter_stage<interruptor>(client_pp(*pg).await_map
+ ).then_interruptible([this, pg] {
+ return this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), req->min_epoch);
+ });
+ }).then_interruptible([this, pg](auto) {
+ return pg->do_update_log_missing(req, conn);
+ });
+ }, [ref](std::exception_ptr) { return seastar::now(); }, pg);
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h
new file mode 100644
index 000000000..71d0816fd
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSD;
+class PG;
+
+class LogMissingRequest final : public PhasedOperationT<LogMissingRequest> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request;
+ LogMissingRequest(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissing>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return req->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return req->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
+ > tracking_events;
+
+private:
+ ClientRequest::PGPipeline &client_pp(PG &pg);
+
+ crimson::net::ConnectionRef conn;
+ // must be after `conn` to ensure the ConnectionPipeline stays alive
+ PipelineHandle handle;
+ Ref<MOSDPGUpdateLogMissing> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LogMissingRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.cc b/src/crimson/osd/osd_operations/logmissing_request_reply.cc
new file mode 100644
index 000000000..b4bf2938e
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.cc
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "logmissing_request_reply.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+LogMissingRequestReply::LogMissingRequestReply(
+ crimson::net::ConnectionRef&& conn,
+ Ref<MOSDPGUpdateLogMissingReply> &&req)
+ : conn{std::move(conn)},
+ req{std::move(req)}
+{}
+
+void LogMissingRequestReply::print(std::ostream& os) const
+{
+ os << "LogMissingRequestReply("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void LogMissingRequestReply::dump_detail(Formatter *f) const
+{
+ f->open_object_section("LogMissingRequestReply");
+ f->dump_stream("rep_tid") << req->get_tid();
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+ConnectionPipeline &LogMissingRequestReply::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &LogMissingRequestReply::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+seastar::future<> LogMissingRequestReply::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: LogMissingRequestReply::with_pg", *this);
+
+ IRef ref = this;
+ return interruptor::with_interruption([this, pg] {
+ return pg->do_update_log_missing_reply(std::move(req));
+ }, [ref](std::exception_ptr) { return seastar::now(); }, pg);
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h
new file mode 100644
index 000000000..c89131fec
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSD;
+class PG;
+
+class LogMissingRequestReply final : public PhasedOperationT<LogMissingRequestReply> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request_reply;
+ LogMissingRequestReply(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissingReply>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return req->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return req->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
+ > tracking_events;
+
+private:
+ ClientRequest::PGPipeline &client_pp(PG &pg);
+
+ crimson::net::ConnectionRef conn;
+ // must be after `conn` to ensure the ConnectionPipeline is alive
+ PipelineHandle handle;
+ Ref<MOSDPGUpdateLogMissingReply> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LogMissingRequestReply> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h
new file mode 100644
index 000000000..c7b81e765
--- /dev/null
+++ b/src/crimson/osd/osd_operations/osdop_params.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "messages/MOSDOp.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+
+// The fields in this struct are parameters that may be needed at multiple
+// levels of processing. They are grouped in this struct to avoid passing
+// each of them as a separate method parameter.
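+//
+// A hypothetical sketch of how a caller might fill it in (illustrative only;
+// the message variable `m` below is not introduced by this change):
+//   osd_op_params_t params;
+//   params.req_id = m->get_reqid();  // reqid of the triggering MOSDOp
+//   params.mtime = m->get_mtime();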
+struct osd_op_params_t {
+ osd_reqid_t req_id;
+ utime_t mtime;
+ eversion_t at_version;
+ eversion_t pg_trim_to;
+ eversion_t min_last_complete_ondisk;
+ eversion_t last_complete;
+ version_t user_at_version = 0;
+ bool user_modify = false;
+ ObjectCleanRegions clean_regions;
+
+ osd_op_params_t() = default;
+};
diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc
new file mode 100644
index 000000000..ea4662bd0
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "messages/MOSDPGLog.h"
+
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+template <class T>
+void PeeringEvent<T>::print(std::ostream &lhs) const
+{
+ lhs << "PeeringEvent("
+ << "from=" << from
+ << " pgid=" << pgid
+ << " sent=" << evt.get_epoch_sent()
+ << " requested=" << evt.get_epoch_requested()
+ << " evt=" << evt.get_desc()
+ << ")";
+}
+
+template <class T>
+void PeeringEvent<T>::dump_detail(Formatter *f) const
+{
+ f->open_object_section("PeeringEvent");
+ f->dump_stream("from") << from;
+ f->dump_stream("pgid") << pgid;
+ f->dump_int("sent", evt.get_epoch_sent());
+ f->dump_int("requested", evt.get_epoch_requested());
+ f->dump_string("evt", evt.get_desc());
+ f->open_array_section("events");
+ {
+ std::apply([f](auto&... events) {
+ (..., events.dump(f));
+ }, static_cast<const T*>(this)->tracking_events);
+ }
+ f->close_section();
+ f->close_section();
+}
+
+
+template <class T>
+PGPeeringPipeline &PeeringEvent<T>::peering_pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+template <class T>
+seastar::future<> PeeringEvent<T>::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ if (!pg) {
+ logger().warn("{}: pg absent, did not create", *this);
+ on_pg_absent(shard_services);
+ that()->get_handle().exit();
+ return complete_rctx_no_pg(shard_services);
+ }
+
+ using interruptor = typename T::interruptor;
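+ // stage order: peering await_map -> pg osdmap gate -> process -> complete_rctx,
+ // then send_alive/send_pg_temp if needed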
+ return interruptor::with_interruption([this, pg, &shard_services] {
+ logger().debug("{}: pg present", *this);
+ return this->template enter_stage<interruptor>(peering_pp(*pg).await_map
+ ).then_interruptible([this, pg] {
+ return this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), evt.get_epoch_sent());
+ });
+ }).then_interruptible([this, pg](auto) {
+ return this->template enter_stage<interruptor>(peering_pp(*pg).process);
+ }).then_interruptible([this, pg, &shard_services] {
+ return pg->do_peering_event(evt, ctx
+ ).then_interruptible([this, pg, &shard_services] {
+ that()->get_handle().exit();
+ return complete_rctx(shard_services, pg);
+ });
+ }).then_interruptible([pg, &shard_services]()
+ -> typename T::template interruptible_future<> {
+ if (!pg->get_need_up_thru()) {
+ return seastar::now();
+ }
+ return shard_services.send_alive(pg->get_same_interval_since());
+ }).then_interruptible([&shard_services] {
+ return shard_services.send_pg_temp();
+ });
+ }, [this](std::exception_ptr ep) {
+ logger().debug("{}: interrupted with {}", *this, ep);
+ }, pg);
+}
+
+template <class T>
+void PeeringEvent<T>::on_pg_absent(ShardServices &)
+{
+ logger().debug("{}: pg absent, dropping", *this);
+}
+
+template <class T>
+typename PeeringEvent<T>::template interruptible_future<>
+PeeringEvent<T>::complete_rctx(ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: submitting ctx", *this);
+ return shard_services.dispatch_context(
+ pg->get_collection_ref(),
+ std::move(ctx));
+}
+
+ConnectionPipeline &RemotePeeringEvent::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).peering_request_conn_pipeline;
+}
+
+void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services)
+{
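+ // the PG does not exist here; if the event is an MQuery, answer with an
+ // empty info/log so the querying peer can make progress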
+ if (auto& e = get_event().get_event();
+ e.dynamic_type() == MQuery::static_type()) {
+ const auto map_epoch =
+ shard_services.get_map()->get_epoch();
+ const auto& q = static_cast<const MQuery&>(e);
+ const pg_info_t empty{spg_t{pgid.pgid, q.query.to}};
+ if (q.query.type == q.query.LOG ||
+ q.query.type == q.query.FULLLOG) {
+ auto m = crimson::make_message<MOSDPGLog>(q.query.from, q.query.to,
+ map_epoch, empty,
+ q.query.epoch_sent);
+ ctx.send_osd_message(q.from.osd, std::move(m));
+ } else {
+ ctx.send_notify(q.from.osd, {q.query.from, q.query.to,
+ q.query.epoch_sent,
+ map_epoch, empty,
+ PastIntervals{}});
+ }
+ }
+}
+
+RemotePeeringEvent::interruptible_future<> RemotePeeringEvent::complete_rctx(
+ ShardServices &shard_services,
+ Ref<PG> pg)
+{
+ if (pg) {
+ return PeeringEvent::complete_rctx(shard_services, pg);
+ } else {
+ return shard_services.dispatch_context_messages(std::move(ctx));
+ }
+}
+
+seastar::future<> RemotePeeringEvent::complete_rctx_no_pg(
+ ShardServices &shard_services)
+{
+ return shard_services.dispatch_context_messages(std::move(ctx));
+}
+
+seastar::future<> LocalPeeringEvent::start()
+{
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
+ auto maybe_delay = seastar::now();
+ if (delay) {
+ maybe_delay = seastar::sleep(
+ std::chrono::milliseconds(std::lround(delay * 1000)));
+ }
+ return maybe_delay.then([this] {
+ return with_pg(pg->get_shard_services(), pg);
+ }).finally([ref=std::move(ref)] {
+ logger().debug("{}: complete", *ref);
+ });
+}
+
+
+LocalPeeringEvent::~LocalPeeringEvent() {}
+
+template class PeeringEvent<RemotePeeringEvent>;
+template class PeeringEvent<LocalPeeringEvent>;
+
+}
diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h
new file mode 100644
index 000000000..e94caead1
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "osd/osd_types.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class OSD;
+class ShardServices;
+class PG;
+class BackfillRecovery;
+
+ class PGPeeringPipeline {
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map";
+ } await_map;
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "PeeringEvent::PGPipeline::process";
+ } process;
+ template <class T>
+ friend class PeeringEvent;
+ friend class LocalPeeringEvent;
+ friend class RemotePeeringEvent;
+ friend class PGAdvanceMap;
+ friend class BackfillRecovery;
+ };
+
+template <class T>
+class PeeringEvent : public PhasedOperationT<T> {
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::peering_event;
+
+protected:
+ PGPeeringPipeline &peering_pp(PG &pg);
+
+ PeeringCtx ctx;
+ pg_shard_t from;
+ spg_t pgid;
+ float delay = 0;
+ PGPeeringEvent evt;
+
+ const pg_shard_t get_from() const {
+ return from;
+ }
+
+ const spg_t get_pgid() const {
+ return pgid;
+ }
+
+ const PGPeeringEvent &get_event() const {
+ return evt;
+ }
+
+ virtual void on_pg_absent(ShardServices &);
+
+ virtual typename PeeringEvent::template interruptible_future<>
+ complete_rctx(ShardServices &, Ref<PG>);
+
+ virtual seastar::future<> complete_rctx_no_pg(
+ ShardServices &shard_services
+ ) { return seastar::now();}
+
+public:
+ template <typename... Args>
+ PeeringEvent(
+ const pg_shard_t &from, const spg_t &pgid,
+ Args&&... args) :
+ from(from),
+ pgid(pgid),
+ evt(std::forward<Args>(args)...)
+ {}
+ template <typename... Args>
+ PeeringEvent(
+ const pg_shard_t &from, const spg_t &pgid,
+ float delay, Args&&... args) :
+ from(from),
+ pgid(pgid),
+ delay(delay),
+ evt(std::forward<Args>(args)...)
+ {}
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+};
+
+class RemotePeeringEvent : public PeeringEvent<RemotePeeringEvent> {
+protected:
+ crimson::net::ConnectionRef conn;
+ // must be after conn due to the ConnectionPipeline's lifetime
+ PipelineHandle handle;
+
+ void on_pg_absent(ShardServices &) final;
+ PeeringEvent::interruptible_future<> complete_rctx(
+ ShardServices &shard_services,
+ Ref<PG> pg) override;
+ seastar::future<> complete_rctx_no_pg(
+ ShardServices &shard_services
+ ) override;
+
+public:
+ class OSDPipeline {
+ struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> {
+ static constexpr auto type_name =
+ "PeeringRequest::OSDPipeline::await_active";
+ } await_active;
+ friend class RemotePeeringEvent;
+ };
+
+ template <typename... Args>
+ RemotePeeringEvent(crimson::net::ConnectionRef conn, Args&&... args) :
+ PeeringEvent(std::forward<Args>(args)...),
+ conn(conn)
+ {}
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ PGPeeringPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGPeeringPipeline::Process::BlockingEvent,
+ OSDPipeline::AwaitActive::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+ static constexpr bool can_create() { return true; }
+ auto get_create_info() { return std::move(evt.create_info); }
+ spg_t get_pgid() const {
+ return pgid;
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return evt.get_epoch_sent(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+};
+
+class LocalPeeringEvent final : public PeeringEvent<LocalPeeringEvent> {
+protected:
+ Ref<PG> pg;
+ PipelineHandle handle;
+
+public:
+ template <typename... Args>
+ LocalPeeringEvent(Ref<PG> pg, Args&&... args) :
+ PeeringEvent(std::forward<Args>(args)...),
+ pg(pg)
+ {}
+
+ seastar::future<> start();
+ virtual ~LocalPeeringEvent();
+
+ PipelineHandle &get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ PGPeeringPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGPeeringPipeline::Process::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+};
+
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LocalPeeringEvent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::RemotePeeringEvent> : fmt::ostream_formatter {};
+template <class T> struct fmt::formatter<crimson::osd::PeeringEvent<T>> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc
new file mode 100644
index 000000000..3706af810
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+PGAdvanceMap::PGAdvanceMap(
+ ShardServices &shard_services, Ref<PG> pg, epoch_t to,
+ PeeringCtx &&rctx, bool do_init)
+ : shard_services(shard_services), pg(pg), to(to),
+ rctx(std::move(rctx)), do_init(do_init)
+{
+ logger().debug("{}: created", *this);
+}
+
+PGAdvanceMap::~PGAdvanceMap() {}
+
+void PGAdvanceMap::print(std::ostream &lhs) const
+{
+ lhs << "PGAdvanceMap("
+ << "pg=" << pg->get_pgid()
+ << " from=" << (from ? *from : -1)
+ << " to=" << to;
+ if (do_init) {
+ lhs << " do_init";
+ }
+ lhs << ")";
+}
+
+void PGAdvanceMap::dump_detail(Formatter *f) const
+{
+ f->open_object_section("PGAdvanceMap");
+ f->dump_stream("pgid") << pg->get_pgid();
+ if (from) {
+ f->dump_int("from", *from);
+ }
+ f->dump_int("to", to);
+ f->dump_bool("do_init", do_init);
+ f->close_section();
+}
+
+PGPeeringPipeline &PGAdvanceMap::peering_pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+seastar::future<> PGAdvanceMap::start()
+{
+ using cached_map_t = OSDMapService::cached_map_t;
+
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
+ return enter_stage<>(
+ peering_pp(*pg).process
+ ).then([this] {
+ /*
+ * PGAdvanceMap is scheduled at pg creation and when
+ * broadcasting new osdmaps to pgs. We are not able to serialize
+ * between the two different PGAdvanceMap callers since a new pg
+ * will get advanced to the latest osdmap at its creation.
+ * As a result, we may need to adjust the PGAdvanceMap operation's
+ * 'from' epoch.
+ * See: https://tracker.ceph.com/issues/61744
+ */
+ from = pg->get_osdmap_epoch();
+ auto fut = seastar::now();
+ if (do_init) {
+ fut = pg->handle_initialize(rctx
+ ).then([this] {
+ return pg->handle_activate_map(rctx);
+ });
+ }
+ return fut.then([this] {
+ ceph_assert(std::cmp_less_equal(*from, to));
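+ // advance the PG one epoch at a time, from *from + 1 up to and including `to`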
+ return seastar::do_for_each(
+ boost::make_counting_iterator(*from + 1),
+ boost::make_counting_iterator(to + 1),
+ [this](epoch_t next_epoch) {
+ logger().debug("{}: start: getting map {}",
+ *this, next_epoch);
+ return shard_services.get_map(next_epoch).then(
+ [this] (cached_map_t&& next_map) {
+ logger().debug("{}: advancing map to {}",
+ *this, next_map->get_epoch());
+ return pg->handle_advance_map(next_map, rctx);
+ });
+ }).then([this] {
+ return pg->handle_activate_map(rctx).then([this] {
+ logger().debug("{}: map activated", *this);
+ if (do_init) {
+ shard_services.pg_created(pg->get_pgid(), pg);
+ logger().info("PGAdvanceMap::start new pg {}", *pg);
+ }
+ return seastar::when_all_succeed(
+ pg->get_need_up_thru()
+ ? shard_services.send_alive(
+ pg->get_same_interval_since())
+ : seastar::now(),
+ shard_services.dispatch_context(
+ pg->get_collection_ref(),
+ std::move(rctx)));
+ });
+ }).then_unpack([this] {
+ logger().debug("{}: sending pg temp", *this);
+ return shard_services.send_pg_temp();
+ });
+ });
+ }).then([this, ref=std::move(ref)] {
+ logger().debug("{}: complete", *this);
+ });
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h
new file mode 100644
index 000000000..b712cc12e
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+class PG;
+
+class PGAdvanceMap : public PhasedOperationT<PGAdvanceMap> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map;
+
+protected:
+ ShardServices &shard_services;
+ Ref<PG> pg;
+ PipelineHandle handle;
+
+ std::optional<epoch_t> from;
+ epoch_t to;
+
+ PeeringCtx rctx;
+ const bool do_init;
+
+public:
+ PGAdvanceMap(
+ ShardServices &shard_services, Ref<PG> pg, epoch_t to,
+ PeeringCtx &&rctx, bool do_init);
+ ~PGAdvanceMap();
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter *f) const final;
+ seastar::future<> start();
+ PipelineHandle &get_handle() { return handle; }
+
+ std::tuple<
+ PGPeeringPipeline::Process::BlockingEvent
+ > tracking_events;
+
+private:
+ PGPeeringPipeline &peering_pp(PG &pg);
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::PGAdvanceMap> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc
new file mode 100644
index 000000000..68655b8da
--- /dev/null
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc
@@ -0,0 +1,46 @@
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::RecoverySubRequest> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+namespace crimson::osd {
+
+seastar::future<> RecoverySubRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pgref)
+{
+ logger().debug("{}: {}", "RecoverySubRequest::with_pg", *this);
+
+ track_event<StartEvent>();
+ IRef opref = this;
+ return interruptor::with_interruption([this, pgref] {
+ return pgref->get_recovery_backend()->handle_recovery_op(m, conn);
+ }, [](std::exception_ptr) {
+ return seastar::now();
+ }, pgref).finally([this, opref, pgref] {
+ track_event<CompletionEvent>();
+ });
+}
+
+ConnectionPipeline &RecoverySubRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).peering_request_conn_pipeline;
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h
new file mode 100644
index 000000000..07c7c95b5
--- /dev/null
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd/osd_op_util.h"
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDFastDispatchOp.h"
+
+namespace crimson::osd {
+
+class PG;
+
+class RecoverySubRequest final : public PhasedOperationT<RecoverySubRequest> {
+public:
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::background_recovery_sub;
+
+ RecoverySubRequest(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp>&& m)
+ : conn(conn), m(m) {}
+
+ void print(std::ostream& out) const final
+ {
+ out << *m;
+ }
+
+ void dump_detail(Formatter *f) const final
+ {
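+ // no detail beyond what print() already emits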
+ }
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return m->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return m->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+private:
+ crimson::net::ConnectionRef conn;
+ // must be after `conn` to ensure the ConnectionPipeline is alive
+ PipelineHandle handle;
+ Ref<MOSDFastDispatchOp> m;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::RecoverySubRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc
new file mode 100644
index 000000000..09217575c
--- /dev/null
+++ b/src/crimson/osd/osd_operations/replicated_request.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "replicated_request.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+RepRequest::RepRequest(crimson::net::ConnectionRef&& conn,
+ Ref<MOSDRepOp> &&req)
+ : conn{std::move(conn)},
+ req{std::move(req)}
+{}
+
+void RepRequest::print(std::ostream& os) const
+{
+ os << "RepRequest("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void RepRequest::dump_detail(Formatter *f) const
+{
+ f->open_object_section("RepRequest");
+ f->dump_stream("reqid") << req->reqid;
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("oid") << req->poid;
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+ConnectionPipeline &RepRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &RepRequest::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+seastar::future<> RepRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: RepRequest::with_pg", *this);
+ IRef ref = this;
+ return interruptor::with_interruption([this, pg] {
+ logger().debug("{}: pg present", *this);
+ return this->template enter_stage<interruptor>(client_pp(*pg).await_map
+ ).then_interruptible([this, pg] {
+ return this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), req->min_epoch);
+ });
+ }).then_interruptible([this, pg] (auto) {
+ return pg->handle_rep_op(req);
+ });
+ }, [ref](std::exception_ptr) {
+ return seastar::now();
+ }, pg);
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h
new file mode 100644
index 000000000..c742888d9
--- /dev/null
+++ b/src/crimson/osd/osd_operations/replicated_request.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDRepOp.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSD;
+class PG;
+
+class RepRequest final : public PhasedOperationT<RepRequest> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::replicated_request;
+ RepRequest(crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return req->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return req->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
+ > tracking_events;
+
+private:
+ ClientRequest::PGPipeline &client_pp(PG &pg);
+
+ crimson::net::ConnectionRef conn;
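+ // must be after `conn` to ensure the ConnectionPipeline is alive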
+ PipelineHandle handle;
+ Ref<MOSDRepOp> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::RepRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc
new file mode 100644
index 000000000..e4a1b04df
--- /dev/null
+++ b/src/crimson/osd/osd_operations/snaptrim_event.cc
@@ -0,0 +1,569 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/osd_operations/snaptrim_event.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/pg.h"
+#include <seastar/core/sleep.hh>
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::SnapTrimEvent> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+
+ template <>
+ struct EventBackendRegistry<osd::SnapTrimObjSubEvent> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+namespace crimson::osd {
+
+PG::interruptible_future<>
+PG::SnapTrimMutex::lock(SnapTrimEvent &st_event) noexcept
+{
+ return st_event.enter_stage<interruptor>(wait_pg
+ ).then_interruptible([this] {
+ return mutex.lock();
+ });
+}
+
+void SnapTrimEvent::SubOpBlocker::dump_detail(Formatter *f) const
+{
+ f->open_array_section("dependent_operations");
+ {
+ for (const auto &kv : subops) {
+ f->dump_unsigned("op_id", kv.first);
+ }
+ }
+ f->close_section();
+}
+
+template <class... Args>
+void SnapTrimEvent::SubOpBlocker::emplace_back(Args&&... args)
+{
+ subops.emplace_back(std::forward<Args>(args)...);
+};
+
+SnapTrimEvent::remove_or_update_iertr::future<>
+SnapTrimEvent::SubOpBlocker::wait_completion()
+{
+ return interruptor::do_for_each(subops, [](auto&& kv) {
+ return std::move(kv.second);
+ });
+}
+
+void SnapTrimEvent::print(std::ostream &lhs) const
+{
+ lhs << "SnapTrimEvent("
+ << "pgid=" << pg->get_pgid()
+ << " snapid=" << snapid
+ << " needs_pause=" << needs_pause
+ << ")";
+}
+
+void SnapTrimEvent::dump_detail(Formatter *f) const
+{
+ f->open_object_section("SnapTrimEvent");
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->close_section();
+}
+
+SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration>
+SnapTrimEvent::start()
+{
+ logger().debug("{}: {}", *this, __func__);
+ return with_pg(
+ pg->get_shard_services(), pg
+ ).finally([ref=IRef{this}, this] {
+ logger().debug("{}: complete", *ref);
+ return handle.complete();
+ });
+}
+
+CommonPGPipeline& SnapTrimEvent::client_pp()
+{
+ return pg->request_pg_pipeline;
+}
+
+SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration>
+SnapTrimEvent::with_pg(
+ ShardServices &shard_services, Ref<PG> _pg)
+{
+ return interruptor::with_interruption([&shard_services, this] {
+ return enter_stage<interruptor>(
+ client_pp().wait_for_active
+ ).then_interruptible([this] {
+ return with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().recover_missing);
+ }).then_interruptible([] {
+ //return do_recover_missing(pg, get_target_oid());
+ return seastar::now();
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().get_obc);
+ }).then_interruptible([this] {
+ return pg->snaptrim_mutex.lock(*this);
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().process);
+ }).then_interruptible([&shard_services, this] {
+ return interruptor::async([this] {
+ std::vector<hobject_t> to_trim;
+ using crimson::common::local_conf;
+ const auto max =
+ local_conf().get_val<uint64_t>("osd_pg_max_concurrent_snap_trims");
+ // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
+ // the ENOENT below and erase snapid.
+ int r = snap_mapper.get_next_objects_to_trim(
+ snapid,
+ max,
+ &to_trim);
+ if (r == -ENOENT) {
+ to_trim.clear(); // paranoia
+ return to_trim;
+ } else if (r != 0) {
+ logger().error("{}: get_next_objects_to_trim returned {}",
+ *this, cpp_strerror(r));
+ ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
+ } else {
+ assert(!to_trim.empty());
+ }
+ logger().debug("{}: async almost done line {}", *this, __LINE__);
+ return to_trim;
+ }).then_interruptible([&shard_services, this] (const auto& to_trim) {
+ if (to_trim.empty()) {
+ // the legit ENOENT -> done
+ logger().debug("{}: to_trim is empty! Stopping iteration", *this);
+ pg->snaptrim_mutex.unlock();
+ return snap_trim_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ return [&shard_services, this](const auto &to_trim) {
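+        // start one SnapTrimObjSubEvent per object; subop_blocker records each
+        // id and future so wait_subop below can await them all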
+ for (const auto& object : to_trim) {
+ logger().debug("{}: trimming {}", *this, object);
+ auto [op, fut] = shard_services.start_operation_may_interrupt<
+ interruptor, SnapTrimObjSubEvent>(
+ pg,
+ object,
+ snapid);
+ subop_blocker.emplace_back(
+ op->get_id(),
+ std::move(fut)
+ );
+ }
+ return interruptor::now();
+ }(to_trim).then_interruptible([this] {
+ return enter_stage<interruptor>(wait_subop);
+ }).then_interruptible([this] {
+ logger().debug("{}: awaiting completion", *this);
+ return subop_blocker.wait_completion();
+ }).finally([this] {
+ pg->snaptrim_mutex.unlock();
+ }).safe_then_interruptible([this] {
+ if (!needs_pause) {
+ return interruptor::now();
+ }
+ // let operators know we're waiting
+ return enter_stage<interruptor>(
+ wait_trim_timer
+ ).then_interruptible([this] {
+ using crimson::common::local_conf;
+ const auto time_to_sleep =
+ local_conf().template get_val<double>("osd_snap_trim_sleep");
+ logger().debug("{}: time_to_sleep {}", *this, time_to_sleep);
+ // TODO: this logic should be more sophisticated and distinguish
+ // between SSDs, HDDs and the hybrid case
+ return seastar::sleep(
+ std::chrono::milliseconds(std::lround(time_to_sleep * 1000)));
+ });
+ }).safe_then_interruptible([this] {
+ logger().debug("{}: all completed", *this);
+ return snap_trim_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ });
+ }, [this](std::exception_ptr eptr) -> snap_trim_ertr::future<seastar::stop_iteration> {
+ logger().debug("{}: interrupted {}", *this, eptr);
+ return crimson::ct_error::eagain::make();
+ }, pg);
+}
+
+
+CommonPGPipeline& SnapTrimObjSubEvent::client_pp()
+{
+ return pg->request_pg_pipeline;
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<>
+SnapTrimObjSubEvent::start()
+{
+ logger().debug("{}: start", *this);
+ return with_pg(
+ pg->get_shard_services(), pg
+ ).finally([ref=IRef{this}, this] {
+ logger().debug("{}: complete", *ref);
+ return handle.complete();
+ });
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<>
+SnapTrimObjSubEvent::remove_clone(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ const auto p = std::find(
+ head_obc->ssc->snapset.clones.begin(),
+ head_obc->ssc->snapset.clones.end(),
+ coid.snap);
+ if (p == head_obc->ssc->snapset.clones.end()) {
+ logger().error("{}: Snap {} not in clones",
+ *this, coid.snap);
+ return crimson::ct_error::enoent::make();
+ }
+ assert(p != head_obc->ssc->snapset.clones.end());
+ snapid_t last = coid.snap;
+ delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(last);
+
+ if (p != head_obc->ssc->snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ std::vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = coid;
+ prev_coid.snap = *n;
+
+ // does the classical OSD really need is_present_clone(prev_coid)?
+ delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(*n);
+ head_obc->ssc->snapset.clone_overlap[*n].intersection_of(
+ head_obc->ssc->snapset.clone_overlap[*p]);
+ delta_stats.num_bytes += head_obc->ssc->snapset.get_clone_bytes(*n);
+ }
+ delta_stats.num_objects--;
+ if (obc->obs.oi.is_dirty()) {
+ delta_stats.num_objects_dirty--;
+ }
+ if (obc->obs.oi.is_omap()) {
+ delta_stats.num_objects_omap--;
+ }
+ if (obc->obs.oi.is_whiteout()) {
+ logger().debug("{}: trimming whiteout on {}",
+ *this, coid);
+ delta_stats.num_whiteouts--;
+ }
+ delta_stats.num_object_clones--;
+
+ obc->obs.exists = false;
+ head_obc->ssc->snapset.clones.erase(p);
+ head_obc->ssc->snapset.clone_overlap.erase(last);
+ head_obc->ssc->snapset.clone_size.erase(last);
+ head_obc->ssc->snapset.clone_snaps.erase(last);
+
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::DELETE,
+ coid,
+ osd_op_p.at_version,
+ obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime, // will be replaced in `apply_to()`
+ 0}
+ );
+ txn.remove(
+ pg->get_collection_ref()->get_cid(),
+ ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD});
+ obc->obs.oi = object_info_t(coid);
+ return OpsExecuter::snap_map_remove(coid, pg->snap_mapper, pg->osdriver, txn);
+}
+
+void SnapTrimObjSubEvent::remove_head_whiteout(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ // NOTE: this arguably constitutes minor interference with the
+ // tiering agent if this is a cache tier since a snap trim event
+ // is effectively evicting a whiteout we might otherwise want to
+ // keep around.
+ const auto head_oid = coid.get_head();
+ logger().info("{}: {} removing {}",
+ *this, coid, head_oid);
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::DELETE,
+ head_oid,
+ osd_op_p.at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime, // will be replaced in `apply_to()`
+ 0}
+ );
+ logger().info("{}: remove snap head", *this);
+ object_info_t& oi = head_obc->obs.oi;
+ delta_stats.num_objects--;
+ if (oi.is_dirty()) {
+ delta_stats.num_objects_dirty--;
+ }
+ if (oi.is_omap()) {
+ delta_stats.num_objects_omap--;
+ }
+ if (oi.is_whiteout()) {
+ logger().debug("{}: trimming whiteout on {}",
+ *this, oi.soid);
+ delta_stats.num_whiteouts--;
+ }
+ head_obc->obs.exists = false;
+ head_obc->obs.oi = object_info_t(head_oid);
+ txn.remove(pg->get_collection_ref()->get_cid(),
+ ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD});
+}
+
+SnapTrimObjSubEvent::interruptible_future<>
+SnapTrimObjSubEvent::adjust_snaps(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ const std::set<snapid_t>& new_snaps,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ head_obc->ssc->snapset.clone_snaps[coid.snap] =
+ std::vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
+
+ // we still do a 'modify' event on this object just to trigger a
+ // snapmapper.update ... :(
+ obc->obs.oi.prior_version = obc->obs.oi.version;
+ obc->obs.oi.version = osd_op_p.at_version;
+ ceph::bufferlist bl;
+ encode(obc->obs.oi,
+ bl,
+ pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ txn.setattr(
+ pg->get_collection_ref()->get_cid(),
+ ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD},
+ OI_ATTR,
+ bl);
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::MODIFY,
+ coid,
+ obc->obs.oi.version,
+ obc->obs.oi.prior_version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime,
+ 0}
+ );
+ return OpsExecuter::snap_map_modify(
+ coid, new_snaps, pg->snap_mapper, pg->osdriver, txn);
+}
+
+void SnapTrimObjSubEvent::update_head(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ const auto head_oid = coid.get_head();
+ logger().info("{}: writing updated snapset on {}, snapset is {}",
+ *this, head_oid, head_obc->ssc->snapset);
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::MODIFY,
+ head_oid,
+ osd_op_p.at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime,
+ 0}
+ );
+
+ head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
+ head_obc->obs.oi.version = osd_op_p.at_version;
+
+ std::map<std::string, ceph::bufferlist, std::less<>> attrs;
+ ceph::bufferlist bl;
+ encode(head_obc->ssc->snapset, bl);
+ attrs[SS_ATTR] = std::move(bl);
+
+ bl.clear();
+ head_obc->obs.oi.encode_no_oid(bl,
+ pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ attrs[OI_ATTR] = std::move(bl);
+ txn.setattrs(
+ pg->get_collection_ref()->get_cid(),
+ ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD},
+ attrs);
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<
+ SnapTrimObjSubEvent::remove_or_update_ret_t>
+SnapTrimObjSubEvent::remove_or_update(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc)
+{
+ auto citer = head_obc->ssc->snapset.clone_snaps.find(coid.snap);
+ if (citer == head_obc->ssc->snapset.clone_snaps.end()) {
+ logger().error("{}: No clone_snaps in snapset {} for object {}",
+ *this, head_obc->ssc->snapset, coid);
+ return crimson::ct_error::enoent::make();
+ }
+ const auto& old_snaps = citer->second;
+ if (old_snaps.empty()) {
+ logger().error("{}: no object info snaps for object {}",
+ *this, coid);
+ return crimson::ct_error::enoent::make();
+ }
+ if (head_obc->ssc->snapset.seq == 0) {
+ logger().error("{}: no snapset.seq for object {}",
+ *this, coid);
+ return crimson::ct_error::enoent::make();
+ }
+ const OSDMapRef& osdmap = pg->get_osdmap();
+ std::set<snapid_t> new_snaps;
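+ // keep only the snaps that are neither in the osdmap's removed-snaps queue
+ // nor the snap being trimmed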
+ for (const auto& old_snap : old_snaps) {
+ if (!osdmap->in_removed_snaps_queue(pg->get_info().pgid.pgid.pool(),
+ old_snap)
+ && old_snap != snap_to_trim) {
+ new_snaps.insert(old_snap);
+ }
+ }
+
+ return seastar::do_with(ceph::os::Transaction{}, [=, this](auto &txn) {
+ std::vector<pg_log_entry_t> log_entries{};
+
+ int64_t num_objects_before_trim = delta_stats.num_objects;
+ osd_op_p.at_version = pg->next_version();
+ auto ret = remove_or_update_iertr::now();
+ if (new_snaps.empty()) {
+ // remove clone from snapset
+ logger().info("{}: {} snaps {} -> {} ... deleting",
+ *this, coid, old_snaps, new_snaps);
+ ret = remove_clone(obc, head_obc, txn, log_entries);
+ } else {
+ // save adjusted snaps for this object
+ logger().info("{}: {} snaps {} -> {}",
+ *this, coid, old_snaps, new_snaps);
+ ret = adjust_snaps(obc, head_obc, new_snaps, txn, log_entries);
+ }
+ return std::move(ret).safe_then_interruptible(
+ [&txn, obc, num_objects_before_trim, log_entries=std::move(log_entries), head_obc=std::move(head_obc), this]() mutable {
+ osd_op_p.at_version = pg->next_version();
+
+ // save head snapset
+ logger().debug("{}: {} new snapset {} on {}",
+ *this, coid, head_obc->ssc->snapset, head_obc->obs.oi);
+ if (head_obc->ssc->snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) {
+ remove_head_whiteout(obc, head_obc, txn, log_entries);
+ } else {
+ update_head(obc, head_obc, txn, log_entries);
+ }
+ // Stats reporting - Set number of objects trimmed
+ if (num_objects_before_trim > delta_stats.num_objects) {
+ //int64_t num_objects_trimmed =
+ // num_objects_before_trim - delta_stats.num_objects;
+ //add_objects_trimmed_count(num_objects_trimmed);
+ }
+ }).safe_then_interruptible(
+ [&txn, log_entries=std::move(log_entries)] () mutable {
+ return remove_or_update_iertr::make_ready_future<remove_or_update_ret_t>(
+ std::make_pair(std::move(txn), std::move(log_entries)));
+ });
+ });
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<>
+SnapTrimObjSubEvent::with_pg(
+ ShardServices &shard_services, Ref<PG> _pg)
+{
+ return enter_stage<interruptor>(
+ client_pp().wait_for_active
+ ).then_interruptible([this] {
+ return with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().recover_missing);
+ }).then_interruptible([] {
+ //return do_recover_missing(pg, get_target_oid());
+ return seastar::now();
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().get_obc);
+ }).then_interruptible([this] {
+ logger().debug("{}: getting obc for {}", *this, coid);
+ // end of commonality
+ // with_clone_obc_direct locks both the clone's and the head's obcs
+ return pg->obc_loader.with_clone_obc_direct<RWState::RWWRITE>(
+ coid,
+ [this](auto head_obc, auto clone_obc) {
+ logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid());
+ return enter_stage<interruptor>(
+ client_pp().process
+ ).then_interruptible(
+ [this,clone_obc=std::move(clone_obc), head_obc=std::move(head_obc)]() mutable {
+ logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid());
+ return remove_or_update(
+ clone_obc, head_obc
+ ).safe_then_unpack_interruptible([clone_obc, this]
+ (auto&& txn, auto&& log_entries) mutable {
+ auto [submitted, all_completed] = pg->submit_transaction(
+ std::move(clone_obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries));
+ return submitted.then_interruptible(
+ [all_completed=std::move(all_completed), this] () mutable {
+ return enter_stage<interruptor>(
+ wait_repop
+ ).then_interruptible([all_completed=std::move(all_completed)] () mutable {
+ return std::move(all_completed);
+ });
+ });
+ });
+ });
+ }).handle_error_interruptible(
+ remove_or_update_iertr::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"}
+ );
+ });
+}
+
+void SnapTrimObjSubEvent::print(std::ostream &lhs) const
+{
+ lhs << "SnapTrimObjSubEvent("
+ << "coid=" << coid
+ << " snapid=" << snap_to_trim
+ << ")";
+}
+
+void SnapTrimObjSubEvent::dump_detail(Formatter *f) const
+{
+ f->open_object_section("SnapTrimObjSubEvent");
+ f->dump_stream("coid") << coid;
+ f->close_section();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h
new file mode 100644
index 000000000..a3a970a04
--- /dev/null
+++ b/src/crimson/osd/osd_operations/snaptrim_event.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/common/pg_pipeline.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "osd/osd_types.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+class SnapMapper;
+
+namespace crimson::osd {
+
+class OSD;
+class ShardServices;
+
+// trim up to `max` objects for snapshot `snapid`
+class SnapTrimEvent final : public PhasedOperationT<SnapTrimEvent> {
+public:
+ using remove_or_update_ertr =
+ crimson::errorator<crimson::ct_error::enoent>;
+ using remove_or_update_iertr =
+ crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, remove_or_update_ertr>;
+ using snap_trim_ertr = remove_or_update_ertr::extend<
+ crimson::ct_error::eagain>;
+ using snap_trim_iertr = remove_or_update_iertr::extend<
+ crimson::ct_error::eagain>;
+
+ static constexpr OperationTypeCode type = OperationTypeCode::snaptrim_event;
+
+ SnapTrimEvent(Ref<PG> pg,
+ SnapMapper& snap_mapper,
+ const snapid_t snapid,
+ const bool needs_pause)
+ : pg(std::move(pg)),
+ snap_mapper(snap_mapper),
+ snapid(snapid),
+ needs_pause(needs_pause) {}
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ snap_trim_ertr::future<seastar::stop_iteration> start();
+ snap_trim_ertr::future<seastar::stop_iteration> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+private:
+ CommonPGPipeline& client_pp();
+
+ // based on 998cb8c141bb89aafae298a9d5e130fbd78fe5f2
+ struct SubOpBlocker : crimson::BlockerT<SubOpBlocker> {
+ static constexpr const char* type_name = "CompoundOpBlocker";
+
+ using id_done_t = std::pair<crimson::Operation::id_t,
+ remove_or_update_iertr::future<>>;
+
+ void dump_detail(Formatter *f) const final;
+
+ template <class... Args>
+ void emplace_back(Args&&... args);
+
+ remove_or_update_iertr::future<> wait_completion();
+ private:
+ std::vector<id_done_t> subops;
+ } subop_blocker;
+
+ // we don't need to synchronize with other instances of SnapTrimEvent;
+ // it's here for the sake of op tracking.
+ struct WaitSubop : OrderedConcurrentPhaseT<WaitSubop> {
+ static constexpr auto type_name = "SnapTrimEvent::wait_subop";
+ } wait_subop;
+
+ // an instantiator can instruct us to enter this stage and then
+ // wait on the future to implement throttling. It is implemented
+ // that way for the sake of tracking ops.
+ struct WaitTrimTimer : OrderedExclusivePhaseT<WaitTrimTimer> {
+ static constexpr auto type_name = "SnapTrimEvent::wait_trim_timer";
+ } wait_trim_timer;
+
+ PipelineHandle handle;
+ Ref<PG> pg;
+ SnapMapper& snap_mapper;
+ const snapid_t snapid;
+ const bool needs_pause;
+
+public:
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ CommonPGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ CommonPGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::Process::BlockingEvent,
+ WaitSubop::BlockingEvent,
+ PG::SnapTrimMutex::WaitPG::BlockingEvent,
+ WaitTrimTimer::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+ friend class PG::SnapTrimMutex;
+};
+
+// remove a single object. A SnapTrimEvent can create multiple subrequests.
+// The division of labour is needed because of the restriction that an Op
+// cannot revisit a pipeline stage it has already seen.
+class SnapTrimObjSubEvent : public PhasedOperationT<SnapTrimObjSubEvent> {
+public:
+ using remove_or_update_ertr =
+ crimson::errorator<crimson::ct_error::enoent>;
+ using remove_or_update_iertr =
+ crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, remove_or_update_ertr>;
+
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::snaptrimobj_subevent;
+
+ SnapTrimObjSubEvent(
+ Ref<PG> pg,
+ const hobject_t& coid,
+ snapid_t snap_to_trim)
+ : pg(std::move(pg)),
+ coid(coid),
+ snap_to_trim(snap_to_trim) {
+ }
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ remove_or_update_iertr::future<> start();
+ remove_or_update_iertr::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ CommonPGPipeline& client_pp();
+
+private:
+ object_stat_sum_t delta_stats;
+
+ remove_or_update_iertr::future<> remove_clone(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+ void remove_head_whiteout(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+ interruptible_future<> adjust_snaps(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ const std::set<snapid_t>& new_snaps,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+ void update_head(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+
+ using remove_or_update_ret_t =
+ std::pair<ceph::os::Transaction, std::vector<pg_log_entry_t>>;
+ remove_or_update_iertr::future<remove_or_update_ret_t>
+ remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc);
+
+ // we don't need to synchronize with other instances started by
+ // SnapTrimEvent; it's here for the sake of op tracking.
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
+ static constexpr auto type_name = "SnapTrimObjSubEvent::wait_repop";
+ } wait_repop;
+
+ Ref<PG> pg;
+ PipelineHandle handle;
+ osd_op_params_t osd_op_p;
+ const hobject_t coid;
+ const snapid_t snap_to_trim;
+
+public:
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ CommonPGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ CommonPGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::Process::BlockingEvent,
+ WaitRepop::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+};
+
+} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::SnapTrimEvent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::SnapTrimObjSubEvent> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc
new file mode 100644
index 000000000..171ec436d
--- /dev/null
+++ b/src/crimson/osd/osdmap_gate.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/shard_services.h"
+#include "common/Formatter.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+template <OSDMapGateType OSDMapGateTypeV>
+void OSDMapGate<OSDMapGateTypeV>::OSDMapBlocker::dump_detail(Formatter *f) const
+{
+ f->open_object_section("OSDMapGate");
+ f->dump_int("epoch", epoch);
+ f->close_section();
+}
+
+template <OSDMapGateType OSDMapGateTypeV>
+seastar::future<epoch_t> OSDMapGate<OSDMapGateTypeV>::wait_for_map(
+ typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger,
+ epoch_t epoch,
+ ShardServices *shard_services)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<epoch_t>(
+ crimson::common::system_shutdown_exception());
+ }
+ if (current >= epoch) {
+ return seastar::make_ready_future<epoch_t>(current);
+ } else {
+ logger().info("evt epoch is {}, i have {}, will wait", epoch, current);
+ auto &blocker = waiting_peering.emplace(
+ epoch, std::make_pair(blocker_type, epoch)).first->second;
+ auto fut = blocker.promise.get_shared_future();
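+ // if we have shard_services, also subscribe for newer osdmaps so that
+ // got_map() will eventually wake this blocker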
+ if (shard_services) {
+ return trigger.maybe_record_blocking(
+ shard_services->osdmap_subscribe(current, true).then(
+ [fut=std::move(fut)]() mutable {
+ return std::move(fut);
+ }),
+ blocker);
+ } else {
+ return trigger.maybe_record_blocking(std::move(fut), blocker);
+ }
+ }
+}
+
+template <OSDMapGateType OSDMapGateTypeV>
+void OSDMapGate<OSDMapGateTypeV>::got_map(epoch_t epoch) {
+ if (epoch == 0) {
+ return;
+ }
+ ceph_assert(epoch > current);
+ current = epoch;
+ auto first = waiting_peering.begin();
+ auto last = waiting_peering.upper_bound(epoch);
+ std::for_each(first, last, [epoch](auto& blocked_requests) {
+ blocked_requests.second.promise.set_value(epoch);
+ });
+ waiting_peering.erase(first, last);
+}
+
+template <OSDMapGateType OSDMapGateTypeV>
+seastar::future<> OSDMapGate<OSDMapGateTypeV>::stop() {
+ logger().info("osdmap::stop");
+ stopping = true;
+ auto first = waiting_peering.begin();
+ auto last = waiting_peering.end();
+ std::for_each(first, last, [](auto& blocked_requests) {
+ blocked_requests.second.promise.set_exception(
+ crimson::common::system_shutdown_exception());
+ });
+ return seastar::now();
+}
+
+template class OSDMapGate<OSDMapGateType::PG>;
+template class OSDMapGate<OSDMapGateType::OSD>;
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h
new file mode 100644
index 000000000..d76c4b82f
--- /dev/null
+++ b/src/crimson/osd/osdmap_gate.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <optional>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/types.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+enum class OSDMapGateType {
+ OSD,
+ PG,
+};
+
+template <OSDMapGateType OSDMapGateTypeV>
+class OSDMapGate {
+public:
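+  // Blocker recording an operation that waits for a specific osdmap epoch;
+  // its shared promise is fulfilled by got_map() once a map with an equal
+  // or newer epoch has been observed.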
+ struct OSDMapBlocker : BlockerT<OSDMapBlocker> {
+ const char * type_name;
+ epoch_t epoch;
+
+ OSDMapBlocker(std::pair<const char *, epoch_t> args)
+ : type_name(args.first), epoch(args.second) {}
+
+ OSDMapBlocker(const OSDMapBlocker &) = delete;
+ OSDMapBlocker(OSDMapBlocker &&) = delete;
+ OSDMapBlocker &operator=(const OSDMapBlocker &) = delete;
+ OSDMapBlocker &operator=(OSDMapBlocker &&) = delete;
+
+ seastar::shared_promise<epoch_t> promise;
+
+ void dump_detail(Formatter *f) const final;
+ };
+ using Blocker = OSDMapBlocker;
+
+private:
+ // order the promises in ascending order of the waited osdmap epoch,
+ // so we can access all the waiters expecting a map whose epoch is less
+ // than or equal to a given epoch
+ using waiting_peering_t = std::map<epoch_t,
+ OSDMapBlocker>;
+ const char *blocker_type;
+ waiting_peering_t waiting_peering;
+ epoch_t current = 0;
+ bool stopping = false;
+public:
+ OSDMapGate(const char *blocker_type)
+ : blocker_type(blocker_type) {}
+
+ /**
+ * wait_for_map
+ *
+ * Wait for an osdmap whose epoch is greater or equal to given epoch.
+ * If shard_services is non-null, request map if not present.
+ */
+ seastar::future<epoch_t>
+ wait_for_map(
+ typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger,
+ epoch_t epoch,
+ ShardServices *shard_services=nullptr
+ );
+ void got_map(epoch_t epoch);
+ seastar::future<> stop();
+};
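+
+// Usage sketch: operations call wait_for_map() with the epoch they require
+// (optionally passing ShardServices so a newer map is requested when it is
+// not yet available); map consumers such as PG::handle_advance_map() call
+// got_map() after committing a new epoch, waking every blocked operation.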
+
+using OSD_OSDMapGate = OSDMapGate<OSDMapGateType::OSD>;
+using PG_OSDMapGate = OSDMapGate<OSDMapGateType::PG>;
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h
new file mode 100644
index 000000000..017303536
--- /dev/null
+++ b/src/crimson/osd/osdmap_service.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/types.h"
+#include "osd/OSDMap.h"
+
+class OSDMap;
+
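+/// Minimal interface for obtaining cached OSDMaps: by epoch, or the
+/// currently active map.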
+class OSDMapService {
+public:
+ using cached_map_t = OSDMapRef;
+ using local_cached_map_t = LocalOSDMapRef;
+
+ virtual ~OSDMapService() = default;
+ virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0;
+ /// get the latest map
+ virtual cached_map_t get_map() const = 0;
+ virtual epoch_t get_up_epoch() const = 0;
+};
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
new file mode 100644
index 000000000..7cf3b158c
--- /dev/null
+++ b/src/crimson/osd/pg.cc
@@ -0,0 +1,1544 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "pg.h"
+
+#include <functional>
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm/max_element.hpp>
+#include <boost/range/numeric.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/hobject_fmt.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+
+#include "osd/OSDMap.h"
+#include "osd/osd_types_fmt.h"
+
+#include "os/Transaction.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/os/cyanstore/cyan_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/pg_meta.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/osd_operations/osdop_params.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/snaptrim_event.h"
+#include "crimson/osd/pg_recovery.h"
+#include "crimson/osd/replicated_recovery_backend.h"
+#include "crimson/osd/watch.h"
+
+using std::ostream;
+using std::set;
+using std::string;
+using std::vector;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace std::chrono {
+std::ostream& operator<<(std::ostream& out, const signedspan& d)
+{
+ auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count();
+ auto ns = std::abs((d % 1s).count());
+ fmt::print(out, "{}{}s", s, ns ? fmt::format(".{:0>9}", ns) : "");
+ return out;
+}
+}
+
+template <typename T>
+struct fmt::formatter<std::optional<T>> : fmt::formatter<T> {
+ template <typename FormatContext>
+ auto format(const std::optional<T>& v, FormatContext& ctx) const {
+ if (v.has_value()) {
+ return fmt::formatter<T>::format(*v, ctx);
+ }
+ return fmt::format_to(ctx.out(), "<null>");
+ }
+};
+
+namespace crimson::osd {
+
+using crimson::common::local_conf;
+
+class RecoverablePredicate : public IsPGRecoverablePredicate {
+public:
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return !have.empty();
+ }
+};
+
+class ReadablePredicate: public IsPGReadablePredicate {
+ pg_shard_t whoami;
+public:
+ explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {}
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return have.count(whoami);
+ }
+};
+
+PG::PG(
+ spg_t pgid,
+ pg_shard_t pg_shard,
+ crimson::os::CollectionRef coll_ref,
+ pg_pool_t&& pool,
+ std::string&& name,
+ cached_map_t osdmap,
+ ShardServices &shard_services,
+ ec_profile_t profile)
+ : pgid{pgid},
+ pg_whoami{pg_shard},
+ coll_ref{coll_ref},
+ pgmeta_oid{pgid.make_pgmeta_oid()},
+ osdmap_gate("PG::osdmap_gate"),
+ shard_services{shard_services},
+ backend(
+ PGBackend::create(
+ pgid.pgid,
+ pg_shard,
+ pool,
+ coll_ref,
+ shard_services,
+ profile,
+ *this)),
+ recovery_backend(
+ std::make_unique<ReplicatedRecoveryBackend>(
+ *this, shard_services, coll_ref, backend.get())),
+ recovery_handler(
+ std::make_unique<PGRecovery>(this)),
+ peering_state(
+ shard_services.get_cct(),
+ pg_shard,
+ pgid,
+ PGPool(
+ osdmap,
+ pgid.pool(),
+ pool,
+ name),
+ osdmap,
+ this,
+ this),
+ obc_registry{
+ local_conf()},
+ obc_loader{
+ obc_registry,
+ *backend.get(),
+ *this},
+ osdriver(
+ &shard_services.get_store(),
+ coll_ref,
+ pgid.make_pgmeta_oid()),
+ snap_mapper(
+ this->shard_services.get_cct(),
+ &osdriver,
+ pgid.ps(),
+ pgid.get_split_bits(pool.get_pg_num()),
+ pgid.pool(),
+ pgid.shard),
+ wait_for_active_blocker(this)
+{
+ peering_state.set_backend_predicates(
+ new ReadablePredicate(pg_whoami),
+ new RecoverablePredicate());
+ osdmap_gate.got_map(osdmap->get_epoch());
+}
+
+PG::~PG() {}
+
+void PG::check_blocklisted_watchers()
+{
+ logger().debug("{}", __func__);
+ obc_registry.for_each([this](ObjectContextRef obc) {
+ assert(obc);
+ for (const auto& [key, watch] : obc->watchers) {
+ assert(watch->get_pg() == this);
+ const auto& ea = watch->get_peer_addr();
+ logger().debug("watch: Found {} cookie {}. Checking entity_add_t {}",
+ watch->get_entity(), watch->get_cookie(), ea);
+ if (get_osdmap()->is_blocklisted(ea)) {
+ logger().info("watch: Found blocklisted watcher for {}", ea);
+ watch->do_watch_timeout();
+ }
+ }
+ });
+}
+
+bool PG::try_flush_or_schedule_async() {
+ logger().debug("PG::try_flush_or_schedule_async: flush ...");
+ (void)shard_services.get_store().flush(
+ coll_ref
+ ).then(
+ [this, epoch=get_osdmap_epoch()]() {
+ return shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ epoch,
+ epoch,
+ PeeringState::IntervalFlush());
+ });
+ return false;
+}
+
+void PG::publish_stats_to_osd()
+{
+ if (!is_primary())
+ return;
+ if (auto new_pg_stats = peering_state.prepare_stats_for_publish(
+ pg_stats,
+ object_stat_collection_t());
+ new_pg_stats.has_value()) {
+ pg_stats = std::move(new_pg_stats);
+ }
+}
+
+void PG::clear_publish_stats()
+{
+ pg_stats.reset();
+}
+
+pg_stat_t PG::get_stats() const
+{
+ return pg_stats.value_or(pg_stat_t{});
+}
+
+void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay)
+{
+ // handle the peering event in the background
+ logger().debug(
+ "{}: PG::queue_check_readable lpr: {}, delay: {}",
+ *this, last_peering_reset, delay);
+ check_readable_timer.cancel();
+ check_readable_timer.set_callback([last_peering_reset, this] {
+ logger().debug(
+ "{}: PG::queue_check_readable callback lpr: {}",
+ *this, last_peering_reset);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ last_peering_reset,
+ last_peering_reset,
+ PeeringState::CheckReadable{});
+ });
+ check_readable_timer.arm(
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(delay));
+}
+
+void PG::recheck_readable()
+{
+ bool changed = false;
+ const auto mnow = shard_services.get_mnow();
+ if (peering_state.state_test(PG_STATE_WAIT)) {
+ auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub();
+ if (mnow < prior_readable_until_ub) {
+ logger().info(
+ "{}: {} will wait (mnow {} < prior_readable_until_ub {})",
+ *this, __func__, mnow, prior_readable_until_ub);
+ queue_check_readable(
+ peering_state.get_last_peering_reset(),
+ prior_readable_until_ub - mnow);
+ } else {
+ logger().info(
+ "{}:{} no longer wait (mnow {} >= prior_readable_until_ub {})",
+ *this, __func__, mnow, prior_readable_until_ub);
+ peering_state.state_clear(PG_STATE_WAIT);
+ peering_state.clear_prior_readable_until_ub();
+ changed = true;
+ }
+ }
+ if (peering_state.state_test(PG_STATE_LAGGY)) {
+ auto readable_until = peering_state.get_readable_until();
+ if (readable_until == readable_until.zero()) {
+ logger().info(
+ "{}:{} still laggy (mnow {}, readable_until zero)",
+ *this, __func__, mnow);
+ } else if (mnow >= readable_until) {
+ logger().info(
+ "{}:{} still laggy (mnow {} >= readable_until {})",
+ *this, __func__, mnow, readable_until);
+ } else {
+ logger().info(
+ "{}:{} no longer laggy (mnow {} < readable_until {})",
+ *this, __func__, mnow, readable_until);
+ peering_state.state_clear(PG_STATE_LAGGY);
+ changed = true;
+ }
+ }
+ if (changed) {
+ publish_stats_to_osd();
+ if (!peering_state.state_test(PG_STATE_WAIT) &&
+ !peering_state.state_test(PG_STATE_LAGGY)) {
+ // TODO: requeue ops waiting for readable
+ }
+ }
+}
+
+unsigned PG::get_target_pg_log_entries() const
+{
+ const unsigned local_num_pgs = shard_services.get_num_local_pgs();
+ const unsigned local_target =
+ local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd") /
+ seastar::smp::count;
+ const unsigned min_pg_log_entries =
+ local_conf().get_val<uint64_t>("osd_min_pg_log_entries");
+ if (local_num_pgs > 0 && local_target > 0) {
+ // target an even spread of our budgeted log entries across all
+ // PGs. note that while we only get to control the entry count
+ // for primary PGs, we'll normally be responsible for a mix of
+ // primary and replica PGs (for the same pool(s) even), so this
+ // will work out.
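+    // For example (hypothetical numbers): with
+    // osd_target_pg_log_entries_per_osd = 300000, 10 reactor shards and 30
+    // local PGs, local_target is 30000 and each PG is budgeted 1000 entries,
+    // clamped to [min, max].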
+ const unsigned max_pg_log_entries =
+ local_conf().get_val<uint64_t>("osd_max_pg_log_entries");
+ return std::clamp(local_target / local_num_pgs,
+ min_pg_log_entries,
+ max_pg_log_entries);
+ } else {
+ // fall back to a per-pg value.
+ return min_pg_log_entries;
+ }
+}
+
+void PG::on_removal(ceph::os::Transaction &t) {
+ t.register_on_commit(
+ new LambdaContext(
+ [this](int r) {
+ ceph_assert(r == 0);
+ (void)shard_services.start_operation<LocalPeeringEvent>(
+ this, pg_whoami, pgid, float(0.001), get_osdmap_epoch(),
+ get_osdmap_epoch(), PeeringState::DeleteSome());
+ }));
+}
+
+void PG::on_activate(interval_set<snapid_t> snaps)
+{
+ logger().debug("{}: {} snaps={}", *this, __func__, snaps);
+ snap_trimq = std::move(snaps);
+ projected_last_update = peering_state.get_info().last_update;
+}
+
+void PG::on_activate_complete()
+{
+ wait_for_active_blocker.unblock();
+
+ if (peering_state.needs_recovery()) {
+ logger().info("{}: requesting recovery",
+ __func__);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery{});
+ } else if (peering_state.needs_backfill()) {
+ logger().info("{}: requesting backfill",
+ __func__);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ } else {
+ logger().debug("{}: no need to recover or backfill, AllReplicasRecovered",
+ " for pg: {}", __func__, pgid);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ }
+ publish_stats_to_osd();
+}
+
+void PG::prepare_write(pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ceph::os::Transaction &t)
+{
+ std::map<string,bufferlist> km;
+ std::string key_to_remove;
+ if (dirty_big_info || dirty_info) {
+ int ret = prepare_info_keymap(
+ shard_services.get_cct(),
+ &km,
+ &key_to_remove,
+ get_osdmap_epoch(),
+ info,
+ last_written_info,
+ past_intervals,
+ dirty_big_info,
+ need_write_epoch,
+ true,
+ nullptr,
+ this);
+ ceph_assert(ret == 0);
+ }
+ pglog.write_log_and_missing(
+ t, &km, coll_ref->get_cid(), pgmeta_oid,
+ peering_state.get_pgpool().info.require_rollback());
+ if (!km.empty()) {
+ t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km);
+ }
+ if (!key_to_remove.empty()) {
+ t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove);
+ }
+}
+
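+// Delete up to osd_target_transaction_size objects per pass.  While objects
+// remain, a commit callback requeues PeeringState::DeleteSome to continue;
+// once the collection is empty, the pgmeta object and the collection itself
+// are removed and the PG is unregistered from ShardServices.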
+std::pair<ghobject_t, bool>
+PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
+{
+ logger().info("removing pg {}", pgid);
+ auto fut = interruptor::make_interruptible(
+ shard_services.get_store().list_objects(
+ coll_ref,
+ _next,
+ ghobject_t::get_max(),
+ local_conf()->osd_target_transaction_size));
+
+ auto [objs_to_rm, next] = fut.get();
+ if (objs_to_rm.empty()) {
+ logger().info("all objs removed, removing coll for {}", pgid);
+ t.remove(coll_ref->get_cid(), pgmeta_oid);
+ t.remove_collection(coll_ref->get_cid());
+ (void) shard_services.get_store().do_transaction(
+ coll_ref, std::move(t)).then([this] {
+ return shard_services.remove_pg(pgid);
+ });
+ return {next, false};
+ } else {
+ for (auto &obj : objs_to_rm) {
+ if (obj == pgmeta_oid) {
+ continue;
+ }
+ logger().trace("pg {}, removing obj {}", pgid, obj);
+ t.remove(coll_ref->get_cid(), obj);
+ }
+ t.register_on_commit(
+ new LambdaContext([this](int r) {
+ ceph_assert(r == 0);
+ logger().trace("triggering more pg delete {}", pgid);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DeleteSome{});
+ }));
+ return {next, true};
+ }
+}
+
+Context *PG::on_clean()
+{
+ // Not needed yet (will be needed for IO unblocking)
+ return nullptr;
+}
+
+void PG::on_active_actmap()
+{
+ logger().debug("{}: {} snap_trimq={}", *this, __func__, snap_trimq);
+ peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR);
+ // loops until snap_trimq is empty or SNAPTRIM_ERROR.
+ std::ignore = seastar::do_until(
+ [this] { return snap_trimq.empty()
+ || peering_state.state_test(PG_STATE_SNAPTRIM_ERROR);
+ },
+ [this] {
+ peering_state.state_set(PG_STATE_SNAPTRIM);
+ publish_stats_to_osd();
+ const auto to_trim = snap_trimq.range_start();
+ snap_trimq.erase(to_trim);
+ const auto needs_pause = !snap_trimq.empty();
+ return seastar::repeat([to_trim, needs_pause, this] {
+ logger().debug("{}: going to start SnapTrimEvent, to_trim={}",
+ *this, to_trim);
+ return shard_services.start_operation<SnapTrimEvent>(
+ this,
+ snap_mapper,
+ to_trim,
+ needs_pause
+ ).second.handle_error(
+ crimson::ct_error::enoent::handle([this] {
+ logger().error("{}: ENOENT saw, trimming stopped", *this);
+ peering_state.state_set(PG_STATE_SNAPTRIM_ERROR);
+ publish_stats_to_osd();
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }), crimson::ct_error::eagain::handle([this] {
+ logger().info("{}: EAGAIN saw, trimming restarted", *this);
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ })
+ );
+ }).then([this, trimmed=to_trim] {
+ logger().debug("{}: trimmed snap={}", *this, trimmed);
+ });
+ }).finally([this] {
+ logger().debug("{}: PG::on_active_actmap() finished trimming",
+ *this);
+ peering_state.state_clear(PG_STATE_SNAPTRIM);
+ peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR);
+ publish_stats_to_osd();
+ });
+}
+
+void PG::on_active_advmap(const OSDMapRef &osdmap)
+{
+ const auto new_removed_snaps = osdmap->get_new_removed_snaps();
+ if (auto it = new_removed_snaps.find(get_pgid().pool());
+ it != new_removed_snaps.end()) {
+ bool bad = false;
+ for (auto j : it->second) {
+ if (snap_trimq.intersects(j.first, j.second)) {
+ decltype(snap_trimq) added, overlap;
+ added.insert(j.first, j.second);
+ overlap.intersection_of(snap_trimq, added);
+ logger().error("{}: {} removed_snaps already contains {}",
+ *this, __func__, overlap);
+ bad = true;
+ snap_trimq.union_of(added);
+ } else {
+ snap_trimq.insert(j.first, j.second);
+ }
+ }
+ logger().info("{}: {} new removed snaps {}, snap_trimq now{}",
+ *this, __func__, it->second, snap_trimq);
+ assert(!bad || !local_conf().get_val<bool>("osd_debug_verify_cached_snaps"));
+ }
+}
+
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
+{
+ // TODO: should update the stats upon finishing the scrub
+ peering_state.update_stats([scrub_level, this](auto& history, auto& stats) {
+ const utime_t now = ceph_clock_now();
+ history.last_scrub = peering_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ history.last_clean_scrub_stamp = now;
+ if (scrub_level == scrub_level_t::deep) {
+ history.last_deep_scrub = history.last_scrub;
+ history.last_deep_scrub_stamp = now;
+ }
+ // yes, please publish the stats
+ return true;
+ });
+}
+
+void PG::log_state_enter(const char *state) {
+ logger().info("Entering state: {}", state);
+}
+
+void PG::log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) {
+ logger().info(
+ "Exiting state: {}, entered at {}, {} spent on {} events",
+ state_name,
+ enter_time,
+ event_dur,
+ events);
+}
+
+ceph::signedspan PG::get_mnow() const
+{
+ return shard_services.get_mnow();
+}
+
+HeartbeatStampsRef PG::get_hb_stamps(int peer)
+{
+ return shard_services.get_hb_stamps(peer);
+}
+
+void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay)
+{
+ // handle the peering event in the background
+ renew_lease_timer.cancel();
+ renew_lease_timer.set_callback([last_peering_reset, this] {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ last_peering_reset,
+ last_peering_reset,
+ RenewLease{});
+ });
+ renew_lease_timer.arm(
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(delay));
+}
+
+
+void PG::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ ObjectStore::Transaction &t)
+{
+ peering_state.init(
+ role, newup, new_up_primary, newacting,
+ new_acting_primary, history, pi, t);
+}
+
+seastar::future<> PG::read_state(crimson::os::FuturizedStore::Shard* store)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ return seastar::do_with(PGMeta(*store, pgid), [] (auto& pg_meta) {
+ return pg_meta.load();
+ }).then([this, store](auto&& ret) {
+ auto [pg_info, past_intervals] = std::move(ret);
+ return peering_state.init_from_disk_state(
+ std::move(pg_info),
+ std::move(past_intervals),
+ [this, store] (PGLog &pglog) {
+ return pglog.read_log_and_missing_crimson(
+ *store,
+ coll_ref,
+ peering_state.get_info(),
+ pgmeta_oid);
+ });
+ }).then([this]() {
+ int primary, up_primary;
+ vector<int> acting, up;
+ peering_state.get_osdmap()->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &primary);
+ peering_state.init_primary_up_acting(
+ up,
+ acting,
+ up_primary,
+ primary);
+ int rr = OSDMap::calc_pg_role(pg_whoami, acting);
+ peering_state.set_role(rr);
+
+ epoch_t epoch = get_osdmap_epoch();
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ epoch,
+ epoch,
+ PeeringState::Initialize());
+
+ return seastar::now();
+ });
+}
+
+PG::interruptible_future<> PG::do_peering_event(
+ PGPeeringEvent& evt, PeeringCtx &rctx)
+{
+ if (peering_state.pg_has_reset_since(evt.get_epoch_requested()) ||
+ peering_state.pg_has_reset_since(evt.get_epoch_sent())) {
+ logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc());
+ return interruptor::now();
+ } else {
+ logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid);
+ // all peering event handling needs to be run in a dedicated seastar::thread,
+ // so that event processing can involve I/O reqs freely, for example: PG::on_removal,
+ // PG::on_new_interval
+ return interruptor::async([this, &evt, &rctx] {
+ peering_state.handle_event(
+ evt.get_event(),
+ &rctx);
+ peering_state.write_if_dirty(rctx.transaction);
+ });
+ }
+}
+
+seastar::future<> PG::handle_advance_map(
+ cached_map_t next_map, PeeringCtx &rctx)
+{
+ return seastar::async([this, next_map=std::move(next_map), &rctx] {
+ vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ next_map->pg_to_up_acting_osds(
+ pgid.pgid,
+ &newup, &up_primary,
+ &newacting, &acting_primary);
+ peering_state.advance_map(
+ next_map,
+ peering_state.get_osdmap(),
+ newup,
+ up_primary,
+ newacting,
+ acting_primary,
+ rctx);
+ osdmap_gate.got_map(next_map->get_epoch());
+ });
+}
+
+seastar::future<> PG::handle_activate_map(PeeringCtx &rctx)
+{
+ return seastar::async([this, &rctx] {
+ peering_state.activate_map(rctx);
+ });
+}
+
+seastar::future<> PG::handle_initialize(PeeringCtx &rctx)
+{
+ return seastar::async([this, &rctx] {
+ peering_state.handle_event(PeeringState::Initialize{}, &rctx);
+ });
+}
+
+
+void PG::print(ostream& out) const
+{
+ out << peering_state << " ";
+}
+
+void PG::dump_primary(Formatter* f)
+{
+ peering_state.dump_peering_state(f);
+
+ f->open_array_section("recovery_state");
+ PeeringState::QueryState q(f);
+ peering_state.handle_event(q, 0);
+ f->close_section();
+
+ // TODO: snap_trimq
+ // TODO: scrubber state
+ // TODO: agent state
+}
+
+std::ostream& operator<<(std::ostream& os, const PG& pg)
+{
+ os << " pg_epoch " << pg.get_osdmap_epoch() << " ";
+ pg.print(os);
+ return os;
+}
+
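+// Returns two futures: the first resolves once the repop has been submitted
+// to the backend, the second once all replicas have acked, at which point
+// peer last_complete_ondisk values are recorded and the write is marked
+// complete.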
+std::tuple<PG::interruptible_future<>,
+ PG::interruptible_future<>>
+PG::submit_transaction(
+ ObjectContextRef&& obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ if (__builtin_expect(stopping, false)) {
+ return {seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception()),
+ seastar::now()};
+ }
+
+ epoch_t map_epoch = get_osdmap_epoch();
+ ceph_assert(!has_reset_since(osd_op_p.at_version.epoch));
+
+ peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version);
+ peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version,
+ txn, true, false);
+
+ auto [submitted, all_completed] = backend->mutate_object(
+ peering_state.get_acting_recovery_backfill(),
+ std::move(obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ peering_state.get_last_peering_reset(),
+ map_epoch,
+ std::move(log_entries));
+ return std::make_tuple(std::move(submitted), all_completed.then_interruptible(
+ [this, last_complete=peering_state.get_info().last_complete,
+ at_version=osd_op_p.at_version](auto acked) {
+ for (const auto& peer : acked) {
+ peering_state.update_peer_last_complete_ondisk(
+ peer.shard, peer.last_complete_ondisk);
+ }
+ peering_state.complete_write(at_version, last_complete);
+ return seastar::now();
+ }));
+}
+
+PG::interruptible_future<> PG::repair_object(
+ const hobject_t& oid,
+ eversion_t& v)
+{
+ // see also PrimaryLogPG::rep_repair_primary_object()
+ assert(is_primary());
+ logger().debug("{}: {} peers osd.{}", __func__, oid, get_acting_recovery_backfill());
+ // Add object to PG's missing set if it isn't there already
+ assert(!get_local_missing().is_missing(oid));
+ peering_state.force_object_missing(pg_whoami, oid, v);
+ auto [op, fut] = get_shard_services().start_operation<UrgentRecovery>(
+ oid, v, this, get_shard_services(), get_osdmap_epoch());
+ return std::move(fut);
+}
+
+template <class Ret, class SuccessFunc, class FailureFunc>
+PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<Ret>>
+PG::do_osd_ops_execute(
+ seastar::lw_shared_ptr<OpsExecuter> ox,
+ std::vector<OSDOp>& ops,
+ SuccessFunc&& success_func,
+ FailureFunc&& failure_func)
+{
+ assert(ox);
+ auto rollbacker = ox->create_rollbacker([this] (auto& obc) {
+ return obc_loader.reload_obc(obc).handle_error_interruptible(
+ load_obc_ertr::assert_all{"can't live with object state messed up"});
+ });
+ auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func));
+ return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) {
+ logger().debug(
+ "do_osd_ops_execute: object {} - handling op {}",
+ ox->get_target(),
+ ceph_osd_op_name(osd_op.op.op));
+ return ox->execute_op(osd_op);
+ }).safe_then_interruptible([this, ox, &ops] {
+ logger().debug(
+ "do_osd_ops_execute: object {} all operations successful",
+ ox->get_target());
+ // check for full
+ if ((ox->delta_stats.num_bytes > 0 ||
+ ox->delta_stats.num_objects > 0) &&
+ get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) {
+ const auto& m = ox->get_message();
+ if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now
+ m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
+ logger().info(" full, but proceeding due to FULL_FORCE or MDS");
+ } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ // they tried, they failed.
+ logger().info(" full, replying to FULL_TRY op");
+ if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA))
+ return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
+ seastar::now(),
+ OpsExecuter::osd_op_ierrorator::future<>(
+ crimson::ct_error::edquot::make()));
+ else
+ return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
+ seastar::now(),
+ OpsExecuter::osd_op_ierrorator::future<>(
+ crimson::ct_error::enospc::make()));
+ } else {
+ // drop request
+ logger().info(" full, dropping request (bad client)");
+ return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
+ seastar::now(),
+ OpsExecuter::osd_op_ierrorator::future<>(
+ crimson::ct_error::eagain::make()));
+ }
+ }
+ return std::move(*ox).flush_changes_n_do_ops_effects(
+ ops,
+ snap_mapper,
+ osdriver,
+ [this] (auto&& txn,
+ auto&& obc,
+ auto&& osd_op_p,
+ auto&& log_entries) {
+ logger().debug(
+ "do_osd_ops_execute: object {} submitting txn",
+ obc->get_oid());
+ return submit_transaction(
+ std::move(obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries));
+ });
+ }).safe_then_unpack_interruptible(
+ [success_func=std::move(success_func), rollbacker, this, failure_func_ptr]
+ (auto submitted_fut, auto all_completed_fut) mutable {
+ return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>(
+ std::move(submitted_fut),
+ all_completed_fut.safe_then_interruptible_tuple(
+ std::move(success_func),
+ crimson::ct_error::object_corrupted::handle(
+ [rollbacker, this] (const std::error_code& e) mutable {
+          // this is a path for EIO. it's special because we want to fix the object
+ // and try again. that is, the layer above `PG::do_osd_ops` is supposed to
+ // restart the execution.
+ return rollbacker.rollback_obc_if_modified(e).then_interruptible(
+ [obc=rollbacker.get_obc(), this] {
+ return repair_object(obc->obs.oi.soid,
+ obc->obs.oi.version).then_interruptible([] {
+ return do_osd_ops_iertr::future<Ret>{crimson::ct_error::eagain::make()};
+ });
+ });
+ }), OpsExecuter::osd_op_errorator::all_same_way(
+ [rollbacker, failure_func_ptr]
+ (const std::error_code& e) mutable {
+ return rollbacker.rollback_obc_if_modified(e).then_interruptible(
+ [e, failure_func_ptr] {
+ return (*failure_func_ptr)(e);
+ });
+ })
+ )
+ );
+ }, OpsExecuter::osd_op_errorator::all_same_way(
+ [rollbacker, failure_func_ptr]
+ (const std::error_code& e) mutable {
+ return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>(
+ seastar::now(),
+ e.value() == ENOENT ? (*failure_func_ptr)(e) :
+ rollbacker.rollback_obc_if_modified(e).then_interruptible(
+ [e, failure_func_ptr] {
+ return (*failure_func_ptr)(e);
+ }));
+ }));
+}
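+
+// Record a pg_log ERROR entry for a failed write and broadcast it to the
+// acting/backfill peers via MOSDPGUpdateLogMissing; their acks are tracked
+// in log_entry_update_waiting_on, keyed by rep_tid.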
+seastar::future<> PG::submit_error_log(
+ Ref<MOSDOp> m,
+ const OpInfo &op_info,
+ ObjectContextRef obc,
+ const std::error_code e,
+ ceph_tid_t rep_tid,
+ eversion_t &version)
+{
+ const osd_reqid_t &reqid = m->get_reqid();
+ mempool::osd_pglog::list<pg_log_entry_t> log_entries;
+ log_entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR,
+ obc->obs.oi.soid,
+ next_version(),
+ eversion_t(), 0,
+ reqid, utime_t(),
+ -e.value()));
+ if (op_info.allows_returnvec()) {
+ log_entries.back().set_op_returns(m->ops);
+ }
+ ceph_assert(is_primary());
+ if (!log_entries.empty()) {
+ ceph_assert(log_entries.rbegin()->version >= projected_last_update);
+ version = projected_last_update = log_entries.rbegin()->version;
+ }
+ ceph::os::Transaction t;
+ peering_state.merge_new_log_entries(
+ log_entries, t, peering_state.get_pg_trim_to(),
+ peering_state.get_min_last_complete_ondisk());
+
+ set<pg_shard_t> waiting_on;
+ for (auto &i : get_acting_recovery_backfill()) {
+ pg_shard_t peer(i);
+ if (peer == pg_whoami) continue;
+ ceph_assert(peering_state.get_peer_missing().count(peer));
+ ceph_assert(peering_state.has_peer_info(peer));
+ auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>(
+ log_entries,
+ spg_t(peering_state.get_info().pgid.pgid, i.shard),
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_last_peering_reset(),
+ rep_tid,
+ peering_state.get_pg_trim_to(),
+ peering_state.get_min_last_complete_ondisk());
+ send_cluster_message(peer.osd, std::move(log_m), get_osdmap_epoch());
+ waiting_on.insert(peer);
+ }
+ waiting_on.insert(pg_whoami);
+ log_entry_update_waiting_on.insert(
+ std::make_pair(rep_tid, log_update_t{std::move(waiting_on)}));
+ return shard_services.get_store().do_transaction(
+ get_collection_ref(), std::move(t))
+ .then([this] {
+ peering_state.update_trim_to();
+ return seastar::now();
+ });
+}
+
+PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<MURef<MOSDOpReply>>>
+PG::do_osd_ops(
+ Ref<MOSDOp> m,
+ crimson::net::ConnectionRef conn,
+ ObjectContextRef obc,
+ const OpInfo &op_info,
+ const SnapContext& snapc)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ return do_osd_ops_execute<MURef<MOSDOpReply>>(
+ seastar::make_lw_shared<OpsExecuter>(
+ Ref<PG>{this}, obc, op_info, *m, conn, snapc),
+ m->ops,
+ [this, m, obc, may_write = op_info.may_write(),
+ may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] {
+ // TODO: should stop at the first op which returns a negative retval,
+ // cmpext uses it for returning the index of first unmatched byte
+ int result = m->ops.empty() ? 0 : m->ops.back().rval.code;
+ if (may_read && result >= 0) {
+ for (auto &osdop : m->ops) {
+ if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = osdop.rval.code;
+ break;
+ }
+ }
+ } else if (result > 0 && may_write && !rvec) {
+ result = 0;
+ } else if (result < 0 && (m->ops.empty() ?
+ 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = 0;
+ }
+ auto reply = crimson::make_message<MOSDOpReply>(m.get(),
+ result,
+ get_osdmap_epoch(),
+ 0,
+ false);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ logger().debug(
+ "do_osd_ops: {} - object {} sending reply",
+ *m,
+ m->get_hobj());
+ if (obc->obs.exists) {
+ reply->set_reply_versions(peering_state.get_info().last_update,
+ obc->obs.oi.user_version);
+ } else {
+ reply->set_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ }
+ return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>(
+ std::move(reply));
+ },
+ [m, &op_info, obc, this] (const std::error_code& e) {
+ return seastar::do_with(eversion_t(), [m, &op_info, obc, e, this](auto &version) {
+ auto fut = seastar::now();
+ epoch_t epoch = get_osdmap_epoch();
+ ceph_tid_t rep_tid = shard_services.get_tid();
+ auto last_complete = peering_state.get_info().last_complete;
+ if (op_info.may_write()) {
+ fut = submit_error_log(m, op_info, obc, e, rep_tid, version);
+ }
+ return fut.then([m, e, epoch, &op_info, rep_tid, &version, last_complete, this] {
+ auto log_reply = [m, e, this] {
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), -e.value(), get_osdmap_epoch(), 0, false);
+ if (m->ops.empty() ? 0 :
+ m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) {
+ reply->set_result(0);
+ }
+ // For all ops except for CMPEXT, the correct error value is encoded
+ // in e.value(). For CMPEXT, osdop.rval has the actual error value.
+ if (e.value() == ct_error::cmp_fail_error_value) {
+ assert(!m->ops.empty());
+ for (auto &osdop : m->ops) {
+ if (osdop.rval < 0) {
+ reply->set_result(osdop.rval);
+ break;
+ }
+ }
+ }
+ reply->set_enoent_reply_versions(
+ peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>(
+ std::move(reply));
+ };
+
+ if (!peering_state.pg_has_reset_since(epoch) && op_info.may_write()) {
+ auto it = log_entry_update_waiting_on.find(rep_tid);
+ ceph_assert(it != log_entry_update_waiting_on.end());
+ auto it2 = it->second.waiting_on.find(pg_whoami);
+ ceph_assert(it2 != it->second.waiting_on.end());
+ it->second.waiting_on.erase(it2);
+
+ if (it->second.waiting_on.empty()) {
+ log_entry_update_waiting_on.erase(it);
+ if (version != eversion_t()) {
+ peering_state.complete_write(version, last_complete);
+ }
+ return log_reply();
+ } else {
+ return it->second.all_committed.get_shared_future()
+ .then([this, &version, last_complete, log_reply = std::move(log_reply)] {
+ if (version != eversion_t()) {
+ peering_state.complete_write(version, last_complete);
+ }
+ return log_reply();
+ });
+ }
+ } else {
+ return log_reply();
+ }
+ });
+ });
+ });
+}
+
+PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<>>
+PG::do_osd_ops(
+ ObjectContextRef obc,
+ std::vector<OSDOp>& ops,
+ const OpInfo &op_info,
+ const do_osd_ops_params_t &&msg_params,
+ do_osd_ops_success_func_t success_func,
+ do_osd_ops_failure_func_t failure_func)
+{
+  // This overload is generally used for internal client requests;
+  // it uses an empty SnapContext.
+ return seastar::do_with(
+ std::move(msg_params),
+ [=, this, &ops, &op_info](auto &msg_params) {
+ return do_osd_ops_execute<void>(
+ seastar::make_lw_shared<OpsExecuter>(
+ Ref<PG>{this},
+ std::move(obc),
+ op_info,
+ msg_params,
+ msg_params.get_connection(),
+ SnapContext{}
+ ),
+ ops,
+ std::move(success_func),
+ std::move(failure_func));
+ });
+}
+
+PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this),
+ std::as_const(*m));
+ return interruptor::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) {
+ logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op));
+ return ox->execute_op(osd_op);
+ }).then_interruptible([m, this, ox = std::move(ox)] {
+ auto reply = crimson::make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ false);
+ reply->claim_op_out_data(m->ops);
+ reply->set_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply));
+ }).handle_exception_type_interruptible([=, this](const crimson::osd::error& e) {
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), -e.code().value(), get_osdmap_epoch(), 0, false);
+ reply->set_enoent_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply));
+ });
+}
+
+hobject_t PG::get_oid(const hobject_t& hobj)
+{
+ return hobj.snap == CEPH_SNAPDIR ? hobj.get_head() : hobj;
+}
+
+RWState::State PG::get_lock_type(const OpInfo &op_info)
+{
+ if (op_info.rwordered() && op_info.may_read()) {
+ return RWState::RWEXCL;
+ } else if (op_info.rwordered()) {
+ return RWState::RWWRITE;
+ } else {
+ ceph_assert(op_info.may_read());
+ return RWState::RWREAD;
+ }
+}
+
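+// If the obc has no in-memory watchers yet, instantiate them (initially
+// disconnected) from the watchers persisted in the object_info.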
+void PG::check_blocklisted_obc_watchers(
+ ObjectContextRef &obc)
+{
+ if (obc->watchers.empty()) {
+ for (auto &[src, winfo] : obc->obs.oi.watchers) {
+ auto watch = crimson::osd::Watch::create(
+ obc, winfo, src.second, this);
+ watch->disconnect();
+ auto [it, emplaced] = obc->watchers.emplace(src, std::move(watch));
+ assert(emplaced);
+ logger().debug("added watch for obj {}, client {}",
+ obc->get_oid(), src.second);
+ }
+ }
+}
+
+PG::load_obc_iertr::future<>
+PG::with_locked_obc(const hobject_t &hobj,
+ const OpInfo &op_info,
+ with_obc_func_t &&f)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ const hobject_t oid = get_oid(hobj);
+ auto wrapper = [f=std::move(f), this](auto obc) {
+ check_blocklisted_obc_watchers(obc);
+ return f(obc);
+ };
+ switch (get_lock_type(op_info)) {
+ case RWState::RWREAD:
+ return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper));
+ case RWState::RWWRITE:
+ return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper));
+ case RWState::RWEXCL:
+ return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper));
+ default:
+ ceph_abort();
+  }
+}
+
+PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ logger().debug("{}: {}", __func__, *req);
+ if (can_discard_replica_op(*req)) {
+ return seastar::now();
+ }
+
+ ceph::os::Transaction txn;
+ auto encoded_txn = req->get_data().cbegin();
+ decode(txn, encoded_txn);
+ auto p = req->logbl.cbegin();
+ std::vector<pg_log_entry_t> log_entries;
+ decode(log_entries, p);
+ log_operation(std::move(log_entries),
+ req->pg_trim_to,
+ req->version,
+ req->min_last_complete_ondisk,
+ !txn.empty(),
+ txn,
+ false);
+ logger().debug("PG::handle_rep_op: do_transaction...");
+ return interruptor::make_interruptible(shard_services.get_store().do_transaction(
+ coll_ref, std::move(txn))).then_interruptible(
+ [req, lcod=peering_state.get_info().last_complete, this] {
+ peering_state.update_last_complete_ondisk(lcod);
+ const auto map_epoch = get_osdmap_epoch();
+ auto reply = crimson::make_message<MOSDRepOpReply>(
+ req.get(), pg_whoami, 0,
+ map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
+ reply->set_last_complete_ondisk(lcod);
+ return shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch);
+ });
+}
+
+void PG::log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &txn,
+ bool async) {
+ logger().debug("{}", __func__);
+ if (is_primary()) {
+ ceph_assert(trim_to <= peering_state.get_last_update_ondisk());
+ }
+ /* TODO: when we add snap mapper and projected log support,
+ * we'll likely want to update them here.
+ *
+ * See src/osd/PrimaryLogPG.h:log_operation for how classic
+ * handles these cases.
+ */
+#if 0
+ if (transaction_applied) {
+ //TODO:
+ //update_snap_map(logv, t);
+ }
+ auto last = logv.rbegin();
+ if (is_primary() && last != logv.rend()) {
+ projected_log.skip_can_rollback_to_to_head();
+ projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
+ }
+#endif
+ if (!is_primary()) { // && !is_ec_pg()
+ replica_clear_repop_obc(logv);
+ }
+ peering_state.append_log(std::move(logv),
+ trim_to,
+ roll_forward_to,
+ min_last_complete_ondisk,
+ txn,
+ !txn.empty(),
+ false);
+}
+
+void PG::replica_clear_repop_obc(
+ const std::vector<pg_log_entry_t> &logv) {
+ logger().debug("{} clearing {} entries", __func__, logv.size());
+ for (auto &&e: logv) {
+ logger().debug(" {} get_object_boundary(from): {} "
+ " head version(to): {}",
+ e.soid,
+ e.soid.get_object_boundary(),
+ e.soid.get_head());
+ /* Have to blast all clones, they share a snapset */
+ obc_registry.clear_range(
+ e.soid.get_object_boundary(), e.soid.get_head());
+ }
+}
+
+void PG::handle_rep_op_reply(const MOSDRepOpReply& m)
+{
+ if (!can_discard_replica_op(m)) {
+ backend->got_rep_op_reply(m);
+ }
+}
+
+PG::interruptible_future<> PG::do_update_log_missing(
+ Ref<MOSDPGUpdateLogMissing> m,
+ crimson::net::ConnectionRef conn)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
+ ObjectStore::Transaction t;
+ std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ if (m->pg_trim_to != eversion_t())
+ op_trim_to = m->pg_trim_to;
+ if (m->pg_roll_forward_to != eversion_t())
+ op_roll_forward_to = m->pg_roll_forward_to;
+ logger().debug("op_trim_to = {}, op_roll_forward_to = {}",
+ op_trim_to, op_roll_forward_to);
+
+ peering_state.append_log_entries_update_missing(
+ m->entries, t, op_trim_to, op_roll_forward_to);
+
+ return interruptor::make_interruptible(shard_services.get_store().do_transaction(
+ coll_ref, std::move(t))).then_interruptible(
+ [m, conn, lcod=peering_state.get_info().last_complete, this] {
+ if (!peering_state.pg_has_reset_since(m->get_epoch())) {
+ peering_state.update_last_complete_ondisk(lcod);
+ auto reply =
+ crimson::make_message<MOSDPGUpdateLogMissingReply>(
+ spg_t(peering_state.get_info().pgid.pgid, get_primary().shard),
+ pg_whoami.shard,
+ m->get_epoch(),
+ m->min_epoch,
+ m->get_tid(),
+ lcod);
+ reply->set_priority(CEPH_MSG_PRIO_HIGH);
+ return conn->send(std::move(reply));
+ }
+ return seastar::now();
+ });
+}
+
+
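+// Handle an ack for a previously broadcast error-log update: once every
+// shard in waiting_on has replied, fulfil all_committed so the client op
+// blocked in do_osd_ops() can send its reply.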
+PG::interruptible_future<> PG::do_update_log_missing_reply(
+ Ref<MOSDPGUpdateLogMissingReply> m)
+{
+ logger().debug("{}: got reply from {}", __func__, m->get_from());
+
+ auto it = log_entry_update_waiting_on.find(m->get_tid());
+ if (it != log_entry_update_waiting_on.end()) {
+ if (it->second.waiting_on.count(m->get_from())) {
+ it->second.waiting_on.erase(m->get_from());
+ if (m->last_complete_ondisk != eversion_t()) {
+ peering_state.update_peer_last_complete_ondisk(
+ m->get_from(), m->last_complete_ondisk);
+ }
+ } else {
+ logger().error("{} : {} got reply {} from shard we are not waiting for ",
+ __func__, peering_state.get_info().pgid, *m, m->get_from());
+ }
+
+ if (it->second.waiting_on.empty()) {
+ it->second.all_committed.set_value();
+ it->second.all_committed = {};
+ log_entry_update_waiting_on.erase(it);
+ }
+ } else {
+ logger().error("{} : {} got reply {} on unknown tid {}",
+ __func__, peering_state.get_info().pgid, *m, m->get_tid());
+ }
+ return seastar::now();
+}
+
+bool PG::old_peering_msg(
+ const epoch_t reply_epoch,
+ const epoch_t query_epoch) const
+{
+ if (const epoch_t lpr = peering_state.get_last_peering_reset();
+ lpr > reply_epoch || lpr > query_epoch) {
+ logger().debug("{}: pg changed {} lpr {}, reply_epoch {}, query_epoch {}",
+ __func__, get_info().history, lpr, reply_epoch, query_epoch);
+ return true;
+ }
+ return false;
+}
+
+bool PG::can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const
+{
+ // if a repop is replied after a replica goes down in a new osdmap, and
+ // before the pg advances to this new osdmap, the repop replies before this
+ // repop can be discarded by that replica OSD, because the primary resets the
+ // connection to it when handling the new osdmap marking it down, and also
+  // resets the messenger session when the replica reconnects. to avoid the
+ // out-of-order replies, the messages from that replica should be discarded.
+ const auto osdmap = peering_state.get_osdmap();
+ const int from_osd = m.get_source().num();
+ if (osdmap->is_down(from_osd)) {
+ return true;
+ }
+ // Mostly, this overlaps with the old_peering_msg
+ // condition. An important exception is pushes
+ // sent by replicas not in the acting set, since
+ // if such a replica goes down it does not cause
+ // a new interval.
+ if (osdmap->get_down_at(from_osd) >= m_map_epoch) {
+ return true;
+ }
+ // same pg?
+ // if pg changes *at all*, we reset and repeer!
+ return old_peering_msg(m_map_epoch, m_map_epoch);
+}
+
+seastar::future<> PG::stop()
+{
+ logger().info("PG {} {}", pgid, __func__);
+ stopping = true;
+ cancel_local_background_io_reservation();
+ cancel_remote_recovery_reservation();
+ check_readable_timer.cancel();
+ renew_lease_timer.cancel();
+ return osdmap_gate.stop().then([this] {
+ return wait_for_active_blocker.stop();
+ }).then([this] {
+ return recovery_handler->stop();
+ }).then([this] {
+ return recovery_backend->stop();
+ }).then([this] {
+ return backend->stop();
+ });
+}
+
+void PG::on_change(ceph::os::Transaction &t) {
+ logger().debug("{} {}:", *this, __func__);
+ context_registry_on_change();
+ obc_loader.notify_on_change(is_primary());
+ recovery_backend->on_peering_interval_change(t);
+ backend->on_actingset_changed(is_primary());
+ wait_for_active_blocker.unblock();
+ if (is_primary()) {
+ logger().debug("{} {}: requeueing", *this, __func__);
+ client_request_orderer.requeue(shard_services, this);
+ } else {
+ logger().debug("{} {}: dropping requests", *this, __func__);
+ client_request_orderer.clear_and_cancel();
+ }
+}
+
+void PG::context_registry_on_change() {
+ obc_registry.for_each([](ObjectContextRef obc) {
+ assert(obc);
+ for (auto j = obc->watchers.begin();
+ j != obc->watchers.end();
+ j = obc->watchers.erase(j)) {
+ j->second->discard_state();
+ }
+ });
+}
+
+bool PG::can_discard_op(const MOSDOp& m) const {
+ if (m.get_map_epoch() <
+ peering_state.get_info().history.same_primary_since) {
+ logger().debug("{} changed after {} dropping {} ",
+ __func__ , m.get_map_epoch(), m);
+ return true;
+ }
+
+ if ((m.get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS))
+ && !is_primary()
+ && (m.get_map_epoch() <
+ peering_state.get_info().history.same_interval_since))
+ {
+ // Note: the Objecter will resend on interval change without the primary
+ // changing if it actually sent to a replica. If the primary hasn't
+ // changed since the send epoch, we got it, and we're primary, it won't
+ // have resent even if the interval did change as it sent it to the primary
+ // (us).
+ return true;
+ }
+ return __builtin_expect(m.get_map_epoch()
+ < peering_state.get_info().history.same_primary_since, false);
+}
+
+bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const {
+ /* The conditions below may clear (on_local_recover, before we queue
+ * the transaction) before we actually requeue the degraded waiters
+ * in on_global_recover after the transaction completes.
+ */
+ if (peering_state.get_pg_log().get_missing().get_items().count(soid))
+ return true;
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (auto& peer : get_acting_recovery_backfill()) {
+ if (peer == get_primary()) continue;
+ auto peer_missing_entry = peering_state.get_peer_missing().find(peer);
+ // If an object is missing on an async_recovery_target, return false.
+ // This will not block the op and the object is async recovered later.
+ if (peer_missing_entry != peering_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ return true;
+ }
+ // Object is degraded if after last_backfill AND
+ // we are backfilling it
+ if (is_backfill_target(peer) &&
+ peering_state.get_peer_info(peer).last_backfill <= soid &&
+ recovery_handler->backfill_state &&
+ recovery_handler->backfill_state->get_last_backfill_started() >= soid &&
+ recovery_backend->is_recovering(soid)) {
+ return true;
+ }
+ }
+ return false;
+}
+
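+// Duplicate-op check: if the reqid is already recorded in the pg log, wait
+// for the backend to report the original write committed and return its
+// version and return code so the op can be answered without re-executing it.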
+PG::interruptible_future<std::optional<PG::complete_op_t>>
+PG::already_complete(const osd_reqid_t& reqid)
+{
+ eversion_t version;
+ version_t user_version;
+ int ret;
+ std::vector<pg_log_op_return_item_t> op_returns;
+
+ if (peering_state.get_pg_log().get_log().get_request(
+ reqid, &version, &user_version, &ret, &op_returns)) {
+ complete_op_t dupinfo{
+ user_version,
+ version,
+ ret};
+ return backend->request_committed(reqid, version).then([dupinfo] {
+ return seastar::make_ready_future<std::optional<complete_op_t>>(dupinfo);
+ });
+ } else {
+ return seastar::make_ready_future<std::optional<complete_op_t>>(std::nullopt);
+ }
+}
+
+}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
new file mode 100644
index 000000000..d96db2e20
--- /dev/null
+++ b/src/crimson/osd/pg.h
@@ -0,0 +1,833 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "common/dout.h"
+#include "include/interval_set.h"
+#include "crimson/net/Fwd.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDOpReply.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "osd/osd_types_fmt.h"
+#include "crimson/osd/object_context.h"
+#include "osd/PeeringState.h"
+#include "osd/SnapMapper.h"
+
+#include "crimson/common/interruptible_future.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/logmissing_request.h"
+#include "crimson/osd/osd_operations/logmissing_request_reply.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "crimson/osd/pg_recovery.h"
+#include "crimson/osd/pg_recovery_listener.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/osd/object_context_loader.h"
+
+class MQuery;
+class OSDMap;
+class PGBackend;
+class PGPeeringEvent;
+class osd_op_params_t;
+
+namespace recovery {
+ class Context;
+}
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+class OpsExecuter;
+class BackfillRecovery;
+class SnapTrimEvent;
+
+class PG : public boost::intrusive_ref_counter<
+ PG,
+ boost::thread_unsafe_counter>,
+ public PGRecoveryListener,
+ PeeringState::PeeringListener,
+ DoutPrefixProvider
+{
+ using ec_profile_t = std::map<std::string,std::string>;
+ using cached_map_t = OSDMapService::cached_map_t;
+
+ ClientRequest::PGPipeline request_pg_pipeline;
+ PGPeeringPipeline peering_request_pg_pipeline;
+
+ ClientRequest::Orderer client_request_orderer;
+
+ spg_t pgid;
+ pg_shard_t pg_whoami;
+ crimson::os::CollectionRef coll_ref;
+ ghobject_t pgmeta_oid;
+
+ seastar::timer<seastar::lowres_clock> check_readable_timer;
+ seastar::timer<seastar::lowres_clock> renew_lease_timer;
+
+public:
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, T>;
+
+ PG(spg_t pgid,
+ pg_shard_t pg_shard,
+ crimson::os::CollectionRef coll_ref,
+ pg_pool_t&& pool,
+ std::string&& name,
+ cached_map_t osdmap,
+ ShardServices &shard_services,
+ ec_profile_t profile);
+
+ ~PG();
+
+ const pg_shard_t& get_pg_whoami() const final {
+ return pg_whoami;
+ }
+
+ const spg_t& get_pgid() const final {
+ return pgid;
+ }
+
+ PGBackend& get_backend() {
+ return *backend;
+ }
+ const PGBackend& get_backend() const {
+ return *backend;
+ }
+ // EpochSource
+ epoch_t get_osdmap_epoch() const final {
+ return peering_state.get_osdmap_epoch();
+ }
+
+ eversion_t get_pg_trim_to() const {
+ return peering_state.get_pg_trim_to();
+ }
+
+ eversion_t get_min_last_complete_ondisk() const {
+ return peering_state.get_min_last_complete_ondisk();
+ }
+
+ const pg_info_t& get_info() const final {
+ return peering_state.get_info();
+ }
+
+ // DoutPrefixProvider
+ std::ostream& gen_prefix(std::ostream& out) const final {
+ return out << *this;
+ }
+ crimson::common::CephContext *get_cct() const final {
+ return shard_services.get_cct();
+ }
+ unsigned get_subsys() const final {
+ return ceph_subsys_osd;
+ }
+
+ crimson::os::CollectionRef get_collection_ref() {
+ return coll_ref;
+ }
+
+ // PeeringListener
+ void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ceph::os::Transaction &t) final;
+
+ void on_info_history_change() final {
+ // Not needed yet -- mainly for scrub scheduling
+ }
+
+ /// Notify PG that Primary/Replica status has changed (to update scrub registration)
+ void on_primary_status_change(bool was_primary, bool now_primary) final {
+ }
+
+ /// Need to reschedule next scrub. Assuming no change in role
+ void reschedule_scrub() final {
+ }
+
+ void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final;
+
+ uint64_t get_snap_trimq_size() const final {
+ return std::size(snap_trimq);
+ }
+
+ void send_cluster_message(
+ int osd, MessageURef m,
+ epoch_t epoch, bool share_map_update=false) final {
+ (void)shard_services.send_to_osd(osd, std::move(m), epoch);
+ }
+
+ void send_pg_created(pg_t pgid) final {
+ (void)shard_services.send_pg_created(pgid);
+ }
+
+ bool try_flush_or_schedule_async() final;
+
+ void start_flush_on_transaction(
+ ceph::os::Transaction &t) final {
+ t.register_on_commit(
+ new LambdaContext([this](int r){
+ peering_state.complete_flush();
+ }));
+ }
+
+ void on_flushed() final {
+ // will be needed for unblocking IO operations/peering
+ }
+
+ template <typename T>
+ void start_peering_event_operation(T &&evt, float delay = 0) {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ delay,
+ std::forward<T>(evt));
+ }
+
+ void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) final {
+ start_peering_event_operation(std::move(*event), delay);
+ }
+ std::vector<pg_shard_t> get_replica_recovery_order() const final {
+ return peering_state.get_replica_recovery_order();
+ }
+ void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.local_request_reservation(
+ pgid,
+ on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) {
+ start_peering_event_operation(std::move(*on_grant));
+ }) : nullptr,
+ priority,
+ on_preempt ? make_lambda_context(
+ [this, on_preempt=std::move(on_preempt)] (int) {
+ start_peering_event_operation(std::move(*on_preempt));
+ }) : nullptr);
+ }
+
+ void update_local_background_io_priority(
+ unsigned priority) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.local_update_priority(
+ pgid,
+ priority);
+ }
+
+ void cancel_local_background_io_reservation() final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.local_cancel_reservation(
+ pgid);
+ }
+
+ void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.remote_request_reservation(
+ pgid,
+ on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) {
+ start_peering_event_operation(std::move(*on_grant));
+ }) : nullptr,
+ priority,
+ on_preempt ? make_lambda_context(
+ [this, on_preempt=std::move(on_preempt)] (int) {
+ start_peering_event_operation(std::move(*on_preempt));
+ }) : nullptr);
+ }
+
+ void cancel_remote_recovery_reservation() final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.remote_cancel_reservation(
+ pgid);
+ }
+
+ void schedule_event_on_commit(
+ ceph::os::Transaction &t,
+ PGPeeringEventRef on_commit) final {
+ t.register_on_commit(
+ make_lambda_context(
+ [this, on_commit=std::move(on_commit)](int) {
+ start_peering_event_operation(std::move(*on_commit));
+ }));
+ }
+
+ void update_heartbeat_peers(std::set<int> peers) final {
+ // Not needed yet
+ }
+ void set_probe_targets(const std::set<pg_shard_t> &probe_set) final {
+ // Not needed yet
+ }
+ void clear_probe_targets() final {
+ // Not needed yet
+ }
+ void queue_want_pg_temp(const std::vector<int> &wanted) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.queue_want_pg_temp(pgid.pgid, wanted);
+ }
+ void clear_want_pg_temp() final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.remove_want_pg_temp(pgid.pgid);
+ }
+ void check_recovery_sources(const OSDMapRef& newmap) final {
+ // Not needed yet
+ }
+ void check_blocklisted_watchers() final;
+ void clear_primary_state() final {
+ // Not needed yet
+ }
+
+ void queue_check_readable(epoch_t last_peering_reset,
+ ceph::timespan delay) final;
+ void recheck_readable() final;
+
+ unsigned get_target_pg_log_entries() const final;
+
+ void on_pool_change() final {
+ // Not needed yet
+ }
+ void on_role_change() final {
+ // Not needed yet
+ }
+ void on_change(ceph::os::Transaction &t) final;
+ void on_activate(interval_set<snapid_t> to_trim) final;
+ void on_activate_complete() final;
+ void on_new_interval() final {
+ // Not needed yet
+ }
+ Context *on_clean() final;
+ void on_activate_committed() final {
+ // Not needed yet (will be needed for IO unblocking)
+ }
+ void on_active_exit() final {
+ // Not needed yet
+ }
+
+ void on_removal(ceph::os::Transaction &t) final;
+
+ std::pair<ghobject_t, bool>
+ do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final;
+
+ // merge/split not ready
+ void clear_ready_to_merge() final {}
+ void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {}
+ void set_not_ready_to_merge_source(pg_t pgid) final {}
+ void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {}
+ void set_ready_to_merge_source(eversion_t lu) final {}
+
+ void on_active_actmap() final;
+ void on_active_advmap(const OSDMapRef &osdmap) final;
+
+ epoch_t cluster_osdmap_trim_lower_bound() final {
+ // TODO
+ return 0;
+ }
+
+ void on_backfill_reserved() final {
+ recovery_handler->on_backfill_reserved();
+ }
+ void on_backfill_canceled() final {
+ ceph_assert(0 == "Not implemented");
+ }
+
+ void on_recovery_reserved() final {
+ recovery_handler->start_pglogbased_recovery();
+ }
+
+
+ bool try_reserve_recovery_space(
+ int64_t primary_num_bytes, int64_t local_num_bytes) final {
+ // TODO
+ return true;
+ }
+ void unreserve_recovery_space() final {}
+
+ struct PGLogEntryHandler : public PGLog::LogEntryHandler {
+ PG *pg;
+ ceph::os::Transaction *t;
+ PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {}
+
+ // LogEntryHandler
+ void remove(const hobject_t &hoid) override {
+ // TODO
+ }
+ void try_stash(const hobject_t &hoid, version_t v) override {
+ // TODO
+ }
+ void rollback(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ void rollforward(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ void trim(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ };
+ PGLog::LogEntryHandlerRef get_log_handler(
+ ceph::os::Transaction &t) final {
+ return std::make_unique<PG::PGLogEntryHandler>(this, &t);
+ }
+
+ void rebuild_missing_set_with_deletes(PGLog &pglog) final {
+ pglog.rebuild_missing_set_with_deletes_crimson(
+ shard_services.get_store(),
+ coll_ref,
+ peering_state.get_info()).get();
+ }
+
+ PerfCounters &get_peering_perf() final {
+ return shard_services.get_recoverystate_perf_logger();
+ }
+ PerfCounters &get_perf_logger() final {
+ return shard_services.get_perf_logger();
+ }
+
+ void log_state_enter(const char *state) final;
+ void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) final;
+
+ void dump_recovery_info(Formatter *f) const final {
+ }
+
+ OstreamTemp get_clog_info() final {
+    // not needed yet: currently a stub; replace with a real implementation once wired up to monc
+ return OstreamTemp(CLOG_INFO, nullptr);
+ }
+ OstreamTemp get_clog_debug() final {
+    // not needed yet: currently a stub; replace with a real implementation once wired up to monc
+ return OstreamTemp(CLOG_DEBUG, nullptr);
+ }
+ OstreamTemp get_clog_error() final {
+    // not needed yet: currently a stub; replace with a real implementation once wired up to monc
+ return OstreamTemp(CLOG_ERROR, nullptr);
+ }
+
+ ceph::signedspan get_mnow() const final;
+ HeartbeatStampsRef get_hb_stamps(int peer) final;
+ void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final;
+
+
+ // Utility
+ bool is_primary() const final {
+ return peering_state.is_primary();
+ }
+ bool is_nonprimary() const {
+ return peering_state.is_nonprimary();
+ }
+ bool is_peered() const final {
+ return peering_state.is_peered();
+ }
+ bool is_recovering() const final {
+ return peering_state.is_recovering();
+ }
+ bool is_backfilling() const final {
+ return peering_state.is_backfilling();
+ }
+ uint64_t get_last_user_version() const {
+ return get_info().last_user_version;
+ }
+ bool get_need_up_thru() const {
+ return peering_state.get_need_up_thru();
+ }
+ epoch_t get_same_interval_since() const {
+ return get_info().history.same_interval_since;
+ }
+
+ const auto& get_pgpool() const {
+ return peering_state.get_pgpool();
+ }
+ pg_shard_t get_primary() const {
+ return peering_state.get_primary();
+ }
+
+ /// initialize created PG
+ void init(
+ int role,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pim,
+ ceph::os::Transaction &t);
+
+ seastar::future<> read_state(crimson::os::FuturizedStore::Shard* store);
+
+ interruptible_future<> do_peering_event(
+ PGPeeringEvent& evt, PeeringCtx &rctx);
+
+ seastar::future<> handle_advance_map(cached_map_t next_map, PeeringCtx &rctx);
+ seastar::future<> handle_activate_map(PeeringCtx &rctx);
+ seastar::future<> handle_initialize(PeeringCtx &rctx);
+
+ static hobject_t get_oid(const hobject_t& hobj);
+ static RWState::State get_lock_type(const OpInfo &op_info);
+
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::object_corrupted>;
+ using load_obc_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_obc_ertr>;
+ using interruptor = ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+
+public:
+ using with_obc_func_t =
+ std::function<load_obc_iertr::future<> (ObjectContextRef)>;
+
+ load_obc_iertr::future<> with_locked_obc(
+ const hobject_t &hobj,
+ const OpInfo &op_info,
+ with_obc_func_t&& f);
+
+ interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m);
+ void log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &txn,
+ bool async = false);
+ void replica_clear_repop_obc(
+ const std::vector<pg_log_entry_t> &logv);
+ void handle_rep_op_reply(const MOSDRepOpReply& m);
+ interruptible_future<> do_update_log_missing(
+ Ref<MOSDPGUpdateLogMissing> m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> do_update_log_missing_reply(
+ Ref<MOSDPGUpdateLogMissingReply> m);
+
+
+ void print(std::ostream& os) const;
+ void dump_primary(Formatter*);
+ seastar::future<> submit_error_log(
+ Ref<MOSDOp> m,
+ const OpInfo &op_info,
+ ObjectContextRef obc,
+ const std::error_code e,
+ ceph_tid_t rep_tid,
+ eversion_t &version);
+
+private:
+
+ struct SnapTrimMutex {
+ struct WaitPG : OrderedConcurrentPhaseT<WaitPG> {
+ static constexpr auto type_name = "SnapTrimEvent::wait_pg";
+ } wait_pg;
+ seastar::shared_mutex mutex;
+
+ interruptible_future<> lock(SnapTrimEvent &st_event) noexcept;
+
+ void unlock() noexcept {
+ mutex.unlock();
+ }
+ } snaptrim_mutex;
+
+ using do_osd_ops_ertr = crimson::errorator<
+ crimson::ct_error::eagain>;
+ using do_osd_ops_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ ::crimson::errorator<crimson::ct_error::eagain>>;
+ template <typename Ret = void>
+ using pg_rep_op_fut_t =
+ std::tuple<interruptible_future<>,
+ do_osd_ops_iertr::future<Ret>>;
+ do_osd_ops_iertr::future<pg_rep_op_fut_t<MURef<MOSDOpReply>>> do_osd_ops(
+ Ref<MOSDOp> m,
+ crimson::net::ConnectionRef conn,
+ ObjectContextRef obc,
+ const OpInfo &op_info,
+ const SnapContext& snapc);
+ using do_osd_ops_success_func_t =
+ std::function<do_osd_ops_iertr::future<>()>;
+ using do_osd_ops_failure_func_t =
+ std::function<do_osd_ops_iertr::future<>(const std::error_code&)>;
+ struct do_osd_ops_params_t;
+ do_osd_ops_iertr::future<pg_rep_op_fut_t<>> do_osd_ops(
+ ObjectContextRef obc,
+ std::vector<OSDOp>& ops,
+ const OpInfo &op_info,
+ const do_osd_ops_params_t &&params,
+ do_osd_ops_success_func_t success_func,
+ do_osd_ops_failure_func_t failure_func);
+ template <class Ret, class SuccessFunc, class FailureFunc>
+ do_osd_ops_iertr::future<pg_rep_op_fut_t<Ret>> do_osd_ops_execute(
+ seastar::lw_shared_ptr<OpsExecuter> ox,
+ std::vector<OSDOp>& ops,
+ SuccessFunc&& success_func,
+ FailureFunc&& failure_func);
+ interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m);
+ std::tuple<interruptible_future<>, interruptible_future<>>
+ submit_transaction(
+ ObjectContextRef&& obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& oop,
+ std::vector<pg_log_entry_t>&& log_entries);
+ interruptible_future<> repair_object(
+ const hobject_t& oid,
+ eversion_t& v);
+ void check_blocklisted_obc_watchers(ObjectContextRef &obc);
+
+private:
+ PG_OSDMapGate osdmap_gate;
+ ShardServices &shard_services;
+
+
+public:
+ cached_map_t get_osdmap() { return peering_state.get_osdmap(); }
+ eversion_t next_version() {
+ return eversion_t(get_osdmap_epoch(),
+ ++projected_last_update.version);
+ }
+ ShardServices& get_shard_services() final {
+ return shard_services;
+ }
+ seastar::future<> stop();
+private:
+ std::unique_ptr<PGBackend> backend;
+ std::unique_ptr<RecoveryBackend> recovery_backend;
+ std::unique_ptr<PGRecovery> recovery_handler;
+
+ PeeringState peering_state;
+ eversion_t projected_last_update;
+
+public:
+ ObjectContextRegistry obc_registry;
+ ObjectContextLoader obc_loader;
+
+private:
+ OSDriver osdriver;
+ SnapMapper snap_mapper;
+
+public:
+ // PeeringListener
+ void publish_stats_to_osd() final;
+ void clear_publish_stats() final;
+ pg_stat_t get_stats() const;
+private:
+ std::optional<pg_stat_t> pg_stats;
+
+public:
+ RecoveryBackend* get_recovery_backend() final {
+ return recovery_backend.get();
+ }
+ PGRecovery* get_recovery_handler() final {
+ return recovery_handler.get();
+ }
+ PeeringState& get_peering_state() final {
+ return peering_state;
+ }
+ bool has_reset_since(epoch_t epoch) const final {
+ return peering_state.pg_has_reset_since(epoch);
+ }
+
+ const pg_missing_tracker_t& get_local_missing() const {
+ return peering_state.get_pg_log().get_missing();
+ }
+ epoch_t get_last_peering_reset() const final {
+ return peering_state.get_last_peering_reset();
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return peering_state.get_acting_recovery_backfill();
+ }
+ bool is_backfill_target(pg_shard_t osd) const {
+ return peering_state.is_backfill_target(osd);
+ }
+ void begin_peer_recover(pg_shard_t peer, const hobject_t oid) {
+ peering_state.begin_peer_recover(peer, oid);
+ }
+ uint64_t min_peer_features() const {
+ return peering_state.get_min_peer_features();
+ }
+ const std::map<hobject_t, std::set<pg_shard_t>>&
+ get_missing_loc_shards() const {
+ return peering_state.get_missing_loc().get_missing_locs();
+ }
+ const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const {
+ return peering_state.get_peer_missing();
+ }
+ epoch_t get_interval_start_epoch() const {
+ return get_info().history.same_interval_since;
+ }
+ const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const {
+ if (shard == pg_whoami)
+ return &get_local_missing();
+ else {
+ auto it = peering_state.get_peer_missing().find(shard);
+ if (it == peering_state.get_peer_missing().end())
+ return nullptr;
+ else
+ return &it->second;
+ }
+ }
+
+ struct complete_op_t {
+ const version_t user_version;
+ const eversion_t version;
+ const int err;
+ };
+ interruptible_future<std::optional<complete_op_t>>
+ already_complete(const osd_reqid_t& reqid);
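+  // Priority for recovery ops: a positive pool-level RECOVERY_OP_PRIORITY
+  // option takes precedence; otherwise fall back to the
+  // osd_recovery_op_priority config value.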
+ int get_recovery_op_priority() const {
+ int64_t pri = 0;
+ get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+ return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority;
+ }
+ seastar::future<> mark_unfound_lost(int) {
+ // TODO: see PrimaryLogPG::mark_all_unfound_lost()
+ return seastar::now();
+ }
+
+ bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) const;
+
+ template <typename MsgType>
+ bool can_discard_replica_op(const MsgType& m) const {
+ return can_discard_replica_op(m, m.map_epoch);
+ }
+
+private:
+ // instead of seastar::gate, we use a boolean flag to indicate
+ // whether the system is shutting down, as we don't need to track
+ // continuations here.
+ bool stopping = false;
+
+ PGActivationBlocker wait_for_active_blocker;
+
+ friend std::ostream& operator<<(std::ostream&, const PG& pg);
+ friend class ClientRequest;
+ friend struct CommonClientRequest;
+ friend class PGAdvanceMap;
+ template <class T>
+ friend class PeeringEvent;
+ friend class RepRequest;
+ friend class LogMissingRequest;
+ friend class LogMissingRequestReply;
+ friend class BackfillRecovery;
+ friend struct PGFacade;
+ friend class InternalClientRequest;
+ friend class WatchTimeoutRequest;
+ friend class SnapTrimEvent;
+ friend class SnapTrimObjSubEvent;
+private:
+ seastar::future<bool> find_unfound() {
+ return seastar::make_ready_future<bool>(true);
+ }
+
+ bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const;
+ bool can_discard_op(const MOSDOp& m) const;
+ void context_registry_on_change();
+ bool is_missing_object(const hobject_t& soid) const {
+ return peering_state.get_pg_log().get_missing().get_items().count(soid);
+ }
+ bool is_unreadable_object(const hobject_t &oid,
+ eversion_t* v = 0) const final {
+ return is_missing_object(oid) ||
+ !peering_state.get_missing_loc().readable_with_acting(
+ oid, get_actingset(), v);
+ }
+ bool is_degraded_or_backfilling_object(const hobject_t& soid) const;
+ const std::set<pg_shard_t> &get_actingset() const {
+ return peering_state.get_actingset();
+ }
+
+private:
+ friend class IOInterruptCondition;
+ struct log_update_t {
+ std::set<pg_shard_t> waiting_on;
+ seastar::shared_promise<> all_committed;
+ };
+
+ std::map<ceph_tid_t, log_update_t> log_entry_update_waiting_on;
+ // snap trimming
+ interval_set<snapid_t> snap_trimq;
+};
+
+struct PG::do_osd_ops_params_t {
+ crimson::net::ConnectionRef &get_connection() const {
+ return conn;
+ }
+ osd_reqid_t get_reqid() const {
+ return reqid;
+ }
+ utime_t get_mtime() const {
+ return mtime;
+ };
+ epoch_t get_map_epoch() const {
+ return map_epoch;
+ }
+ entity_inst_t get_orig_source_inst() const {
+ return orig_source_inst;
+ }
+ uint64_t get_features() const {
+ return features;
+ }
+ // Only used by InternalClientRequest, no op flags
+ bool has_flag(uint32_t flag) const {
+ return false;
+ }
+
+ // Only used by ExecutableMessagePimpl
+ entity_name_t get_source() const {
+ return orig_source_inst.name;
+ }
+
+ crimson::net::ConnectionRef &conn;
+ osd_reqid_t reqid;
+ utime_t mtime;
+ epoch_t map_epoch;
+ entity_inst_t orig_source_inst;
+ uint64_t features;
+};
+
+std::ostream& operator<<(std::ostream&, const PG& pg);
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::PG> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/pg_activation_blocker.cc b/src/crimson/osd/pg_activation_blocker.cc
new file mode 100644
index 000000000..48ffe3f84
--- /dev/null
+++ b/src/crimson/osd/pg_activation_blocker.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_activation_blocker.h"
+
+namespace crimson::osd {
+
+void PGActivationBlocker::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+}
+
+void PGActivationBlocker::unblock()
+{
+ p.set_value();
+ p = {};
+}
+
+seastar::future<>
+PGActivationBlocker::wait(PGActivationBlocker::BlockingEvent::TriggerI&& trigger)
+{
+ if (pg->get_peering_state().is_active()) {
+ return seastar::now();
+ } else {
+ return trigger.maybe_record_blocking(p.get_shared_future(), *this);
+ }
+}
+
+seastar::future<> PGActivationBlocker::stop()
+{
+ p.set_exception(crimson::common::system_shutdown_exception());
+ return seastar::now();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_activation_blocker.h b/src/crimson/osd/pg_activation_blocker.h
new file mode 100644
index 000000000..fff8219d1
--- /dev/null
+++ b/src/crimson/osd/pg_activation_blocker.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "crimson/common/operation.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+class PG;
+
+class PGActivationBlocker : public crimson::BlockerT<PGActivationBlocker> {
+ PG *pg;
+
+ const spg_t pgid;
+ seastar::shared_promise<> p;
+
+protected:
+ void dump_detail(Formatter *f) const;
+
+public:
+ static constexpr const char *type_name = "PGActivationBlocker";
+ using Blocker = PGActivationBlocker;
+
+ PGActivationBlocker(PG *pg) : pg(pg) {}
+ void unblock();
+ seastar::future<> wait(PGActivationBlocker::BlockingEvent::TriggerI&&);
+ seastar::future<> stop();
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc
new file mode 100644
index 000000000..02acb9a55
--- /dev/null
+++ b/src/crimson/osd/pg_backend.cc
@@ -0,0 +1,1811 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_backend.h"
+
+#include <charconv>
+#include <optional>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/print.hh>
+
+#include "messages/MOSDOp.h"
+#include "os/Transaction.h"
+#include "common/Checksummer.h"
+#include "common/Clock.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/common/tmap_helpers.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/object_context_loader.h"
+#include "replicated_backend.h"
+#include "replicated_recovery_backend.h"
+#include "ec_backend.h"
+#include "exceptions.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::runtime_error;
+using std::string;
+using std::string_view;
+using crimson::common::local_conf;
+
+std::unique_ptr<PGBackend>
+PGBackend::create(pg_t pgid,
+ const pg_shard_t pg_shard,
+ const pg_pool_t& pool,
+ crimson::os::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ DoutPrefixProvider &dpp)
+{
+ switch (pool.type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ return std::make_unique<ReplicatedBackend>(pgid, pg_shard,
+ coll, shard_services,
+ dpp);
+ case pg_pool_t::TYPE_ERASURE:
+ return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services,
+ std::move(ec_profile),
+ pool.stripe_width,
+ dpp);
+ default:
+ throw runtime_error(seastar::format("unsupported pool type '{}'",
+ pool.type));
+ }
+}
+
+PGBackend::PGBackend(shard_id_t shard,
+ CollectionRef coll,
+ crimson::osd::ShardServices &shard_services,
+ DoutPrefixProvider &dpp)
+ : shard{shard},
+ coll{coll},
+ shard_services{shard_services},
+ dpp{dpp},
+ store{&shard_services.get_store()}
+{}
+
+PGBackend::load_metadata_iertr::future
+ <PGBackend::loaded_object_md_t::ref>
+PGBackend::load_metadata(const hobject_t& oid)
+{
+ return interruptor::make_interruptible(store->get_attrs(
+ coll,
+ ghobject_t{oid, ghobject_t::NO_GEN, shard})).safe_then_interruptible(
+ [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{
+ loaded_object_md_t::ref ret(new loaded_object_md_t());
+ if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) {
+ bufferlist bl = std::move(oiiter->second);
+ try {
+ ret->os = ObjectState(
+ object_info_t(bl, oid),
+ true);
+ } catch (const buffer::error&) {
+ logger().warn("unable to decode ObjectState");
+ throw crimson::osd::invalid_argument();
+ }
+ } else {
+ logger().error(
+ "load_metadata: object {} present but missing object info",
+ oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+
+ if (oid.is_head()) {
+    // Return object_corrupted when the object exists but the
+    // SnapSet is either not found or empty.
+ bool object_corrupted = true;
+ if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) {
+ object_corrupted = false;
+ bufferlist bl = std::move(ssiter->second);
+ if (bl.length()) {
+ ret->ssc = new crimson::osd::SnapSetContext(oid.get_snapdir());
+ try {
+ ret->ssc->snapset = SnapSet(bl);
+ ret->ssc->exists = true;
+ logger().debug(
+ "load_metadata: object {} and snapset {} present",
+ oid, ret->ssc->snapset);
+ } catch (const buffer::error&) {
+ logger().warn("unable to decode SnapSet");
+ throw crimson::osd::invalid_argument();
+ }
+ } else {
+ object_corrupted = true;
+ }
+ }
+ if (object_corrupted) {
+ logger().error(
+ "load_metadata: object {} present but missing snapset",
+ oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ }
+
+ return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>(
+ std::move(ret));
+ }, crimson::ct_error::enoent::handle([oid] {
+ logger().debug(
+ "load_metadata: object {} doesn't exist, returning empty metadata",
+ oid);
+ return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>(
+ new loaded_object_md_t{
+ ObjectState(
+ object_info_t(oid),
+ false),
+ oid.is_head() ? (new crimson::osd::SnapSetContext(oid)) : nullptr
+ });
+ }));
+}
+
+PGBackend::rep_op_fut_t
+PGBackend::mutate_object(
+ std::set<pg_shard_t> pg_shards,
+ crimson::osd::ObjectContextRef &&obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ logger().trace("mutate_object: num_ops={}", txn.get_num_ops());
+ if (obc->obs.exists) {
+#if 0
+ obc->obs.oi.version = ctx->at_version;
+ obc->obs.oi.prior_version = ctx->obs->oi.version;
+#endif
+
+ obc->obs.oi.prior_version = obc->obs.oi.version;
+ obc->obs.oi.version = osd_op_p.at_version;
+ if (osd_op_p.user_at_version > obc->obs.oi.user_version)
+ obc->obs.oi.user_version = osd_op_p.user_at_version;
+ obc->obs.oi.last_reqid = osd_op_p.req_id;
+ obc->obs.oi.mtime = osd_op_p.mtime;
+ obc->obs.oi.local_mtime = ceph_clock_now();
+
+ // object_info_t
+ {
+ ceph::bufferlist osv;
+ obc->obs.oi.encode_no_oid(osv, CEPH_FEATURES_ALL);
+ // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv);
+ }
+
+ // snapset
+ if (obc->obs.oi.soid.snap == CEPH_NOSNAP) {
+ logger().debug("final snapset {} in {}",
+ obc->ssc->snapset, obc->obs.oi.soid);
+ ceph::bufferlist bss;
+ encode(obc->ssc->snapset, bss);
+ txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss);
+ obc->ssc->exists = true;
+ } else {
+ logger().debug("no snapset (this is a clone)");
+ }
+ } else {
+ // reset cached ObjectState without enforcing eviction
+ obc->obs.oi = object_info_t(obc->obs.oi.soid);
+ }
+ return _submit_transaction(
+ std::move(pg_shards), obc->obs.oi.soid, std::move(txn),
+ std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries));
+}
+
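+// Only a full-object read (oi.size == data.length()) can be checked against
+// the stored data digest; partial reads are returned without verification.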
+static inline bool _read_verify_data(
+ const object_info_t& oi,
+ const ceph::bufferlist& data)
+{
+ if (oi.is_data_digest() && oi.size == data.length()) {
+ // whole object? can we verify the checksum?
+ if (auto crc = data.crc32c(-1); crc != oi.data_digest) {
+ logger().error("full-object read crc {} != expected {} on {}",
+ crc, oi.data_digest, oi.soid);
+ // todo: mark soid missing, perform recovery, and retry
+ return false;
+ }
+ }
+ return true;
+}
+
+PGBackend::read_ierrorator::future<>
+PGBackend::read(const ObjectState& os, OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ const auto& oi = os.oi;
+ const ceph_osd_op& op = osd_op.op;
+ const uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ logger().trace("read: {} {}~{}", oi.soid, offset, length);
+
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ // are we beyond truncate_size?
+ size_t size = oi.size;
+ if ((op.extent.truncate_seq > oi.truncate_seq) &&
+ (op.extent.truncate_size < offset + length) &&
+ (op.extent.truncate_size < size)) {
+ size = op.extent.truncate_size;
+ }
+ if (offset >= size) {
+    // read size was trimmed to zero; nothing to do
+ return read_errorator::now();
+ }
+ if (!length) {
+ // read the whole object if length is 0
+ length = size;
+ }
+ return _read(oi.soid, offset, length, op.flags).safe_then_interruptible_tuple(
+ [&delta_stats, &oi, &osd_op](auto&& bl) -> read_errorator::future<> {
+ if (!_read_verify_data(oi, bl)) {
+ // crc mismatches
+ return crimson::ct_error::object_corrupted::make();
+ }
+ logger().debug("read: data length: {}", bl.length());
+ osd_op.op.extent.length = bl.length();
+ osd_op.rval = 0;
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ osd_op.outdata = std::move(bl);
+ return read_errorator::now();
+ }, crimson::ct_error::input_output_error::handle([] {
+ return read_errorator::future<>{crimson::ct_error::object_corrupted::make()};
+ }),
+ read_errorator::pass_further{});
+}
+
+PGBackend::read_ierrorator::future<>
+PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ const auto& op = osd_op.op;
+ /* clients (particularly cephfs) may send truncate operations out of order
+ * w.r.t. reads. op.extent.truncate_seq and op.extent.truncate_size allow
+ * the OSD to determine whether the client submitted read needs to be
+ * adjusted to compensate for a truncate the OSD hasn't seen yet.
+ */
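+  /* Illustration (hypothetical values): object size 4096, client read 0~4096
+   * carrying truncate_seq=2/truncate_size=1024 while the OSD has only seen
+   * truncate_seq=1 -- the read below is clipped to 0~1024 so the client never
+   * sees data past its own pending truncate. */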
+ uint64_t adjusted_size = os.oi.size;
+ const uint64_t offset = op.extent.offset;
+ uint64_t adjusted_length = op.extent.length;
+ if ((os.oi.truncate_seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
+ (adjusted_size > op.extent.truncate_size)) {
+ adjusted_size = op.extent.truncate_size;
+ }
+ if (offset > adjusted_size) {
+ adjusted_length = 0;
+ } else if (offset + adjusted_length > adjusted_size) {
+ adjusted_length = adjusted_size - offset;
+ }
+ logger().trace("sparse_read: {} {}~{}",
+ os.oi.soid, op.extent.offset, op.extent.length);
+ return interruptor::make_interruptible(store->fiemap(coll, ghobject_t{os.oi.soid},
+ offset, adjusted_length)).safe_then_interruptible(
+ [&delta_stats, &os, &osd_op, this](auto&& m) {
+ return seastar::do_with(interval_set<uint64_t>{std::move(m)},
+ [&delta_stats, &os, &osd_op, this](auto&& extents) {
+ return interruptor::make_interruptible(store->readv(coll, ghobject_t{os.oi.soid},
+ extents, osd_op.op.flags)).safe_then_interruptible_tuple(
+ [&delta_stats, &os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> {
+ if (_read_verify_data(os.oi, bl)) {
+ osd_op.op.extent.length = bl.length();
+ // re-encode since it might be modified
+ ceph::encode(extents, osd_op.outdata);
+ encode_destructively(bl, osd_op.outdata);
+ logger().trace("sparse_read got {} bytes from object {}",
+ osd_op.op.extent.length, os.oi.soid);
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.op.extent.length, 10);
+ return read_errorator::make_ready_future<>();
+ } else {
+ // crc mismatches
+ return crimson::ct_error::object_corrupted::make();
+ }
+ }, crimson::ct_error::input_output_error::handle([] {
+ return read_errorator::future<>{crimson::ct_error::object_corrupted::make()};
+ }),
+ read_errorator::pass_further{});
+ });
+ });
+}
+
+namespace {
+
+ template<class CSum>
+ PGBackend::checksum_errorator::future<>
+ do_checksum(ceph::bufferlist& init_value_bl,
+ size_t chunk_size,
+ const ceph::bufferlist& buf,
+ ceph::bufferlist& result)
+ {
+ typename CSum::init_value_t init_value;
+ auto init_value_p = init_value_bl.cbegin();
+ try {
+ decode(init_value, init_value_p);
+ // chop off the consumed part
+ init_value_bl.splice(0, init_value_p.get_off());
+ } catch (const ceph::buffer::end_of_buffer&) {
+ logger().warn("{}: init value not provided", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ const uint32_t chunk_count = buf.length() / chunk_size;
+ ceph::bufferptr csum_data{
+ ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)};
+ Checksummer::calculate<CSum>(
+ init_value, chunk_size, 0, buf.length(), buf, &csum_data);
+ encode(chunk_count, result);
+ result.append(std::move(csum_data));
+ return PGBackend::checksum_errorator::now();
+ }
+}
+
+PGBackend::checksum_ierrorator::future<>
+PGBackend::checksum(const ObjectState& os, OSDOp& osd_op)
+{
+  // sanity checks and argument normalization
+ auto& checksum = osd_op.op.checksum;
+ if (checksum.offset == 0 && checksum.length == 0) {
+ // zeroed offset+length implies checksum whole object
+ checksum.length = os.oi.size;
+ } else if (checksum.offset >= os.oi.size) {
+    // read size was trimmed to zero; do nothing
+    // (see PGBackend::read())
+ return checksum_errorator::now();
+ }
+ if (checksum.chunk_size > 0) {
+ if (checksum.length == 0) {
+ logger().warn("{}: length required when chunk size provided", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ if (checksum.length % checksum.chunk_size != 0) {
+ logger().warn("{}: length not aligned to chunk size", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ } else {
+ checksum.chunk_size = checksum.length;
+ }
+ if (checksum.length == 0) {
+ uint32_t count = 0;
+ encode(count, osd_op.outdata);
+ return checksum_errorator::now();
+ }
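+  // At this point checksum.length and checksum.chunk_size are normalized;
+  // e.g. (hypothetical values) offset=0/length=0 on an 8192-byte object with
+  // chunk_size=4096 becomes length=8192 and yields two per-chunk checksums.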
+
+ // read the chunk to be checksum'ed
+ return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags)
+ .safe_then_interruptible(
+ [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> {
+ auto& checksum = osd_op.op.checksum;
+ if (read_bl.length() != checksum.length) {
+ logger().warn("checksum: bytes read {} != {}",
+ read_bl.length(), checksum.length);
+ return crimson::ct_error::invarg::make();
+ }
+ // calculate its checksum and put the result in outdata
+ switch (checksum.type) {
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
+ return do_checksum<Checksummer::xxhash32>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
+ return do_checksum<Checksummer::xxhash64>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
+ return do_checksum<Checksummer::crc32c>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ default:
+ logger().warn("checksum: unknown crc type ({})",
+ static_cast<uint32_t>(checksum.type));
+ return crimson::ct_error::invarg::make();
+ }
+ });
+}
+
+PGBackend::cmp_ext_ierrorator::future<>
+PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op)
+{
+ const ceph_osd_op& op = osd_op.op;
+ uint64_t obj_size = os.oi.size;
+ if (os.oi.truncate_seq < op.extent.truncate_seq &&
+ op.extent.offset + op.extent.length > op.extent.truncate_size) {
+ obj_size = op.extent.truncate_size;
+ }
+ uint64_t ext_len;
+ if (op.extent.offset >= obj_size) {
+ ext_len = 0;
+ } else if (op.extent.offset + op.extent.length > obj_size) {
+ ext_len = obj_size - op.extent.offset;
+ } else {
+ ext_len = op.extent.length;
+ }
+ auto read_ext = ll_read_ierrorator::make_ready_future<ceph::bufferlist>();
+ if (ext_len == 0) {
+ logger().debug("{}: zero length extent", __func__);
+ } else if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ } else {
+ read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0);
+ }
+ return read_ext.safe_then_interruptible([&osd_op](auto&& read_bl)
+ -> cmp_ext_errorator::future<> {
+ for (unsigned index = 0; index < osd_op.indata.length(); index++) {
+ char byte_in_op = osd_op.indata[index];
+ char byte_from_disk = (index < read_bl.length() ? read_bl[index] : 0);
+ if (byte_in_op != byte_from_disk) {
+ logger().debug("cmp_ext: mismatch at {}", index);
+ // Unlike other ops, we set osd_op.rval here and return a different
+ // error code via ct_error::cmp_fail.
+ osd_op.rval = -MAX_ERRNO - index;
+ return crimson::ct_error::cmp_fail::make();
+ }
+ }
+ osd_op.rval = 0;
+ return cmp_ext_errorator::make_ready_future<>();
+ });
+}
+
+PGBackend::stat_ierrorator::future<>
+PGBackend::stat(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ if (os.exists/* TODO: && !os.is_whiteout() */) {
+ logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime);
+ encode(os.oi.size, osd_op.outdata);
+ encode(os.oi.mtime, osd_op.outdata);
+ } else {
+ logger().debug("stat object does not exist");
+ return crimson::ct_error::enoent::make();
+ }
+ delta_stats.num_rd++;
+ return stat_errorator::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::_writefull(
+ ObjectState& os,
+ off_t truncate_size,
+ const bufferlist& bl,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ unsigned flags)
+{
+ const bool existing = maybe_create_new_object(os, txn, delta_stats);
+ if (existing && bl.length() < os.oi.size) {
+
+ txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, bl.length());
+ truncate_update_size_and_usage(delta_stats, os.oi, truncate_size);
+
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ bl.length(),
+ os.oi.size - bl.length());
+ }
+ if (bl.length()) {
+ txn.write(
+ coll->get_cid(), ghobject_t{os.oi.soid}, 0, bl.length(),
+ bl, flags);
+ update_size_and_usage(
+ delta_stats, os.oi, 0,
+ bl.length(), true);
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ 0,
+ std::max((uint64_t)bl.length(), os.oi.size));
+ }
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::_truncate(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ size_t offset,
+ size_t truncate_size,
+ uint32_t truncate_seq)
+{
+ if (truncate_seq) {
+ assert(offset == truncate_size);
+ if (truncate_seq <= os.oi.truncate_seq) {
+ logger().debug("{} truncate seq {} <= current {}, no-op",
+ __func__, truncate_seq, os.oi.truncate_seq);
+ return write_ertr::make_ready_future<>();
+ } else {
+ logger().debug("{} truncate seq {} > current {}, truncating",
+ __func__, truncate_seq, os.oi.truncate_seq);
+ os.oi.truncate_seq = truncate_seq;
+ os.oi.truncate_size = truncate_size;
+ }
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ if (os.oi.size != offset) {
+ txn.truncate(
+ coll->get_cid(),
+ ghobject_t{os.oi.soid}, offset);
+ if (os.oi.size > offset) {
+ // TODO: modified_ranges.union_of(trim);
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ offset,
+ os.oi.size - offset);
+ } else {
+ // os.oi.size < offset
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ os.oi.size,
+ offset - os.oi.size);
+ }
+ truncate_update_size_and_usage(delta_stats, os.oi, offset);
+ os.oi.clear_data_digest();
+ }
+ delta_stats.num_wr++;
+ return write_ertr::now();
+}
+
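+// Returns true if the object already existed (a whiteout counts as existing
+// and is cleared here), false if the object had to be created in the
+// transaction.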
+bool PGBackend::maybe_create_new_object(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists) {
+ ceph_assert(!os.oi.is_whiteout());
+ os.exists = true;
+ os.oi.new_object();
+
+ txn.touch(coll->get_cid(), ghobject_t{os.oi.soid});
+ delta_stats.num_objects++;
+ return false;
+ } else if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ delta_stats.num_whiteouts--;
+ }
+ return true;
+}
+
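+// Grows oi.size and the num_bytes stat when a write extends the object.
+// For illustration (hypothetical numbers): a 4096-byte write at offset 8192
+// to a 4096-byte object grows it to 12288, adding 8192 to num_bytes and 4
+// to num_wr_kb (shift_round_up(4096, 10)).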
+void PGBackend::update_size_and_usage(object_stat_sum_t& delta_stats,
+ object_info_t& oi, uint64_t offset,
+ uint64_t length, bool write_full)
+{
+ if (write_full ||
+ (offset + length > oi.size && length)) {
+ uint64_t new_size = offset + length;
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += new_size;
+ oi.size = new_size;
+ }
+ delta_stats.num_wr++;
+ delta_stats.num_wr_kb += shift_round_up(length, 10);
+}
+
+void PGBackend::truncate_update_size_and_usage(object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size)
+{
+ if (oi.size != truncate_size) {
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += truncate_size;
+ oi.size = truncate_size;
+ }
+}
+
+static bool is_offset_and_length_valid(
+ const std::uint64_t offset,
+ const std::uint64_t length)
+{
+ if (const std::uint64_t max = local_conf()->osd_max_object_size;
+ offset >= max || length > max || offset + length > max) {
+ logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; "
+ "Hard limit of object size is 4GB",
+ __func__, max, offset, length);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+PGBackend::interruptible_future<> PGBackend::set_allochint(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ maybe_create_new_object(os, txn, delta_stats);
+
+ os.oi.expected_object_size = osd_op.op.alloc_hint.expected_object_size;
+ os.oi.expected_write_size = osd_op.op.alloc_hint.expected_write_size;
+ os.oi.alloc_hint_flags = osd_op.op.alloc_hint.flags;
+ txn.set_alloc_hint(coll->get_cid(),
+ ghobject_t{os.oi.soid},
+ os.oi.expected_object_size,
+ os.oi.expected_write_size,
+ os.oi.alloc_hint_flags);
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::write(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ bufferlist buf = osd_op.indata;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ if (auto seq = os.oi.truncate_seq;
+ seq != 0 && op.extent.truncate_seq < seq) {
+ // old write, arrived after trimtrunc
+ if (offset + length > os.oi.size) {
+ // no-op
+ if (offset > os.oi.size) {
+ length = 0;
+ buf.clear();
+ } else {
+ // truncate
+ auto len = os.oi.size - offset;
+ buf.splice(len, length);
+ length = len;
+ }
+ }
+ } else if (op.extent.truncate_seq > seq) {
+ // write arrives before trimtrunc
+ if (os.exists && !os.oi.is_whiteout()) {
+ txn.truncate(coll->get_cid(),
+ ghobject_t{os.oi.soid}, op.extent.truncate_size);
+ if (op.extent.truncate_size != os.oi.size) {
+ os.oi.size = length;
+ if (op.extent.truncate_size > os.oi.size) {
+ osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size,
+ op.extent.truncate_size - os.oi.size);
+ } else {
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size,
+ os.oi.size - op.extent.truncate_size);
+ }
+ }
+ truncate_update_size_and_usage(delta_stats, os.oi, op.extent.truncate_size);
+ }
+ os.oi.truncate_seq = op.extent.truncate_seq;
+ os.oi.truncate_size = op.extent.truncate_size;
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ if (length == 0) {
+ if (offset > os.oi.size) {
+ txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset);
+ truncate_update_size_and_usage(delta_stats, os.oi, op.extent.offset);
+ } else {
+ txn.nop();
+ }
+ } else {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ offset, length, std::move(buf), op.flags);
+ update_size_and_usage(delta_stats, os.oi, offset, length);
+ }
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+ op.extent.length);
+
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<> PGBackend::write_same(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ const uint64_t len = op.writesame.length;
+ if (len == 0) {
+ return seastar::now();
+ }
+ if (op.writesame.data_length == 0 ||
+ len % op.writesame.data_length != 0 ||
+ op.writesame.data_length != osd_op.indata.length()) {
+ throw crimson::osd::invalid_argument();
+ }
+ ceph::bufferlist repeated_indata;
+ for (uint64_t size = 0; size < len; size += op.writesame.data_length) {
+ repeated_indata.append(osd_op.indata);
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ op.writesame.offset, len,
+ std::move(repeated_indata), op.flags);
+ update_size_and_usage(delta_stats, os.oi, op.writesame.offset, len);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len);
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::writefull(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ return _writefull(
+ os,
+ op.extent.truncate_size,
+ osd_op.indata,
+ txn,
+ osd_op_params,
+ delta_stats,
+ op.flags);
+}
+
+PGBackend::rollback_iertr::future<> PGBackend::rollback(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ crimson::osd::ObjectContextRef head,
+ crimson::osd::ObjectContextLoader& obc_loader)
+{
+ const ceph_osd_op& op = osd_op.op;
+ snapid_t snapid = (uint64_t)op.snap.snapid;
+ assert(os.oi.soid.is_head());
+ logger().debug("{} deleting {} and rolling back to old snap {}",
+ __func__, os.oi.soid ,snapid);
+ hobject_t target_coid = os.oi.soid;
+ target_coid.snap = snapid;
+ return obc_loader.with_clone_obc_only<RWState::RWWRITE>(
+ head, target_coid,
+ [this, &os, &txn, &delta_stats, &osd_op_params]
+ (auto resolved_obc) {
+ if (resolved_obc->obs.oi.soid.is_head()) {
+ // no-op: The resolved oid returned the head object
+ logger().debug("PGBackend::rollback: loaded head_obc: {}"
+ " do nothing",
+ resolved_obc->obs.oi.soid);
+ return rollback_iertr::now();
+ }
+ /* TODO: https://tracker.ceph.com/issues/59114 This implementation will not
+ * behave correctly for a rados operation consisting of a mutation followed
+ * by a rollback to a snapshot since the last mutation of the object.
+ * The correct behavior would be for the rollback to undo the mutation
+ * earlier in the operation by resolving to the clone created at the start
+ * of the operation (see resolve_oid).
+ * Instead, it will select HEAD leaving that mutation intact since the SnapSet won't
+ * yet contain that clone. This behavior exists in classic as well.
+ */
+ logger().debug("PGBackend::rollback: loaded clone_obc: {}",
+ resolved_obc->obs.oi.soid);
+ // 1) Delete current head
+ if (os.exists) {
+ txn.remove(coll->get_cid(), ghobject_t{os.oi.soid,
+ ghobject_t::NO_GEN, shard});
+ }
+ // 2) Clone correct snapshot into head
+ txn.clone(coll->get_cid(), ghobject_t{resolved_obc->obs.oi.soid},
+ ghobject_t{os.oi.soid});
+ // Copy clone obc.os.oi to os.oi
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ os.oi.copy_user_bits(resolved_obc->obs.oi);
+ delta_stats.num_bytes -= os.oi.size;
+ delta_stats.num_bytes += resolved_obc->obs.oi.size;
+ osd_op_params.clean_regions.mark_data_region_dirty(0,
+ std::max(os.oi.size, resolved_obc->obs.oi.size));
+ osd_op_params.clean_regions.mark_omap_dirty();
+ // TODO: 3) Calculate clone_overlaps by following overlaps
+ // forward from rollback snapshot
+ // https://tracker.ceph.com/issues/58263
+ return rollback_iertr::now();
+ }).safe_then_interruptible([] {
+    logger().debug("PGBackend::rollback succeeded");
+ return rollback_iertr::now();
+ },// there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object;
+ // otherwise, do nothing.
+ crimson::ct_error::enoent::handle(
+ [this, &os, &snapid, &txn, &delta_stats] {
+ logger().debug("PGBackend::rollback: deleting head on {}"
+ " with snap_id of {}"
+ " because got ENOENT|whiteout on obc lookup",
+ os.oi.soid, snapid);
+ return remove(os, txn, delta_stats, false);
+ }),
+ rollback_ertr::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error in rollback"}
+ );
+}
+
+PGBackend::append_ierrorator::future<> PGBackend::append(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ if (op.extent.length) {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ os.oi.size /* offset */, op.extent.length,
+ std::move(osd_op.indata), op.flags);
+ update_size_and_usage(delta_stats, os.oi, os.oi.size,
+ op.extent.length);
+ osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size,
+ op.extent.length);
+ }
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::truncate(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{} object dne, truncate is a no-op", __func__);
+ return write_ertr::now();
+ }
+ const ceph_osd_op& op = osd_op.op;
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+ return _truncate(
+ os, txn, osd_op_params, delta_stats,
+ op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq);
+}
+
+PGBackend::write_iertr::future<> PGBackend::zero(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{} object dne, zero is a no-op", __func__);
+ return write_ertr::now();
+ }
+ const ceph_osd_op& op = osd_op.op;
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ if (op.extent.offset >= os.oi.size || op.extent.length == 0) {
+ return write_iertr::now(); // noop
+ }
+
+ if (op.extent.offset + op.extent.length >= os.oi.size) {
+ return _truncate(
+ os, txn, osd_op_params, delta_stats,
+ op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq);
+ }
+
+ txn.zero(coll->get_cid(),
+ ghobject_t{os.oi.soid},
+ op.extent.offset,
+ op.extent.length);
+ // TODO: modified_ranges.union_of(zeroed);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+ op.extent.length);
+ delta_stats.num_wr++;
+ os.oi.clear_data_digest();
+ return write_ertr::now();
+}
+
+PGBackend::create_iertr::future<> PGBackend::create(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ if (os.exists && !os.oi.is_whiteout() &&
+ (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
+ // this is an exclusive create
+ return crimson::ct_error::eexist::make();
+ }
+
+ if (osd_op.indata.length()) {
+    // handle the legacy `category` argument; it is no longer implemented.
+ try {
+ auto p = osd_op.indata.cbegin();
+ std::string category;
+ decode(category, p);
+ } catch (buffer::error&) {
+ return crimson::ct_error::invarg::make();
+ }
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ txn.create(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<>
+PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn)
+{
+ // todo: snapset
+ txn.remove(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ os.oi.size = 0;
+ os.oi.new_object();
+ os.exists = false;
+ // todo: update watchers
+ if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+ return seastar::now();
+}
+
+PGBackend::remove_iertr::future<>
+PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats, bool whiteout)
+{
+  if (!os.exists) {
+    logger().debug("{} {} does not exist", __func__, os.oi.soid);
+    return crimson::ct_error::enoent::make();
+  }
+ if (whiteout && os.oi.is_whiteout()) {
+ logger().debug("{} whiteout set on {} ",__func__, os.oi.soid);
+ return seastar::now();
+ }
+ txn.remove(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ delta_stats.num_bytes -= os.oi.size;
+ os.oi.size = 0;
+ os.oi.new_object();
+
+ // todo: clone_overlap
+ if (whiteout) {
+ logger().debug("{} setting whiteout on {} ",__func__, os.oi.soid);
+ os.oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ delta_stats.num_whiteouts++;
+ txn.create(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ return seastar::now();
+ }
+ // todo: update watchers
+ if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ delta_stats.num_whiteouts--;
+ }
+ delta_stats.num_objects--;
+ os.exists = false;
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>>
+PGBackend::list_objects(const hobject_t& start, uint64_t limit) const
+{
+ auto gstart = start.is_min() ? ghobject_t{} : ghobject_t{start, 0, shard};
+ return interruptor::make_interruptible(store->list_objects(coll,
+ gstart,
+ ghobject_t::get_max(),
+ limit))
+ .then_interruptible([](auto ret) {
+ auto& [gobjects, next] = ret;
+ std::vector<hobject_t> objects;
+ boost::copy(gobjects |
+ boost::adaptors::filtered([](const ghobject_t& o) {
+ if (o.is_pgmeta()) {
+ return false;
+ } else if (o.hobj.is_temp()) {
+ return false;
+ } else {
+ return o.is_no_gen();
+ }
+ }) |
+ boost::adaptors::transformed([](const ghobject_t& o) {
+ return o.hobj;
+ }),
+ std::back_inserter(objects));
+ return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>(
+ std::make_tuple(objects, next.hobj));
+ });
+}
+
+PGBackend::setxattr_ierrorator::future<> PGBackend::setxattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ if (local_conf()->osd_max_attr_size > 0 &&
+ osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ const auto max_name_len = std::min<uint64_t>(
+ store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len);
+ if (osd_op.op.xattr.name_len > max_name_len) {
+ return crimson::ct_error::enametoolong::make();
+ }
+
+ maybe_create_new_object(os, txn, delta_stats);
+
+ std::string name{"_"};
+ ceph::bufferlist val;
+ {
+ auto bp = osd_op.indata.cbegin();
+ bp.copy(osd_op.op.xattr.name_len, name);
+ bp.copy(osd_op.op.xattr.value_len, val);
+ }
+ logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name);
+ txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val);
+ delta_stats.num_wr++;
+ return seastar::now();
+}
+
+PGBackend::get_attr_ierrorator::future<> PGBackend::getxattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ std::string name;
+ ceph::bufferlist val;
+ {
+ auto bp = osd_op.indata.cbegin();
+ std::string aname;
+ bp.copy(osd_op.op.xattr.name_len, aname);
+ name = "_" + aname;
+ }
+ logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name);
+ return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible(
+ [&delta_stats, &osd_op] (ceph::bufferlist&& val) {
+ osd_op.outdata = std::move(val);
+ osd_op.op.xattr.value_len = osd_op.outdata.length();
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ return get_attr_errorator::now();
+ });
+}
+
+PGBackend::get_attr_ierrorator::future<ceph::bufferlist>
+PGBackend::getxattr(
+ const hobject_t& soid,
+ std::string_view key) const
+{
+ return store->get_attr(coll, ghobject_t{soid}, key);
+}
+
+PGBackend::get_attr_ierrorator::future<ceph::bufferlist>
+PGBackend::getxattr(
+ const hobject_t& soid,
+ std::string&& key) const
+{
+ return seastar::do_with(key, [this, &soid](auto &key) {
+ return store->get_attr(coll, ghobject_t{soid}, key);
+ });
+}
+
+PGBackend::get_attr_ierrorator::future<> PGBackend::get_xattrs(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then(
+ [&delta_stats, &osd_op](auto&& attrs) {
+ std::vector<std::pair<std::string, bufferlist>> user_xattrs;
+ ceph::bufferlist bl;
+ for (auto& [key, val] : attrs) {
+ if (key.size() > 1 && key[0] == '_') {
+ bl.append(std::move(val));
+ user_xattrs.emplace_back(key.substr(1), std::move(bl));
+ }
+ }
+ ceph::encode(user_xattrs, osd_op.outdata);
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ return get_attr_errorator::now();
+ });
+}
+
+namespace {
+
+template<typename U, typename V>
+int do_cmp_xattr(int op, const U& lhs, const V& rhs)
+{
+ switch (op) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ return lhs == rhs;
+ case CEPH_OSD_CMPXATTR_OP_NE:
+ return lhs != rhs;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ return lhs > rhs;
+ case CEPH_OSD_CMPXATTR_OP_GTE:
+ return lhs >= rhs;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ return lhs < rhs;
+ case CEPH_OSD_CMPXATTR_OP_LTE:
+ return lhs <= rhs;
+ default:
+ return -EINVAL;
+ }
+}
+
+} // anonymous namespace
+
+static int do_xattr_cmp_u64(int op, uint64_t lhs, bufferlist& rhs_xattr)
+{
+ uint64_t rhs;
+
+ if (rhs_xattr.length() > 0) {
+ const char* first = rhs_xattr.c_str();
+ if (auto [p, ec] = std::from_chars(first, first + rhs_xattr.length(), rhs);
+ ec != std::errc()) {
+ return -EINVAL;
+ }
+ } else {
+ rhs = 0;
+ }
+ logger().debug("do_xattr_cmp_u64 '{}' vs '{}' op {}", lhs, rhs, op);
+ return do_cmp_xattr(op, lhs, rhs);
+}
+
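+// CMPXATTR: compares a client-supplied value against the stored xattr.
+// For illustration (hypothetical values): with the xattr stored as the ASCII
+// string "42", CEPH_OSD_CMPXATTR_MODE_U64 with op GT and lhs=100 parses the
+// stored value as 42 and the comparison (100 > 42) succeeds, so rval is set
+// to 1; a false comparison maps to ECANCELED below.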
+PGBackend::cmp_xattr_ierrorator::future<> PGBackend::cmp_xattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ std::string name{"_"};
+ auto bp = osd_op.indata.cbegin();
+ bp.copy(osd_op.op.xattr.name_len, name);
+
+ logger().debug("cmpxattr on obj={} for attr={}", os.oi.soid, name);
+ return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible(
+ [&delta_stats, &osd_op] (auto &&xattr) -> cmp_xattr_ierrorator::future<> {
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10);
+
+ int result = 0;
+ auto bp = osd_op.indata.cbegin();
+ bp += osd_op.op.xattr.name_len;
+
+ switch (osd_op.op.xattr.cmp_mode) {
+ case CEPH_OSD_CMPXATTR_MODE_STRING:
+ {
+ string lhs;
+ bp.copy(osd_op.op.xattr.value_len, lhs);
+ string_view rhs(xattr.c_str(), xattr.length());
+ result = do_cmp_xattr(osd_op.op.xattr.cmp_op, lhs, rhs);
+ logger().debug("cmpxattr lhs={}, rhs={}", lhs, rhs);
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_MODE_U64:
+ {
+ uint64_t lhs;
+ try {
+ decode(lhs, bp);
+ } catch (ceph::buffer::error& e) {
+ logger().info("cmp_xattr: buffer error expection");
+ result = -EINVAL;
+ break;
+ }
+ result = do_xattr_cmp_u64(osd_op.op.xattr.cmp_op, lhs, xattr);
+ }
+ break;
+ default:
+ logger().info("bad cmp mode {}", osd_op.op.xattr.cmp_mode);
+ result = -EINVAL;
+ }
+ if (result == 0) {
+ logger().info("cmp_xattr: comparison returned false");
+ return crimson::ct_error::ecanceled::make();
+ } else if (result == -EINVAL) {
+ return crimson::ct_error::invarg::make();
+ } else {
+ osd_op.rval = 1;
+ return cmp_xattr_ierrorator::now();
+ }
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&delta_stats, &osd_op] ()
+ ->cmp_xattr_errorator::future<> {
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10);
+ return crimson::ct_error::ecanceled::make();
+ }),
+ cmp_xattr_errorator::pass_further{}
+ );
+}
+
+PGBackend::rm_xattr_iertr::future<>
+PGBackend::rm_xattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ auto bp = osd_op.indata.cbegin();
+ string attr_name{"_"};
+ bp.copy(osd_op.op.xattr.name_len, attr_name);
+ txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name);
+ return rm_xattr_iertr::now();
+}
+
+void PGBackend::clone(
+ /* const */object_info_t& snap_oi,
+ const ObjectState& os,
+ const ObjectState& d_os,
+ ceph::os::Transaction& txn)
+{
+ // See OpsExecutor::execute_clone documentation
+ txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid});
+ {
+ ceph::bufferlist bv;
+ snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL);
+ txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv);
+ }
+ txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR);
+}
+
+using get_omap_ertr =
+ crimson::os::FuturizedStore::Shard::read_errorator::extend<
+ crimson::ct_error::enodata>;
+using get_omap_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ get_omap_ertr>;
+static
+get_omap_iertr::future<
+ crimson::os::FuturizedStore::Shard::omap_values_t>
+maybe_get_omap_vals_by_keys(
+ crimson::os::FuturizedStore::Shard* store,
+ const crimson::os::CollectionRef& coll,
+ const object_info_t& oi,
+ const std::set<std::string>& keys_to_get)
+{
+ if (oi.is_omap()) {
+ return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+static
+get_omap_iertr::future<
+ std::tuple<bool, crimson::os::FuturizedStore::Shard::omap_values_t>>
+maybe_get_omap_vals(
+ crimson::os::FuturizedStore::Shard* store,
+ const crimson::os::CollectionRef& coll,
+ const object_info_t& oi,
+ const std::string& start_after)
+{
+ if (oi.is_omap()) {
+ return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+PGBackend::ll_read_ierrorator::future<ceph::bufferlist>
+PGBackend::omap_get_header(
+ const crimson::os::CollectionRef& c,
+ const ghobject_t& oid) const
+{
+ return store->omap_get_header(c, oid)
+ .handle_error(
+ crimson::ct_error::enodata::handle([] {
+ return seastar::make_ready_future<bufferlist>();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_header(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (os.oi.is_omap()) {
+ return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible(
+ [&delta_stats, &osd_op] (ceph::bufferlist&& header) {
+ osd_op.outdata = std::move(header);
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+ return seastar::now();
+ });
+ } else {
+    // no omap? return empty data but not ENOENT. This is important for
+    // the case when the object is being created due to may_write().
+ return seastar::now();
+ }
+}
+
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ std::string start_after;
+ uint64_t max_return;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(start_after, p);
+ decode(max_return, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+ max_return =
+ std::min(max_return, local_conf()->osd_max_omap_entries_per_request);
+
+ // TODO: truly chunk the reading
+ return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then_interruptible(
+ [=,&delta_stats, &osd_op](auto ret) {
+ ceph::bufferlist result;
+ bool truncated = false;
+ uint32_t num = 0;
+ for (auto &[key, val] : std::get<1>(ret)) {
+ if (num >= max_return ||
+ result.length() >= local_conf()->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(key, result);
+ ++num;
+ }
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(result);
+ encode(truncated, osd_op.outdata);
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ uint32_t num = 0;
+ bool truncated = false;
+ encode(num, osd_op.outdata);
+ encode(truncated, osd_op.outdata);
+ osd_op.rval = 0;
+ return seastar::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+static
+PGBackend::omap_cmp_ertr::future<> do_omap_val_cmp(
+ std::map<std::string, bufferlist, std::less<>> out,
+ std::map<std::string, std::pair<bufferlist, int>> assertions)
+{
+ bufferlist empty;
+ for (const auto &[akey, avalue] : assertions) {
+ const auto [abl, aflag] = avalue;
+ auto out_entry = out.find(akey);
+ bufferlist &bl = (out_entry != out.end()) ? out_entry->second : empty;
+ switch (aflag) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ if (!(bl == abl)) {
+ return crimson::ct_error::ecanceled::make();
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ if (!(bl < abl)) {
+ return crimson::ct_error::ecanceled::make();
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ if (!(bl > abl)) {
+ return crimson::ct_error::ecanceled::make();
+ }
+ break;
+ default:
+ return crimson::ct_error::invarg::make();
+ }
+ }
+ return PGBackend::omap_cmp_ertr::now();
+}
+PGBackend::omap_cmp_iertr::future<>
+PGBackend::omap_cmp(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ auto bp = osd_op.indata.cbegin();
+ std::map<std::string, std::pair<bufferlist, int> > assertions;
+ try {
+ decode(assertions, bp);
+ } catch (buffer::error&) {
+ return crimson::ct_error::invarg::make();
+ }
+
+ delta_stats.num_rd++;
+ if (os.oi.is_omap()) {
+ std::set<std::string> to_get;
+ for (auto &i: assertions) {
+ to_get.insert(i.first);
+ }
+ return store->omap_get_values(coll, ghobject_t{os.oi.soid}, to_get)
+ .safe_then([=, &osd_op] (auto&& out) -> omap_cmp_iertr::future<> {
+ osd_op.rval = 0;
+ return do_omap_val_cmp(out, assertions);
+ });
+ } else {
+ return crimson::ct_error::ecanceled::make();
+ }
+}
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_vals(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ std::string start_after;
+ uint64_t max_return;
+ std::string filter_prefix;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(start_after, p);
+ decode(max_return, p);
+ decode(filter_prefix, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+  max_return =
+    std::min(max_return, local_conf()->osd_max_omap_entries_per_request);
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+
+ // TODO: truly chunk the reading
+ return maybe_get_omap_vals(store, coll, os.oi, start_after)
+ .safe_then_interruptible(
+ [=, &osd_op] (auto&& ret) {
+ auto [done, vals] = std::move(ret);
+ assert(done);
+ ceph::bufferlist result;
+ bool truncated = false;
+ uint32_t num = 0;
+ auto iter = filter_prefix > start_after ? vals.lower_bound(filter_prefix)
+ : std::begin(vals);
+ for (; iter != std::end(vals); ++iter) {
+ const auto& [key, value] = *iter;
+ if (key.substr(0, filter_prefix.size()) != filter_prefix) {
+ break;
+ } else if (num >= max_return ||
+ result.length() >= local_conf()->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(key, result);
+ encode(value, result);
+ ++num;
+ }
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(result);
+ encode(truncated, osd_op.outdata);
+ return ll_read_errorator::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ encode(uint32_t{0} /* num */, osd_op.outdata);
+ encode(bool{false} /* truncated */, osd_op.outdata);
+ osd_op.rval = 0;
+ return ll_read_errorator::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_vals_by_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ std::set<std::string> keys_to_get;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(keys_to_get, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument();
+ }
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+ return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get)
+ .safe_then_interruptible(
+ [&osd_op] (crimson::os::FuturizedStore::Shard::omap_values_t&& vals) {
+ encode(vals, osd_op.outdata);
+ return ll_read_errorator::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ uint32_t num = 0;
+ encode(num, osd_op.outdata);
+ osd_op.rval = 0;
+ return ll_read_errorator::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+
+PGBackend::interruptible_future<>
+PGBackend::omap_set_vals(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ maybe_create_new_object(os, txn, delta_stats);
+
+ ceph::bufferlist to_set_bl;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode_str_str_map_to_bl(p, &to_set_bl);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl);
+ osd_op_params.clean_regions.mark_omap_dirty();
+ delta_stats.num_wr++;
+ delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
+ os.oi.set_flag(object_info_t::FLAG_OMAP);
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<>
+PGBackend::omap_set_header(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ maybe_create_new_object(os, txn, delta_stats);
+ txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata);
+ osd_op_params.clean_regions.mark_omap_dirty();
+ delta_stats.num_wr++;
+ os.oi.set_flag(object_info_t::FLAG_OMAP);
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<> PGBackend::omap_remove_range(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ std::string key_begin, key_end;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(key_begin, p);
+ decode(key_end, p);
+ } catch (buffer::error& e) {
+ throw crimson::osd::invalid_argument{};
+ }
+ txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end);
+ delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<> PGBackend::omap_remove_key(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ ceph::bufferlist to_rm_bl;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode_str_set_to_bl(p, &to_rm_bl);
+ } catch (buffer::error& e) {
+ throw crimson::osd::invalid_argument{};
+ }
+ txn.omap_rmkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_rm_bl);
+ // TODO:
+ // ctx->clean_regions.mark_omap_dirty();
+ // ctx->delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::omap_clear_iertr::future<>
+PGBackend::omap_clear(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ if (!os.oi.is_omap()) {
+ return omap_clear_ertr::now();
+ }
+ txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid});
+ osd_op_params.clean_regions.mark_omap_dirty();
+ delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ os.oi.clear_flag(object_info_t::FLAG_OMAP);
+ return omap_clear_ertr::now();
+}
+
+PGBackend::interruptible_future<struct stat>
+PGBackend::stat(
+ CollectionRef c,
+ const ghobject_t& oid) const
+{
+ return store->stat(c, oid);
+}
+
+PGBackend::read_errorator::future<std::map<uint64_t, uint64_t>>
+PGBackend::fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ return store->fiemap(c, oid, off, len);
+}
+
+PGBackend::write_iertr::future<> PGBackend::tmapput(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params)
+{
+ logger().debug("PGBackend::tmapput: {}", os.oi.soid);
+ auto ret = crimson::common::do_tmap_put(osd_op.indata.cbegin());
+ if (!ret.has_value()) {
+ logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret.error());
+ ceph_assert(ret.error() == -EINVAL);
+ return crimson::ct_error::invarg::make();
+ } else {
+ auto bl = std::move(ret.value());
+ return _writefull(
+ os,
+ bl.length(),
+ std::move(bl),
+ txn,
+ osd_op_params,
+ delta_stats,
+ 0);
+ }
+}
+
+PGBackend::tmapup_iertr::future<> PGBackend::tmapup(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params)
+{
+ logger().debug("PGBackend::tmapup: {}", os.oi.soid);
+ return PGBackend::write_iertr::now(
+ ).si_then([this, &os] {
+ return _read(os.oi.soid, 0, os.oi.size, 0);
+ }).handle_error_interruptible(
+ crimson::ct_error::enoent::handle([](auto &) {
+ return seastar::make_ready_future<bufferlist>();
+ }),
+ PGBackend::write_iertr::pass_further{},
+ crimson::ct_error::assert_all{"read error in mutate_object_contents"}
+ ).si_then([this, &os, &osd_op, &txn,
+ &delta_stats, &osd_op_params]
+ (auto &&bl) mutable -> PGBackend::tmapup_iertr::future<> {
+ auto result = crimson::common::do_tmap_up(
+ osd_op.indata.cbegin(),
+ std::move(bl));
+ if (!result.has_value()) {
+ int ret = result.error();
+ logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret);
+ switch (ret) {
+ case -EEXIST:
+ return crimson::ct_error::eexist::make();
+ case -ENOENT:
+ return crimson::ct_error::enoent::make();
+ case -EINVAL:
+ return crimson::ct_error::invarg::make();
+ default:
+ ceph_assert(0 == "impossible error");
+ return crimson::ct_error::invarg::make();
+ }
+ }
+
+ logger().debug(
+ "PGBackend::tmapup: {}, result.value.length()={}, ret=0",
+ os.oi.soid, result.value().length());
+ return _writefull(
+ os,
+ result.value().length(),
+ result.value(),
+ txn,
+ osd_op_params,
+ delta_stats,
+ 0);
+ });
+}
+
+PGBackend::read_ierrorator::future<> PGBackend::tmapget(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ logger().debug("PGBackend::tmapget: {}", os.oi.soid);
+ const auto& oi = os.oi;
+ logger().debug("PGBackend::tmapget: read {} 0~{}", oi.soid, oi.size);
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("PGBackend::tmapget: {} DNE", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ return _read(oi.soid, 0, oi.size, 0).safe_then_interruptible_tuple(
+ [&delta_stats, &osd_op](auto&& bl) -> read_errorator::future<> {
+ logger().debug("PGBackend::tmapget: data length: {}", bl.length());
+ osd_op.op.extent.length = bl.length();
+ osd_op.rval = 0;
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ osd_op.outdata = std::move(bl);
+ return read_errorator::now();
+ }, crimson::ct_error::input_output_error::handle([] {
+ return read_errorator::future<>{crimson::ct_error::object_corrupted::make()};
+ }),
+ read_errorator::pass_further{});
+}
+
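
The cmp_xattr path above reduces to two cases: string mode compares the supplied operand against the stored attribute bytes, while u64 mode parses the stored attribute as a decimal number (an empty attribute compares as 0) and rejects non-numeric content as EINVAL. Below is a minimal standalone sketch of that comparison logic in plain C++17 with no crimson types; CmpOp, cmp and cmp_u64 are illustrative names, not crimson APIs.

    // Sketch of the CEPH_OSD_CMPXATTR_* comparison modes used by cmp_xattr above.
    #include <charconv>
    #include <cstdint>
    #include <string_view>

    enum class CmpOp { EQ, NE, GT, GTE, LT, LTE };

    template <typename L, typename R>
    bool cmp(CmpOp op, const L& lhs, const R& rhs) {
      switch (op) {
      case CmpOp::EQ:  return lhs == rhs;
      case CmpOp::NE:  return lhs != rhs;
      case CmpOp::GT:  return lhs >  rhs;
      case CmpOp::GTE: return lhs >= rhs;
      case CmpOp::LT:  return lhs <  rhs;
      case CmpOp::LTE: return lhs <= rhs;
      }
      return false;
    }

    // U64 mode: parse the stored attribute as a decimal number; an empty
    // attribute compares as 0, non-numeric content is an error (EINVAL above).
    bool cmp_u64(CmpOp op, std::uint64_t lhs, std::string_view stored, bool& invalid) {
      std::uint64_t rhs = 0;
      if (!stored.empty()) {
        auto [ptr, ec] =
          std::from_chars(stored.data(), stored.data() + stored.size(), rhs);
        if (ec != std::errc()) {
          invalid = true;
          return false;
        }
      }
      invalid = false;
      return cmp(op, lhs, rhs);
    }
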
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
new file mode 100644
index 000000000..fbad37d4c
--- /dev/null
+++ b/src/crimson/osd/pg_backend.h
@@ -0,0 +1,448 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <boost/container/flat_set.hpp>
+
+#include "include/rados.h"
+
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/acked_peers.h"
+#include "crimson/common/shared_lru.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/osdop_params.h"
+
+struct hobject_t;
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::osd {
+ class ShardServices;
+ class PG;
+ class ObjectContextLoader;
+}
+
+class PGBackend
+{
+protected:
+ using CollectionRef = crimson::os::CollectionRef;
+ using ec_profile_t = std::map<std::string, std::string>;
+ // low-level read errorator
+ using ll_read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
+ using ll_read_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ ll_read_errorator>;
+
+public:
+ using load_metadata_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+ using load_metadata_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_metadata_ertr>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, T>;
+ using rep_op_fut_t =
+ std::tuple<interruptible_future<>,
+ interruptible_future<crimson::osd::acked_peers_t>>;
+ PGBackend(shard_id_t shard, CollectionRef coll,
+ crimson::osd::ShardServices &shard_services,
+ DoutPrefixProvider &dpp);
+ virtual ~PGBackend() = default;
+ static std::unique_ptr<PGBackend> create(pg_t pgid,
+ const pg_shard_t pg_shard,
+ const pg_pool_t& pool,
+ crimson::os::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ DoutPrefixProvider &dpp);
+ using attrs_t =
+ std::map<std::string, ceph::bufferptr, std::less<>>;
+ using read_errorator = ll_read_errorator::extend<
+ crimson::ct_error::object_corrupted>;
+ using read_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ read_errorator>;
+ read_ierrorator::future<> read(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+ read_ierrorator::future<> sparse_read(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+ using checksum_errorator = ll_read_errorator::extend<
+ crimson::ct_error::object_corrupted,
+ crimson::ct_error::invarg>;
+ using checksum_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ checksum_errorator>;
+ checksum_ierrorator::future<> checksum(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using cmp_ext_errorator = ll_read_errorator::extend<
+ crimson::ct_error::invarg,
+ crimson::ct_error::cmp_fail>;
+ using cmp_ext_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ cmp_ext_errorator>;
+ cmp_ext_ierrorator::future<> cmp_ext(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using stat_errorator = crimson::errorator<crimson::ct_error::enoent>;
+ using stat_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ stat_errorator>;
+ stat_ierrorator::future<> stat(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+
+ // TODO: switch the entire write family to errorator.
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::file_too_large,
+ crimson::ct_error::invarg>;
+ using write_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ write_ertr>;
+ using create_ertr = crimson::errorator<
+ crimson::ct_error::invarg,
+ crimson::ct_error::eexist>;
+ using create_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ create_ertr>;
+ create_iertr::future<> create(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ using remove_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ using remove_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ remove_ertr>;
+ remove_iertr::future<> remove(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats,
+ bool whiteout);
+ interruptible_future<> remove(
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ interruptible_future<> set_allochint(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ write_iertr::future<> write(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ interruptible_future<> write_same(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ write_iertr::future<> writefull(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ using append_errorator = crimson::errorator<
+ crimson::ct_error::invarg>;
+ using append_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ append_errorator>;
+ append_ierrorator::future<> append(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ using rollback_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ using rollback_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ rollback_ertr>;
+ rollback_iertr::future<> rollback(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ crimson::osd::ObjectContextRef head,
+ crimson::osd::ObjectContextLoader& obc_loader);
+ write_iertr::future<> truncate(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ write_iertr::future<> zero(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ rep_op_fut_t mutate_object(
+ std::set<pg_shard_t> pg_shards,
+ crimson::osd::ObjectContextRef &&obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries);
+ interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects(
+ const hobject_t& start,
+ uint64_t limit) const;
+ using setxattr_errorator = crimson::errorator<
+ crimson::ct_error::file_too_large,
+ crimson::ct_error::enametoolong>;
+ using setxattr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ setxattr_errorator>;
+ setxattr_ierrorator::future<> setxattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ using get_attr_errorator = crimson::os::FuturizedStore::Shard::get_attr_errorator;
+ using get_attr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ get_attr_errorator>;
+ get_attr_ierrorator::future<> getxattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ get_attr_ierrorator::future<ceph::bufferlist> getxattr(
+ const hobject_t& soid,
+ std::string_view key) const;
+ get_attr_ierrorator::future<ceph::bufferlist> getxattr(
+ const hobject_t& soid,
+ std::string&& key) const;
+ get_attr_ierrorator::future<> get_xattrs(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ using cmp_xattr_errorator = get_attr_errorator::extend<
+ crimson::ct_error::ecanceled,
+ crimson::ct_error::invarg>;
+ using cmp_xattr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ cmp_xattr_errorator>;
+ cmp_xattr_ierrorator::future<> cmp_xattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>;
+ using rm_xattr_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ rm_xattr_ertr>;
+ rm_xattr_iertr::future<> rm_xattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ void clone(
+ /* const */object_info_t& snap_oi,
+ const ObjectState& os,
+ const ObjectState& d_os,
+ ceph::os::Transaction& trans);
+ interruptible_future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) const;
+ read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len);
+
+ write_iertr::future<> tmapput(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params);
+
+ using tmapup_ertr = write_ertr::extend<
+ crimson::ct_error::enoent,
+ crimson::ct_error::eexist>;
+ using tmapup_iertr = ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ tmapup_ertr>;
+ tmapup_iertr::future<> tmapup(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params);
+
+ read_ierrorator::future<> tmapget(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+
+ // OMAP
+ ll_read_ierrorator::future<> omap_get_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ using omap_cmp_ertr =
+ crimson::os::FuturizedStore::Shard::read_errorator::extend<
+ crimson::ct_error::ecanceled,
+ crimson::ct_error::invarg>;
+ using omap_cmp_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ omap_cmp_ertr>;
+ omap_cmp_iertr::future<> omap_cmp(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ ll_read_ierrorator::future<> omap_get_vals(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ ll_read_ierrorator::future<> omap_get_vals_by_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ interruptible_future<> omap_set_vals(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ ll_read_ierrorator::future<ceph::bufferlist> omap_get_header(
+ const crimson::os::CollectionRef& c,
+ const ghobject_t& oid) const;
+ ll_read_ierrorator::future<> omap_get_header(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ interruptible_future<> omap_set_header(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ interruptible_future<> omap_remove_range(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ interruptible_future<> omap_remove_key(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>;
+ using omap_clear_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ omap_clear_ertr>;
+ omap_clear_iertr::future<> omap_clear(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+
+ virtual void got_rep_op_reply(const MOSDRepOpReply&) {}
+ virtual seastar::future<> stop() = 0;
+ virtual void on_actingset_changed(bool same_primary) = 0;
+protected:
+ const shard_id_t shard;
+ CollectionRef coll;
+ crimson::osd::ShardServices &shard_services;
+ DoutPrefixProvider &dpp; ///< provides log prefix context
+ crimson::os::FuturizedStore::Shard* store;
+ virtual seastar::future<> request_committed(
+ const osd_reqid_t& reqid,
+ const eversion_t& at_version) = 0;
+public:
+ struct loaded_object_md_t {
+ ObjectState os;
+ crimson::osd::SnapSetContextRef ssc;
+ using ref = std::unique_ptr<loaded_object_md_t>;
+ };
+ load_metadata_iertr::future<loaded_object_md_t::ref>
+ load_metadata(
+ const hobject_t &oid);
+
+private:
+ virtual ll_read_ierrorator::future<ceph::bufferlist> _read(
+ const hobject_t& hoid,
+ size_t offset,
+ size_t length,
+ uint32_t flags) = 0;
+ write_iertr::future<> _writefull(
+ ObjectState& os,
+ off_t truncate_size,
+ const bufferlist& bl,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ unsigned flags);
+ write_iertr::future<> _truncate(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ size_t offset,
+ size_t truncate_size,
+ uint32_t truncate_seq);
+
+ bool maybe_create_new_object(ObjectState& os,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats);
+ void update_size_and_usage(object_stat_sum_t& delta_stats,
+ object_info_t& oi, uint64_t offset,
+ uint64_t length, bool write_full = false);
+ void truncate_update_size_and_usage(
+ object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size);
+ virtual rep_op_fut_t
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) = 0;
+ friend class ReplicatedRecoveryBackend;
+ friend class ::crimson::osd::PG;
+};
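
The *_ertr / *_iertr aliases declared above all follow one pattern: each operation advertises a closed set of errors it can produce, and callers must either handle those errors or pass them further. A very rough standalone sketch of that idea follows, using std::variant as a stand-in; the real crimson::errorator is future-based and far richer (extend<>, pass_further, interruption), so this is an illustration only and the names are hypothetical.

    // Illustration of "a value or one of a fixed set of errors" -- not the
    // actual crimson::errorator machinery.
    #include <cerrno>
    #include <string>
    #include <variant>

    enum class err { enoent = ENOENT, invarg = EINVAL, enametoolong = ENAMETOOLONG };

    template <typename T>
    using result = std::variant<T, err>;

    result<std::string> getxattr_sketch(bool object_exists) {
      if (!object_exists) {
        return err::enoent;   // analogous to crimson::ct_error::enoent::make()
      }
      return std::string{"stored value"};
    }
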
diff --git a/src/crimson/osd/pg_interval_interrupt_condition.cc b/src/crimson/osd/pg_interval_interrupt_condition.cc
new file mode 100644
index 000000000..36243b825
--- /dev/null
+++ b/src/crimson/osd/pg_interval_interrupt_condition.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_interval_interrupt_condition.h"
+#include "pg.h"
+
+#include "crimson/common/log.h"
+
+SET_SUBSYS(osd);
+
+namespace crimson::osd {
+
+IOInterruptCondition::IOInterruptCondition(Ref<PG>& pg)
+ : pg(pg), e(pg->get_osdmap_epoch()) {}
+
+IOInterruptCondition::~IOInterruptCondition() {
+  // defined here (not in the header) for the sake of forward declaring PG
+  // (which is a derivative of intrusive_ref_counter<...>)
+}
+
+bool IOInterruptCondition::new_interval_created() {
+ LOG_PREFIX(IOInterruptCondition::new_interval_created);
+ const epoch_t interval_start = pg->get_interval_start_epoch();
+ bool ret = e < interval_start;
+ if (ret) {
+ DEBUGDPP("stored interval e{} < interval_start e{}", *pg, e, interval_start);
+ }
+ return ret;
+}
+
+bool IOInterruptCondition::is_stopping() {
+ LOG_PREFIX(IOInterruptCondition::is_stopping);
+ if (pg->stopping) {
+ DEBUGDPP("pg stopping", *pg);
+ }
+ return pg->stopping;
+}
+
+bool IOInterruptCondition::is_primary() {
+ return pg->is_primary();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_interval_interrupt_condition.h b/src/crimson/osd/pg_interval_interrupt_condition.h
new file mode 100644
index 000000000..a3a0a1edb
--- /dev/null
+++ b/src/crimson/osd/pg_interval_interrupt_condition.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include "include/types.h"
+#include "crimson/common/errorator.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/type_helpers.h"
+
+namespace crimson::osd {
+
+class PG;
+
+class IOInterruptCondition {
+public:
+ IOInterruptCondition(Ref<PG>& pg);
+ ~IOInterruptCondition();
+
+ bool new_interval_created();
+
+ bool is_stopping();
+
+ bool is_primary();
+
+ template <typename Fut>
+ std::optional<Fut> may_interrupt() {
+ if (new_interval_created()) {
+ return seastar::futurize<Fut>::make_exception_future(
+ ::crimson::common::actingset_changed(is_primary()));
+ }
+ if (is_stopping()) {
+ return seastar::futurize<Fut>::make_exception_future(
+ ::crimson::common::system_shutdown_exception());
+ }
+ return std::optional<Fut>();
+ }
+
+ template <typename T>
+ static constexpr bool is_interruption_v =
+ std::is_same_v<T, ::crimson::common::actingset_changed>
+ || std::is_same_v<T, ::crimson::common::system_shutdown_exception>;
+
+ static bool is_interruption(std::exception_ptr& eptr) {
+ return (*eptr.__cxa_exception_type() ==
+ typeid(::crimson::common::actingset_changed) ||
+ *eptr.__cxa_exception_type() ==
+ typeid(::crimson::common::system_shutdown_exception));
+ }
+
+private:
+ Ref<PG> pg;
+ epoch_t e;
+};
+
+} // namespace crimson::osd
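
IOInterruptCondition captures the OSDMap epoch at construction and, before each continuation of an interruptible future runs, checks whether a new interval has started or the OSD is stopping; if so, the remainder of the chain is replaced with an exception. A standalone sketch of that control flow for plain synchronous steps (no seastar); all names here are hypothetical.

    // Sketch: check an interrupt condition before each step of a multi-step
    // operation; on interruption, abandon the remaining steps with an error.
    #include <functional>
    #include <optional>
    #include <stdexcept>
    #include <vector>

    struct InterruptCondition {
      bool stopping = false;
      int stored_epoch = 0;
      int current_epoch = 0;

      // Mirrors may_interrupt() above: an error to raise instead of running
      // the next step, or nullopt to continue.
      std::optional<std::runtime_error> may_interrupt() const {
        if (current_epoch > stored_epoch) {
          return std::runtime_error("acting set changed");
        }
        if (stopping) {
          return std::runtime_error("system shutdown");
        }
        return std::nullopt;
      }
    };

    void run_interruptible(InterruptCondition& cond,
                           const std::vector<std::function<void()>>& steps) {
      for (const auto& step : steps) {
        if (auto err = cond.may_interrupt()) {
          throw *err;           // abandon the remaining steps
        }
        step();
      }
    }
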
diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc
new file mode 100644
index 000000000..193781250
--- /dev/null
+++ b/src/crimson/osd/pg_map.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/pg_map.h"
+
+#include "crimson/osd/pg.h"
+#include "common/Formatter.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::make_pair;
+
+namespace crimson::osd {
+
+PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {}
+PGMap::PGCreationState::~PGCreationState() {}
+
+void PGMap::PGCreationState::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pgid;
+ f->dump_bool("creating", creating);
+}
+
+PGMap::wait_for_pg_ret
+PGMap::wait_for_pg(PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid)
+{
+ if (auto pg = get_pg(pgid)) {
+ return make_pair(
+ wait_for_pg_fut(wait_for_pg_ertr::ready_future_marker{}, pg),
+ true);
+ } else {
+ auto &state = pgs_creating.emplace(pgid, pgid).first->second;
+ return make_pair(
+ wait_for_pg_fut(
+ trigger.maybe_record_blocking(state.promise.get_shared_future(), state)
+ ), state.creating);
+ }
+}
+
+void PGMap::remove_pg(spg_t pgid) {
+ ceph_assert(pgs.erase(pgid) == 1);
+}
+
+Ref<PG> PGMap::get_pg(spg_t pgid)
+{
+ if (auto pg = pgs.find(pgid); pg != pgs.end()) {
+ return pg->second;
+ } else {
+ return nullptr;
+ }
+}
+
+void PGMap::set_creating(spg_t pgid)
+{
+ logger().debug("Creating {}", pgid);
+ ceph_assert(pgs.count(pgid) == 0);
+ auto pg = pgs_creating.find(pgid);
+ ceph_assert(pg != pgs_creating.end());
+ ceph_assert(pg->second.creating == false);
+ pg->second.creating = true;
+}
+
+void PGMap::pg_created(spg_t pgid, Ref<PG> pg)
+{
+ logger().debug("Created {}", pgid);
+ ceph_assert(!pgs.count(pgid));
+ pgs.emplace(pgid, pg);
+
+ auto creating_iter = pgs_creating.find(pgid);
+ ceph_assert(creating_iter != pgs_creating.end());
+ auto promise = std::move(creating_iter->second.promise);
+ pgs_creating.erase(creating_iter);
+ promise.set_value(pg);
+}
+
+void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg)
+{
+ ceph_assert(!pgs.count(pgid));
+ pgs.emplace(pgid, pg);
+}
+
+void PGMap::pg_creation_canceled(spg_t pgid)
+{
+ logger().debug("PGMap::pg_creation_canceled: {}", pgid);
+ ceph_assert(!pgs.count(pgid));
+
+ auto creating_iter = pgs_creating.find(pgid);
+ ceph_assert(creating_iter != pgs_creating.end());
+ auto promise = std::move(creating_iter->second.promise);
+ pgs_creating.erase(creating_iter);
+ promise.set_exception(
+ crimson::ct_error::ecanceled::exception_ptr()
+ );
+}
+
+PGMap::~PGMap() {}
+
+}
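
wait_for_pg / pg_created / pg_creation_canceled above implement a create-or-wait pattern: every waiter for a not-yet-created PG attaches to one shared promise per pgid, and creation (or cancellation) resolves it exactly once. A standalone sketch of the same bookkeeping, using std::shared_future in place of seastar::shared_promise; Registry and its members are illustrative names, not crimson types.

    // Sketch of the create-or-wait pattern used by PGMap above.
    #include <exception>
    #include <future>
    #include <map>
    #include <stdexcept>
    #include <string>

    struct Registry {
      std::map<int, std::string> ready;                    // created entities (cf. pgs)
      std::map<int, std::promise<std::string>> creating;   // pending (cf. pgs_creating)
      std::map<int, std::shared_future<std::string>> waiters;

      // Return the entity if it already exists, otherwise a future resolved
      // by created() or failed by creation_canceled().
      std::shared_future<std::string> wait_for(int id) {
        if (auto it = ready.find(id); it != ready.end()) {
          std::promise<std::string> p;
          p.set_value(it->second);
          return p.get_future().share();
        }
        if (auto it = waiters.find(id); it != waiters.end()) {
          return it->second;
        }
        auto fut = creating[id].get_future().share();
        waiters.emplace(id, fut);
        return fut;
      }

      void created(int id, const std::string& value) {
        ready.emplace(id, value);
        if (auto it = creating.find(id); it != creating.end()) {
          it->second.set_value(value);
          creating.erase(it);
          waiters.erase(id);
        }
      }

      void creation_canceled(int id) {
        if (auto it = creating.find(id); it != creating.end()) {
          it->second.set_exception(
            std::make_exception_ptr(std::runtime_error("creation canceled")));
          creating.erase(it);
          waiters.erase(id);
        }
      }
    };
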
diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h
new file mode 100644
index 000000000..3269de434
--- /dev/null
+++ b/src/crimson/osd/pg_map.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <algorithm>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/types.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/smp_helpers.h"
+#include "crimson/osd/osd_operation.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+class PG;
+
+/**
+ * PGShardMapping
+ *
+ * Maintains a mapping from spg_t to the core containing that PG. Internally, each
+ * core has a local copy of the mapping to enable core-local lookups. Updates
+ * are proxied to core 0 and then propagated back out to all other cores --
+ * see maybe_create_pg.
+ */
+class PGShardMapping : public seastar::peering_sharded_service<PGShardMapping> {
+public:
+ /// Returns mapping if present, NULL_CORE otherwise
+ core_id_t get_pg_mapping(spg_t pgid) {
+ auto iter = pg_to_core.find(pgid);
+ ceph_assert_always(iter == pg_to_core.end() || iter->second != NULL_CORE);
+ return iter == pg_to_core.end() ? NULL_CORE : iter->second;
+ }
+
+ /// Returns mapping for pgid, creates new one if it doesn't already exist
+ seastar::future<core_id_t> maybe_create_pg(
+ spg_t pgid,
+ core_id_t core = NULL_CORE) {
+ auto find_iter = pg_to_core.find(pgid);
+ if (find_iter != pg_to_core.end()) {
+ ceph_assert_always(find_iter->second != NULL_CORE);
+ if (core != NULL_CORE) {
+ ceph_assert_always(find_iter->second == core);
+ }
+ return seastar::make_ready_future<core_id_t>(find_iter->second);
+ } else {
+ return container().invoke_on(0,[pgid, core]
+ (auto &primary_mapping) {
+ auto [insert_iter, inserted] = primary_mapping.pg_to_core.emplace(pgid, core);
+ ceph_assert_always(inserted);
+ ceph_assert_always(primary_mapping.core_to_num_pgs.size() > 0);
+ std::map<core_id_t, unsigned>::iterator core_iter;
+ if (core == NULL_CORE) {
+ core_iter = std::min_element(
+ primary_mapping.core_to_num_pgs.begin(),
+ primary_mapping.core_to_num_pgs.end(),
+ [](const auto &left, const auto &right) {
+ return left.second < right.second;
+ });
+ } else {
+ core_iter = primary_mapping.core_to_num_pgs.find(core);
+ }
+ ceph_assert_always(primary_mapping.core_to_num_pgs.end() != core_iter);
+ insert_iter->second = core_iter->first;
+ core_iter->second++;
+ return primary_mapping.container().invoke_on_others(
+ [pgid = insert_iter->first, core = insert_iter->second]
+ (auto &other_mapping) {
+ ceph_assert_always(core != NULL_CORE);
+ auto [insert_iter, inserted] = other_mapping.pg_to_core.emplace(pgid, core);
+ ceph_assert_always(inserted);
+ });
+ }).then([this, pgid] {
+ auto find_iter = pg_to_core.find(pgid);
+ return seastar::make_ready_future<core_id_t>(find_iter->second);
+ });
+ }
+ }
+
+ /// Remove pgid
+ seastar::future<> remove_pg(spg_t pgid) {
+ return container().invoke_on(0, [pgid](auto &primary_mapping) {
+ auto iter = primary_mapping.pg_to_core.find(pgid);
+ ceph_assert_always(iter != primary_mapping.pg_to_core.end());
+ ceph_assert_always(iter->second != NULL_CORE);
+ auto count_iter = primary_mapping.core_to_num_pgs.find(iter->second);
+ ceph_assert_always(count_iter != primary_mapping.core_to_num_pgs.end());
+ ceph_assert_always(count_iter->second > 0);
+ --(count_iter->second);
+ primary_mapping.pg_to_core.erase(iter);
+ return primary_mapping.container().invoke_on_others(
+ [pgid](auto &other_mapping) {
+ auto iter = other_mapping.pg_to_core.find(pgid);
+ ceph_assert_always(iter != other_mapping.pg_to_core.end());
+ ceph_assert_always(iter->second != NULL_CORE);
+ other_mapping.pg_to_core.erase(iter);
+ });
+ });
+ }
+
+ size_t get_num_pgs() const { return pg_to_core.size(); }
+
+ /// Map to cores in [min_core_mapping, core_mapping_limit)
+ PGShardMapping(core_id_t min_core_mapping, core_id_t core_mapping_limit) {
+ ceph_assert_always(min_core_mapping < core_mapping_limit);
+ for (auto i = min_core_mapping; i != core_mapping_limit; ++i) {
+ core_to_num_pgs.emplace(i, 0);
+ }
+ }
+
+ template <typename F>
+ void for_each_pgid(F &&f) const {
+ for (const auto &i: pg_to_core) {
+ std::invoke(f, i.first);
+ }
+ }
+
+private:
+ std::map<core_id_t, unsigned> core_to_num_pgs;
+ std::map<spg_t, core_id_t> pg_to_core;
+};
+
+/**
+ * PGMap
+ *
+ * Maps spg_t to PG instance within a shard. Handles dealing with waiting
+ * on pg creation.
+ */
+class PGMap {
+ struct PGCreationState : BlockerT<PGCreationState> {
+ static constexpr const char * type_name = "PGCreation";
+
+ void dump_detail(Formatter *f) const final;
+
+ spg_t pgid;
+ seastar::shared_promise<Ref<PG>> promise;
+ bool creating = false;
+ PGCreationState(spg_t pgid);
+
+ PGCreationState(const PGCreationState &) = delete;
+ PGCreationState(PGCreationState &&) = delete;
+ PGCreationState &operator=(const PGCreationState &) = delete;
+ PGCreationState &operator=(PGCreationState &&) = delete;
+
+ ~PGCreationState();
+ };
+
+ std::map<spg_t, PGCreationState> pgs_creating;
+ using pgs_t = std::map<spg_t, Ref<PG>>;
+ pgs_t pgs;
+
+public:
+ using PGCreationBlocker = PGCreationState;
+ using PGCreationBlockingEvent = PGCreationBlocker::BlockingEvent;
+ /**
+ * Get future for pg with a bool indicating whether it's already being
+ * created.
+ */
+ using wait_for_pg_ertr = crimson::errorator<
+ crimson::ct_error::ecanceled>;
+ using wait_for_pg_fut = wait_for_pg_ertr::future<Ref<PG>>;
+ using wait_for_pg_ret = std::pair<wait_for_pg_fut, bool>;
+ wait_for_pg_ret wait_for_pg(PGCreationBlockingEvent::TriggerI&&, spg_t pgid);
+
+ /**
+ * get PG in non-blocking manner
+ */
+ Ref<PG> get_pg(spg_t pgid);
+
+ /**
+ * Set creating
+ */
+ void set_creating(spg_t pgid);
+
+ /**
+ * Set newly created pg
+ */
+ void pg_created(spg_t pgid, Ref<PG> pg);
+
+ /**
+ * Add newly loaded pg
+ */
+ void pg_loaded(spg_t pgid, Ref<PG> pg);
+
+ /**
+ * Cancel pending creation of pgid.
+ */
+ void pg_creation_canceled(spg_t pgid);
+
+ void remove_pg(spg_t pgid);
+
+ pgs_t& get_pgs() { return pgs; }
+ const pgs_t& get_pgs() const { return pgs; }
+ auto get_pg_count() const { return pgs.size(); }
+ PGMap() = default;
+ ~PGMap();
+};
+
+}
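
PGShardMapping::maybe_create_pg above places a new PG on the least-loaded core by taking the std::min_element of core_to_num_pgs. That selection step in isolation, as a small sketch using only the standard library; the names are illustrative.

    // Sketch of the least-loaded core selection in maybe_create_pg above.
    #include <algorithm>
    #include <cassert>
    #include <map>

    using core_id = unsigned;

    core_id pick_least_loaded(std::map<core_id, unsigned>& core_to_num_pgs) {
      assert(!core_to_num_pgs.empty());
      auto it = std::min_element(
        core_to_num_pgs.begin(), core_to_num_pgs.end(),
        [](const auto& lhs, const auto& rhs) { return lhs.second < rhs.second; });
      ++it->second;          // account for the PG we are about to place there
      return it->first;
    }
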
diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc
new file mode 100644
index 000000000..288ee52a0
--- /dev/null
+++ b/src/crimson/osd/pg_meta.cc
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_meta.h"
+
+#include <string_view>
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+using std::string;
+using std::string_view;
+// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
+// easily skip them
+using crimson::os::FuturizedStore;
+
+PGMeta::PGMeta(FuturizedStore::Shard& store, spg_t pgid)
+ : store{store},
+ pgid{pgid}
+{}
+
+namespace {
+ template<typename T>
+ std::optional<T> find_value(const FuturizedStore::Shard::omap_values_t& values,
+ string_view key)
+ {
+ auto found = values.find(key);
+ if (found == values.end()) {
+ return {};
+ }
+ auto p = found->second.cbegin();
+ T value;
+ decode(value, p);
+ return std::make_optional(std::move(value));
+ }
+}
+
+seastar::future<epoch_t> PGMeta::get_epoch()
+{
+ return store.open_collection(coll_t{pgid}).then([this](auto ch) {
+ return store.omap_get_values(ch,
+ pgid.make_pgmeta_oid(),
+ {string{infover_key},
+ string{epoch_key}}).safe_then(
+ [](auto&& values) {
+ {
+ // sanity check
+ auto infover = find_value<__u8>(values, infover_key);
+ assert(infover);
+ if (*infover < 10) {
+ throw std::runtime_error("incompatible pg meta");
+ }
+ }
+ {
+ auto epoch = find_value<epoch_t>(values, epoch_key);
+ assert(epoch);
+ return seastar::make_ready_future<epoch_t>(*epoch);
+ }
+ },
+ FuturizedStore::Shard::read_errorator::assert_all{
+ "PGMeta::get_epoch: unable to read pgmeta"
+ });
+ });
+}
+
+seastar::future<std::tuple<pg_info_t, PastIntervals>> PGMeta::load()
+{
+ return store.open_collection(coll_t{pgid}).then([this](auto ch) {
+ return store.omap_get_values(ch,
+ pgid.make_pgmeta_oid(),
+ {string{infover_key},
+ string{info_key},
+ string{biginfo_key},
+ string{fastinfo_key}});
+ }).safe_then([](auto&& values) {
+ {
+ // sanity check
+ auto infover = find_value<__u8>(values, infover_key);
+ assert(infover);
+ if (infover < 10) {
+ throw std::runtime_error("incompatible pg meta");
+ }
+ }
+ pg_info_t info;
+ {
+ auto found = find_value<pg_info_t>(values, info_key);
+ assert(found);
+ info = *std::move(found);
+ }
+ PastIntervals past_intervals;
+ {
+ using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>;
+ auto big_info = find_value<biginfo_t>(values, biginfo_key);
+ assert(big_info);
+ past_intervals = std::move(big_info->first);
+ info.purged_snaps = std::move(big_info->second);
+ }
+ {
+ auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key);
+ if (fast_info) {
+ fast_info->try_apply_to(&info);
+ }
+ }
+ return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>(
+ std::make_tuple(std::move(info), std::move(past_intervals)));
+ },
+ FuturizedStore::Shard::read_errorator::assert_all{
+ "PGMeta::load: unable to read pgmeta"
+ });
+}
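
find_value() above is the generic "look up a fixed pgmeta omap key and decode it into its type, or return nullopt" step that both get_epoch() and load() build on. A standalone analogue where decoding is just a lexical cast from a string (the real code decodes ceph bufferlists), with illustrative names:

    #include <map>
    #include <optional>
    #include <sstream>
    #include <string>

    // Look a key up in a key/value snapshot and convert it into T, or return
    // nullopt if the key is absent.
    template <typename T>
    std::optional<T> find_value(const std::map<std::string, std::string>& values,
                                const std::string& key) {
      auto found = values.find(key);
      if (found == values.end()) {
        return std::nullopt;
      }
      T value{};
      std::istringstream in{found->second};
      in >> value;
      return value;
    }
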
diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h
new file mode 100644
index 000000000..21c2bb373
--- /dev/null
+++ b/src/crimson/osd/pg_meta.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <tuple>
+#include <seastar/core/future.hh>
+#include "osd/osd_types.h"
+#include "crimson/os/futurized_store.h"
+
+/// PG related metadata
+class PGMeta
+{
+ crimson::os::FuturizedStore::Shard& store;
+ const spg_t pgid;
+public:
+ PGMeta(crimson::os::FuturizedStore::Shard& store, spg_t pgid);
+ seastar::future<epoch_t> get_epoch();
+ seastar::future<std::tuple<pg_info_t, PastIntervals>> load();
+};
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
new file mode 100644
index 000000000..09b45779e
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.cc
@@ -0,0 +1,569 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <fmt/ranges.h>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/backfill_facades.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_recovery.h"
+
+#include "osd/osd_types.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::map;
+using std::set;
+
+void PGRecovery::start_pglogbased_recovery()
+{
+ using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
+ (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ float(0.001));
+}
+
+PGRecovery::interruptible_future<bool>
+PGRecovery::start_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start)
+{
+ assert(pg->is_primary());
+ assert(pg->is_peered());
+ assert(pg->is_recovering());
+  // in ceph-osd the do_recovery() path handles both pg log-based recovery
+  // and backfill, albeit they are separated at the PeeringState layer. In
+  // crimson-osd backfill has been split out of it, so do_recovery() is
+  // solely for pg log-based recovery. At the time of writing there are
+  // plans to move it into the FSM and fix the naming as well.
+ assert(!pg->is_backfilling());
+ assert(!pg->get_peering_state().is_deleting());
+
+ std::vector<interruptible_future<>> started;
+ started.reserve(max_to_start);
+ max_to_start -= start_primary_recovery_ops(trigger, max_to_start, &started);
+ if (max_to_start > 0) {
+ max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started);
+ }
+ using interruptor =
+ crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
+ return interruptor::parallel_for_each(started,
+ [] (auto&& ifut) {
+ return std::move(ifut);
+ }).then_interruptible([this] {
+ bool done = !pg->get_peering_state().needs_recovery();
+ if (done) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ if (!pg->get_peering_state().needs_backfill()) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ } else {
+ logger().debug("start_recovery_ops: RequestBackfill for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ }
+ }
+ return seastar::make_ready_future<bool>(!done);
+ });
+}
+
+size_t PGRecovery::start_primary_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start,
+ std::vector<PGRecovery::interruptible_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+
+ if (!pg->get_peering_state().have_missing()) {
+ pg->get_peering_state().local_recovery_complete();
+ return 0;
+ }
+
+ const auto &missing = pg->get_peering_state().get_pg_log().get_missing();
+
+ logger().info("{} recovering {} in pg {}, missing {}", __func__,
+ pg->get_recovery_backend()->total_recovering(),
+ *static_cast<crimson::osd::PG*>(pg),
+ missing);
+
+ unsigned started = 0;
+ int skipped = 0;
+
+ map<version_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested);
+ while (started < max_to_start && p != missing.get_rmissing().end()) {
+ // TODO: chain futures here to enable yielding to scheduler?
+ hobject_t soid;
+ version_t v = p->first;
+
+ auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second);
+ if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) {
+ // look at log!
+ pg_log_entry_t *latest = it_objects->second;
+ assert(latest->is_update() || latest->is_delete());
+ soid = latest->soid;
+ } else {
+ soid = p->second;
+ }
+ const pg_missing_item& item = missing.get_items().find(p->second)->second;
+ ++p;
+
+ hobject_t head = soid.get_head();
+
+ logger().info(
+ "{} {} item.need {} {} {} {} {}",
+ __func__,
+ soid,
+ item.need,
+ missing.is_missing(soid) ? " (missing)":"",
+ missing.is_missing(head) ? " (missing head)":"",
+ pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"",
+ pg->get_recovery_backend()->is_recovering(head) ? " (recovering head)":"");
+
+ // TODO: handle lost/unfound
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->emplace_back(recovery_waiter.wait_for_recovered(trigger));
+ ++started;
+ } else if (pg->get_recovery_backend()->is_recovering(head)) {
+ ++skipped;
+ } else {
+ out->emplace_back(recover_missing(trigger, soid, item.need));
+ ++started;
+ }
+
+ if (!skipped)
+ pg->get_peering_state().set_last_requested(v);
+ }
+
+ logger().info("{} started {} skipped {}", __func__, started, skipped);
+
+ return started;
+}
+
+size_t PGRecovery::start_replica_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start,
+ std::vector<PGRecovery::interruptible_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+ uint64_t started = 0;
+
+ assert(!pg->get_peering_state().get_acting_recovery_backfill().empty());
+
+ auto recovery_order = get_replica_recovery_order();
+ for (auto &peer : recovery_order) {
+ assert(peer != pg->get_peering_state().get_primary());
+ const auto& pm = pg->get_peering_state().get_peer_missing(peer);
+
+ logger().debug("{}: peer osd.{} missing {} objects", __func__,
+ peer, pm.num_missing());
+ logger().trace("{}: peer osd.{} missing {}", __func__,
+ peer, pm.get_items());
+
+ // recover oldest first
+ for (auto p = pm.get_rmissing().begin();
+ p != pm.get_rmissing().end() && started < max_to_start;
+ ++p) {
+ const auto &soid = p->second;
+
+ if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) {
+ logger().debug("{}: object {} still unfound", __func__, soid);
+ continue;
+ }
+
+ const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer);
+ if (soid > pi.last_backfill) {
+ if (!pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().error(
+ "{}: object {} in missing set for backfill (last_backfill {})"
+ " but not in recovering",
+ __func__,
+ soid,
+ pi.last_backfill);
+ ceph_abort();
+ }
+ continue;
+ }
+
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().debug("{}: already recovering object {}", __func__, soid);
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->emplace_back(recovery_waiter.wait_for_recovered(trigger));
+ started++;
+ continue;
+ }
+
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ logger().debug("{}: soid {} is a delete, removing", __func__, soid);
+ map<hobject_t,pg_missing_item>::const_iterator r =
+ pm.get_items().find(soid);
+ started++;
+ out->emplace_back(
+ prep_object_replica_deletes(trigger, soid, r->second.need));
+ continue;
+ }
+
+ if (soid.is_snap() &&
+ pg->get_peering_state().get_pg_log().get_missing().is_missing(
+ soid.get_head())) {
+ logger().debug("{}: head {} still missing on primary", __func__,
+ soid.get_head());
+ continue;
+ }
+
+ if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) {
+ logger().debug("{}: soid {} still missing on primary", __func__, soid);
+ continue;
+ }
+
+ logger().debug("{}: recover_object_replicas({})", __func__,soid);
+ map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find(
+ soid);
+ started++;
+ out->emplace_back(
+ prep_object_replica_pushes(trigger, soid, r->second.need));
+ }
+ }
+
+ return started;
+}
+
+PGRecovery::interruptible_future<>
+PGRecovery::recover_missing(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t &soid, eversion_t need)
+{
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_delete(soid, need));
+ } else {
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_object(soid, need)
+ .handle_exception_interruptible(
+ [=, this, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+ }
+}
+
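+// Pushes a delete for soid to the replicas; once the backend acknowledges,
+// the object is accounted as recovered via on_global_recover().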
+RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->push_delete(soid, need).then_interruptible(
+ [=, this] {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ })
+ );
+}
+
+RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_object(soid, need)
+ .handle_exception_interruptible(
+ [=, this, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+}
+
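+// Called by the recovery backend once an object (or its deletion) has been
+// applied locally: records the recovery in the peering state and, on the
+// primary, refreshes the object context and marks the object readable again.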
+void PGRecovery::on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ const bool is_delete,
+ ceph::os::Transaction& t)
+{
+ if (const auto &log = pg->get_peering_state().get_pg_log();
+ !is_delete &&
+ log.get_missing().is_missing(recovery_info.soid) &&
+ log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
+ assert(pg->is_primary());
+ if (const auto* latest = log.get_log().objects.find(recovery_info.soid)->second;
+ latest->op == pg_log_entry_t::LOST_REVERT) {
+ ceph_abort("mark_unfound_lost (LOST_REVERT) is not implemented yet");
+ }
+ }
+ pg->get_peering_state().recover_got(soid,
+ recovery_info.version, is_delete, t);
+
+ if (pg->is_primary()) {
+ if (!is_delete) {
+ auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend?
+ obc->obs.exists = true;
+ obc->obs.oi = recovery_info.oi;
+ }
+ if (!pg->is_unreadable_object(soid)) {
+ pg->get_recovery_backend()->get_recovering(soid).set_readable();
+ }
+ pg->publish_stats_to_osd();
+ }
+}
+
+void PGRecovery::on_global_recover (
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ const bool is_delete)
+{
+ logger().info("{} {}", __func__, soid);
+ pg->get_peering_state().object_recovered(soid, stat_diff);
+ pg->publish_stats_to_osd();
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ if (!is_delete)
+ recovery_waiter.obc->drop_recovery_read();
+ recovery_waiter.set_recovered();
+ pg->get_recovery_backend()->remove_recovering(soid);
+}
+
+void PGRecovery::on_failed_recover(
+ const set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v)
+{
+ for (auto pg_shard : from) {
+ if (pg_shard != pg->get_pg_whoami()) {
+ pg->get_peering_state().force_object_missing(pg_shard, soid, v);
+ }
+ }
+}
+
+void PGRecovery::on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info)
+{
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{}: {}, {} on {}", __func__, oid,
+ recovery_info.version, peer);
+ pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version);
+}
+
+void PGRecovery::_committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete)
+{
+ if (!pg->has_reset_since(epoch)) {
+ pg->get_peering_state().recovery_committed_to(last_complete);
+ } else {
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{} pg has changed, not touching last_complete_ondisk",
+ __func__);
+ }
+}
+
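+// Schedules a BackfillRecovery operation carrying evt, driving the backfill
+// state machine asynchronously.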
+template <class EventT>
+void PGRecovery::start_backfill_recovery(const EventT& evt)
+{
+ using BackfillRecovery = crimson::osd::BackfillRecovery;
+ std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ evt);
+}
+
+void PGRecovery::request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end)
+{
+ logger().debug("{}: target.osd={}", __func__, target.osd);
+ auto msg = crimson::make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_GET_DIGEST,
+ pg->get_pg_whoami(),
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, target.shard),
+ begin,
+ end);
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(msg),
+ pg->get_osdmap_epoch());
+}
+
+void PGRecovery::request_primary_scan(
+ const hobject_t& begin)
+{
+ logger().debug("{}", __func__);
+ using crimson::common::local_conf;
+ std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+ begin,
+ local_conf()->osd_backfill_scan_min,
+ local_conf()->osd_backfill_scan_max
+ ).then_interruptible([this] (BackfillInterval bi) {
+ logger().debug("request_primary_scan:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) });
+ });
+}
+
+void PGRecovery::enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ logger().debug("{}: obj={} v={}",
+ __func__, obj, v);
+ pg->get_recovery_backend()->add_recovering(obj);
+ std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\
+ handle_exception_interruptible([] (auto) {
+ ceph_abort_msg("got exception on backfill's push");
+ return seastar::make_ready_future<>();
+ }).then_interruptible([this, obj] {
+ logger().debug("enqueue_push:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj)));
+ });
+}
+
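+// Accumulates one MOSDPGBackfillRemove per backfill target; the batched
+// messages are sent out by maybe_flush().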
+void PGRecovery::enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ // allocate a pair if target is seen for the first time
+ auto& req = backfill_drop_requests[target];
+ if (!req) {
+ req = crimson::make_message<MOSDPGBackfillRemove>(
+ spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch());
+ }
+ req->ls.emplace_back(obj, v);
+}
+
+void PGRecovery::maybe_flush()
+{
+ for (auto& [target, req] : backfill_drop_requests) {
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(req),
+ pg->get_osdmap_epoch());
+ }
+ backfill_drop_requests.clear();
+}
+
+void PGRecovery::update_peers_last_backfill(
+ const hobject_t& new_last_backfill)
+{
+ logger().debug("{}: new_last_backfill={}",
+ __func__, new_last_backfill);
+ // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
+ // all the backfill targets. Otherwise, we will move last_backfill up on
+ // those targets that need it and send OP_BACKFILL_PROGRESS to them.
+ for (const auto& bt : pg->get_peering_state().get_backfill_targets()) {
+ if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt);
+ new_last_backfill > pinfo.last_backfill) {
+ pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill);
+ auto m = crimson::make_message<MOSDPGBackfill>(
+ pinfo.last_backfill.is_max() ? MOSDPGBackfill::OP_BACKFILL_FINISH
+ : MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ // TODO: if pinfo.last_backfill.is_max(), then
+ // start_recovery_op(hobject_t::get_max());
+ m->last_backfill = pinfo.last_backfill;
+ m->stats = pinfo.stats;
+ std::ignore = pg->get_shard_services().send_to_osd(
+ bt.osd, std::move(m), pg->get_osdmap_epoch());
+ logger().info("{}: peer {} num_objects now {} / {}",
+ __func__,
+ bt,
+ pinfo.stats.stats.sum.num_objects,
+ pg->get_info().stats.stats.sum.num_objects);
+ }
+ }
+}
+
+bool PGRecovery::budget_available() const
+{
+ // TODO: the limits!
+ return true;
+}
+
+void PGRecovery::backfilled()
+{
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::Backfilled{});
+}
+
+void PGRecovery::dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state->process_event(evt);
+}
+
+void PGRecovery::on_backfill_reserved()
+{
+ logger().debug("{}", __func__);
+ // PIMPL and dependency injection for the sake of unit-testability.
+ // Performance is not a concern here.
+ using BackfillState = crimson::osd::BackfillState;
+ backfill_state = std::make_unique<BackfillState>(
+ *this,
+ std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
+ std::make_unique<crimson::osd::PGFacade>(
+ *static_cast<crimson::osd::PG*>(pg)));
+ // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
+ // will be set after on_backfill_reserved() returns.
+ // Backfill needs to take this into consideration when scheduling
+ // events -- they must be mutually exclusive with PeeringEvent
+ // instances. Otherwise the execution might begin without having
+ // the state updated.
+ ceph_assert(!pg->get_peering_state().is_backfilling());
+ start_backfill_recovery(BackfillState::Triggered{});
+}
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
new file mode 100644
index 000000000..719d0ad2d
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg_recovery_listener.h"
+#include "crimson/osd/scheduler/scheduler.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/recovery_backend.h"
+
+#include "osd/object_state.h"
+
+namespace crimson::osd {
+class UrgentRecovery;
+}
+
+class MOSDPGBackfillRemove;
+class PGBackend;
+
+class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
+public:
+ template <typename T = void>
+ using interruptible_future = RecoveryBackend::interruptible_future<T>;
+ PGRecovery(PGRecoveryListener* pg) : pg(pg) {}
+ virtual ~PGRecovery() {}
+ void start_pglogbased_recovery();
+
+ interruptible_future<bool> start_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ size_t max_to_start);
+ void on_backfill_reserved();
+ void dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt);
+
+ seastar::future<> stop() { return seastar::now(); }
+private:
+ PGRecoveryListener* pg;
+ size_t start_primary_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ size_t max_to_start,
+ std::vector<interruptible_future<>> *out);
+ size_t start_replica_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ size_t max_to_start,
+ std::vector<interruptible_future<>> *out);
+
+ std::vector<pg_shard_t> get_replica_recovery_order() const {
+ return pg->get_replica_recovery_order();
+ }
+ RecoveryBackend::interruptible_future<> recover_missing(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ const hobject_t &soid, eversion_t need);
+ RecoveryBackend::interruptible_future<> prep_object_replica_deletes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need);
+ RecoveryBackend::interruptible_future<> prep_object_replica_pushes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need);
+
+ void on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ bool is_delete,
+ ceph::os::Transaction& t);
+ void on_global_recover (
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ bool is_delete);
+ void on_failed_recover(
+ const std::set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v);
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info);
+ void _committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete);
+ friend class ReplicatedRecoveryBackend;
+ friend class crimson::osd::UrgentRecovery;
+
+ // backfill begin
+ std::unique_ptr<crimson::osd::BackfillState> backfill_state;
+ std::map<pg_shard_t,
+ MURef<MOSDPGBackfillRemove>> backfill_drop_requests;
+
+ template <class EventT>
+ void start_backfill_recovery(
+ const EventT& evt);
+ void request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end) final;
+ void request_primary_scan(
+ const hobject_t& begin) final;
+ void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v) final;
+ void enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v) final;
+ void maybe_flush() final;
+ void update_peers_last_backfill(
+ const hobject_t& new_last_backfill) final;
+ bool budget_available() const final;
+ void backfilled() final;
+ friend crimson::osd::BackfillState::PGFacade;
+ friend crimson::osd::PG;
+ // backfill end
+};
diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h
new file mode 100644
index 000000000..c922b9956
--- /dev/null
+++ b/src/crimson/osd/pg_recovery_listener.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "common/hobject.h"
+#include "include/types.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+ class ShardServices;
+};
+
+class RecoveryBackend;
+class PGRecovery;
+
+class PGRecoveryListener {
+public:
+ virtual crimson::osd::ShardServices& get_shard_services() = 0;
+ virtual PGRecovery* get_recovery_handler() = 0;
+ virtual epoch_t get_osdmap_epoch() const = 0;
+ virtual bool is_primary() const = 0;
+ virtual bool is_peered() const = 0;
+ virtual bool is_recovering() const = 0;
+ virtual bool is_backfilling() const = 0;
+ virtual PeeringState& get_peering_state() = 0;
+ virtual const pg_shard_t& get_pg_whoami() const = 0;
+ virtual const spg_t& get_pgid() const = 0;
+ virtual RecoveryBackend* get_recovery_backend() = 0;
+ virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0;
+ virtual bool has_reset_since(epoch_t) const = 0;
+ virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0;
+ virtual epoch_t get_last_peering_reset() const = 0;
+ virtual const pg_info_t& get_info() const = 0;
+ virtual seastar::future<> stop() = 0;
+ virtual void publish_stats_to_osd() = 0;
+};
diff --git a/src/crimson/osd/pg_shard_manager.cc b/src/crimson/osd/pg_shard_manager.cc
new file mode 100644
index 000000000..6061c856b
--- /dev/null
+++ b/src/crimson/osd/pg_shard_manager.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/pg_shard_manager.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+seastar::future<> PGShardManager::load_pgs(crimson::os::FuturizedStore& store)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.list_collections(
+ ).then([this](auto colls_cores) {
+ return seastar::parallel_for_each(
+ colls_cores,
+ [this](auto coll_core) {
+ auto[coll, shard_core] = coll_core;
+ spg_t pgid;
+ if (coll.is_pg(&pgid)) {
+ return get_pg_to_shard_mapping().maybe_create_pg(
+ pgid, shard_core
+ ).then([this, pgid] (auto core) {
+ return this->template with_remote_shard_state(
+ core,
+ [pgid](
+ PerShardState &per_shard_state,
+ ShardServices &shard_services) {
+ return shard_services.load_pg(
+ pgid
+ ).then([pgid, &per_shard_state](auto &&pg) {
+ logger().info("load_pgs: loaded {}", pgid);
+ per_shard_state.pg_map.pg_loaded(pgid, std::move(pg));
+ return seastar::now();
+ });
+ });
+ });
+ } else if (coll.is_temp(&pgid)) {
+ logger().warn(
+ "found temp collection on crimson osd, should be impossible: {}",
+ coll);
+ ceph_assert(0 == "temp collection on crimson osd, should be impossible");
+ return seastar::now();
+ } else {
+ logger().warn("ignoring unrecognized collection: {}", coll);
+ return seastar::now();
+ }
+ });
+ });
+}
+
+seastar::future<> PGShardManager::stop_pgs()
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.invoke_on_all([](auto &local_service) {
+ return local_service.local_state.stop_pgs();
+ });
+}
+
+seastar::future<std::map<pg_t, pg_stat_t>>
+PGShardManager::get_pg_stats() const
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.map_reduce0(
+ [](auto &local) {
+ return local.local_state.get_pg_stats();
+ },
+ std::map<pg_t, pg_stat_t>(),
+ [](auto &&left, auto &&right) {
+ left.merge(std::move(right));
+ return std::move(left);
+ });
+}
+
+seastar::future<> PGShardManager::broadcast_map_to_pgs(epoch_t epoch)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.invoke_on_all([epoch](auto &local_service) {
+ return local_service.local_state.broadcast_map_to_pgs(
+ local_service, epoch
+ );
+ }).then([this, epoch] {
+ logger().debug("PGShardManager::broadcast_map_to_pgs "
+ "broadcasted up to {}",
+ epoch);
+ return shard_services.invoke_on_all([epoch](auto &local_service) {
+ local_service.local_state.osdmap_gate.got_map(epoch);
+ return seastar::now();
+ });
+ });
+}
+
+seastar::future<> PGShardManager::set_up_epoch(epoch_t e) {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.invoke_on_all(
+ seastar::smp_submit_to_options{},
+ [e](auto &local_service) {
+ local_service.local_state.set_up_epoch(e);
+ return seastar::now();
+ });
+}
+
+}
diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h
new file mode 100644
index 000000000..2f3a3015d
--- /dev/null
+++ b/src/crimson/osd/pg_shard_manager.h
@@ -0,0 +1,390 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/sharded.hh>
+
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/pg_map.h"
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+/**
+ * PGShardManager
+ *
+ * Manages all state required to partition PGs over seastar reactors
+ * as well as state required to route messages to pgs. Mediates access to
+ * shared resources required by PGs (objectstore, messenger, monclient,
+ * etc)
+ */
+class PGShardManager {
+ seastar::sharded<OSDSingletonState> &osd_singleton_state;
+ seastar::sharded<ShardServices> &shard_services;
+ seastar::sharded<PGShardMapping> &pg_to_shard_mapping;
+
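+// Forwarding helpers: generate thin wrappers that delegate a call to the
+// named target; #undef'ed at the end of the class definition.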
+#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) const { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD_TO_OSD_SINGLETON(METHOD) \
+ FORWARD(METHOD, METHOD, get_osd_singleton_state())
+
+public:
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+ PGShardManager(
+ seastar::sharded<OSDSingletonState> &osd_singleton_state,
+ seastar::sharded<ShardServices> &shard_services,
+ seastar::sharded<PGShardMapping> &pg_to_shard_mapping)
+ : osd_singleton_state(osd_singleton_state),
+ shard_services(shard_services),
+ pg_to_shard_mapping(pg_to_shard_mapping) {}
+
+ auto &get_osd_singleton_state() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return osd_singleton_state.local();
+ }
+ auto &get_osd_singleton_state() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return osd_singleton_state.local();
+ }
+ auto &get_shard_services() {
+ return shard_services.local();
+ }
+ auto &get_shard_services() const {
+ return shard_services.local();
+ }
+ auto &get_local_state() { return get_shard_services().local_state; }
+ auto &get_local_state() const { return get_shard_services().local_state; }
+ auto &get_pg_to_shard_mapping() { return pg_to_shard_mapping.local(); }
+ auto &get_pg_to_shard_mapping() const { return pg_to_shard_mapping.local(); }
+
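+ // Installs the new osdmap on the OSD singleton state and then distributes a
+ // per-core foreign_ptr copy of it to every shard's local state.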
+ seastar::future<> update_map(local_cached_map_t &&map) {
+ get_osd_singleton_state().update_map(
+ make_local_shared_foreign(local_cached_map_t(map))
+ );
+ /* We need each core to get its own foreign_ptr<local_cached_map_t>.
+ * foreign_ptr can't be cheaply copied, so we make one for each core
+ * up front. */
+ return seastar::do_with(
+ std::vector<seastar::foreign_ptr<local_cached_map_t>>(),
+ [this, map](auto &fmaps) {
+ fmaps.resize(seastar::smp::count);
+ for (auto &i: fmaps) {
+ i = seastar::foreign_ptr(map);
+ }
+ return shard_services.invoke_on_all(
+ [&fmaps](auto &local) mutable {
+ local.local_state.update_map(
+ make_local_shared_foreign(
+ std::move(fmaps[seastar::this_shard_id()])
+ ));
+ });
+ });
+ }
+
+ seastar::future<> stop_registries() {
+ return shard_services.invoke_on_all([](auto &local) {
+ return local.local_state.stop_registry();
+ });
+ }
+
+ FORWARD_TO_OSD_SINGLETON(send_pg_created)
+
+ // osd state forwards
+ FORWARD(is_active, is_active, get_shard_services().local_state.osd_state)
+ FORWARD(is_preboot, is_preboot, get_shard_services().local_state.osd_state)
+ FORWARD(is_booting, is_booting, get_shard_services().local_state.osd_state)
+ FORWARD(is_stopping, is_stopping, get_shard_services().local_state.osd_state)
+ FORWARD(is_prestop, is_prestop, get_shard_services().local_state.osd_state)
+ FORWARD(is_initializing, is_initializing, get_shard_services().local_state.osd_state)
+ FORWARD(set_prestop, set_prestop, get_shard_services().local_state.osd_state)
+ FORWARD(set_preboot, set_preboot, get_shard_services().local_state.osd_state)
+ FORWARD(set_booting, set_booting, get_shard_services().local_state.osd_state)
+ FORWARD(set_stopping, set_stopping, get_shard_services().local_state.osd_state)
+ FORWARD(set_active, set_active, get_shard_services().local_state.osd_state)
+ FORWARD(when_active, when_active, get_shard_services().local_state.osd_state)
+ FORWARD_CONST(get_osd_state_string, to_string, get_shard_services().local_state.osd_state)
+
+ FORWARD(got_map, got_map, get_shard_services().local_state.osdmap_gate)
+ FORWARD(wait_for_map, wait_for_map, get_shard_services().local_state.osdmap_gate)
+
+ // Metacoll
+ FORWARD_TO_OSD_SINGLETON(init_meta_coll)
+ FORWARD_TO_OSD_SINGLETON(get_meta_coll)
+
+ FORWARD_TO_OSD_SINGLETON(set_superblock)
+
+ // Core OSDMap methods
+ FORWARD_TO_OSD_SINGLETON(get_local_map)
+ FORWARD_TO_OSD_SINGLETON(load_map_bl)
+ FORWARD_TO_OSD_SINGLETON(load_map_bls)
+ FORWARD_TO_OSD_SINGLETON(store_maps)
+
+ seastar::future<> set_up_epoch(epoch_t e);
+
+ template <typename F>
+ auto with_remote_shard_state(core_id_t core, F &&f) {
+ return shard_services.invoke_on(
+ core, [f=std::move(f)](auto &target_shard_services) mutable {
+ return std::invoke(
+ std::move(f), target_shard_services.local_state,
+ target_shard_services);
+ });
+ }
+
+ template <typename T, typename F>
+ auto with_remote_shard_state_and_op(
+ core_id_t core,
+ typename T::IRef &&op,
+ F &&f) {
+ if (seastar::this_shard_id() == core) {
+ auto &target_shard_services = shard_services.local();
+ return std::invoke(
+ std::move(f),
+ target_shard_services.local_state,
+ target_shard_services,
+ std::move(op));
+ }
+ return op->prepare_remote_submission(
+ ).then([op=std::move(op), f=std::move(f), this, core
+ ](auto f_conn) mutable {
+ return shard_services.invoke_on(
+ core,
+ [f=std::move(f), op=std::move(op), f_conn=std::move(f_conn)
+ ](auto &target_shard_services) mutable {
+ op->finish_remote_submission(std::move(f_conn));
+ return std::invoke(
+ std::move(f),
+ target_shard_services.local_state,
+ target_shard_services,
+ std::move(op));
+ });
+ });
+ }
+
+ /// Runs opref on the appropriate core, creating the pg as necessary.
+ template <typename T>
+ seastar::future<> run_with_pg_maybe_create(
+ typename T::IRef op
+ ) {
+ ceph_assert(op->use_count() == 1);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ static_assert(T::can_create());
+ logger.debug("{}: can_create", *op);
+
+ get_local_state().registry.remove_from_registry(*op);
+ return get_pg_to_shard_mapping().maybe_create_pg(
+ op->get_pgid()
+ ).then([this, op = std::move(op)](auto core) mutable {
+ return this->template with_remote_shard_state_and_op<T>(
+ core, std::move(op),
+ [](PerShardState &per_shard_state,
+ ShardServices &shard_services,
+ typename T::IRef op) {
+ per_shard_state.registry.add_to_registry(*op);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ auto &opref = *op;
+ return opref.template with_blocking_event<
+ PGMap::PGCreationBlockingEvent
+ >([&shard_services, &opref](
+ auto &&trigger) {
+ return shard_services.get_or_create_pg(
+ std::move(trigger),
+ opref.get_pgid(),
+ std::move(opref.get_create_info())
+ );
+ }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) {
+ logger.debug("{}: have_pg", opref);
+ return opref.with_pg(shard_services, pgref);
+ }).handle_error(
+ crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
+ logger.debug("{}: pg creation canceled, dropping", opref);
+ return seastar::now();
+ })
+ ).then([op=std::move(op)] {});
+ });
+ });
+ }
+
+ /// Runs opref on the appropriate core, waiting for pg as necessary
+ template <typename T>
+ seastar::future<> run_with_pg_maybe_wait(
+ typename T::IRef op
+ ) {
+ ceph_assert(op->use_count() == 1);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ static_assert(!T::can_create());
+ logger.debug("{}: !can_create", *op);
+
+ get_local_state().registry.remove_from_registry(*op);
+ return get_pg_to_shard_mapping().maybe_create_pg(
+ op->get_pgid()
+ ).then([this, op = std::move(op)](auto core) mutable {
+ return this->template with_remote_shard_state_and_op<T>(
+ core, std::move(op),
+ [](PerShardState &per_shard_state,
+ ShardServices &shard_services,
+ typename T::IRef op) {
+ per_shard_state.registry.add_to_registry(*op);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ auto &opref = *op;
+ return opref.template with_blocking_event<
+ PGMap::PGCreationBlockingEvent
+ >([&shard_services, &opref](
+ auto &&trigger) {
+ return shard_services.wait_for_pg(
+ std::move(trigger), opref.get_pgid());
+ }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) {
+ logger.debug("{}: have_pg", opref);
+ return opref.with_pg(shard_services, pgref);
+ }).handle_error(
+ crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
+ logger.debug("{}: pg creation canceled, dropping", opref);
+ return seastar::now();
+ })
+ ).then([op=std::move(op)] {});
+ });
+ });
+ }
+
+ seastar::future<> load_pgs(crimson::os::FuturizedStore& store);
+ seastar::future<> stop_pgs();
+
+ seastar::future<std::map<pg_t, pg_stat_t>> get_pg_stats() const;
+
+ /**
+ * invoke_on_each_shard_seq
+ *
+ * Invokes the given function on each shard's ShardServices sequentially.
+ */
+ template <typename F, typename... Args>
+ seastar::future<> invoke_on_each_shard_seq(
+ F &&f) const {
+ return sharded_map_seq(
+ shard_services,
+ [f=std::forward<F>(f)](const ShardServices &shard_services) mutable {
+ return std::invoke(
+ f,
+ shard_services);
+ });
+ }
+
+ /**
+ * for_each_pg
+ *
+ * Invokes f on each pg sequentially. Caller may rely on f not being
+ * invoked concurrently on multiple cores.
+ */
+ template <typename F>
+ seastar::future<> for_each_pg(F &&f) const {
+ return invoke_on_each_shard_seq(
+ [f=std::move(f)](const auto &local_service) mutable {
+ for (auto &pg: local_service.local_state.pg_map.get_pgs()) {
+ std::apply(f, pg);
+ }
+ return seastar::now();
+ });
+ }
+
+ /**
+ * for_each_pgid
+ *
+ * Synchronously invokes f on each pgid.
+ */
+ template <typename F>
+ void for_each_pgid(F &&f) const {
+ return get_pg_to_shard_mapping().for_each_pgid(
+ std::forward<F>(f));
+ }
+
+ auto get_num_pgs() const {
+ return get_pg_to_shard_mapping().get_num_pgs();
+ }
+
+ seastar::future<> broadcast_map_to_pgs(epoch_t epoch);
+
+ template <typename F>
+ auto with_pg(spg_t pgid, F &&f) {
+ core_id_t core = get_pg_to_shard_mapping().get_pg_mapping(pgid);
+ return with_remote_shard_state(
+ core,
+ [pgid, f=std::move(f)](auto &local_state, auto &local_service) mutable {
+ return std::invoke(
+ std::move(f),
+ local_state.pg_map.get_pg(pgid));
+ });
+ }
+
+ template <typename T, typename... Args>
+ auto start_pg_operation(Args&&... args) {
+ auto op = get_local_state().registry.create_operation<T>(
+ std::forward<Args>(args)...);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ logger.debug("{}: starting {}", *op, __func__);
+
+ auto &opref = *op;
+ auto id = op->get_id();
+ if constexpr (T::is_trackable) {
+ op->template track_event<typename T::StartEvent>();
+ }
+ auto fut = opref.template enter_stage<>(
+ opref.get_connection_pipeline().await_active
+ ).then([this, &opref, &logger] {
+ logger.debug("{}: start_pg_operation in await_active stage", opref);
+ return get_shard_services().local_state.osd_state.when_active();
+ }).then([&logger, &opref] {
+ logger.debug("{}: start_pg_operation active, entering await_map", opref);
+ return opref.template enter_stage<>(
+ opref.get_connection_pipeline().await_map);
+ }).then([this, &logger, &opref] {
+ logger.debug("{}: start_pg_operation await_map stage", opref);
+ using OSDMapBlockingEvent =
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent;
+ return opref.template with_blocking_event<OSDMapBlockingEvent>(
+ [this, &opref](auto &&trigger) {
+ std::ignore = this;
+ return get_shard_services().local_state.osdmap_gate.wait_for_map(
+ std::move(trigger),
+ opref.get_epoch(),
+ &get_shard_services());
+ });
+ }).then([&logger, &opref](auto epoch) {
+ logger.debug("{}: got map {}, entering get_pg", opref, epoch);
+ return opref.template enter_stage<>(
+ opref.get_connection_pipeline().get_pg);
+ }).then([this, &logger, &opref, op=std::move(op)]() mutable {
+ logger.debug("{}: in get_pg core {}", opref, seastar::this_shard_id());
+ logger.debug("{}: in get_pg", opref);
+ if constexpr (T::can_create()) {
+ logger.debug("{}: can_create", opref);
+ return run_with_pg_maybe_create<T>(std::move(op));
+ } else {
+ logger.debug("{}: !can_create", opref);
+ return run_with_pg_maybe_wait<T>(std::move(op));
+ }
+ });
+ return std::make_pair(id, std::move(fut));
+ }
+
+#undef FORWARD
+#undef FORWARD_CONST
+#undef FORWARD_TO_OSD_SINGLETON
+};
+
+}
diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc
new file mode 100644
index 000000000..b5394bfdc
--- /dev/null
+++ b/src/crimson/osd/recovery_backend.cc
@@ -0,0 +1,328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+#include "messages/MOSDFastDispatchOp.h"
+#include "osd/osd_types.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+hobject_t RecoveryBackend::get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version) const
+{
+ hobject_t hoid =
+ target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}",
+ pg.get_info().pgid,
+ version,
+ pg.get_info().history.same_interval_since,
+ target.snap));
+ logger().debug("{} {}", __func__, hoid);
+ return hoid;
+}
+
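+// Drops any temporary recovery objects recorded in temp_contents and
+// interrupts outstanding recovery waiters before clearing them, e.g. on a
+// new peering interval.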
+void RecoveryBackend::clean_up(ceph::os::Transaction& t,
+ std::string_view why)
+{
+ for (auto& soid : temp_contents) {
+ t.remove(pg.get_collection_ref()->get_cid(),
+ ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard));
+ }
+ temp_contents.clear();
+
+ for (auto& [soid, recovery_waiter] : recovering) {
+ if ((recovery_waiter->pull_info
+ && recovery_waiter->pull_info->is_complete())
+ || (!recovery_waiter->pull_info
+ && recovery_waiter->obc && recovery_waiter->obc->obs.exists)) {
+ recovery_waiter->obc->interrupt(
+ ::crimson::common::actingset_changed(
+ pg.is_primary()));
+ recovery_waiter->interrupt(why);
+ }
+ }
+ recovering.clear();
+}
+
+void RecoveryBackend::WaitForObjectRecovery::stop() {
+ readable.set_exception(
+ crimson::common::system_shutdown_exception());
+ recovered.set_exception(
+ crimson::common::system_shutdown_exception());
+ pulled.set_exception(
+ crimson::common::system_shutdown_exception());
+ for (auto& [pg_shard, pr] : pushes) {
+ pr.set_exception(
+ crimson::common::system_shutdown_exception());
+ }
+}
+
+void RecoveryBackend::handle_backfill_finish(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(!pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1);
+ auto reply = crimson::make_message<MOSDPGBackfill>(
+ MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+ pg.get_osdmap_epoch(),
+ m.query_epoch,
+ spg_t(pg.get_pgid().pgid, pg.get_primary().shard));
+ reply->set_priority(pg.get_recovery_op_priority());
+ std::ignore = conn->send(std::move(reply));
+ shard_services.start_operation<crimson::osd::LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(&pg),
+ pg.get_pg_whoami(),
+ pg.get_pgid(),
+ pg.get_osdmap_epoch(),
+ pg.get_osdmap_epoch(),
+ RecoveryDone{});
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill_progress(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(!pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2);
+
+ ObjectStore::Transaction t;
+ pg.get_peering_state().update_backfill_progress(
+ m.last_backfill,
+ m.stats,
+ m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ t);
+ logger().debug("RecoveryBackend::handle_backfill_progress: do_transaction...");
+ return shard_services.get_store().do_transaction(
+ pg.get_collection_ref(), std::move(t)).or_terminate();
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill_finish_ack(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3);
+ // TODO:
+ // finish_recovery_op(hobject_t::get_max());
+ return seastar::now();
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) {
+ logger().debug("{}: discarding {}", __func__, m);
+ return seastar::now();
+ }
+ switch (m.op) {
+ case MOSDPGBackfill::OP_BACKFILL_FINISH:
+ handle_backfill_finish(m, conn);
+ [[fallthrough]];
+ case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
+ return handle_backfill_progress(m);
+ case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
+ return handle_backfill_finish_ack(m);
+ default:
+ ceph_assert("unknown op type for pg backfill");
+ return seastar::now();
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill_remove(
+ MOSDPGBackfillRemove& m)
+{
+ logger().debug("{} m.ls={}", __func__, m.ls);
+ assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
+ if (pg.can_discard_replica_op(m)) {
+ logger().debug("{}: discarding {}", __func__, m);
+ return seastar::now();
+ }
+ ObjectStore::Transaction t;
+ for ([[maybe_unused]] const auto& [soid, ver] : m.ls) {
+ // TODO: the reserved space management. PG::try_reserve_recovery_space().
+ t.remove(pg.get_collection_ref()->get_cid(),
+ ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard));
+ }
+ logger().debug("RecoveryBackend::handle_backfill_remove: do_transaction...");
+ return shard_services.get_store().do_transaction(
+ pg.get_collection_ref(), std::move(t)).or_terminate();
+}
+
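+// Lists up to max objects starting at start and builds a BackfillInterval
+// with their versions; on the primary, versions come from cached object
+// contexts when available, otherwise from on-disk metadata.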
+RecoveryBackend::interruptible_future<BackfillInterval>
+RecoveryBackend::scan_for_backfill(
+ const hobject_t& start,
+ [[maybe_unused]] const std::int64_t min,
+ const std::int64_t max)
+{
+ logger().debug("{} starting from {}", __func__, start);
+ auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>();
+ return backend->list_objects(start, max).then_interruptible(
+ [this, start, version_map] (auto&& ret) {
+ auto&& [objects, next] = std::move(ret);
+ return seastar::do_with(
+ std::move(objects),
+ [this, version_map](auto &objects) {
+ return interruptor::parallel_for_each(objects,
+ [this, version_map] (const hobject_t& object)
+ -> interruptible_future<> {
+ crimson::osd::ObjectContextRef obc;
+ if (pg.is_primary()) {
+ obc = pg.obc_registry.maybe_get_cached_obc(object);
+ }
+ if (obc) {
+ if (obc->obs.exists) {
+ logger().debug("scan_for_backfill found (primary): {} {}",
+ object, obc->obs.oi.version);
+ version_map->emplace(object, obc->obs.oi.version);
+ } else {
+ // if the object does not exist here, it must have been removed
+ // between the collection_list_partial and here. This can happen
+ // for the first item in the range, which is usually last_backfill.
+ }
+ return seastar::now();
+ } else {
+ return backend->load_metadata(object).safe_then_interruptible(
+ [version_map, object] (auto md) {
+ if (md->os.exists) {
+ logger().debug("scan_for_backfill found: {} {}",
+ object, md->os.oi.version);
+ version_map->emplace(object, md->os.oi.version);
+ }
+ return seastar::now();
+ }, PGBackend::load_metadata_ertr::assert_all{});
+ }
+ });
+ }).then_interruptible([version_map, start=std::move(start), next=std::move(next), this] {
+ BackfillInterval bi;
+ bi.begin = std::move(start);
+ bi.end = std::move(next);
+ bi.version = pg.get_info().last_update;
+ bi.objects = std::move(*version_map);
+ logger().debug("{} BackfillInterval filled, leaving",
+ "scan_for_backfill");
+ return seastar::make_ready_future<BackfillInterval>(std::move(bi));
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_scan_get_digest(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ if (false /* FIXME: check for backfill too full */) {
+ std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>(
+ // TODO: abstract start_background_recovery
+ static_cast<crimson::osd::PG*>(&pg),
+ pg.get_pg_whoami(),
+ pg.get_pgid(),
+ pg.get_osdmap_epoch(),
+ pg.get_osdmap_epoch(),
+ PeeringState::BackfillTooFull());
+ return seastar::now();
+ }
+ return scan_for_backfill(
+ std::move(m.begin),
+ crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"),
+ crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max")
+ ).then_interruptible(
+ [this, query_epoch=m.query_epoch, conn
+ ](auto backfill_interval) {
+ auto reply = crimson::make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_DIGEST,
+ pg.get_pg_whoami(),
+ pg.get_osdmap_epoch(),
+ query_epoch,
+ spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard),
+ backfill_interval.begin,
+ backfill_interval.end);
+ encode(backfill_interval.objects, reply->get_data());
+ return conn->send(std::move(reply));
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_scan_digest(
+ MOSDPGScan& m)
+{
+ logger().debug("{}", __func__);
+ // Check that from is in backfill_targets vector
+ ceph_assert(pg.is_backfill_target(m.from));
+
+ BackfillInterval bi;
+ bi.begin = m.begin;
+ bi.end = m.end;
+ {
+ auto p = m.get_data().cbegin();
+ // take care to preserve ordering!
+ bi.clear_objects();
+ ::decode_noclear(bi.objects, p);
+ }
+ shard_services.start_operation<crimson::osd::BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(&pg),
+ shard_services,
+ pg.get_osdmap_epoch(),
+ crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) });
+ return seastar::now();
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_scan(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) {
+ logger().debug("{}: discarding {}", __func__, m);
+ return seastar::now();
+ }
+ switch (m.op) {
+ case MOSDPGScan::OP_SCAN_GET_DIGEST:
+ return handle_scan_get_digest(m, conn);
+ case MOSDPGScan::OP_SCAN_DIGEST:
+ return handle_scan_digest(m);
+ default:
+ // FIXME: move to errorator
+ ceph_assert("unknown op type for pg scan");
+ return seastar::now();
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn)
+{
+ switch (m->get_header().type) {
+ case MSG_OSD_PG_BACKFILL:
+ return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m), conn);
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m));
+ case MSG_OSD_PG_SCAN:
+ return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m), conn);
+ default:
+ return seastar::make_exception_future<>(
+ std::invalid_argument(fmt::format("invalid request type: {}",
+ m->get_header().type)));
+ }
+}
diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h
new file mode 100644
index 000000000..65e9bb01f
--- /dev/null
+++ b/src/crimson/osd/recovery_backend.h
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/shard_services.h"
+
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGScan.h"
+#include "osd/recovery_types.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd{
+ class PG;
+}
+
+class PGBackend;
+
+class RecoveryBackend {
+public:
+ class WaitForObjectRecovery;
+public:
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, T>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+ RecoveryBackend(crimson::osd::PG& pg,
+ crimson::osd::ShardServices& shard_services,
+ crimson::os::CollectionRef coll,
+ PGBackend* backend)
+ : pg{pg},
+ shard_services{shard_services},
+ store{&shard_services.get_store()},
+ coll{coll},
+ backend{backend} {}
+ virtual ~RecoveryBackend() {}
+ WaitForObjectRecovery& add_recovering(const hobject_t& soid) {
+ auto [it, added] = recovering.emplace(soid, new WaitForObjectRecovery{});
+ assert(added);
+ return *(it->second);
+ }
+ WaitForObjectRecovery& get_recovering(const hobject_t& soid) {
+ assert(is_recovering(soid));
+ return *(recovering.at(soid));
+ }
+ void remove_recovering(const hobject_t& soid) {
+ recovering.erase(soid);
+ }
+ bool is_recovering(const hobject_t& soid) const {
+ return recovering.count(soid) != 0;
+ }
+ uint64_t total_recovering() const {
+ return recovering.size();
+ }
+
+ virtual interruptible_future<> handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn);
+
+ virtual interruptible_future<> recover_object(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+ virtual interruptible_future<> recover_delete(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+ virtual interruptible_future<> push_delete(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+
+ interruptible_future<BackfillInterval> scan_for_backfill(
+ const hobject_t& from,
+ std::int64_t min,
+ std::int64_t max);
+
+ void on_peering_interval_change(ceph::os::Transaction& t) {
+ clean_up(t, "new peering interval");
+ }
+
+ seastar::future<> stop() {
+ for (auto& [soid, recovery_waiter] : recovering) {
+ recovery_waiter->stop();
+ }
+ return on_stop();
+ }
+protected:
+ crimson::osd::PG& pg;
+ crimson::osd::ShardServices& shard_services;
+ crimson::os::FuturizedStore::Shard* store;
+ crimson::os::CollectionRef coll;
+ PGBackend* backend;
+
+ struct pull_info_t {
+ pg_shard_t from;
+ hobject_t soid;
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ crimson::osd::ObjectContextRef head_ctx;
+ crimson::osd::ObjectContextRef obc;
+ object_stat_sum_t stat;
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ struct push_info_t {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ crimson::osd::ObjectContextRef obc;
+ object_stat_sum_t stat;
+ };
+
+public:
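+ // Tracks the in-flight recovery of a single object. Interested parties wait
+ // on its readable/recovered/pulled/pushed promises until recovery completes
+ // or is interrupted.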
+ class WaitForObjectRecovery :
+ public boost::intrusive_ref_counter<
+ WaitForObjectRecovery, boost::thread_unsafe_counter>,
+ public crimson::BlockerT<WaitForObjectRecovery> {
+ seastar::shared_promise<> readable, recovered, pulled;
+ std::map<pg_shard_t, seastar::shared_promise<>> pushes;
+ public:
+ static constexpr const char* type_name = "WaitForObjectRecovery";
+
+ crimson::osd::ObjectContextRef obc;
+ std::optional<pull_info_t> pull_info;
+ std::map<pg_shard_t, push_info_t> pushing;
+
+ seastar::future<> wait_for_readable() {
+ return readable.get_shared_future();
+ }
+ seastar::future<> wait_for_pushes(pg_shard_t shard) {
+ return pushes[shard].get_shared_future();
+ }
+ seastar::future<> wait_for_recovered() {
+ return recovered.get_shared_future();
+ }
+ template <typename T, typename F>
+ auto wait_track_blocking(T &trigger, F &&fut) {
+ WaitForObjectRecoveryRef ref = this;
+ return track_blocking(
+ trigger,
+ std::forward<F>(fut)
+ ).finally([ref] {});
+ }
+ template <typename T>
+ seastar::future<> wait_for_recovered(T &trigger) {
+ WaitForObjectRecoveryRef ref = this;
+ return wait_track_blocking(trigger, recovered.get_shared_future());
+ }
+ seastar::future<> wait_for_pull() {
+ return pulled.get_shared_future();
+ }
+ void set_readable() {
+ readable.set_value();
+ }
+ void set_recovered() {
+ recovered.set_value();
+ }
+ void set_pushed(pg_shard_t shard) {
+ pushes[shard].set_value();
+ }
+ void set_pulled() {
+ pulled.set_value();
+ }
+ void set_push_failed(pg_shard_t shard, std::exception_ptr e) {
+ pushes.at(shard).set_exception(e);
+ }
+ void interrupt(std::string_view why) {
+ readable.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ recovered.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ pulled.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ for (auto& [pg_shard, pr] : pushes) {
+ pr.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ }
+ }
+ void stop();
+ void dump_detail(Formatter* f) const {
+ }
+ };
+ using RecoveryBlockingEvent =
+ crimson::AggregateBlockingEvent<WaitForObjectRecovery::BlockingEvent>;
+ using WaitForObjectRecoveryRef = boost::intrusive_ptr<WaitForObjectRecovery>;
+protected:
+ std::map<hobject_t, WaitForObjectRecoveryRef> recovering;
+ hobject_t get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version) const;
+
+ boost::container::flat_set<hobject_t> temp_contents;
+
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+ void clean_up(ceph::os::Transaction& t, std::string_view why);
+ virtual seastar::future<> on_stop() = 0;
+private:
+ void handle_backfill_finish(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> handle_backfill_progress(
+ MOSDPGBackfill& m);
+ interruptible_future<> handle_backfill_finish_ack(
+ MOSDPGBackfill& m);
+ interruptible_future<> handle_backfill(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn);
+
+ interruptible_future<> handle_scan_get_digest(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> handle_scan_digest(
+ MOSDPGScan& m);
+ interruptible_future<> handle_scan(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> handle_backfill_remove(MOSDPGBackfillRemove& m);
+};
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
new file mode 100644
index 000000000..0ff4ad573
--- /dev/null
+++ b/src/crimson/osd/replicated_backend.cc
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "replicated_backend.h"
+
+#include "messages/MOSDRepOpReply.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/common/log.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/shard_services.h"
+#include "osd/PeeringState.h"
+
+SET_SUBSYS(osd);
+
+ReplicatedBackend::ReplicatedBackend(pg_t pgid,
+ pg_shard_t whoami,
+ ReplicatedBackend::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ DoutPrefixProvider &dpp)
+ : PGBackend{whoami.shard, coll, shard_services, dpp},
+ pgid{pgid},
+ whoami{whoami}
+{}
+
+ReplicatedBackend::ll_read_ierrorator::future<ceph::bufferlist>
+ReplicatedBackend::_read(const hobject_t& hoid,
+ const uint64_t off,
+ const uint64_t len,
+ const uint32_t flags)
+{
+ return store->read(coll, ghobject_t{hoid}, off, len, flags);
+}
+
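+// Applies the transaction to the local store and sends the encoded
+// transaction to every other shard in pg_shards as an MOSDRepOp; the second
+// returned future resolves once the local commit and all replica replies
+// have arrived.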
+ReplicatedBackend::rep_op_fut_t
+ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ LOG_PREFIX(ReplicatedBackend::_submit_transaction);
+
+ const ceph_tid_t tid = shard_services.get_tid();
+ auto pending_txn =
+ pending_trans.try_emplace(tid, pg_shards.size(), osd_op_p.at_version).first;
+ bufferlist encoded_txn;
+ encode(txn, encoded_txn);
+
+ DEBUGDPP("object {}", dpp, hoid);
+ auto all_completed = interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll, std::move(txn))
+ ).then_interruptible([FNAME, this,
+ peers=pending_txn->second.weak_from_this()] {
+ if (!peers) {
+ // for now, only actingset_changed can cause peers
+ // to be nullptr
+ ERRORDPP("peers is null, this should be impossible", dpp);
+ assert(0 == "impossible");
+ }
+ if (--peers->pending == 0) {
+ peers->all_committed.set_value();
+ peers->all_committed = {};
+ return seastar::now();
+ }
+ return peers->all_committed.get_shared_future();
+ }).then_interruptible([pending_txn, this] {
+ auto acked_peers = std::move(pending_txn->second.acked_peers);
+ pending_trans.erase(pending_txn);
+ return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers));
+ });
+
+ auto sends = std::make_unique<std::vector<seastar::future<>>>();
+ for (auto pg_shard : pg_shards) {
+ if (pg_shard != whoami) {
+ auto m = crimson::make_message<MOSDRepOp>(
+ osd_op_p.req_id,
+ whoami,
+ spg_t{pgid, pg_shard.shard},
+ hoid,
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ map_epoch,
+ min_epoch,
+ tid,
+ osd_op_p.at_version);
+ m->set_data(encoded_txn);
+ pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
+ encode(log_entries, m->logbl);
+ m->pg_trim_to = osd_op_p.pg_trim_to;
+ m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk;
+ m->set_rollback_to(osd_op_p.at_version);
+ // TODO: set more stuff. e.g., pg_states
+ sends->emplace_back(shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch));
+ }
+ }
+ auto sends_complete = seastar::when_all_succeed(
+ sends->begin(), sends->end()
+ ).finally([sends=std::move(sends)] {});
+ return {std::move(sends_complete), std::move(all_completed)};
+}
+
+void ReplicatedBackend::on_actingset_changed(bool same_primary)
+{
+ crimson::common::actingset_changed e_actingset_changed{same_primary};
+ for (auto& [tid, pending_txn] : pending_trans) {
+ pending_txn.all_committed.set_exception(e_actingset_changed);
+ }
+ pending_trans.clear();
+}
+
+void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply)
+{
+ LOG_PREFIX(ReplicatedBackend::got_rep_op_reply);
+ auto found = pending_trans.find(reply.get_tid());
+ if (found == pending_trans.end()) {
+ WARNDPP("cannot find rep op for message {}", dpp, reply);
+ return;
+ }
+ auto& peers = found->second;
+ for (auto& peer : peers.acked_peers) {
+ if (peer.shard == reply.from) {
+ peer.last_complete_ondisk = reply.get_last_complete_ondisk();
+ if (--peers.pending == 0) {
+ peers.all_committed.set_value();
+ peers.all_committed = {};
+ }
+ return;
+ }
+ }
+}
+
+seastar::future<> ReplicatedBackend::stop()
+{
+ LOG_PREFIX(ReplicatedBackend::stop);
+ INFODPP("cid {}", coll->get_cid());
+ for (auto& [tid, pending_on] : pending_trans) {
+ pending_on.all_committed.set_exception(
+ crimson::common::system_shutdown_exception());
+ }
+ pending_trans.clear();
+ return seastar::now();
+}
+
+seastar::future<>
+ReplicatedBackend::request_committed(const osd_reqid_t& reqid,
+ const eversion_t& at_version)
+{
+ if (std::empty(pending_trans)) {
+ return seastar::now();
+ }
+ auto iter = pending_trans.begin();
+ auto& pending_txn = iter->second;
+ if (pending_txn.at_version > at_version) {
+ return seastar::now();
+ }
+ for (; iter->second.at_version < at_version; ++iter);
+ // As of now, the previous client_request with the same reqid cannot have
+ // finished yet, as that would mean a later client_request finished before
+ // an earlier one.
+ //
+ // The following line should read "assert(pending_txn.at_version == at_version)",
+ // since there can be only one transaction in pending_trans at any time due to
+ // PG::request_pg_pipeline. However, the parallelism here will likely be
+ // improved in the future, allowing multiple client requests in flight, so the
+ // restriction is loosened to the assertion below.
+ assert(iter != pending_trans.end() && iter->second.at_version == at_version);
+ if (iter->second.pending) {
+ return iter->second.all_committed.get_shared_future();
+ } else {
+ return seastar::now();
+ }
+}
diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h
new file mode 100644
index 000000000..f789a35ea
--- /dev/null
+++ b/src/crimson/osd/replicated_backend.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/weak_ptr.hh>
+#include "include/buffer_fwd.h"
+#include "osd/osd_types.h"
+
+#include "acked_peers.h"
+#include "pg_backend.h"
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+class ReplicatedBackend : public PGBackend
+{
+public:
+ ReplicatedBackend(pg_t pgid, pg_shard_t whoami,
+ CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ DoutPrefixProvider &dpp);
+ void got_rep_op_reply(const MOSDRepOpReply& reply) final;
+ seastar::future<> stop() final;
+ void on_actingset_changed(bool same_primary) final;
+private:
+ ll_read_ierrorator::future<ceph::bufferlist>
+ _read(const hobject_t& hoid, uint64_t off,
+ uint64_t len, uint32_t flags) override;
+ rep_op_fut_t _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) final;
+ const pg_t pgid;
+ const pg_shard_t whoami;
+ class pending_on_t : public seastar::weakly_referencable<pending_on_t> {
+ public:
+ pending_on_t(size_t pending, const eversion_t& at_version)
+ : pending{static_cast<unsigned>(pending)}, at_version(at_version)
+ {}
+ unsigned pending;
+ // The order of pending_txns' at_version must match that of their
+ // corresponding ceph_tid_t, as we rely on this condition when checking
+ // whether a client request has already completed. To put it another way,
+ // a client request's at_version must be updated synchronously with its
+ // ceph_tid_t.
+ const eversion_t at_version;
+ crimson::osd::acked_peers_t acked_peers;
+ seastar::shared_promise<> all_committed;
+ };
+ using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>;
+ pending_transactions_t pending_trans;
+
+ seastar::future<> request_committed(
+ const osd_reqid_t& reqid, const eversion_t& at_version) final;
+};
diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc
new file mode 100644
index 000000000..bd301cc2b
--- /dev/null
+++ b/src/crimson/osd/replicated_recovery_backend.cc
@@ -0,0 +1,1182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/future.hh>
+#include <seastar/core/do_with.hh>
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "osd/osd_types_fmt.h"
+#include "replicated_recovery_backend.h"
+#include "msg/Message.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::less;
+using std::map;
+using std::string;
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::recover_object(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ // always add_recovering(soid) before recover_object(soid)
+ assert(is_recovering(soid));
+ // start tracking the recovery of soid
+ return maybe_pull_missing_obj(soid, need).then_interruptible([this, soid, need] {
+ logger().debug("recover_object: loading obc: {}", soid);
+ return pg.obc_loader.with_obc<RWState::RWREAD>(soid,
+ [this, soid, need](auto obc) {
+ logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid);
+ auto& recovery_waiter = get_recovering(soid);
+ recovery_waiter.obc = obc;
+ recovery_waiter.obc->wait_recovery_read();
+ return maybe_push_shards(soid, need);
+ }).handle_error_interruptible(
+ crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) {
+ // TODO: may need eio handling?
+ logger().error("recover_object saw error code {}, ignoring object {}",
+ code, soid);
+ }));
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::maybe_push_shards(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return seastar::do_with(
+ get_shards_to_push(soid),
+ [this, need, soid](auto &shards) {
+ return interruptor::parallel_for_each(
+ shards,
+ [this, need, soid](auto shard) {
+ return prep_push(soid, need, shard).then_interruptible([this, soid, shard](auto push) {
+ auto msg = crimson::make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->pushes.push_back(std::move(push));
+ msg->set_priority(pg.get_recovery_op_priority());
+ return interruptor::make_interruptible(
+ shard_services.send_to_osd(shard.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch()))
+ .then_interruptible(
+ [this, soid, shard] {
+ return get_recovering(soid).wait_for_pushes(shard);
+ });
+ });
+ });
+ }).then_interruptible([this, soid] {
+ auto &recovery = get_recovering(soid);
+ if (auto push_info = recovery.pushing.begin();
+ push_info != recovery.pushing.end()) {
+ pg.get_recovery_handler()->on_global_recover(soid,
+ push_info->second.stat,
+ false);
+ } else if (recovery.pull_info) {
+ // no push happened (empty get_shards_to_push()) but pull actually did
+ pg.get_recovery_handler()->on_global_recover(soid,
+ recovery.pull_info->stat,
+ false);
+ } else {
+ // no pulls, no pushes
+ }
+ return seastar::make_ready_future<>();
+ }).handle_exception_interruptible([this, soid](auto e) {
+ auto &recovery = get_recovering(soid);
+ if (recovery.obc) {
+ recovery.obc->drop_recovery_read();
+ }
+ recovering.erase(soid);
+ return seastar::make_exception_future<>(e);
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::maybe_pull_missing_obj(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ pg_missing_tracker_t local_missing = pg.get_local_missing();
+ if (!local_missing.is_missing(soid)) {
+ return seastar::make_ready_future<>();
+ }
+ PullOp pull_op;
+ auto& recovery_waiter = get_recovering(soid);
+ recovery_waiter.pull_info =
+ std::make_optional<RecoveryBackend::pull_info_t>();
+ auto& pull_info = *recovery_waiter.pull_info;
+ prepare_pull(pull_op, pull_info, soid, need);
+ auto msg = crimson::make_message<MOSDPGPull>();
+ msg->from = pg.get_pg_whoami();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_pulls({std::move(pull_op)});
+ return interruptor::make_interruptible(
+ shard_services.send_to_osd(
+ pull_info.from.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch()
+ )).then_interruptible([&recovery_waiter] {
+ return recovery_waiter.wait_for_pull();
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::push_delete(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ epoch_t min_epoch = pg.get_last_peering_reset();
+
+ assert(pg.get_acting_recovery_backfill().size() > 0);
+ return interruptor::parallel_for_each(pg.get_acting_recovery_backfill(),
+ [this, soid, need, min_epoch](pg_shard_t shard)
+ -> interruptible_future<> {
+ if (shard == pg.get_pg_whoami())
+ return seastar::make_ready_future<>();
+ auto iter = pg.get_shard_missing().find(shard);
+ if (iter == pg.get_shard_missing().end())
+ return seastar::make_ready_future<>();
+ if (iter->second.is_missing(soid)) {
+ logger().debug("push_delete: will remove {} from {}", soid, shard);
+ pg.begin_peer_recover(shard, soid);
+ spg_t target_pg(pg.get_info().pgid.pgid, shard.shard);
+ auto msg = crimson::make_message<MOSDPGRecoveryDelete>(
+ pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch);
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->objects.push_back(std::make_pair(soid, need));
+ return interruptor::make_interruptible(
+ shard_services.send_to_osd(shard.osd, std::move(msg),
+ pg.get_osdmap_epoch())).then_interruptible(
+ [this, soid, shard] {
+ return get_recovering(soid).wait_for_pushes(shard);
+ });
+ }
+ return seastar::make_ready_future<>();
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_recovery_delete(
+ Ref<MOSDPGRecoveryDelete> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+
+ auto& p = m->objects.front(); //TODO: only one delete per message for now.
+ return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch())
+ .then_interruptible(
+ [this, m] {
+ auto reply = crimson::make_message<MOSDPGRecoveryDeleteReply>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard);
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->objects = m->objects;
+ return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch());
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::on_local_recover_persist(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& _recovery_info,
+ bool is_delete,
+ epoch_t epoch_frozen)
+{
+ logger().debug("{}", __func__);
+ ceph::os::Transaction t;
+ pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t);
+ logger().debug("ReplicatedRecoveryBackend::on_local_recover_persist: do_transaction...");
+ return interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll, std::move(t)))
+ .then_interruptible(
+ [this, epoch_frozen, last_complete = pg.get_info().last_complete] {
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ return seastar::make_ready_future<>();
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::local_recover_delete(
+ const hobject_t& soid,
+ eversion_t need,
+ epoch_t epoch_to_freeze)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ return backend->load_metadata(soid).safe_then_interruptible([this]
+ (auto lomt) -> interruptible_future<> {
+ if (lomt->os.exists) {
+ return seastar::do_with(ceph::os::Transaction(),
+ [this, lomt = std::move(lomt)](auto& txn) {
+ return backend->remove(lomt->os, txn).then_interruptible(
+ [this, &txn]() mutable {
+ logger().debug("ReplicatedRecoveryBackend::local_recover_delete: do_transaction...");
+ return shard_services.get_store().do_transaction(coll,
+ std::move(txn));
+ });
+ });
+ }
+ return seastar::make_ready_future<>();
+ }).safe_then_interruptible([this, soid, epoch_to_freeze, need] {
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = need;
+ return on_local_recover_persist(soid, recovery_info,
+ true, epoch_to_freeze);
+ }, PGBackend::load_metadata_ertr::all_same_way(
+ [this, soid, epoch_to_freeze, need] (auto e) {
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = need;
+ return on_local_recover_persist(soid, recovery_info,
+ true, epoch_to_freeze);
+ })
+ );
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::recover_delete(
+ const hobject_t &soid, eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ epoch_t cur_epoch = pg.get_osdmap_epoch();
+ return seastar::do_with(object_stat_sum_t(),
+ [this, soid, need, cur_epoch](auto& stat_diff) {
+ return local_recover_delete(soid, need, cur_epoch).then_interruptible(
+ [this, &stat_diff, cur_epoch, soid, need]()
+ -> interruptible_future<> {
+ if (!pg.has_reset_since(cur_epoch)) {
+ bool object_missing = false;
+ for (const auto& shard : pg.get_acting_recovery_backfill()) {
+ if (shard == pg.get_pg_whoami())
+ continue;
+ if (pg.get_shard_missing(shard)->is_missing(soid)) {
+ logger().debug("recover_delete: soid {} needs to be deleted from replica {}",
+ soid, shard);
+ object_missing = true;
+ break;
+ }
+ }
+
+ if (!object_missing) {
+ stat_diff.num_objects_recovered = 1;
+ return seastar::make_ready_future<>();
+ } else {
+ return push_delete(soid, need);
+ }
+ }
+ return seastar::make_ready_future<>();
+ }).then_interruptible([this, soid, &stat_diff] {
+ pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<PushOp>
+ReplicatedRecoveryBackend::prep_push(
+ const hobject_t& soid,
+ eversion_t need,
+ pg_shard_t pg_shard)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ auto& recovery_waiter = get_recovering(soid);
+ auto& obc = recovery_waiter.obc;
+ interval_set<uint64_t> data_subset;
+ if (obc->obs.oi.size) {
+ data_subset.insert(0, obc->obs.oi.size);
+ }
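+ // narrow the push down to the regions the target shard is actually missing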
+ const auto& missing = pg.get_shard_missing().find(pg_shard)->second;
+ const auto it = missing.get_items().find(soid);
+ assert(it != missing.get_items().end());
+ data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+ logger().debug("prep_push: {} data_subset {} to {}",
+ soid, data_subset, pg_shard);
+
+ auto& push_info = recovery_waiter.pushing[pg_shard];
+ pg.begin_peer_recover(pg_shard, soid);
+ const auto pmissing_iter = pg.get_shard_missing().find(pg_shard);
+ const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+ assert(missing_iter != pmissing_iter->second.get_items().end());
+
+ push_info.obc = obc;
+ push_info.recovery_info.size = obc->obs.oi.size;
+ push_info.recovery_info.copy_subset = data_subset;
+ push_info.recovery_info.soid = soid;
+ push_info.recovery_info.oi = obc->obs.oi;
+ push_info.recovery_info.version = obc->obs.oi.version;
+ push_info.recovery_info.object_exist =
+ missing_iter->second.clean_regions.object_is_exist();
+ push_info.recovery_progress.omap_complete =
+ !missing_iter->second.clean_regions.omap_is_dirty();
+
+ return build_push_op(push_info.recovery_info,
+ push_info.recovery_progress,
+ &push_info.stat).then_interruptible(
+ [this, soid, pg_shard](auto push_op) {
+ auto& recovery_waiter = get_recovering(soid);
+ auto& push_info = recovery_waiter.pushing[pg_shard];
+ push_info.recovery_progress = push_op.after_progress;
+ return push_op;
+ });
+}
+
+void ReplicatedRecoveryBackend::prepare_pull(PullOp& pull_op,
+ pull_info_t& pull_info,
+ const hobject_t& soid,
+ eversion_t need) {
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ pg_missing_tracker_t local_missing = pg.get_local_missing();
+ const auto missing_iter = local_missing.get_items().find(soid);
+ auto m = pg.get_missing_loc_shards();
+ pg_shard_t fromshard = *(m[soid].begin());
+
+ // TODO: the snap objects case is skipped for now
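+ // start from the full object range and narrow it to the locally dirty regions;
+ // the real size is unknown ((uint64_t)-1) until the source replies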
+ pull_op.recovery_info.copy_subset.insert(0, (uint64_t) -1);
+ pull_op.recovery_info.copy_subset.intersection_of(
+ missing_iter->second.clean_regions.get_dirty_regions());
+ pull_op.recovery_info.size = ((uint64_t) -1);
+ pull_op.recovery_info.object_exist =
+ missing_iter->second.clean_regions.object_is_exist();
+ pull_op.recovery_info.soid = soid;
+ pull_op.soid = soid;
+ pull_op.recovery_progress.data_complete = false;
+ pull_op.recovery_progress.omap_complete =
+ !missing_iter->second.clean_regions.omap_is_dirty();
+ pull_op.recovery_progress.data_recovered_to = 0;
+ pull_op.recovery_progress.first = true;
+
+ pull_info.from = fromshard;
+ pull_info.soid = soid;
+ pull_info.recovery_info = pull_op.recovery_info;
+ pull_info.recovery_progress = pull_op.recovery_progress;
+}
+
+RecoveryBackend::interruptible_future<PushOp>
+ReplicatedRecoveryBackend::build_push_op(
+ const ObjectRecoveryInfo& recovery_info,
+ const ObjectRecoveryProgress& progress,
+ object_stat_sum_t* stat)
+{
+ logger().debug("{} {} @{}",
+ __func__, recovery_info.soid, recovery_info.version);
+ return seastar::do_with(ObjectRecoveryProgress(progress),
+ uint64_t(crimson::common::local_conf()
+ ->osd_recovery_max_chunk),
+ recovery_info.version,
+ PushOp(),
+ [this, &recovery_info, &progress, stat]
+ (auto& new_progress, auto& available, auto& v, auto& push_op) {
+ return read_metadata_for_push_op(recovery_info.soid,
+ progress, new_progress,
+ v, &push_op
+ ).then_interruptible([&](eversion_t local_ver) mutable {
+ // If requestor didn't know the version, use ours
+ if (v == eversion_t()) {
+ v = local_ver;
+ } else if (v != local_ver) {
+ logger().error("build_push_op: {} push {} v{} failed because local copy is {}",
+ pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver);
+ // TODO: bail out
+ }
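+ // omap entries are read first and consume the same per-chunk budget
+ // ("available") that later bounds the object data read below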
+ return read_omap_for_push_op(recovery_info.soid,
+ progress,
+ new_progress,
+ available, &push_op);
+ }).then_interruptible([this, &recovery_info, &progress,
+ &available, &push_op]() mutable {
+ logger().debug("build_push_op: available: {}, copy_subset: {}",
+ available, recovery_info.copy_subset);
+ return read_object_for_push_op(recovery_info.soid,
+ recovery_info.copy_subset,
+ progress.data_recovered_to,
+ available, &push_op);
+ }).then_interruptible([&recovery_info, &v, &progress,
+ &new_progress, stat, &push_op]
+ (uint64_t recovered_to) mutable {
+ new_progress.data_recovered_to = recovered_to;
+ if (new_progress.is_complete(recovery_info)) {
+ new_progress.data_complete = true;
+ if (stat)
+ stat->num_objects_recovered++;
+ } else if (progress.first && progress.omap_complete) {
+ // If the omap is unchanged, we still need to recover the omap
+ // when the recovery cannot be completed in a single push
+ new_progress.omap_complete = false;
+ }
+ if (stat) {
+ stat->num_keys_recovered += push_op.omap_entries.size();
+ stat->num_bytes_recovered += push_op.data.length();
+ }
+ push_op.version = v;
+ push_op.soid = recovery_info.soid;
+ push_op.recovery_info = recovery_info;
+ push_op.after_progress = new_progress;
+ push_op.before_progress = progress;
+ logger().debug("build_push_op: push_op version:"
+ " {}, push_op data length: {}",
+ push_op.version, push_op.data.length());
+ return seastar::make_ready_future<PushOp>(std::move(push_op));
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<eversion_t>
+ReplicatedRecoveryBackend::read_metadata_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ eversion_t ver,
+ PushOp* push_op)
+{
+ logger().debug("{}, {}", __func__, oid);
+ if (!progress.first) {
+ return seastar::make_ready_future<eversion_t>(ver);
+ }
+ return interruptor::make_interruptible(interruptor::when_all_succeed(
+ backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>(
+ crimson::os::FuturizedStore::Shard::read_errorator::all_same_way(
+ [oid] (const std::error_code& e) {
+ logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid);
+ return seastar::make_ready_future<bufferlist>();
+ })),
+ interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid)))
+ .handle_error_interruptible<false>(
+ crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way(
+ [oid] (const std::error_code& e) {
+ logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid);
+ return seastar::make_ready_future<crimson::os::FuturizedStore::Shard::attrs_t>();
+ }))
+ )).then_unpack_interruptible([&new_progress, push_op](auto bl, auto attrs) {
+ if (bl.length() == 0) {
+ logger().warn("read_metadata_for_push_op: failed to read omap header");
+ } else if (attrs.empty()) {
+ logger().error("read_metadata_for_push_op: failed to read attrs");
+ return eversion_t{};
+ }
+ push_op->omap_header.claim_append(std::move(bl));
+ for (auto&& [key, val] : attrs) {
+ push_op->attrset.emplace(std::move(key), std::move(val));
+ }
+ logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]);
+ object_info_t oi;
+ oi.decode_no_oid(push_op->attrset[OI_ATTR]);
+ new_progress.first = false;
+ return oi.version;
+ });
+}
+
+RecoveryBackend::interruptible_future<uint64_t>
+ReplicatedRecoveryBackend::read_object_for_push_op(
+ const hobject_t& oid,
+ const interval_set<uint64_t>& copy_subset,
+ uint64_t offset,
+ uint64_t max_len,
+ PushOp* push_op)
+{
+ if (max_len == 0 || copy_subset.empty()) {
+ push_op->data_included.clear();
+ return seastar::make_ready_future<uint64_t>(offset);
+ }
+ // 1. get the extents in the interested range
+ return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid},
+ 0, copy_subset.range_end())).safe_then_interruptible(
+ [=, this](auto&& fiemap_included) mutable {
+ interval_set<uint64_t> extents;
+ try {
+ extents.intersection_of(copy_subset, std::move(fiemap_included));
+ } catch (std::exception &) {
+ // if fiemap() fails, we will read nothing, as the intersection of
+ // copy_subset and an empty interval_set would be empty anyway
+ extents.clear();
+ }
+ // 2. we can read up to "max_len" bytes from "offset", so truncate the
+ // extents down to this quota. no need to return the number of consumed
+ // bytes, as this is the last consumer of this quota
+ push_op->data_included.span_of(extents, offset, max_len);
+ // 3. read the truncated extents
+ // TODO: check if the returned extents are pruned
+ return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid},
+ push_op->data_included, 0));
+ }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) {
+ push_op->data.claim_append(std::move(bl));
+ uint64_t recovered_to = 0;
+ if (push_op->data_included.empty()) {
+ // zero filled section, skip to end!
+ recovered_to = range_end;
+ } else {
+ // note down the progress, we will start from there next time
+ recovered_to = push_op->data_included.range_end();
+ }
+ return seastar::make_ready_future<uint64_t>(recovered_to);
+ }, PGBackend::read_errorator::all_same_way([](auto e) {
+ logger().debug("build_push_op: read exception");
+ return seastar::make_exception_future<uint64_t>(e);
+ }));
+}
+
+static std::optional<std::string> nullopt_if_empty(const std::string& s)
+{
+ return s.empty() ? std::nullopt : std::make_optional(s);
+}
+
+static bool is_too_many_entries_per_chunk(const PushOp* push_op)
+{
+ const uint64_t entries_per_chunk =
+ crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk;
+ if (!entries_per_chunk) {
+ // the limit is disabled
+ return false;
+ }
+ return push_op->omap_entries.size() >= entries_per_chunk;
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::read_omap_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ uint64_t& max_len,
+ PushOp* push_op)
+{
+ if (progress.omap_complete) {
+ return seastar::make_ready_future<>();
+ }
+ return seastar::repeat([&new_progress, &max_len, push_op, &oid, this] {
+ return shard_services.get_store().omap_get_values(
+ coll, ghobject_t{oid}, nullopt_if_empty(new_progress.omap_recovered_to)
+ ).safe_then([&new_progress, &max_len, push_op](const auto& ret) {
+ const auto& [done, kvs] = ret;
+ bool stop = done;
+ // assuming "values.empty() only if done" holds here!
+ for (const auto& [key, value] : kvs) {
+ if (is_too_many_entries_per_chunk(push_op)) {
+ stop = true;
+ break;
+ }
+ if (const uint64_t entry_size = key.size() + value.length();
+ entry_size > max_len) {
+ stop = true;
+ break;
+ } else {
+ max_len -= std::min(max_len, entry_size);
+ }
+ push_op->omap_entries.emplace(key, value);
+ }
+ if (!push_op->omap_entries.empty()) {
+ // we iterate in order
+ new_progress.omap_recovered_to = std::rbegin(push_op->omap_entries)->first;
+ }
+ if (done) {
+ new_progress.omap_complete = true;
+ }
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ stop ? seastar::stop_iteration::yes : seastar::stop_iteration::no
+ );
+ }, crimson::os::FuturizedStore::Shard::read_errorator::assert_all{});
+ });
+}
+
+std::vector<pg_shard_t>
+ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const
+{
+ std::vector<pg_shard_t> shards;
+ assert(pg.get_acting_recovery_backfill().size() > 0);
+ for (const auto& peer : pg.get_acting_recovery_backfill()) {
+ if (peer == pg.get_pg_whoami())
+ continue;
+ auto shard_missing =
+ pg.get_shard_missing().find(peer);
+ assert(shard_missing != pg.get_shard_missing().end());
+ if (shard_missing->second.is_missing(soid)) {
+ shards.push_back(shard_missing->first);
+ }
+ }
+ return shards;
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+ if (pg.can_discard_replica_op(*m)) {
+ logger().debug("{}: discarding {}", __func__, *m);
+ return seastar::now();
+ }
+ return seastar::do_with(m->take_pulls(), [this, from=m->from](auto& pulls) {
+ return interruptor::parallel_for_each(pulls,
+ [this, from](auto& pull_op) {
+ const hobject_t& soid = pull_op.soid;
+ logger().debug("handle_pull: {}", soid);
+ return backend->stat(coll, ghobject_t(soid)).then_interruptible(
+ [this, &pull_op](auto st) {
+ ObjectRecoveryInfo &recovery_info = pull_op.recovery_info;
+ ObjectRecoveryProgress &progress = pull_op.recovery_progress;
+ if (progress.first && recovery_info.size == ((uint64_t) -1)) {
+ // Adjust size and copy_subset
+ recovery_info.size = st.st_size;
+ if (st.st_size) {
+ interval_set<uint64_t> object_range;
+ object_range.insert(0, st.st_size);
+ recovery_info.copy_subset.intersection_of(object_range);
+ } else {
+ recovery_info.copy_subset.clear();
+ }
+ assert(recovery_info.clone_subset.empty());
+ }
+ return build_push_op(recovery_info, progress, 0);
+ }).then_interruptible([this, from](auto push_op) {
+ auto msg = crimson::make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pushes.push_back(std::move(push_op));
+ return shard_services.send_to_osd(from.osd, std::move(msg),
+ pg.get_osdmap_epoch());
+ });
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<bool>
+ReplicatedRecoveryBackend::_handle_pull_response(
+ pg_shard_t from,
+ PushOp& push_op,
+ PullOp* response,
+ ceph::os::Transaction* t)
+{
+ logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}",
+ push_op.recovery_info, push_op.after_progress,
+ push_op.data.length(), push_op.data_included);
+
+ const hobject_t &hoid = push_op.soid;
+ auto& recovery_waiter = get_recovering(hoid);
+ auto& pull_info = *recovery_waiter.pull_info;
+ if (pull_info.recovery_info.size == (uint64_t(-1))) {
+ pull_info.recovery_info.size = push_op.recovery_info.size;
+ pull_info.recovery_info.copy_subset.intersection_of(
+ push_op.recovery_info.copy_subset);
+ }
+
+ // If the primary doesn't have the object info and didn't know the version
+ if (pull_info.recovery_info.version == eversion_t())
+ pull_info.recovery_info.version = push_op.version;
+
+ auto prepare_waiter = interruptor::make_interruptible(
+ seastar::make_ready_future<>());
+ if (pull_info.recovery_progress.first) {
+ prepare_waiter = pg.obc_loader.with_obc<RWState::RWNONE>(
+ pull_info.recovery_info.soid,
+ [&pull_info, &recovery_waiter, &push_op](auto obc) {
+ pull_info.obc = obc;
+ recovery_waiter.obc = obc;
+ obc->obs.oi.decode_no_oid(push_op.attrset.at(OI_ATTR), push_op.soid);
+ pull_info.recovery_info.oi = obc->obs.oi;
+ return crimson::osd::PG::load_obc_ertr::now();
+ }).handle_error_interruptible(crimson::ct_error::assert_all{});
+ }
+ return prepare_waiter.then_interruptible(
+ [this, &pull_info, &push_op, t, response]() mutable {
+ const bool first = pull_info.recovery_progress.first;
+ pull_info.recovery_progress = push_op.after_progress;
+ logger().debug("new recovery_info {}, new progress {}",
+ pull_info.recovery_info, pull_info.recovery_progress);
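+ // the range this chunk advanced over on the pusher; extents in it that were
+ // not actually sent will be zeroed locally by submit_push_data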
+ interval_set<uint64_t> data_zeros;
+ {
+ uint64_t offset = push_op.before_progress.data_recovered_to;
+ uint64_t length = (push_op.after_progress.data_recovered_to -
+ push_op.before_progress.data_recovered_to);
+ if (length) {
+ data_zeros.insert(offset, length);
+ }
+ }
+ auto [usable_intervals, data] =
+ trim_pushed_data(pull_info.recovery_info.copy_subset,
+ push_op.data_included, push_op.data);
+ bool complete = pull_info.is_complete();
+ bool clear_omap = !push_op.before_progress.omap_complete;
+ return submit_push_data(pull_info.recovery_info,
+ first, complete, clear_omap,
+ std::move(data_zeros), std::move(usable_intervals),
+ std::move(data), std::move(push_op.omap_header),
+ push_op.attrset, std::move(push_op.omap_entries), t)
+ .then_interruptible(
+ [this, response, &pull_info, &push_op, complete,
+ t, bytes_recovered=data.length()] {
+ pull_info.stat.num_keys_recovered += push_op.omap_entries.size();
+ pull_info.stat.num_bytes_recovered += bytes_recovered;
+
+ if (complete) {
+ pull_info.stat.num_objects_recovered++;
+ pg.get_recovery_handler()->on_local_recover(
+ push_op.soid, get_recovering(push_op.soid).pull_info->recovery_info,
+ false, *t);
+ return true;
+ } else {
+ response->soid = push_op.soid;
+ response->recovery_info = pull_info.recovery_info;
+ response->recovery_progress = pull_info.recovery_progress;
+ return false;
+ }
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_pull_response(
+ Ref<MOSDPGPush> m)
+{
+ if (pg.can_discard_replica_op(*m)) {
+ logger().debug("{}: discarding {}", __func__, *m);
+ return seastar::now();
+ }
+ const PushOp& push_op = m->pushes[0]; //TODO: only one push per message for now.
+ if (push_op.version == eversion_t()) {
+ // replica doesn't have it!
+ pg.get_recovery_handler()->on_failed_recover({ m->from }, push_op.soid,
+ get_recovering(push_op.soid).pull_info->recovery_info.version);
+ return seastar::make_exception_future<>(
+ std::runtime_error(fmt::format(
+ "Error on pushing side {} when pulling obj {}",
+ m->from, push_op.soid)));
+ }
+
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::do_with(PullOp(), [this, m](auto& response) {
+ return seastar::do_with(ceph::os::Transaction(), m.get(),
+ [this, &response](auto& t, auto& m) {
+ pg_shard_t from = m->from;
+ PushOp& push_op = m->pushes[0]; // only one push per message for now
+ return _handle_pull_response(from, push_op, &response, &t
+ ).then_interruptible(
+ [this, &t](bool complete) {
+ epoch_t epoch_frozen = pg.get_osdmap_epoch();
+ logger().debug("ReplicatedRecoveryBackend::handle_pull_response: do_transaction...");
+ return shard_services.get_store().do_transaction(coll, std::move(t))
+ .then([this, epoch_frozen, complete,
+ last_complete = pg.get_info().last_complete] {
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ return seastar::make_ready_future<bool>(complete);
+ });
+ });
+ }).then_interruptible([this, m, &response](bool complete) {
+ if (complete) {
+ auto& push_op = m->pushes[0];
+ get_recovering(push_op.soid).set_pulled();
+ return seastar::make_ready_future<>();
+ } else {
+ auto reply = crimson::make_message<MOSDPGPull>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = pg.get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->set_pulls({std::move(response)});
+ return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch());
+ }
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::_handle_push(
+ pg_shard_t from,
+ PushOp &push_op,
+ PushReplyOp *response,
+ ceph::os::Transaction *t)
+{
+ logger().debug("{}", __func__);
+
+ bool first = push_op.before_progress.first;
+ interval_set<uint64_t> data_zeros;
+ {
+ uint64_t offset = push_op.before_progress.data_recovered_to;
+ uint64_t length = (push_op.after_progress.data_recovered_to -
+ push_op.before_progress.data_recovered_to);
+ if (length) {
+ data_zeros.insert(offset, length);
+ }
+ }
+ bool complete = (push_op.after_progress.data_complete &&
+ push_op.after_progress.omap_complete);
+ bool clear_omap = !push_op.before_progress.omap_complete;
+ response->soid = push_op.recovery_info.soid;
+
+ return submit_push_data(push_op.recovery_info, first, complete, clear_omap,
+ std::move(data_zeros),
+ std::move(push_op.data_included),
+ std::move(push_op.data),
+ std::move(push_op.omap_header),
+ push_op.attrset,
+ std::move(push_op.omap_entries), t)
+ .then_interruptible(
+ [this, complete, &push_op, t] {
+ if (complete) {
+ pg.get_recovery_handler()->on_local_recover(
+ push_op.recovery_info.soid, push_op.recovery_info,
+ false, *t);
+ }
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_push(
+ Ref<MOSDPGPush> m)
+{
+ if (pg.can_discard_replica_op(*m)) {
+ logger().debug("{}: discarding {}", __func__, *m);
+ return seastar::now();
+ }
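+ // a push arriving at the primary carries the data for a pull it issued
+ // earlier, so treat it as a pull response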
+ if (pg.is_primary()) {
+ return handle_pull_response(m);
+ }
+
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::do_with(PushReplyOp(), [this, m](auto& response) {
+ PushOp& push_op = m->pushes[0]; // TODO: only one push per message for now
+ return seastar::do_with(ceph::os::Transaction(),
+ [this, m, &push_op, &response](auto& t) {
+ return _handle_push(m->from, push_op, &response, &t).then_interruptible(
+ [this, &t] {
+ epoch_t epoch_frozen = pg.get_osdmap_epoch();
+ logger().debug("ReplicatedRecoveryBackend::handle_push: do_transaction...");
+ return interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll, std::move(t))).then_interruptible(
+ [this, epoch_frozen, last_complete = pg.get_info().last_complete] {
+ //TODO: this should be grouped with pg.on_local_recover somehow.
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ });
+ });
+ }).then_interruptible([this, m, &response]() mutable {
+ auto reply = crimson::make_message<MOSDPGPushReply>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = pg.get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ std::vector<PushReplyOp> replies = { std::move(response) };
+ reply->replies.swap(replies);
+ return shard_services.send_to_osd(m->from.osd,
+ std::move(reply), pg.get_osdmap_epoch());
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<std::optional<PushOp>>
+ReplicatedRecoveryBackend::_handle_push_reply(
+ pg_shard_t peer,
+ const PushReplyOp &op)
+{
+ const hobject_t& soid = op.soid;
+ logger().debug("{}, soid {}, from {}", __func__, soid, peer);
+ auto recovering_iter = recovering.find(soid);
+ if (recovering_iter == recovering.end()
+ || !recovering_iter->second->pushing.count(peer)) {
+ logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ } else {
+ auto& push_info = recovering_iter->second->pushing[peer];
+ bool error = push_info.recovery_progress.error;
+ if (!push_info.recovery_progress.data_complete && !error) {
+ return build_push_op(push_info.recovery_info, push_info.recovery_progress,
+ &push_info.stat
+ ).then_interruptible([&push_info] (auto push_op) {
+ push_info.recovery_progress = push_op.after_progress;
+ return seastar::make_ready_future<std::optional<PushOp>>(
+ std::move(push_op));
+ }).handle_exception_interruptible(
+ [recovering_iter, &push_info, peer] (auto e) {
+ push_info.recovery_progress.error = true;
+ recovering_iter->second->set_push_failed(peer, e);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ });
+ }
+ if (!error) {
+ pg.get_recovery_handler()->on_peer_recover(peer,
+ soid,
+ push_info.recovery_info);
+ }
+ recovering_iter->second->set_pushed(peer);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_push_reply(
+ Ref<MOSDPGPushReply> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+ auto from = m->from;
+ auto& push_reply = m->replies[0]; //TODO: only one reply per message
+
+ return _handle_push_reply(from, push_reply).then_interruptible(
+ [this, from](std::optional<PushOp> push_op) {
+ if (push_op) {
+ auto msg = crimson::make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pushes.push_back(std::move(*push_op));
+ return shard_services.send_to_osd(from.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch());
+ } else {
+ return seastar::make_ready_future<>();
+ }
+ });
+}
+
+std::pair<interval_set<uint64_t>,
+ bufferlist>
+ReplicatedRecoveryBackend::trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::bufferlist data_received)
+{
+ logger().debug("{}", __func__);
+ // what i have is only a subset of what i want
+ if (intervals_received.subset_of(copy_subset)) {
+ return {intervals_received, data_received};
+ }
+ // only collect the extents included by copy_subset and intervals_received
+ interval_set<uint64_t> intervals_usable;
+ bufferlist data_usable;
+ intervals_usable.intersection_of(copy_subset, intervals_received);
+ uint64_t have_off = 0;
+ for (auto [have_start, have_len] : intervals_received) {
+ interval_set<uint64_t> want;
+ want.insert(have_start, have_len);
+ want.intersection_of(copy_subset);
+ for (auto [want_start, want_len] : want) {
+ bufferlist sub;
+ uint64_t data_off = have_off + (want_start - have_start);
+ sub.substr_of(data_received, data_off, want_len);
+ data_usable.claim_append(sub);
+ }
+ have_off += have_len;
+ }
+ return {intervals_usable, data_usable};
+}
+
+RecoveryBackend::interruptible_future<hobject_t>
+ReplicatedRecoveryBackend::prep_push_target(
+ const ObjectRecoveryInfo& recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ ObjectStore::Transaction* t,
+ const map<string, bufferlist, less<>>& attrs,
+ bufferlist&& omap_header)
+{
+ if (!first) {
+ return seastar::make_ready_future<hobject_t>(
+ get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version));
+ }
+
+ ghobject_t target_oid;
+ if (complete) {
+ // overwrite the original object
+ target_oid = ghobject_t(recovery_info.soid);
+ } else {
+ target_oid = ghobject_t(get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version));
+ logger().debug("{}: Adding oid {} to the temp collection",
+ __func__, target_oid);
+ add_temp_obj(target_oid.hobj);
+ }
+ // create a new object
+ if (!complete || !recovery_info.object_exist) {
+ t->remove(coll->get_cid(), target_oid);
+ t->touch(coll->get_cid(), target_oid);
+ object_info_t oi;
+ oi.decode_no_oid(attrs.at(OI_ATTR));
+ t->set_alloc_hint(coll->get_cid(), target_oid,
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ }
+ if (complete) {
+ // when overwriting the original object, remove its xattrs first; they are re-set later
+ t->rmattrs(coll->get_cid(), target_oid);
+ // if the omap needs updating, clear the previous content first
+ if (clear_omap) {
+ t->omap_clear(coll->get_cid(), target_oid);
+ }
+ }
+ t->truncate(coll->get_cid(), target_oid, recovery_info.size);
+ if (omap_header.length()) {
+ t->omap_setheader(coll->get_cid(), target_oid, omap_header);
+ }
+ if (complete || !recovery_info.object_exist) {
+ return seastar::make_ready_future<hobject_t>(target_oid.hobj);
+ }
+ // clone overlapping content from the local object when recovering into a new (temp) object
+ return interruptor::make_interruptible(store->stat(coll, ghobject_t(recovery_info.soid)))
+ .then_interruptible(
+ [this, &recovery_info, t, target_oid] (auto st) {
+ // TODO: pg num bytes counting
+ uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+ interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+ if (local_size) {
+ local_intervals_included.insert(0, local_size);
+ local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+ local_intervals_included.subtract(local_intervals_excluded);
+ }
+ for (auto [off, len] : local_intervals_included) {
+ logger().debug(" clone_range {} {}~{}",
+ recovery_info.soid, off, len);
+ t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid),
+ target_oid, off, len, off);
+ }
+ return seastar::make_ready_future<hobject_t>(target_oid.hobj);
+ });
+}
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ interval_set<uint64_t>&& data_zeros,
+ interval_set<uint64_t>&& intervals_included,
+ bufferlist&& data_included,
+ bufferlist&& omap_header,
+ const map<string, bufferlist, less<>> &attrs,
+ map<string, bufferlist>&& omap_entries,
+ ObjectStore::Transaction *t)
+{
+ logger().debug("{}", __func__);
+ return prep_push_target(recovery_info, first, complete,
+ clear_omap, t, attrs,
+ std::move(omap_header)).then_interruptible(
+ [this,
+ &recovery_info, t,
+ first, complete,
+ data_zeros=std::move(data_zeros),
+ intervals_included=std::move(intervals_included),
+ data_included=std::move(data_included),
+ omap_entries=std::move(omap_entries),
+ &attrs](auto target_oid) mutable {
+
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+ // Punch zeros for data, if fiemap indicates nothing but it is marked dirty
+ if (!data_zeros.empty()) {
+ data_zeros.intersection_of(recovery_info.copy_subset);
+ assert(intervals_included.subset_of(data_zeros));
+ data_zeros.subtract(intervals_included);
+
+ logger().debug("submit_push_data recovering object {} copy_subset: {} "
+ "intervals_included: {} data_zeros: {}",
+ recovery_info.soid, recovery_info.copy_subset,
+ intervals_included, data_zeros);
+
+ for (auto [start, len] : data_zeros) {
+ t->zero(coll->get_cid(), ghobject_t(target_oid), start, len);
+ }
+ }
+ uint64_t off = 0;
+ for (auto [start, len] : intervals_included) {
+ bufferlist bit;
+ bit.substr_of(data_included, off, len);
+ t->write(coll->get_cid(), ghobject_t(target_oid),
+ start, len, bit, fadvise_flags);
+ off += len;
+ }
+
+ if (!omap_entries.empty())
+ t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries);
+ if (!attrs.empty())
+ t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs);
+
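+ // when the recovery completes and spanned multiple chunks, move the temp
+ // object into place over the original before finishing up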
+ if (complete) {
+ if (!first) {
+ logger().debug("submit_push_data: Removing oid {} from the temp collection",
+ target_oid);
+ clear_temp_obj(target_oid);
+ t->remove(coll->get_cid(), ghobject_t(recovery_info.soid));
+ t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid),
+ coll->get_cid(), ghobject_t(recovery_info.soid));
+ }
+ submit_push_complete(recovery_info, t);
+ }
+ logger().debug("submit_push_data: done");
+ return seastar::make_ready_future<>();
+ });
+}
+
+void ReplicatedRecoveryBackend::submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
+{
+ for (const auto& [oid, extents] : recovery_info.clone_subset) {
+ for (const auto& [off, len] : extents) {
+ logger().debug(" clone_range {} {}~{}", oid, off, len);
+ t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid),
+ off, len, off);
+ }
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m)
+{
+ auto& p = m->objects.front();
+ hobject_t soid = p.first;
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.version = p.second;
+ pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info);
+ get_recovering(soid).set_pushed(m->from);
+ return seastar::now();
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn)
+{
+ switch (m->get_header().type) {
+ case MSG_OSD_PG_PULL:
+ return handle_pull(boost::static_pointer_cast<MOSDPGPull>(m));
+ case MSG_OSD_PG_PUSH:
+ return handle_push(boost::static_pointer_cast<MOSDPGPush>(m));
+ case MSG_OSD_PG_PUSH_REPLY:
+ return handle_push_reply(
+ boost::static_pointer_cast<MOSDPGPushReply>(m));
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return handle_recovery_delete(
+ boost::static_pointer_cast<MOSDPGRecoveryDelete>(m));
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return handle_recovery_delete_reply(
+ boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m));
+ default:
+ // delegate to parent class for handling backend-agnostic recovery ops.
+ return RecoveryBackend::handle_recovery_op(std::move(m), conn);
+ }
+}
+
diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h
new file mode 100644
index 000000000..b023b7417
--- /dev/null
+++ b/src/crimson/osd/replicated_recovery_backend.h
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/interruptible_future.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/recovery_backend.h"
+
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+#include "os/ObjectStore.h"
+
+class ReplicatedRecoveryBackend : public RecoveryBackend {
+public:
+ ReplicatedRecoveryBackend(crimson::osd::PG& pg,
+ crimson::osd::ShardServices& shard_services,
+ crimson::os::CollectionRef coll,
+ PGBackend* backend)
+ : RecoveryBackend(pg, shard_services, coll, backend)
+ {}
+ interruptible_future<> handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn) final;
+
+ interruptible_future<> recover_object(
+ const hobject_t& soid,
+ eversion_t need) final;
+ interruptible_future<> recover_delete(
+ const hobject_t& soid,
+ eversion_t need) final;
+ interruptible_future<> push_delete(
+ const hobject_t& soid,
+ eversion_t need) final;
+protected:
+ interruptible_future<> handle_pull(
+ Ref<MOSDPGPull> m);
+ interruptible_future<> handle_pull_response(
+ Ref<MOSDPGPush> m);
+ interruptible_future<> handle_push(
+ Ref<MOSDPGPush> m);
+ interruptible_future<> handle_push_reply(
+ Ref<MOSDPGPushReply> m);
+ interruptible_future<> handle_recovery_delete(
+ Ref<MOSDPGRecoveryDelete> m);
+ interruptible_future<> handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m);
+ interruptible_future<PushOp> prep_push(
+ const hobject_t& soid,
+ eversion_t need,
+ pg_shard_t pg_shard);
+ void prepare_pull(
+ PullOp& pull_op,
+ pull_info_t& pull_info,
+ const hobject_t& soid,
+ eversion_t need);
+ std::vector<pg_shard_t> get_shards_to_push(
+ const hobject_t& soid) const;
+ interruptible_future<PushOp> build_push_op(
+ const ObjectRecoveryInfo& recovery_info,
+ const ObjectRecoveryProgress& progress,
+ object_stat_sum_t* stat);
+ /// @returns true if this push op is the last push op for
+ /// recovering @c push_op.soid
+ interruptible_future<bool> _handle_pull_response(
+ pg_shard_t from,
+ PushOp& push_op,
+ PullOp* response,
+ ceph::os::Transaction* t);
+ std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::bufferlist data_received);
+ interruptible_future<> submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ interval_set<uint64_t>&& data_zeros,
+ interval_set<uint64_t>&& intervals_included,
+ ceph::bufferlist&& data_included,
+ ceph::bufferlist&& omap_header,
+ const std::map<std::string, bufferlist, std::less<>> &attrs,
+ std::map<std::string, bufferlist>&& omap_entries,
+ ceph::os::Transaction *t);
+ void submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+ interruptible_future<> _handle_push(
+ pg_shard_t from,
+ PushOp& push_op,
+ PushReplyOp *response,
+ ceph::os::Transaction *t);
+ interruptible_future<std::optional<PushOp>> _handle_push_reply(
+ pg_shard_t peer,
+ const PushReplyOp &op);
+ interruptible_future<> on_local_recover_persist(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& _recovery_info,
+ bool is_delete,
+ epoch_t epoch_to_freeze);
+ interruptible_future<> local_recover_delete(
+ const hobject_t& soid,
+ eversion_t need,
+ epoch_t epoch_frozen);
+ seastar::future<> on_stop() final {
+ return seastar::now();
+ }
+private:
+ /// pull missing object from peer
+ interruptible_future<> maybe_pull_missing_obj(
+ const hobject_t& soid,
+ eversion_t need);
+
+ /// load object context for recovery if it is not ready yet
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+ using load_obc_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_obc_ertr>;
+
+ interruptible_future<> maybe_push_shards(
+ const hobject_t& soid,
+ eversion_t need);
+
+ /// read the metadata (omap header and xattrs) attached to the given object.
+ /// it is expected to be relatively small.
+ ///
+ /// @return @c oi.version
+ interruptible_future<eversion_t> read_metadata_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ eversion_t ver,
+ PushOp* push_op);
+ /// read the remaining extents of object to be recovered and fill push_op
+ /// with them
+ ///
+ /// @param oid object being recovered
+ /// @param copy_subset extents we want
+ /// @param offset the offset in object from where we should read
+ /// @return the new offset
+ interruptible_future<uint64_t> read_object_for_push_op(
+ const hobject_t& oid,
+ const interval_set<uint64_t>& copy_subset,
+ uint64_t offset,
+ uint64_t max_len,
+ PushOp* push_op);
+ interruptible_future<> read_omap_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ uint64_t& max_len,
+ PushOp* push_op);
+ interruptible_future<hobject_t> prep_push_target(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ ObjectStore::Transaction* t,
+ const std::map<std::string, bufferlist, std::less<>> &attrs,
+ bufferlist&& omap_header);
+ using interruptor = crimson::interruptible::interruptor<
+ crimson::osd::IOInterruptCondition>;
+};
diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc
new file mode 100644
index 000000000..006e4816c
--- /dev/null
+++ b/src/crimson/osd/scheduler/mclock_scheduler.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <memory>
+#include <functional>
+
+#include "crimson/osd/scheduler/mclock_scheduler.h"
+#include "common/dout.h"
+
+namespace dmc = crimson::dmclock;
+using namespace std::placeholders;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout
+
+
+namespace crimson::osd::scheduler {
+
+mClockScheduler::mClockScheduler(ConfigProxy &conf) :
+ scheduler(
+ std::bind(&mClockScheduler::ClientRegistry::get_info,
+ &client_registry,
+ _1),
+ dmc::AtLimit::Allow,
+ conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+{
+ conf.add_observer(this);
+ client_registry.update_from_config(conf);
+}
+
+void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+{
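+ // each res/wgt/lim config triplet feeds dmclock's reservation, weight and
+ // limit for the corresponding client class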
+ default_external_client_info.update(
+ conf.get_val<double>("osd_mclock_scheduler_client_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
+ conf.get_val<double>("osd_mclock_scheduler_client_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(scheduler_class_t::background_recovery)].update(
+ conf.get_val<double>("osd_mclock_scheduler_background_recovery_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
+ conf.get_val<double>("osd_mclock_scheduler_background_recovery_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(scheduler_class_t::background_best_effort)].update(
+ conf.get_val<double>("osd_mclock_scheduler_background_best_effort_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
+ conf.get_val<double>("osd_mclock_scheduler_background_best_effort_lim"));
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
+ const client_profile_id_t &client) const
+{
+ auto ret = external_client_infos.find(client);
+ if (ret == external_client_infos.end())
+ return &default_external_client_info;
+ else
+ return &(ret->second);
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
+ const scheduler_id_t &id) const {
+ switch (id.class_id) {
+ case scheduler_class_t::immediate:
+ ceph_assert(0 == "Cannot schedule immediate");
+ return (dmc::ClientInfo*)nullptr;
+ case scheduler_class_t::repop:
+ case scheduler_class_t::client:
+ return get_external_client(id.client_profile_id);
+ default:
+ ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
+ return &internal_client_infos[static_cast<size_t>(id.class_id)];
+ }
+}
+
+void mClockScheduler::dump(ceph::Formatter &f) const
+{
+}
+
+void mClockScheduler::enqueue(item_t&& item)
+{
+ auto id = get_scheduler_id(item);
+ auto cost = item.params.cost;
+
+ if (scheduler_class_t::immediate == item.params.klass) {
+ immediate.push_front(std::move(item));
+ } else {
+ scheduler.add_request(
+ std::move(item),
+ id,
+ cost);
+ }
+}
+
+void mClockScheduler::enqueue_front(item_t&& item)
+{
+ immediate.push_back(std::move(item));
+ // TODO: item may not be immediate, update mclock machinery to permit
+ // putting the item back in the queue
+}
+
+item_t mClockScheduler::dequeue()
+{
+ if (!immediate.empty()) {
+ auto ret = std::move(immediate.back());
+ immediate.pop_back();
+ return ret;
+ } else {
+ mclock_queue_t::PullReq result = scheduler.pull_request();
+ if (result.is_future()) {
+ ceph_assert(
+ 0 == "Not implemented, user would have to be able to be woken up");
+ return std::move(*(item_t*)nullptr);
+ } else if (result.is_none()) {
+ ceph_assert(
+ 0 == "Impossible, must have checked empty() first");
+ return std::move(*(item_t*)nullptr);
+ } else {
+ ceph_assert(result.is_retn());
+
+ auto &retn = result.get_retn();
+ return std::move(*retn.request);
+ }
+ }
+}
+
+const char** mClockScheduler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_mclock_scheduler_client_res",
+ "osd_mclock_scheduler_client_wgt",
+ "osd_mclock_scheduler_client_lim",
+ "osd_mclock_scheduler_background_recovery_res",
+ "osd_mclock_scheduler_background_recovery_wgt",
+ "osd_mclock_scheduler_background_recovery_lim",
+ "osd_mclock_scheduler_background_best_effort_res",
+ "osd_mclock_scheduler_background_best_effort_wgt",
+ "osd_mclock_scheduler_background_best_effort_lim",
+ NULL
+ };
+ return KEYS;
+}
+
+void mClockScheduler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ client_registry.update_from_config(conf);
+}
+
+}
diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h
new file mode 100644
index 000000000..153fc758b
--- /dev/null
+++ b/src/crimson/osd/scheduler/mclock_scheduler.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <ostream>
+#include <map>
+#include <vector>
+
+#include "boost/variant.hpp"
+
+#include "dmclock/src/dmclock_server.h"
+
+#include "crimson/osd/scheduler/scheduler.h"
+#include "common/config.h"
+#include "common/ceph_context.h"
+
+
+namespace crimson::osd::scheduler {
+
+using client_id_t = uint64_t;
+using profile_id_t = uint64_t;
+
+struct client_profile_id_t {
+ client_id_t client_id;
+ profile_id_t profile_id;
+ auto operator<=>(const client_profile_id_t&) const = default;
+};
+
+
+struct scheduler_id_t {
+ scheduler_class_t class_id;
+ client_profile_id_t client_profile_id;
+ auto operator<=>(const scheduler_id_t&) const = default;
+};
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
+ */
+class mClockScheduler : public Scheduler, md_config_obs_t {
+
+ class ClientRegistry {
+ std::array<
+ crimson::dmclock::ClientInfo,
+ static_cast<size_t>(scheduler_class_t::client)
+ > internal_client_infos = {
+ // Placeholder, gets replaced with configured values
+ crimson::dmclock::ClientInfo(1, 1, 1),
+ crimson::dmclock::ClientInfo(1, 1, 1)
+ };
+
+ crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+ std::map<client_profile_id_t,
+ crimson::dmclock::ClientInfo> external_client_infos;
+ const crimson::dmclock::ClientInfo *get_external_client(
+ const client_profile_id_t &client) const;
+ public:
+ void update_from_config(const ConfigProxy &conf);
+ const crimson::dmclock::ClientInfo *get_info(
+ const scheduler_id_t &id) const;
+ } client_registry;
+
+ using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+ scheduler_id_t,
+ item_t,
+ true,
+ true,
+ 2>;
+ mclock_queue_t scheduler;
+ std::list<item_t> immediate;
+
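+ // external clients are currently keyed only by their owner; profile_id is
+ // always left at 0 for now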
+ static scheduler_id_t get_scheduler_id(const item_t &item) {
+ return scheduler_id_t{
+ item.params.klass,
+ client_profile_id_t{
+ item.params.owner,
+ 0
+ }
+ };
+ }
+
+public:
+ mClockScheduler(ConfigProxy &conf);
+
+ // Enqueue op in the back of the regular queue
+ void enqueue(item_t &&item) final;
+
+ // Enqueue the op in the front of the regular queue
+ void enqueue_front(item_t &&item) final;
+
+ // Return an op to be dispatched
+ item_t dequeue() final;
+
+ // Returns whether the queue is empty
+ bool empty() const final {
+ return immediate.empty() && scheduler.empty();
+ }
+
+ // Formatted output of the queue
+ void dump(ceph::Formatter &f) const final;
+
+ void print(std::ostream &ostream) const final {
+ ostream << "mClockScheduler";
+ }
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+};
+
+}
diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc
new file mode 100644
index 000000000..c85cb388e
--- /dev/null
+++ b/src/crimson/osd/scheduler/scheduler.cc
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <ostream>
+
+#include <seastar/core/print.hh>
+
+#include "crimson/osd/scheduler/scheduler.h"
+#include "crimson/osd/scheduler/mclock_scheduler.h"
+#include "common/WeightedPriorityQueue.h"
+
+namespace crimson::osd::scheduler {
+
+std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c)
+{
+ switch (c) {
+ case scheduler_class_t::background_best_effort:
+ return lhs << "background_best_effort";
+ case scheduler_class_t::background_recovery:
+ return lhs << "background_recovery";
+ case scheduler_class_t::client:
+ return lhs << "client";
+ case scheduler_class_t::repop:
+ return lhs << "repop";
+ case scheduler_class_t::immediate:
+ return lhs << "immediate";
+ default:
+ return lhs;
+ }
+}
+
+/**
+ * Implements Scheduler in terms of OpQueue
+ *
+ * Templated on the queue type to avoid dynamic dispatch; T should implement
+ * OpQueue<item_t, client_t>. This adapter is mainly responsible for the
+ * boilerplate priority cutoff/strict-queue concept needed by OpQueue-based
+ * implementations.
+ */
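+// For example (see get_io_prio_cut below): with osd_op_queue_cut_off == "high"
+// the cutoff is scheduler_class_t::immediate, so only immediate ops use the
+// strict queue; otherwise the cutoff is scheduler_class_t::repop, so both repop
+// and immediate ops bypass the weighted queue.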
+template <typename T>
+class ClassedOpQueueScheduler final : public Scheduler {
+ const scheduler_class_t cutoff;
+ T queue;
+
+ using priority_t = uint64_t;
+ std::array<
+ priority_t,
+ static_cast<size_t>(scheduler_class_t::immediate)
+ > priority_map = {
+ // Placeholder, gets replaced with configured values
+ 0, 0, 0
+ };
+
+ static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) {
+ if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+ srand(time(NULL));
+ return (rand() % 2 < 1) ?
+ scheduler_class_t::repop : scheduler_class_t::immediate;
+ } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+ return scheduler_class_t::immediate;
+ } else {
+ return scheduler_class_t::repop;
+ }
+ }
+
+ bool use_strict(scheduler_class_t kl) const {
+ return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff);
+ }
+
+ priority_t get_priority(scheduler_class_t kl) const {
+ ceph_assert(static_cast<size_t>(kl) <
+ static_cast<size_t>(scheduler_class_t::immediate));
+ return priority_map[static_cast<size_t>(kl)];
+ }
+
+public:
+ template <typename... Args>
+ ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) :
+ cutoff(get_io_prio_cut(conf)),
+ queue(std::forward<Args>(args)...)
+ {
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::background_best_effort)
+ ] = conf.get_val<uint64_t>("osd_scrub_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::background_recovery)
+ ] = conf.get_val<uint64_t>("osd_recovery_op_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::client)
+ ] = conf.get_val<uint64_t>("osd_client_op_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::repop)
+ ] = conf.get_val<uint64_t>("osd_client_op_priority");
+ }
+
+ void enqueue(item_t &&item) final {
+ if (use_strict(item.params.klass))
+ queue.enqueue_strict(
+ item.params.owner, get_priority(item.params.klass), std::move(item));
+ else
+ queue.enqueue(
+ item.params.owner, get_priority(item.params.klass),
+ item.params.cost, std::move(item));
+ }
+
+ void enqueue_front(item_t &&item) final {
+ if (use_strict(item.params.klass))
+ queue.enqueue_strict_front(
+ item.params.owner, get_priority(item.params.klass), std::move(item));
+ else
+ queue.enqueue_front(
+ item.params.owner, get_priority(item.params.klass),
+ item.params.cost, std::move(item));
+ }
+
+ bool empty() const final {
+ return queue.empty();
+ }
+
+ item_t dequeue() final {
+ return queue.dequeue();
+ }
+
+ void dump(ceph::Formatter &f) const final {
+ return queue.dump(&f);
+ }
+
+ void print(std::ostream &out) const final {
+ out << "ClassedOpQueueScheduler(queue=";
+ queue.print(out);
+ out << ", cutoff=" << cutoff << ")";
+ }
+
+ ~ClassedOpQueueScheduler() final {};
+};
+
+SchedulerRef make_scheduler(ConfigProxy &conf)
+{
+ const std::string _type = conf.get_val<std::string>("osd_op_queue");
+ const std::string *type = &_type;
+ if (*type == "debug_random") {
+ static const std::string index_lookup[] = { "mclock_scheduler",
+ "wpq" };
+ srand(time(NULL));
+ unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0]));
+ type = &index_lookup[which];
+ }
+
+ if (*type == "wpq" ) {
+ // default is 'wpq'
+ return std::make_unique<
+ ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>(
+ conf,
+ conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"),
+ conf->osd_op_pq_min_cost
+ );
+ } else if (*type == "mclock_scheduler") {
+ return std::make_unique<mClockScheduler>(conf);
+ } else {
+ ceph_assert("Invalid choice of wq" == 0);
+ return std::unique_ptr<mClockScheduler>();
+ }
+}
+
+std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) {
+ rhs.print(lhs);
+ return lhs;
+}
+
+}
diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h
new file mode 100644
index 000000000..a014991ab
--- /dev/null
+++ b/src/crimson/osd/scheduler/scheduler.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <ostream>
+
+#include "crimson/common/config_proxy.h"
+
+namespace crimson::osd::scheduler {
+
+enum class scheduler_class_t : uint8_t {
+ background_best_effort = 0,
+ background_recovery,
+ client,
+ repop,
+ immediate,
+};
+
+std::ostream &operator<<(std::ostream &, const scheduler_class_t &);
+
+using client_t = uint64_t;
+using cost_t = uint64_t;
+
+struct params_t {
+ cost_t cost = 1;
+ client_t owner;
+ scheduler_class_t klass;
+};
+
+struct item_t {
+ params_t params;
+ seastar::promise<> wake;
+};
+
+/**
+ * Base interface for classes responsible for choosing
+ * op processing order in the OSD.
+ */
+class Scheduler {
+public:
+ // Enqueue op for scheduling
+ virtual void enqueue(item_t &&item) = 0;
+
+ // Enqueue op for processing as though it were enqueued prior
+ // to other items already scheduled.
+ virtual void enqueue_front(item_t &&item) = 0;
+
+ // Returns true iff there are no ops scheduled
+ virtual bool empty() const = 0;
+
+ // Return next op to be processed
+ virtual item_t dequeue() = 0;
+
+ // Dump a formatted representation of the queue
+ virtual void dump(ceph::Formatter &f) const = 0;
+
+ // Print a brief human-readable description with relevant parameters
+ virtual void print(std::ostream &out) const = 0;
+
+ // Destructor
+ virtual ~Scheduler() {};
+};
+
+std::ostream &operator<<(std::ostream &lhs, const Scheduler &);
+using SchedulerRef = std::unique_ptr<Scheduler>;
+
+SchedulerRef make_scheduler(ConfigProxy &);
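+
+// Minimal usage sketch (illustrative only; assumes a ConfigProxy such as the one
+// returned by crimson::common::local_conf(), and omits error handling and the
+// surrounding reactor setup):
+//
+//   SchedulerRef sched = make_scheduler(conf);
+//   item_t item;
+//   item.params = params_t{1 /* cost */, 42 /* owner */, scheduler_class_t::client};
+//   sched->enqueue(std::move(item));
+//   if (!sched->empty()) {
+//     item_t next = sched->dequeue();
+//     next.wake.set_value();  // wake whichever op is waiting on this item's promise
+//   }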
+
+}
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
new file mode 100644
index 000000000..a6431305d
--- /dev/null
+++ b/src/crimson/osd/shard_services.cc
@@ -0,0 +1,761 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/smart_ptr/make_local_shared.hpp>
+
+#include "crimson/osd/shard_services.h"
+
+#include "messages/MOSDAlive.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGTemp.h"
+
+#include "osd/osd_perf_counters.h"
+#include "osd/PeeringState.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/mgr/client.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/net/Connection.h"
+#include "crimson/os/cyanstore/cyan_store.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_meta.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::vector;
+
+namespace crimson::osd {
+
+PerShardState::PerShardState(
+ int whoami,
+ ceph::mono_time startup_time,
+ PerfCounters *perf,
+ PerfCounters *recoverystate_perf,
+ crimson::os::FuturizedStore &store,
+ OSDState &osd_state)
+ : whoami(whoami),
+ store(store.get_sharded_store()),
+ osd_state(osd_state),
+ osdmap_gate("PerShardState::osdmap_gate"),
+ perf(perf), recoverystate_perf(recoverystate_perf),
+ throttler(crimson::common::local_conf()),
+ next_tid(
+ static_cast<ceph_tid_t>(seastar::this_shard_id()) <<
+ (std::numeric_limits<ceph_tid_t>::digits - 8)),
+ startup_time(startup_time)
+{}
+
+seastar::future<> PerShardState::dump_ops_in_flight(Formatter *f) const
+{
+ registry.for_each_op([f](const auto &op) {
+ op.dump(f);
+ });
+ return seastar::now();
+}
+
+seastar::future<> PerShardState::stop_pgs()
+{
+ assert_core();
+ return seastar::parallel_for_each(
+ pg_map.get_pgs(),
+ [](auto& p) {
+ return p.second->stop();
+ });
+}
+
+std::map<pg_t, pg_stat_t> PerShardState::get_pg_stats() const
+{
+ assert_core();
+ std::map<pg_t, pg_stat_t> ret;
+ for (auto [pgid, pg] : pg_map.get_pgs()) {
+ if (pg->is_primary()) {
+ auto stats = pg->get_stats();
+ // todo: update reported_epoch,reported_seq,last_fresh
+ stats.reported_epoch = osdmap->get_epoch();
+ ret.emplace(pgid.pgid, std::move(stats));
+ }
+ }
+ return ret;
+}
+
+seastar::future<> PerShardState::broadcast_map_to_pgs(
+ ShardServices &shard_services,
+ epoch_t epoch)
+{
+ assert_core();
+ auto &pgs = pg_map.get_pgs();
+ return seastar::parallel_for_each(
+ pgs.begin(), pgs.end(),
+ [=, &shard_services](auto& pg) {
+ return shard_services.start_operation<PGAdvanceMap>(
+ shard_services,
+ pg.second, epoch,
+ PeeringCtx{}, false).second;
+ });
+}
+
+Ref<PG> PerShardState::get_pg(spg_t pgid)
+{
+ assert_core();
+ return pg_map.get_pg(pgid);
+}
+
+HeartbeatStampsRef PerShardState::get_hb_stamps(int peer)
+{
+ assert_core();
+ auto [stamps, added] = heartbeat_stamps.try_emplace(peer);
+ if (added) {
+ stamps->second = ceph::make_ref<HeartbeatStamps>(peer);
+ }
+ return stamps->second;
+}
+
+OSDSingletonState::OSDSingletonState(
+ int whoami,
+ crimson::net::Messenger &cluster_msgr,
+ crimson::net::Messenger &public_msgr,
+ crimson::mon::Client &monc,
+ crimson::mgr::Client &mgrc)
+ : whoami(whoami),
+ cluster_msgr(cluster_msgr),
+ public_msgr(public_msgr),
+ monc(monc),
+ mgrc(mgrc),
+ local_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_backfills,
+ crimson::common::local_conf()->osd_min_recovery_priority),
+ remote_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_backfills,
+ crimson::common::local_conf()->osd_min_recovery_priority),
+ snap_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_trimming_pgs)
+{
+ crimson::common::local_conf().add_observer(this);
+ osdmaps[0] = boost::make_local_shared<OSDMap>();
+
+ perf = build_osd_logger(&cct);
+ cct.get_perfcounters_collection()->add(perf);
+
+ recoverystate_perf = build_recoverystate_perf(&cct);
+ cct.get_perfcounters_collection()->add(recoverystate_perf);
+}
+
+seastar::future<> OSDSingletonState::send_to_osd(
+ int peer, MessageURef m, epoch_t from_epoch)
+{
+ if (osdmap->is_down(peer)) {
+ logger().info("{}: osd.{} is_down", __func__, peer);
+ return seastar::now();
+ } else if (osdmap->get_info(peer).up_from > from_epoch) {
+ logger().info("{}: osd.{} {} > {}", __func__, peer,
+ osdmap->get_info(peer).up_from, from_epoch);
+ return seastar::now();
+ } else {
+ auto conn = cluster_msgr.connect(
+ osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD);
+ return conn->send(std::move(m));
+ }
+}
+
+seastar::future<> OSDSingletonState::osdmap_subscribe(
+ version_t epoch, bool force_request)
+{
+ logger().info("{}({})", __func__, epoch);
+ if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
+ force_request) {
+ return monc.renew_subs();
+ } else {
+ return seastar::now();
+ }
+}
+
+void OSDSingletonState::queue_want_pg_temp(
+ pg_t pgid,
+ const vector<int>& want,
+ bool forced)
+{
+ auto p = pg_temp_pending.find(pgid);
+ if (p == pg_temp_pending.end() ||
+ p->second.acting != want ||
+ forced) {
+ pg_temp_wanted[pgid] = {want, forced};
+ }
+}
+
+void OSDSingletonState::remove_want_pg_temp(pg_t pgid)
+{
+ pg_temp_wanted.erase(pgid);
+ pg_temp_pending.erase(pgid);
+}
+
+void OSDSingletonState::requeue_pg_temp()
+{
+ unsigned old_wanted = pg_temp_wanted.size();
+ unsigned old_pending = pg_temp_pending.size();
+ pg_temp_wanted.merge(pg_temp_pending);
+ pg_temp_pending.clear();
+ logger().debug(
+ "{}: {} + {} -> {}",
+ __func__,
+ old_wanted,
+ old_pending,
+ pg_temp_wanted.size());
+}
+
+seastar::future<> OSDSingletonState::send_pg_temp()
+{
+ if (pg_temp_wanted.empty())
+ return seastar::now();
+ logger().debug("{}: {}", __func__, pg_temp_wanted);
+ MURef<MOSDPGTemp> ms[2] = {nullptr, nullptr};
+ for (auto& [pgid, pg_temp] : pg_temp_wanted) {
+ auto& m = ms[pg_temp.forced];
+ if (!m) {
+ m = crimson::make_message<MOSDPGTemp>(osdmap->get_epoch());
+ m->forced = pg_temp.forced;
+ }
+ m->pg_temp.emplace(pgid, pg_temp.acting);
+ }
+ pg_temp_pending.merge(pg_temp_wanted);
+ pg_temp_wanted.clear();
+ return seastar::parallel_for_each(std::begin(ms), std::end(ms),
+ [this](auto& m) {
+ if (m) {
+ return monc.send_message(std::move(m));
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+std::ostream& operator<<(
+ std::ostream& out,
+ const OSDSingletonState::pg_temp_t& pg_temp)
+{
+ out << pg_temp.acting;
+ if (pg_temp.forced) {
+ out << " (forced)";
+ }
+ return out;
+}
+
+seastar::future<> OSDSingletonState::send_pg_created(pg_t pgid)
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ ceph_assert(o->require_osd_release >= ceph_release_t::luminous);
+ pg_created.insert(pgid);
+ return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid));
+}
+
+seastar::future<> OSDSingletonState::send_pg_created()
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ ceph_assert(o->require_osd_release >= ceph_release_t::luminous);
+ return seastar::parallel_for_each(pg_created,
+ [this](auto &pgid) {
+ return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid));
+ });
+}
+
+void OSDSingletonState::prune_pg_created()
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ auto i = pg_created.begin();
+ while (i != pg_created.end()) {
+ auto p = o->get_pg_pool(i->pool());
+ if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
+ logger().debug("{} pruning {}", __func__, *i);
+ i = pg_created.erase(i);
+ } else {
+ logger().debug(" keeping {}", __func__, *i);
+ ++i;
+ }
+ }
+}
+
+seastar::future<> OSDSingletonState::send_alive(const epoch_t want)
+{
+ logger().info(
+ "{} want={} up_thru_wanted={}",
+ __func__,
+ want,
+ up_thru_wanted);
+
+ if (want > up_thru_wanted) {
+ up_thru_wanted = want;
+ } else {
+ logger().debug("{} want={} <= up_thru_wanted={}; skipping",
+ __func__, want, up_thru_wanted);
+ return seastar::now();
+ }
+ if (!osdmap->exists(whoami)) {
+ logger().warn("{} DNE", __func__);
+ return seastar::now();
+ } if (const epoch_t up_thru = osdmap->get_up_thru(whoami);
+ up_thru_wanted > up_thru) {
+ logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru);
+ return monc.send_message(
+ crimson::make_message<MOSDAlive>(osdmap->get_epoch(), want));
+ } else {
+ logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami));
+ return seastar::now();
+ }
+}
+
+const char** OSDSingletonState::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_max_backfills",
+ "osd_min_recovery_priority",
+ "osd_max_trimming_pgs",
+ nullptr
+ };
+ return KEYS;
+}
+
+void OSDSingletonState::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("osd_max_backfills")) {
+ local_reserver.set_max(conf->osd_max_backfills);
+ remote_reserver.set_max(conf->osd_max_backfills);
+ }
+ if (changed.count("osd_min_recovery_priority")) {
+ local_reserver.set_min_priority(conf->osd_min_recovery_priority);
+ remote_reserver.set_min_priority(conf->osd_min_recovery_priority);
+ }
+ if (changed.count("osd_max_trimming_pgs")) {
+ snap_reserver.set_max(conf->osd_max_trimming_pgs);
+ }
+}
+
+seastar::future<OSDSingletonState::local_cached_map_t>
+OSDSingletonState::get_local_map(epoch_t e)
+{
+ // TODO: use LRU cache for managing osdmap, fallback to disk if we have to
+ if (auto found = osdmaps.find(e); found) {
+ logger().debug("{} osdmap.{} found in cache", __func__, e);
+ return seastar::make_ready_future<local_cached_map_t>(std::move(found));
+ } else {
+ logger().debug("{} loading osdmap.{} from disk", __func__, e);
+ return load_map(e).then([e, this](std::unique_ptr<OSDMap> osdmap) {
+ return seastar::make_ready_future<local_cached_map_t>(
+ osdmaps.insert(e, std::move(osdmap)));
+ });
+ }
+}
+
+void OSDSingletonState::store_map_bl(
+ ceph::os::Transaction& t,
+ epoch_t e, bufferlist&& bl)
+{
+ meta_coll->store_map(t, e, bl);
+ map_bl_cache.insert(e, std::move(bl));
+}
+
+seastar::future<bufferlist> OSDSingletonState::load_map_bl(
+ epoch_t e)
+{
+ if (std::optional<bufferlist> found = map_bl_cache.find(e); found) {
+ logger().debug("{} osdmap.{} found in cache", __func__, e);
+ return seastar::make_ready_future<bufferlist>(*found);
+ } else {
+ logger().debug("{} loading osdmap.{} from disk", __func__, e);
+ return meta_coll->load_map(e);
+ }
+}
+
+seastar::future<std::map<epoch_t, bufferlist>> OSDSingletonState::load_map_bls(
+ epoch_t first,
+ epoch_t last)
+{
+ logger().debug("{} loading maps [{},{}]",
+ __func__, first, last);
+ ceph_assert(first <= last);
+ return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first),
+ boost::make_counting_iterator<epoch_t>(last + 1),
+ [this](epoch_t e) {
+ return load_map_bl(e).then([e](auto&& bl) {
+ return seastar::make_ready_future<std::pair<epoch_t, bufferlist>>(
+ std::make_pair(e, std::move(bl)));
+ });
+ },
+ std::map<epoch_t, bufferlist>{},
+ [](auto&& bls, auto&& epoch_bl) {
+ bls.emplace(std::move(epoch_bl));
+ return std::move(bls);
+ });
+}
+
+seastar::future<std::unique_ptr<OSDMap>> OSDSingletonState::load_map(epoch_t e)
+{
+ auto o = std::make_unique<OSDMap>();
+ logger().info("{} osdmap.{}", __func__, e);
+ if (e == 0) {
+ return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o));
+ }
+ return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable {
+ o->decode(bl);
+ return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o));
+ });
+}
+
+seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
+ epoch_t start, Ref<MOSDMap> m)
+{
+ return seastar::do_for_each(
+ boost::make_counting_iterator(start),
+ boost::make_counting_iterator(m->get_last() + 1),
+ [&t, m, this](epoch_t e) {
+ if (auto p = m->maps.find(e); p != m->maps.end()) {
+ auto o = std::make_unique<OSDMap>();
+ o->decode(p->second);
+ logger().info("store_maps storing osdmap.{}", e);
+ store_map_bl(t, e, std::move(p->second));
+ osdmaps.insert(e, std::move(o));
+ return seastar::now();
+ } else if (auto p = m->incremental_maps.find(e);
+ p != m->incremental_maps.end()) {
+ logger().info("store_maps found osdmap.{} incremental map, "
+ "loading osdmap.{}", e, e - 1);
+ ceph_assert(std::cmp_greater(e, 0u));
+ return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) {
+ OSDMap::Incremental inc;
+ auto i = bl.cbegin();
+ inc.decode(i);
+ o->apply_incremental(inc);
+ bufferlist fbl;
+ o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
+ logger().info("store_maps storing osdmap.{}", o->get_epoch());
+ store_map_bl(t, e, std::move(fbl));
+ osdmaps.insert(e, std::move(o));
+ return seastar::now();
+ });
+ } else {
+ logger().error("MOSDMap lied about what maps it had?");
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<Ref<PG>> ShardServices::make_pg(
+ OSDMapService::cached_map_t create_map,
+ spg_t pgid,
+ bool do_create)
+{
+ using ec_profile_t = std::map<std::string, std::string>;
+ auto get_pool_info_for_pg = [create_map, pgid, this] {
+ if (create_map->have_pg_pool(pgid.pool())) {
+ pg_pool_t pi = *create_map->get_pg_pool(pgid.pool());
+ std::string name = create_map->get_pool_name(pgid.pool());
+ ec_profile_t ec_profile;
+ if (pi.is_erasure()) {
+ ec_profile = create_map->get_erasure_code_profile(
+ pi.erasure_code_profile);
+ }
+ return seastar::make_ready_future<
+ std::tuple<pg_pool_t,std::string, ec_profile_t>
+ >(std::make_tuple(
+ std::move(pi),
+ std::move(name),
+ std::move(ec_profile)));
+ } else {
+ // pool was deleted; grab final pg_pool_t off disk.
+ return get_pool_info(pgid.pool());
+ }
+ };
+ auto get_collection = [pgid, do_create, this] {
+ const coll_t cid{pgid};
+ if (do_create) {
+ return get_store().create_new_collection(cid);
+ } else {
+ return get_store().open_collection(cid);
+ }
+ };
+ return seastar::when_all(
+ std::move(get_pool_info_for_pg),
+ std::move(get_collection)
+ ).then([pgid, create_map, this](auto &&ret) {
+ auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0());
+ auto coll = std::move(std::get<1>(ret).get0());
+ return seastar::make_ready_future<Ref<PG>>(
+ new PG{
+ pgid,
+ pg_shard_t{local_state.whoami, pgid.shard},
+ std::move(coll),
+ std::move(pool),
+ std::move(name),
+ create_map,
+ *this,
+ ec_profile});
+ });
+}
+
+seastar::future<Ref<PG>> ShardServices::handle_pg_create_info(
+ std::unique_ptr<PGCreateInfo> info) {
+ return seastar::do_with(
+ std::move(info),
+ [this](auto &info)
+ -> seastar::future<Ref<PG>> {
+ return get_map(info->epoch).then(
+ [&info, this](cached_map_t startmap)
+ -> seastar::future<std::tuple<Ref<PG>, cached_map_t>> {
+ const spg_t &pgid = info->pgid;
+ if (info->by_mon) {
+ int64_t pool_id = pgid.pgid.pool();
+ const pg_pool_t *pool = get_map()->get_pg_pool(pool_id);
+ if (!pool) {
+ logger().debug(
+ "{} ignoring pgid {}, pool dne",
+ __func__,
+ pgid);
+ local_state.pg_map.pg_creation_canceled(pgid);
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(Ref<PG>(), startmap));
+ } else if (!pool->is_crimson()) {
+ logger().debug(
+ "{} ignoring pgid {}, pool lacks crimson flag",
+ __func__,
+ pgid);
+ local_state.pg_map.pg_creation_canceled(pgid);
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(Ref<PG>(), startmap));
+ }
+ ceph_assert(get_map()->require_osd_release >=
+ ceph_release_t::octopus);
+ if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
+ // this ensures we do not process old creating messages after the
+ // pool's initial pgs have been created (and pg are subsequently
+ // allowed to split or merge).
+ logger().debug(
+ "{} dropping {} create, pool does not have CREATING flag set",
+ __func__,
+ pgid);
+ local_state.pg_map.pg_creation_canceled(pgid);
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(Ref<PG>(), startmap));
+ }
+ }
+ return make_pg(
+ startmap, pgid, true
+ ).then([startmap=std::move(startmap)](auto pg) mutable {
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(std::move(pg), std::move(startmap)));
+ });
+ }).then([this, &info](auto &&ret)
+ ->seastar::future<Ref<PG>> {
+ auto [pg, startmap] = std::move(ret);
+ if (!pg)
+ return seastar::make_ready_future<Ref<PG>>(Ref<PG>());
+ const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool());
+
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ startmap->pg_to_up_acting_osds(
+ info->pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ int role = startmap->calc_pg_role(
+ pg_shard_t(local_state.whoami, info->pgid.shard),
+ acting);
+
+ PeeringCtx rctx;
+ create_pg_collection(
+ rctx.transaction,
+ info->pgid,
+ info->pgid.get_split_bits(pp->get_pg_num()));
+ init_pg_ondisk(
+ rctx.transaction,
+ info->pgid,
+ pp);
+
+ pg->init(
+ role,
+ up,
+ up_primary,
+ acting,
+ acting_primary,
+ info->history,
+ info->past_intervals,
+ rctx.transaction);
+
+ return start_operation<PGAdvanceMap>(
+ *this, pg, get_map()->get_epoch(), std::move(rctx), true
+ ).second.then([pg=pg] {
+ return seastar::make_ready_future<Ref<PG>>(pg);
+ });
+ });
+ });
+}
+
+
+ShardServices::get_or_create_pg_ret
+ShardServices::get_or_create_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&& trigger,
+ spg_t pgid,
+ std::unique_ptr<PGCreateInfo> info)
+{
+ if (info) {
+ auto [fut, creating] = local_state.pg_map.wait_for_pg(
+ std::move(trigger), pgid);
+ if (!creating) {
+ local_state.pg_map.set_creating(pgid);
+ (void)handle_pg_create_info(
+ std::move(info));
+ }
+ return std::move(fut);
+ } else {
+ return get_or_create_pg_ret(
+ get_or_create_pg_ertr::ready_future_marker{},
+ local_state.pg_map.get_pg(pgid));
+ }
+}
+
+ShardServices::wait_for_pg_ret
+ShardServices::wait_for_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid)
+{
+ return local_state.pg_map.wait_for_pg(std::move(trigger), pgid).first;
+}
+
+seastar::future<Ref<PG>> ShardServices::load_pg(spg_t pgid)
+{
+ logger().debug("{}: {}", __func__, pgid);
+
+ return seastar::do_with(PGMeta(get_store(), pgid), [](auto& pg_meta) {
+ return pg_meta.get_epoch();
+ }).then([this](epoch_t e) {
+ return get_map(e);
+ }).then([pgid, this](auto&& create_map) {
+ return make_pg(std::move(create_map), pgid, false);
+ }).then([this](Ref<PG> pg) {
+ return pg->read_state(&get_store()).then([pg] {
+ return seastar::make_ready_future<Ref<PG>>(std::move(pg));
+ });
+ }).handle_exception([pgid](auto ep) {
+ logger().info("pg {} saw exception on load {}", pgid, ep);
+ ceph_abort("Could not load pg" == 0);
+ return seastar::make_exception_future<Ref<PG>>(ep);
+ });
+}
+
+seastar::future<> ShardServices::dispatch_context_transaction(
+ crimson::os::CollectionRef col, PeeringCtx &ctx) {
+ if (ctx.transaction.empty()) {
+ logger().debug("ShardServices::dispatch_context_transaction: empty transaction");
+ return seastar::now();
+ }
+
+ logger().debug("ShardServices::dispatch_context_transaction: do_transaction ...");
+ auto ret = get_store().do_transaction(
+ col,
+ std::move(ctx.transaction));
+ ctx.reset_transaction();
+ return ret;
+}
+
+seastar::future<> ShardServices::dispatch_context_messages(
+ BufferedRecoveryMessages &&ctx)
+{
+ auto ret = seastar::parallel_for_each(std::move(ctx.message_map),
+ [this](auto& osd_messages) {
+ auto& [peer, messages] = osd_messages;
+ logger().debug("dispatch_context_messages sending messages to {}", peer);
+ return seastar::parallel_for_each(
+ std::move(messages), [=, peer=peer, this](auto& m) {
+ return send_to_osd(peer, std::move(m), local_state.osdmap->get_epoch());
+ });
+ });
+ ctx.message_map.clear();
+ return ret;
+}
+
+seastar::future<> ShardServices::dispatch_context(
+ crimson::os::CollectionRef col,
+ PeeringCtx &&ctx)
+{
+ ceph_assert(col || ctx.transaction.empty());
+ return seastar::when_all_succeed(
+ dispatch_context_messages(
+ BufferedRecoveryMessages{ctx}),
+ col ? dispatch_context_transaction(col, ctx) : seastar::now()
+ ).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSDSingletonState::send_incremental_map(
+ crimson::net::Connection &conn,
+ epoch_t first)
+{
+ logger().info("{}: first osdmap: {} "
+ "superblock's oldest map: {}",
+ __func__, first, superblock.oldest_map);
+ if (first >= superblock.oldest_map) {
+ return load_map_bls(
+ first, superblock.newest_map
+ ).then([this, &conn, first](auto&& bls) {
+ auto m = crimson::make_message<MOSDMap>(
+ monc.get_fsid(),
+ osdmap->get_encoding_features());
+ m->cluster_osdmap_trim_lower_bound = first;
+ m->newest_map = superblock.newest_map;
+ m->maps = std::move(bls);
+ return conn.send(std::move(m));
+ });
+ } else {
+ return load_map_bl(osdmap->get_epoch()
+ ).then([this, &conn](auto&& bl) mutable {
+ auto m = crimson::make_message<MOSDMap>(
+ monc.get_fsid(),
+ osdmap->get_encoding_features());
+ /* TODO: once we support the tracking of superblock's
+ * cluster_osdmap_trim_lower_bound, the MOSDMap should
+ * be populated with this value instead of the oldest_map.
+ * See: OSD::handle_osd_map for how classic updates the
+ * cluster's trim lower bound.
+ */
+ m->cluster_osdmap_trim_lower_bound = superblock.oldest_map;
+ m->newest_map = superblock.newest_map;
+ m->maps.emplace(osdmap->get_epoch(), std::move(bl));
+ return conn.send(std::move(m));
+ });
+ }
+}
+
+seastar::future<> OSDSingletonState::send_incremental_map_to_osd(
+ int osd,
+ epoch_t first)
+{
+ if (osdmap->is_down(osd)) {
+ logger().info("{}: osd.{} is_down", __func__, osd);
+ return seastar::now();
+ } else {
+ auto conn = cluster_msgr.connect(
+ osdmap->get_cluster_addrs(osd).front(), CEPH_ENTITY_TYPE_OSD);
+ return send_incremental_map(*conn, first);
+ }
+}
+
+};
diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h
new file mode 100644
index 000000000..9b7553e7b
--- /dev/null
+++ b/src/crimson/osd/shard_services.h
@@ -0,0 +1,589 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/common_fwd.h"
+#include "osd_operation.h"
+#include "msg/MessageRef.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/shared_lru.h"
+#include "crimson/os/futurized_collection.h"
+#include "osd/PeeringState.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_meta.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/state.h"
+#include "common/AsyncReserver.h"
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::mgr {
+ class Client;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+class OSDMap;
+class PeeringCtx;
+class BufferedRecoveryMessages;
+
+namespace crimson::osd {
+
+class PGShardManager;
+
+/**
+ * PerShardState
+ *
+ * Per-shard state holding instances local to each shard.
+ */
+class PerShardState {
+ friend class ShardServices;
+ friend class PGShardManager;
+ friend class OSD;
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+ const core_id_t core = seastar::this_shard_id();
+#define assert_core() ceph_assert(seastar::this_shard_id() == core);
+
+ const int whoami;
+ crimson::os::FuturizedStore::Shard &store;
+ crimson::common::CephContext cct;
+
+ OSDState &osd_state;
+ OSD_OSDMapGate osdmap_gate;
+
+ PerfCounters *perf = nullptr;
+ PerfCounters *recoverystate_perf = nullptr;
+
+ // Op Management
+ OSDOperationRegistry registry;
+ OperationThrottler throttler;
+
+ seastar::future<> dump_ops_in_flight(Formatter *f) const;
+
+ epoch_t up_epoch = 0;
+ OSDMapService::cached_map_t osdmap;
+ const auto &get_osdmap() const {
+ assert_core();
+ return osdmap;
+ }
+ void update_map(OSDMapService::cached_map_t new_osdmap) {
+ assert_core();
+ osdmap = std::move(new_osdmap);
+ }
+ void set_up_epoch(epoch_t epoch) {
+ assert_core();
+ up_epoch = epoch;
+ }
+
+ // Prevent creating new osd operations when the system is shutting down.
+ // This is necessary because a new operation could be created after all
+ // ongoing operations have been interrupted; it would then create and wait
+ // on a new, possibly never-resolving future, in which case the shutdown
+ // might never succeed.
+ bool stopping = false;
+ seastar::future<> stop_registry() {
+ assert_core();
+ crimson::get_logger(ceph_subsys_osd).info("PerShardState::{}", __func__);
+ stopping = true;
+ return registry.stop();
+ }
+
+ // PGMap state
+ PGMap pg_map;
+
+ seastar::future<> stop_pgs();
+ std::map<pg_t, pg_stat_t> get_pg_stats() const;
+ seastar::future<> broadcast_map_to_pgs(
+ ShardServices &shard_services,
+ epoch_t epoch);
+
+ Ref<PG> get_pg(spg_t pgid);
+ template <typename F>
+ void for_each_pg(F &&f) const {
+ assert_core();
+ for (auto &pg : pg_map.get_pgs()) {
+ std::invoke(f, pg.first, pg.second);
+ }
+ }
+
+ template <typename T, typename... Args>
+ auto start_operation(Args&&... args) {
+ assert_core();
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ auto op = registry.create_operation<T>(std::forward<Args>(args)...);
+ crimson::get_logger(ceph_subsys_osd).info(
+ "PerShardState::{}, {}", __func__, *op);
+ auto fut = seastar::yield().then([op] {
+ return op->start().finally([op /* by copy */] {
+ // ensure the op's lifetime is long enough: it must stay alive not only
+ // while the continuations are scheduled (i.e. while `then()` is called)
+ // but also during the actual execution (i.e. when the passed lambdas
+ // actually run).
+ });
+ });
+ return std::make_pair(std::move(op), std::move(fut));
+ }
+
+ template <typename InterruptorT, typename T, typename... Args>
+ auto start_operation_may_interrupt(Args&&... args) {
+ assert_core();
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ auto op = registry.create_operation<T>(std::forward<Args>(args)...);
+ crimson::get_logger(ceph_subsys_osd).info(
+ "PerShardState::{}, {}", __func__, *op);
+ auto fut = InterruptorT::make_interruptible(
+ seastar::yield()
+ ).then_interruptible([op] {
+ return op->start().finally([op /* by copy */] {
+ // ensure the op's lifetime is long enough: it must stay alive not only
+ // while the continuations are scheduled (i.e. while `then()` is called)
+ // but also during the actual execution (i.e. when the passed lambdas
+ // actually run).
+ });
+ });
+ return std::make_pair(std::move(op), std::move(fut));
+ }
+
+ // tids for the ops we issue, prefixed with the core id to ensure uniqueness
+ // across shards (e.g. with a 64-bit ceph_tid_t, shard 3 starts at 3 << 56)
+ ceph_tid_t next_tid;
+ ceph_tid_t get_tid() {
+ assert_core();
+ return next_tid++;
+ }
+
+ HeartbeatStampsRef get_hb_stamps(int peer);
+ std::map<int, HeartbeatStampsRef> heartbeat_stamps;
+
+ // Time state
+ const ceph::mono_time startup_time;
+ ceph::signedspan get_mnow() const {
+ assert_core();
+ return ceph::mono_clock::now() - startup_time;
+ }
+
+public:
+ PerShardState(
+ int whoami,
+ ceph::mono_time startup_time,
+ PerfCounters *perf,
+ PerfCounters *recoverystate_perf,
+ crimson::os::FuturizedStore &store,
+ OSDState& osd_state);
+};
+
+/**
+ * OSDSingletonState
+ *
+ * OSD-wide singleton holding instances that need to be accessible
+ * from all PGs.
+ */
+class OSDSingletonState : public md_config_obs_t {
+ friend class ShardServices;
+ friend class PGShardManager;
+ friend class OSD;
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+public:
+ OSDSingletonState(
+ int whoami,
+ crimson::net::Messenger &cluster_msgr,
+ crimson::net::Messenger &public_msgr,
+ crimson::mon::Client &monc,
+ crimson::mgr::Client &mgrc);
+
+private:
+ const int whoami;
+
+ crimson::common::CephContext cct;
+ PerfCounters *perf = nullptr;
+ PerfCounters *recoverystate_perf = nullptr;
+
+ SharedLRU<epoch_t, OSDMap> osdmaps;
+ SimpleLRU<epoch_t, bufferlist, false> map_bl_cache;
+
+ cached_map_t osdmap;
+ cached_map_t &get_osdmap() { return osdmap; }
+ void update_map(cached_map_t new_osdmap) {
+ osdmap = std::move(new_osdmap);
+ }
+
+ crimson::net::Messenger &cluster_msgr;
+ crimson::net::Messenger &public_msgr;
+
+ seastar::future<> send_to_osd(int peer, MessageURef m, epoch_t from_epoch);
+
+ crimson::mon::Client &monc;
+ seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
+
+ crimson::mgr::Client &mgrc;
+
+ std::unique_ptr<OSDMeta> meta_coll;
+ template <typename... Args>
+ void init_meta_coll(Args&&... args) {
+ meta_coll = std::make_unique<OSDMeta>(std::forward<Args>(args)...);
+ }
+ OSDMeta &get_meta_coll() {
+ assert(meta_coll);
+ return *meta_coll;
+ }
+
+ OSDSuperblock superblock;
+ void set_superblock(OSDSuperblock _superblock) {
+ superblock = std::move(_superblock);
+ }
+
+ seastar::future<> send_incremental_map(
+ crimson::net::Connection &conn,
+ epoch_t first);
+
+ seastar::future<> send_incremental_map_to_osd(int osd, epoch_t first);
+
+ auto get_pool_info(int64_t poolid) {
+ return get_meta_coll().load_final_pool_info(poolid);
+ }
+
+ // global pg temp state
+ struct pg_temp_t {
+ std::vector<int> acting;
+ bool forced = false;
+ };
+ std::map<pg_t, pg_temp_t> pg_temp_wanted;
+ std::map<pg_t, pg_temp_t> pg_temp_pending;
+ friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
+
+ void queue_want_pg_temp(pg_t pgid, const std::vector<int>& want,
+ bool forced = false);
+ void remove_want_pg_temp(pg_t pgid);
+ void requeue_pg_temp();
+ seastar::future<> send_pg_temp();
+
+ std::set<pg_t> pg_created;
+ seastar::future<> send_pg_created(pg_t pgid);
+ seastar::future<> send_pg_created();
+ void prune_pg_created();
+
+ struct DirectFinisher {
+ void queue(Context *c) {
+ c->complete(0);
+ }
+ } finisher;
+ AsyncReserver<spg_t, DirectFinisher> local_reserver;
+ AsyncReserver<spg_t, DirectFinisher> remote_reserver;
+ AsyncReserver<spg_t, DirectFinisher> snap_reserver;
+
+ epoch_t up_thru_wanted = 0;
+ seastar::future<> send_alive(epoch_t want);
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed) final;
+
+ seastar::future<local_cached_map_t> get_local_map(epoch_t e);
+ seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e);
+ seastar::future<bufferlist> load_map_bl(epoch_t e);
+ seastar::future<std::map<epoch_t, bufferlist>>
+ load_map_bls(epoch_t first, epoch_t last);
+ void store_map_bl(ceph::os::Transaction& t,
+ epoch_t e, bufferlist&& bl);
+ seastar::future<> store_maps(ceph::os::Transaction& t,
+ epoch_t start, Ref<MOSDMap> m);
+};
+
+/**
+ * Represents services available to each PG
+ */
+class ShardServices : public OSDMapService {
+ friend class PGShardManager;
+ friend class OSD;
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+ PerShardState local_state;
+ seastar::sharded<OSDSingletonState> &osd_singleton_state;
+ PGShardMapping& pg_to_shard_mapping;
+
+ template <typename F, typename... Args>
+ auto with_singleton(F &&f, Args&&... args) {
+ return osd_singleton_state.invoke_on(
+ PRIMARY_CORE,
+ std::forward<F>(f),
+ std::forward<Args>(args)...
+ );
+ }
+
+#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) const { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD_TO_LOCAL(METHOD) FORWARD(METHOD, METHOD, local_state)
+#define FORWARD_TO_LOCAL_CONST(METHOD) FORWARD_CONST( \
+ METHOD, METHOD, local_state) \
+
+#define FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, TARGET) \
+ template <typename... Args> \
+ auto METHOD(Args&&... args) { \
+ return with_singleton( \
+ [](auto &local_state, auto&&... args) { \
+ return local_state.TARGET( \
+ std::forward<decltype(args)>(args)...); \
+ }, std::forward<Args>(args)...); \
+ }
+#define FORWARD_TO_OSD_SINGLETON(METHOD) \
+ FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, METHOD)
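+
+// For illustration (not part of the code): FORWARD_TO_OSD_SINGLETON(send_pg_temp)
+// expands to roughly
+//
+//   template <typename... Args>
+//   auto send_pg_temp(Args&&... args) {
+//     return with_singleton(
+//       [](auto &local_state, auto&&... args) {
+//         return local_state.send_pg_temp(
+//           std::forward<decltype(args)>(args)...);
+//       }, std::forward<Args>(args)...);
+//   }
+//
+// i.e. the call is proxied to the OSDSingletonState instance on PRIMARY_CORE via
+// seastar::sharded::invoke_on.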
+
+public:
+ template <typename... PSSArgs>
+ ShardServices(
+ seastar::sharded<OSDSingletonState> &osd_singleton_state,
+ PGShardMapping& pg_to_shard_mapping,
+ PSSArgs&&... args)
+ : local_state(std::forward<PSSArgs>(args)...),
+ osd_singleton_state(osd_singleton_state),
+ pg_to_shard_mapping(pg_to_shard_mapping) {}
+
+ FORWARD_TO_OSD_SINGLETON(send_to_osd)
+
+ crimson::os::FuturizedStore::Shard &get_store() {
+ return local_state.store;
+ }
+
+ auto remove_pg(spg_t pgid) {
+ local_state.pg_map.remove_pg(pgid);
+ return pg_to_shard_mapping.remove_pg(pgid);
+ }
+
+ crimson::common::CephContext *get_cct() {
+ return &(local_state.cct);
+ }
+
+ template <typename T, typename... Args>
+ auto start_operation(Args&&... args) {
+ return local_state.start_operation<T>(std::forward<Args>(args)...);
+ }
+
+ template <typename InterruptorT, typename T, typename... Args>
+ auto start_operation_may_interrupt(Args&&... args) {
+ return local_state.start_operation_may_interrupt<
+ InterruptorT, T>(std::forward<Args>(args)...);
+ }
+
+ auto &get_registry() { return local_state.registry; }
+
+ // Loggers
+ PerfCounters &get_recoverystate_perf_logger() {
+ return *local_state.recoverystate_perf;
+ }
+ PerfCounters &get_perf_logger() {
+ return *local_state.perf;
+ }
+
+ // Diagnostics
+ FORWARD_TO_LOCAL_CONST(dump_ops_in_flight);
+
+ // Local PG Management
+ seastar::future<Ref<PG>> make_pg(
+ cached_map_t create_map,
+ spg_t pgid,
+ bool do_create);
+ seastar::future<Ref<PG>> handle_pg_create_info(
+ std::unique_ptr<PGCreateInfo> info);
+
+ using get_or_create_pg_ertr = PGMap::wait_for_pg_ertr;
+ using get_or_create_pg_ret = get_or_create_pg_ertr::future<Ref<PG>>;
+ get_or_create_pg_ret get_or_create_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&&,
+ spg_t pgid,
+ std::unique_ptr<PGCreateInfo> info);
+
+ using wait_for_pg_ertr = PGMap::wait_for_pg_ertr;
+ using wait_for_pg_ret = wait_for_pg_ertr::future<Ref<PG>>;
+ wait_for_pg_ret wait_for_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&&, spg_t pgid);
+ seastar::future<Ref<PG>> load_pg(spg_t pgid);
+
+ /// Dispatch and reset ctx transaction
+ seastar::future<> dispatch_context_transaction(
+ crimson::os::CollectionRef col, PeeringCtx &ctx);
+
+ /// Dispatch and reset ctx messages
+ seastar::future<> dispatch_context_messages(
+ BufferedRecoveryMessages &&ctx);
+
+ /// Dispatch ctx and dispose of context
+ seastar::future<> dispatch_context(
+ crimson::os::CollectionRef col,
+ PeeringCtx &&ctx);
+
+ /// Dispatch ctx and dispose of ctx, transaction must be empty
+ seastar::future<> dispatch_context(
+ PeeringCtx &&ctx) {
+ return dispatch_context({}, std::move(ctx));
+ }
+
+ /// Return per-core tid
+ ceph_tid_t get_tid() { return local_state.get_tid(); }
+
+ /// Return the pg count local to this core
+ unsigned get_num_local_pgs() const {
+ return local_state.pg_map.get_pg_count();
+ }
+
+ // OSDMapService
+ cached_map_t get_map() const final { return local_state.get_osdmap(); }
+ epoch_t get_up_epoch() const final { return local_state.up_epoch; }
+ seastar::future<cached_map_t> get_map(epoch_t e) final {
+ return with_singleton(
+ [](auto &sstate, epoch_t e) {
+ return sstate.get_local_map(
+ e
+ ).then([](auto lmap) {
+ return seastar::foreign_ptr<local_cached_map_t>(lmap);
+ });
+ }, e).then([](auto fmap) {
+ return make_local_shared_foreign(std::move(fmap));
+ });
+ }
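+ // Note: get_map(epoch) above fetches the map from the OSDSingletonState cache
+ // on PRIMARY_CORE and hands it back wrapped in a foreign_ptr, so it can be
+ // used safely on the calling shard.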
+
+ FORWARD_TO_OSD_SINGLETON(get_pool_info)
+ FORWARD(with_throttle_while, with_throttle_while, local_state.throttler)
+
+ FORWARD_TO_OSD_SINGLETON(send_incremental_map)
+ FORWARD_TO_OSD_SINGLETON(send_incremental_map_to_osd)
+
+ FORWARD_TO_OSD_SINGLETON(osdmap_subscribe)
+ FORWARD_TO_OSD_SINGLETON(queue_want_pg_temp)
+ FORWARD_TO_OSD_SINGLETON(remove_want_pg_temp)
+ FORWARD_TO_OSD_SINGLETON(requeue_pg_temp)
+ FORWARD_TO_OSD_SINGLETON(send_pg_created)
+ FORWARD_TO_OSD_SINGLETON(send_alive)
+ FORWARD_TO_OSD_SINGLETON(send_pg_temp)
+ FORWARD_TO_LOCAL_CONST(get_mnow)
+ FORWARD_TO_LOCAL(get_hb_stamps)
+
+ FORWARD(pg_created, pg_created, local_state.pg_map)
+
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ local_update_priority,
+ local_reserver.update_priority)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ local_cancel_reservation,
+ local_reserver.cancel_reservation)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ local_dump_reservations,
+ local_reserver.dump)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ remote_cancel_reservation,
+ remote_reserver.cancel_reservation)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ remote_dump_reservations,
+ remote_reserver.dump)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ snap_cancel_reservation,
+ snap_reserver.cancel_reservation)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ snap_dump_reservations,
+ snap_reserver.dump)
+
+ Context *invoke_context_on_core(core_id_t core, Context *c) {
+ if (!c) return nullptr;
+ return new LambdaContext([core, c](int code) {
+ std::ignore = seastar::smp::submit_to(
+ core,
+ [c, code] {
+ c->complete(code);
+ });
+ });
+ }
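+ // The wrapper above is used by the reservation helpers below so that
+ // AsyncReserver callbacks, which fire on PRIMARY_CORE, are completed back on
+ // the shard that issued the request.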
+ seastar::future<> local_request_reservation(
+ spg_t item,
+ Context *on_reserved,
+ unsigned prio,
+ Context *on_preempt) {
+ return with_singleton(
+ [item, prio](OSDSingletonState &singleton,
+ Context *wrapped_on_reserved, Context *wrapped_on_preempt) {
+ return singleton.local_reserver.request_reservation(
+ item,
+ wrapped_on_reserved,
+ prio,
+ wrapped_on_preempt);
+ },
+ invoke_context_on_core(seastar::this_shard_id(), on_reserved),
+ invoke_context_on_core(seastar::this_shard_id(), on_preempt));
+ }
+ seastar::future<> remote_request_reservation(
+ spg_t item,
+ Context *on_reserved,
+ unsigned prio,
+ Context *on_preempt) {
+ return with_singleton(
+ [item, prio](OSDSingletonState &singleton,
+ Context *wrapped_on_reserved, Context *wrapped_on_preempt) {
+ return singleton.remote_reserver.request_reservation(
+ item,
+ wrapped_on_reserved,
+ prio,
+ wrapped_on_preempt);
+ },
+ invoke_context_on_core(seastar::this_shard_id(), on_reserved),
+ invoke_context_on_core(seastar::this_shard_id(), on_preempt));
+ }
+ seastar::future<> snap_request_reservation(
+ spg_t item,
+ Context *on_reserved,
+ unsigned prio) {
+ return with_singleton(
+ [item, prio](OSDSingletonState &singleton,
+ Context *wrapped_on_reserved) {
+ return singleton.snap_reserver.request_reservation(
+ item,
+ wrapped_on_reserved,
+ prio);
+ },
+ invoke_context_on_core(seastar::this_shard_id(), on_reserved));
+ }
+
+#undef FORWARD_CONST
+#undef FORWARD
+#undef FORWARD_TO_OSD_SINGLETON
+#undef FORWARD_TO_LOCAL
+#undef FORWARD_TO_LOCAL_CONST
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::OSDSingletonState::pg_temp_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h
new file mode 100644
index 000000000..f0676a4ec
--- /dev/null
+++ b/src/crimson/osd/state.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string_view>
+#include <ostream>
+
+#include <seastar/core/shared_future.hh>
+
+class OSDMap;
+
+namespace crimson::osd {
+
+// seastar::sharded puts start_single on core 0
+constexpr core_id_t PRIMARY_CORE = 0;
+
+/**
+ * OSDState
+ *
+ * Maintains state representing the OSD's progress from booting through
+ * shutdown.
+ *
+ * Shards other than PRIMARY_CORE may use their local instance to check
+ * on ACTIVE and STOPPING. All other methods are restricted to
+ * PRIMARY_CORE (such methods start with an assert to this effect).
+ */
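+// Illustrative lifecycle sketch (the real transitions are driven by the OSD's
+// boot and shutdown code, which is not reproduced here):
+//
+//   osd_state.set_preboot();                // PRIMARY_CORE only
+//   osd_state.set_booting();
+//   co_await osd_state.set_active();        // broadcasts ACTIVE to every shard
+//   ...                                     // serve IO; other shards may poll
+//                                           // is_active()/when_active()
+//   osd_state.set_prestop();
+//   co_await osd_state.set_stopping();      // broadcasts STOPPING to every shard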
+class OSDState : public seastar::peering_sharded_service<OSDState> {
+
+ enum class State {
+ INITIALIZING,
+ PREBOOT,
+ BOOTING,
+ ACTIVE,
+ PRESTOP,
+ STOPPING,
+ WAITING_FOR_HEALTHY,
+ };
+
+ State state = State::INITIALIZING;
+ mutable seastar::shared_promise<> wait_for_active;
+
+ /// Sets local instance state to active, called from set_active
+ void _set_active() {
+ state = State::ACTIVE;
+ wait_for_active.set_value();
+ wait_for_active = {};
+ }
+ /// Sets local instance state to stopping, called from set_stopping
+ void _set_stopping() {
+ state = State::STOPPING;
+ wait_for_active.set_exception(crimson::common::system_shutdown_exception{});
+ wait_for_active = {};
+ }
+public:
+ bool is_initializing() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::INITIALIZING;
+ }
+ bool is_preboot() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::PREBOOT;
+ }
+ bool is_booting() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::BOOTING;
+ }
+ bool is_active() const {
+ return state == State::ACTIVE;
+ }
+ seastar::future<> when_active() const {
+ return is_active() ? seastar::now()
+ : wait_for_active.get_shared_future();
+ };
+ bool is_prestop() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::PRESTOP;
+ }
+ bool is_stopping() const {
+ return state == State::STOPPING;
+ }
+ bool is_waiting_for_healthy() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::WAITING_FOR_HEALTHY;
+ }
+ void set_preboot() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ state = State::PREBOOT;
+ }
+ void set_booting() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ state = State::BOOTING;
+ }
+ /// Sets all shards to active
+ seastar::future<> set_active() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return container().invoke_on_all([](auto& osd_state) {
+ osd_state._set_active();
+ });
+ }
+ void set_prestop() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ state = State::PRESTOP;
+ }
+ /// Sets all shards to stopping
+ seastar::future<> set_stopping() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return container().invoke_on_all([](auto& osd_state) {
+ osd_state._set_stopping();
+ });
+ }
+ std::string_view to_string() const {
+ switch (state) {
+ case State::INITIALIZING: return "initializing";
+ case State::PREBOOT: return "preboot";
+ case State::BOOTING: return "booting";
+ case State::ACTIVE: return "active";
+ case State::PRESTOP: return "prestop";
+ case State::STOPPING: return "stopping";
+ case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy";
+ default: return "???";
+ }
+ }
+};
+
+inline std::ostream&
+operator<<(std::ostream& os, const OSDState& s) {
+ return os << s.to_string();
+}
+}
diff --git a/src/crimson/osd/stop_signal.h b/src/crimson/osd/stop_signal.h
new file mode 100644
index 000000000..951f8d4b7
--- /dev/null
+++ b/src/crimson/osd/stop_signal.h
@@ -0,0 +1,83 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/condition-variable.hh>
+
+/// Seastar apps lib namespace
+
+namespace seastar_apps_lib {
+
+
+/// \brief Futurized SIGINT/SIGTERM signals handler class
+///
+/// Seastar-style helper class that allows easy waiting for SIGINT/SIGTERM signals
+/// from your app.
+///
+/// Example:
+/// \code
+/// #include <seastar/apps/lib/stop_signal.hh>
+/// ...
+/// int main() {
+/// ...
+/// seastar::thread th([] {
+/// seastar_apps_lib::stop_signal stop_signal;
+/// <some code>
+/// stop_signal.wait().get(); // this will wait until we receive SIGINT or SIGTERM
+/// });
+/// ...
+/// }
+/// \endcode
+class stop_signal {
+ seastar::condition_variable _cond;
+ seastar::abort_source _abort_source;
+
+private:
+ void on_signal() {
+ if (stopping()) {
+ return;
+ }
+ _abort_source.request_abort();
+ _cond.broadcast();
+ }
+public:
+ stop_signal() {
+ seastar::engine().handle_signal(SIGINT, [this] { on_signal(); });
+ seastar::engine().handle_signal(SIGTERM, [this] { on_signal(); });
+ }
+ ~stop_signal() {
+ // There's no way to unregister a handler yet, so register a no-op handler instead.
+ seastar::engine().handle_signal(SIGINT, [] {});
+ seastar::engine().handle_signal(SIGTERM, [] {});
+ }
+ seastar::future<> wait() {
+ return _cond.wait([this] { return _abort_source.abort_requested(); });
+ }
+ bool stopping() const {
+ return _abort_source.abort_requested();
+ }
+ auto& abort_source() {
+ return _abort_source;
+ }
+};
+}
diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc
new file mode 100644
index 000000000..4573333c3
--- /dev/null
+++ b/src/crimson/osd/watch.cc
@@ -0,0 +1,354 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm_ext/insert.hpp>
+
+#include "crimson/osd/watch.h"
+#include "crimson/osd/osd_operations/internal_client_request.h"
+
+#include "messages/MWatchNotify.h"
+
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+// A watcher can remove itself if it has not seen a notification within a period
+// of time. In that case we also need to drop it from the persisted `ObjectState`
+// instance. This operation somewhat resembles the `_UNWATCH` subop.
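+//
+// The flow implemented below is roughly: timeout_timer fires ->
+// Watch::do_watch_timeout() starts a WatchTimeoutRequest (an internal UNWATCH op
+// against the watched object) -> once that op completes,
+// CEPH_WATCH_EVENT_DISCONNECT is sent to the client via send_disconnect_msg().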
+class WatchTimeoutRequest final : public InternalClientRequest {
+public:
+ WatchTimeoutRequest(WatchRef watch, Ref<PG> pg)
+ : InternalClientRequest(std::move(pg)),
+ watch(std::move(watch)) {
+ }
+
+ const hobject_t& get_target_oid() const final;
+ PG::do_osd_ops_params_t get_do_osd_ops_params() const final;
+ std::vector<OSDOp> create_osd_ops() final;
+
+private:
+ WatchRef watch;
+};
+
+const hobject_t& WatchTimeoutRequest::get_target_oid() const
+{
+ assert(watch->obc);
+ return watch->obc->get_oid();
+}
+
+PG::do_osd_ops_params_t
+WatchTimeoutRequest::get_do_osd_ops_params() const
+{
+ osd_reqid_t reqid;
+ reqid.name = watch->entity_name;
+ PG::do_osd_ops_params_t params{
+ watch->conn,
+ reqid,
+ ceph_clock_now(),
+ get_pg().get_osdmap_epoch(),
+ entity_inst_t{ watch->entity_name, watch->winfo.addr },
+ 0
+ };
+ logger().debug("{}: params.reqid={}", __func__, params.reqid);
+ return params;
+}
+
+std::vector<OSDOp> WatchTimeoutRequest::create_osd_ops()
+{
+ logger().debug("{}", __func__);
+ assert(watch);
+ OSDOp osd_op;
+ osd_op.op.op = CEPH_OSD_OP_WATCH;
+ osd_op.op.flags = 0;
+ osd_op.op.watch.op = CEPH_OSD_WATCH_OP_UNWATCH;
+ osd_op.op.watch.cookie = watch->winfo.cookie;
+ return std::vector{std::move(osd_op)};
+}
+
+Watch::~Watch()
+{
+ logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie());
+}
+
+seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool)
+{
+ if (this->conn == conn) {
+ logger().debug("conn={} already connected", conn);
+ return seastar::now();
+ }
+ timeout_timer.cancel();
+ timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
+ this->conn = std::move(conn);
+ return seastar::now();
+}
+
+void Watch::disconnect()
+{
+ ceph_assert(!conn);
+ timeout_timer.cancel();
+ timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
+}
+
+seastar::future<> Watch::send_notify_msg(NotifyRef notify)
+{
+ logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id);
+ return conn->send(crimson::make_message<MWatchNotify>(
+ winfo.cookie,
+ notify->user_version,
+ notify->ninfo.notify_id,
+ CEPH_WATCH_EVENT_NOTIFY,
+ notify->ninfo.bl,
+ notify->client_gid));
+}
+
+seastar::future<> Watch::start_notify(NotifyRef notify)
+{
+ logger().debug("{} gid={} cookie={} starting notify(id={})",
+ __func__, get_watcher_gid(), get_cookie(),
+ notify->ninfo.notify_id);
+ auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify));
+ ceph_assert(emplaced);
+ ceph_assert(is_alive());
+ return is_connected() ? send_notify_msg(*it) : seastar::now();
+}
+
+seastar::future<> Watch::notify_ack(
+ const uint64_t notify_id,
+ const ceph::bufferlist& reply_bl)
+{
+ logger().debug("{} gid={} cookie={} notify_id={}",
+ __func__, get_watcher_gid(), get_cookie(), notify_id);
+ const auto it = in_progress_notifies.find(notify_id);
+ if (it == std::end(in_progress_notifies)) {
+ logger().error("{} notify_id={} not found on the in-progess list."
+ " Supressing but this should not happen.",
+ __func__, notify_id);
+ return seastar::now();
+ }
+ auto notify = *it;
+ logger().debug("Watch::notify_ack gid={} cookie={} found notify(id={})",
+ get_watcher_gid(),
+ get_cookie(),
+ notify->get_id());
+ // let's ensure we're extending the lifetime until the end of this method
+ static_assert(std::is_same_v<decltype(notify), NotifyRef>);
+ in_progress_notifies.erase(it);
+ return notify->complete_watcher(shared_from_this(), reply_bl);
+}
+
+seastar::future<> Watch::send_disconnect_msg()
+{
+ if (!is_connected()) {
+ return seastar::now();
+ }
+ ceph::bufferlist empty;
+ return conn->send(crimson::make_message<MWatchNotify>(
+ winfo.cookie,
+ 0,
+ 0,
+ CEPH_WATCH_EVENT_DISCONNECT,
+ empty));
+}
+
+void Watch::discard_state()
+{
+ logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie());
+ ceph_assert(obc);
+ in_progress_notifies.clear();
+ timeout_timer.cancel();
+}
+
+void Watch::got_ping(utime_t)
+{
+ if (is_connected()) {
+ // use cancel() + arm(), as rearm() has no overload taking a time delta.
+ timeout_timer.cancel();
+ timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
+ }
+}
+
+seastar::future<> Watch::remove()
+{
+ logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie());
+ // in contrast to ceph-osd, crimson sends CEPH_WATCH_EVENT_DISCONNECT directly
+ // from the timeout handler and _after_ CEPH_WATCH_EVENT_NOTIFY_COMPLETE.
+ // this simplifies the Watch::remove() interface as callers are no longer
+ // obliged to decide whether EVENT_DISCONNECT needs to be sent -- it becomes
+ // an implementation detail of Watch.
+ return seastar::do_for_each(in_progress_notifies,
+ [this_shared=shared_from_this()] (auto notify) {
+ logger().debug("Watch::remove gid={} cookie={} notify(id={})",
+ this_shared->get_watcher_gid(),
+ this_shared->get_cookie(),
+ notify->ninfo.notify_id);
+ return notify->remove_watcher(this_shared);
+ }).then([this] {
+ discard_state();
+ return seastar::now();
+ });
+}
+
+void Watch::cancel_notify(const uint64_t notify_id)
+{
+ logger().debug("{} gid={} cookie={} notify(id={})",
+ __func__, get_watcher_gid(), get_cookie(),
+ notify_id);
+ const auto it = in_progress_notifies.find(notify_id);
+ assert(it != std::end(in_progress_notifies));
+ in_progress_notifies.erase(it);
+}
+
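+// On watch timeout, submit a WatchTimeoutRequest background operation (which
+// executes the implicit CEPH_OSD_WATCH_OP_UNWATCH built in create_osd_ops()
+// above) and, once it completes, tell the client about the disconnect via
+// CEPH_WATCH_EVENT_DISCONNECT.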
+void Watch::do_watch_timeout()
+{
+ assert(pg);
+ auto [op, fut] = pg->get_shard_services().start_operation<WatchTimeoutRequest>(
+ shared_from_this(), pg);
+ std::ignore = std::move(fut).then([op=std::move(op), this] {
+ return send_disconnect_msg();
+ });
+}
+
+bool notify_reply_t::operator<(const notify_reply_t& rhs) const
+{
+ // comparing std::pairs to emphasize our legacy. ceph-osd stores
+ // notify_replies as std::multimap<std::pair<gid, cookie>, bl>.
+ // unfortunately, what seems to be an implementation detail got
+ // exposed as part of our public API (the `reply_buffer` parameter
+ // of the `rados_notify` family).
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie);
+ const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie);
+ return lhsp < rhsp;
+}
+
+std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs)
+{
+ out << "notify_reply_t{watcher_gid=" << rhs.watcher_gid
+ << ", watcher_cookie=" << rhs.watcher_cookie << "}";
+ return out;
+}
+
+Notify::Notify(crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version)
+ : ninfo(ninfo),
+ conn(std::move(conn)),
+ client_gid(client_gid),
+ user_version(user_version)
+{}
+
+Notify::~Notify()
+{
+ logger().debug("{} for notify(id={})", __func__, ninfo.notify_id);
+}
+
+seastar::future<> Notify::remove_watcher(WatchRef watch)
+{
+ logger().debug("{} for notify(id={})", __func__, ninfo.notify_id);
+
+ if (discarded || complete) {
+ logger().debug("{} for notify(id={}) discarded/complete already"
+ " discarded: {} complete: {}", __func__,
+ ninfo.notify_id, discarded, complete);
+ return seastar::now();
+ }
+ [[maybe_unused]] const auto num_removed = watchers.erase(watch);
+ assert(num_removed > 0);
+ if (watchers.empty()) {
+ complete = true;
+ [[maybe_unused]] bool was_armed = timeout_timer.cancel();
+ assert(was_armed);
+ return send_completion();
+ } else {
+ return seastar::now();
+ }
+}
+
+
+seastar::future<> Notify::complete_watcher(
+ WatchRef watch,
+ const ceph::bufferlist& reply_bl)
+{
+ logger().debug("{} for notify(id={})", __func__, ninfo.notify_id);
+
+ if (discarded || complete) {
+ logger().debug("{} for notify(id={}) discarded/complete already"
+ " discarded: {} complete: {}", __func__,
+ ninfo.notify_id, discarded, complete);
+ return seastar::now();
+ }
+ notify_replies.emplace(notify_reply_t{
+ watch->get_watcher_gid(),
+ watch->get_cookie(),
+ reply_bl});
+ return remove_watcher(std::move(watch));
+}
+
+seastar::future<> Notify::send_completion(
+ std::set<WatchRef> timedout_watchers)
+{
+ logger().info("{} -- {} in progress watchers, timedout watchers {}",
+ __func__, watchers.size(), timedout_watchers.size());
+ logger().debug("{} sending notify replies: {}", __func__, notify_replies);
+
+ ceph::bufferlist empty;
+ auto reply = crimson::make_message<MWatchNotify>(
+ ninfo.cookie,
+ user_version,
+ ninfo.notify_id,
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE,
+ empty,
+ client_gid);
+ ceph::bufferlist reply_bl;
+ {
+ std::vector<std::pair<uint64_t,uint64_t>> missed;
+ missed.reserve(std::size(timedout_watchers));
+ boost::insert(
+ missed, std::begin(missed),
+ timedout_watchers | boost::adaptors::transformed([] (auto w) {
+ return std::make_pair(w->get_watcher_gid(), w->get_cookie());
+ }));
+ ceph::encode(notify_replies, reply_bl);
+ ceph::encode(missed, reply_bl);
+ }
+ reply->set_data(std::move(reply_bl));
+ if (!timedout_watchers.empty()) {
+ reply->return_code = -ETIMEDOUT;
+ }
+ return conn->send(std::move(reply));
+}
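+
+// Illustrative sketch (an assumption, not used by crimson): a receiver of the
+// completion payload built above would decode it in the same order it was
+// encoded -- first the acked (gid, cookie) -> reply_bl entries, then the
+// (gid, cookie) pairs of the watchers that timed out.
+#if 0
+static void decode_notify_completion_sketch(const ceph::bufferlist& bl)
+{
+  std::multimap<std::pair<uint64_t, uint64_t>, ceph::bufferlist> acked;
+  std::vector<std::pair<uint64_t, uint64_t>> missed;
+  auto p = bl.cbegin();
+  ceph::decode(acked, p);   // replies from watchers that acked the notify
+  ceph::decode(missed, p);  // watchers that never acked before the timeout
+}
+#endif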
+
+void Notify::do_notify_timeout()
+{
+ logger().debug("{} complete={}", __func__, complete);
+ if (complete) {
+ return;
+ }
+ // it might be that `this` is kept alive only because of the reference
+ // a watcher stores and which is being removed by `cancel_notify()`.
+ // to avoid use-after-free we bump up the ref counter with `guard_ptr`.
+ [[maybe_unused]] auto guard_ptr = shared_from_this();
+ for (auto& watcher : watchers) {
+ logger().debug("canceling watcher cookie={} gid={} use_count={}",
+ watcher->get_cookie(),
+ watcher->get_watcher_gid(),
+ watcher->use_count());
+ watcher->cancel_notify(ninfo.notify_id);
+ }
+ std::ignore = send_completion(std::move(watchers));
+ watchers.clear();
+}
+
+} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::WatchTimeoutRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h
new file mode 100644
index 000000000..b3982141d
--- /dev/null
+++ b/src/crimson/osd/watch.h
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iterator>
+#include <map>
+#include <set>
+
+#include <seastar/core/shared_ptr.hh>
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/pg.h"
+#include "include/denc.h"
+
+namespace crimson::osd {
+
+class Notify;
+using NotifyRef = seastar::shared_ptr<Notify>;
+
+// NOTE: this inheritance really needs to be public. Otherwise `shared_from_this()`
+// will abort. According to cppreference.com:
+//
+// "The constructors of std::shared_ptr detect the presence
+// of an unambiguous and accessible (ie. public inheritance
+// is mandatory) (since C++17) enable_shared_from_this base".
+//
+// I expect the `seastar::shared_ptr` shares this behaviour.
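+//
+// A minimal illustration with std::shared_ptr (an aside, not crimson code):
+// with public inheritance shared_from_this() works once the object is owned
+// by a shared_ptr, while private inheritance would make it throw/abort:
+//
+//   struct Good : public std::enable_shared_from_this<Good> {};
+//   auto p = std::make_shared<Good>();
+//   auto q = p->shared_from_this();  // ok, q shares ownership with p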
+class Watch : public seastar::enable_shared_from_this<Watch> {
+ // this is a private tag for the public constructor that turns it into
+ // a de facto private one. The motivation behind the hack is that
+ // make_shared is used by create().
+ struct private_ctag_t{};
+
+ std::set<NotifyRef, std::less<>> in_progress_notifies;
+ crimson::net::ConnectionRef conn;
+ crimson::osd::ObjectContextRef obc;
+
+ watch_info_t winfo;
+ entity_name_t entity_name;
+ Ref<PG> pg;
+
+ seastar::timer<seastar::lowres_clock> timeout_timer;
+
+ seastar::future<> start_notify(NotifyRef);
+ seastar::future<> send_notify_msg(NotifyRef);
+ seastar::future<> send_disconnect_msg();
+
+ friend Notify;
+ friend class WatchTimeoutRequest;
+
+public:
+ Watch(private_ctag_t,
+ crimson::osd::ObjectContextRef obc,
+ const watch_info_t& winfo,
+ const entity_name_t& entity_name,
+ Ref<PG> pg)
+ : obc(std::move(obc)),
+ winfo(winfo),
+ entity_name(entity_name),
+ pg(std::move(pg)),
+ timeout_timer([this] {
+ return do_watch_timeout();
+ }) {
+ assert(this->pg);
+ }
+ ~Watch();
+
+ seastar::future<> connect(crimson::net::ConnectionRef, bool);
+ void disconnect();
+ bool is_alive() const {
+ return true;
+ }
+ bool is_connected() const {
+ return static_cast<bool>(conn);
+ }
+ void got_ping(utime_t);
+
+ void discard_state();
+
+ seastar::future<> remove();
+
+ /// Call when notify_ack received on notify_id
+ seastar::future<> notify_ack(
+ uint64_t notify_id, ///< [in] id of acked notify
+ const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer
+
+ template <class... Args>
+ static seastar::shared_ptr<Watch> create(Args&&... args) {
+ return seastar::make_shared<Watch>(private_ctag_t{},
+ std::forward<Args>(args)...);
+ };
+
+ uint64_t get_watcher_gid() const {
+ return entity_name.num();
+ }
+ auto get_pg() const {
+ return pg;
+ }
+ auto& get_entity() const {
+ return entity_name;
+ }
+ auto& get_cookie() const {
+ return winfo.cookie;
+ }
+ auto& get_peer_addr() const {
+ return winfo.addr;
+ }
+ void cancel_notify(const uint64_t notify_id);
+ void do_watch_timeout();
+};
+
+using WatchRef = seastar::shared_ptr<Watch>;
+
+struct notify_reply_t {
+ uint64_t watcher_gid;
+ uint64_t watcher_cookie;
+ ceph::bufferlist bl;
+
+ bool operator<(const notify_reply_t& rhs) const;
+ DENC(notify_reply_t, v, p) {
+ // there is no versioning / preamble
+ denc(v.watcher_gid, p);
+ denc(v.watcher_cookie, p);
+ denc(v.bl, p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs);
+
+class Notify : public seastar::enable_shared_from_this<Notify> {
+ std::set<WatchRef> watchers;
+ const notify_info_t ninfo;
+ crimson::net::ConnectionRef conn;
+ const uint64_t client_gid;
+ const uint64_t user_version;
+ bool complete{false};
+ bool discarded{false};
+ seastar::timer<seastar::lowres_clock> timeout_timer{
+ [this] { do_notify_timeout(); }
+ };
+
+ ~Notify();
+
+ /// (gid,cookie) -> reply_bl for everyone who acked the notify
+ std::multiset<notify_reply_t> notify_replies;
+
+ uint64_t get_id() const { return ninfo.notify_id; }
+
+ /// Sends notify completion if watchers.empty() or timeout
+ seastar::future<> send_completion(
+ std::set<WatchRef> timedout_watchers = {});
+
+ /// Called on Notify timeout
+ void do_notify_timeout();
+
+ Notify(crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version);
+ template <class WatchIteratorT>
+ Notify(WatchIteratorT begin,
+ WatchIteratorT end,
+ crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version);
+ // this is a private tag for the public constructor that turns it into
+ // a de facto private one. The motivation behind the hack is that
+ // make_shared is used by the create_n_propagate factory.
+ struct private_ctag_t{};
+
+ using ptr_t = seastar::shared_ptr<Notify>;
+ friend bool operator<(const ptr_t& lhs, const ptr_t& rhs) {
+ assert(lhs);
+ assert(rhs);
+ return lhs->get_id() < rhs->get_id();
+ }
+ friend bool operator<(const ptr_t& ptr, const uint64_t id) {
+ assert(ptr);
+ return ptr->get_id() < id;
+ }
+ friend bool operator<(const uint64_t id, const ptr_t& ptr) {
+ assert(ptr);
+ return id < ptr->get_id();
+ }
+
+ friend Watch;
+
+public:
+ template <class... Args>
+ Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) {
+ }
+
+ template <class WatchIteratorT, class... Args>
+ static seastar::future<> create_n_propagate(
+ WatchIteratorT begin,
+ WatchIteratorT end,
+ Args&&... args);
+
+ seastar::future<> remove_watcher(WatchRef watch);
+ seastar::future<> complete_watcher(WatchRef watch,
+ const ceph::bufferlist& reply_bl);
+};
+
+
+template <class WatchIteratorT>
+Notify::Notify(WatchIteratorT begin,
+ WatchIteratorT end,
+ crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version)
+ : watchers(begin, end),
+ ninfo(ninfo),
+ conn(std::move(conn)),
+ client_gid(client_gid),
+ user_version(user_version) {
+ assert(!std::empty(watchers));
+ if (ninfo.timeout) {
+ timeout_timer.arm(std::chrono::seconds{ninfo.timeout});
+ }
+}
+
+template <class WatchIteratorT, class... Args>
+seastar::future<> Notify::create_n_propagate(
+ WatchIteratorT begin,
+ WatchIteratorT end,
+ Args&&... args)
+{
+ static_assert(
+ std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type,
+ crimson::osd::WatchRef>);
+ if (begin == end) {
+ auto notify = seastar::make_shared<Notify>(
+ private_ctag_t{},
+ std::forward<Args>(args)...);
+ return notify->send_completion();
+ } else {
+ auto notify = seastar::make_shared<Notify>(
+ private_ctag_t{},
+ begin, end,
+ std::forward<Args>(args)...);
+ return seastar::do_for_each(begin, end, [=] (auto& watchref) {
+ return watchref->start_notify(notify);
+ });
+ }
+}
+
+} // namespace crimson::osd
+
+WRITE_CLASS_DENC(crimson::osd::notify_reply_t)
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::notify_reply_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/tools/CMakeLists.txt b/src/crimson/tools/CMakeLists.txt
new file mode 100644
index 000000000..fc18ff90b
--- /dev/null
+++ b/src/crimson/tools/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_executable(crimson-store-nbd
+ store_nbd/store-nbd.cc
+ store_nbd/tm_driver.cc
+ store_nbd/fs_driver.cc
+ store_nbd/block_driver.cc
+ )
+target_link_libraries(crimson-store-nbd
+ crimson-os)
+install(TARGETS crimson-store-nbd DESTINATION bin)
+
+add_executable(perf-crimson-msgr perf_crimson_msgr.cc)
+target_link_libraries(perf-crimson-msgr crimson)
+
+add_executable(perf-async-msgr perf_async_msgr.cc)
+target_link_libraries(perf-async-msgr ceph-common global ${ALLOC_LIBS})
+
+add_executable(perf-staged-fltree perf_staged_fltree.cc)
+if(WITH_TESTS)
+  target_link_libraries(perf-staged-fltree crimson-seastore crimson::gtest)
+else()
+  target_link_libraries(perf-staged-fltree crimson-seastore)
+endif()
diff --git a/src/crimson/tools/perf_async_msgr.cc b/src/crimson/tools/perf_async_msgr.cc
new file mode 100644
index 000000000..38cc84fbb
--- /dev/null
+++ b/src/crimson/tools/perf_async_msgr.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include "auth/Auth.h"
+#include "global/global_init.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "messages/MOSDOp.h"
+
+#include "auth/DummyAuth.h"
+
+namespace {
+
+constexpr int CEPH_OSD_PROTOCOL = 10;
+
+struct Server {
+ Server(CephContext* cct, unsigned msg_len)
+ : dummy_auth(cct), dispatcher(cct, msg_len)
+ {
+ msgr.reset(Messenger::create(cct, "async", entity_name_t::OSD(0), "server", 0));
+ dummy_auth.auth_registry.refresh_config();
+ msgr->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ msgr->set_default_policy(Messenger::Policy::stateless_server(0));
+ msgr->set_auth_client(&dummy_auth);
+ msgr->set_auth_server(&dummy_auth);
+ }
+ DummyAuthClientServer dummy_auth;
+ std::unique_ptr<Messenger> msgr;
+ struct ServerDispatcher : Dispatcher {
+ unsigned msg_len = 0;
+ bufferlist msg_data;
+
+ ServerDispatcher(CephContext* cct, unsigned msg_len)
+ : Dispatcher(cct), msg_len(msg_len)
+ {
+ msg_data.append_zero(msg_len);
+ }
+ bool ms_can_fast_dispatch_any() const override {
+ return true;
+ }
+ bool ms_can_fast_dispatch(const Message* m) const override {
+ return m->get_type() == CEPH_MSG_OSD_OP;
+ }
+ void ms_fast_dispatch(Message* m) override {
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ const static pg_t pgid;
+ const static object_locator_t oloc;
+ const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(),
+ pgid.pool(), oloc.nspace);
+ static spg_t spgid(pgid);
+ MOSDOp *rep = new MOSDOp(0, 0, hobj, spgid, 0, 0, 0);
+ bufferlist data(msg_data);
+ rep->write(0, msg_len, data);
+ rep->set_tid(m->get_tid());
+ m->get_connection()->send_message(rep);
+ m->put();
+ }
+ bool ms_dispatch(Message*) override {
+ ceph_abort();
+ }
+ bool ms_handle_reset(Connection*) override {
+ return true;
+ }
+ void ms_handle_remote_reset(Connection*) override {
+ }
+ bool ms_handle_refused(Connection*) override {
+ return true;
+ }
+ } dispatcher;
+};
+
+}
+
+static void run(CephContext* cct, entity_addr_t addr, unsigned bs)
+{
+ std::cout << "async server listening at " << addr << std::endl;
+ Server server{cct, bs};
+ server.msgr->bind(addr);
+ server.msgr->add_dispatcher_head(&server.dispatcher);
+ server.msgr->start();
+ server.msgr->wait();
+}
+
+int main(int argc, char** argv)
+{
+ namespace po = boost::program_options;
+ po::options_description desc{"Allowed options"};
+ desc.add_options()
+ ("help,h", "show help message")
+ ("addr", po::value<std::string>()->default_value("v2:127.0.0.1:9010"),
+ "server address(crimson only supports msgr v2 protocol)")
+ ("bs", po::value<unsigned>()->default_value(0),
+ "server block size")
+ ("crc-enabled", po::value<bool>()->default_value(false),
+ "enable CRC checks")
+ ("threads", po::value<unsigned>()->default_value(3),
+ "async messenger worker threads");
+ po::variables_map vm;
+ std::vector<std::string> unrecognized_options;
+ try {
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ if (vm.count("help")) {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+ po::notify(vm);
+ unrecognized_options = po::collect_unrecognized(parsed.options, po::include_positional);
+ } catch(const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+
+ auto addr = vm["addr"].as<std::string>();
+ entity_addr_t target_addr;
+ target_addr.parse(addr.c_str(), nullptr);
+ ceph_assert_always(target_addr.is_msgr2());
+ auto bs = vm["bs"].as<unsigned>();
+ auto crc_enabled = vm["crc-enabled"].as<bool>();
+ auto worker_threads = vm["threads"].as<unsigned>();
+
+ std::vector<const char*> args(argv, argv + argc);
+ auto cct = global_init(nullptr, args,
+ CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+ common_init_finish(cct.get());
+
+ if (crc_enabled) {
+ cct->_conf.set_val("ms_crc_header", "true");
+ cct->_conf.set_val("ms_crc_data", "true");
+ } else {
+ cct->_conf.set_val("ms_crc_header", "false");
+ cct->_conf.set_val("ms_crc_data", "false");
+ }
+
+ cct->_conf.set_val("ms_async_op_threads", fmt::format("{}", worker_threads));
+
+ std::cout << "server[" << addr
+ << "](bs=" << bs
+ << ", crc_enabled=" << crc_enabled
+ << ", worker_threads=" << worker_threads
+ << ")" << std::endl;
+
+ run(cct.get(), target_addr, bs);
+}
diff --git a/src/crimson/tools/perf_crimson_msgr.cc b/src/crimson/tools/perf_crimson_msgr.cc
new file mode 100644
index 000000000..aa5753442
--- /dev/null
+++ b/src/crimson/tools/perf_crimson_msgr.cc
@@ -0,0 +1,1222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <map>
+#include <boost/program_options.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/smp.hh>
+#include <seastar/core/thread.hh>
+
+#include "common/ceph_time.h"
+#include "messages/MOSDOp.h"
+#include "include/random.h"
+
+#include "crimson/auth/DummyAuth.h"
+#include "crimson/common/log.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/stop_signal.h"
+
+using namespace std;
+using namespace std::chrono_literals;
+
+using lowres_clock_t = seastar::lowres_system_clock;
+
+namespace bpo = boost::program_options;
+
+namespace {
+
+template<typename Message>
+using Ref = boost::intrusive_ptr<Message>;
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+template <typename T, typename... Args>
+seastar::future<T*> create_sharded(Args... args) {
+ // seems we should only construct/stop shards on #0
+ return seastar::smp::submit_to(0, [=] {
+ auto sharded_obj = seastar::make_lw_shared<seastar::sharded<T>>();
+ return sharded_obj->start(args...).then([sharded_obj]() {
+ seastar::engine().at_exit([sharded_obj]() {
+ return sharded_obj->stop().then([sharded_obj] {});
+ });
+ return sharded_obj.get();
+ });
+ }).then([] (seastar::sharded<T> *ptr_shard) {
+ // return the pointer valid for the caller CPU
+ return &ptr_shard->local();
+ });
+}
+
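+// look up seastar's per-shard "reactor_utilization" gauge in the internal
+// metric value map; the gauge reports reactor busy time as a percentage for
+// the calling shard.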
+double get_reactor_utilization() {
+ auto &value_map = seastar::metrics::impl::get_value_map();
+ auto found = value_map.find("reactor_utilization");
+ assert(found != value_map.end());
+ auto &[full_name, metric_family] = *found;
+ std::ignore = full_name;
+ assert(metric_family.size() == 1);
+ const auto& [labels, metric] = *metric_family.begin();
+ std::ignore = labels;
+ auto value = (*metric)();
+ return value.ui();
+}
+
+enum class perf_mode_t {
+ both,
+ client,
+ server
+};
+
+struct client_config {
+ entity_addr_t server_addr;
+ unsigned block_size;
+ unsigned ramptime;
+ unsigned msgtime;
+ unsigned num_clients;
+ unsigned num_conns;
+ unsigned depth;
+ bool skip_core_0;
+
+ std::string str() const {
+ std::ostringstream out;
+ out << "client[>> " << server_addr
+ << "](bs=" << block_size
+ << ", ramptime=" << ramptime
+ << ", msgtime=" << msgtime
+ << ", num_clients=" << num_clients
+ << ", num_conns=" << num_conns
+ << ", depth=" << depth
+ << ", skip_core_0=" << skip_core_0
+ << ")";
+ return out.str();
+ }
+
+ static client_config load(bpo::variables_map& options) {
+ client_config conf;
+ entity_addr_t addr;
+ ceph_assert(addr.parse(options["server-addr"].as<std::string>().c_str(), nullptr));
+ ceph_assert_always(addr.is_msgr2());
+
+ conf.server_addr = addr;
+ conf.block_size = options["client-bs"].as<unsigned>();
+ conf.ramptime = options["ramptime"].as<unsigned>();
+ conf.msgtime = options["msgtime"].as<unsigned>();
+ conf.num_clients = options["clients"].as<unsigned>();
+ ceph_assert_always(conf.num_clients > 0);
+ conf.num_conns = options["conns-per-client"].as<unsigned>();
+ ceph_assert_always(conf.num_conns > 0);
+ conf.depth = options["depth"].as<unsigned>();
+ conf.skip_core_0 = options["client-skip-core-0"].as<bool>();
+ return conf;
+ }
+};
+
+struct server_config {
+ entity_addr_t addr;
+ unsigned block_size;
+ bool is_fixed_cpu;
+ unsigned core;
+
+ std::string str() const {
+ std::ostringstream out;
+ out << "server[" << addr
+ << "](bs=" << block_size
+ << ", is_fixed_cpu=" << is_fixed_cpu
+ << ", core=" << core
+ << ")";
+ return out.str();
+ }
+
+ static server_config load(bpo::variables_map& options) {
+ server_config conf;
+ entity_addr_t addr;
+ ceph_assert(addr.parse(options["server-addr"].as<std::string>().c_str(), nullptr));
+ ceph_assert_always(addr.is_msgr2());
+
+ conf.addr = addr;
+ conf.block_size = options["server-bs"].as<unsigned>();
+ conf.is_fixed_cpu = options["server-fixed-cpu"].as<bool>();
+ conf.core = options["server-core"].as<unsigned>();
+ return conf;
+ }
+};
+
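+// message latency is sampled on one out of every SAMPLE_RATE messages
+// (identified by tid), both on the sending and on the receiving path.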
+const unsigned SAMPLE_RATE = 256;
+
+static seastar::future<> run(
+ perf_mode_t mode,
+ const client_config& client_conf,
+ const server_config& server_conf,
+ bool crc_enabled)
+{
+ struct test_state {
+ struct Server final
+ : public crimson::net::Dispatcher,
+ public seastar::peering_sharded_service<Server> {
+ // available only in msgr_sid
+ crimson::net::MessengerRef msgr;
+ crimson::auth::DummyAuthClientServer dummy_auth;
+ const seastar::shard_id msgr_sid;
+ std::string lname;
+
+ bool is_fixed_cpu = true;
+ bool is_stopped = false;
+ std::optional<seastar::future<>> fut_report;
+
+ unsigned conn_count = 0;
+ unsigned msg_count = 0;
+ MessageRef last_msg;
+
+ // available in all shards
+ unsigned msg_len;
+ bufferlist msg_data;
+
+ Server(seastar::shard_id msgr_sid, unsigned msg_len, bool needs_report)
+ : msgr_sid{msgr_sid},
+ msg_len{msg_len} {
+ lname = fmt::format("server@{}", msgr_sid);
+ msg_data.append_zero(msg_len);
+
+ if (seastar::this_shard_id() == msgr_sid &&
+ needs_report) {
+ start_report();
+ }
+ }
+
+ void ms_handle_connect(
+ crimson::net::ConnectionRef,
+ seastar::shard_id) override {
+ ceph_abort("impossible, server won't connect");
+ }
+
+ void ms_handle_accept(
+ crimson::net::ConnectionRef,
+ seastar::shard_id new_shard,
+ bool is_replace) override {
+ ceph_assert_always(new_shard == seastar::this_shard_id());
+ auto &server = container().local();
+ ++server.conn_count;
+ }
+
+ void ms_handle_reset(
+ crimson::net::ConnectionRef,
+ bool) override {
+ auto &server = container().local();
+ --server.conn_count;
+ }
+
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef c, MessageRef m) override {
+ assert(c->get_shard_id() == seastar::this_shard_id());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+ auto &server = container().local();
+
+ // server replies with MOSDOp to generate server-side write workload
+ const static pg_t pgid;
+ const static object_locator_t oloc;
+ const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(),
+ pgid.pool(), oloc.nspace);
+ static spg_t spgid(pgid);
+ auto rep = crimson::make_message<MOSDOp>(0, 0, hobj, spgid, 0, 0, 0);
+ bufferlist data(server.msg_data);
+ rep->write(0, server.msg_len, data);
+ rep->set_tid(m->get_tid());
+ ++server.msg_count;
+ std::ignore = c->send(std::move(rep));
+
+ if (server.msg_count % 16 == 0) {
+ server.last_msg = std::move(m);
+ }
+ return {seastar::now()};
+ }
+
+ seastar::future<> init(const entity_addr_t& addr, bool is_fixed_cpu) {
+ return container().invoke_on(
+ msgr_sid, [addr, is_fixed_cpu](auto &server) {
+ // server msgr is always with nonce 0
+ server.msgr = crimson::net::Messenger::create(
+ entity_name_t::OSD(server.msgr_sid),
+ server.lname, 0, is_fixed_cpu);
+ server.msgr->set_default_policy(crimson::net::SocketPolicy::stateless_server(0));
+ server.msgr->set_auth_client(&server.dummy_auth);
+ server.msgr->set_auth_server(&server.dummy_auth);
+ server.is_fixed_cpu = is_fixed_cpu;
+ return server.msgr->bind(entity_addrvec_t{addr}
+ ).safe_then([&server] {
+ return server.msgr->start({&server});
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [addr] (const std::error_code& e) {
+ logger().error("Server: "
+ "there is another instance running at {}", addr);
+ ceph_abort();
+ }));
+ });
+ }
+
+ seastar::future<> shutdown() {
+ logger().info("{} shutdown...", lname);
+ return container().invoke_on(
+ msgr_sid, [](auto &server) {
+ server.is_stopped = true;
+ ceph_assert(server.msgr);
+ server.msgr->stop();
+ return server.msgr->shutdown(
+ ).then([&server] {
+ if (server.fut_report.has_value()) {
+ return std::move(server.fut_report.value());
+ } else {
+ return seastar::now();
+ }
+ });
+ });
+ }
+
+ private:
+ struct ShardReport {
+ unsigned msg_count = 0;
+
+ // per-interval metrics
+ double reactor_utilization;
+ unsigned conn_count = 0;
+ int msg_size = 0;
+ unsigned msg_count_interval = 0;
+ };
+
+ // should not be called frequently, to avoid impacting performance
+ void get_report(ShardReport& last) {
+ unsigned last_msg_count = last.msg_count;
+ int msg_size = -1;
+ if (last_msg) {
+ auto msg = boost::static_pointer_cast<MOSDOp>(last_msg);
+ msg->finish_decode();
+ ceph_assert_always(msg->ops.size() == 1);
+ msg_size = msg->ops[0].op.extent.length;
+ last_msg.reset();
+ }
+
+ last.msg_count = msg_count;
+ last.reactor_utilization = get_reactor_utilization();
+ last.conn_count = conn_count;
+ last.msg_size = msg_size;
+ last.msg_count_interval = msg_count - last_msg_count;
+ }
+
+ struct TimerReport {
+ unsigned elapsed = 0u;
+ mono_time start_time = mono_clock::zero();
+ std::vector<ShardReport> reports;
+
+ TimerReport(unsigned shards) : reports(shards) {}
+ };
+
+ void start_report() {
+ seastar::promise<> pr_report;
+ fut_report = pr_report.get_future();
+ seastar::do_with(
+ TimerReport(seastar::smp::count),
+ [this](auto &report) {
+ return seastar::do_until(
+ [this] { return is_stopped; },
+ [&report, this] {
+ return seastar::sleep(2s
+ ).then([&report, this] {
+ report.elapsed += 2;
+ if (is_fixed_cpu) {
+ return seastar::smp::submit_to(msgr_sid,
+ [&report, this] {
+ auto &server = container().local();
+ server.get_report(report.reports[seastar::this_shard_id()]);
+ }).then([&report, this] {
+ auto now = mono_clock::now();
+ auto prv = report.start_time;
+ report.start_time = now;
+ if (prv == mono_clock::zero()) {
+ // cannot compute duration
+ return;
+ }
+ std::chrono::duration<double> duration_d = now - prv;
+ double duration = duration_d.count();
+ auto &ireport = report.reports[msgr_sid];
+ double iops = ireport.msg_count_interval / duration;
+ double throughput_MB = -1;
+ if (ireport.msg_size >= 0) {
+ throughput_MB = iops * ireport.msg_size / 1048576;
+ }
+ std::ostringstream sout;
+ sout << setfill(' ')
+ << report.elapsed
+ << "(" << std::setw(5) << duration << ") "
+ << std::setw(9) << iops << "IOPS "
+ << std::setw(8) << throughput_MB << "MiB/s "
+ << ireport.reactor_utilization
+ << "(" << ireport.conn_count << ")";
+ std::cout << sout.str() << std::endl;
+ });
+ } else {
+ return seastar::smp::invoke_on_all([&report, this] {
+ auto &server = container().local();
+ server.get_report(report.reports[seastar::this_shard_id()]);
+ }).then([&report, this] {
+ auto now = mono_clock::now();
+ auto prv = report.start_time;
+ report.start_time = now;
+ if (prv == mono_clock::zero()) {
+ // cannot compute duration
+ return;
+ }
+ std::chrono::duration<double> duration_d = now - prv;
+ double duration = duration_d.count();
+ unsigned num_msgs = 0;
+ // -1 means unavailable, -2 means mismatch
+ int msg_size = -1;
+ for (auto &i : report.reports) {
+ if (i.msg_size >= 0) {
+ if (msg_size == -2) {
+ // pass
+ } else if (msg_size == -1) {
+ msg_size = i.msg_size;
+ } else {
+ if (msg_size != i.msg_size) {
+ msg_size = -2;
+ }
+ }
+ }
+ num_msgs += i.msg_count_interval;
+ }
+ double iops = num_msgs / duration;
+ double throughput_MB = msg_size;
+ if (msg_size >= 0) {
+ throughput_MB = iops * msg_size / 1048576;
+ }
+ std::ostringstream sout;
+ sout << setfill(' ')
+ << report.elapsed
+ << "(" << std::setw(5) << duration << ") "
+ << std::setw(9) << iops << "IOPS "
+ << std::setw(8) << throughput_MB << "MiB/s ";
+ for (auto &i : report.reports) {
+ sout << i.reactor_utilization
+ << "(" << i.conn_count << ") ";
+ }
+ std::cout << sout.str() << std::endl;
+ });
+ }
+ });
+ }
+ );
+ }).then([this] {
+ logger().info("report is stopped!");
+ }).forward_to(std::move(pr_report));
+ }
+ };
+
+ struct Client final
+ : public crimson::net::Dispatcher,
+ public seastar::peering_sharded_service<Client> {
+
+ struct ConnStats {
+ mono_time connecting_time = mono_clock::zero();
+ mono_time connected_time = mono_clock::zero();
+ unsigned received_count = 0u;
+
+ mono_time start_time = mono_clock::zero();
+ unsigned start_count = 0u;
+
+ unsigned sampled_count = 0u;
+ double sampled_total_lat_s = 0.0;
+
+ // for reporting only
+ mono_time finish_time = mono_clock::zero();
+
+ void start_connecting() {
+ connecting_time = mono_clock::now();
+ }
+
+ void finish_connecting() {
+ ceph_assert_always(connected_time == mono_clock::zero());
+ connected_time = mono_clock::now();
+ }
+
+ void start_collect() {
+ ceph_assert_always(connected_time != mono_clock::zero());
+ start_time = mono_clock::now();
+ start_count = received_count;
+ sampled_count = 0u;
+ sampled_total_lat_s = 0.0;
+ finish_time = mono_clock::zero();
+ }
+
+ void prepare_summary(const ConnStats &current) {
+ *this = current;
+ finish_time = mono_clock::now();
+ }
+ };
+
+ struct PeriodStats {
+ mono_time start_time = mono_clock::zero();
+ unsigned start_count = 0u;
+ unsigned sampled_count = 0u;
+ double sampled_total_lat_s = 0.0;
+
+ // for reporting only
+ mono_time finish_time = mono_clock::zero();
+ unsigned finish_count = 0u;
+ unsigned depth = 0u;
+
+ void start_collect(unsigned received_count) {
+ start_time = mono_clock::now();
+ start_count = received_count;
+ sampled_count = 0u;
+ sampled_total_lat_s = 0.0;
+ }
+
+ void reset_period(
+ unsigned received_count, unsigned _depth, PeriodStats &snapshot) {
+ snapshot.start_time = start_time;
+ snapshot.start_count = start_count;
+ snapshot.sampled_count = sampled_count;
+ snapshot.sampled_total_lat_s = sampled_total_lat_s;
+ snapshot.finish_time = mono_clock::now();
+ snapshot.finish_count = received_count;
+ snapshot.depth = _depth;
+
+ start_collect(received_count);
+ }
+ };
+
+ struct JobReport {
+ std::string name;
+ unsigned depth = 0;
+ double connect_time_s = 0;
+ unsigned total_msgs = 0;
+ double messaging_time_s = 0;
+ double latency_ms = 0;
+ double iops = 0;
+ double throughput_mbps = 0;
+
+ void account(const JobReport &stats) {
+ depth += stats.depth;
+ connect_time_s += stats.connect_time_s;
+ total_msgs += stats.total_msgs;
+ messaging_time_s += stats.messaging_time_s;
+ latency_ms += stats.latency_ms;
+ iops += stats.iops;
+ throughput_mbps += stats.throughput_mbps;
+ }
+
+ void report() const {
+ auto str = fmt::format(
+ "{}(depth={}):\n"
+ " connect time: {:08f}s\n"
+ " messages received: {}\n"
+ " messaging time: {:08f}s\n"
+ " latency: {:08f}ms\n"
+ " IOPS: {:08f}\n"
+ " out throughput: {:08f}MB/s",
+ name, depth, connect_time_s,
+ total_msgs, messaging_time_s,
+ latency_ms, iops,
+ throughput_mbps);
+ std::cout << str << std::endl;
+ }
+ };
+
+ struct ConnectionPriv : public crimson::net::Connection::user_private_t {
+ unsigned index;
+ ConnectionPriv(unsigned i) : index{i} {}
+ };
+
+ struct ConnState {
+ crimson::net::MessengerRef msgr;
+ ConnStats conn_stats;
+ PeriodStats period_stats;
+ seastar::semaphore depth;
+ std::vector<lowres_clock_t::time_point> time_msgs_sent;
+ unsigned sent_count = 0u;
+ crimson::net::ConnectionRef active_conn;
+ bool stop_send = false;
+ seastar::promise<JobReport> stopped_send_promise;
+
+ ConnState(std::size_t _depth)
+ : depth{_depth},
+ time_msgs_sent{_depth, lowres_clock_t::time_point::min()} {}
+
+ unsigned get_current_units() const {
+ ceph_assert(depth.available_units() >= 0);
+ return depth.current();
+ }
+
+ seastar::future<JobReport> stop_dispatch_messages() {
+ stop_send = true;
+ depth.broken(DepthBroken());
+ return stopped_send_promise.get_future();
+ }
+ };
+
+ const seastar::shard_id sid;
+ const unsigned id;
+ const std::optional<unsigned> server_sid;
+
+ const unsigned num_clients;
+ const unsigned num_conns;
+ const unsigned msg_len;
+ bufferlist msg_data;
+ const unsigned nr_depth;
+ const unsigned nonce_base;
+ crimson::auth::DummyAuthClientServer dummy_auth;
+
+ std::vector<ConnState> conn_states;
+
+ Client(unsigned num_clients,
+ unsigned num_conns,
+ unsigned msg_len,
+ unsigned _depth,
+ unsigned nonce_base,
+ std::optional<unsigned> server_sid)
+ : sid{seastar::this_shard_id()},
+ id{sid + num_clients - seastar::smp::count},
+ server_sid{server_sid},
+ num_clients{num_clients},
+ num_conns{num_conns},
+ msg_len{msg_len},
+ nr_depth{_depth},
+ nonce_base{nonce_base} {
+ if (is_active()) {
+ for (unsigned i = 0; i < num_conns; ++i) {
+ conn_states.emplace_back(nr_depth);
+ }
+ }
+ msg_data.append_zero(msg_len);
+ }
+
+ std::string get_name(unsigned i) {
+ return fmt::format("client{}Conn{}@{}", id, i, sid);
+ }
+
+ void ms_handle_connect(
+ crimson::net::ConnectionRef conn,
+ seastar::shard_id prv_shard) override {
+ ceph_assert_always(prv_shard == seastar::this_shard_id());
+ assert(is_active());
+ unsigned index = static_cast<ConnectionPriv&>(conn->get_user_private()).index;
+ auto &conn_state = conn_states[index];
+ conn_state.conn_stats.finish_connecting();
+ }
+
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef conn, MessageRef m) override {
+ assert(is_active());
+ // server replies with MOSDOp to generate server-side write workload
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+ unsigned index = static_cast<ConnectionPriv&>(conn->get_user_private()).index;
+ assert(index < num_conns);
+ auto &conn_state = conn_states[index];
+
+ auto msg_id = m->get_tid();
+ if (msg_id % SAMPLE_RATE == 0) {
+ auto msg_index = msg_id % conn_state.time_msgs_sent.size();
+ ceph_assert(conn_state.time_msgs_sent[msg_index] !=
+ lowres_clock_t::time_point::min());
+ std::chrono::duration<double> cur_latency =
+ lowres_clock_t::now() - conn_state.time_msgs_sent[msg_index];
+ conn_state.conn_stats.sampled_total_lat_s += cur_latency.count();
+ ++(conn_state.conn_stats.sampled_count);
+ conn_state.period_stats.sampled_total_lat_s += cur_latency.count();
+ ++(conn_state.period_stats.sampled_count);
+ conn_state.time_msgs_sent[msg_index] = lowres_clock_t::time_point::min();
+ }
+
+ ++(conn_state.conn_stats.received_count);
+ conn_state.depth.signal(1);
+
+ return {seastar::now()};
+ }
+
+ // should the messenger be started on this shard?
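+ // e.g. with seastar::smp::count == 4 and num_clients == 2, shards 2 and 3
+ // (client ids 0 and 1) run client messengers; the lower shards are left
+ // for the server and/or the reserved core 0.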
+ bool is_active() {
+ ceph_assert(seastar::this_shard_id() == sid);
+ return sid + num_clients >= seastar::smp::count;
+ }
+
+ seastar::future<> init() {
+ return container().invoke_on_all([](auto& client) {
+ if (client.is_active()) {
+ return seastar::do_for_each(
+ boost::make_counting_iterator(0u),
+ boost::make_counting_iterator(client.num_conns),
+ [&client](auto i) {
+ auto &conn_state = client.conn_states[i];
+ std::string name = client.get_name(i);
+ conn_state.msgr = crimson::net::Messenger::create(
+ entity_name_t::OSD(client.id * client.num_conns + i),
+ name, client.nonce_base + client.id * client.num_conns + i, true);
+ conn_state.msgr->set_default_policy(crimson::net::SocketPolicy::lossy_client(0));
+ conn_state.msgr->set_auth_client(&client.dummy_auth);
+ conn_state.msgr->set_auth_server(&client.dummy_auth);
+ return conn_state.msgr->start({&client});
+ });
+ }
+ return seastar::now();
+ });
+ }
+
+ seastar::future<> shutdown() {
+ return seastar::do_with(
+ std::vector<JobReport>(num_clients * num_conns),
+ [this](auto &all_stats) {
+ return container().invoke_on_all([&all_stats](auto& client) {
+ if (!client.is_active()) {
+ return seastar::now();
+ }
+
+ return seastar::parallel_for_each(
+ boost::make_counting_iterator(0u),
+ boost::make_counting_iterator(client.num_conns),
+ [&all_stats, &client](auto i) {
+ logger().info("{} shutdown...", client.get_name(i));
+ auto &conn_state = client.conn_states[i];
+ return conn_state.stop_dispatch_messages(
+ ).then([&all_stats, &client, i](auto stats) {
+ all_stats[client.id * client.num_conns + i] = stats;
+ });
+ }).then([&client] {
+ return seastar::do_for_each(
+ boost::make_counting_iterator(0u),
+ boost::make_counting_iterator(client.num_conns),
+ [&client](auto i) {
+ auto &conn_state = client.conn_states[i];
+ ceph_assert(conn_state.msgr);
+ conn_state.msgr->stop();
+ return conn_state.msgr->shutdown();
+ });
+ });
+ }).then([&all_stats, this] {
+ auto nr_jobs = all_stats.size();
+ JobReport summary;
+ std::vector<JobReport> clients(num_clients);
+
+ for (unsigned i = 0; i < nr_jobs; ++i) {
+ auto &stats = all_stats[i];
+ stats.report();
+ clients[i / num_conns].account(stats);
+ summary.account(stats);
+ }
+
+ std::cout << std::endl;
+ std::cout << "per client:" << std::endl;
+ for (unsigned i = 0; i < num_clients; ++i) {
+ auto &stats = clients[i];
+ stats.name = fmt::format("client{}", i);
+ stats.connect_time_s /= num_conns;
+ stats.messaging_time_s /= num_conns;
+ stats.latency_ms /= num_conns;
+ stats.report();
+ }
+
+ std::cout << std::endl;
+ summary.name = fmt::format("all", nr_jobs);
+ summary.connect_time_s /= nr_jobs;
+ summary.messaging_time_s /= nr_jobs;
+ summary.latency_ms /= nr_jobs;
+ summary.report();
+ });
+ });
+ }
+
+ seastar::future<> connect_wait_verify(const entity_addr_t& peer_addr) {
+ return container().invoke_on_all([peer_addr](auto& client) {
+ // start clients in active cores
+ if (client.is_active()) {
+ for (unsigned i = 0; i < client.num_conns; ++i) {
+ auto &conn_state = client.conn_states[i];
+ conn_state.conn_stats.start_connecting();
+ conn_state.active_conn = conn_state.msgr->connect(peer_addr, entity_name_t::TYPE_OSD);
+ conn_state.active_conn->set_user_private(
+ std::make_unique<ConnectionPriv>(i));
+ }
+ // make sure handshake won't hurt the performance
+ return seastar::sleep(1s).then([&client] {
+ for (unsigned i = 0; i < client.num_conns; ++i) {
+ auto &conn_state = client.conn_states[i];
+ if (conn_state.conn_stats.connected_time == mono_clock::zero()) {
+ logger().error("\n{} not connected after 1s!\n",
+ client.get_name(i));
+ ceph_assert(false);
+ }
+ }
+ });
+ }
+ return seastar::now();
+ });
+ }
+
+ private:
+ class TimerReport {
+ private:
+ const unsigned num_clients;
+ const unsigned num_conns;
+ const unsigned msgtime;
+ const unsigned bytes_of_block;
+
+ unsigned elapsed = 0u;
+ std::vector<PeriodStats> snaps;
+ std::vector<ConnStats> summaries;
+ std::vector<double> client_reactor_utilizations;
+ std::optional<double> server_reactor_utilization;
+
+ public:
+ TimerReport(unsigned num_clients, unsigned num_conns, unsigned msgtime, unsigned bs)
+ : num_clients{num_clients},
+ num_conns{num_conns},
+ msgtime{msgtime},
+ bytes_of_block{bs},
+ snaps{num_clients * num_conns},
+ summaries{num_clients * num_conns},
+ client_reactor_utilizations(num_clients) {}
+
+ unsigned get_elapsed() const { return elapsed; }
+
+ PeriodStats& get_snap(unsigned client_id, unsigned i) {
+ return snaps[client_id * num_conns + i];
+ }
+
+ ConnStats& get_summary(unsigned client_id, unsigned i) {
+ return summaries[client_id * num_conns + i];
+ }
+
+ void set_client_reactor_utilization(unsigned client_id, double ru) {
+ client_reactor_utilizations[client_id] = ru;
+ }
+
+ void set_server_reactor_utilization(double ru) {
+ server_reactor_utilization = ru;
+ }
+
+ bool should_stop() const {
+ return elapsed >= msgtime;
+ }
+
+ seastar::future<> ticktock() {
+ return seastar::sleep(1s).then([this] {
+ ++elapsed;
+ });
+ }
+
+ void report_header() const {
+ std::ostringstream sout;
+ sout << std::setfill(' ')
+ << std::setw(6) << "sec"
+ << std::setw(7) << "depth"
+ << std::setw(10) << "IOPS"
+ << std::setw(9) << "MB/s"
+ << std::setw(9) << "lat(ms)";
+ std::cout << sout.str() << std::endl;
+ }
+
+ void report_period() {
+ std::chrono::duration<double> elapsed_d = 0s;
+ unsigned depth = 0u;
+ unsigned ops = 0u;
+ unsigned sampled_count = 0u;
+ double sampled_total_lat_s = 0.0;
+ for (const auto& snap: snaps) {
+ elapsed_d += (snap.finish_time - snap.start_time);
+ depth += snap.depth;
+ ops += (snap.finish_count - snap.start_count);
+ sampled_count += snap.sampled_count;
+ sampled_total_lat_s += snap.sampled_total_lat_s;
+ }
+ double elapsed_s = elapsed_d.count() / (num_clients * num_conns);
+ double iops = ops/elapsed_s;
+ std::ostringstream sout;
+ sout << setfill(' ')
+ << std::setw(5) << elapsed_s
+ << " "
+ << std::setw(6) << depth
+ << " "
+ << std::setw(9) << iops
+ << " "
+ << std::setw(8) << iops * bytes_of_block / 1048576
+ << " "
+ << std::setw(8) << (sampled_total_lat_s / sampled_count * 1000)
+ << " -- ";
+ if (server_reactor_utilization.has_value()) {
+ sout << *server_reactor_utilization << " -- ";
+ }
+ for (double cru : client_reactor_utilizations) {
+ sout << cru << ",";
+ }
+ std::cout << sout.str() << std::endl;
+ }
+
+ void report_summary() const {
+ std::chrono::duration<double> elapsed_d = 0s;
+ unsigned ops = 0u;
+ unsigned sampled_count = 0u;
+ double sampled_total_lat_s = 0.0;
+ for (const auto& summary: summaries) {
+ elapsed_d += (summary.finish_time - summary.start_time);
+ ops += (summary.received_count - summary.start_count);
+ sampled_count += summary.sampled_count;
+ sampled_total_lat_s += summary.sampled_total_lat_s;
+ }
+ double elapsed_s = elapsed_d.count() / (num_clients * num_conns);
+ double iops = ops / elapsed_s;
+ std::ostringstream sout;
+ sout << "--------------"
+ << " summary "
+ << "--------------\n"
+ << setfill(' ')
+ << std::setw(7) << elapsed_s
+ << std::setw(6) << "-"
+ << std::setw(8) << iops
+ << std::setw(8) << iops * bytes_of_block / 1048576
+ << std::setw(8) << (sampled_total_lat_s / sampled_count * 1000)
+ << "\n";
+ std::cout << sout.str() << std::endl;
+ }
+ };
+
+ seastar::future<> report_period(TimerReport& report) {
+ return container().invoke_on_all([&report] (auto& client) {
+ if (client.is_active()) {
+ for (unsigned i = 0; i < client.num_conns; ++i) {
+ auto &conn_state = client.conn_states[i];
+ PeriodStats& snap = report.get_snap(client.id, i);
+ conn_state.period_stats.reset_period(
+ conn_state.conn_stats.received_count,
+ client.nr_depth - conn_state.get_current_units(),
+ snap);
+ }
+ report.set_client_reactor_utilization(client.id, get_reactor_utilization());
+ }
+ if (client.server_sid.has_value() &&
+ seastar::this_shard_id() == *client.server_sid) {
+ assert(!client.is_active());
+ report.set_server_reactor_utilization(get_reactor_utilization());
+ }
+ }).then([&report] {
+ report.report_period();
+ });
+ }
+
+ seastar::future<> report_summary(TimerReport& report) {
+ return container().invoke_on_all([&report] (auto& client) {
+ if (client.is_active()) {
+ for (unsigned i = 0; i < client.num_conns; ++i) {
+ auto &conn_state = client.conn_states[i];
+ ConnStats& summary = report.get_summary(client.id, i);
+ summary.prepare_summary(conn_state.conn_stats);
+ }
+ }
+ }).then([&report] {
+ report.report_summary();
+ });
+ }
+
+ public:
+ seastar::future<> dispatch_with_timer(unsigned ramptime, unsigned msgtime) {
+ logger().info("[all clients]: start sending MOSDOps from {} clients * {} conns",
+ num_clients, num_conns);
+ return container().invoke_on_all([] (auto& client) {
+ if (client.is_active()) {
+ for (unsigned i = 0; i < client.num_conns; ++i) {
+ client.do_dispatch_messages(i);
+ }
+ }
+ }).then([ramptime] {
+ logger().info("[all clients]: ramping up {} seconds...", ramptime);
+ return seastar::sleep(std::chrono::seconds(ramptime));
+ }).then([this] {
+ return container().invoke_on_all([] (auto& client) {
+ if (client.is_active()) {
+ for (unsigned i = 0; i < client.num_conns; ++i) {
+ auto &conn_state = client.conn_states[i];
+ conn_state.conn_stats.start_collect();
+ conn_state.period_stats.start_collect(conn_state.conn_stats.received_count);
+ }
+ }
+ });
+ }).then([this, msgtime] {
+ logger().info("[all clients]: reporting {} seconds...\n", msgtime);
+ return seastar::do_with(
+ TimerReport(num_clients, num_conns, msgtime, msg_len),
+ [this](auto& report) {
+ report.report_header();
+ return seastar::do_until(
+ [&report] { return report.should_stop(); },
+ [&report, this] {
+ return report.ticktock().then([&report, this] {
+ // report period every 1s
+ return report_period(report);
+ }).then([&report, this] {
+ // report summary every 10s
+ if (report.get_elapsed() % 10 == 0) {
+ return report_summary(report);
+ } else {
+ return seastar::now();
+ }
+ });
+ }
+ ).then([&report, this] {
+ // report the final summary
+ if (report.get_elapsed() % 10 != 0) {
+ return report_summary(report);
+ } else {
+ return seastar::now();
+ }
+ });
+ });
+ });
+ }
+
+ private:
+ seastar::future<> send_msg(ConnState &conn_state) {
+ ceph_assert(seastar::this_shard_id() == sid);
+ conn_state.sent_count += 1;
+ return conn_state.depth.wait(1
+ ).then([this, &conn_state] {
+ const static pg_t pgid;
+ const static object_locator_t oloc;
+ const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(),
+ pgid.pool(), oloc.nspace);
+ static spg_t spgid(pgid);
+ auto m = crimson::make_message<MOSDOp>(0, 0, hobj, spgid, 0, 0, 0);
+ bufferlist data(msg_data);
+ m->write(0, msg_len, data);
+ // use tid as the identity of each round
+ m->set_tid(conn_state.sent_count);
+
+ // sample message latency
+ if (unlikely(conn_state.sent_count % SAMPLE_RATE == 0)) {
+ auto index = conn_state.sent_count % conn_state.time_msgs_sent.size();
+ ceph_assert(conn_state.time_msgs_sent[index] ==
+ lowres_clock_t::time_point::min());
+ conn_state.time_msgs_sent[index] = lowres_clock_t::now();
+ }
+
+ return conn_state.active_conn->send(std::move(m));
+ });
+ }
+
+ class DepthBroken: public std::exception {};
+
+ seastar::future<JobReport> stop_dispatch_messages(unsigned i) {
+ auto &conn_state = conn_states[i];
+ conn_state.stop_send = true;
+ conn_state.depth.broken(DepthBroken());
+ return conn_state.stopped_send_promise.get_future();
+ }
+
+ void do_dispatch_messages(unsigned i) {
+ ceph_assert(seastar::this_shard_id() == sid);
+ auto &conn_state = conn_states[i];
+ ceph_assert(conn_state.sent_count == 0);
+ conn_state.conn_stats.start_time = mono_clock::now();
+ // forwarded to stopped_send_promise
+ (void) seastar::do_until(
+ [&conn_state] { return conn_state.stop_send; },
+ [this, &conn_state] { return send_msg(conn_state); }
+ ).handle_exception_type([] (const DepthBroken& e) {
+ // ok, stopped by stop_dispatch_messages()
+ }).then([this, &conn_state, i] {
+ std::string name = get_name(i);
+ logger().info("{} {}: stopped sending OSDOPs",
+ name, *conn_state.active_conn);
+
+ std::chrono::duration<double> dur_conn =
+ conn_state.conn_stats.connected_time -
+ conn_state.conn_stats.connecting_time;
+ std::chrono::duration<double> dur_msg =
+ mono_clock::now() - conn_state.conn_stats.start_time;
+ unsigned ops =
+ conn_state.conn_stats.received_count -
+ conn_state.conn_stats.start_count;
+
+ JobReport stats;
+ stats.name = name;
+ stats.depth = nr_depth;
+ stats.connect_time_s = dur_conn.count();
+ stats.total_msgs = ops;
+ stats.messaging_time_s = dur_msg.count();
+ stats.latency_ms =
+ conn_state.conn_stats.sampled_total_lat_s /
+ conn_state.conn_stats.sampled_count * 1000;
+ stats.iops = ops / dur_msg.count();
+ stats.throughput_mbps = ops / dur_msg.count() * msg_len / 1048576;
+
+ conn_state.stopped_send_promise.set_value(stats);
+ });
+ }
+ };
+ };
+
+ std::optional<unsigned> server_sid;
+ bool server_needs_report = false;
+ if (mode == perf_mode_t::both) {
+ ceph_assert(server_conf.is_fixed_cpu == true);
+ server_sid = server_conf.core;
+ } else if (mode == perf_mode_t::server) {
+ server_needs_report = true;
+ }
+ return seastar::when_all(
+ seastar::futurize_invoke([mode, server_conf, server_needs_report] {
+ if (mode == perf_mode_t::client) {
+ return seastar::make_ready_future<test_state::Server*>(nullptr);
+ } else {
+ return create_sharded<test_state::Server>(
+ server_conf.core,
+ server_conf.block_size,
+ server_needs_report);
+ }
+ }),
+ seastar::futurize_invoke([mode, client_conf, server_sid] {
+ if (mode == perf_mode_t::server) {
+ return seastar::make_ready_future<test_state::Client*>(nullptr);
+ } else {
+ unsigned nonce_base = ceph::util::generate_random_number<unsigned>();
+ logger().info("client nonce_base={}", nonce_base);
+ return create_sharded<test_state::Client>(
+ client_conf.num_clients,
+ client_conf.num_conns,
+ client_conf.block_size,
+ client_conf.depth,
+ nonce_base,
+ server_sid);
+ }
+ }),
+ crimson::common::sharded_conf().start(
+ EntityName{}, std::string_view{"ceph"}
+ ).then([] {
+ return crimson::common::local_conf().start();
+ }).then([crc_enabled] {
+ return crimson::common::local_conf().set_val(
+ "ms_crc_data", crc_enabled ? "true" : "false");
+ })
+ ).then([=](auto&& ret) {
+ auto server = std::move(std::get<0>(ret).get0());
+ auto client = std::move(std::get<1>(ret).get0());
+ // reserve core 0 for potentially better performance
+ if (mode == perf_mode_t::both) {
+ logger().info("\nperf settings:\n smp={}\n {}\n {}\n",
+ seastar::smp::count, client_conf.str(), server_conf.str());
+ if (client_conf.skip_core_0) {
+ ceph_assert(seastar::smp::count > client_conf.num_clients);
+ } else {
+ ceph_assert(seastar::smp::count >= client_conf.num_clients);
+ }
+ ceph_assert(client_conf.num_clients > 0);
+ ceph_assert(seastar::smp::count > server_conf.core + client_conf.num_clients);
+ return seastar::when_all_succeed(
+ // it is not reasonable to allow the server and clients to share cores
+ // for performance benchmarking purposes.
+ server->init(server_conf.addr, server_conf.is_fixed_cpu),
+ client->init()
+ ).then_unpack([client, addr = client_conf.server_addr] {
+ return client->connect_wait_verify(addr);
+ }).then([client, ramptime = client_conf.ramptime,
+ msgtime = client_conf.msgtime] {
+ return client->dispatch_with_timer(ramptime, msgtime);
+ }).then([client] {
+ return client->shutdown();
+ }).then([server] {
+ return server->shutdown();
+ });
+ } else if (mode == perf_mode_t::client) {
+ logger().info("\nperf settings:\n smp={}\n {}\n",
+ seastar::smp::count, client_conf.str());
+ if (client_conf.skip_core_0) {
+ ceph_assert(seastar::smp::count > client_conf.num_clients);
+ } else {
+ ceph_assert(seastar::smp::count >= client_conf.num_clients);
+ }
+ ceph_assert(client_conf.num_clients > 0);
+ return client->init(
+ ).then([client, addr = client_conf.server_addr] {
+ return client->connect_wait_verify(addr);
+ }).then([client, ramptime = client_conf.ramptime,
+ msgtime = client_conf.msgtime] {
+ return client->dispatch_with_timer(ramptime, msgtime);
+ }).then([client] {
+ return client->shutdown();
+ });
+ } else { // mode == perf_mode_t::server
+ ceph_assert(seastar::smp::count > server_conf.core);
+ logger().info("\nperf settings:\n smp={}\n {}\n",
+ seastar::smp::count, server_conf.str());
+ return seastar::async([server, server_conf] {
+ // FIXME: SIGINT is not received by stop_signal
+ seastar_apps_lib::stop_signal should_stop;
+ server->init(server_conf.addr, server_conf.is_fixed_cpu).get();
+ should_stop.wait().get();
+ server->shutdown().get();
+ });
+ }
+ }).finally([] {
+ return crimson::common::sharded_conf().stop();
+ });
+}
+
+}
+
+int main(int argc, char** argv)
+{
+ seastar::app_template app;
+ app.add_options()
+ ("mode", bpo::value<unsigned>()->default_value(0),
+ "0: both, 1:client, 2:server")
+ ("server-addr", bpo::value<std::string>()->default_value("v2:127.0.0.1:9010"),
+ "server address(only support msgr v2 protocol)")
+ ("ramptime", bpo::value<unsigned>()->default_value(5),
+ "seconds of client ramp-up time")
+ ("msgtime", bpo::value<unsigned>()->default_value(15),
+ "seconds of client messaging time")
+ ("clients", bpo::value<unsigned>()->default_value(1),
+ "number of client messengers")
+ ("conns-per-client", bpo::value<unsigned>()->default_value(1),
+ "number of connections per client")
+ ("client-bs", bpo::value<unsigned>()->default_value(4096),
+ "client block size")
+ ("depth", bpo::value<unsigned>()->default_value(512),
+ "client io depth per job")
+ ("client-skip-core-0", bpo::value<bool>()->default_value(true),
+ "client skip core 0")
+ ("server-fixed-cpu", bpo::value<bool>()->default_value(true),
+ "server is in the fixed cpu mode, non-fixed doesn't support the mode both")
+ ("server-core", bpo::value<unsigned>()->default_value(1),
+ "server messenger running core")
+ ("server-bs", bpo::value<unsigned>()->default_value(0),
+ "server block size")
+ ("crc-enabled", bpo::value<bool>()->default_value(false),
+ "enable CRC checks");
+ return app.run(argc, argv, [&app] {
+ auto&& config = app.configuration();
+ auto mode = config["mode"].as<unsigned>();
+ ceph_assert(mode <= 2);
+ auto _mode = static_cast<perf_mode_t>(mode);
+ bool crc_enabled = config["crc-enabled"].as<bool>();
+ auto server_conf = server_config::load(config);
+ auto client_conf = client_config::load(config);
+ return run(_mode, client_conf, server_conf, crc_enabled
+ ).then([] {
+ logger().info("\nsuccessful!\n");
+ }).handle_exception([] (auto eptr) {
+ logger().info("\nfailed!\n");
+ return seastar::make_exception_future<>(eptr);
+ });
+ });
+}
diff --git a/src/crimson/tools/perf_staged_fltree.cc b/src/crimson/tools/perf_staged_fltree.cc
new file mode 100644
index 000000000..81b621750
--- /dev/null
+++ b/src/crimson/tools/perf_staged_fltree.cc
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/program_options.hpp>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/thread.hh>
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/log.h"
+#include "crimson/common/perf_counters_collection.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+#include "test/crimson/seastore/onode_tree/test_value.h"
+#include "test/crimson/seastore/transaction_manager_test_state.h"
+
+using namespace crimson::os::seastore::onode;
+namespace bpo = boost::program_options;
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_test);
+}
+
+template <bool TRACK>
+class PerfTree : public TMTestState {
+ public:
+ PerfTree(bool is_dummy) : is_dummy{is_dummy} {}
+
+ seastar::future<> run(KVPool<test_item_t>& kvs, double erase_ratio) {
+ return tm_setup().then([this, &kvs, erase_ratio] {
+ return seastar::async([this, &kvs, erase_ratio] {
+ auto tree = std::make_unique<TreeBuilder<TRACK, ExtendedValue>>(kvs,
+ (is_dummy ? NodeExtentManager::create_dummy(true)
+ : NodeExtentManager::create_seastore(*tm)));
+ {
+ auto t = create_mutate_transaction();
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->bootstrap(tr);
+ }).unsafe_get();
+ submit_transaction(std::move(t));
+ }
+ {
+ auto t = create_mutate_transaction();
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->insert(tr);
+ }).unsafe_get();
+ auto start_time = mono_clock::now();
+ submit_transaction(std::move(t));
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().warn("submit_transaction() done! {}s", duration.count());
+ }
+ {
+ // Note: create_weak_transaction() would also work, but it is too slow.
+ auto t = create_read_transaction();
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->get_stats(tr);
+ }).unsafe_get();
+
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->validate(tr);
+ }).unsafe_get();
+ }
+ {
+ auto t = create_mutate_transaction();
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->erase(tr, kvs.size() * erase_ratio);
+ }).unsafe_get();
+ submit_transaction(std::move(t));
+ }
+ {
+ auto t = create_read_transaction();
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->get_stats(tr);
+ }).unsafe_get();
+
+ with_trans_intr(*t, [&](auto &tr){
+ return tree->validate(tr);
+ }).unsafe_get();
+ }
+ tree.reset();
+ });
+ }).then([this] {
+ return tm_teardown();
+ });
+ }
+
+ private:
+ bool is_dummy;
+};
+
+template <bool TRACK>
+seastar::future<> run(const bpo::variables_map& config) {
+ return seastar::async([&config] {
+ auto backend = config["backend"].as<std::string>();
+ bool is_dummy;
+ if (backend == "dummy") {
+ is_dummy = true;
+ } else if (backend == "seastore") {
+ is_dummy = false;
+ } else {
+ ceph_abort(false && "invalid backend");
+ }
+ auto ns_sizes = config["ns-sizes"].as<std::vector<size_t>>();
+ auto oid_sizes = config["oid-sizes"].as<std::vector<size_t>>();
+ auto onode_sizes = config["onode-sizes"].as<std::vector<size_t>>();
+ auto range2 = config["range2"].as<std::vector<int>>();
+ ceph_assert(range2.size() == 2);
+ auto range1 = config["range1"].as<std::vector<unsigned>>();
+ ceph_assert(range1.size() == 2);
+ auto range0 = config["range0"].as<std::vector<unsigned>>();
+ ceph_assert(range0.size() == 2);
+ auto erase_ratio = config["erase-ratio"].as<double>();
+ ceph_assert(erase_ratio >= 0);
+ ceph_assert(erase_ratio <= 1);
+
+ using crimson::common::sharded_conf;
+ sharded_conf().start(EntityName{}, std::string_view{"ceph"}).get();
+ seastar::engine().at_exit([] {
+ return sharded_conf().stop();
+ });
+
+ using crimson::common::sharded_perf_coll;
+ sharded_perf_coll().start().get();
+ seastar::engine().at_exit([] {
+ return sharded_perf_coll().stop();
+ });
+
+ auto kvs = KVPool<test_item_t>::create_raw_range(
+ ns_sizes, oid_sizes, onode_sizes,
+ {range2[0], range2[1]},
+ {range1[0], range1[1]},
+ {range0[0], range0[1]});
+ PerfTree<TRACK> perf{is_dummy};
+ perf.run(kvs, erase_ratio).get0();
+ });
+}
+
+
+int main(int argc, char** argv)
+{
+ seastar::app_template app;
+ app.add_options()
+ ("backend", bpo::value<std::string>()->default_value("dummy"),
+ "tree backend: dummy, seastore")
+ ("tracked", bpo::value<bool>()->default_value(false),
+ "track inserted cursors")
+ ("ns-sizes", bpo::value<std::vector<size_t>>()->default_value(
+ {8, 11, 64, 128, 255, 256}),
+ "sizes of ns strings")
+ ("oid-sizes", bpo::value<std::vector<size_t>>()->default_value(
+ {8, 13, 64, 512, 2035, 2048}),
+ "sizes of oid strings")
+ ("onode-sizes", bpo::value<std::vector<size_t>>()->default_value(
+ {8, 16, 128, 576, 992, 1200}),
+ "sizes of onode")
+ ("range2", bpo::value<std::vector<int>>()->default_value(
+ {0, 128}),
+ "range of shard-pool-crush [a, b)")
+ ("range1", bpo::value<std::vector<unsigned>>()->default_value(
+ {0, 10}),
+ "range of ns-oid strings [a, b)")
+ ("range0", bpo::value<std::vector<unsigned>>()->default_value(
+ {0, 4}),
+ "range of snap-gen [a, b)")
+ ("erase-ratio", bpo::value<double>()->default_value(
+ 0.8),
+ "erase-ratio of all the inserted onodes");
+ return app.run(argc, argv, [&app] {
+ auto&& config = app.configuration();
+ auto tracked = config["tracked"].as<bool>();
+ if (tracked) {
+ return run<true>(config);
+ } else {
+ return run<false>(config);
+ }
+ });
+}
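As with the messenger tool, the flags above translate into a straightforward invocation; a hedged example (binary name assumed, flags as defined in main()):

  $ ./bin/perf-staged-fltree --backend dummy --tracked false --erase-ratio 0.8

Running with --backend seastore exercises NodeExtentManager::create_seastore() through the transaction manager instead of the dummy extent manager.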
diff --git a/src/crimson/tools/store_nbd/block_driver.cc b/src/crimson/tools/store_nbd/block_driver.cc
new file mode 100644
index 000000000..10e77a34b
--- /dev/null
+++ b/src/crimson/tools/store_nbd/block_driver.cc
@@ -0,0 +1,19 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "fs_driver.h"
+#include "block_driver.h"
+
+#include "tm_driver.h"
+
+BlockDriverRef get_backend(BlockDriver::config_t config)
+{
+ if (config.type == "transaction_manager") {
+ return std::make_unique<TMDriver>(config);
+ } else if (config.is_futurized_store()) {
+ return std::make_unique<FSDriver>(config);
+ } else {
+ ceph_assert(0 == "invalid option");
+ return BlockDriverRef();
+ }
+}
diff --git a/src/crimson/tools/store_nbd/block_driver.h b/src/crimson/tools/store_nbd/block_driver.h
new file mode 100644
index 000000000..ea3453ef3
--- /dev/null
+++ b/src/crimson/tools/store_nbd/block_driver.h
@@ -0,0 +1,134 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <seastar/core/future.hh>
+
+#include <string>
+#include <optional>
+
+#include "include/buffer.h"
+
+/**
+ * BlockDriver
+ *
+ * Simple interface enabling throughput tests that compare the raw disk
+ * to transaction_manager, etc.
+ */
+class BlockDriver {
+public:
+ struct config_t {
+ std::string type;
+ bool mkfs = false;
+ unsigned num_pgs = 128;
+ unsigned log_size = 1000;
+ unsigned object_size = 4<<20 /* 4MB, rbd default */;
+ unsigned oi_size = 1<<9 /* 512b */;
+ unsigned log_entry_size = 1<<9 /* 512b */;
+ bool prepopulate_log = false;
+ std::optional<std::string> path;
+
+ bool is_futurized_store() const {
+ return type == "seastore" || type == "bluestore";
+ }
+
+ std::string get_fs_type() const {
+ ceph_assert(is_futurized_store());
+ return type;
+ }
+
+ bool oi_enabled() const {
+ return oi_size > 0;
+ }
+
+ bool log_enabled() const {
+ return log_entry_size > 0 && log_size > 0;
+ }
+
+ bool prepopulate_log_enabled() const {
+ return prepopulate_log;
+ }
+
+ void populate_options(
+ boost::program_options::options_description &desc)
+ {
+ namespace po = boost::program_options;
+ desc.add_options()
+ ("type",
+ po::value<std::string>()
+ ->default_value("transaction_manager")
+ ->notifier([this](auto s) { type = s; }),
+ "Backend to use, options are transaction_manager, seastore"
+ )
+ ("device-path",
+ po::value<std::string>()
+ ->required()
+ ->notifier([this](auto s) { path = s; }),
+ "Path to device for backend"
+ )
+ ("num-pgs",
+ po::value<unsigned>()
+ ->notifier([this](auto s) { num_pgs = s; }),
+ "Number of pgs to use for futurized_store backends"
+ )
+ ("log-size",
+ po::value<unsigned>()
+ ->notifier([this](auto s) { log_size = s; }),
+ "Number of log entries per pg to use for futurized_store backends"
+ ", 0 to disable"
+ )
+ ("log-entry-size",
+ po::value<unsigned>()
+ ->notifier([this](auto s) { log_entry_size = s; }),
+ "Size of each log entry per pg to use for futurized_store backends"
+ ", 0 to disable"
+ )
+ ("prepopulate-log",
+ po::value<bool>()
+ ->notifier([this](auto s) { prepopulate_log = s; }),
+ "Prepopulate log on mount"
+ )
+ ("object-info-size",
+ po::value<unsigned>()
+ ->notifier([this](auto s) { oi_size = s; }),
+ "Size of the object info attr to use for futurized_store backends"
+ ", 0 to disable"
+ )
+ ("object-size",
+ po::value<unsigned>()
+ ->notifier([this](auto s) { object_size = s; }),
+ "Object size to use for futurized_store backends"
+ )
+ ("mkfs",
+ po::value<bool>()
+ ->default_value(false)
+ ->notifier([this](auto s) { mkfs = s; }),
+ "Do mkfs first"
+ );
+ }
+ };
+
+ virtual ceph::bufferptr get_buffer(size_t size) = 0;
+
+ virtual seastar::future<> write(
+ off_t offset,
+ ceph::bufferptr ptr) = 0;
+
+ virtual seastar::future<ceph::bufferlist> read(
+ off_t offset,
+ size_t size) = 0;
+
+ virtual size_t get_size() const = 0;
+
+ virtual seastar::future<> mount() = 0;
+ virtual seastar::future<> close() = 0;
+
+ virtual ~BlockDriver() {}
+};
+using BlockDriverRef = std::unique_ptr<BlockDriver>;
+
+BlockDriverRef get_backend(BlockDriver::config_t config);
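To illustrate the contract that get_backend() relies on, here is a minimal sketch of a hypothetical in-memory BlockDriver; MemDriver is not part of the patch and only shows which virtual methods a backend must provide (it assumes the includes already pulled in by block_driver.h plus <map>):

  // Hypothetical sketch, not part of this patch.
  class MemDriver final : public BlockDriver {
    std::map<off_t, ceph::bufferlist> data;
    static constexpr size_t SIZE = size_t(1) << 30;
  public:
    ceph::bufferptr get_buffer(size_t size) final {
      return ceph::buffer::create_page_aligned(size);
    }
    seastar::future<> write(off_t offset, ceph::bufferptr ptr) final {
      ceph::bufferlist bl;
      bl.append(ptr);
      data[offset] = std::move(bl);
      return seastar::now();
    }
    seastar::future<ceph::bufferlist> read(off_t offset, size_t size) final {
      ceph::bufferlist bl;
      if (auto it = data.find(offset); it != data.end()) {
        bl = it->second;
      }
      if (bl.length() < size) {
        bl.append_zero(size - bl.length());  // unwritten ranges read back as zeros
      }
      return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
    }
    size_t get_size() const final { return SIZE; }
    seastar::future<> mount() final { return seastar::now(); }
    seastar::future<> close() final { return seastar::now(); }
  };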
diff --git a/src/crimson/tools/store_nbd/fs_driver.cc b/src/crimson/tools/store_nbd/fs_driver.cc
new file mode 100644
index 000000000..18f836766
--- /dev/null
+++ b/src/crimson/tools/store_nbd/fs_driver.cc
@@ -0,0 +1,310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/iterator/counting_iterator.hpp>
+#include <fmt/format.h>
+
+#include "os/Transaction.h"
+#include "fs_driver.h"
+
+using namespace crimson;
+using namespace crimson::os;
+
+coll_t get_coll(unsigned num) {
+ return coll_t(spg_t(pg_t(0, num)));
+}
+
+ghobject_t get_log_object(unsigned coll)
+{
+ return ghobject_t(
+ shard_id_t::NO_SHARD,
+ 0,
+ (coll << 16),
+ "",
+ "",
+ 0,
+ ghobject_t::NO_GEN);
+}
+
+std::string make_log_key(
+ unsigned i)
+{
+ return fmt::format("log_entry_{}", i);
+}
+
+void add_log_entry(
+ unsigned i,
+ unsigned entry_size,
+ std::map<std::string, ceph::buffer::list> *omap)
+{
+ assert(omap);
+ bufferlist bl;
+ bl.append(ceph::buffer::create(entry_size, '0'));
+
+ omap->emplace(std::make_pair(make_log_key(i), bl));
+}
+
+void populate_log(
+ ceph::os::Transaction &t,
+ FSDriver::pg_analogue_t &pg,
+ unsigned entry_size,
+ unsigned entries)
+{
+ t.touch(pg.collection->get_cid(), pg.log_object);
+ // omap_clear not yet implemented, TODO
+ // t.omap_clear(pg.collection->get_cid(), pg.log_object);
+
+ std::map<std::string, ceph::buffer::list> omap;
+ for (unsigned i = 0; i < entries; ++i) {
+ add_log_entry(i, entry_size, &omap);
+ }
+
+ t.omap_setkeys(
+ pg.collection->get_cid(),
+ pg.log_object,
+ omap);
+
+ pg.log_head = entries;
+}
+
+void update_log(
+ ceph::os::Transaction &t,
+ FSDriver::pg_analogue_t &pg,
+ unsigned entry_size,
+ unsigned entries)
+{
+ ++pg.log_head;
+ std::map<std::string, ceph::buffer::list> key;
+ add_log_entry(pg.log_head, entry_size, &key);
+
+ t.omap_setkeys(
+ pg.collection->get_cid(),
+ pg.log_object,
+ key);
+
+
+ while ((pg.log_head - pg.log_tail) > entries) {
+ t.omap_rmkey(
+ pg.collection->get_cid(),
+ pg.log_object,
+ make_log_key(pg.log_tail));
+ ++pg.log_tail;
+ }
+}
+
+FSDriver::offset_mapping_t FSDriver::map_offset(off_t offset)
+{
+ uint32_t objid = offset / config.object_size;
+ uint32_t collid = objid % config.num_pgs;
+ return offset_mapping_t{
+ collections[collid],
+ ghobject_t(
+ shard_id_t::NO_SHARD,
+ 0,
+ (collid << 16) | (objid + 1),
+ "",
+ "",
+ 0,
+ ghobject_t::NO_GEN),
+ offset % config.object_size
+ };
+}
+
+seastar::future<> FSDriver::write(
+ off_t offset,
+ bufferptr ptr)
+{
+ auto mapping = map_offset(offset);
+ ceph_assert(mapping.offset + ptr.length() <= config.object_size);
+ ceph::os::Transaction t;
+ bufferlist bl;
+ bl.append(ptr);
+ t.write(
+ mapping.pg.collection->get_cid(),
+ mapping.object,
+ mapping.offset,
+ ptr.length(),
+ bl,
+ 0);
+
+ if (config.oi_enabled() ) {
+ bufferlist attr;
+ attr.append(ceph::buffer::create(config.oi_size, '0'));
+ t.setattr(
+ mapping.pg.collection->get_cid(),
+ mapping.object,
+ "_",
+ attr);
+ }
+
+ if (config.log_enabled()) {
+ update_log(
+ t,
+ mapping.pg,
+ config.log_entry_size,
+ config.log_size);
+ }
+
+ return sharded_fs->do_transaction(
+ mapping.pg.collection,
+ std::move(t));
+}
+
+seastar::future<bufferlist> FSDriver::read(
+ off_t offset,
+ size_t size)
+{
+ auto mapping = map_offset(offset);
+ ceph_assert((mapping.offset + size) <= config.object_size);
+ return sharded_fs->read(
+ mapping.pg.collection,
+ mapping.object,
+ mapping.offset,
+ size,
+ 0
+ ).handle_error(
+ crimson::ct_error::enoent::handle([size](auto &e) {
+ bufferlist bl;
+ bl.append_zero(size);
+ return seastar::make_ready_future<bufferlist>(std::move(bl));
+ }),
+ crimson::ct_error::assert_all{"Unrecoverable error in FSDriver::read"}
+ ).then([size](auto &&bl) {
+ if (bl.length() < size) {
+ bl.append_zero(size - bl.length());
+ }
+ return seastar::make_ready_future<bufferlist>(std::move(bl));
+ });
+}
+
+seastar::future<> FSDriver::mkfs()
+{
+ return init(
+ ).then([this] {
+ assert(fs);
+ uuid_d uuid;
+ uuid.generate_random();
+ return fs->mkfs(uuid).handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ crimson::get_logger(ceph_subsys_test)
+ .error("error creating empty object store in {}: ({}) {}",
+ crimson::common::local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([this] {
+ return fs->stop();
+ }).then([this] {
+ return init();
+ }).then([this] {
+ return fs->mount(
+ ).handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ crimson::get_logger(
+ ceph_subsys_test
+ ).error(
+ "error mounting object store in {}: ({}) {}",
+ crimson::common::local_conf().get_val<std::string>("osd_data"),
+ ec.value(),
+ ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([this] {
+ return seastar::do_for_each(
+ boost::counting_iterator<unsigned>(0),
+ boost::counting_iterator<unsigned>(config.num_pgs),
+ [this](auto i) {
+ return sharded_fs->create_new_collection(get_coll(i)
+ ).then([this, i](auto coll) {
+ ceph::os::Transaction t;
+ t.create_collection(get_coll(i), 0);
+ return sharded_fs->do_transaction(coll, std::move(t));
+ });
+ });
+ }).then([this] {
+ return fs->umount();
+ }).then([this] {
+ return fs->stop();
+ }).then([this] {
+ fs.reset();
+ return seastar::now();
+ });
+}
+
+seastar::future<> FSDriver::mount()
+{
+ ceph_assert(config.path);
+ return (
+ config.mkfs ? mkfs() : seastar::now()
+ ).then([this] {
+ return init();
+ }).then([this] {
+ return fs->mount(
+ ).handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ crimson::get_logger(
+ ceph_subsys_test
+ ).error(
+ "error mounting object store in {}: ({}) {}",
+ crimson::common::local_conf().get_val<std::string>("osd_data"),
+ ec.value(),
+ ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([this] {
+ return seastar::do_for_each(
+ boost::counting_iterator<unsigned>(0),
+ boost::counting_iterator<unsigned>(config.num_pgs),
+ [this](auto i) {
+ return sharded_fs->open_collection(get_coll(i)
+ ).then([this, i](auto ref) {
+ collections[i].collection = ref;
+ collections[i].log_object = get_log_object(i);
+ if (config.log_enabled()) {
+ ceph::os::Transaction t;
+ if (config.prepopulate_log_enabled()) {
+ populate_log(
+ t,
+ collections[i],
+ config.log_entry_size,
+ config.log_size);
+ }
+ return sharded_fs->do_transaction(
+ collections[i].collection,
+ std::move(t));
+ } else {
+ return seastar::now();
+ }
+ });
+ });
+ }).then([this] {
+ return fs->stat();
+ }).then([this](auto s) {
+ size = s.total;
+ });
+}
+
+seastar::future<> FSDriver::close()
+{
+ collections.clear();
+ return fs->umount(
+ ).then([this] {
+ return fs->stop();
+ }).then([this] {
+ fs.reset();
+ return seastar::now();
+ });
+}
+
+seastar::future<> FSDriver::init()
+{
+ fs.reset();
+ fs = FuturizedStore::create(
+ config.get_fs_type(),
+ *config.path,
+ crimson::common::local_conf().get_config_values()
+ );
+ return fs->start().then([this] {
+ sharded_fs = &(fs->get_sharded_store());
+ });
+}
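A quick worked example of the placement arithmetic in map_offset(), using the defaults from block_driver.h (object_size = 4 MiB, num_pgs = 128); the numbers are purely illustrative:

  // offset = 8392704 (8 MiB + 4 KiB)
  // objid  = 8392704 / (4 << 20)          = 2      -> third object overall
  // collid = 2 % 128                      = 2      -> collection/pg_analogue #2
  // intra-object offset = 8392704 % (4 << 20) = 4096
  // ghobject hash       = (2 << 16) | (2 + 1) = 0x20003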
diff --git a/src/crimson/tools/store_nbd/fs_driver.h b/src/crimson/tools/store_nbd/fs_driver.h
new file mode 100644
index 000000000..89aca075f
--- /dev/null
+++ b/src/crimson/tools/store_nbd/fs_driver.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "block_driver.h"
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+class FSDriver final : public BlockDriver {
+public:
+ FSDriver(config_t config)
+ : config(config)
+ {}
+ ~FSDriver() final {}
+
+ bufferptr get_buffer(size_t size) final {
+ return ceph::buffer::create_page_aligned(size);
+ }
+
+ seastar::future<> write(
+ off_t offset,
+ bufferptr ptr) final;
+
+ seastar::future<bufferlist> read(
+ off_t offset,
+ size_t size) final;
+
+ size_t get_size() const {
+ return size;
+ }
+
+ seastar::future<> mount() final;
+
+ seastar::future<> close() final;
+
+private:
+ size_t size = 0;
+ const config_t config;
+ std::unique_ptr<crimson::os::FuturizedStore> fs;
+ crimson::os::FuturizedStore::Shard* sharded_fs;
+
+ struct pg_analogue_t {
+ crimson::os::CollectionRef collection;
+
+ ghobject_t log_object;
+ unsigned log_tail = 0;
+ unsigned log_head = 0;
+ };
+ std::map<unsigned, pg_analogue_t> collections;
+
+ struct offset_mapping_t {
+ pg_analogue_t &pg;
+ ghobject_t object;
+ off_t offset;
+ };
+ offset_mapping_t map_offset(off_t offset);
+
+ seastar::future<> mkfs();
+ seastar::future<> init();
+
+ friend void populate_log(
+ ceph::os::Transaction &,
+ pg_analogue_t &,
+ unsigned,
+ unsigned);
+
+ friend void update_log(
+ ceph::os::Transaction &,
+ FSDriver::pg_analogue_t &,
+ unsigned,
+ unsigned);
+};
diff --git a/src/crimson/tools/store_nbd/store-nbd.cc b/src/crimson/tools/store_nbd/store-nbd.cc
new file mode 100644
index 000000000..9f80c3b2c
--- /dev/null
+++ b/src/crimson/tools/store_nbd/store-nbd.cc
@@ -0,0 +1,456 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+
+/**
+ * crimson-store-nbd
+ *
+ * This tool exposes crimson object store internals as an nbd server
+ * for use with fio in basic benchmarking.
+ *
+ * Example usage:
+ *
+ * $ ./bin/crimson-store-nbd --device-path /dev/nvme1n1 -c 1 --mkfs true --uds-path /tmp/store_nbd_socket.sock
+ *
+ * $ cat nbd.fio
+ * [global]
+ * ioengine=nbd
+ * uri=nbd+unix:///?socket=/tmp/store_nbd_socket.sock
+ * rw=randrw
+ * time_based
+ * runtime=120
+ * group_reporting
+ * iodepth=1
+ * size=500G
+ *
+ * [job0]
+ * offset=0
+ *
+ * $ fio nbd.fio
+ */
+
+#include <random>
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <linux/nbd.h>
+#include <linux/fs.h>
+
+#include <seastar/apps/lib/stop_signal.hh>
+#include <seastar/core/app-template.hh>
+#include <seastar/core/byteorder.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/rwlock.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/util/defer.hh>
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/log.h"
+
+#include "block_driver.h"
+
+namespace po = boost::program_options;
+
+using namespace ceph;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_test);
+ }
+}
+
+struct request_context_t {
+ uint32_t magic = 0;
+ uint32_t type = 0;
+
+ char handle[8] = {0};
+
+ uint64_t from = 0;
+ uint32_t len = 0;
+
+ unsigned err = 0;
+ std::optional<bufferptr> in_buffer;
+ std::optional<bufferlist> out_buffer;
+
+ using ref = std::unique_ptr<request_context_t>;
+ static ref make_ref() {
+ return std::make_unique<request_context_t>();
+ }
+
+ bool check_magic() const {
+ auto ret = magic == NBD_REQUEST_MAGIC;
+ if (!ret) {
+ logger().error(
+ "Invalid magic {} should be {}",
+ magic,
+ NBD_REQUEST_MAGIC);
+ }
+ return ret;
+ }
+
+ uint32_t get_command() const {
+ return type & 0xff;
+ }
+
+ bool has_input_buffer() const {
+ return get_command() == NBD_CMD_WRITE;
+ }
+
+ seastar::future<> read_request(seastar::input_stream<char> &in) {
+ return in.read_exactly(sizeof(struct nbd_request)
+ ).then([this, &in](auto buf) {
+ if (buf.size() < sizeof(struct nbd_request)) {
+ throw std::system_error(
+ std::make_error_code(
+ std::errc::connection_reset));
+ }
+ auto p = buf.get();
+ magic = seastar::consume_be<uint32_t>(p);
+ type = seastar::consume_be<uint32_t>(p);
+ memcpy(handle, p, sizeof(handle));
+ p += sizeof(handle);
+ from = seastar::consume_be<uint64_t>(p);
+ len = seastar::consume_be<uint32_t>(p);
+ logger().debug(
+ "Got request, magic {}, type {}, from {}, len {}",
+ magic, type, from, len);
+
+ if (!check_magic()) {
+ throw std::system_error(
+ std::make_error_code(
+ std::errc::invalid_argument));
+ }
+
+ if (has_input_buffer()) {
+ return in.read_exactly(len).then([this](auto buf) {
+ in_buffer = ceph::buffer::create_page_aligned(len);
+ in_buffer->copy_in(0, len, buf.get());
+ return seastar::now();
+ });
+ } else {
+ return seastar::now();
+ }
+ });
+ }
+
+ seastar::future<> write_reply(seastar::output_stream<char> &out) {
+ seastar::temporary_buffer<char> buffer{sizeof(struct nbd_reply)};
+ auto p = buffer.get_write();
+ seastar::produce_be<uint32_t>(p, NBD_REPLY_MAGIC);
+ seastar::produce_be<uint32_t>(p, err);
+ logger().debug("write_reply writing err {}", err);
+ memcpy(p, handle, sizeof(handle));
+ return out.write(std::move(buffer)).then([this, &out] {
+ if (out_buffer) {
+ return seastar::do_for_each(
+ out_buffer->mut_buffers(),
+ [&out](bufferptr &ptr) {
+ logger().debug("write_reply writing {}", ptr.length());
+ return out.write(
+ seastar::temporary_buffer<char>(
+ ptr.c_str(),
+ ptr.length(),
+ seastar::make_deleter([ptr](){}))
+ );
+ });
+ } else {
+ return seastar::now();
+ }
+ }).then([&out] {
+ return out.flush();
+ });
+ }
+};
+
+struct RequestWriter {
+ seastar::rwlock lock;
+ seastar::output_stream<char> stream;
+ seastar::gate gate;
+
+ RequestWriter(
+ seastar::output_stream<char> &&stream) : stream(std::move(stream)) {}
+ RequestWriter(RequestWriter &&) = default;
+
+ seastar::future<> complete(request_context_t::ref &&req) {
+ auto &request = *req;
+ return lock.write_lock(
+ ).then([&request, this] {
+ return request.write_reply(stream);
+ }).finally([&, this, req=std::move(req)] {
+ lock.write_unlock();
+ logger().debug("complete");
+ return seastar::now();
+ });
+ }
+
+ seastar::future<> close() {
+ return gate.close().then([this] {
+ return stream.close();
+ });
+ }
+};
+
+/**
+ * NBDHandler
+ *
+ * Simple throughput test for concurrent, single threaded
+ * writes to a BlockDriver.
+ */
+class NBDHandler {
+ BlockDriver &backend;
+ std::string uds_path;
+ std::optional<seastar::server_socket> server_socket;
+ std::optional<seastar::connected_socket> connected_socket;
+ seastar::gate gate;
+public:
+ struct config_t {
+ std::string uds_path;
+
+ void populate_options(
+ po::options_description &desc)
+ {
+ desc.add_options()
+ ("uds-path",
+ po::value<std::string>()
+ ->default_value("/tmp/store_nbd_socket.sock")
+ ->notifier([this](auto s) {
+ uds_path = s;
+ }),
+ "Path to domain socket for nbd"
+ );
+ }
+ };
+
+ NBDHandler(
+ BlockDriver &backend,
+ config_t config) :
+ backend(backend),
+ uds_path(config.uds_path)
+ {}
+
+ void run();
+ seastar::future<> stop();
+};
+
+int main(int argc, char** argv)
+{
+ po::options_description desc{"Allowed options"};
+ bool debug = false;
+ desc.add_options()
+ ("help,h", "show help message")
+ ("debug", po::value<bool>(&debug)->default_value(false),
+ "enable debugging");
+
+ po::options_description nbd_pattern_options{"NBD Pattern Options"};
+ NBDHandler::config_t nbd_config;
+ nbd_config.populate_options(nbd_pattern_options);
+ desc.add(nbd_pattern_options);
+
+ po::options_description backend_pattern_options{"Backend Options"};
+ BlockDriver::config_t backend_config;
+ backend_config.populate_options(backend_pattern_options);
+ desc.add(backend_pattern_options);
+
+ po::variables_map vm;
+ std::vector<std::string> unrecognized_options;
+ try {
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ if (vm.count("help")) {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ po::notify(vm);
+ unrecognized_options =
+ po::collect_unrecognized(parsed.options, po::include_positional);
+ } catch(const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+ std::vector<const char*> args(argv, argv + argc);
+
+ seastar::app_template::config app_cfg;
+ app_cfg.name = "crimson-store-nbd";
+ app_cfg.auto_handle_sigint_sigterm = false;
+ seastar::app_template app(std::move(app_cfg));
+
+ std::vector<char*> av{argv[0]};
+ std::transform(begin(unrecognized_options),
+ end(unrecognized_options),
+ std::back_inserter(av),
+ [](auto& s) {
+ return const_cast<char*>(s.c_str());
+ });
+ return app.run(av.size(), av.data(), [&] {
+ if (debug) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
+ }
+ return seastar::async([&] {
+ seastar_apps_lib::stop_signal should_stop;
+ crimson::common::sharded_conf()
+ .start(EntityName{}, std::string_view{"ceph"}).get();
+ auto stop_conf = seastar::defer([] {
+ crimson::common::sharded_conf().stop().get();
+ });
+
+ auto backend = get_backend(backend_config);
+ NBDHandler nbd(*backend, nbd_config);
+ backend->mount().get();
+ auto close_backend = seastar::defer([&] {
+ backend->close().get();
+ });
+
+ logger().debug("Running nbd server...");
+ nbd.run();
+ auto stop_nbd = seastar::defer([&] {
+ nbd.stop().get();
+ });
+ should_stop.wait().get();
+ return 0;
+ });
+ });
+}
+
+class nbd_oldstyle_negotiation_t {
+ uint64_t magic = seastar::cpu_to_be(0x4e42444d41474943); // "NBDMAGIC"
+ uint64_t magic2 = seastar::cpu_to_be(0x00420281861253); // old-style cliserv magic
+ uint64_t size = 0;
+ uint32_t flags = seastar::cpu_to_be(0);
+ char reserved[124] = {0};
+
+public:
+ nbd_oldstyle_negotiation_t(uint64_t size, uint32_t flags)
+ : size(seastar::cpu_to_be(size)), flags(seastar::cpu_to_be(flags)) {}
+} __attribute__((packed));
+
+seastar::future<> send_negotiation(
+ size_t size,
+ seastar::output_stream<char>& out)
+{
+ seastar::temporary_buffer<char> buf{sizeof(nbd_oldstyle_negotiation_t)};
+ new (buf.get_write()) nbd_oldstyle_negotiation_t(size, 1);
+ return out.write(std::move(buf)
+ ).then([&out] {
+ return out.flush();
+ });
+}
+
+seastar::future<> handle_command(
+ BlockDriver &backend,
+ request_context_t::ref request_ref,
+ RequestWriter &out)
+{
+ auto &request = *request_ref;
+ logger().debug("got command {}", request.get_command());
+ return ([&] {
+ switch (request.get_command()) {
+ case NBD_CMD_WRITE:
+ return backend.write(
+ request.from,
+ *request.in_buffer);
+ case NBD_CMD_READ:
+ return backend.read(
+ request.from,
+ request.len).then([&] (auto buffer) {
+ logger().debug("read returned buffer len {}", buffer.length());
+ request.out_buffer = buffer;
+ });
+ case NBD_CMD_DISC:
+ throw std::system_error(std::make_error_code(std::errc::bad_message));
+ case NBD_CMD_TRIM:
+ throw std::system_error(std::make_error_code(std::errc::bad_message));
+ default:
+ throw std::system_error(std::make_error_code(std::errc::bad_message));
+ }
+ })().then([&, request_ref=std::move(request_ref)]() mutable {
+ logger().debug("handle_command complete");
+ return out.complete(std::move(request_ref));
+ });
+}
+
+
+seastar::future<> handle_commands(
+ BlockDriver &backend,
+ seastar::input_stream<char>& in,
+ RequestWriter &out)
+{
+ logger().debug("handle_commands");
+ return seastar::keep_doing([&] {
+ logger().debug("waiting for command");
+ auto request_ref = request_context_t::make_ref();
+ auto &request = *request_ref;
+ return request.read_request(in).then(
+ [&, request_ref=std::move(request_ref)]() mutable {
+ // keep running in background
+ (void)seastar::try_with_gate(out.gate,
+ [&backend, &out, request_ref=std::move(request_ref)]() mutable {
+ return handle_command(backend, std::move(request_ref), out);
+ });
+ logger().debug("handle_commands after fork");
+ });
+ }).handle_exception_type([](const seastar::gate_closed_exception&) {});
+}
+
+void NBDHandler::run()
+{
+ logger().debug("About to listen on {}", uds_path);
+ server_socket = seastar::engine().listen(
+ seastar::socket_address{
+ seastar::unix_domain_addr{uds_path}});
+
+ // keep running in background
+ (void)seastar::keep_doing([this] {
+ return seastar::try_with_gate(gate, [this] {
+ return server_socket->accept().then([this](auto acc) {
+ logger().debug("Accepted");
+ connected_socket = std::move(acc.connection);
+ return seastar::do_with(
+ connected_socket->input(),
+ RequestWriter{connected_socket->output()},
+ [&, this](auto &input, auto &output) {
+ return send_negotiation(
+ backend.get_size(),
+ output.stream
+ ).then([&, this] {
+ return handle_commands(backend, input, output);
+ }).finally([&] {
+ std::cout << "closing input and output" << std::endl;
+ return seastar::when_all(input.close(),
+ output.close());
+ }).discard_result().handle_exception([](auto e) {
+ logger().error("NBDHandler::run saw exception {}", e);
+ });
+ });
+ }).handle_exception_type([] (const std::system_error &e) {
+ // an ECONNABORTED is expected when we are being stopped.
+ if (e.code() != std::errc::connection_aborted) {
+ logger().error("accept failed: {}", e);
+ }
+ });
+ });
+ }).handle_exception_type([](const seastar::gate_closed_exception&) {});
+}
+
+seastar::future<> NBDHandler::stop()
+{
+ if (server_socket) {
+ server_socket->abort_accept();
+ }
+ if (connected_socket) {
+ connected_socket->shutdown_input();
+ connected_socket->shutdown_output();
+ }
+ return gate.close().then([this] {
+ if (!server_socket.has_value()) {
+ return seastar::now();
+ }
+ return seastar::remove_file(uds_path);
+ });
+}
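For orientation, the server above speaks the old-style NBD protocol: a single negotiation blob, then a loop of fixed-size requests and replies. Summarized as consumed by read_request() and produced by write_reply() (field layout per linux/nbd.h; this is only a description of the existing flow):

  // negotiation (server -> client, once):
  //   "NBDMAGIC" magic, old-style cliserv magic, u64 export size,
  //   u32 flags, 124 reserved bytes
  // request (client -> server):
  //   u32 magic, u32 type, char handle[8], u64 from, u32 len,
  //   followed by len payload bytes for NBD_CMD_WRITE
  // reply (server -> client):
  //   u32 magic, u32 error, char handle[8],
  //   followed by the read payload for NBD_CMD_READ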
diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc
new file mode 100644
index 000000000..bd216fd58
--- /dev/null
+++ b/src/crimson/tools/store_nbd/tm_driver.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tm_driver.h"
+
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_test);
+ }
+}
+
+seastar::future<> TMDriver::write(
+ off_t offset,
+ bufferptr ptr)
+{
+ logger().debug("Writing offset {}", offset);
+ assert(offset % device->get_block_size() == 0);
+ assert((ptr.length() % device->get_block_size()) == 0);
+ return seastar::do_with(ptr, [this, offset](auto& ptr) {
+ return repeat_eagain([this, offset, &ptr] {
+ return tm->with_transaction_intr(
+ Transaction::src_t::MUTATE,
+ "write",
+ [this, offset, &ptr](auto& t)
+ {
+ return tm->dec_ref(t, offset
+ ).si_then([](auto){}).handle_error_interruptible(
+ crimson::ct_error::enoent::handle([](auto) { return seastar::now(); }),
+ crimson::ct_error::pass_further_all{}
+ ).si_then([this, offset, &t, &ptr] {
+ logger().debug("dec_ref complete");
+ return tm->alloc_extent<TestBlock>(t, offset, ptr.length());
+ }).si_then([this, offset, &t, &ptr](auto ext) {
+ boost::ignore_unused(offset); // avoid clang warning;
+ assert(ext->get_laddr() == (size_t)offset);
+ assert(ext->get_bptr().length() == ptr.length());
+ ext->get_bptr().swap(ptr);
+ logger().debug("submitting transaction");
+ return tm->submit_transaction(t);
+ });
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{"store-nbd write"}
+ );
+}
+
+TMDriver::read_extents_ret TMDriver::read_extents(
+ Transaction &t,
+ laddr_t offset,
+ extent_len_t length)
+{
+ return seastar::do_with(
+ lba_pin_list_t(),
+ lextent_list_t<TestBlock>(),
+ [this, &t, offset, length](auto &pins, auto &ret) {
+ return tm->get_pins(
+ t, offset, length
+ ).si_then([this, &t, &pins, &ret](auto _pins) {
+ _pins.swap(pins);
+ logger().debug("read_extents: mappings {}", pins);
+ return trans_intr::do_for_each(
+ pins.begin(),
+ pins.end(),
+ [this, &t, &ret](auto &&pin) {
+ logger().debug(
+ "read_extents: get_extent {}~{}",
+ pin->get_val(),
+ pin->get_length());
+ return tm->read_pin<TestBlock>(
+ t,
+ std::move(pin)
+ ).si_then([&ret](auto ref) mutable {
+ ret.push_back(std::make_pair(ref->get_laddr(), ref));
+ logger().debug(
+ "read_extents: got extent {}",
+ *ref);
+ return seastar::now();
+ });
+ }).si_then([&ret] {
+ return std::move(ret);
+ });
+ });
+ });
+}
+
+seastar::future<bufferlist> TMDriver::read(
+ off_t offset,
+ size_t size)
+{
+ logger().debug("Reading offset {}", offset);
+ assert(offset % device->get_block_size() == 0);
+ assert(size % device->get_block_size() == 0);
+ auto blptrret = std::make_unique<bufferlist>();
+ auto &blret = *blptrret;
+ return repeat_eagain([=, &blret, this] {
+ return tm->with_transaction_intr(
+ Transaction::src_t::READ,
+ "read",
+ [=, &blret, this](auto& t)
+ {
+ return read_extents(t, offset, size
+ ).si_then([=, &blret](auto ext_list) {
+ size_t cur = offset;
+ for (auto &i: ext_list) {
+ if (cur != i.first) {
+ assert(cur < i.first);
+ blret.append_zero(i.first - cur);
+ cur = i.first;
+ }
+ blret.append(i.second->get_bptr());
+ cur += i.second->get_bptr().length();
+ }
+ if (blret.length() != size) {
+ assert(blret.length() < size);
+ blret.append_zero(size - blret.length());
+ }
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{"store-nbd read"}
+ ).then([blptrret=std::move(blptrret)]() mutable {
+ logger().debug("read complete");
+ return std::move(*blptrret);
+ });
+}
+
+void TMDriver::init()
+{
+ std::vector<Device*> sec_devices;
+#ifndef NDEBUG
+ tm = make_transaction_manager(device.get(), sec_devices, true);
+#else
+ tm = make_transaction_manager(device.get(), sec_devices, false);
+#endif
+}
+
+void TMDriver::clear()
+{
+ tm.reset();
+}
+
+size_t TMDriver::get_size() const
+{
+ return device->get_available_size() * .5;
+}
+
+seastar::future<> TMDriver::mkfs()
+{
+ assert(config.path);
+ logger().debug("mkfs");
+ return Device::make_device(*config.path, device_type_t::SSD
+ ).then([this](DeviceRef dev) {
+ device = std::move(dev);
+ seastore_meta_t meta;
+ meta.seastore_id.generate_random();
+ return device->mkfs(
+ device_config_t{
+ true,
+ (magic_t)std::rand(),
+ device_type_t::SSD,
+ 0,
+ meta,
+ secondary_device_set_t()});
+ }).safe_then([this] {
+ logger().debug("device mkfs done");
+ return device->mount();
+ }).safe_then([this] {
+ init();
+ logger().debug("tm mkfs");
+ return tm->mkfs();
+ }).safe_then([this] {
+ logger().debug("tm close");
+ return tm->close();
+ }).safe_then([this] {
+ logger().debug("sm close");
+ return device->close();
+ }).safe_then([this] {
+ clear();
+ device.reset();
+ logger().debug("mkfs complete");
+ return TransactionManager::mkfs_ertr::now();
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid errror during TMDriver::mkfs"
+ }
+ );
+}
+
+seastar::future<> TMDriver::mount()
+{
+ return (config.mkfs ? mkfs() : seastar::now()
+ ).then([this] {
+ return Device::make_device(*config.path, device_type_t::SSD);
+ }).then([this](DeviceRef dev) {
+ device = std::move(dev);
+ return device->mount();
+ }).safe_then([this] {
+ init();
+ return tm->mount();
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid errror during TMDriver::mount"
+ }
+ );
+}
+
+seastar::future<> TMDriver::close()
+{
+ return tm->close().safe_then([this] {
+ clear();
+ return device->close();
+ }).handle_error(
+ crimson::ct_error::assert_all{
+ "Invalid errror during TMDriver::close"
+ }
+ );
+}
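Putting the pieces together, TMDriver is always driven through the generic BlockDriver interface; a minimal, hypothetical smoke test, assuming it runs inside a seastar thread and that the usual ceph/seastar includes are available, could look like:

  // Hypothetical sketch, not part of this patch.
  seastar::future<> smoke_test(BlockDriver::config_t cfg) {
    return seastar::async([cfg] {
      auto driver = get_backend(cfg);          // TMDriver for type "transaction_manager"
      driver->mount().get();                   // runs mkfs() first when cfg.mkfs is set
      auto ptr = driver->get_buffer(4096);     // block-aligned buffer
      driver->write(0, std::move(ptr)).get();  // dec_ref + alloc_extent + submit
      auto bl = driver->read(0, 4096).get0();  // unmapped ranges come back zero-filled
      ceph_assert(bl.length() == 4096);
      driver->close().get();
    });
  }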
diff --git a/src/crimson/tools/store_nbd/tm_driver.h b/src/crimson/tools/store_nbd/tm_driver.h
new file mode 100644
index 000000000..24aabdeb6
--- /dev/null
+++ b/src/crimson/tools/store_nbd/tm_driver.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "block_driver.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/device.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "test/crimson/seastore/test_block.h"
+
+class TMDriver final : public BlockDriver {
+public:
+ TMDriver(config_t config) : config(config) {}
+ ~TMDriver() final {}
+
+ bufferptr get_buffer(size_t size) final {
+ return ceph::buffer::create_page_aligned(size);
+ }
+
+ seastar::future<> write(
+ off_t offset,
+ bufferptr ptr) final;
+
+ seastar::future<bufferlist> read(
+ off_t offset,
+ size_t size) final;
+
+ size_t get_size() const final;
+
+ seastar::future<> mount() final;
+
+ seastar::future<> close() final;
+
+private:
+ const config_t config;
+
+ using DeviceRef = crimson::os::seastore::DeviceRef;
+ DeviceRef device;
+
+ using TransactionManager = crimson::os::seastore::TransactionManager;
+ using TransactionManagerRef = crimson::os::seastore::TransactionManagerRef;
+ TransactionManagerRef tm;
+
+ seastar::future<> mkfs();
+ void init();
+ void clear();
+
+ using read_extents_iertr = TransactionManager::read_extent_iertr;
+ using read_extents_ret = read_extents_iertr::future<
+ crimson::os::seastore::lextent_list_t<crimson::os::seastore::TestBlock>
+ >;
+ read_extents_ret read_extents(
+ crimson::os::seastore::Transaction &t,
+ crimson::os::seastore::laddr_t offset,
+ crimson::os::seastore::extent_len_t length);
+};